From 76c69de591b388ffb9fbf57bae337cafde811008 Mon Sep 17 00:00:00 2001 From: Weibo He Date: Fri, 8 May 2026 15:18:43 +0800 Subject: [PATCH 001/538] [CUDA/HIP] Do not check function calls in discarded statement (#194606) Previously, calling a host-device mismatch function inside a discarded `if constexpr` branch would trigger an error. This patch recognizes that discarded statements are never instantiated and allows such code. --- clang/docs/ReleaseNotes.rst | 3 +++ clang/lib/Sema/SemaCUDA.cpp | 20 +++++++++++++------ .../test/SemaCUDA/call-device-fn-from-host.cu | 20 +++++++++++++++++++ .../test/SemaCUDA/call-host-fn-from-device.cu | 18 +++++++++++++++++ clang/test/SemaCUDA/device-kernel-call.cu | 16 +++++++++++++++ 5 files changed, 71 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c83a1bd0ab2e9..ac462e3bf4732 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -481,6 +481,9 @@ Improvements to Clang's diagnostics - Removed the body of lambdas from some diagnostic messages. +- Fixed false positive host-device mismatch errors in discarded `if constexpr` branches for CUDA/HIP; + such calls are now correctly skipped. + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index c086f9a32ce4e..9e05de941f335 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -84,10 +84,6 @@ ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc, return ExprError( Diag(LLLLoc, diag::err_cuda_device_kernel_launch_not_supported)); - if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode) - return ExprError( - Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc)); - FunctionDecl *ConfigDecl = IsDeviceKernelCall ? 
getASTContext().getcudaLaunchDeviceDecl() : getASTContext().getcudaConfigureCallDecl(); @@ -990,7 +986,8 @@ bool SemaCUDA::CheckCall(SourceLocation Loc, FunctionDecl *Callee) { assert(Callee && "Callee may not be null."); const auto &ExprEvalCtx = SemaRef.currentEvaluationContext(); - if (ExprEvalCtx.isUnevaluated() || ExprEvalCtx.isConstantEvaluated()) + if (ExprEvalCtx.isUnevaluated() || ExprEvalCtx.isConstantEvaluated() || + ExprEvalCtx.isDiscardedStatementContext()) return true; // C++ deduction guides participate in overload resolution but are not @@ -1026,9 +1023,20 @@ bool SemaCUDA::CheckCall(SourceLocation Loc, FunctionDecl *Callee) { } }(); + bool IsDeviceKernelCall = Callee == getASTContext().getcudaLaunchDeviceDecl(); + bool CallerHD = Caller && Caller->hasAttr() && + Caller->hasAttr(); + bool CallerDiscard = SemaRef.getEmissionStatus(Caller) == + Sema::FunctionEmissionStatus::TemplateDiscarded; + bool RDC = getLangOpts().GPURelocatableDeviceCode; + if (IsDeviceKernelCall && !(CallerHD && CallerDiscard) && !RDC) { + Diag(Loc, diag::err_cuda_device_kernel_launch_require_rdc); + return false; + } + if (DiagKind == SemaDiagnosticBuilder::K_Nop) { // For -fgpu-rdc, keep track of external kernels used by host functions. 
- if (getLangOpts().CUDAIsDevice && getLangOpts().GPURelocatableDeviceCode && + if (getLangOpts().CUDAIsDevice && RDC && Callee->hasAttr() && !Callee->isDefined() && (!Caller || (!Caller->getDescribedFunctionTemplate() && getASTContext().GetGVALinkageForFunction(Caller) == diff --git a/clang/test/SemaCUDA/call-device-fn-from-host.cu b/clang/test/SemaCUDA/call-device-fn-from-host.cu index 4d66fccd84d53..64394c7a4d958 100644 --- a/clang/test/SemaCUDA/call-device-fn-from-host.cu +++ b/clang/test/SemaCUDA/call-device-fn-from-host.cu @@ -3,6 +3,11 @@ // RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - \ // RUN: -verify=expected,omp -verify-ignore-unexpected=note -fopenmp +// RUN: %clang_cc1 %s --std=c++17 -triple x86_64-unknown-linux -emit-llvm -o - \ +// RUN: -verify -verify-ignore-unexpected=note +// RUN: %clang_cc1 %s --std=c++17 -triple x86_64-unknown-linux -emit-llvm -o - \ +// RUN: -verify=expected,omp -verify-ignore-unexpected=note -fopenmp + // Note: This test won't work with -fsyntax-only, because some of these errors // are emitted during codegen. 
@@ -97,3 +102,18 @@ void host_func(void) { kernel<<<1, 1>>>(); } __device__ void f(); template __global__ void t() { F(); } __host__ void g() { t<<<1,1>>>(); } + +#if __cplusplus >= 201703L +namespace template_if_constexpr { + template + __host__ __device__ void fn() { + if constexpr (B) + device_fn(); + } + + void call() { + fn(); + fn(); // expected-error@-5 {{reference to __device__ function 'device_fn' in __host__ __device__ function}} + } +} +#endif diff --git a/clang/test/SemaCUDA/call-host-fn-from-device.cu b/clang/test/SemaCUDA/call-host-fn-from-device.cu index acdd291b66457..d172cd966c823 100644 --- a/clang/test/SemaCUDA/call-host-fn-from-device.cu +++ b/clang/test/SemaCUDA/call-host-fn-from-device.cu @@ -1,6 +1,9 @@ // RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device \ // RUN: -emit-llvm -o /dev/null -verify -verify-ignore-unexpected=note +// RUN: %clang_cc1 %s --std=c++17 -triple nvptx-unknown-unknown -fcuda-is-device \ +// RUN: -emit-llvm -o /dev/null -verify -verify-ignore-unexpected=note + // Note: This test won't work with -fsyntax-only, because some of these errors // are emitted during codegen. 
@@ -138,3 +141,18 @@ __host__ __device__ void TmplStruct::fn() { host_fn(); } // expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} __device__ void double_specialization() { TmplStruct().fn(); } + +#if __cplusplus >= 201703L +namespace template_if_constexpr { + template + __host__ __device__ void fn() { + if constexpr (B) + host_fn(); + } + + __device__ void call() { + fn(); + fn(); // expected-error@-5 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} + } +} +#endif diff --git a/clang/test/SemaCUDA/device-kernel-call.cu b/clang/test/SemaCUDA/device-kernel-call.cu index 856cbd88404e6..7511cf148a077 100644 --- a/clang/test/SemaCUDA/device-kernel-call.cu +++ b/clang/test/SemaCUDA/device-kernel-call.cu @@ -13,3 +13,19 @@ __global__ void g1(void) { // nordc-error@-1 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}} // hip-error@-2 {{device-side kernel call/launch is not supported}} } + +namespace template_if_constexpr { + template + __host__ __device__ void fn() { + if constexpr (B) + g2<<<1, 1>>>(42); + // hip-error@-1 {{device-side kernel call/launch is not supported}} + } + + void call() { + fn(); + fn(); + // nordc-error@-7 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}} + // nordc-note@-2 {{in instantiation of function template specialization 'template_if_constexpr::fn' requested here}} + } +} From a33ba5f0948c2e6c365468a113a7d1ae5afb37b5 Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Fri, 8 May 2026 08:28:37 +0100 Subject: [PATCH 002/538] [libc] Include CPU model in overlay CI sccache key (#196477) [libc] Include CPU model in overlay CI sccache key The overlay CI compiles opt_host memory tests with `-march=native`, which generates object files specific to the runner CPU model. 
sccache treats `-march=native` as a literal string in its hash key, so cached `.o` files compiled on one CPU model get served to runners with a different CPU. When the cached binary uses instructions the current CPU lacks, the test crashes with SIGILL. ## Symptoms The `memcmp_opt_host`, `memmove_opt_host`, `memset_opt_host`, `bcmp_opt_host`, and `bzero_opt_host` tests crash when SIMD code paths are first exercised. Simple tests like `CmpZeroByte` pass because they use small sizes that do not enter SIMD routines. The failures are fully reproducible on reruns because the cache stays poisoned. ## Evidence Three consecutive runs of the same fwide PR (#196157), same code: | Run | Azure Region | Cache Hits | Cache Misses | Result | |-----|-------------|-----------|-------------|--------| | [25512875679](https://github.com/llvm/llvm-project/actions/runs/25512875679/job/74876008545) | westus3 | 9 | 5354 | PASS | | [25524024922](https://github.com/llvm/llvm-project/actions/runs/25524024922/job/74916241365) | northcentralus | 5345 | 0 | CRASH | | [25524839613](https://github.com/llvm/llvm-project/actions/runs/25524839613/job/74965830435) | westus | 5345 | 0 | CRASH | The first run had a nearly empty cache and compiled everything locally (0.17% hit rate). An intermediate [syscall-unistd run](https://github.com/llvm/llvm-project/actions/runs/25517783708/job/74893495220) in eastus then populated the cache with object files compiled for that region's CPU. Subsequent runs on different hardware got 100% cache hits and crashed because the cached `.o` files use instructions their CPUs lack. ## Fix Added a "Detect CPU model" step that reads the CPU model string from `/proc/cpuinfo` (Linux) or `sysctl` (macOS) and appends it to the sccache cache key. Runners with different CPUs now get separate cache buckets. Assisted-by: Automated tooling, human reviewed. 
--- .github/workflows/libc-overlay-tests.yml | 27 +++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml index 07059e0271af5..a020f0bfd5cd3 100644 --- a/.github/workflows/libc-overlay-tests.yml +++ b/.github/workflows/libc-overlay-tests.yml @@ -44,6 +44,31 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false + + # The libc build uses -march=native for opt_host tests, which means + # the generated object files are specific to the runner's CPU model. + # sccache treats -march=native as a literal string in its cache key, + # so without per-CPU cache keys, object files compiled on one CPU + # model are silently served to runners with a different CPU, causing + # illegal instruction crashes at runtime. + - name: Detect CPU model + id: cpu-info + shell: bash + run: | + if [ "$RUNNER_OS" = "Linux" ]; then + # x86 has 'model name', ARM has 'CPU implementer' + 'CPU part'. + cpu_model=$(grep -m1 'model name' /proc/cpuinfo | cut -d: -f2 | xargs | tr ' ' '-' || true) + if [ -z "$cpu_model" ]; then + impl=$(grep -m1 'CPU implementer' /proc/cpuinfo | cut -d: -f2 | xargs) + part=$(grep -m1 'CPU part' /proc/cpuinfo | cut -d: -f2 | xargs) + cpu_model="arm-${impl:-unknown}-${part:-unknown}" + fi + elif [ "$RUNNER_OS" = "macOS" ]; then + cpu_model=$(sysctl -n machdep.cpu.brand_string | tr ' ' '-') + else + cpu_model="generic" + fi + echo "cpu-model=${cpu_model:-unknown}" >> "$GITHUB_OUTPUT" # Libc's build is relatively small comparing with other components of LLVM. 
# A fresh linux overlay takes about 180MiB of uncompressed disk space, which can @@ -56,7 +81,7 @@ jobs: uses: hendrikmuhs/ccache-action@33522472633dbd32578e909b315f5ee43ba878ce # v1.2.22 with: max-size: 1G - key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }} + key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }}_${{ steps.cpu-info.outputs.cpu-model }} variant: sccache # MPFR is required by some of the mathlib tests. From 77b7183542f7f6b3b47a271324c2ac93feb8f811 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 8 May 2026 15:31:57 +0800 Subject: [PATCH 003/538] [Runtimes] Fix /clang: prefix warning for GNU-like clang on Windows (#192041) libclc has configure warning on Windows: clang: error: no such file or directory: '/clang:--target=amdgcn-amd-amdhsa-llvm' clang: error: no such file or directory: '/clang:-print-target-triple' CMake Warning at CMakeLists.txt:239 (message): Failed to execute `llvm-project/build/bin/clang.exe /clang:--target=amdgcn-amd-amdhsa-llvm /clang:-print-target-triple` to normalize target triple. Switch to check CMAKE_C_COMPILER_FRONTEND_VARIANT because - CMAKE_C_SIMULATE_ID=MSVC: true for both clang and clang-cl. - CMAKE_C_COMPILER_FRONTEND_VARIANT=MSVC: true for clang-cl; false for clang. 
--- compiler-rt/cmake/Modules/CompilerRTUtils.cmake | 2 +- runtimes/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index dc9d2f4c473f8..f4e5560f88412 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -392,7 +392,7 @@ macro(construct_compiler_rt_default_triple) if(CMAKE_C_COMPILER_ID MATCHES "Clang") set(option_prefix "") - if (CMAKE_C_SIMULATE_ID MATCHES "MSVC") + if (CMAKE_C_COMPILER_FRONTEND_VARIANT MATCHES "MSVC") set(option_prefix "/clang:") endif() set(print_target_triple ${CMAKE_C_COMPILER} ${option_prefix}--target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} ${option_prefix}-print-target-triple) diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt index 75551ba0c651a..fd4230b15f72e 100644 --- a/runtimes/CMakeLists.txt +++ b/runtimes/CMakeLists.txt @@ -223,7 +223,7 @@ set(LLVM_TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}") if(CMAKE_C_COMPILER_ID MATCHES "Clang") set(option_prefix "") - if (CMAKE_C_SIMULATE_ID MATCHES "MSVC") + if (CMAKE_C_COMPILER_FRONTEND_VARIANT MATCHES "MSVC") set(option_prefix "/clang:") endif() set(print_target_triple ${CMAKE_C_COMPILER} ${option_prefix}--target=${LLVM_DEFAULT_TARGET_TRIPLE} ${option_prefix}-print-target-triple) From 0ebfe12891923c3a1d5ccec191135e35b142bbf4 Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Fri, 8 May 2026 09:44:52 +0200 Subject: [PATCH 004/538] [libc++] Recognize _BitInt(N) as signed/unsigned integer type (#185027) Replace the explicit specialization lists in `__is_signed_integer_v` and `__is_unsigned_integer_v` with detection using `is_integral`, `is_signed`, and `is_unsigned`. This covers `_BitInt(N)` for any N, in addition to all standard and extended integer types. Character types and `bool` are excluded via `__is_character_or_bool_v`. 
This unblocks `` operations (`popcount`, `countl_zero`, `rotl`, etc.) for `_BitInt(N)`. Part of the [_BitInt(N) libc++ effort](https://discourse.llvm.org/t/bitint-n-support-in-libc-investigations-possible-improvements-looking-for-guidance/90063). Assisted-by: Claude (Anthropic) --------- Co-authored-by: Claude Opus 4.6 --- libcxx/include/__type_traits/integer_traits.h | 53 ++-- .../views/mdspan/extents/bitint.pass.cpp | 83 +++++++ .../bit/bit.pow.two/bit_ceil.pass.cpp | 62 +++++ .../bit/bit.pow.two/bit_floor.pass.cpp | 63 +++++ .../bit/bit.pow.two/bit_width.pass.cpp | 56 +++++ .../bit/bit.pow.two/has_single_bit.pass.cpp | 87 +++++++ .../bit/bitops.count/countl_one.pass.cpp | 63 +++++ .../bit/bitops.count/countl_zero.pass.cpp | 85 +++++++ .../bit/bitops.count/countr_one.pass.cpp | 76 ++++++ .../bit/bitops.count/countr_zero.pass.cpp | 74 ++++++ .../bit/bitops.count/popcount.pass.cpp | 109 +++++++++ .../std/numerics/bit/bitops.rot/rotl.pass.cpp | 56 +++++ .../std/numerics/bit/bitops.rot/rotr.pass.cpp | 56 +++++ .../saturating.bitint.pass.cpp | 227 ++++++++++++++++++ .../make_format_args.bitint.verify.cpp | 57 +++++ .../utility.intcmp/intcmp.bitint.pass.cpp | 173 +++++++++++++ 16 files changed, 1351 insertions(+), 29 deletions(-) create mode 100644 libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp create mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp create mode 100644 libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp diff --git a/libcxx/include/__type_traits/integer_traits.h b/libcxx/include/__type_traits/integer_traits.h index fad502c44e301..d7ac89be9c2a7 100644 --- a/libcxx/include/__type_traits/integer_traits.h +++ b/libcxx/include/__type_traits/integer_traits.h @@ -10,6 +10,10 @@ #define _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H #include <__config> +#include 
<__type_traits/is_integral.h> +#include <__type_traits/is_same.h> +#include <__type_traits/is_signed.h> +#include <__type_traits/is_unsigned.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -17,43 +21,34 @@ _LIBCPP_BEGIN_NAMESPACE_STD -// This trait is to determine whether a type is a /signed integer type/ -// See [basic.fundamental]/p1 +// These traits determine whether a type is a /signed integer type/ or +// /unsigned integer type/ per [basic.fundamental]/p1-2. +// +// Character types (char, wchar_t, char8_t, char16_t, char32_t) and bool +// are integral but are NOT signed/unsigned integer types. + template -inline const bool __is_signed_integer_v = false; +inline const bool __is_character_v = false; template <> -inline const bool __is_signed_integer_v = true; +inline const bool __is_character_v = true; template <> -inline const bool __is_signed_integer_v = true; +inline const bool __is_character_v = true; +#if _LIBCPP_HAS_CHAR8_T template <> -inline const bool __is_signed_integer_v = true; -template <> -inline const bool __is_signed_integer_v = true; +inline const bool __is_character_v = true; +#endif template <> -inline const bool __is_signed_integer_v = true; -#if _LIBCPP_HAS_INT128 +inline const bool __is_character_v = true; template <> -inline const bool __is_signed_integer_v<__int128_t> = true; -#endif +inline const bool __is_character_v = true; -// This trait is to determine whether a type is an /unsigned integer type/ -// See [basic.fundamental]/p2 template -inline const bool __is_unsigned_integer_v = false; -template <> -inline const bool __is_unsigned_integer_v = true; -template <> -inline const bool __is_unsigned_integer_v = true; -template <> -inline const bool __is_unsigned_integer_v = true; -template <> -inline const bool __is_unsigned_integer_v = true; -template <> -inline const bool __is_unsigned_integer_v = true; -#if _LIBCPP_HAS_INT128 -template <> -inline const bool __is_unsigned_integer_v<__uint128_t> = true; 
-#endif +inline const bool __is_signed_integer_v = + is_integral<_Tp>::value && is_signed<_Tp>::value && !__is_character_v<_Tp> && !is_same<_Tp, bool>::value; + +template +inline const bool __is_unsigned_integer_v = + is_integral<_Tp>::value && is_unsigned<_Tp>::value && !__is_character_v<_Tp> && !is_same<_Tp, bool>::value; #if _LIBCPP_STD_VER >= 20 template diff --git a/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp new file mode 100644 index 0000000000000..9a4dc02a15c6e --- /dev/null +++ b/libcxx/test/std/containers/views/mdspan/extents/bitint.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// class extents; +// +// After [libc++] recognized _BitInt as an integer type in +// __type_traits/integer_traits.h, extents silently starts accepting +// _BitInt(N) as IndexType because __signed_or_unsigned_integer is now +// satisfied. This test pins that behavior: construction, extent() and +// static_extent() all work, and the representability static_assert on +// static extents fires when the extent does not fit in the index type. 
+ +#include +#include +#include +#include + +#include "test_macros.h" + +#if TEST_HAS_EXTENSION(bit_int) + +template +constexpr bool test_extents_with_index_type() { + using Ext = std::extents; + static_assert(std::is_same_v); + static_assert(Ext::rank() == 3); + static_assert(Ext::rank_dynamic() == 1); + + Ext e(IndexType{5}); + assert(e.extent(0) == IndexType{3}); + assert(e.extent(1) == IndexType{5}); + assert(e.extent(2) == IndexType{7}); + assert(Ext::static_extent(0) == 3); + assert(Ext::static_extent(1) == std::dynamic_extent); + assert(Ext::static_extent(2) == 7); + + // All-dynamic form. + using DynExt = std::dextents; + static_assert(std::is_same_v); + DynExt d(IndexType{4}, IndexType{6}); + assert(d.extent(0) == IndexType{4}); + assert(d.extent(1) == IndexType{6}); + return true; +} + +constexpr bool test() { + // Signed _BitInt index types across the width tiers. + test_extents_with_index_type<_BitInt(13)>(); + test_extents_with_index_type<_BitInt(32)>(); + test_extents_with_index_type<_BitInt(64)>(); + + // Unsigned _BitInt index types. + test_extents_with_index_type(); + test_extents_with_index_type(); + test_extents_with_index_type(); + +# if __BITINT_MAXWIDTH__ >= 128 + test_extents_with_index_type<_BitInt(128)>(); + test_extents_with_index_type(); +# endif + + return true; +} + +#endif // TEST_HAS_EXTENSION(bit_int) + +int main(int, char**) { +#if TEST_HAS_EXTENSION(bit_int) + test(); + static_assert(test()); +#endif + return 0; +} diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp index 1ab1aa60ab826..1aaddafe40cc7 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp @@ -140,5 +140,67 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. + // bit_ceil uses numeric_limits::digits, so only byte-aligned widths. 
+#if TEST_HAS_EXTENSION(bit_int) + { + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + assert(std::bit_ceil(T32(0)) == T32(1)); + assert(std::bit_ceil(T32(1)) == T32(1)); + assert(std::bit_ceil(T32(2)) == T32(2)); + assert(std::bit_ceil(T32(3)) == T32(4)); + assert(std::bit_ceil(T32(4)) == T32(4)); + assert(std::bit_ceil(T32(5)) == T32(8)); + assert(std::bit_ceil(T32(7)) == T32(8)); + assert(std::bit_ceil(T32(8)) == T32(8)); + assert(std::bit_ceil(T32(9)) == T32(16)); + assert(std::bit_ceil(T32(60)) == T32(64)); + assert(std::bit_ceil(T32(64)) == T32(64)); + assert(std::bit_ceil(T32(65)) == T32(128)); + assert(std::bit_ceil(T32(128)) == T32(128)); + assert(std::bit_ceil(T32(129)) == T32(256)); + assert(std::bit_ceil(T64(0)) == T64(1)); + assert(std::bit_ceil(T64(1)) == T64(1)); + assert(std::bit_ceil(T64(3)) == T64(4)); + assert(std::bit_ceil(T64(65)) == T64(128)); + assert(std::bit_ceil(T64(T64(1) << 62)) == T64(1) << 62); + assert(std::bit_ceil((T64(1) << 62) + 1) == T64(1) << 63); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T128 = unsigned _BitInt(128); + assert(std::bit_ceil(T128(0)) == T128(1)); + assert(std::bit_ceil(T128(1)) == T128(1)); + assert(std::bit_ceil(T128(3)) == T128(4)); + // Boundary around 64-bit limb. + assert(std::bit_ceil(T128(1) << 64) == T128(1) << 64); + assert(std::bit_ceil((T128(1) << 64) + 1) == T128(1) << 65); + // Near the top of the width. 
+ assert(std::bit_ceil(T128(1) << 126) == T128(1) << 126); + assert(std::bit_ceil((T128(1) << 126) + 1) == T128(1) << 127); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::bit_ceil(T256(0)) == T256(1)); + assert(std::bit_ceil(T256(1)) == T256(1)); + assert(std::bit_ceil(T256(2)) == T256(2)); + assert(std::bit_ceil(T256(3)) == T256(4)); + assert(std::bit_ceil(T256(7)) == T256(8)); + assert(std::bit_ceil(T256(127)) == T256(128)); + assert(std::bit_ceil(T256(128)) == T256(128)); + assert(std::bit_ceil(T256(129)) == T256(256)); + // Large value just below a power of two. + assert(std::bit_ceil(T256(1) << 128) == T256(1) << 128); + assert(std::bit_ceil((T256(1) << 128) + 1) == T256(1) << 129); + assert(std::bit_ceil(T256(1) << 200) == T256(1) << 200); + assert(std::bit_ceil((T256(1) << 200) + 1) == T256(1) << 201); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp index f243e9d1f63b5..07dae010b99fa 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp @@ -139,5 +139,68 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. + // bit_floor uses numeric_limits::digits via __bit_log2, so only + // byte-aligned widths are safe. 
+#if TEST_HAS_EXTENSION(bit_int) + { + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + assert(std::bit_floor(T32(0)) == T32(0)); + assert(std::bit_floor(T32(1)) == T32(1)); + assert(std::bit_floor(T32(2)) == T32(2)); + assert(std::bit_floor(T32(3)) == T32(2)); + assert(std::bit_floor(T32(4)) == T32(4)); + assert(std::bit_floor(T32(5)) == T32(4)); + assert(std::bit_floor(T32(7)) == T32(4)); + assert(std::bit_floor(T32(8)) == T32(8)); + assert(std::bit_floor(T32(9)) == T32(8)); + assert(std::bit_floor(T32(127)) == T32(64)); + assert(std::bit_floor(T32(128)) == T32(128)); + assert(std::bit_floor(T32(129)) == T32(128)); + assert(std::bit_floor(T32(255)) == T32(128)); + assert(std::bit_floor(T32(~T32(0))) == T32(T32(1) << 31)); + assert(std::bit_floor(T64(0)) == T64(0)); + assert(std::bit_floor(T64(1)) == T64(1)); + assert(std::bit_floor(T64(127)) == T64(64)); + assert(std::bit_floor(T64(128)) == T64(128)); + assert(std::bit_floor(T64(~T64(0))) == T64(T64(1) << 63)); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T128 = unsigned _BitInt(128); + assert(std::bit_floor(T128(0)) == T128(0)); + assert(std::bit_floor(T128(1)) == T128(1)); + // Boundary: values at and above 64-bit limb. + assert(std::bit_floor(T128(1) << 64) == T128(1) << 64); + assert(std::bit_floor((T128(1) << 64) - 1) == T128(1) << 63); + assert(std::bit_floor((T128(1) << 64) + 1) == T128(1) << 64); + assert(std::bit_floor(T128(~T128(0))) == T128(T128(1) << 127)); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::bit_floor(T256(0)) == T256(0)); + assert(std::bit_floor(T256(1)) == T256(1)); + assert(std::bit_floor(T256(2)) == T256(2)); + assert(std::bit_floor(T256(3)) == T256(2)); + assert(std::bit_floor(T256(7)) == T256(4)); + assert(std::bit_floor(T256(127)) == T256(64)); + assert(std::bit_floor(T256(128)) == T256(128)); + assert(std::bit_floor(T256(129)) == T256(128)); + // Boundary at 128-bit limb. 
+ assert(std::bit_floor((T256(1) << 128) - 1) == T256(1) << 127); + assert(std::bit_floor(T256(1) << 128) == T256(1) << 128); + assert(std::bit_floor((T256(1) << 128) + 1) == T256(1) << 128); + // Bits near the top. + assert(std::bit_floor(T256(1) << 200) == T256(1) << 200); + assert(std::bit_floor((T256(1) << 200) - 1) == T256(1) << 199); + assert(std::bit_floor(T256(~T256(0))) == T256(T256(1) << 255)); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp index e6a0cfb9d11e0..efba0dcd2b77b 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp @@ -142,5 +142,61 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. + // bit_width uses numeric_limits::digits via __bit_log2, so only + // byte-aligned widths are safe. +#if TEST_HAS_EXTENSION(bit_int) + { + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + assert(std::bit_width(T32(0)) == 0); + assert(std::bit_width(T32(1)) == 1); + assert(std::bit_width(T32(2)) == 2); + assert(std::bit_width(T32(3)) == 2); + assert(std::bit_width(T32(4)) == 3); + assert(std::bit_width(T32(7)) == 3); + assert(std::bit_width(T32(8)) == 4); + assert(std::bit_width(T32(9)) == 4); + assert(std::bit_width(T32(127)) == 7); + assert(std::bit_width(T32(128)) == 8); + assert(std::bit_width(T32(1024)) == 11); + assert(std::bit_width(T32(~T32(0) - 1)) == 32); + assert(std::bit_width(T32(~T32(0))) == 32); + assert(std::bit_width(T64(0)) == 0); + assert(std::bit_width(T64(1)) == 1); + assert(std::bit_width(T64(127)) == 7); + assert(std::bit_width(T64(128)) == 8); + assert(std::bit_width(T64(T64(1) << 63)) == 64); + assert(std::bit_width(T64(~T64(0))) == 64); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T128 = unsigned _BitInt(128); + 
assert(std::bit_width(T128(0)) == 0); + assert(std::bit_width(T128(1)) == 1); + // Bit at position 64 (just above 64-bit limb boundary). + assert(std::bit_width(T128(1) << 64) == 65); + assert(std::bit_width(T128(1) << 127) == 128); + assert(std::bit_width(T128(~T128(0))) == 128); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::bit_width(T256(0)) == 0); + assert(std::bit_width(T256(1)) == 1); + assert(std::bit_width(T256(127)) == 7); + assert(std::bit_width(T256(128)) == 8); + // Boundary: bit at position 128 (just above 128-bit limb). + assert(std::bit_width(T256(1) << 128) == 129); + assert(std::bit_width(T256(1) << 100) == 101); + assert(std::bit_width(T256(1) << 200) == 201); + assert(std::bit_width(T256(1) << 255) == 256); + assert(std::bit_width(T256(~T256(0))) == 256); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp index a1088218a35f0..6bab2b9f9069a 100644 --- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp @@ -140,5 +140,92 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. 
+#if TEST_HAS_EXTENSION(bit_int) + { + using T13 = unsigned _BitInt(13); + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + assert(!std::has_single_bit(T32(0))); + assert(std::has_single_bit(T32(1))); + assert(std::has_single_bit(T32(2))); + assert(!std::has_single_bit(T32(3))); + assert(std::has_single_bit(T32(4))); + assert(!std::has_single_bit(T32(5))); + assert(!std::has_single_bit(T32(6))); + assert(!std::has_single_bit(T32(7))); + assert(std::has_single_bit(T32(8))); + assert(!std::has_single_bit(T32(9))); + assert(std::has_single_bit(T32(128))); + assert(!std::has_single_bit(T32(127))); + assert(!std::has_single_bit(T32(129))); + assert(!std::has_single_bit(T32(~T32(0)))); + assert(!std::has_single_bit(T64(0))); + assert(std::has_single_bit(T64(1))); + assert(std::has_single_bit(T64(T64(1) << 32))); + assert(std::has_single_bit(T64(T64(1) << 63))); + assert(!std::has_single_bit(T64(~T64(0)))); + + // Odd widths: has_single_bit has no digits dependency. + assert(!std::has_single_bit(T13(0))); + assert(std::has_single_bit(T13(1))); + assert(std::has_single_bit(T13(2))); + assert(!std::has_single_bit(T13(3))); + assert(std::has_single_bit(T13(4))); + assert(std::has_single_bit(T13(64))); + assert(!std::has_single_bit(T13(65))); + assert(!std::has_single_bit(T13(~T13(0)))); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T77 = unsigned _BitInt(77); + using T128 = unsigned _BitInt(128); + assert(!std::has_single_bit(T77(0))); + assert(std::has_single_bit(T77(1))); + assert(std::has_single_bit(T77(2))); + assert(!std::has_single_bit(T77(3))); + assert(std::has_single_bit(T77(T77(1) << 76))); + assert(!std::has_single_bit(T77((T77(1) << 76) | T77(1)))); + assert(!std::has_single_bit(T77(~T77(0)))); + + assert(!std::has_single_bit(T128(0))); + assert(std::has_single_bit(T128(1))); + assert(std::has_single_bit(T128(T128(1) << 64))); + assert(std::has_single_bit(T128(T128(1) << 127))); + assert(!std::has_single_bit(T128(~T128(0)))); + // Two 
bits: definitely not a single bit. + assert(!std::has_single_bit(T128((T128(1) << 127) | T128(1)))); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T129 = unsigned _BitInt(129); + using T256 = unsigned _BitInt(256); + assert(std::has_single_bit(T129(1) << 128)); + assert(!std::has_single_bit(T129(~T129(0)))); + + assert(!std::has_single_bit(T256(0))); + assert(std::has_single_bit(T256(1))); + assert(std::has_single_bit(T256(1) << 200)); + assert(std::has_single_bit(T256(1) << 255)); + assert(!std::has_single_bit((T256(1) << 200) | T256(1))); + assert(!std::has_single_bit(T256(~T256(0)))); + assert(!std::has_single_bit(T256(~T256(0) / 3))); // 0x5555... = 128 bits + } +# endif +# if __BITINT_MAXWIDTH__ >= 4096 + { + using T4096 = unsigned _BitInt(4096); + assert(!std::has_single_bit(T4096(0))); + assert(std::has_single_bit(T4096(1))); + assert(std::has_single_bit(T4096(1) << 4095)); + assert(std::has_single_bit(T4096(1) << 2048)); + assert(!std::has_single_bit(T4096(~T4096(0)))); + assert(!std::has_single_bit((T4096(1) << 4095) | T4096(1))); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp index 82931162b4f39..39d5db1ed22a8 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp @@ -137,5 +137,68 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. +#if TEST_HAS_EXTENSION(bit_int) + { + using T13 = unsigned _BitInt(13); + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + // Byte-aligned widths: numeric_limits::digits is correct, so all + // values including all-ones are safe to test. 
+ assert(std::countl_one(T32(0)) == 0); + assert(std::countl_one(T32(1)) == 0); + assert(std::countl_one(T32(~T32(0))) == 32); + assert(std::countl_one(T32(~T32(0) - 1)) == 31); + assert(std::countl_one(T32(~T32(0) - 2)) == 30); + assert(std::countl_one(T32(~T32(0) - 8)) == 28); + assert(std::countl_one(T32(~T32(0) - 127)) == 25); + assert(std::countl_one(T32(~T32(0) - 128)) == 24); + assert(std::countl_one(T64(0)) == 0); + assert(std::countl_one(T64(~T64(0))) == 64); + assert(std::countl_one(T64(~T64(0) - 1)) == 63); + + // Odd widths: safe for values that are not all-ones. + assert(std::countl_one(T13(0)) == 0); + assert(std::countl_one(T13(1)) == 0); + assert(std::countl_one(T13(~T13(0) - 1)) == 12); + assert(std::countl_one(T13(~T13(0) - 2)) == 11); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T77 = unsigned _BitInt(77); + using T128 = unsigned _BitInt(128); + assert(std::countl_one(T77(0)) == 0); + assert(std::countl_one(T77(1)) == 0); + assert(std::countl_one(T77(~T77(0) - 1)) == 76); + + assert(std::countl_one(T128(0)) == 0); + assert(std::countl_one(T128(~T128(0))) == 128); + assert(std::countl_one(T128(~T128(0) - 1)) == 127); + // Clear a single bit at position 64: 63 leading ones. + assert(std::countl_one(T128(~T128(0) ^ (T128(1) << 64))) == 63); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::countl_one(T256(0)) == 0); + assert(std::countl_one(T256(~T256(0))) == 256); + assert(std::countl_one(T256(~T256(0) - 1)) == 255); + // Clear a single bit at position 100: 155 leading ones. + assert(std::countl_one(T256(~T256(0) ^ (T256(1) << 100))) == 155); + } +# endif +# if __BITINT_MAXWIDTH__ >= 4096 + { + using T4096 = unsigned _BitInt(4096); + assert(std::countl_one(T4096(0)) == 0); + assert(std::countl_one(T4096(~T4096(0))) == 4096); + // Clear a single bit at position 1000: 3095 leading ones. 
+ assert(std::countl_one(T4096(~T4096(0) ^ (T4096(1) << 1000))) == 3095); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp index 20e0eff91b253..a73175d51a201 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp @@ -136,5 +136,90 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. +#if TEST_HAS_EXTENSION(bit_int) + { + using T8 = unsigned _BitInt(8); + using T13 = unsigned _BitInt(13); + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + // Byte-aligned widths: numeric_limits::digits is correct, so all + // values including zero are safe to test. + assert(std::countl_zero(T8(0)) == 8); + assert(std::countl_zero(T8(1)) == 7); + assert(std::countl_zero(T8(2)) == 6); + assert(std::countl_zero(T8(3)) == 6); + assert(std::countl_zero(T8(4)) == 5); + assert(std::countl_zero(T8(8)) == 4); + assert(std::countl_zero(T8(127)) == 1); + assert(std::countl_zero(T8(128)) == 0); + assert(std::countl_zero(T8(~T8(0))) == 0); + assert(std::countl_zero(T32(0)) == 32); + assert(std::countl_zero(T32(1)) == 31); + assert(std::countl_zero(T32(2)) == 30); + assert(std::countl_zero(T32(3)) == 30); + assert(std::countl_zero(T32(127)) == 25); + assert(std::countl_zero(T32(128)) == 24); + assert(std::countl_zero(T32(~T32(0))) == 0); + assert(std::countl_zero(T64(0)) == 64); + assert(std::countl_zero(T64(1)) == 63); + assert(std::countl_zero(T64(T64(1) << 63)) == 0); + assert(std::countl_zero(T64(~T64(0))) == 0); + + // Odd widths: safe for nonzero inputs only (digits is the fallback + // for zero via __builtin_clzg). 
+ assert(std::countl_zero(T13(1)) == 12); + assert(std::countl_zero(T13(2)) == 11); + assert(std::countl_zero(T13(3)) == 11); + assert(std::countl_zero(T13(127)) == 6); + assert(std::countl_zero(T13(128)) == 5); + assert(std::countl_zero(T13(~T13(0))) == 0); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T77 = unsigned _BitInt(77); + using T128 = unsigned _BitInt(128); + assert(std::countl_zero(T77(1)) == 76); + assert(std::countl_zero(T77(T77(1) << 76)) == 0); + assert(std::countl_zero(T77(~T77(0))) == 0); + + assert(std::countl_zero(T128(0)) == 128); + assert(std::countl_zero(T128(1)) == 127); + assert(std::countl_zero(T128(T128(1) << 64)) == 63); + assert(std::countl_zero(T128(T128(1) << 127)) == 0); + assert(std::countl_zero(T128(~T128(0))) == 0); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T129 = unsigned _BitInt(129); + using T256 = unsigned _BitInt(256); + // Odd width around 128-bit limb boundary. + assert(std::countl_zero(T129(1)) == 128); + assert(std::countl_zero(T129(1) << 128) == 0); + assert(std::countl_zero(T129(~T129(0))) == 0); + + assert(std::countl_zero(T256(~T256(0))) == 0); + assert(std::countl_zero(T256(1)) == 255); + // Bit set at position 200: 55 leading zeros. + assert(std::countl_zero(T256(1) << 200) == 55); + // Bit at position 127 (just below 128-bit boundary): 128 leading zeros. + assert(std::countl_zero(T256(1) << 127) == 128); + // Bit at position 128 (just at 128-bit boundary): 127 leading zeros. 
+ assert(std::countl_zero(T256(1) << 128) == 127); + } +# endif +# if __BITINT_MAXWIDTH__ >= 4096 + { + using T4096 = unsigned _BitInt(4096); + assert(std::countl_zero(T4096(1)) == 4095); + assert(std::countl_zero(T4096(1) << 4095) == 0); + assert(std::countl_zero(T4096(1) << 2048) == 2047); + assert(std::countl_zero(T4096(~T4096(0))) == 0); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp index 1fedc4f8a5386..ba350a76d96af 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp @@ -141,5 +141,81 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. +#if TEST_HAS_EXTENSION(bit_int) + { + using T13 = unsigned _BitInt(13); + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + // Byte-aligned widths: numeric_limits::digits is correct, so all + // values including all-ones are safe to test. + assert(std::countr_one(T32(0)) == 0); + assert(std::countr_one(T32(1)) == 1); + assert(std::countr_one(T32(2)) == 0); + assert(std::countr_one(T32(3)) == 2); + assert(std::countr_one(T32(4)) == 0); + assert(std::countr_one(T32(5)) == 1); + assert(std::countr_one(T32(7)) == 3); + assert(std::countr_one(T32(15)) == 4); + assert(std::countr_one(T32(127)) == 7); + assert(std::countr_one(T32(128)) == 0); + assert(std::countr_one(T32(~T32(0) - 1)) == 0); + assert(std::countr_one(T32(~T32(0))) == 32); + assert(std::countr_one(T64(0)) == 0); + assert(std::countr_one(T64(1)) == 1); + assert(std::countr_one(T64(7)) == 3); + assert(std::countr_one(T64(~T64(0))) == 64); + + // Odd widths: safe for values that are not all-ones. 
+ assert(std::countr_one(T13(0)) == 0); + assert(std::countr_one(T13(1)) == 1); + assert(std::countr_one(T13(3)) == 2); + assert(std::countr_one(T13(7)) == 3); + assert(std::countr_one(T13(15)) == 4); + assert(std::countr_one(T13(127)) == 7); + assert(std::countr_one(T13(128)) == 0); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T77 = unsigned _BitInt(77); + using T128 = unsigned _BitInt(128); + assert(std::countr_one(T77(0)) == 0); + assert(std::countr_one(T77(1)) == 1); + assert(std::countr_one(T77(3)) == 2); + assert(std::countr_one(T77(7)) == 3); + assert(std::countr_one(T77(127)) == 7); + + assert(std::countr_one(T128(0)) == 0); + assert(std::countr_one(T128(1)) == 1); + assert(std::countr_one(T128(~T128(0))) == 128); + // Mask of low 64 bits: 64 trailing ones, then a zero. + assert(std::countr_one(T128((T128(1) << 64) - 1)) == 64); + // Mask of low 65 bits: 65 trailing ones (spans 64-bit boundary). + assert(std::countr_one(T128((T128(1) << 65) - 1)) == 65); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::countr_one(T256(0)) == 0); + assert(std::countr_one(T256(~T256(0))) == 256); + // Mask of low 128 bits: 128 trailing ones. + assert(std::countr_one(T256((T256(1) << 128) - 1)) == 128); + // Mask of low 200 bits: 200 trailing ones. + assert(std::countr_one(T256((T256(1) << 200) - 1)) == 200); + } +# endif +# if __BITINT_MAXWIDTH__ >= 4096 + { + using T4096 = unsigned _BitInt(4096); + assert(std::countr_one(T4096(0)) == 0); + assert(std::countr_one(T4096(~T4096(0))) == 4096); + // Mask of low 1000 bits: 1000 trailing ones. 
+ assert(std::countr_one(T4096((T4096(1) << 1000) - 1)) == 1000); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp index 4221b86fe1cc6..e7e9d6542ab86 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp @@ -138,5 +138,79 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. +#if TEST_HAS_EXTENSION(bit_int) + { + using T8 = unsigned _BitInt(8); + using T13 = unsigned _BitInt(13); + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + // Byte-aligned widths: numeric_limits::digits is correct, so all + // values including zero are safe to test. + assert(std::countr_zero(T8(0)) == 8); + assert(std::countr_zero(T8(1)) == 0); + assert(std::countr_zero(T8(2)) == 1); + assert(std::countr_zero(T8(3)) == 0); + assert(std::countr_zero(T8(4)) == 2); + assert(std::countr_zero(T8(8)) == 3); + assert(std::countr_zero(T8(128)) == 7); + assert(std::countr_zero(T8(~T8(0))) == 0); + assert(std::countr_zero(T32(0)) == 32); + assert(std::countr_zero(T32(1)) == 0); + assert(std::countr_zero(T32(2)) == 1); + assert(std::countr_zero(T32(4)) == 2); + assert(std::countr_zero(T32(126)) == 1); + assert(std::countr_zero(T32(128)) == 7); + assert(std::countr_zero(T32(1) << 31) == 31); + assert(std::countr_zero(T64(0)) == 64); + assert(std::countr_zero(T64(1)) == 0); + assert(std::countr_zero(T64(1) << 63) == 63); + + // Odd widths: safe for nonzero inputs only. 
+ assert(std::countr_zero(T13(1)) == 0); + assert(std::countr_zero(T13(2)) == 1); + assert(std::countr_zero(T13(4)) == 2); + assert(std::countr_zero(T13(128)) == 7); + assert(std::countr_zero(T13(1) << 12) == 12); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T77 = unsigned _BitInt(77); + using T128 = unsigned _BitInt(128); + assert(std::countr_zero(T77(1)) == 0); + assert(std::countr_zero(T77(2)) == 1); + assert(std::countr_zero(T77(1) << 76) == 76); + + assert(std::countr_zero(T128(0)) == 128); + assert(std::countr_zero(T128(1)) == 0); + assert(std::countr_zero(T128(T128(1) << 63)) == 63); + assert(std::countr_zero(T128(T128(1) << 64)) == 64); + assert(std::countr_zero(T128(1) << 127) == 127); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T129 = unsigned _BitInt(129); + using T256 = unsigned _BitInt(256); + assert(std::countr_zero(T129(1) << 128) == 128); + + assert(std::countr_zero(T256(1)) == 0); + assert(std::countr_zero(T256(1) << 127) == 127); + assert(std::countr_zero(T256(1) << 128) == 128); + assert(std::countr_zero(T256(1) << 200) == 200); + assert(std::countr_zero(T256(1) << 255) == 255); + } +# endif +# if __BITINT_MAXWIDTH__ >= 4096 + { + using T4096 = unsigned _BitInt(4096); + assert(std::countr_zero(T4096(1)) == 0); + assert(std::countr_zero(T4096(1) << 2048) == 2048); + assert(std::countr_zero(T4096(1) << 4095) == 4095); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp index a7c5c43a4e2c2..dc5cdf89f147b 100644 --- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp @@ -148,5 +148,114 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5: BITINT_MAXWIDTH is + // guaranteed to be >= ULLONG_WIDTH (>= 64). 
Anything beyond that is + // optional and must be guarded by __BITINT_MAXWIDTH__. +#if TEST_HAS_EXTENSION(bit_int) + { + // Guaranteed widths (<= 64 bits). + using T8 = unsigned _BitInt(8); + using T13 = unsigned _BitInt(13); + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + assert(std::popcount(T8(0)) == 0); + assert(std::popcount(T8(1)) == 1); + assert(std::popcount(T8(2)) == 1); + assert(std::popcount(T8(3)) == 2); + assert(std::popcount(T8(7)) == 3); + assert(std::popcount(T8(0x55)) == 4); + assert(std::popcount(T8(0xFF)) == 8); + + assert(std::popcount(T32(0)) == 0); + assert(std::popcount(T32(1)) == 1); + assert(std::popcount(T32(3)) == 2); + assert(std::popcount(T32(127)) == 7); + assert(std::popcount(T32(128)) == 1); + assert(std::popcount(T32(130)) == 2); + assert(std::popcount(T32(~T32(0))) == 32); + + assert(std::popcount(T64(0)) == 0); + assert(std::popcount(T64(1)) == 1); + assert(std::popcount(T64(127)) == 7); + assert(std::popcount(T64(~T64(0))) == 64); + assert(std::popcount(T64(~T64(0) >> 1)) == 63); + + // Odd (non-byte-aligned) widths: popcount has no digits dependency. 
+ assert(std::popcount(T13(0)) == 0); + assert(std::popcount(T13(1)) == 1); + assert(std::popcount(T13(3)) == 2); + assert(std::popcount(T13(7)) == 3); + assert(std::popcount(T13(127)) == 7); + assert(std::popcount(T13(128)) == 1); + assert(std::popcount(T13(~T13(0))) == 13); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T77 = unsigned _BitInt(77); + using T128 = unsigned _BitInt(128); + assert(std::popcount(T77(0)) == 0); + assert(std::popcount(T77(1)) == 1); + assert(std::popcount(T77(3)) == 2); + assert(std::popcount(T77(127)) == 7); + assert(std::popcount(T77(~T77(0))) == 77); + assert(std::popcount(T77(~T77(0) - 1)) == 76); + + assert(std::popcount(T128(0)) == 0); + assert(std::popcount(T128(1)) == 1); + assert(std::popcount(T128(~T128(0))) == 128); + assert(std::popcount(T128(~T128(0) - 1)) == 127); + // Alternating bit pattern: ~0 / 3 == 0x5555...5555 in any width. + assert(std::popcount(T128(~T128(0) / 3)) == 64); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T129 = unsigned _BitInt(129); + using T255 = unsigned _BitInt(255); + using T256 = unsigned _BitInt(256); + + // Odd widths at 128-bit boundary. + assert(std::popcount(T129(0)) == 0); + assert(std::popcount(T129(~T129(0))) == 129); + assert(std::popcount(T129(1) << 128) == 1); + assert(std::popcount(T255(~T255(0))) == 255); + assert(std::popcount(T255(1) << 254) == 1); + + assert(std::popcount(T256(0)) == 0); + assert(std::popcount(T256(~T256(0))) == 256); + // Alternating bit pattern: ~0 / 3 == 0x5555...5555 (128 bits set). + assert(std::popcount(T256(~T256(0) / 3)) == 128); + // (1 << 200) - 1 has exactly 200 bits set. + T256 mask200 = T256(1) << 200; + mask200 -= 1; + assert(std::popcount(mask200) == 200); + // Single high bit at position 255. + assert(std::popcount(T256(1) << 255) == 1); + // Two bits spanning the low/high halves. + assert(std::popcount(T256(1) | (T256(1) << 255)) == 2); + // Exactly 4 bits at positions 0, 64, 128, 255. 
+ T256 scattered = T256(1) | (T256(1) << 64) | (T256(1) << 128) | (T256(1) << 255); + assert(std::popcount(scattered) == 4); + // All ones minus a single bit. + assert(std::popcount(T256(~T256(0)) ^ (T256(1) << 200)) == 255); + } +# endif +# if __BITINT_MAXWIDTH__ >= 4096 + { + // Huge width exercises multi-limb iteration. + using T4096 = unsigned _BitInt(4096); + assert(std::popcount(T4096(0)) == 0); + assert(std::popcount(T4096(~T4096(0))) == 4096); + assert(std::popcount(T4096(~T4096(0) / 3)) == 2048); // alternating bits + assert(std::popcount(T4096(1) << 4095) == 1); + // 1000 bits set starting from position 0. + T4096 mask1000 = T4096(1) << 1000; + mask1000 -= 1; + assert(std::popcount(mask1000) == 1000); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp index 72e412772fb08..e9859ac6398b3 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp @@ -165,5 +165,61 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. + // rotl uses numeric_limits::digits internally, so only byte-aligned + // widths are safe (where digits matches the actual bit width). +#if TEST_HAS_EXTENSION(bit_int) + { + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + T32 m32 = ~T32(0); + assert(std::rotl(T32(1), 0) == T32(1)); + assert(std::rotl(T32(1), 1) == T32(2)); + assert(std::rotl(T32(1), 4) == T32(16)); + assert(std::rotl(T32(1), 7) == T32(128)); + assert(std::rotl(T32(128), -1) == T32(64)); + assert(std::rotl(T32(128), -7) == T32(1)); + assert(std::rotl(T32(m32 - 1), 0) == T32(m32 - 1)); + assert(std::rotl(T32(m32 - 1), 1) == T32(m32 - 2)); + assert(std::rotl(T32(m32 - 1), 4) == T32(m32 - 16)); + // Full rotation returns original. 
+ assert(std::rotl(T32(1), 32) == T32(1)); + + assert(std::rotl(T64(1), 0) == T64(1)); + assert(std::rotl(T64(1), 4) == T64(16)); + assert(std::rotl(T64(1), -1) == (T64(1) << 63)); + assert(std::rotl(T64(1), 64) == T64(1)); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T128 = unsigned _BitInt(128); + assert(std::rotl(T128(1), 0) == T128(1)); + assert(std::rotl(T128(1), 4) == T128(16)); + assert(std::rotl(T128(1), 63) == (T128(1) << 63)); + assert(std::rotl(T128(1), 64) == (T128(1) << 64)); + assert(std::rotl(T128(1), -1) == (T128(1) << 127)); + assert(std::rotl(T128(1), 128) == T128(1)); + // Multi-bit wrap-around across limb boundary. + assert(std::rotl(T128(3) << 62, 4) == T128(3) << 66); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::rotl(T256(1), 0) == T256(1)); + assert(std::rotl(T256(1), 4) == T256(16)); + assert(std::rotl(T256(1), 200) == (T256(1) << 200)); + assert(std::rotl(T256(1), -1) == (T256(1) << 255)); + assert(std::rotl(T256(1), 256) == T256(1)); + assert(std::rotl(T256(~T256(0) - 1), 1) == T256(~T256(0) - 2)); + // Wrap-around: rotate a high bit to low. + assert(std::rotl(T256(1) << 255, 1) == T256(1)); + // Modulo: rotation amount larger than width. + assert(std::rotl(T256(1), 256 + 4) == T256(1) << 4); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp index fc0fff60394e3..428e11dba4969 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp +++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp @@ -165,5 +165,61 @@ int main(int, char**) test(); test(); + // _BitInt tests. Width tiers follow C23 7.18.2.5. + // rotr uses numeric_limits::digits internally, so only byte-aligned + // widths are safe. 
+#if TEST_HAS_EXTENSION(bit_int) + { + using T32 = unsigned _BitInt(32); + using T64 = unsigned _BitInt(64); + + T32 m32 = ~T32(0); + T32 h32 = T32(1) << 31; + assert(std::rotr(T32(1), 0) == T32(1)); + assert(std::rotr(T32(16), 4) == T32(1)); + assert(std::rotr(T32(128), 1) == T32(64)); + assert(std::rotr(T32(128), 7) == T32(1)); + assert(std::rotr(T32(1), -1) == T32(2)); + assert(std::rotr(T32(1), -7) == T32(128)); + assert(std::rotr(T32(m32 - 1), 0) == T32(m32 - 1)); + assert(std::rotr(T32(m32 - 1), 1) == T32(m32 - h32)); + // Full rotation returns original. + assert(std::rotr(T32(1), 32) == T32(1)); + + assert(std::rotr(T64(1), 0) == T64(1)); + assert(std::rotr(T64(16), 4) == T64(1)); + assert(std::rotr(T64(1), -1) == T64(2)); + assert(std::rotr(T64(1), 64) == T64(1)); + } +# if __BITINT_MAXWIDTH__ >= 128 + { + using T128 = unsigned _BitInt(128); + assert(std::rotr(T128(1), 0) == T128(1)); + assert(std::rotr(T128(16), 4) == T128(1)); + assert(std::rotr(T128(1), -1) == T128(2)); + assert(std::rotr(T128(1), 128) == T128(1)); + // Wrap low bit to high position. + assert(std::rotr(T128(1), 1) == T128(1) << 127); + // Multi-bit rotation across the 64-bit boundary. + assert(std::rotr(T128(3), 2) == ((T128(1) << 127) | (T128(1) << 126))); + } +# endif +# if __BITINT_MAXWIDTH__ >= 256 + { + using T256 = unsigned _BitInt(256); + assert(std::rotr(T256(1), 0) == T256(1)); + assert(std::rotr(T256(16), 4) == T256(1)); + assert(std::rotr(T256(1) << 200, 200) == T256(1)); + assert(std::rotr(T256(1), -1) == T256(2)); + assert(std::rotr(T256(1), 256) == T256(1)); + assert(std::rotr(T256(~T256(0) - 1), 1) == T256(~T256(0) - (T256(1) << 255))); + // Wrap low bit to highest. + assert(std::rotr(T256(1), 1) == T256(1) << 255); + // Modulo: rotation amount larger than width. 
+ assert(std::rotr(T256(1), 256 + 4) == T256(1) << 252); + } +# endif +#endif // TEST_HAS_EXTENSION(bit_int) + return 0; } diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp new file mode 100644 index 0000000000000..a4c68b0d582ad --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturating.bitint.pass.cpp @@ -0,0 +1,227 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// + +// add_sat, sub_sat, mul_sat, div_sat, saturate_cast applied to _BitInt(N). +// +// After [libc++] recognized _BitInt as an integer type in +// __type_traits/integer_traits.h, these functions silently started +// accepting _BitInt arguments. Saturation at min/max depends on +// numeric_limits<_BitInt(N)>::min/max being correct, which requires the +// digits10 fix from #193002 for odd widths. +// +// Widths covered: +// - _BitInt(13): odd narrow width, signed range -4096..4095. +// Exercises fixed digits10 for saturation clamp. +// - _BitInt(64): equal to long long, integer_traits boundary. +// - _BitInt(128): matches __int128 on targets that support it. +// - _BitInt(200): beyond __int128 (optional via __BITINT_MAXWIDTH__). + +#include +#include +#include + +#include "test_macros.h" + +#if TEST_HAS_EXTENSION(bit_int) + +template +constexpr bool test_signed_add_sub() { + constexpr T min_v = std::numeric_limits::min(); + constexpr T max_v = std::numeric_limits::max(); + + // Basic: no overflow. 
+ assert(std::add_sat(T(1), T(2)) == T(3)); + assert(std::add_sat(T(-1), T(1)) == T(0)); + assert(std::sub_sat(T(5), T(3)) == T(2)); + assert(std::sub_sat(T(-1), T(-1)) == T(0)); + + // Positive overflow clamps to max. + assert(std::add_sat(max_v, T(1)) == max_v); + assert(std::add_sat(T(1), max_v) == max_v); + assert(std::add_sat(max_v, max_v) == max_v); + + // Negative overflow clamps to min. + assert(std::add_sat(min_v, T(-1)) == min_v); + assert(std::add_sat(T(-1), min_v) == min_v); + assert(std::add_sat(min_v, min_v) == min_v); + + // sub_sat positive overflow (x >= 0, y < 0). + assert(std::sub_sat(max_v, T(-1)) == max_v); + assert(std::sub_sat(max_v, min_v) == max_v); + + // sub_sat negative overflow (x < 0, y > 0). + assert(std::sub_sat(min_v, T(1)) == min_v); + assert(std::sub_sat(min_v, max_v) == min_v); + + return true; +} + +template +constexpr bool test_unsigned_add_sub() { + constexpr T max_v = std::numeric_limits::max(); + + // Basic. + assert(std::add_sat(T(1), T(2)) == T(3)); + assert(std::sub_sat(T(5), T(3)) == T(2)); + + // Upper clamp. + assert(std::add_sat(max_v, T(1)) == max_v); + assert(std::add_sat(T(1), max_v) == max_v); + assert(std::add_sat(max_v, max_v) == max_v); + + // Lower clamp (wrap-to-zero on unsigned). + assert(std::sub_sat(T(0), T(1)) == T(0)); + assert(std::sub_sat(T(0), max_v) == T(0)); + assert(std::sub_sat(T(3), T(5)) == T(0)); + + return true; +} + +template +constexpr bool test_signed_mul_div() { + constexpr T min_v = std::numeric_limits::min(); + constexpr T max_v = std::numeric_limits::max(); + + // Basic mul. + assert(std::mul_sat(T(2), T(3)) == T(6)); + assert(std::mul_sat(T(-2), T(3)) == T(-6)); + + // Overflow to max. + assert(std::mul_sat(max_v, T(2)) == max_v); + assert(std::mul_sat(T(-1), min_v) == max_v); // -(-min) overflows to +max + assert(std::mul_sat(min_v, T(-1)) == max_v); + + // Overflow to min. 
+ assert(std::mul_sat(max_v, T(-2)) == min_v); + assert(std::mul_sat(T(-2), max_v) == min_v); + + // div_sat: regular values. + assert(std::div_sat(T(6), T(3)) == T(2)); + assert(std::div_sat(T(7), T(3)) == T(2)); + assert(std::div_sat(T(-6), T(3)) == T(-2)); + + // The one signed division overflow case: INT_MIN / -1. + assert(std::div_sat(min_v, T(-1)) == max_v); + + return true; +} + +template +constexpr bool test_unsigned_mul_div() { + constexpr T max_v = std::numeric_limits::max(); + + assert(std::mul_sat(T(2), T(3)) == T(6)); + assert(std::mul_sat(max_v, T(2)) == max_v); // clamp + assert(std::mul_sat(T(0), max_v) == T(0)); + assert(std::mul_sat(max_v, max_v) == max_v); + + assert(std::div_sat(T(10), T(3)) == T(3)); + assert(std::div_sat(max_v, T(1)) == max_v); + return true; +} + +template +constexpr bool test_saturate_cast() { + constexpr S s_min = std::numeric_limits::min(); + constexpr S s_max = std::numeric_limits::max(); + constexpr U u_max = std::numeric_limits::max(); + + // Same-type: no clamp. + assert(std::saturate_cast(S(0)) == S(0)); + assert(std::saturate_cast(s_max) == s_max); + assert(std::saturate_cast(s_min) == s_min); + assert(std::saturate_cast(U(0)) == U(0)); + assert(std::saturate_cast(u_max) == u_max); + + // Signed -> unsigned: negative clamps to zero. + assert(std::saturate_cast(S(-1)) == U(0)); + assert(std::saturate_cast(s_min) == U(0)); + assert(std::saturate_cast(S(1)) == U(1)); + + // Unsigned -> signed: overflow clamps to s_max. + assert(std::saturate_cast(u_max) == s_max); + + return true; +} + +constexpr bool test() { + // Guaranteed width (<= 64). 
+ test_signed_add_sub<_BitInt(13)>(); + test_unsigned_add_sub(); + test_signed_mul_div<_BitInt(13)>(); + test_unsigned_mul_div(); + test_saturate_cast<_BitInt(13), unsigned _BitInt(13)>(); + + test_signed_add_sub<_BitInt(64)>(); + test_unsigned_add_sub(); + test_signed_mul_div<_BitInt(64)>(); + test_unsigned_mul_div(); + test_saturate_cast<_BitInt(64), unsigned _BitInt(64)>(); + + // Cross-width saturate_cast: wide source clamped into narrow target. + { + using S13 = _BitInt(13); + using S64 = _BitInt(64); + using U13 = unsigned _BitInt(13); + using U64 = unsigned _BitInt(64); + + // wide signed -> narrow signed + assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + assert(std::saturate_cast(std::numeric_limits::min()) == std::numeric_limits::min()); + // wide unsigned -> narrow signed + assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + // wide signed -> narrow unsigned + assert(std::saturate_cast(std::numeric_limits::min()) == U13{0}); + assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + // exact-fit no clamp + assert(std::saturate_cast(S13{-1}) == S64{-1}); + assert(std::saturate_cast(U13{42}) == U64{42}); + } + +# if __BITINT_MAXWIDTH__ >= 128 + test_signed_add_sub<_BitInt(128)>(); + test_unsigned_add_sub(); + test_signed_mul_div<_BitInt(128)>(); + test_unsigned_mul_div(); + test_saturate_cast<_BitInt(128), unsigned _BitInt(128)>(); +# endif + +# if __BITINT_MAXWIDTH__ >= 200 + // Beyond __int128: exercises the overflow-detection fallback on widths + // with no builtin add/sub/mul_sat mapping. + test_signed_add_sub<_BitInt(200)>(); + test_unsigned_add_sub(); + test_signed_mul_div<_BitInt(200)>(); + test_unsigned_mul_div(); + test_saturate_cast<_BitInt(200), unsigned _BitInt(200)>(); + + // Cross-width between 128- and 200-bit widths. 
+ { + using S200 = _BitInt(200); + using S128 = _BitInt(128); + assert(std::saturate_cast(std::numeric_limits::max()) == std::numeric_limits::max()); + assert(std::saturate_cast(std::numeric_limits::min()) == std::numeric_limits::min()); + } +# endif + + return true; +} + +#endif // TEST_HAS_EXTENSION(bit_int) + +int main(int, char**) { +#if TEST_HAS_EXTENSION(bit_int) + test(); + static_assert(test()); +#endif + return 0; +} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp new file mode 100644 index 0000000000000..52107b8b91527 --- /dev/null +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.bitint.verify.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// make_format_args with _BitInt(N) wider than __int128 is unsupported. +// +// After [libc++] recognized _BitInt as an integer type in +// __type_traits/integer_traits.h, format_arg_store's __determine_arg_t +// dispatches on sizeof(_Tp) and maps _BitInt up to sizeof(__int128) onto +// the i128 storage slot. For wider _BitInt (sizeof > sizeof(__int128)), +// no storage slot exists and a static_assert fires. +// +// This test pins down that diagnostic so that if the dispatch ever changes +// to silently accept a wider type (or drops the diagnostic), the test +// breaks and forces a reconsideration. 
+ +#include + +#include "test_macros.h" + +#if TEST_HAS_EXTENSION(bit_int) && __BITINT_MAXWIDTH__ >= 129 + +void f_signed() { + // _BitInt(129) has sizeof == 32 on x86-64 (first size wider than __int128). + _BitInt(129) value = 0; + // expected-error-re@*:* {{{{(static assertion|static_assert)}} failed{{.*}}"an unsupported signed integer was used"}} + (void)std::make_format_args(value); +} + +void f_unsigned() { + unsigned _BitInt(129) value = 0; + // expected-error-re@*:* {{{{(static assertion|static_assert)}} failed{{.*}}"an unsupported unsigned integer was used"}} + (void)std::make_format_args(value); +} + +# if __BITINT_MAXWIDTH__ >= 256 +void f_signed_256() { + _BitInt(256) value = 0; + // expected-error-re@*:* {{{{(static assertion|static_assert)}} failed{{.*}}"an unsupported signed integer was used"}} + (void)std::make_format_args(value); +} +# endif + +#else +// When _BitInt is unavailable or the implementation limits preclude the +// test, keep the file well-formed with a trivial positive expectation so +// the driver does not fail. +// expected-no-diagnostics +#endif diff --git a/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp b/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp new file mode 100644 index 0000000000000..f96ac1c9f7a32 --- /dev/null +++ b/libcxx/test/std/utilities/utility/utility.intcmp/intcmp.bitint.pass.cpp @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// cmp_equal, cmp_not_equal, cmp_less, cmp_less_equal, cmp_greater, +// cmp_greater_equal, in_range applied to _BitInt(N). 
+// +// Exercises the four implementation branches of cmp_less/cmp_equal: +// 1. same-signedness shortcut (__t < __u) +// 2. both promote to int (branch via int) +// 3. both promote to long long (branch via long long) +// 4. fallback using make_unsigned_t (wider than long long) +// +// _BitInt widths chosen to land in each branch: +// - _BitInt(7) sizeof==1, < sizeof(int) -> branch 2 +// - _BitInt(13) sizeof==2 -> branch 2 +// - _BitInt(32) sizeof==4 == sizeof(int) -> branch 2 (signed) / 3 (unsigned) +// - _BitInt(33) sizeof==8 -> branch 3 +// - _BitInt(63) sizeof==8 -> branch 3 +// - _BitInt(65) sizeof==16 -> branch 4 +// - _BitInt(128) sizeof==16 -> branch 4 +// - _BitInt(200) sizeof==32 (requires __BITINT_MAXWIDTH__ >= 200) +// -> branch 4 + +#include +#include +#include + +#include "test_macros.h" + +#if TEST_HAS_EXTENSION(bit_int) + +template +constexpr bool test_same_sign() { + // Branch 1: same signedness. Trivial equality/ordering. + static_assert(std::cmp_equal(T(0), U(0))); + static_assert(std::cmp_equal(T(42), U(42))); + static_assert(!std::cmp_equal(T(0), U(1))); + static_assert(std::cmp_less(T(0), U(1))); + static_assert(!std::cmp_less(T(1), U(0))); + static_assert(std::cmp_less_equal(T(1), U(1))); + static_assert(std::cmp_greater_equal(T(1), U(1))); + static_assert(std::cmp_not_equal(T(0), U(1))); + return true; +} + +template +constexpr bool test_mixed_sign() { + // Signed vs unsigned of the SAME width: negative signed values must + // compare less than any unsigned value, regardless of the promotion + // branch chosen. 
+ constexpr auto s_min = std::numeric_limits::min(); + constexpr auto u_max = std::numeric_limits::max(); + + static_assert(std::cmp_less(S(-1), U(0))); + static_assert(!std::cmp_equal(S(-1), U(-1))); // U(-1) wraps to u_max + static_assert(std::cmp_less(s_min, U(0))); + static_assert(std::cmp_greater(u_max, S(0))); + static_assert(std::cmp_greater(u_max, s_min)); + static_assert(std::cmp_less_equal(S(-1), U(0))); + static_assert(std::cmp_greater_equal(U(0), S(-1))); + + // Equal-value mixed-sign: a non-negative signed value must compare + // equal to the corresponding unsigned value. + static_assert(std::cmp_equal(S(7), U(7))); + static_assert(std::cmp_equal(U(7), S(7))); + return true; +} + +template +constexpr bool test_in_range() { + // in_range relies on numeric_limits<_Tp>::min/max, which requires + // the digits10 fix (#193002) to be correct for odd _BitInt widths. + + // Signed target: value in range. + static_assert(std::in_range(S(0))); + static_assert(std::in_range(std::numeric_limits::max())); + static_assert(std::in_range(std::numeric_limits::min())); + // Signed target: value out of range via a wider unsigned source. + static_assert(!std::in_range(std::numeric_limits::max())); + // Unsigned target: negative signed value is out of range. + static_assert(!std::in_range(S(-1))); + // Unsigned target: zero is in range. + static_assert(std::in_range(S(0))); + static_assert(std::in_range(std::numeric_limits::max())); + return true; +} + +constexpr bool test() { + // Branch 2 territory (sizeof <= sizeof(int)). 
+ test_same_sign<_BitInt(7), _BitInt(7)>(); + test_same_sign(); + test_same_sign<_BitInt(13), _BitInt(13)>(); + test_mixed_sign<_BitInt(7), unsigned _BitInt(7)>(); + test_mixed_sign<_BitInt(13), unsigned _BitInt(13)>(); + test_in_range<_BitInt(7), unsigned _BitInt(7)>(); + test_in_range<_BitInt(13), unsigned _BitInt(13)>(); + + // Equal-sizeof-as-int boundary: signed _BitInt(32) can promote to int, + // unsigned _BitInt(32) cannot (would lose the high bit), so it falls + // into branch 3. + test_same_sign<_BitInt(32), _BitInt(32)>(); + test_same_sign(); + test_mixed_sign<_BitInt(32), unsigned _BitInt(32)>(); + test_in_range<_BitInt(32), unsigned _BitInt(32)>(); + + // Branch 3 territory (sizeof <= sizeof(long long)). + test_same_sign<_BitInt(33), _BitInt(33)>(); + test_same_sign<_BitInt(63), _BitInt(63)>(); + test_same_sign(); + test_mixed_sign<_BitInt(33), unsigned _BitInt(33)>(); + test_mixed_sign<_BitInt(63), unsigned _BitInt(63)>(); + test_in_range<_BitInt(33), unsigned _BitInt(33)>(); + test_in_range<_BitInt(63), unsigned _BitInt(63)>(); + + // Equal-sizeof-as-long-long boundary: _BitInt(64) signed promotes, + // unsigned _BitInt(64) does not, so the mixed-sign case lands in + // branch 4. + test_same_sign<_BitInt(64), _BitInt(64)>(); + test_same_sign(); + test_mixed_sign<_BitInt(64), unsigned _BitInt(64)>(); + test_in_range<_BitInt(64), unsigned _BitInt(64)>(); + +# if __BITINT_MAXWIDTH__ >= 128 + // Branch 4 territory (sizeof > sizeof(long long)). + test_same_sign<_BitInt(65), _BitInt(65)>(); + test_same_sign<_BitInt(128), _BitInt(128)>(); + test_same_sign(); + test_mixed_sign<_BitInt(65), unsigned _BitInt(65)>(); + test_mixed_sign<_BitInt(128), unsigned _BitInt(128)>(); + test_in_range<_BitInt(65), unsigned _BitInt(65)>(); + test_in_range<_BitInt(128), unsigned _BitInt(128)>(); +# endif + +# if __BITINT_MAXWIDTH__ >= 200 + // Beyond __int128: verifies make_unsigned_t<_BitInt(N)> works on the + // fallback path for widths with no builtin mapping. 
+ test_same_sign<_BitInt(200), _BitInt(200)>(); + test_same_sign(); + test_mixed_sign<_BitInt(200), unsigned _BitInt(200)>(); + test_in_range<_BitInt(200), unsigned _BitInt(200)>(); +# endif + + // Cross-width: narrow signed _BitInt vs wide unsigned builtin. + // Negative source must be reported as less than any non-negative target. + static_assert(std::cmp_less(_BitInt(7)(-1), 0ull)); + static_assert(std::cmp_less(_BitInt(13)(-1), 0u)); + static_assert(std::cmp_less(_BitInt(63)(-1), 0ull)); + // Cross-type round-trip equality. + static_assert(std::cmp_equal(_BitInt(13)(42), 42)); + static_assert(std::cmp_equal(42, _BitInt(13)(42))); + static_assert(std::cmp_equal(unsigned _BitInt(13)(42), 42u)); + + return true; +} + +#endif // TEST_HAS_EXTENSION(bit_int) + +int main(int, char**) { +#if TEST_HAS_EXTENSION(bit_int) + test(); + static_assert(test()); +#endif + return 0; +} From 8b9cfeac98b99c3edfe0b5f71c8a11044750e50b Mon Sep 17 00:00:00 2001 From: Liao Chunyu Date: Fri, 8 May 2026 16:07:21 +0800 Subject: [PATCH 005/538] [RISCV] Add support for Ziccid 1.0 (#196459) No codegen and instruction. It may be ratified in the future. 
https://github.com/riscv/riscv-isa-manual/pull/2598 --- clang/test/Driver/print-supported-extensions-riscv.c | 1 + clang/test/Preprocessor/riscv-target-features.c | 9 +++++++++ llvm/docs/RISCVUsage.rst | 1 + llvm/docs/ReleaseNotes.md | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 4 ++++ llvm/test/CodeGen/RISCV/attributes.ll | 2 ++ llvm/test/CodeGen/RISCV/features-info.ll | 1 + llvm/test/MC/RISCV/attribute-arch.s | 3 +++ llvm/unittests/TargetParser/RISCVISAInfoTest.cpp | 1 + 9 files changed, 23 insertions(+) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 83920246d5dad..d0a773f8f43d8 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -21,6 +21,7 @@ // CHECK-NEXT: zicboz 1.0 'Zicboz' (Cache-Block Zero Instructions) // CHECK-NEXT: ziccamoa 1.0 'Ziccamoa' (Main Memory Supports All Atomics in A) // CHECK-NEXT: ziccamoc 1.0 'Ziccamoc' (Main Memory Supports Atomics in Zacas) +// CHECK-NEXT: ziccid 1.0 'Ziccid' (Instruction/Data Coherence and Consistency) // CHECK-NEXT: ziccif 1.0 'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement) // CHECK-NEXT: zicclsm 1.0 'Zicclsm' (Main Memory Supports Misaligned Loads/Stores) // CHECK-NEXT: ziccrse 1.0 'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 3882f2889eb59..6682948869f94 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -116,6 +116,7 @@ // CHECK-NOT: __riscv_zicboz {{.*$}} // CHECK-NOT: __riscv_ziccamoa {{.*$}} // CHECK-NOT: __riscv_ziccamoc {{.*$}} +// CHECK-NOT: __riscv_ziccid {{.*$}} // CHECK-NOT: __riscv_ziccif {{.*$}} // CHECK-NOT: __riscv_zicclsm {{.*$}} // CHECK-NOT: __riscv_ziccrse {{.*$}} @@ -897,6 +898,14 @@ // RUN: -o - | FileCheck 
--check-prefix=CHECK-ZICCAMOC-EXT %s // CHECK-ZICCAMOC-EXT: __riscv_ziccamoc 1000000{{$}} +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32iziccid -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCID-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64iziccid -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCID-EXT %s +// CHECK-ZICCID-EXT: __riscv_ziccid 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32iziccif -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICCIF-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 2c8805f5fe796..2b68827e7b136 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -206,6 +206,7 @@ on support follow. ``Zicboz`` Assembly Support ``Ziccamoa`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Ziccamoc`` Supported (`See note <#riscv-profiles-extensions-note>`__) + ``Ziccid`` Supported ``Ziccif`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zicclsm`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Ziccrse`` Supported (`See note <#riscv-profiles-extensions-note>`__) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 424e67b8b4235..ec613d64e20a3 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -202,6 +202,7 @@ Makes programs 10x faster by doing Special New Thing. * `-mcpu=sifive-x160` and `-mcpu=sifive-x180` were added. * Support for the experimental `XRivosVisni` vendor extension has been removed. * Adds experimental assembler support for the 'Zvvmm` (RISC-V Integer Matrix Multiply-Accumulate) extension. +* Adds support for 'Ziccid' (Instruction/Data Coherence and Consistency) extension. 
### Changes to the WebAssembly Backend diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index b905870a482ff..a92fb63e3c3cd 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -116,6 +116,10 @@ def FeatureStdExtZiccif : RISCVExtension<1, 0, "Main Memory Supports Instruction Fetch with Atomicity Requirement">; +def FeatureStdExtZiccid + : RISCVExtension<1, 0, + "Instruction/Data Coherence and Consistency", [FeatureStdExtZiccif]>; + def FeatureStdExtZicclsm : RISCVExtension<1, 0, "Main Memory Supports Misaligned Loads/Stores">; diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 9da6766692e99..2c1ae20ee44c1 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -242,6 +242,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zcmt %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCMT %s ; RUN: llc -mtriple=riscv64 -mattr=+ziccamoa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCAMOA %s ; RUN: llc -mtriple=riscv64 -mattr=+ziccamoc %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCAMOC %s +; RUN: llc -mtriple=riscv64 -mattr=+ziccid %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCID %s ; RUN: llc -mtriple=riscv64 -mattr=+ziccif %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCIF %s ; RUN: llc -mtriple=riscv64 -mattr=+zicclsm %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCLSM %s ; RUN: llc -mtriple=riscv64 -mattr=+ziccrse %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCRSE %s @@ -563,6 +564,7 @@ ; RV64ZCMT: .attribute 5, "rv64i2p1_c2p0_zicsr2p0_zca1p0_zcmt1p0" ; RV64ZICCAMOA: .attribute 5, "rv64i2p1_ziccamoa1p0" ; RV64ZICCAMOC: .attribute 5, "rv64i2p1_ziccamoc1p0" +; RV64ZICCID: .attribute 5, "rv64i2p1_ziccid1p0_ziccif1p0" ; RV64ZICCIF: .attribute 5, "rv64i2p1_ziccif1p0" ; RV64ZICCLSM: .attribute 5, "rv64i2p1_zicclsm1p0" ; RV64ZICCRSE: .attribute 5, "rv64i2p1_ziccrse1p0" diff --git 
a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 92e033cb90dc9..d4b920c08a096 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -305,6 +305,7 @@ ; CHECK-NEXT: zicboz - 'Zicboz' (Cache-Block Zero Instructions). ; CHECK-NEXT: ziccamoa - 'Ziccamoa' (Main Memory Supports All Atomics in A). ; CHECK-NEXT: ziccamoc - 'Ziccamoc' (Main Memory Supports Atomics in Zacas). +; CHECK-NEXT: ziccid - 'Ziccid' (Instruction/Data Coherence and Consistency). ; CHECK-NEXT: ziccif - 'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement). ; CHECK-NEXT: zicclsm - 'Zicclsm' (Main Memory Supports Misaligned Loads/Stores). ; CHECK-NEXT: ziccrse - 'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences). diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 241fac8f2bd17..ef5e091e7e41e 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -109,6 +109,9 @@ .attribute arch, "rv32iziccamoc" # CHECK: attribute 5, "rv32i2p1_ziccamoc1p0" +.attribute arch, "rv32iziccid" +# CHECK: attribute 5, "rv32i2p1_ziccid1p0_ziccif1p0" + .attribute arch, "rv32iziccif" # CHECK: attribute 5, "rv32i2p1_ziccif1p0" diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index e96e2d9d462dc..d28066d423b77 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1152,6 +1152,7 @@ R"(All available -march extensions for RISC-V zicboz 1.0 ziccamoa 1.0 ziccamoc 1.0 + ziccid 1.0 ziccif 1.0 zicclsm 1.0 ziccrse 1.0 From 59aaa538c60212702a954caf7e1817c2c1810c07 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 8 May 2026 09:25:19 +0100 Subject: [PATCH 006/538] [AArch64] Reflect cost of integer sub-reductions. 
(#194594) The cost of sub-reductions is either the cost of *mlslb + *mlslt, or the cost of a dot operation with 2 negations: ``` partial_reduce_umls acc, lhs, rhs <=> -partial_reduce_umla -acc, lhs, rhs ``` (codegen for this was added by #186809) The cost-model was previously a bit of a hack, since sub-reductions were expanded and therefore expensive, although we made the expansion cost artifically cheaper so that it would still be a candidate for cdot instructions. --- .../AArch64/AArch64TargetTransformInfo.cpp | 44 +- .../AArch64/partial-reduce-chained.ll | 14 +- .../AArch64/partial-reduce-costs.ll | 666 ++++++++++++++++++ .../AArch64/partial-reduce-sub-sdot.ll | 4 +- 4 files changed, 699 insertions(+), 29 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-costs.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index aff89e00523c0..3685177328f12 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -6047,26 +6047,32 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( bool IsSub = Opcode == Instruction::Sub; InstructionCost Cost = InputLT.first * TTI::TCC_Basic; + // Integer partial sub-reductions that don't map to a specific instruction, + // carry an extra cost for implementing a double negation: + // partial_reduce_umls acc, lhs, rhs + // <=> -partial_reduce_umla -acc, lhs, rhs + InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0; if (AccumLT.second.getScalarType() == MVT::i32 && - InputLT.second.getScalarType() == MVT::i8 && !IsSub) { + InputLT.second.getScalarType() == MVT::i8) { // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE. 
if (!IsUSDot && IsSupported(true, ST->hasDotProd())) - return Cost; + return Cost + INegCost; // i8 -> i32 usdot requires +i8mm if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8())) - return Cost; + return Cost + INegCost; } - if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot && !IsSub) { + if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) { // i16 -> i64 is natively supported for udot/sdot if (AccumLT.second.getScalarType() == MVT::i64 && InputLT.second.getScalarType() == MVT::i16) - return Cost; - // i16 -> i32 is natively supported with SVE2p1 + return Cost + INegCost; + // i16 -> i32 is natively supported with SVE2p1 udot/sdot. + // For sub-reductions, we prefer using the *mlslb/t instructions. if (AccumLT.second.getScalarType() == MVT::i32 && InputLT.second.getScalarType() == MVT::i16 && - (ST->hasSVE2p1() || ST->hasSME2())) + (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub) return Cost; // i8 -> i64 is supported with an extra level of extends if (AccumLT.second.getScalarType() == MVT::i64 && @@ -6076,11 +6082,12 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( // that now, a regular reduction would be cheaper because the costs of // the extends in the IR are still counted. This can be fixed // after https://github.com/llvm/llvm-project/pull/147302 has landed. - return Cost; - // i8 -> i16 is natively supported with SVE2p3 + return Cost + INegCost; + // i8 -> i16 is natively supported with SVE2p3 udot/sdot + // For sub-reductions, we prefer using the *mlslb/t instructions. if (AccumLT.second.getScalarType() == MVT::i16 && InputLT.second.getScalarType() == MVT::i8 && - (ST->hasSVE2p3() || ST->hasSME2p3())) + (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub) return Cost; } @@ -6092,11 +6099,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( InputLT.second.getScalarType() == MVT::f16) return Cost; - // For a ratio of 2, we can use *mlal top/bottom instructions. 
- if (Ratio == 2 && !IsSub) { + // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions. + if (Ratio == 2 && !IsUSDot) { MVT InVT = InputLT.second.getScalarType(); - // SVE2 [us]mlalb/t and NEON [us]mlal(2) + // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2) if (IsSupported(ST->hasSVE2(), true) && llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy)) return Cost * 2; @@ -6110,14 +6117,9 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( return Cost * 2; } - InstructionCost ExpandCost = BaseT::getPartialReductionCost( - Opcode, InputTypeA, InputTypeB, AccumType, VF, OpAExtend, OpBExtend, - BinOp, CostKind, FMF); - - // Slightly lower the cost of a sub reduction so that it can be considered - // as candidate for 'cdot' operations. This is a somewhat arbitrary number, - // because we don't yet model these operations directly. - return ExpandCost.isValid() && IsSub ? ((8 * ExpandCost) / 10) : ExpandCost; + return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB, + AccumType, VF, OpAExtend, OpBExtend, + BinOp, CostKind, FMF); } InstructionCost diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 5ee1ebd2ad90b..ff9eea5342607 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -892,7 +892,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; 
CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] @@ -901,18 +901,20 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sub [[VEC_PHI]], [[TMP10]] +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sub zeroinitializer, [[TMP10]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP11]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP12]] -; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = add [[TMP11]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw [[TMP14]], [[TMP12]] -; CHECK-SVE-MAXBW-NEXT: [[TMP15]] = sub [[TMP16]], [[TMP19]] +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP19]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP15]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: 
[[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP15]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-costs.ll new file mode 100644 index 0000000000000..38530d3460423 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-costs.ll @@ -0,0 +1,666 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost.of.*EXPRESSION" --version 6 +; RUN: opt -passes=loop-vectorize \ +; RUN: -scalable-vectorization=off \ +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NEON +; RUN: opt -passes=loop-vectorize \ +; RUN: -scalable-vectorization=on -mattr=+sve \ +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE +; RUN: opt -passes=loop-vectorize \ +; RUN: -scalable-vectorization=on -mattr=+sve2 \ +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2 +; RUN: opt -passes=loop-vectorize \ +; RUN: -scalable-vectorization=on -mattr=+sve2p1 \ +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2p1 +; RUN: opt -passes=loop-vectorize \ +; RUN: -scalable-vectorization=on -mattr=+sve2p3 \ +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2p3 +; RUN: opt 
-passes=loop-vectorize \ +; RUN: -scalable-vectorization=on -mattr=+sve2,+i8mm \ +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=I8MM + +; REQUIRES: asserts +target triple = "aarch64" + +; sub(i16, zext(i8)->i16 * zext(i8)->i16) +define i16 @sub_reduction_i16_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i16 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' +; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; +; SVE-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' +; SVE2-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' +; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; +; SVE2p1-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' +; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add 
(mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; +; SVE2p3-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' +; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; SVE2p3: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; SVE2p3: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; +; I8MM-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' +; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) +; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) +; +entry: + br label %loop 
+ +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i16 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv + %load1 = load i8, ptr %gep1 + %zext1 = zext i8 %load1 to i16 + %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv + %load2 = load i8, ptr %gep2 + %zext2 = zext i8 %load2 to i16 + %mul12 = mul i16 %zext1, %zext2 + %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv + %load3 = load i8, ptr %gep3 + %zext3 = zext i8 %load3 to i16 + %mul13 = mul i16 %zext2, %zext3 + %add1 = add i16 %acc, %mul12 + %sub2 = sub i16 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !0 + +exit: + ret i16 %sub2 +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.interleave.count", i32 1} +!2 = !{!"llvm.loop.vectorize.width", i32 16} + +; There is no usdot for i8 -> i16, so a regular reduction is preferred due to +; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. 
+; +; sub(i16, zext(i8)->i16 * sext(i8)->i16) +define i16 @sub_reduction_i16_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i16 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' +; SVE-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' +; SVE2-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' +; SVE2p1-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' +; SVE2p3-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' +; I8MM-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i16 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv + %load1 = load i8, ptr %gep1 + %zext1 = zext i8 %load1 to i16 + %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv + %load2 = load i8, ptr %gep2 + %sext2 = sext i8 %load2 to i16 + %mul12 = mul i16 %zext1, %sext2 + %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv + %load3 = load i8, ptr %gep3 + %sext3 = sext i8 %load3 to i16 + %mul13 = mul i16 %zext1, %sext3 + %add1 = add i16 %acc, %mul12 + %sub2 = sub i16 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !3 + +exit: + ret i16 %sub2 +} + +!3 = distinct !{!3, !4, !5} +!4 = !{!"llvm.loop.interleave.count", i32 1} +!5 = !{!"llvm.loop.vectorize.width", i32 16} + +; sub(i32, zext(i8)->i32 * zext(i8)->i32) +define i32 @sub_reduction_i32_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { +; +; NEON-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' +; SVE-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' +; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = 
ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; SVE2-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' +; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; SVE2p1-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' +; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; SVE2p3-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' +; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add 
(sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; I8MM-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' +; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv + %load1 = load i8, ptr %gep1 + %zext1 = zext i8 %load1 to i32 + %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv + %load2 = load i8, ptr %gep2 + %zext2 = zext i8 %load2 to i32 + %mul12 = mul i32 %zext1, %zext2 + %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv + %load3 = load i8, ptr %gep3 + %zext3 = zext i8 %load3 to i32 + %mul13 = mul i32 %zext2, %zext3 + %add1 = add i32 %acc, %mul12 + %sub2 = sub i32 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !6 + +exit: + ret i32 %sub2 +} + +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.interleave.count", i32 1} +!8 = !{!"llvm.loop.vectorize.width", i32 16} + +; 
sub(i32, zext(i8)->i32 * sext(i8)->i32) +define i32 @sub_reduction_i32_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' +; SVE-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' +; SVE2-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' +; SVE2p1-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' +; SVE2p3-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' +; I8MM-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' +; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> sext to i32)) +; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load1> zext to i32), (ir<%load3> sext to i32))) +; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> sext to i32)) +; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load1> zext to i32), (ir<%load3> sext to i32))) +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv + %load1 = load i8, ptr %gep1 + %zext1 = zext i8 %load1 to i32 + %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv + %load2 = load i8, ptr %gep2 + %sext2 = sext i8 %load2 to i32 + %mul12 = mul i32 %zext1, %sext2 + %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv + %load3 = load i8, ptr %gep3 + %sext3 = sext i8 %load3 to i32 + %mul13 = mul i32 %zext1, %sext3 + %add1 = add i32 %acc, %mul12 + %sub2 = sub i32 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !9 + +exit: + ret i32 %sub2 +} + +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.interleave.count", i32 1} +!11 = !{!"llvm.loop.vectorize.width", i32 
16} + +; sub(i64, zext(i8)->i64 * zext(i8)->i64) +define i64 @sub_reduction_i64_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { +; +; NEON-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' +; SVE-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' +; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' +; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2p1-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' +; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; 
SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2p3-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' +; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; I8MM-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' +; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv + %load1 = load i8, ptr %gep1 + %zext1 = zext i8 %load1 to i64 + 
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv + %load2 = load i8, ptr %gep2 + %zext2 = zext i8 %load2 to i64 + %mul12 = mul i64 %zext1, %zext2 + %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv + %load3 = load i8, ptr %gep3 + %zext3 = zext i8 %load3 to i64 + %mul13 = mul i64 %zext2, %zext3 + %add1 = add i64 %acc, %mul12 + %sub2 = sub i64 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !12 + +exit: + ret i64 %sub2 +} + +!12 = distinct !{!12, !13, !14} +!13 = !{!"llvm.loop.interleave.count", i32 1} +!14 = !{!"llvm.loop.vectorize.width", i32 16} + +; There is no usdot for i8 -> i64, so a regular reduction is preferred due to +; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. +; +; sub(i64, zext(i8)->i64 * sext(i8)->i64) +define i64 @sub_reduction_i64_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' +; SVE-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' +; SVE2-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' +; SVE2p1-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' +; SVE2p3-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' +; I8MM-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv + %load1 = load i8, ptr %gep1 + %zext1 = zext i8 %load1 to i64 + %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv + %load2 = load i8, ptr %gep2 + %sext2 = sext i8 %load2 to i64 + %mul12 = mul i64 %zext1, %sext2 + %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv + %load3 = load i8, ptr %gep3 + %sext3 = sext i8 %load3 to i64 + %mul13 = mul i64 %zext1, %sext3 + %add1 = add i64 %acc, %mul12 + %sub2 = sub i64 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !15 
+ +exit: + ret i64 %sub2 +} + +!15 = distinct !{!15, !16, !17} +!16 = !{!"llvm.loop.interleave.count", i32 1} +!17 = !{!"llvm.loop.vectorize.width", i32 16} + +; sub(i32, zext(i16)->i32 * zext(i16)->i32) +define i32 @sub_reduction_i32_zext_i16_zext_i16(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' +; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; SVE-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' +; SVE2-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' +; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; SVE2p1-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' +; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION 
vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; SVE2p3-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' +; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +; I8MM-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' +; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) +; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] + 
%gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv + %load1 = load i16, ptr %gep1 + %zext1 = zext i16 %load1 to i32 + %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv + %load2 = load i16, ptr %gep2 + %zext2 = zext i16 %load2 to i32 + %mul12 = mul i32 %zext1, %zext2 + %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv + %load3 = load i16, ptr %gep3 + %zext3 = zext i16 %load3 to i32 + %mul13 = mul i32 %zext2, %zext3 + %add1 = add i32 %acc, %mul12 + %sub2 = sub i32 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !18 + +exit: + ret i32 %sub2 +} + +!18 = distinct !{!18, !19, !20} +!19 = !{!"llvm.loop.interleave.count", i32 1} +!20 = !{!"llvm.loop.vectorize.width", i32 8} + +; There is no usdot for i16 -> i32, so a regular reduction is preferred due to +; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. +; +; sub(i32, zext(i16)->i32 * sext(i16)->i32) +define i32 @sub_reduction_i32_zext_i16_sext_i16(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' +; SVE-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' +; SVE2-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' +; SVE2p1-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' +; SVE2p3-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' +; I8MM-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv + %load1 = load i16, ptr %gep1 + %zext1 = zext i16 %load1 to i32 + %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv + %load2 = load i16, ptr %gep2 + %sext2 = sext i16 %load2 to i32 + %mul12 = mul i32 %zext1, %sext2 + %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv + %load3 = load i16, ptr %gep3 + %sext3 = sext i16 %load3 to i32 + %mul13 = mul i32 %zext1, %sext3 + %add1 = add i32 %acc, 
%mul12 + %sub2 = sub i32 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !21 + +exit: + ret i32 %sub2 +} + +!21 = distinct !{!21, !22, !23} +!22 = !{!"llvm.loop.interleave.count", i32 1} +!23 = !{!"llvm.loop.vectorize.width", i32 8} + +; sub(i64, zext(i16)->i64 * zext(i16)->i64) +define i64 @sub_reduction_i64_zext_i16_zext_i16(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { +; +; NEON-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' +; SVE-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' +; SVE: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' +; SVE2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2p1-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' +; SVE2p1: Cost of 1 for 
VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p1: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p1: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2p3-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' +; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p3: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p3: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; I8MM-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' +; I8MM: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; I8MM: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; I8MM: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; I8MM: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul 
(ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv + %load1 = load i16, ptr %gep1 + %zext1 = zext i16 %load1 to i64 + %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv + %load2 = load i16, ptr %gep2 + %zext2 = zext i16 %load2 to i64 + %mul12 = mul i64 %zext1, %zext2 + %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv + %load3 = load i16, ptr %gep3 + %zext3 = zext i16 %load3 to i64 + %mul13 = mul i64 %zext2, %zext3 + %add1 = add i64 %acc, %mul12 + %sub2 = sub i64 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !24 + +exit: + ret i64 %sub2 +} + +!24 = distinct !{!24, !25, !26} +!25 = !{!"llvm.loop.interleave.count", i32 1} +!26 = !{!"llvm.loop.vectorize.width", i32 8} + +; sub(i64, zext(i16)->i64 * sext(i16)->i64) +define i64 @sub_reduction_i64_zext_i16_sext_i16(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' +; SVE-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' +; SVE2-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' +; SVE2p1-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' +; SVE2p3-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' +; I8MM-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv + %load1 = load i16, ptr %gep1 + %zext1 = zext i16 %load1 to i64 + %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv + %load2 = load i16, ptr %gep2 + %sext2 = sext i16 %load2 to i64 + %mul12 = mul i64 %zext1, %sext2 + %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv + %load3 = load i16, ptr %gep3 + %sext3 = sext i16 %load3 to i64 + %mul13 = mul i64 
%zext1, %sext3 + %add1 = add i64 %acc, %mul12 + %sub2 = sub i64 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !27 + +exit: + ret i64 %sub2 +} + +!27 = distinct !{!27, !28, !29} +!28 = !{!"llvm.loop.interleave.count", i32 1} +!29 = !{!"llvm.loop.vectorize.width", i32 8} + +; sub(i64, zext(i32)->i64 * zext(i32)->i64) +define i64 @sub_reduction_i64_zext_i32_zext_i32(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' +; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' +; SVE2-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' +; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2p1-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' +; SVE2p1: Cost of 2 
for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; SVE2p3-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' +; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +; I8MM-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' +; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) +; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul 
(ir<%load2> zext to i64), (ir<%load3> zext to i64))) +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i32, ptr %src1, i32 %iv + %load1 = load i32, ptr %gep1 + %zext1 = zext i32 %load1 to i64 + %gep2 = getelementptr inbounds i32, ptr %src2, i32 %iv + %load2 = load i32, ptr %gep2 + %zext2 = zext i32 %load2 to i64 + %mul12 = mul i64 %zext1, %zext2 + %gep3 = getelementptr inbounds i32, ptr %src3, i32 %iv + %load3 = load i32, ptr %gep3 + %zext3 = zext i32 %load3 to i64 + %mul13 = mul i64 %zext2, %zext3 + %add1 = add i64 %acc, %mul12 + %sub2 = sub i64 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !30 + +exit: + ret i64 %sub2 +} + +!30 = distinct !{!30, !31, !32} +!31 = !{!"llvm.loop.interleave.count", i32 1} +!32 = !{!"llvm.loop.vectorize.width", i32 4} + +; There is no usdot for i32 -> i64, so a regular reduction is preferred due to +; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. 
+; +; sub(i64, zext(i32)->i64 * sext(i32)->i64) +define i64 @sub_reduction_i64_zext_i32_sext_i32(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { +; NEON-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' +; SVE-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' +; SVE2-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' +; SVE2p1-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' +; SVE2p3-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' +; I8MM-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] + %gep1 = getelementptr inbounds i32, ptr %src1, i32 %iv + %load1 = load i32, ptr %gep1 + %zext1 = zext i32 %load1 to i64 + %gep2 = getelementptr inbounds i32, ptr %src2, i32 %iv + %load2 = load i32, ptr %gep2 + %sext2 = sext i32 %load2 to i64 + %mul12 = mul i64 %zext1, %sext2 + %gep3 = getelementptr inbounds i32, ptr %src3, i32 %iv + %load3 = load i32, ptr %gep3 + %sext3 = sext i32 %load3 to i64 + %mul13 = mul i64 %zext1, %sext3 + %add1 = add i64 %acc, %mul12 + %sub2 = sub i64 %add1, %mul13 + %iv.next = add i32 %iv, 1 + %cmp = icmp ult i32 %iv.next, %n + br i1 %cmp, label %loop, label %exit, !llvm.loop !33 + +exit: + ret i64 %sub2 +} + +!33 = distinct !{!33, !34, !35} +!34 = !{!"llvm.loop.interleave.count", i32 1} +!35 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-sdot.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-sdot.ll index ff3881f2c7fda..d40b984618d45 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-sdot.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-sdot.ll @@ -15,9 +15,9 @@ ; COMMON: LV: Checking a loop in 'add_sub_chained_reduction' ; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32)) -; SVE: Cost of 16 for VF vscale x 16: 
EXPRESSION vp<{{.*}}> = vp<%9> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32))) +; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<{{.*}}> = vp<%9> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32))) ; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32)) -; NEON: Cost of 16 for VF 16: EXPRESSION vp<{{.*}}> = vp<%9> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32))) +; NEON: Cost of 3 for VF 16: EXPRESSION vp<{{.*}}> = vp<%9> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32))) target triple = "aarch64" From dabb0797505e913b2207f3f60b66566d566f6d76 Mon Sep 17 00:00:00 2001 From: anjenner <161845516+anjenner@users.noreply.github.com> Date: Fri, 8 May 2026 09:26:43 +0100 Subject: [PATCH 007/538] AMDGPU/GlobalISel: Implement RegBankLegalizeRules for amdgcn_log, amdgcn_rcp, and amdgcn_sqrt (#195099) --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 10 +- .../AMDGPU/pseudo-scalar-transcendental.ll | 501 +++++++++++++++--- 2 files changed, 449 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 2ce2545f57893..3a5d3e6ff1345 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1948,7 +1948,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST) .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST); - addRulesForIOpcs({amdgcn_sqrt}, Standard) + addRulesForIOpcs({amdgcn_rcp, amdgcn_sqrt}, Standard) .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}}) .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST) .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST) @@ -1958,6 +1958,14 @@ 
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}}) .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}}); + addRulesForIOpcs({amdgcn_log}, Standard) + .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}}) + .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST) + .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}}) + .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST); + addRulesForIOpcs({amdgcn_ds_atomic_async_barrier_arrive_b64}) .Any({{}, {{}, {IntrId, VgprP3}}}); diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index f001b26030896..56bb3ce1742b8 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN-GISEL %s ; TODO: GlobalISel should avoid generating v_ldexp_f32. 
define amdgpu_cs float @v_s_exp_f32(float inreg %src) { @@ -30,18 +31,41 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-GISEL-NEXT: v_ldexp_f32 v0, s0, s1 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_exp_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s1, 0x42800000, 0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GCN-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; GCN-GISEL-NEXT: v_ldexp_f32 v0, v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.exp2.f32(float %src) ret float %result } define amdgpu_cs half @v_s_exp_f16(half inreg %src) { -; GFX12-LABEL: v_s_exp_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_s_exp_f16 s0, s0 -; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: v_s_exp_f16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_s_exp_f16 s0, s0 +; GFX12-SDAG-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: v_s_exp_f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_exp_f16_e32 v0.l, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_exp_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_exp_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call half @llvm.exp2.f16(half %src) ret half %result } @@ -54,6 +78,11 @@ define amdgpu_cs float @v_s_amdgcn_exp_f32(float inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 
v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_amdgcn_exp_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_exp_f32_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.amdgcn.exp2.f32(float %src) ret float %result } @@ -66,6 +95,11 @@ define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_amdgcn_exp_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_exp_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call half @llvm.amdgcn.exp2.f16(half %src) ret half %result } @@ -88,29 +122,60 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) { ; GFX12-GISEL-LABEL: v_s_log_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5 +; GFX12-GISEL-NEXT: s_lshl_b32 s2, s2, 5 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: v_ldexp_f32 v0, s0, s2 -; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 -; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_log_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-GISEL-NEXT: s_lshl_b32 s2, s2, 5 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GCN-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GCN-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GCN-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.log2.f32(float %src) ret float %result } define amdgpu_cs half @v_s_log_f16(half inreg %src) { -; GFX12-LABEL: v_s_log_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_s_log_f16 s0, s0 -; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: v_s_log_f16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_s_log_f16 s0, s0 +; GFX12-SDAG-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: v_s_log_f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_log_f16_e32 v0.l, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_log_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_log_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call half @llvm.log2.f16(half %src) ret half %result } @@ -123,6 +188,11 @@ define amdgpu_cs float @v_s_amdgcn_log_f32(float inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_amdgcn_log_f32: 
+; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_log_f32_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.amdgcn.log.f32(float %src) ret float %result } @@ -135,10 +205,63 @@ define amdgpu_cs half @v_s_amdgcn_log_f16(half inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_amdgcn_log_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_log_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call half @llvm.amdgcn.log.f16(half %src) ret half %result } +define void @v_amdgcn_log_f16_div(half %src, ptr addrspace(1) %out) { +; GFX12-LABEL: v_amdgcn_log_f16_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX12-NEXT: global_store_b16 v[1:2], v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_amdgcn_log_f16_div: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_log_f16_e32 v0, v0 +; GCN-GISEL-NEXT: flat_store_short v[1:2], v0 +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.amdgcn.log.f16(half %src) + store half %result, ptr addrspace(1) %out + ret void +} + +define void @v_amdgcn_log_f32_div(float %src, ptr addrspace(1) %out) { +; GFX12-LABEL: v_amdgcn_log_f32_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_log_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_amdgcn_log_f32_div: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GCN-GISEL-NEXT: flat_store_dword v[1:2], v0 +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.amdgcn.log.f32(float %src) + store float %result, ptr addrspace(1) %out + ret void +} + define amdgpu_cs float @v_s_rcp_f32(float inreg %src) { ; GFX12-LABEL: v_s_rcp_f32: ; GFX12: ; %bb.0: @@ -147,6 +270,11 @@ define amdgpu_cs float @v_s_rcp_f32(float inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_rcp_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_rcp_f32_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call fast float @llvm.amdgcn.rcp.f32(float %src) ret float %result } @@ -159,10 +287,107 @@ define amdgpu_cs half @v_s_rcp_f16(half inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_rcp_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_rcp_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call fast half @llvm.amdgcn.rcp.f16(half %src) ret half %result } +define void @v_rcp_f16_div(half %src, ptr addrspace(1) %out) { +; GFX12-LABEL: v_rcp_f16_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_rcp_f16_e32 v0.l, v0.l +; GFX12-NEXT: global_store_b16 v[1:2], v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_rcp_f16_div: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_rcp_f16_e32 v0, v0 +; GCN-GISEL-NEXT: flat_store_short v[1:2], v0 +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call half 
@llvm.amdgcn.rcp.f16(half %src) + store half %result, ptr addrspace(1) %out + ret void +} + +define void @v_rcp_f32_div(float %src, ptr addrspace(1) %out) { +; GFX12-LABEL: v_rcp_f32_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_rcp_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_rcp_f32_div: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: flat_store_dword v[1:2], v0 +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.amdgcn.rcp.f32(float %src) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @v_rcp_f64_div(double %src, ptr addrspace(1) %out) { +; GFX12-LABEL: v_rcp_f64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_rcp_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GCN-GISEL-LABEL: v_rcp_f64_div: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-GISEL-NEXT: v_rcp_f64_e32 v[0:1], v[0:1] +; GCN-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) +; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.amdgcn.rcp.f64(double %src) + store double %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs double @v_s_rcp_f64(double inreg %src) { +; GFX12-LABEL: v_s_rcp_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: 
v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_rcp_f64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] +; GCN-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-GISEL-NEXT: ; return to shader part epilog + %result = call double @llvm.amdgcn.rcp.f64(double %src) + ret double %result +} + ; TODO: GlobalISel should generate v_s_rsq. define amdgpu_cs float @v_s_rsq_f32(float inreg %src) { ; GFX12-SDAG-LABEL: v_s_rsq_f32: @@ -182,6 +407,12 @@ define amdgpu_cs float @v_s_rsq_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_rsq_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_sqrt_f32_e32 v0, s0 +; GCN-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: ; return to shader part epilog %sqrt = call fast float @llvm.sqrt.f32(float %src) %fdiv = fdiv fast float 1.0, %sqrt ret float %fdiv @@ -195,6 +426,11 @@ define amdgpu_cs half @v_s_rsq_f16(half inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_rsq_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_rsq_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %sqrt = call fast half @llvm.sqrt.f16(half %src) %result = fdiv fast half 1.0, %sqrt ret half %result @@ -260,17 +496,54 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_fmac_f32 s6, s7, s2 ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s3, s2 ; GFX12-GISEL-NEXT: s_cmp_gt_f32 s6, 0 +; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s3, s0, 0x260 ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000 -; 
GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2 +; GFX12-GISEL-NEXT: s_mul_f32 s4, s2, 0x37800000 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: s_cselect_b32 s1, s4, s2 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260 -; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_sqrt_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-GISEL-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-GISEL-NEXT: v_sqrt_f32_e32 v0, s0 +; GCN-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-GISEL-NEXT: s_add_i32 s3, s2, -1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GCN-GISEL-NEXT: s_add_i32 s4, s2, 1 +; GCN-GISEL-NEXT: v_fma_f32 v1, -v1, v0, s0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GCN-GISEL-NEXT: v_fma_f32 v0, -v2, v0, s0 +; GCN-GISEL-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 +; GCN-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; GCN-GISEL-NEXT: s_cselect_b32 s2, s3, s2 +; GCN-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s2, s4, s2 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x37800000 +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GCN-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x260 +; GCN-GISEL-NEXT: s_cmp_lg_u32 s1, 
0 +; GCN-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; GCN-GISEL-NEXT: s_cselect_b32 s1, s3, s2 +; GCN-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s0, s0, s1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.sqrt.f32(float %src) ret float %result } @@ -283,6 +556,11 @@ define amdgpu_cs half @v_s_sqrt_f16(half inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_s_sqrt_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_sqrt_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call half @llvm.sqrt.f16(half %src) ret half %result } @@ -295,6 +573,11 @@ define amdgpu_cs float @v_amdgcn_sqrt_f32(float inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_amdgcn_sqrt_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_sqrt_f32_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.amdgcn.sqrt.f32(float %src) ret float %result } @@ -307,6 +590,11 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) { ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_amdgcn_sqrt_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_sqrt_f16_e32 v0, s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %result = call half @llvm.amdgcn.sqrt.f16(half %src) ret half %result } @@ -331,18 +619,40 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { ; GFX12-GISEL-LABEL: srcmods_abs_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_and_b32 s1, s0, 0x7fffffff -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX12-GISEL-NEXT: s_cmp_lt_f32 s1, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0 -; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5 +; GFX12-GISEL-NEXT: s_lshl_b32 s2, s2, 5 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: v_ldexp_f32 v0, |s0|, s2 -; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: srcmods_abs_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |s0|, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-GISEL-NEXT: s_lshl_b32 s2, s2, 5 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GCN-GISEL-NEXT: v_ldexp_f32 v0, |s0|, v0 +; GCN-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GCN-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GCN-GISEL-NEXT: ; return to shader part epilog %abs = call float @llvm.fabs.f32(float %src) %result = call float @llvm.log2.f32(float %abs) ret float %result @@ -367,44 +677,86 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; 
GFX12-GISEL-LABEL: srcmods_neg_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_xor_b32 s1, s0, 0x80000000 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_cmp_lt_f32 s1, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0 -; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5 +; GFX12-GISEL-NEXT: s_lshl_b32 s2, s2, 5 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: v_ldexp_f32 v0, -s0, s2 -; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: srcmods_neg_f32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], -s0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-GISEL-NEXT: s_lshl_b32 s2, s2, 5 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GCN-GISEL-NEXT: v_ldexp_f32 v0, -s0, v0 +; GCN-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GCN-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GCN-GISEL-NEXT: v_subrev_f32_e32 v0, 
s0, v0 +; GCN-GISEL-NEXT: ; return to shader part epilog %neg = fneg float %src %result = call float @llvm.log2.f32(float %neg) ret float %result } define amdgpu_cs half @srcmods_abs_f16(half inreg %src) { -; GFX12-LABEL: srcmods_abs_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_s_log_f16 s0, |s0| -; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: srcmods_abs_f16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_s_log_f16 s0, |s0| +; GFX12-SDAG-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: srcmods_abs_f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_log_f16_e64 v0.l, |s0| +; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: srcmods_abs_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_log_f16_e64 v0, |s0| +; GCN-GISEL-NEXT: ; return to shader part epilog %abs = call half @llvm.fabs.f16(half %src) %result = call half @llvm.log2.f16(half %abs) ret half %result } define amdgpu_cs half @srcmods_neg_f16(half inreg %src) { -; GFX12-LABEL: srcmods_neg_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_s_log_f16 s0, -s0 -; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: srcmods_neg_f16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_s_log_f16 s0, -s0 +; GFX12-SDAG-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: srcmods_neg_f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_log_f16_e64 v0.l, -s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; 
GCN-GISEL-LABEL: srcmods_neg_f16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_log_f16_e64 v0, -s0 +; GCN-GISEL-NEXT: ; return to shader part epilog %neg = fneg half %src %result = call half @llvm.log2.f16(half %neg) ret half %result @@ -430,19 +782,46 @@ define amdgpu_cs float @fdiv_f32_i32(float inreg %a, i32 inreg %b) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: fdiv_f32_i32: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GCN-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GCN-GISEL-NEXT: ; return to shader part epilog %uint = uitofp i32 %b to float %result = fdiv afn float %a, %uint ret float %result } define amdgpu_cs half @fdiv_f16_i16(half inreg %a, i16 inreg %b) { -; GFX12-LABEL: fdiv_f16_i16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_f16_u16_e32 v0.l, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_rcp_f16_e32 v0.l, v0.l -; GFX12-NEXT: v_mul_f16_e32 v0.l, s0, v0.l -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: fdiv_f16_i16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f16_u16_e32 v0.l, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_rcp_f16_e32 v0.l, v0.l +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.l, s0, v0.l +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: fdiv_f16_i16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f16_u16_e32 v0.l, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12-GISEL-NEXT: v_s_rcp_f16 s1, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_mul_f16 s0, s0, 
s1 +; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: fdiv_f16_i16: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_cvt_f16_u16_e32 v0, s1 +; GCN-GISEL-NEXT: v_rcp_f16_e32 v0, v0 +; GCN-GISEL-NEXT: v_mul_f16_e32 v0, s0, v0 +; GCN-GISEL-NEXT: ; return to shader part epilog %uint = uitofp i16 %b to half %result = fdiv afn half %a, %uint ret half %result From d791e3a6f4c832ae189e72c0368e5195d6328756 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 8 May 2026 09:31:16 +0100 Subject: [PATCH 008/538] [AArch64][SME] Elide private ZA setup when possible (#196090) In private ZA functions without any instructions that require "active" ZA we can omit all ZA setup (and saves/restores). This is equivalent to removing the `__arm_new("za/zt0")` attribute when ZA state is unused. --- llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 17 ++++++- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 36 +------------- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 47 +++++-------------- 3 files changed, 29 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 4462af1ca306f..dbd6cf2af45ba 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -490,7 +490,6 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs); assert((InsertPt == MBBI || isCallStartOpcode(InsertPt->getOpcode())) && "Unexpected state change insertion point!"); - // TODO: Do something to avoid state changes where NZCV is live. if (MBBI == FirstTerminatorInsertPt) Block.PhysLiveRegsAtExit = PhysLiveRegs; if (MBBI == FirstNonPhiInsertPt) @@ -1162,6 +1161,19 @@ void MachineSMEABI::emitStateChange(EmitContext &Context, } } +/// Returns true if private ZA setup can be elided. 
This occurs when there is +/// no instruction within the function that requires ZA to be active. +static bool canElidePrivateZASetup(const FunctionInfo &FnInfo) { + for (const BlockInfo &BlockInfo : FnInfo.Blocks) { + for (const InstInfo &InstInfo : BlockInfo.Insts) { + if (InstInfo.NeededState == ZAState::ACTIVE || + InstInfo.NeededState == ZAState::ACTIVE_ZT0_SAVED) + return false; + } + } + return true; +} + } // end anonymous namespace INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI", @@ -1193,6 +1205,9 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs); + if (SMEFnAttrs.hasPrivateZAInterface() && canElidePrivateZASetup(FnInfo)) + return false; + SmallVector BundleStates = assignBundleZAStates(Bundles, FnInfo); EmitContext Context; diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index bdfddad32ff3a..36539d94338a0 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -643,42 +643,10 @@ define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" { define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" { ; CHECK-LABEL: test16: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-80] -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB17_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: zero {za} -; CHECK-NEXT: .LBB17_2: -; CHECK-NEXT: smstart za -; CHECK-NEXT: smstart sm -; CHECK-NEXT: sub x8, x29, #80 -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: smstop sm +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: bl callee ; CHECK-NEXT: bl callee -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: smstop za -; CHECK-NEXT: sub sp, x29, #64 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Reload -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() call void @callee() diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index d3c3c111c205b..7ef3262e5811c 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -131,57 +131,32 @@ define void @zt0_in_caller_zt0_new_callee(ptr %callee) "aarch64_in_zt0" nounwind ; New-ZT0 Callee -; Expect commit of lazy-save if ZA is dormant -; Expect 
smstart ZA & clear ZT0 -; Expect spill & fill of ZT0 around call -; Before return, expect smstop ZA +; Expect ZA state setup to be elided (no instructions in this function require +; ZA state). define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwind { ; CHECK-LABEL: zt0_new_caller_zt0_new_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB6_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: smstart za -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str zt0, [x8] -; CHECK-NEXT: smstop za +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: blr x0 -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void %callee() "aarch64_new_zt0"; ret void; } -; Expect commit of lazy-save if ZA is dormant -; Expect smstart ZA & clear ZT0 -; No spill & fill of ZT0 around __arm_tpidr2_save ; Expect spill & fill of ZT0 around __arm_sme_state call -; Before return, expect smstop ZA -define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { +define i64 @zt0_new_caller_abi_routine_callee() "aarch64_inout_zt0" nounwind { ; CHECK-LABEL: zt0_new_caller_abi_routine_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbz x8, .LBB7_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: smstart za -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str zt0, [x8] +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop 
za ; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %res = call {i64, i64} @__arm_sme_state() From 5ff13afa6a551d24f60345ae3d6c67e2d6fb5be5 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Fri, 8 May 2026 01:43:00 -0700 Subject: [PATCH 009/538] [libc] implement fwide (#196157) Add fwide function and tests. Part 1/11. All build file changes are in part 11. Assisted by Gemini --- libc/config/linux/aarch64/entrypoints.txt | 12 ++ libc/config/linux/riscv/entrypoints.txt | 12 ++ libc/config/linux/x86_64/entrypoints.txt | 12 +- libc/include/wchar.yaml | 67 ++++++++ libc/src/wchar/CMakeLists.txt | 139 ++++++++++++++++- libc/src/wchar/fwide.cpp | 44 ++++++ libc/src/wchar/fwide.h | 27 ++++ libc/test/src/wchar/CMakeLists.txt | 180 +++++++++++++++++++++- libc/test/src/wchar/fwide_test.cpp | 54 +++++++ 9 files changed, 543 insertions(+), 4 deletions(-) create mode 100644 libc/src/wchar/fwide.cpp create mode 100644 libc/src/wchar/fwide.h create mode 100644 libc/test/src/wchar/fwide_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index ea3f4cd8f51a0..a0fb4663c1e54 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -1242,6 +1242,18 @@ if(LLVM_LIBC_FULL_BUILD) # sys/select.h entrypoints libc.src.sys.select.select + + # wchar.h entrypoints + # libc.src.wchar.fgetwc + # libc.src.wchar.fgetws + # libc.src.wchar.fputwc + # libc.src.wchar.fputws + libc.src.wchar.fwide + # libc.src.wchar.getwc + # libc.src.wchar.getwchar + # libc.src.wchar.putwc + # libc.src.wchar.putwchar + # libc.src.wchar.ungetwc ) endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index d8d520c6d4236..f67984d4f6484 100644 --- 
a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1376,6 +1376,18 @@ if(LLVM_LIBC_FULL_BUILD) # sys/select.h entrypoints libc.src.sys.select.select + + # wchar.h entrypoints + # libc.src.wchar.fgetwc + # libc.src.wchar.fgetws + # libc.src.wchar.fputwc + # libc.src.wchar.fputws + libc.src.wchar.fwide + # libc.src.wchar.getwc + # libc.src.wchar.getwchar + # libc.src.wchar.putwc + # libc.src.wchar.putwchar + # libc.src.wchar.ungetwc ) endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index dfd6064951d52..b6247c1172150 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1463,9 +1463,19 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.wchar.wcrtomb libc.src.wchar.wcsrtombs libc.src.wchar.wcsnrtombs + # libc.src.wchar.fgetwc + # libc.src.wchar.fgetws + # libc.src.wchar.fputwc + # libc.src.wchar.fputws + libc.src.wchar.fwide + # libc.src.wchar.getwc + # libc.src.wchar.getwchar + # libc.src.wchar.putwc + # libc.src.wchar.putwchar + # libc.src.wchar.ungetwc # nl_types.h entrypoints - libc.src.nl_types.catopen + libc.src.nl_types.catopen libc.src.nl_types.catclose libc.src.nl_types.catgets ) diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 6575f2504c900..c7a956d542ff6 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -369,3 +369,70 @@ functions: - type: wchar_t *__restrict - type: const wchar_t *__restrict - type: size_t + - name: fgetwc + standards: + - stdc + return_type: wint_t + arguments: + - type: FILE * + - name: fgetws + standards: + - stdc + return_type: wchar_t * + arguments: + - type: wchar_t *__restrict + - type: int + - type: FILE *__restrict + - name: fputwc + standards: + - stdc + return_type: wint_t + arguments: + - type: wchar_t + - type: FILE * + - name: fputws + standards: + - stdc + return_type: int + arguments: + - type: const wchar_t *__restrict + - type: FILE *__restrict + - 
name: fwide + standards: + - stdc + return_type: int + arguments: + - type: FILE * + - type: int + - name: getwc + standards: + - stdc + return_type: wint_t + arguments: + - type: FILE * + - name: getwchar + standards: + - stdc + return_type: wint_t + arguments: + - type: void + - name: putwc + standards: + - stdc + return_type: wint_t + arguments: + - type: wchar_t + - type: FILE * + - name: putwchar + standards: + - stdc + return_type: wint_t + arguments: + - type: wchar_t + - name: ungetwc + standards: + - stdc + return_type: wint_t + arguments: + - type: wint_t + - type: FILE * diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 89383c33c6a4e..6b075da6d430c 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -310,7 +310,7 @@ add_entrypoint_object( libc.hdr.types.size_t libc.hdr.wchar_macros libc.src.__support.macros.null_check -) +) add_entrypoint_object( wcschr @@ -566,3 +566,140 @@ add_entrypoint_object( libc.src.__support.macros.config libc.src.__support.common ) + +add_entrypoint_object( + fwide + SRCS + fwide.cpp + HDRS + fwide.h + DEPENDS + libc.hdr.types.FILE + libc.src.__support.File.file + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + fputwc + SRCS + fputwc.cpp + HDRS + fputwc.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wint_t + libc.hdr.types.wchar_t + libc.hdr.wchar_macros + libc.src.__support.File.file + libc.src.__support.libc_errno + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + fgetwc + SRCS + fgetwc.cpp + HDRS + fgetwc.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wint_t + libc.hdr.types.wchar_t + libc.src.__support.File.file + libc.src.__support.libc_errno + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + fputws + SRCS + fputws.cpp + HDRS + fputws.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wchar_t + libc.src.__support.File.file + libc.src.__support.libc_errno + libc.src.string.string_utils + 
libc.src.__support.macros.null_check +) + +add_entrypoint_object( + fgetws + SRCS + fgetws.cpp + HDRS + fgetws.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wchar_t + libc.src.__support.File.file + libc.src.__support.libc_errno + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + ungetwc + SRCS + ungetwc.cpp + HDRS + ungetwc.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wint_t + libc.src.__support.File.file + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + getwc + SRCS + getwc.cpp + HDRS + getwc.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wint_t + libc.src.__support.File.file + libc.src.__support.libc_errno + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + getwchar + SRCS + getwchar.cpp + HDRS + getwchar.h + DEPENDS + libc.src.stdio.stdin + libc.src.__support.File.file + libc.src.__support.libc_errno +) + +add_entrypoint_object( + putwc + SRCS + putwc.cpp + HDRS + putwc.h + DEPENDS + libc.hdr.types.FILE + libc.hdr.types.wint_t + libc.src.__support.File.file + libc.src.__support.libc_errno + libc.src.__support.macros.null_check +) + +add_entrypoint_object( + putwchar + SRCS + putwchar.cpp + HDRS + putwchar.h + DEPENDS + libc.src.stdio.stdout + libc.src.__support.File.file + libc.src.__support.libc_errno +) diff --git a/libc/src/wchar/fwide.cpp b/libc/src/wchar/fwide.cpp new file mode 100644 index 0000000000000..ada4c30ba9c59 --- /dev/null +++ b/libc/src/wchar/fwide.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the implementation of the fwide function, which sets and +/// gets the orientation of a stream. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/wchar/fwide.h" +#include "hdr/types/FILE.h" +#include "src/__support/File/file.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, fwide, (::FILE * stream, int mode)) { + LIBC_CRASH_ON_NULLPTR(stream); + auto *f = reinterpret_cast(stream); + + File::Orientation orient; + if (mode > 0) { + orient = f->try_set_orientation(File::Orientation::WIDE); + } else if (mode < 0) { + orient = f->try_set_orientation(File::Orientation::BYTE); + } else { + orient = f->get_orientation(); + } + + if (orient == File::Orientation::WIDE) + return 1; + if (orient == File::Orientation::BYTE) + return -1; + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/fwide.h b/libc/src/wchar/fwide.h new file mode 100644 index 0000000000000..1463af6db9604 --- /dev/null +++ b/libc/src/wchar/fwide.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the prototype of the fwide function, which sets and +/// gets the orientation of a stream. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_FWIDE_H +#define LLVM_LIBC_SRC_WCHAR_FWIDE_H + +#include "hdr/types/FILE.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int fwide(::FILE *stream, int mode); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_FWIDE_H diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 3fd279f19c755..dd8a3fd4b735c 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -191,7 +191,7 @@ add_libc_test( ) add_libc_test( - wmemset_test + wmemset_test SUITE libc_wchar_unittests SRCS @@ -361,7 +361,7 @@ add_libc_test( DEPENDS libc.src.wchar.wcscat ) - + add_libc_test( wcsstr_test SUITE @@ -537,3 +537,179 @@ add_libc_unittest( DEPENDS libc.src.wchar.wcsxfrm ) + +add_libc_test( + fwide_test + SUITE + libc_wchar_unittests + SRCS + fwide_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.wchar.fwide +) + +add_libc_test( + fputwc_test + SUITE + libc_wchar_unittests + SRCS + fputwc_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fread + libc.src.stdio.fwrite + libc.src.stdio.ferror + libc.src.wchar.fputwc + libc.src.wchar.fwide + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + fgetwc_test + SUITE + libc_wchar_unittests + SRCS + fgetwc_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.feof + libc.src.stdio.ferror + libc.src.stdio.fwrite + libc.src.wchar.fgetwc + libc.src.wchar.fwide + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + fputws_test + SUITE + libc_wchar_unittests + SRCS + fputws_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fread + libc.src.stdio.ferror + libc.src.wchar.fputws + libc.src.wchar.fwide + 
libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + fgetws_test + SUITE + libc_wchar_unittests + SRCS + fgetws_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fwrite + libc.src.stdio.ferror + libc.src.wchar.fgetws + libc.src.wchar.fwide + libc.src.wchar.wcscmp + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + ungetwc_test + SUITE + libc_wchar_unittests + SRCS + ungetwc_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fwrite + libc.src.stdio.feof + libc.src.stdio.ferror + libc.src.stdio.fseek + libc.src.wchar.fgetwc + libc.src.wchar.fwide + libc.src.wchar.ungetwc +) + +add_libc_test( + getwc_test + SUITE + libc_wchar_unittests + SRCS + getwc_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fwrite + libc.src.stdio.feof + libc.src.stdio.ferror + libc.src.wchar.getwc + libc.src.wchar.fwide + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + getwchar_test + SUITE + libc_wchar_unittests + SRCS + getwchar_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fwrite + libc.src.stdio.feof + libc.src.stdio.ferror + libc.src.stdio.stdin + libc.src.wchar.getwchar + libc.src.wchar.fwide + libc.src.wchar.ungetwc + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + putwc_test + SUITE + libc_wchar_unittests + SRCS + putwc_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fread + libc.src.stdio.ferror + libc.src.wchar.putwc + libc.src.wchar.fwide + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + putwchar_test + SUITE + libc_wchar_unittests + SRCS + putwchar_test.cpp + DEPENDS + libc.include.stdio + libc.src.stdio.fopen + libc.src.stdio.fclose + libc.src.stdio.fread + libc.src.stdio.ferror + libc.src.stdio.stdout + libc.src.wchar.putwchar + libc.src.wchar.fwide + 
libc.test.UnitTest.ErrnoCheckingTest +) diff --git a/libc/test/src/wchar/fwide_test.cpp b/libc/test/src/wchar/fwide_test.cpp new file mode 100644 index 0000000000000..6131310a13018 --- /dev/null +++ b/libc/test/src/wchar/fwide_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for fwide -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdio/fclose.h" +#include "src/stdio/fopen.h" +#include "src/wchar/fwide.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcFwideTest, QueryInitial) { + auto FILENAME = + libc_make_test_file_path(APPEND_LIBC_TEST("fwide_query.test")); + ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); + ASSERT_FALSE(file == nullptr); + + // Initial orientation should be unoriented (0) + EXPECT_EQ(LIBC_NAMESPACE::fwide(file, 0), 0); + + ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); +} + +TEST(LlvmLibcFwideTest, OrientWide) { + auto FILENAME = libc_make_test_file_path(APPEND_LIBC_TEST("fwide_wide.test")); + ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); + ASSERT_FALSE(file == nullptr); + + // Setting mode > 0 should return > 0 (wide oriented) + EXPECT_GT(LIBC_NAMESPACE::fwide(file, 1), 0); + + // Subsequent orientation queries/attempts should still return > 0 + EXPECT_GT(LIBC_NAMESPACE::fwide(file, 0), 0); + EXPECT_GT(LIBC_NAMESPACE::fwide(file, -1), 0); + + ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); +} + +TEST(LlvmLibcFwideTest, OrientByte) { + auto FILENAME = libc_make_test_file_path(APPEND_LIBC_TEST("fwide_byte.test")); + ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); + ASSERT_FALSE(file == nullptr); + + // Setting mode < 0 should return < 0 (byte oriented) + EXPECT_LT(LIBC_NAMESPACE::fwide(file, -1), 0); + + // Subsequent 
orientation queries/attempts should still return < 0 + EXPECT_LT(LIBC_NAMESPACE::fwide(file, 0), 0); + EXPECT_LT(LIBC_NAMESPACE::fwide(file, 1), 0); + + ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); +} From 35314409dace1c11ce13cb41b15922f258bef8e0 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Fri, 8 May 2026 09:52:12 +0100 Subject: [PATCH 010/538] [Dexter][NFC] Add split step-data collection methods for DAP (#196350) This patch adds some extra state collection methods to DebuggerBase and implements them for DAP only. These methods are used to fetch a stacktrace without variable information, and to populate variable information into a StepIR containing only a stacktrace. These methods are currently unused, making this patch NFC, but this is a necessary precursor to the new script model, where we examine the stacktrace to determine what variable info we will collect. As part of the stacktrace-collection function, we also fetch the instruction address for each stack frame, if it is made available by the debugger; to enable this, this patch adds a new value with default `None` to `FrameIR`. 
--- .../dexter/dex/debugger/DAP.py | 66 ++++++++++++++++++- .../dexter/dex/debugger/DebuggerBase.py | 9 +++ .../dexter/dex/debugger/dbgeng/dbgeng.py | 7 ++ .../dexter/dex/debugger/lldb/LLDB.py | 7 ++ .../dex/debugger/visualstudio/VisualStudio.py | 11 ++++ .../dexter/dex/dextIR/FrameIR.py | 11 +++- 6 files changed, 109 insertions(+), 2 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py index 2e264a87a1f24..86d3217cba07a 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py @@ -18,7 +18,7 @@ import threading import time from enum import Enum -from typing import Optional +from typing import List, Optional from dex.debugger.DebuggerBase import DebuggerBase, watch_is_active from dex.dextIR import FrameIR, LocIR, StepIR, StopReason, ValueIR @@ -937,6 +937,70 @@ def _get_step_info(self, watches, step_index): program_state=ProgramState(state_frames), ) + def get_stack_frames(self, step_index: int) -> StepIR: + """Returns a StepIR with stackframes and source locations (but no watched values).""" + assert ( + not self._debugger_state.is_running + ), "Cannot get step info while debugger is running!" + trace_req_id = self.send_message( + self.make_request("stackTrace", {"threadId": self._debugger_state.thread}) + ) + trace_response = self._await_response(trace_req_id) + if not trace_response["success"]: + raise DebuggerException("failed to get stack frames") + stackframes = trace_response["body"]["stackFrames"] + + frames = [] + + for stackframe in stackframes: + # No source, skip the frame! Currently I've only observed this for frames below main, so we break here; if + # it happens elsewhere, then this will break more stuff and we'll come up with a better solution. 
+ if ( + stackframe.get("source") is None + or stackframe["source"].get("path") is None + ): + break + + loc_dict = { + "path": self._external_to_debug_path(stackframe["source"]["path"]), + "lineno": stackframe["line"], + "column": stackframe["column"], + } + loc = LocIR(**loc_dict) + frame = FrameIR( + function=self._sanitize_function_name(stackframe["name"]), + is_inlined=stackframe["name"].startswith("[Inline Frame]"), + loc=loc, + instruction_addr=stackframe.get("instructionPointerReference", None), + ) + + # We skip frames that are below "main", since we do not expect those to be user code. + fname = frame.function or "" # pylint: disable=no-member + if any(name in fname for name in self.frames_below_main): + break + + frames.append(frame) + + if len(frames) == 1 and frames[0].function is None: + frames = [] + + reason = self._translate_stop_reason(self._debugger_state.stopped_reason) + + return StepIR( + step_index=step_index, + frames=frames, + stop_reason=reason, + ) + + def collect_watches(self, step: StepIR, watches: List[str]): + """Evaluates the provided watches and stores their evaluation results (ValueIR) in the provided step.""" + frame_idx = 0 + if not watches: + return + active_exprs = set(watches) + for expr in active_exprs: + step.watches[expr] = self.evaluate_expression(expr, frame_idx) + @property def is_running(self): return self._debugger_state.is_running diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerBase.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerBase.py index f8bee4ecb4231..2f268a414381f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerBase.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerBase.py @@ -10,6 +10,7 @@ import os import sys import traceback +from typing import List import unittest from types import SimpleNamespace @@ -217,6 +218,14 @@ def get_step_info(self, watches, step_index): def _get_step_info(self, watches, 
step_index): pass + @abc.abstractmethod + def get_stack_frames(self, step_index: int) -> StepIR: + pass + + @abc.abstractmethod + def collect_watches(self, step: StepIR, watches: List[str]): + pass + @abc.abstractproperty def is_running(self): pass diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/dbgeng/dbgeng.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/dbgeng/dbgeng.py index acb66a90895ea..921cfc70655f1 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/dbgeng/dbgeng.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/dbgeng/dbgeng.py @@ -8,6 +8,7 @@ import sys import os import platform +from typing import List from dex.debugger.DebuggerBase import DebuggerBase, watch_is_active from dex.dextIR import FrameIR, LocIR, StepIR, StopReason, ValueIR @@ -167,6 +168,12 @@ def _get_step_info(self, watches, step_index): program_state=ProgramState(state_frames), ) + def get_stack_frames(self, step_index: int) -> StepIR: + raise NotImplementedError("--use-script debugging not supported in dbgeng yet.") + + def collect_watches(self, step: StepIR, watches: List[str]): + raise NotImplementedError("--use-script debugging not supported in dbgeng yet.") + @property def is_running(self): return False # We're never free-running diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py index 17a22eb662cab..8f90981f67e4c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py @@ -11,6 +11,7 @@ import shlex from subprocess import CalledProcessError, check_output, STDOUT import sys +from typing import List from dex.debugger.DebuggerBase import DebuggerBase, watch_is_active from dex.debugger.DAP import DAP @@ -318,6 +319,12 @@ def _get_step_info(self, watches, step_index): program_state=ProgramState(state_frames), ) + def 
get_stack_frames(self, step_index: int) -> StepIR: + raise NotImplementedError("--use-script debugging not supported in lldb yet.") + + def collect_watches(self, step: StepIR, watches: List[str]): + raise NotImplementedError("--use-script debugging not supported in lldb yet.") + @property def is_running(self): # We're not running in async mode so this is always False. diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index 1a1c8e9a8d9aa..406296a1605e9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -12,6 +12,7 @@ from enum import IntEnum from pathlib import PurePath, Path from collections import defaultdict, namedtuple +from typing import List from dex.command.CommandBase import StepExpectInfo from dex.debugger.DebuggerBase import DebuggerBase, watch_is_active @@ -392,6 +393,16 @@ def _get_step_info(self, watches, step_index): program_state=program_state, ) + def get_stack_frames(self, step_index: int) -> StepIR: + raise NotImplementedError( + "--use-script debugging not supported in visual studio yet." + ) + + def collect_watches(self, step: StepIR, watches: List[str]): + raise NotImplementedError( + "--use-script debugging not supported in visual studio yet." + ) + @property def is_running(self): return self._mode == VisualStudio.dbgRunMode diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/FrameIR.py b/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/FrameIR.py index a2c0523b47b79..4dd9a8b63ccc7 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/FrameIR.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/FrameIR.py @@ -4,13 +4,22 @@ # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
# See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from typing import Optional + from dex.dextIR.LocIR import LocIR class FrameIR: """Data class which represents a frame in the call stack""" - def __init__(self, function: str, is_inlined: bool, loc: LocIR): + def __init__( + self, + function: str, + is_inlined: bool, + loc: LocIR, + instruction_addr: Optional[str] = None, + ): self.function = function self.is_inlined = is_inlined self.loc = loc + self.instruction_addr = instruction_addr From 016894861f0dc650b4fc22d57034c2f41b7f8481 Mon Sep 17 00:00:00 2001 From: Konrad Kleine Date: Fri, 8 May 2026 11:18:45 +0200 Subject: [PATCH 011/538] [libclc] Use spirv[64]-mesa-mesa3d triple in README.md (#196483) Now that #194607 landed we use a normalized triple in the README for the SPIRV targets. Before `spirv-mesa3d-` and `spirv64-mesa3d-` were being used and those will be normalized to `spirv-unknown-mesa3d` and `spirv64-unknown-mesa3d` by the following command in `runtimes/CMakeLists.txt` with this command: ```console $ clang --target=spirv-mesa3d- -print-target-triple spirv-unknown-mesa3d ``` This is because in `llvm/lib/TargetParser/Triple.cpp` the term `mesa3d` is recognized as an OS and placed in third position. The install path for `libclc.spv` there ends up in `spirv-unknown-mesa3d/libclc.spv`. 
With this change we suggest using triples that "survive" the
normalization:

```console
$ clang --target=spirv-mesa-mesa3d -print-target-triple
spirv-mesa-mesa3d
```

See also this discussion:
https://github.com/llvm/llvm-project/pull/194607#issuecomment-4378126607
---
 libclc/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libclc/README.md b/libclc/README.md
index 81ce1553d139c..e096a070ee878 100644
--- a/libclc/README.md
+++ b/libclc/README.md
@@ -61,9 +61,9 @@ cmake ../llvm -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release
 #### Configure for SPIR-V targets
 ```
 cmake ../llvm -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release \
-  -DRUNTIMES_spirv-mesa3d-_LLVM_ENABLE_RUNTIMES=libclc \
-  -DRUNTIMES_spirv64-mesa3d-_LLVM_ENABLE_RUNTIMES=libclc \
-  -DLLVM_RUNTIME_TARGETS="spirv-mesa3d-;spirv64-mesa3d-"
+  -DRUNTIMES_spirv-mesa-mesa3d_LLVM_ENABLE_RUNTIMES=libclc \
+  -DRUNTIMES_spirv64-mesa-mesa3d_LLVM_ENABLE_RUNTIMES=libclc \
+  -DLLVM_RUNTIME_TARGETS="spirv-mesa-mesa3d;spirv64-mesa-mesa3d"
 ```
 
 To build multiple targets, pass them as a semicolon-separated list in

From 242326570654760dd6bbd7c4120b39615518a1cf Mon Sep 17 00:00:00 2001
From: Ebuka Ezike
Date: Fri, 8 May 2026 10:21:45 +0100
Subject: [PATCH 012/538] [lldb-dap] Fix build when using precompiled header
 and Xcode generator. (#196366)

When building with precompiled headers and Xcode as a generator, it adds
`obj.lldbDAP.dir/${BUILD_TYPE}/cmake_pch.xxx` but does not generate one
causing the build to fail.

This might have to do with `add_llvm_library` adding a source file
`Dummy.c` to any object it creates if using Xcode as a generator and
`lldbDAP` object not declaring its LINK_LIBS and LINK_COMPONENTS.
--- lldb/tools/lldb-dap/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index 1a8f167ef0a42..0e309d79376ac 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -77,6 +77,13 @@ add_lldb_library(lldbDAP OBJECT Protocol/ProtocolRequests.cpp ) +# When building with precompiled headers and Xcode as a generator, +# It adds obj.lldbDAP.dir/${BUILD_TYPE}/cmake_pch.xxx but does not generate one +# causing the build to fail. +if(CMAKE_GENERATOR STREQUAL "Xcode") + set_target_properties(lldbDAP PROPERTIES DISABLE_PRECOMPILE_HEADERS ON) +endif() + target_include_directories(lldbDAP PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) From 589f419f460971ff36884c099869f830cdfec463 Mon Sep 17 00:00:00 2001 From: Jaydeep Chauhan Date: Fri, 8 May 2026 14:53:59 +0530 Subject: [PATCH 013/538] [X86][GlobalISel] Added support for FNEG (#167919) --- .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 6 + .../Target/X86/GISel/X86RegisterBankInfo.cpp | 1 + .../test/CodeGen/X86/GlobalISel/isel-fneg.mir | 116 ++++++++++++++++++ llvm/test/CodeGen/X86/isel-fneg.ll | 69 +++++++++-- 4 files changed, 180 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/X86/GlobalISel/isel-fneg.mir diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index d6df8cb97cfc8..c0bb0f5f856e5 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -128,6 +128,12 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI, G_FSINCOS, G_FCEIL, G_FFLOOR}) .libcall(); + getActionDefinitionsBuilder(G_FNEG) + .legalFor(UseX87 && !HasSSE1, {s32}) + .legalFor(UseX87 && !HasSSE2, {s64}) + .legalFor(UseX87, {s80}) + .lower(); + getActionDefinitionsBuilder(G_FSQRT) .legalFor(HasSSE1 || UseX87, {s32}) .legalFor(HasSSE2 || 
UseX87, {s64}) diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp index ce3b8ccb7b061..a0c4d3a2633aa 100644 --- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp @@ -292,6 +292,7 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { switch (Opc) { case TargetOpcode::G_FSQRT: case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FNEG: case TargetOpcode::G_FPTRUNC: case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_FPEXTLOAD: diff --git a/llvm/test/CodeGen/X86/GlobalISel/isel-fneg.mir b/llvm/test/CodeGen/X86/GlobalISel/isel-fneg.mir new file mode 100644 index 0000000000000..7aa5478dcdd95 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/isel-fneg.mir @@ -0,0 +1,116 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i686-linux-gnu -run-pass=regbankselect,instruction-select %s -o - | FileCheck %s --check-prefixes GISEL-I686 + +--- +name: fneg_f64 +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 0, size: 8, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: fneg_f64 + ; GISEL-I686: [[DEF:%[0-9]+]]:rfp64 = IMPLICIT_DEF + ; GISEL-I686-NEXT: [[CHS_Fp64_:%[0-9]+]]:rfp64 = CHS_Fp64 [[DEF]], implicit-def dead $fpsw + ; GISEL-I686-NEXT: $fp0 = COPY [[CHS_Fp64_]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %0:_(s64) = IMPLICIT_DEF + %2:_(s64) = G_FNEG %0 + $fp0 = COPY %2(s64) + RET 0, implicit $fp0 +... 
+--- +name: fneg_f32 +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: fneg_f32 + ; GISEL-I686: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m %fixed-stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (invariant load (s32) from %fixed-stack.0, align 16) + ; GISEL-I686-NEXT: [[CHS_Fp32_:%[0-9]+]]:rfp32 = CHS_Fp32 [[LD_Fp32m]], implicit-def dead $fpsw + ; GISEL-I686-NEXT: $fp0 = COPY [[CHS_Fp32_]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %0:_(s32) = G_LOAD %1(p0) :: (invariant load (s32) from %fixed-stack.0, align 16) + %2:_(s32) = G_FNEG %0 + $fp0 = COPY %2(s32) + RET 0, implicit $fp0 +... +--- +name: fneg_f64_mem +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, type: default, offset: 0, size: 4, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: fneg_f64_mem + ; GISEL-I686: [[DEF:%[0-9]+]]:rfp64 = IMPLICIT_DEF + ; GISEL-I686-NEXT: [[CHS_Fp64_:%[0-9]+]]:rfp64 = CHS_Fp64 [[DEF]], implicit-def dead $fpsw + ; GISEL-I686-NEXT: $fp0 = COPY [[CHS_Fp64_]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.1 + %0:_(s64) = IMPLICIT_DEF + %2:_(s64) = G_FNEG %0 + $fp0 = COPY %2(s64) + RET 0, implicit $fp0 +... 
+--- +name: fneg_f32_mem +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, type: default, offset: 0, size: 4, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: fneg_f32_mem + ; GISEL-I686: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (invariant load (p0) from %fixed-stack.1) + ; GISEL-I686-NEXT: [[CHS_Fp32_:%[0-9]+]]:rfp32 = CHS_Fp32 [[LD_Fp32m]], implicit-def dead $fpsw + ; GISEL-I686-NEXT: $fp0 = COPY [[CHS_Fp32_]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0 + %2:_(p0) = G_FRAME_INDEX %fixed-stack.1 + %0:_(p0) = G_LOAD %2(p0) :: (invariant load (p0) from %fixed-stack.1, align 16) + %3:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %1:_(s32) = G_LOAD %3(p0) :: (invariant load (p0) from %fixed-stack.0) + %4:_(s32) = G_FNEG %1 + $fp0 = COPY %4(s32) + RET 0, implicit $fp0 +... 
+--- +name: test_fp80 +alignment: 16 +legalized: true +fixedStack: + - { id: 0, type: default, offset: 0, size: 10, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.1: + ; GISEL-I686-LABEL: name: test_fp80 + ; GISEL-I686: [[LD_Fp80m:%[0-9]+]]:rfp80 = nofpexcept LD_Fp80m %fixed-stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (invariant load (s80) from %fixed-stack.0, align 16) + ; GISEL-I686-NEXT: [[CHS_Fp80_:%[0-9]+]]:rfp80 = CHS_Fp80 [[LD_Fp80m]], implicit-def dead $fpsw + ; GISEL-I686-NEXT: $fp0 = COPY [[CHS_Fp80_]] + ; GISEL-I686-NEXT: RET 0, implicit $fp0 + %1:_(p0) = G_FRAME_INDEX %fixed-stack.0 + %0:_(s80) = G_LOAD %1(p0) :: (invariant load (s80) from %fixed-stack.0, align 16) + %2:_(s80) = G_FNEG %0 + $fp0 = COPY %2(s80) + RET 0, implicit $fp0 +... diff --git a/llvm/test/CodeGen/X86/isel-fneg.ll b/llvm/test/CodeGen/X86/isel-fneg.ll index 77b3f263213a9..77a5ad9b747ea 100644 --- a/llvm/test/CodeGen/X86/isel-fneg.ll +++ b/llvm/test/CodeGen/X86/isel-fneg.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86 -; DISABLED: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,FASTISEL-SSE-X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 -mattr=+sse | FileCheck %s 
--check-prefixes=X86,SSE-X86,SDAG-SSE-X86 -; DISABLED: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,GISEL-SSE-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,GISEL-SSE-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,FASTISEL-SSE-X64 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,SDAG-SSE-X64 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,GISEL-SSE-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,GISEL-SSE-X64 define double @fneg_f64(double %x) nounwind { ; X86-LABEL: fneg_f64: @@ -53,15 +53,41 @@ define float @fneg_f32(float %x) nounwind { ; SDAG-X86-NEXT: fchs ; SDAG-X86-NEXT: retl ; -; SSE-X86-LABEL: fneg_f32: -; SSE-X86: # %bb.0: -; SSE-X86-NEXT: pushl %eax -; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; SSE-X86-NEXT: movss %xmm0, (%esp) -; SSE-X86-NEXT: flds (%esp) -; SSE-X86-NEXT: popl %eax -; SSE-X86-NEXT: retl +; GISEL-X86-LABEL: fneg_f32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: flds {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fchs +; GISEL-X86-NEXT: retl +; +; FASTISEL-SSE-X86-LABEL: fneg_f32: +; FASTISEL-SSE-X86: # %bb.0: +; FASTISEL-SSE-X86-NEXT: pushl %eax +; FASTISEL-SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FASTISEL-SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; FASTISEL-SSE-X86-NEXT: movss %xmm0, (%esp) +; FASTISEL-SSE-X86-NEXT: flds (%esp) +; FASTISEL-SSE-X86-NEXT: popl %eax +; FASTISEL-SSE-X86-NEXT: retl +; +; SDAG-SSE-X86-LABEL: fneg_f32: +; SDAG-SSE-X86: # %bb.0: +; 
SDAG-SSE-X86-NEXT: pushl %eax +; SDAG-SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SDAG-SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SDAG-SSE-X86-NEXT: movss %xmm0, (%esp) +; SDAG-SSE-X86-NEXT: flds (%esp) +; SDAG-SSE-X86-NEXT: popl %eax +; SDAG-SSE-X86-NEXT: retl +; +; GISEL-SSE-X86-LABEL: fneg_f32: +; GISEL-SSE-X86: # %bb.0: +; GISEL-SSE-X86-NEXT: pushl %eax +; GISEL-SSE-X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; GISEL-SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; GISEL-SSE-X86-NEXT: movl %eax, (%esp) +; GISEL-SSE-X86-NEXT: flds (%esp) +; GISEL-SSE-X86-NEXT: popl %eax +; GISEL-SSE-X86-NEXT: retl ; ; FASTISEL-SSE-X64-LABEL: fneg_f32: ; FASTISEL-SSE-X64: # %bb.0: @@ -143,6 +169,15 @@ define void @fneg_f32_mem(ptr %x, ptr %y) nounwind { ; SDAG-X86-NEXT: movl %edx, (%eax) ; SDAG-X86-NEXT: retl ; +; GISEL-X86-LABEL: fneg_f32_mem: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: flds (%eax) +; GISEL-X86-NEXT: fchs +; GISEL-X86-NEXT: fstps (%ecx) +; GISEL-X86-NEXT: retl +; ; FASTISEL-SSE-X86-LABEL: fneg_f32_mem: ; FASTISEL-SSE-X86: # %bb.0: ; FASTISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -161,6 +196,15 @@ define void @fneg_f32_mem(ptr %x, ptr %y) nounwind { ; SDAG-SSE-X86-NEXT: movl %edx, (%eax) ; SDAG-SSE-X86-NEXT: retl ; +; GISEL-SSE-X86-LABEL: fneg_f32_mem: +; GISEL-SSE-X86: # %bb.0: +; GISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-SSE-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; GISEL-SSE-X86-NEXT: xorl (%eax), %edx +; GISEL-SSE-X86-NEXT: movl %edx, (%ecx) +; GISEL-SSE-X86-NEXT: retl +; ; FASTISEL-SSE-X64-LABEL: fneg_f32_mem: ; FASTISEL-SSE-X64: # %bb.0: ; FASTISEL-SSE-X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -206,3 +250,4 @@ define x86_fp80 @test_fp80(x86_fp80 %a) nounwind { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; SSE-X64: {{.*}} +; SSE-X86: {{.*}} From 11e06ca1c4ec5e3f7077607c500fb3c63ba24484 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 8 May 2026 10:26:16 +0100 Subject: [PATCH 014/538] [DAG] canCreateUndefOrPoison - poison generating flags / out of range shift amounts only generate poison (#196489) Matches ValueTracking / GISel implementations - although testing options are limited until DAG has actual uses of UndefPoisonKind::UndefOnly --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index a50db7c007890..b1baa007b00cb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5843,7 +5843,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const { - if (ConsiderFlags && Op->hasPoisonGeneratingFlags()) + if (ConsiderFlags && includesPoison(Kind) && Op->hasPoisonGeneratingFlags()) return true; unsigned Opcode = Op.getOpcode(); @@ -5970,7 +5970,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::SRA: // If the max shift amount isn't in range, then the shift can // create poison. 
- return !getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1); + return includesPoison(Kind) && + !getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1); case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ_ZERO_UNDEF: From d1ac3dfa54b9a250851577b8604462452658f8f1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 8 May 2026 10:29:53 +0100 Subject: [PATCH 015/538] [DAG] canCreateUndefOrPoison - ISD::AssertSext/Zext/Align/NoFPClass only generate poison (#196492) --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b1baa007b00cb..15fc74b6cc7c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5853,7 +5853,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::AssertAlign: case ISD::AssertNoFPClass: // Assertion nodes can create poison if the assertion fails. - return true; + return includesPoison(Kind); case ISD::FREEZE: case ISD::CONCAT_VECTORS: From 218b7b43b269e578c61bf0361194df1562dbeb28 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Fri, 8 May 2026 11:30:11 +0200 Subject: [PATCH 016/538] [X86][GlobalISel] Support globals in pic mode (#170038) Introduce G_WRAPPER_RIP it is the same node as in DAG. It is required to make legalization possible when a load from stub is required to obtain a pointer to a global value. It allows to avoid manual selection in X86InstructionSelector. Also added a missing check on X86SelectAddress failure. 
--- .../X86/GISel/X86InstructionSelector.cpp | 46 ++----- .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 33 ++++- llvm/lib/Target/X86/GISel/X86LegalizerInfo.h | 3 + llvm/lib/Target/X86/X86InstrGISel.td | 9 ++ llvm/test/CodeGen/X86/GlobalISel/GV.ll | 117 ++++++++++++------ .../X86/GlobalISel/x86_64-legalize-GV.mir | 6 +- llvm/test/CodeGen/X86/machine-block-hash.mir | 12 +- 7 files changed, 148 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 985a9eb30b5f5..a77049fba133e 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -625,7 +625,8 @@ static bool X86SelectAddress(MachineInstr &I, const X86TargetMachine &TM, } break; } - case TargetOpcode::G_GLOBAL_VALUE: { + case TargetOpcode::G_GLOBAL_VALUE: + case X86::G_WRAPPER_RIP: { auto GV = I.getOperand(1).getGlobal(); if (GV->isThreadLocal()) { return false; // TODO: we don't support TLS yet. @@ -633,24 +634,15 @@ static bool X86SelectAddress(MachineInstr &I, const X86TargetMachine &TM, // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return false; - - unsigned int GVOpFlags = STI.classifyGlobalReference(GV); - - // If it's a stub, we need to do a load to find the real address, and we - // can't fold that into just an AM. The load will come from lowering this - // G_GLOBAL_VALUE later. - if (isGlobalStubReference(GVOpFlags)) - break; - - // If it's not a stub, point AM directly at the global. AM.GV = GV; - AM.GVOpFlags = GVOpFlags; + AM.GVOpFlags = STI.classifyGlobalReference(GV); // TODO: This reference is relative to the pic base. not supported yet. if (isGlobalRelativeToPICBase(AM.GVOpFlags)) return false; - if (STI.isPICStyleRIPRel()) { + if (STI.isPICStyleRIPRel() || AM.GVOpFlags == X86II::MO_GOTPCREL || + AM.GVOpFlags == X86II::MO_GOTPCREL_NORELAX) { // Use rip-relative addressing. 
assert(AM.Base.Reg == 0 && AM.IndexReg == 0 && "RIP-relative addresses can't have additional register operands"); @@ -779,27 +771,12 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, "unexpected instruction"); X86AddressMode AM; - unsigned NewOpc; - - const GlobalValue *GV = I.getOperand(1).getGlobal(); - const auto GVOpFlags = STI.classifyGlobalReference(GV); - if (isGlobalStubReference(GVOpFlags)) { - // If it's a stub, we need a load from the GOT instead of lea. - AM.GV = GV; - AM.GVOpFlags = GVOpFlags; - if (STI.isPICStyleRIPRel() || AM.GVOpFlags == X86II::MO_GOTPCREL || - AM.GVOpFlags == X86II::MO_GOTPCREL_NORELAX) - AM.Base.Reg = X86::RIP; - - NewOpc = STI.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm; - } else { - if (!X86SelectAddress(I, TM, MRI, STI, AM)) - return false; + if (!X86SelectAddress(I, TM, MRI, STI, AM)) + return false; - const Register DefReg = I.getOperand(0).getReg(); - LLT Ty = MRI.getType(DefReg); - NewOpc = getLeaOP(Ty, STI); - } + const Register DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + unsigned NewOpc = getLeaOP(Ty, STI); I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); @@ -2001,7 +1978,8 @@ X86InstructionSelector::selectAddr(MachineOperand &Root) const { MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); MachineInstr *Ptr = MRI.getVRegDef(Root.getReg()); X86AddressMode AM; - X86SelectAddress(*Ptr, TM, MRI, STI, AM); + if (!X86SelectAddress(*Ptr, TM, MRI, STI, AM)) + return std::nullopt; if (AM.IndexReg) return std::nullopt; diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index c0bb0f5f856e5..c342511e99b77 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -335,7 +335,9 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .widenScalarToNextPow2(1, /*Min*/ 32) .clampScalar(1, s32, sMaxScalar); - getActionDefinitionsBuilder({G_FRAME_INDEX, 
G_GLOBAL_VALUE}).legalFor({p0}); + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + + getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({p0}); // load/store: add more corner cases for (unsigned Op : {G_LOAD, G_STORE}) { @@ -641,6 +643,8 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, return legalizeGETROUNDING(MI, MRI, Helper); case TargetOpcode::G_SET_ROUNDING: return legalizeSETROUNDING(MI, MRI, Helper); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeGLOBAL_VALUE(MI, MRI, Helper); } llvm_unreachable("expected switch to return"); } @@ -1011,6 +1015,33 @@ bool X86LegalizerInfo::legalizeSETROUNDING(MachineInstr &MI, return true; } +bool X86LegalizerInfo::legalizeGLOBAL_VALUE(MachineInstr &MI, + MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const { + const GlobalValue *GV = MI.getOperand(1).getGlobal(); + Register Dst = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(Dst); + unsigned GVOpFlags = Subtarget.classifyGlobalReference(GV); + + // For stub references (GOT/PLT), we need G_WRAPPER_RIP + load + if (isGlobalStubReference(GVOpFlags)) { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineFunction &MF = MIRBuilder.getMF(); + + Register StubAddr = MRI.createGenericVirtualRegister(DstTy); + MIRBuilder.buildInstr(X86::G_WRAPPER_RIP) + .addDef(StubAddr) + .addGlobalAddress(GV); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad, DstTy, + Align(DstTy.getSizeInBytes())); + MIRBuilder.buildLoad(Dst, StubAddr, *MMO); + MI.eraseFromParent(); + } + return true; +} + bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { return true; diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h index 09c727c8e8685..58be7bb7d02b8 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h @@ -60,6 +60,9 @@ 
class X86LegalizerInfo : public LegalizerInfo { bool legalizeSETROUNDING(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; + + bool legalizeGLOBAL_VALUE(MachineInstr &MI, MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/X86/X86InstrGISel.td b/llvm/lib/Target/X86/X86InstrGISel.td index b0c6bb6f61ad8..521923175c444 100644 --- a/llvm/lib/Target/X86/X86InstrGISel.td +++ b/llvm/lib/Target/X86/X86InstrGISel.td @@ -41,7 +41,16 @@ def G_FLDCW16 : X86GenericInstruction { let mayLoad = true; } +// RIP-relative address wrapper for PIC mode global access. +// Equivalent to X86ISD::WrapperRIP in SelectionDAG. +def G_WRAPPER_RIP : X86GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = false; +} + def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; diff --git a/llvm/test/CodeGen/X86/GlobalISel/GV.ll b/llvm/test/CodeGen/X86/GlobalISel/GV.ll index 1681992ddca4d..36d6099b8e110 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/GV.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/GV.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64 ; RUN: llc -mtriple=x86_64-apple-darwin -global-isel -verify-machineinstrs -relocation-model=pic < %s -o - | FileCheck %s --check-prefix=X64_DARWIN_PIC -; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32 -; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32ABI +; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X86 +; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -verify-machineinstrs < %s -o - | FileCheck 
%s --check-prefix=X32 @g_int = dso_local global i32 0, align 4 +@g_int_stub = global i32 0, align 4 @external_g_int = external global i32, align 4 -; Function Attrs: noinline nounwind optnone uwtable -define dso_local ptr @test_global_ptrv() #3 { +define dso_local ptr @test_global_ptrv() { ; X64-LABEL: test_global_ptrv: ; X64: # %bb.0: # %entry ; X64-NEXT: leaq g_int, %rax @@ -19,22 +19,21 @@ define dso_local ptr @test_global_ptrv() #3 { ; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax ; X64_DARWIN_PIC-NEXT: retq ; +; X86-LABEL: test_global_ptrv: +; X86: # %bb.0: # %entry +; X86-NEXT: leal g_int, %eax +; X86-NEXT: retl +; ; X32-LABEL: test_global_ptrv: ; X32: # %bb.0: # %entry ; X32-NEXT: leal g_int, %eax -; X32-NEXT: retl -; -; X32ABI-LABEL: test_global_ptrv: -; X32ABI: # %bb.0: # %entry -; X32ABI-NEXT: leal g_int, %eax -; X32ABI-NEXT: movl %eax, %eax -; X32ABI-NEXT: retq +; X32-NEXT: movl %eax, %eax +; X32-NEXT: retq entry: ret ptr @g_int } -; Function Attrs: noinline nounwind optnone uwtable -define dso_local i32 @test_global_valv() #3 { +define dso_local i32 @test_global_valv() { ; X64-LABEL: test_global_valv: ; X64: # %bb.0: # %entry ; X64-NEXT: movl g_int, %eax @@ -45,20 +44,69 @@ define dso_local i32 @test_global_valv() #3 { ; X64_DARWIN_PIC-NEXT: movl _g_int(%rip), %eax ; X64_DARWIN_PIC-NEXT: retq ; +; X86-LABEL: test_global_valv: +; X86: # %bb.0: # %entry +; X86-NEXT: movl g_int, %eax +; X86-NEXT: retl +; ; X32-LABEL: test_global_valv: ; X32: # %bb.0: # %entry ; X32-NEXT: movl g_int, %eax -; X32-NEXT: retl -; -; X32ABI-LABEL: test_global_valv: -; X32ABI: # %bb.0: # %entry -; X32ABI-NEXT: movl g_int, %eax -; X32ABI-NEXT: retq +; X32-NEXT: retq entry: %0 = load i32, ptr @g_int, align 4 ret i32 %0 } +define dso_local ptr @test_global_stub_ptrv() { +; X64-LABEL: test_global_stub_ptrv: +; X64: # %bb.0: +; X64-NEXT: movq g_int_stub@GOTPCREL(%rip), %rax +; X64-NEXT: retq +; +; X64_DARWIN_PIC-LABEL: test_global_stub_ptrv: +; X64_DARWIN_PIC: ## %bb.0: +; 
X64_DARWIN_PIC-NEXT: leaq _g_int_stub(%rip), %rax +; X64_DARWIN_PIC-NEXT: retq +; +; X86-LABEL: test_global_stub_ptrv: +; X86: # %bb.0: +; X86-NEXT: leal g_int_stub, %eax +; X86-NEXT: retl +; +; X32-LABEL: test_global_stub_ptrv: +; X32: # %bb.0: +; X32-NEXT: movl g_int_stub@GOTPCREL(%rip), %eax +; X32-NEXT: retq + ret ptr @g_int_stub +} + +define dso_local i32 @test_global_stub_valv() { +; X64-LABEL: test_global_stub_valv: +; X64: # %bb.0: +; X64-NEXT: movq g_int_stub@GOTPCREL(%rip), %rax +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: retq +; +; X64_DARWIN_PIC-LABEL: test_global_stub_valv: +; X64_DARWIN_PIC: ## %bb.0: +; X64_DARWIN_PIC-NEXT: movl _g_int_stub(%rip), %eax +; X64_DARWIN_PIC-NEXT: retq +; +; X86-LABEL: test_global_stub_valv: +; X86: # %bb.0: +; X86-NEXT: movl g_int_stub, %eax +; X86-NEXT: retl +; +; X32-LABEL: test_global_stub_valv: +; X32: # %bb.0: +; X32-NEXT: movl g_int_stub@GOTPCREL(%rip), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: retq + %val = load i32, ptr @g_int_stub, align 4 + ret i32 %val +} + define dso_local ptr @test_external_global_ptrv() { ; X64-LABEL: test_external_global_ptrv: ; X64: # %bb.0: # %entry @@ -70,16 +118,15 @@ define dso_local ptr @test_external_global_ptrv() { ; X64_DARWIN_PIC-NEXT: movq _external_g_int@GOTPCREL(%rip), %rax ; X64_DARWIN_PIC-NEXT: retq ; +; X86-LABEL: test_external_global_ptrv: +; X86: # %bb.0: # %entry +; X86-NEXT: leal external_g_int, %eax +; X86-NEXT: retl +; ; X32-LABEL: test_external_global_ptrv: ; X32: # %bb.0: # %entry -; X32-NEXT: leal external_g_int, %eax -; X32-NEXT: retl -; -; X32ABI-LABEL: test_external_global_ptrv: -; X32ABI: # %bb.0: # %entry -; X32ABI-NEXT: movl external_g_int@GOTPCREL(%rip), %eax -; X32ABI-NEXT: movl %eax, %eax -; X32ABI-NEXT: retq +; X32-NEXT: movl external_g_int@GOTPCREL(%rip), %eax +; X32-NEXT: retq entry: ret ptr @external_g_int } @@ -97,16 +144,16 @@ define dso_local i32 @test_external_global_valv() { ; X64_DARWIN_PIC-NEXT: movl (%rax), %eax ; X64_DARWIN_PIC-NEXT: 
retq ; +; X86-LABEL: test_external_global_valv: +; X86: # %bb.0: # %entry +; X86-NEXT: movl external_g_int, %eax +; X86-NEXT: retl +; ; X32-LABEL: test_external_global_valv: ; X32: # %bb.0: # %entry -; X32-NEXT: movl external_g_int, %eax -; X32-NEXT: retl -; -; X32ABI-LABEL: test_external_global_valv: -; X32ABI: # %bb.0: # %entry -; X32ABI-NEXT: movl external_g_int@GOTPCREL(%rip), %eax -; X32ABI-NEXT: movl (%eax), %eax -; X32ABI-NEXT: retq +; X32-NEXT: movl external_g_int@GOTPCREL(%rip), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: retq entry: %0 = load i32, ptr @external_g_int, align 4 ret i32 %0 diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-GV.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-GV.mir index e7c5d9b367941..f22ef0609b385 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-GV.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-GV.mir @@ -15,10 +15,12 @@ alignment: 16 legalized: false regBankSelected: false # CHECK: registers: -# CHECK-NEXT: - { id: 0, class: _, preferred-register: '', flags: [ ] } +# CHECK-NEXT: - { id: 0, class: _, preferred-register: '', flags: [ ] } +# CHECK-NEXT: - { id: 1, class: _, preferred-register: '', flags: [ ] } registers: - { id: 0, class: _, preferred-register: '' } -# CHECK: %0:_(p0) = G_GLOBAL_VALUE @g_int +# CHECK: %1:_(p0) = G_WRAPPER_RIP @g_int +# CHECK-NEXT: %0:_(p0) = G_LOAD %1(p0) :: (load (p0) from got) # CHECK-NEXT: $rax = COPY %0(p0) # CHECK-NEXT: RET 0, implicit $rax body: | diff --git a/llvm/test/CodeGen/X86/machine-block-hash.mir b/llvm/test/CodeGen/X86/machine-block-hash.mir index e5e8fc403c3f7..0b39e072728be 100644 --- a/llvm/test/CodeGen/X86/machine-block-hash.mir +++ b/llvm/test/CodeGen/X86/machine-block-hash.mir @@ -3,8 +3,8 @@ name: foo body: | ; HASH-LABEL: Machine Block Hash Info for function: foo - ; HASH-NEXT: BB#0: 0xd60eefbed60e0002 - ; HASH-NEXT: BB#1: 0xd60ec94cd60e0002 + ; HASH-NEXT: BB#0: 0xf33ebf30f33e0002 + ; HASH-NEXT: BB#1: 0xf33ee99ef33e0002 
   bb.0:
     $eax = MOV32ri 1
     RET 0
@@ -17,8 +17,8 @@ body: |
 name: func_mbb
 body: |
   ; HASH-LABEL: Machine Block Hash Info for function: func_mbb
-  ; HASH-NEXT: BB#0: 0xe018efbed60e0002
-  ; HASH-NEXT: BB#1: 0xe018c94cd60e0002
+  ; HASH-NEXT: BB#0: 0xf60abf30f33e0002
+  ; HASH-NEXT: BB#1: 0xf60ae99ef33e0002
   bb.0:
     successors: %bb.1
     $eax = MOV32ri 1
@@ -32,7 +32,7 @@ body: |
 name: func_global
 body: |
   ; HASH-LABEL: Machine Block Hash Info for function: func_global
-  ; HASH-NEXT: BB#0: 0x68eed05768ee0002
+  ; HASH-NEXT: BB#0: 0x7e32df2f7e320002
   bb.0:
     $rax = MOV64rm $rip, 1, $noreg, @foo, $noreg
     RET 0
@@ -41,7 +41,7 @@ body: |
 name: func_fp
 body: |
   ; HASH-LABEL: Machine Block Hash Info for function: func_fp
-  ; HASH-NEXT: BB#0: 0x84d3c75784d30002
+  ; HASH-NEXT: BB#0: 0x3e6b0e5c3e6b0002
   bb.0:
     $xmm0 = MOVSSrm $rip, 1, $noreg, %const.0, $noreg
     RET 0

From 0041fa4aeb725a422399c48d7e6d933246e22bfe Mon Sep 17 00:00:00 2001
From: Alex Duran
Date: Fri, 8 May 2026 11:32:29 +0200
Subject: [PATCH 017/538] [llvm][tools] Extend llvm-objdump to support nested
 OffloadBinaries (#185425)

Extends llvm-objdump to print the information of images contained in
nested OffloadBinaries.

For example, for a binary compiled with #185413 it shows

```
$llvm-objdump --offloading ./a.out

./a.out: file format elf64-x86-64

OFFLOADING IMAGE [0]:
kind elf
arch
triple spirv64-intel
producer openmp
image size 43104 bytes
[Nested OffloadBinary format detected]
Number of inner images: 1
kind spir-v
arch
triple spirv64-intel
producer openmp
image size 42944 bytes
```

New tests are added for clang-linker-wrapper and llvm-offload-binary
using this new functionality.
Depends on #185413 --------- Co-authored-by: Yury Plyakhin --- .../Tooling/clang-linker-wrapper-spirv.cpp | 13 +++ .../tools/llvm-objdump/Offloading/coff.test | 4 + .../tools/llvm-objdump/Offloading/elf.test | 4 + .../nested-offload-binary-fails.test | 19 +++++ .../Offloading/nested-offload-binary.test | 85 +++++++++++++++++++ .../llvm-offload-binary.ll | 38 +++++++++ llvm/tools/llvm-objdump/OffloadDump.cpp | 56 ++++++++++-- 7 files changed, 212 insertions(+), 7 deletions(-) create mode 100644 clang/test/Tooling/clang-linker-wrapper-spirv.cpp create mode 100644 llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary-fails.test create mode 100644 llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test diff --git a/clang/test/Tooling/clang-linker-wrapper-spirv.cpp b/clang/test/Tooling/clang-linker-wrapper-spirv.cpp new file mode 100644 index 0000000000000..ecbfe626129db --- /dev/null +++ b/clang/test/Tooling/clang-linker-wrapper-spirv.cpp @@ -0,0 +1,13 @@ +// Verify the ELF packaging of OpenMP SPIR-V device images. 
+// REQUIRES: system-linux +// REQUIRES: spirv-tools +// REQUIRES: spirv-registered-target +// RUN: %clangxx -fopenmp -fopenmp-targets=spirv64-intel -nogpulib -o %t %s +// RUN: llvm-objdump --offloading %t | FileCheck -check-prefix=CHECK %s + +// CHECK: nested images 1 +// CHECK: triple spirv64-intel + +int main(int argc, char** argv) { + return 0; +} diff --git a/llvm/test/tools/llvm-objdump/Offloading/coff.test b/llvm/test/tools/llvm-objdump/Offloading/coff.test index 022277d137bd4..5bec7d60cb83e 100644 --- a/llvm/test/tools/llvm-objdump/Offloading/coff.test +++ b/llvm/test/tools/llvm-objdump/Offloading/coff.test @@ -22,21 +22,25 @@ symbols: # CHECK-NEXT:arch gfx908 # CHECK-NEXT:triple amdgcn-amd-amdhsa # CHECK-NEXT:producer openmp +# CHECK-NEXT:image size 0 bytes # CHECK-EMPTY: # CHECK-NEXT:OFFLOADING IMAGE [1]: # CHECK-NEXT:kind llvm ir # CHECK-NEXT:arch gfx90a # CHECK-NEXT:triple amdgcn-amd-amdhsa # CHECK-NEXT:producer openmp +# CHECK-NEXT:image size 0 bytes # CHECK-EMPTY: # CHECK-NEXT:OFFLOADING IMAGE [2]: # CHECK-NEXT:kind cubin # CHECK-NEXT:arch sm_52 # CHECK-NEXT:triple nvptx64-nvidia-cuda # CHECK-NEXT:producer openmp +# CHECK-NEXT:image size 0 bytes # CHECK-EMPTY: # CHECK-NEXT:OFFLOADING IMAGE [3]: # CHECK-NEXT:kind # CHECK-NEXT:arch sm_70 # CHECK-NEXT:triple nvptx64-nvidia-cuda # CHECK-NEXT:producer none +# CHECK-NEXT:image size 0 bytes diff --git a/llvm/test/tools/llvm-objdump/Offloading/elf.test b/llvm/test/tools/llvm-objdump/Offloading/elf.test index 10182aeb856cd..3064286b9fea1 100644 --- a/llvm/test/tools/llvm-objdump/Offloading/elf.test +++ b/llvm/test/tools/llvm-objdump/Offloading/elf.test @@ -31,21 +31,25 @@ Sections: # CHECK-NEXT:arch gfx908 # CHECK-NEXT:triple amdgcn-amd-amdhsa # CHECK-NEXT:producer openmp +# CHECK-NEXT:image size 0 bytes # CHECK-EMPTY: # CHECK-NEXT:OFFLOADING IMAGE [1]: # CHECK-NEXT:kind llvm ir # CHECK-NEXT:arch gfx90a # CHECK-NEXT:triple amdgcn-amd-amdhsa # CHECK-NEXT:producer openmp +# CHECK-NEXT:image size 0 bytes # 
CHECK-EMPTY: # CHECK-NEXT:OFFLOADING IMAGE [2]: # CHECK-NEXT:kind cubin # CHECK-NEXT:arch sm_52 # CHECK-NEXT:triple nvptx64-nvidia-cuda # CHECK-NEXT:producer openmp +# CHECK-NEXT:image size 0 bytes # CHECK-EMPTY: # CHECK-NEXT:OFFLOADING IMAGE [3]: # CHECK-NEXT:kind # CHECK-NEXT:arch sm_70 # CHECK-NEXT:triple nvptx64-nvidia-cuda # CHECK-NEXT:producer none +# CHECK-NEXT:image size 0 bytes diff --git a/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary-fails.test b/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary-fails.test new file mode 100644 index 0000000000000..cb16915d5feb2 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary-fails.test @@ -0,0 +1,19 @@ +## Test that llvm-objdump will display an error for incorrect nested OffloadBinary images. + +# RUN: yaml2obj %s -o %t.bin +# RUN: llvm-objdump --offloading %t.bin 2>&1 | FileCheck %s -DFILENAME=%t.bin "-DROOT=OFFLOADING IMAGE" + +!Offload +Members: + - ImageKind: IMG_Object + OffloadKind: OFK_OpenMP + String: + - Key: "triple" + Value: "x-y-z" + - Key: "arch" + Value: "none" + Content: 10ff10ad + +# CHECK: [[ROOT]] +# CHECK-NOT: [[ROOT]] +# CHECK: warning: '[[FILENAME]]': failed to extract nested OffloadBinary: Invalid data was encountered while parsing the file diff --git a/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test b/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test new file mode 100644 index 0000000000000..d46180242144e --- /dev/null +++ b/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test @@ -0,0 +1,85 @@ +## Test that llvm-objdump can display nested OffloadBinary images. 
+ +## The content blobs below were generated from the following YAML Input +##!Offload +##Members: +## - ImageKind: IMG_Bitcode +## OffloadKind: OFK_OpenMP +## String: +## - Key: "triple" +## Value: "x-y-z" +## - Key: "arch" +## Value: "arch1" +## - ImageKind: IMG_Bitcode +## OffloadKind: OFK_OpenMP +## String: +## - Key: "triple" +## Value: "x-y-z" +## - Key: "arch" +## Value: "arch2" + +# RUN: yaml2obj %s -o %t.bin +# RUN: llvm-objdump --offloading %t.bin | FileCheck --match-full-lines --strict-whitespace --implicit-check-not={{.}} %s + +!Offload +Members: + - ImageKind: IMG_Object + OffloadKind: OFK_OpenMP + String: + - Key: "triple" + Value: "x-y-z" + - Key: "arch" + Value: "none" + Content: 10ff10ad02000000f00000000000000020000000000000000200000000000000020001000000000070000000000000000200000000000000f00000000000000000000000000000000200010000000000a0000000000000000200000000000000f0000000000000000000000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e9000000000000000500000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e300000000000000050000000000000000782d792d7a006172636800747269706c650061726368320061726368310000 + - ImageKind: IMG_Object + OffloadKind: OFK_OpenMP + String: + - Key: "triple" + Value: "a-b-c" + - Key: "arch" + Value: "none" + Content: 10ff10ad02000000f00000000000000020000000000000000200000000000000020001000000000070000000000000000200000000000000f00000000000000000000000000000000200010000000000a0000000000000000200000000000000f0000000000000000000000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e9000000000000000500000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e300000000000000050000000000000000782d792d7a006172636800747269706c650061726368320061726368310000 + +# CHECK:OFFLOADING IMAGE [0]: +# CHECK-NEXT:kind elf +# CHECK-NEXT:arch none +# CHECK-NEXT:triple x-y-z +# CHECK-NEXT:producer openmp +# CHECK-NEXT:image size {{[0-9]+}} bytes +# 
CHECK-NEXT:nested images 2 +# CHECK-EMPTY: +# CHECK-NEXT: OFFLOADING IMAGE [0.0]: +# CHECK-NEXT: kind llvm ir +# CHECK-NEXT: arch arch1 +# CHECK-NEXT: triple x-y-z +# CHECK-NEXT: producer openmp +# CHECK-NEXT: image size {{[0-9]+}} bytes +# CHECK-EMPTY: +# CHECK-NEXT: OFFLOADING IMAGE [0.1]: +# CHECK-NEXT: kind llvm ir +# CHECK-NEXT: arch arch2 +# CHECK-NEXT: triple x-y-z +# CHECK-NEXT: producer openmp +# CHECK-NEXT: image size {{[0-9]+}} bytes +# CHECK-EMPTY: +# CHECK-NEXT:OFFLOADING IMAGE [1]: +# CHECK-NEXT:kind elf +# CHECK-NEXT:arch none +# CHECK-NEXT:triple a-b-c +# CHECK-NEXT:producer openmp +# CHECK-NEXT:image size {{[0-9]+}} bytes +# CHECK-NEXT:nested images 2 +# CHECK-EMPTY: +# CHECK-NEXT: OFFLOADING IMAGE [1.0]: +# CHECK-NEXT: kind llvm ir +# CHECK-NEXT: arch arch1 +# CHECK-NEXT: triple x-y-z +# CHECK-NEXT: producer openmp +# CHECK-NEXT: image size {{[0-9]+}} bytes +# CHECK-EMPTY: +# CHECK-NEXT: OFFLOADING IMAGE [1.1]: +# CHECK-NEXT: kind llvm ir +# CHECK-NEXT: arch arch2 +# CHECK-NEXT: triple x-y-z +# CHECK-NEXT: producer openmp +# CHECK-NEXT: image size {{[0-9]+}} bytes diff --git a/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll b/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll index df46ad3a0d38a..31ee5e286717f 100644 --- a/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll +++ b/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll @@ -15,3 +15,41 @@ ; RUN: llvm-offload-binary -o %t3 --image=file=%s ; RUN: llvm-offload-binary %t3 --image=file=%t4 ; RUN: diff %s %t4 + +; Test nested OffloadBinary construction with multiple inner images. 
+; RUN: llvm-offload-binary -o %t5 --image=file=%s,arch=abc,triple=x-y-z --image=file=%s,arch=def,triple=x-y-z +; RUN: llvm-offload-binary -o %t6 --image=file=%t5,arch=nested,triple=x-y-z +; RUN: llvm-objdump --offloading %t6 | FileCheck %s --check-prefix=NESTED + +; NESTED: OFFLOADING IMAGE [0]: +; NESTED: arch nested +; NESTED: nested images 2 +; NESTED: OFFLOADING IMAGE [0.0]: +; NESTED: arch abc +; NESTED: OFFLOADING IMAGE [0.1]: +; NESTED: arch def + +; Test complex nested OffloadBinary construction with multiple levels. +; RUN: llvm-offload-binary -o %t7 --image=file=%s,arch=abc,triple=x-y-z --image=file=%t5,arch=nested,triple=x-y-z +; RUN: llvm-offload-binary -o %t8 --image=file=%t7,arch=nested,triple=x-y-z --image=file=%t5,arch=nested2,triple=x-y-z +; RUN: llvm-objdump --offloading %t8 | FileCheck %s --check-prefix=NESTED2 + +; NESTED2: OFFLOADING IMAGE [0]: +; NESTED2: arch nested +; NESTED2: nested images 2 +; NESTED2: OFFLOADING IMAGE [0.0]: +; NESTED2: arch abc +; NESTED2: OFFLOADING IMAGE [0.1]: +; NESTED2: arch nested +; NESTED2: nested images 2 +; NESTED2: OFFLOADING IMAGE [0.1.0]: +; NESTED2: arch abc +; NESTED2: OFFLOADING IMAGE [0.1.1]: +; NESTED2: arch def +; NESTED2: OFFLOADING IMAGE [1]: +; NESTED2: arch nested2 +; NESTED2: nested images 2 +; NESTED2: OFFLOADING IMAGE [1.0]: +; NESTED2: arch abc +; NESTED2: OFFLOADING IMAGE [1.1]: +; NESTED2: arch def diff --git a/llvm/tools/llvm-objdump/OffloadDump.cpp b/llvm/tools/llvm-objdump/OffloadDump.cpp index cd2727069c2e9..c0ba4d86d9209 100644 --- a/llvm/tools/llvm-objdump/OffloadDump.cpp +++ b/llvm/tools/llvm-objdump/OffloadDump.cpp @@ -13,6 +13,7 @@ #include "OffloadDump.h" #include "llvm-objdump.h" +#include "llvm/BinaryFormat/Magic.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Object/OffloadBundle.h" @@ -43,13 +44,54 @@ static StringRef getImageName(const OffloadBinary &OB) { } } -static void printBinary(const OffloadBinary &OB, uint64_t Index) { - 
outs() << "\nOFFLOADING IMAGE [" << Index << "]:\n"; - outs() << left_justify("kind", 16) << getImageName(OB) << "\n"; - outs() << left_justify("arch", 16) << OB.getArch() << "\n"; - outs() << left_justify("triple", 16) << OB.getTriple() << "\n"; - outs() << left_justify("producer", 16) - << getOffloadKindName(OB.getOffloadKind()) << "\n"; +/// Print metadata from an OffloadBinary. +static void printOffloadBinaryMetadata(const OffloadBinary &OB, + uint64_t Level) { + outs().indent(Level * 2) << left_justify("kind", 16) << getImageName(OB) + << "\n"; + outs().indent(Level * 2) << left_justify("arch", 16) << OB.getArch() << "\n"; + outs().indent(Level * 2) << left_justify("triple", 16) << OB.getTriple() + << "\n"; + outs().indent(Level * 2) << left_justify("producer", 16) + << getOffloadKindName(OB.getOffloadKind()) << "\n"; + + StringRef InnerImage = OB.getImage(); + outs().indent(Level * 2) << left_justify("image size", 16) + << InnerImage.size() << " bytes\n"; +} + +static void printBinary(const OffloadBinary &OB, uint64_t Index, + uint64_t Level = 0, Twine ParentIndexPrefix = "") { + outs() << "\n"; + outs().indent(Level * 2) << "OFFLOADING IMAGE [" << ParentIndexPrefix << Index + << "]:\n"; + + printOffloadBinaryMetadata(OB, Level); + + StringRef ImageData = OB.getImage(); + if (identify_magic(ImageData) != file_magic::offload_binary) + return; + + MemoryBufferRef InnerBuffer(ImageData, "inner-offload-binary"); + SmallVector InnerBinaries; + Error Err = extractOffloadBinaries(InnerBuffer, InnerBinaries); + if (Err) { + reportWarning("failed to extract nested OffloadBinary: " + + toString(std::move(Err)), + OB.getFileName()); + return; + } + assert(!InnerBinaries.empty() && + "An offload binary with a magic number should contain at least one " + "binary"); + + outs().indent(Level * 2) << left_justify("nested images", 16) + << InnerBinaries.size() << "\n"; + + for (uint64_t I = 0, E = InnerBinaries.size(); I != E; ++I) { + const OffloadBinary *InnerOB = 
InnerBinaries[I].getBinary(); + printBinary(*InnerOB, I, Level + 1, ParentIndexPrefix + Twine(Index) + "."); + } } /// Print the embedded offloading contents of an ObjectFile \p O. From 9698c4d64445f135151308116995d22889ef21d9 Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Fri, 8 May 2026 10:32:32 +0100 Subject: [PATCH 018/538] [lldb][test] Move DAP processes to own group to avoid random SIGHUPs (#195816) On macOS, LLDB's test suite randomly receives SIGHUP signals that stop the test suite early. The source of these SIGHUP's seems to be a bug in the kernel (most likely job control). The exact steps for reproducing this are not clear, but I have a set of three tests of which two need to run concurrently for this to trigger: * TestDAP_runInTerminal * TestDAP_launch_io_integratedTerminal * TestDAP_launch_stdio_redirection_and_console I was also running UBSan on this build which may or may not be necessary to make this random failure more persistent. When these tests run, macOS job control will send SIGHUP to the process group of the spawned subprocesses in that test. As LIT is in the same process group, it also receives the SIGHUP and shuts down. This patch just uses Python's API for forcing the spawned subprocess to its own process group. This won't stop the SIGHUP and only prevents it from reaching LIT. The SIGHUP itself doesn't seem to affect the DAP test itself. My theory here is that the SIGHUP is received during test shutdown (or after it was shut down), so that's why it doesn't cause any visible failures in any of the tests. 
---
 lldb/packages/Python/lldbsuite/test/lldbtest.py | 10 ++++++++++
 .../test/tools/lldb-server/gdbremote_testcase.py | 6 +++++-
 .../tools/lldb-server/commandline/TestStubSetSID.py | 12 +++++++++---
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index f0eafba899c29..3bbd7a21edd51 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -434,6 +434,16 @@ def launch(self, executable, args, extra_env, **kwargs):
         stdout = kwargs.pop("stdout", DEVNULL if not self._trace_on else None)
         stderr = kwargs.pop("stderr", None)
 
+        # This works around a bug in the macOS job control code where
+        # a spurious SIGHUP is sent to the process group of our
+        # spawned subprocess when it is shutting down.
+        # While this SIGHUP doesn't cause any issues for our subprocess,
+        # it does reach the LIT process and stops the test suite run
+        # early.
+        # This parameter forces the spawned process into a new process
+        # group which prevents the spurious SIGHUP from reaching LIT.
+        # We don't have a way to stop the SIGHUP from being sent at all.
+        kwargs.setdefault("start_new_session", True)
 
         self._proc = Popen(
             [executable] + args,
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
index 995dc6264d3ca..9eb390f51ac7b 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
@@ -116,6 +116,7 @@ def setUp(self):
         self.setUpBaseLogging()
 
         self.debug_monitor_extra_args = []
+        self.start_new_session = True
 
         if self.isVerboseLoggingRequested():
             # If requested, full logs go to a log file
@@ -364,7 +365,10 @@ def launch_debug_monitor(self, attach_pid=None, logfile=None):
 
         # Start the server.
server = self.spawnSubprocess( - self.debug_monitor_exe, commandline_args, install_remote=False + self.debug_monitor_exe, + commandline_args, + install_remote=False, + start_new_session=self.start_new_session, ) self.assertIsNotNone(server) diff --git a/lldb/test/API/tools/lldb-server/commandline/TestStubSetSID.py b/lldb/test/API/tools/lldb-server/commandline/TestStubSetSID.py index 41bed72b56189..8df49c521937c 100644 --- a/lldb/test/API/tools/lldb-server/commandline/TestStubSetSID.py +++ b/lldb/test/API/tools/lldb-server/commandline/TestStubSetSID.py @@ -20,10 +20,16 @@ def get_stub_sid(self, extra_stub_args=None): # Get the process id for the stub. return os.getsid(server.pid) + def prepare_test(self): + # Disable putting the target process into a new group/sid as we + # check whether the target is the same/different sid as us. + self.start_new_session = False + self.set_inferior_startup_launch() + @skipIfWindows @skipIfRemote # --setsid not used on remote platform and currently it is also impossible to get the sid of lldb-platform running on a remote target def test_sid_is_same_without_setsid(self): - self.set_inferior_startup_launch() + self.prepare_test() stub_sid = self.get_stub_sid() self.assertEqual(stub_sid, os.getsid(0)) @@ -31,7 +37,7 @@ def test_sid_is_same_without_setsid(self): @skipIfWindows @skipIfRemote # --setsid not used on remote platform and currently it is also impossible to get the sid of lldb-platform running on a remote target def test_sid_is_different_with_setsid(self): - self.set_inferior_startup_launch() + self.prepare_test() stub_sid = self.get_stub_sid(["--setsid"]) self.assertNotEqual(stub_sid, os.getsid(0)) @@ -39,7 +45,7 @@ def test_sid_is_different_with_setsid(self): @skipIfWindows @skipIfRemote # --setsid not used on remote platform and currently it is also impossible to get the sid of lldb-platform running on a remote target def test_sid_is_different_with_S_llgs(self): - self.set_inferior_startup_launch() + self.prepare_test() 
stub_sid = self.get_stub_sid(["-S"]) self.assertNotEqual(stub_sid, os.getsid(0)) From 88bb0e6ddd25520533de25b6a33d341146803e06 Mon Sep 17 00:00:00 2001 From: Shreeyash Pandey Date: Fri, 8 May 2026 15:25:55 +0530 Subject: [PATCH 019/538] [AArch64] Use EXT for byte shuffles with leading zeros (#193466) Fixes: https://github.com/llvm/llvm-project/issues/191735 Teach AArch64 LowerVECTOR_SHUFFLE to recognize byte shuffles that are a zero fill right shift and lower them to EXT with a zero vector. Adds a regression test too. Change-Id: Iffe97ff7e35cfaff790f537b4f1f5ba9aded4f92 --- .../Target/AArch64/AArch64ISelLowering.cpp | 81 ++++++++ .../CodeGen/AArch64/build-vector-extract.ll | 4 +- .../CodeGen/AArch64/build-vector-two-dup.ll | 5 +- llvm/test/CodeGen/AArch64/shuffles.ll | 184 ++++++++++++++++++ 4 files changed, 268 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6efc377617006..b53605e917e2b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14412,6 +14412,65 @@ static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, return true; } +// Check if an EXT instruction can handle the shuffle mask when one source is a +// splat. This matches shuffles where the splat occupies either a prefix or a +// suffix and the remaining lanes are a contiguous slice from the non-splat +// source. +static bool isEXTMaskWithSplat(ArrayRef M, EVT VT, unsigned SplatOperand, + bool &ReverseEXT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned OtherBase = SplatOperand == 0 ? NumElts : 0; + auto IsSplatElt = [=](int Elt) { + return Elt < 0 || + (SplatOperand == 0 ? 
Elt < (int)NumElts : Elt >= (int)NumElts); + }; + + unsigned PrefixSplatElts = 0; + while (PrefixSplatElts != NumElts && IsSplatElt(M[PrefixSplatElts])) + ++PrefixSplatElts; + + if (PrefixSplatElts > 0 && PrefixSplatElts < NumElts) { + bool Match = true; + for (unsigned I = PrefixSplatElts; I != NumElts; ++I) { + int Expected = OtherBase + I - PrefixSplatElts; + if (M[I] >= 0 && M[I] != Expected) { + Match = false; + break; + } + } + + if (Match) { + ReverseEXT = SplatOperand == 1; + Imm = NumElts - PrefixSplatElts; + return true; + } + } + + unsigned SuffixSplatElts = 0; + while (SuffixSplatElts != NumElts && + IsSplatElt(M[NumElts - 1 - SuffixSplatElts])) + ++SuffixSplatElts; + + if (0 < SuffixSplatElts && SuffixSplatElts < NumElts) { + bool Match = true; + for (unsigned I = 0; I != NumElts - SuffixSplatElts; ++I) { + int Expected = OtherBase + I + SuffixSplatElts; + if (M[I] >= 0 && M[I] != Expected) { + Match = false; + break; + } + } + + if (Match) { + ReverseEXT = SplatOperand == 0; + Imm = SuffixSplatElts; + return true; + } + } + + return false; +} + /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 
@@ -15108,6 +15167,28 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, DAG.getConstant(8, DL, MVT::i32)); } + bool IsSplat1 = + V1.getValueType() == VT && DAG.isSplatValue(V1, /*AllowUndefs=*/false); + bool IsSplat2 = + V2.getValueType() == VT && DAG.isSplatValue(V2, /*AllowUndefs=*/false); + for (unsigned SplatOperand : {0U, 1U}) { + if ((SplatOperand == 0 && !IsSplat1) || (SplatOperand == 1 && !IsSplat2)) + continue; + + bool ReverseSplatEXT = false; + unsigned SplatImm; + if (isEXTMaskWithSplat(ShuffleMask, VT, SplatOperand, ReverseSplatEXT, + SplatImm)) { + SDValue ExtOp1 = V1; + SDValue ExtOp2 = V2; + if (ReverseSplatEXT) + std::swap(ExtOp1, ExtOp2); + SplatImm *= getExtFactor(ExtOp1); + return DAG.getNode(AArch64ISD::EXT, DL, VT, ExtOp1, ExtOp2, + DAG.getConstant(SplatImm, DL, MVT::i32)); + } + } + bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll index 36b1b2cdcb432..f3e40021b616a 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll @@ -91,8 +91,7 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract3_i32_zext_insert0_i64_zero: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: mov v1.s[0], v0.s[3] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 3 %z = zext i32 %e to i64 @@ -640,4 +639,3 @@ define <4 x i32> @larger_bv_than_source(<4 x i16> %t0) { %t2 = insertelement <4 x i32> undef, i32 %vgetq_lane, i64 0 ret <4 x i32> %t2 } - diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll index f725c19081deb..1bd2ceb815ead 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ 
b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -22,10 +22,9 @@ entry: define <16 x i8> @test2(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.8b }, [x0] ; CHECK-NEXT: ld1r { v1.8b }, [x1] -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: dup v0.8b, w8 -; CHECK-NEXT: mov v1.b[7], w8 +; CHECK-NEXT: ext v1.8b, v1.8b, v0.8b, #1 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll index 930c3dfc54730..375aff747d768 100644 --- a/llvm/test/CodeGen/AArch64/shuffles.ll +++ b/llvm/test/CodeGen/AArch64/shuffles.ll @@ -576,3 +576,187 @@ define <4 x i32> @extract_shuffle(<8 x i16> %j, <4 x i16> %k) { ret <4 x i32> %d } +; Zero/splat-fill EXT: splat prefix (zero inserted at the start, data shifted right) +define <16 x i8> @test_shuf_zero_ext_start_lhs(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_start_lhs: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v1.16b, v0.16b, #15 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_start_lhs: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v1.16b, v0.16b, #15 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @test_shuf_zero_ext_start_lhs2(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_start_lhs2: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_start_lhs2: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi 
v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @test_shuf_zero_ext_start_rhs(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_start_rhs: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v1.16b, v0.16b, #15 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_start_rhs: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v1.16b, v0.16b, #15 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @test_shuf_zero_ext_start_rhs2(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_start_rhs2: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_start_rhs2: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %r +} + +; Zero/splat-fill EXT: splat suffix (zero appended at the end, data shifted left) +define <16 x i8> @test_shuf_zero_ext_end_lhs(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_end_lhs: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, 
#0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v0.16b, v1.16b, #1 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_end_lhs: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v1.16b, #1 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @test_shuf_zero_ext_end_lhs2(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_end_lhs2: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v0.16b, v1.16b, #2 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_end_lhs2: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v1.16b, #2 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> @test_shuf_zero_ext_end_rhs(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_end_rhs: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v0.16b, v1.16b, #1 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_end_rhs: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v1.16b, #1 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %r +} + +define <16 x i8> 
@test_shuf_zero_ext_end_rhs2(<16 x i8> %a) { +; CHECKLE-LABEL: test_shuf_zero_ext_end_rhs2: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: movi v1.2d, #0000000000000000 +; CHECKLE-NEXT: ext v0.16b, v0.16b, v1.16b, #2 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_end_rhs2: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: movi v1.2d, #0000000000000000 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v1.16b, #2 +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %r = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %r +} + +; Zero/splat-fill EXT: non-zero splat prefix (splat of a scalar, data shifted right) +define <4 x i32> @test_shuf_zero_ext_start_lhs_splat(<4 x i32> %a, i32 %b) { +; CHECKLE-LABEL: test_shuf_zero_ext_start_lhs_splat: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: dup v1.4s, w0 +; CHECKLE-NEXT: ext v0.16b, v1.16b, v0.16b, #12 +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: test_shuf_zero_ext_start_lhs_splat: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.4s, v0.4s +; CHECKBE-NEXT: dup v1.4s, w0 +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ext v0.16b, v1.16b, v0.16b, #12 +; CHECKBE-NEXT: rev64 v0.4s, v0.4s +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %i = insertelement <4 x i32> poison, i32 %b, i64 0 + %s = shufflevector <4 x i32> %i, <4 x i32> poison, <4 x i32> zeroinitializer + %r = shufflevector <4 x i32> %a, <4 x i32> %s, <4 x i32> + ret <4 x i32> %r +} From ca7fe087557033802371ea74210a8ada737d3719 Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Fri, 8 May 2026 12:14:46 +0200 Subject: [PATCH 020/538] [llvm][OpenMP][SPIRV] Fix assertion for GPU reductions (#194879) Currenty compiling a `target reduction` results in the following assert for spirv64-intel target: > Assertion `New->getType() == getType() && "replaceUses of value with new value of different 
type!"' failed. This patch fixes it by adding an addrespace cast where necessary to make the types of the expressions match. Assisted-by: claude-sonnet-4-5 --- .../spirv_target_teams_reduction_addrspace.c | 33 +++++++++++++++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 10 +++++- 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c diff --git a/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c b/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c new file mode 100644 index 0000000000000..bddd5548b9b8b --- /dev/null +++ b/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c @@ -0,0 +1,33 @@ +// Test that target teams reduction codegen handles address space casts correctly. + +// RUN: %clang_cc1 -verify -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - | FileCheck %s + +// expected-no-diagnostics + +// Verify the kernel is generated. +// CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}}_main_{{.*}} + +// Verify __kmpc_alloc_shared is called for reduction variable. +// The return type should be ptr addrspace(4) (generic pointer). +// CHECK: call spir_func align 8 addrspace(9) ptr addrspace(4) @__kmpc_alloc_shared(i64 4) + +// Verify the reduction runtime function is called. +// CHECK: call spir_func addrspace(9) i32 @__kmpc_nvptx_teams_reduce_nowait_v2( + +// Verify __kmpc_free_shared is called. +// CHECK: call spir_func addrspace(9) void @__kmpc_free_shared(ptr addrspace(4) + +// Verify the reduction function is generated. 
+// CHECK: define internal void @{{.*}}reduction{{.*}}func + +int main() { + int x = 0; + + #pragma omp target teams num_teams(2) reduction(+ : x) + { + x += 2; + } + + return x; +} diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index f17602e8e786c..e3d5bf0663490 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4735,7 +4735,15 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( &LHSPtr, &RHSPtr, CurFunc)); // Fix the CallBack code genereated to use the correct Values for the LHS - // and RHS + // and RHS. Cast to match types before replacing (necessary to handle + // different address spaces). + if (LHSPtr->getType() != RedValue->getType()) + RedValue = Builder.CreatePointerBitCastOrAddrSpaceCast( + RedValue, LHSPtr->getType()); + if (RHSPtr->getType() != RHS->getType()) + RHS = + Builder.CreatePointerBitCastOrAddrSpaceCast(RHS, RHSPtr->getType()); + LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) { return cast(U.getUser())->getParent()->getParent() == ReductionFunc; From 0bdf71d5a6653841604d34a629aa751b44f6189a Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 8 May 2026 11:16:15 +0100 Subject: [PATCH 021/538] Revert "[lldb] Real-time console pane for output in lldb tui" (#196507) Reverts llvm/llvm-project#177160 The new test is timing out on the AArch64 Linux buildbot (https://lab.llvm.org/buildbot/#/builders/59/builds/34166) and on my own machine. I suspect something to do with the requested terminal size. If what we get is smaller than requested, it could time out waiting for expected program output. 
--- lldb/include/lldb/Core/Debugger.h | 2 - lldb/source/Core/CoreProperties.td | 4 - lldb/source/Core/Debugger.cpp | 6 - lldb/source/Core/IOHandlerCursesGUI.cpp | 380 ++---------------- .../API/commands/gui/console-output/Makefile | 2 - .../console-output/TestGuiConsoleOutput.py | 138 ------- .../API/commands/gui/console-output/main.cpp | 34 -- llvm/docs/ReleaseNotes.md | 1 - 8 files changed, 23 insertions(+), 544 deletions(-) delete mode 100644 lldb/test/API/commands/gui/console-output/Makefile delete mode 100644 lldb/test/API/commands/gui/console-output/TestGuiConsoleOutput.py delete mode 100644 lldb/test/API/commands/gui/console-output/main.cpp diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index 82d86f988f07f..e53e916d78cc1 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -406,8 +406,6 @@ class Debugger : public std::enable_shared_from_this, bool SetShowInlineDiagnostics(bool); - uint64_t GetGuiMaxConsoleLines() const; - bool LoadPlugin(const FileSpec &spec, Status &error); void RunIOHandlers(); diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index e4a565e97b81f..8cea0931868aa 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -293,8 +293,4 @@ let Definition = "debugger", Path = "" in { Global, DefaultFalse, Desc<"Controls whether diagnostics can refer directly to the command input, drawing arrows to it. 
If false, diagnostics will echo the input.">; - def GuiMaxConsoleLines: Property<"gui-console-max-lines", "UInt64">, - Global, - DefaultUnsignedValue<10000>, - Desc<"The maximum number of lines to keep in the console output window.">; } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index e9fe71108c572..48e03881fa3b5 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -780,12 +780,6 @@ bool Debugger::SetShowInlineDiagnostics(bool b) { return SetPropertyAtIndex(idx, b); } -uint64_t Debugger::GetGuiMaxConsoleLines() const { - const uint32_t idx = ePropertyGuiMaxConsoleLines; - return GetPropertyAtIndexAs( - idx, g_debugger_properties[idx].default_uint_value); -} - #pragma mark Debugger // const DebuggerPropertiesSP & diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp index ff2cd2777af67..5b70917f11cbd 100644 --- a/lldb/source/Core/IOHandlerCursesGUI.cpp +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -327,8 +327,6 @@ class WindowDelegate { virtual const char *WindowDelegateGetHelpText() { return nullptr; } virtual KeyHelp *WindowDelegateGetKeyHelp() { return nullptr; } - - virtual void WindowDelegateProcessEvent(const lldb::EventSP &event_sp) {} }; class HelpDialogDelegate : public WindowDelegate { @@ -801,13 +799,6 @@ class Window : public Surface { subwindow_sp->Draw(force); } - void HandleProcessEvent(const lldb::EventSP &event_sp) { - if (m_delegate_sp) - m_delegate_sp->WindowDelegateProcessEvent(event_sp); - for (auto &subwindow_sp : m_subwindows) - subwindow_sp->HandleProcessEvent(event_sp); - } - bool CreateHelpSubwindow() { if (m_delegate_sp) { const char *text = m_delegate_sp->WindowDelegateGetHelpText(); @@ -4394,11 +4385,6 @@ class Application { ConstString broadcaster_class( broadcaster->GetBroadcasterClass()); if (broadcaster_class == broadcaster_class_process) { - uint32_t event_type = event_sp->GetType(); - if (event_type & 
(Process::eBroadcastBitSTDOUT | - Process::eBroadcastBitSTDERR)) { - m_window_sp->HandleProcessEvent(event_sp); - } m_update_screen = true; continue; // Don't get any key, just update our view } @@ -6325,233 +6311,6 @@ HandleCharResult HelpDialogDelegate::WindowDelegateHandleChar(Window &window, return eKeyHandled; } -class ConsoleOutputWindowDelegate : public WindowDelegate { -private: - void PollProcessOutput() { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - Process *process = exe_ctx.GetProcessPtr(); - - if (!process || !process->IsAlive()) - return; - - // Buffer for reading output. - char buffer[1024]; - Status error; - - // Read all available stdout. - size_t bytes; - while ((bytes = process->GetSTDOUT(buffer, sizeof(buffer), error)) > 0) - AppendOutput(buffer, bytes, false); - - // Read all available stderr. - while ((bytes = process->GetSTDERR(buffer, sizeof(buffer), error)) > 0) - AppendOutput(buffer, bytes, true); - } - - void AppendOutput(const char *text, size_t len, bool is_stderr) { - if (!text || len == 0) - return; - - std::lock_guard lock(m_output_mutex); - - // Split text into lines and add to buffer. - std::string remaining = m_partial_line; - remaining.append(text, len); - - size_t start = 0, pos = 0; - while ((pos = remaining.find('\n', start)) != std::string::npos) { - std::string line = remaining.substr(start, pos - start); - if (is_stderr) - line = "[stderr] " + line; - m_output_lines.push_back(line); - - // Keep buffer size under limit. - size_t max_lines = m_debugger.GetGuiMaxConsoleLines(); - while (m_output_lines.size() > max_lines) { - m_output_lines.pop_front(); - if (m_first_visible_line > 0) - --m_first_visible_line; - } - - start = pos + 1; - } - - // Save any remaining partial line. - m_partial_line = remaining.substr(start); - - // Auto-scroll to bottom if enabled. - if (m_auto_scroll && !m_output_lines.empty()) { - m_first_visible_line = - m_output_lines.size() > 0 ? 
m_output_lines.size() - 1 : 0; - } - } - -public: - ConsoleOutputWindowDelegate(Debugger &debugger) - : m_debugger(debugger), m_first_visible_line(0), m_auto_scroll(true) {} - - ~ConsoleOutputWindowDelegate() override = default; - - void WindowDelegateProcessEvent(const lldb::EventSP &event_sp) override { - if (event_sp->GetType() & - (Process::eBroadcastBitSTDOUT | Process::eBroadcastBitSTDERR)) - PollProcessOutput(); - } - - bool WindowDelegateDraw(Window &window, bool force) override { - std::lock_guard lock(m_output_mutex); - - window.Erase(); - window.DrawTitleBox(window.GetName()); - - const int width = window.GetWidth(); - const int height = window.GetHeight(); - - // Calculate the visible range. - size_t total_lines = m_output_lines.size(); - if (total_lines == 0) { - window.MoveCursor(2, 1); - window.PutCString("(no output yet)"); - return true; - } - - // Adjust scroll pos if needed. - if (m_first_visible_line >= total_lines) { - m_first_visible_line = total_lines > 0 ? total_lines - 1 : 0; - } - - // Draw visible line. - int visible_height = height - 2; - size_t start_line = m_first_visible_line; - - // If we are at the end, display last N lines. - if (m_auto_scroll || start_line + visible_height > total_lines) { - start_line = total_lines > static_cast(visible_height) - ? total_lines - visible_height - : 0; - } - - for (int row = 1; - row <= visible_height && (start_line + row - 1) < total_lines; ++row) { - window.MoveCursor(2, row); - const std::string &line = m_output_lines[start_line + row - 1]; - - // Highlight stderr lines?. - bool is_stderr = (line.find("[stderr]") == 0); - if (is_stderr) - window.AttributeOn(COLOR_PAIR(2)); - - // Truncate line to fit window width. 
- int available_width = width - 3; - if (static_cast(line.length()) > available_width) - window.PutCString(line.substr(0, available_width).c_str()); - else - window.PutCString(line.c_str()); - - if (is_stderr) - window.AttributeOff(COLOR_PAIR(2)); - } - - return true; - } - - HandleCharResult WindowDelegateHandleChar(Window &window, int key) override { - std::lock_guard lock(m_output_mutex); - - size_t total_lines = m_output_lines.size(); - int visible_height = window.GetHeight() - 1; - - switch (key) { - case KEY_UP: - if (m_first_visible_line > 0) { - --m_first_visible_line; - m_auto_scroll = false; - } - return eKeyHandled; - - case KEY_DOWN: - if (m_first_visible_line + visible_height < total_lines) - ++m_first_visible_line; - // Re-enable Auto-scroll at bottom. - if (m_first_visible_line + visible_height >= total_lines) - m_auto_scroll = true; - return eKeyHandled; - - case KEY_PPAGE: - if (m_first_visible_line > static_cast(visible_height)) - m_first_visible_line -= visible_height; - else - m_first_visible_line = 0; - m_auto_scroll = false; - return eKeyHandled; - - case KEY_NPAGE: - m_first_visible_line += visible_height; - if (m_first_visible_line + visible_height >= total_lines) { - m_first_visible_line = total_lines > static_cast(visible_height) - ? total_lines - visible_height - : 0; - m_auto_scroll = true; - } - return eKeyHandled; - - case 'a': - m_auto_scroll = !m_auto_scroll; - if (m_auto_scroll && total_lines > 0) - m_first_visible_line = total_lines > static_cast(visible_height) - ? total_lines - visible_height - : 0; - return eKeyHandled; - - case 'c': - m_output_lines.clear(); - m_partial_line.clear(); - m_first_visible_line = 0; - return eKeyHandled; - - case KEY_HOME: - m_first_visible_line = 0; - m_auto_scroll = false; - return eKeyHandled; - - case KEY_END: - m_first_visible_line = total_lines > static_cast(visible_height) - ? 
total_lines - visible_height - : 0; - m_auto_scroll = true; - return eKeyHandled; - - default: - break; - } - - return eKeyNotHandled; - } - - const char *WindowDelegateGetHelpText() override { - return "Console Output view shows stdout and stderr from the process."; - } - - KeyHelp *WindowDelegateGetKeyHelp() override { - static curses::KeyHelp g_source_view_key_help[] = { - {KEY_UP, "Scroll up"}, {KEY_DOWN, "Scroll down"}, - {KEY_PPAGE, "Page up"}, {KEY_NPAGE, "Page down"}, - {KEY_HOME, "Go to top"}, {KEY_END, "Go to bottom"}, - {'h', "Show help dialog"}, {'a', "Toggle auto-scroll"}, - {'c', "Clear output"}, {'\0', nullptr}}; - return g_source_view_key_help; - } - -protected: - Debugger &m_debugger; - std::deque m_output_lines; - std::string m_partial_line; - size_t m_first_visible_line = 0; - bool m_auto_scroll = true; - std::mutex m_output_mutex; -}; - class ApplicationDelegate : public WindowDelegate, public MenuDelegate { public: enum { @@ -6583,7 +6342,6 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { eMenuID_ViewSource, eMenuID_ViewVariables, eMenuID_ViewBreakpoints, - eMenuId_ViewConsole, eMenuID_Help, eMenuID_HelpGUIHelp @@ -6594,14 +6352,6 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { ~ApplicationDelegate() override = default; - WindowDelegateSP GetConsoleDelegate() { - if (!m_console_delegate_sp) { - m_console_delegate_sp = - WindowDelegateSP(new ConsoleOutputWindowDelegate(m_debugger)); - } - return m_console_delegate_sp; - } - bool WindowDelegateDraw(Window &window, bool force) override { return false; // Drawing not handled, let standard window drawing happen } @@ -6832,7 +6582,6 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { WindowSP main_window_sp = m_app.GetMainWindow(); WindowSP source_window_sp = main_window_sp->FindSubWindow("Source"); WindowSP variables_window_sp = main_window_sp->FindSubWindow("Variables"); - WindowSP console_window_sp = 
main_window_sp->FindSubWindow("Console"); WindowSP registers_window_sp = main_window_sp->FindSubWindow("Registers"); const Rect source_bounds = source_window_sp->GetBounds(); @@ -6841,52 +6590,39 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { main_window_sp->RemoveSubWindow(variables_window_sp.get()); - if (console_window_sp) { - Rect console_bounds = console_window_sp->GetBounds(); - console_bounds.origin.x = variables_bounds.origin.x; - console_bounds.size.width = - variables_bounds.size.width + console_bounds.size.width; - console_window_sp->SetBounds(console_bounds); - } else if (registers_window_sp) { + if (registers_window_sp) { // We have a registers window, so give all the area back to the - // registers window. + // registers window Rect registers_bounds = variables_bounds; registers_bounds.size.width = source_bounds.size.width; registers_window_sp->SetBounds(registers_bounds); } else { - // We have no console or registers window showing so give the bottom - // area back to the source view. + // We have no registers window showing so give the bottom area back + // to the source view source_window_sp->Resize(source_bounds.size.width, source_bounds.size.height + variables_bounds.size.height); } } else { - Rect new_vars_rect; - if (console_window_sp) { - // Console exists, so split the area. - const Rect console_bounds = console_window_sp->GetBounds(); - Rect new_console_rect; - console_bounds.VerticalSplitPercentage(0.50, new_vars_rect, - new_console_rect); - } else if (registers_window_sp) { + Rect new_variables_rect; + if (registers_window_sp) { // We have a registers window so split the area of the registers // window into two columns where the left hand side will be the - // variables and the right hand side will be the registers. 
- const Rect registers_bounds = registers_window_sp->GetBounds(); - Rect new_regs_rect; - registers_bounds.VerticalSplitPercentage(0.50, new_vars_rect, - new_regs_rect); - registers_window_sp->SetBounds(new_regs_rect); + // variables and the right hand side will be the registers + const Rect variables_bounds = registers_window_sp->GetBounds(); + Rect new_registers_rect; + variables_bounds.VerticalSplitPercentage(0.50, new_variables_rect, + new_registers_rect); + registers_window_sp->SetBounds(new_registers_rect); } else { - // No registers or console window, grab the bottom part of the source - // window. + // No registers window, grab the bottom part of the source window Rect new_source_rect; source_bounds.HorizontalSplitPercentage(0.70, new_source_rect, - new_vars_rect); + new_variables_rect); source_window_sp->SetBounds(new_source_rect); } - WindowSP new_window_sp = - main_window_sp->CreateSubWindow("Variables", new_vars_rect, false); + WindowSP new_window_sp = main_window_sp->CreateSubWindow( + "Variables", new_variables_rect, false); new_window_sp->SetDelegate( WindowDelegateSP(new FrameVariablesWindowDelegate(m_debugger))); } @@ -6906,13 +6642,13 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { const Rect variables_bounds = variables_window_sp->GetBounds(); // We have a variables window, so give all the area back to the - // variables window. + // variables window variables_window_sp->Resize(variables_bounds.size.width + registers_window_sp->GetWidth(), variables_bounds.size.height); } else { // We have no variables window showing so give the bottom area back - // to the source view. 
+ // to the source view source_window_sp->Resize(source_bounds.size.width, source_bounds.size.height + registers_window_sp->GetHeight()); @@ -6923,14 +6659,14 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { if (variables_window_sp) { // We have a variables window, split it into two columns where the // left hand side will be the variables and the right hand side will - // be the registers. + // be the registers const Rect variables_bounds = variables_window_sp->GetBounds(); Rect new_vars_rect; variables_bounds.VerticalSplitPercentage(0.50, new_vars_rect, new_regs_rect); variables_window_sp->SetBounds(new_vars_rect); } else { - // No variables window, grab the bottom part of the source window. + // No variables window, grab the bottom part of the source window Rect new_source_rect; source_bounds.HorizontalSplitPercentage(0.70, new_source_rect, new_regs_rect); @@ -6945,66 +6681,6 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { } return MenuActionResult::Handled; - case eMenuId_ViewConsole: { - WindowSP main_window_sp = m_app.GetMainWindow(); - WindowSP source_window_sp = main_window_sp->FindSubWindow("Source"); - WindowSP console_window_sp = main_window_sp->FindSubWindow("Console"); - WindowSP variables_window_sp = main_window_sp->FindSubWindow("Variables"); - WindowSP registers_window_sp = main_window_sp->FindSubWindow("Registers"); - const Rect source_bounds = source_window_sp->GetBounds(); - - if (console_window_sp) { - const Rect console_bounds = console_window_sp->GetBounds(); - main_window_sp->RemoveSubWindow(console_window_sp.get()); - - if (variables_window_sp) { - // Variables window exists, so give Console space to Variables. 
- Rect variables_bounds = variables_window_sp->GetBounds(); - variables_bounds.size.width = - variables_bounds.size.width + console_bounds.size.width; - variables_window_sp->SetBounds(variables_bounds); - } else if (registers_window_sp) { - // Registers window exists, so give Console space to Registers. - Rect registers_bounds = registers_window_sp->GetBounds(); - registers_bounds.size.width = source_bounds.size.width; - registers_window_sp->SetBounds(registers_bounds); - } else { - // No Variables or Registers window exists. - source_window_sp->Resize(source_bounds.size.width, - source_bounds.size.height + - console_bounds.size.height); - } - } else { - Rect new_console_rect; - if (variables_window_sp) { - // Variable window exists, split area. - const Rect variables_bounds = variables_window_sp->GetBounds(); - Rect new_vars_rect; - variables_bounds.VerticalSplitPercentage(0.50, new_vars_rect, - new_console_rect); - variables_window_sp->SetBounds(new_vars_rect); - } else if (registers_window_sp) { - // Registers window exists, split area. - const Rect registers_bounds = registers_window_sp->GetBounds(); - Rect new_regs_rect; - registers_bounds.VerticalSplitPercentage(0.50, new_console_rect, - new_regs_rect); - registers_window_sp->SetBounds(new_regs_rect); - } else { - // No Registers or Variables window exists, split source area. 
- Rect new_source_rect; - source_bounds.HorizontalSplitPercentage(0.70, new_source_rect, - new_console_rect); - source_window_sp->SetBounds(new_source_rect); - } - WindowSP new_window_sp = - main_window_sp->CreateSubWindow("Console", new_console_rect, false); - new_window_sp->SetDelegate(GetConsoleDelegate()); - } - touchwin(stdscr); - } - return MenuActionResult::Handled; - case eMenuID_ViewBreakpoints: { WindowSP main_window_sp = m_app.GetMainWindow(); WindowSP threads_window_sp = main_window_sp->FindSubWindow("Threads"); @@ -7052,7 +6728,6 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { protected: Application &m_app; Debugger &m_debugger; - WindowDelegateSP m_console_delegate_sp; }; class StatusBarWindowDelegate : public WindowDelegate { @@ -7960,8 +7635,6 @@ void IOHandlerCursesGUI::Activate() { view_menu_sp->AddSubmenu( std::make_shared("Breakpoints", nullptr, 'b', ApplicationDelegate::eMenuID_ViewBreakpoints)); - view_menu_sp->AddSubmenu(std::make_shared( - "Console", nullptr, 'o', ApplicationDelegate::eMenuId_ViewConsole)); MenuSP help_menu_sp( new Menu("Help", "F6", KEY_F(6), ApplicationDelegate::eMenuID_Help)); @@ -7985,16 +7658,12 @@ void IOHandlerCursesGUI::Activate() { Rect status_bounds = content_bounds.MakeStatusBar(); Rect source_bounds; Rect variables_bounds; - Rect console_bounds; Rect threads_bounds; Rect source_variables_bounds; - Rect variables_console_bounds; content_bounds.VerticalSplitPercentage(0.80, source_variables_bounds, threads_bounds); source_variables_bounds.HorizontalSplitPercentage(0.70, source_bounds, - variables_console_bounds); - variables_console_bounds.VerticalSplitPercentage(0.50, variables_bounds, - console_bounds); + variables_bounds); WindowSP menubar_window_sp = main_window_sp->CreateSubWindow("Menubar", menubar_bounds, false); @@ -8006,12 +7675,10 @@ void IOHandlerCursesGUI::Activate() { WindowSP source_window_sp( main_window_sp->CreateSubWindow("Source", source_bounds, true)); - WindowSP 
threads_window_sp( - main_window_sp->CreateSubWindow("Threads", threads_bounds, false)); WindowSP variables_window_sp( main_window_sp->CreateSubWindow("Variables", variables_bounds, false)); - WindowSP console_window_sp( - main_window_sp->CreateSubWindow("Console", console_bounds, false)); + WindowSP threads_window_sp( + main_window_sp->CreateSubWindow("Threads", threads_bounds, false)); WindowSP status_window_sp( main_window_sp->CreateSubWindow("Status", status_bounds, false)); status_window_sp->SetCanBeActive( @@ -8022,7 +7689,6 @@ void IOHandlerCursesGUI::Activate() { WindowDelegateSP(new SourceFileWindowDelegate(m_debugger))); variables_window_sp->SetDelegate( WindowDelegateSP(new FrameVariablesWindowDelegate(m_debugger))); - console_window_sp->SetDelegate(app_delegate_sp->GetConsoleDelegate()); TreeDelegateSP thread_delegate_sp(new ThreadsTreeDelegate(m_debugger)); threads_window_sp->SetDelegate(WindowDelegateSP( new TreeWindowDelegate(m_debugger, thread_delegate_sp))); diff --git a/lldb/test/API/commands/gui/console-output/Makefile b/lldb/test/API/commands/gui/console-output/Makefile deleted file mode 100644 index 3d0b98f13f3d7..0000000000000 --- a/lldb/test/API/commands/gui/console-output/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -CXX_SOURCES := main.cpp -include Makefile.rules diff --git a/lldb/test/API/commands/gui/console-output/TestGuiConsoleOutput.py b/lldb/test/API/commands/gui/console-output/TestGuiConsoleOutput.py deleted file mode 100644 index c21604a752b63..0000000000000 --- a/lldb/test/API/commands/gui/console-output/TestGuiConsoleOutput.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -Test that the 'gui' console output pane displays stdout / stderr from the debugged process -""" - -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test.lldbpexpect import PExpectTest - - -class TestGuiConsoleOutputTest(PExpectTest): - # PExpect uses many timeouts internally and doesn't play well - # under ASAN on a 
loaded machine.. - @skipIfAsan - @skipIfCursesSupportMissing - def test_gui_console_output(self): - """Test that console pane prints messages""" - self.build() - - self.launch( - executable=self.getBuildArtifact("a.out"), - dimensions=(100, 500), - run_under=["env", "TERM=xterm"], - ) - - self.expect( - 'br set -o true -f main.cpp -p "// break here begin"', - substrs=["Breakpoint 1", "address ="], - ) - - self.expect( - 'br set -o true -f main.cpp -p "// break here end"', - substrs=["Breakpoint 2", "address ="], - ) - - self.expect("run", substrs=["stop reason ="]) - - escape_key = chr(27).encode() - - # Start the GUI. - self.child.sendline("gui") - - # Check for gui elements in Menu bar (top of screen) - # We expect these in the order they appear to avoid consumption issues - self.child.expect_exact("Target") - self.child.expect_exact("Process") - self.child.expect_exact("View") - - # Check for window titles (middle of screen) - self.child.expect_exact("Sources") - self.child.expect_exact("Console") - - # The Console window show this message before continuing - self.child.expect_exact("(no output yet)") - - # Continue program execution - self.child.send("c") - - # Check console output for messages - self.child.expect_exact("Hello from stdout line 1") - self.child.expect_exact("Hello from stderr line 3") - - # Check for large output (verify buffer draining) - self.child.expect_exact("Large output line 0") - self.child.expect_exact("Large output line 99") - - # Wait for Breakpoint 2 - self.child.expect_exact("stop reason") - - # Press escape to quit the gui - self.child.send(escape_key) - - self.expect_prompt() - self.quit() - - @skipIfAsan - @skipIfCursesSupportMissing - def test_gui_console_navigate(self): - """Test that console pane navigation works""" - self.build() - - self.launch( - executable=self.getBuildArtifact("a.out"), - dimensions=(100, 500), - run_under=["env", "TERM=xterm"], - ) - - self.expect( - 'br set -o true -f main.cpp -p "// break here begin"', 
- substrs=["Breakpoint 1", "address ="], - ) - - self.expect( - 'br set -o true -f main.cpp -p "// break here end"', - substrs=["Breakpoint 2", "address ="], - ) - - self.expect("run", substrs=["stop reason ="]) - - escape_key = chr(27).encode() - tab_key = chr(9).encode() - - # Start the GUI. - self.child.sendline("gui") - - # Match elements in top-to-bottom order - self.child.expect_exact("Target") - self.child.expect_exact("Sources") - self.child.expect_exact("Console") - - # The Console window show this message before continuing - self.child.expect_exact("(no output yet)") - - # Continue program execution - self.child.send("c") - - # Check console output for messages - self.child.expect_exact("Hello from stdout line 1") - - # Wait for Breakpoint 2 - self.child.expect_exact("stop reason") - - # Tab to console - self.child.send(tab_key) # Sources -> Threads - self.child.send(tab_key) # Threads -> Variables - self.child.send(tab_key) # Variables -> Console - - # Clear Console output - self.child.send("c") - - # The Console window show this message after clear - self.child.expect_exact("(no output yet)") - - # Press escape to quit the gui - self.child.send(escape_key) - - self.expect_prompt() - self.quit() diff --git a/lldb/test/API/commands/gui/console-output/main.cpp b/lldb/test/API/commands/gui/console-output/main.cpp deleted file mode 100644 index 9800cd50e04bd..0000000000000 --- a/lldb/test/API/commands/gui/console-output/main.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include - -void generate_output() { - for (unsigned i = 1; i < 4; ++i) { - std::cout << "Hello from stdout line " << i << std::endl; - std::cerr << "Hello from stderr line " << i << std::endl; - } -} - -void generate_large_output() { - for (unsigned i = 0; i < 100; ++i) { - std::cout << "Large output line " << i - << " to test buffer draining logic in the GUI console." - << std::endl; - } -} - -int main(int argc, char *argv[]) { - int test_var = 42; - - // Break before output. 
- int break_here = 0; // break here begin - - // Generate stdout/stderr output. - generate_output(); - generate_large_output(); - - // Wait to capture output. - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - return 0; // break here end -} diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index ec613d64e20a3..98f2205bc06a7 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -256,7 +256,6 @@ Makes programs 10x faster by doing Special New Thing. example, `breakpoint disable .` disables the just-hit breakpoint location. Another usage is to automate a command to run at the current location: `breakpoint command add -o 'p my_var' .`. * The `apropos` command now highlights matching keywords in its output when color is enabled. -* The TUI mode (enabled with the `gui` command) now has a real-time console output pane. stdout / stderr messages get redirected to this pane when it is enabled. #### Deprecated APIs From 2e2d90b9866196182301576ce9488a0cb171b1ad Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 8 May 2026 12:21:31 +0200 Subject: [PATCH 022/538] [libc++] Introduce implicit and explicit ABI annotations (#193045) This patch introduces `_LIBCPP_{BEGIN/END}_EXPLICIT_ABI_ANNOTATIONS` and marks everything within an `_LIBCPP_{BEGIN,END}_UNVERSIONED_NAMESPACE_STD` (and any derivatives like `_LIBCPP_{BEGIN,END}_NAMESPACE_STD`) implicitly by default. This allows us to drop `_LIBCPP_HIDE_FROM_ABI` in most of the code base, except for functions which shouldn't be `_LIBCPP_HIDE_FROM_ABI`. This patch doesn't remove any `_LIBCPP_HIDE_FROM_ABI`s, since we have over 13k of them in the code base. Actually dropping them will happen over some time to avoid too many merge conflicts. 
--- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__algorithm/shuffle.h | 2 ++ libcxx/include/__algorithm/sort.h | 2 ++ libcxx/include/__atomic/atomic_sync.h | 4 ++++ libcxx/include/__atomic/atomic_sync_timed.h | 2 ++ .../__charconv/from_chars_floating_point.h | 2 ++ .../__charconv/to_chars_floating_point.h | 2 ++ libcxx/include/__chrono/exception.h | 2 ++ libcxx/include/__chrono/file_clock.h | 4 ++++ libcxx/include/__chrono/steady_clock.h | 2 ++ libcxx/include/__chrono/system_clock.h | 2 ++ libcxx/include/__chrono/time_zone.h | 2 ++ libcxx/include/__chrono/tzdb.h | 2 ++ libcxx/include/__chrono/tzdb_list.h | 2 ++ .../__condition_variable/condition_variable.h | 2 ++ libcxx/include/__config | 3 --- libcxx/include/__configuration/attributes.h | 1 + .../__configuration/diagnostic_suppression.h | 1 + libcxx/include/__configuration/namespace.h | 16 ++++++++++++-- libcxx/include/__configuration/utility.h | 22 +++++++++++++++++++ libcxx/include/__exception/exception.h | 2 ++ libcxx/include/__exception/exception_ptr.h | 3 +++ libcxx/include/__exception/nested_exception.h | 2 ++ libcxx/include/__exception/operations.h | 4 ++++ libcxx/include/__exception/terminate.h | 4 ++++ .../include/__expected/bad_expected_access.h | 2 ++ libcxx/include/__filesystem/directory_entry.h | 2 ++ .../include/__filesystem/directory_iterator.h | 2 ++ .../include/__filesystem/filesystem_error.h | 2 ++ libcxx/include/__filesystem/operations.h | 2 ++ libcxx/include/__filesystem/path.h | 2 ++ libcxx/include/__filesystem/path_iterator.h | 2 ++ .../recursive_directory_iterator.h | 2 ++ libcxx/include/__format/format_error.h | 2 ++ libcxx/include/__functional/function.h | 2 ++ libcxx/include/__functional/hash.h | 2 ++ libcxx/include/__hash_table | 2 ++ libcxx/include/__locale | 2 ++ libcxx/include/__locale_dir/check_grouping.h | 2 ++ libcxx/include/__locale_dir/get_c_locale.h | 2 ++ libcxx/include/__locale_dir/messages.h | 2 ++ libcxx/include/__locale_dir/money.h | 2 ++ 
libcxx/include/__locale_dir/num.h | 2 ++ libcxx/include/__locale_dir/support/windows.h | 2 ++ libcxx/include/__locale_dir/time.h | 2 ++ libcxx/include/__log_hardening_failure | 2 ++ libcxx/include/__math/gamma.h | 2 ++ libcxx/include/__memory/shared_count.h | 2 ++ libcxx/include/__memory/shared_ptr.h | 2 ++ .../__memory_resource/memory_resource.h | 2 ++ .../monotonic_buffer_resource.h | 2 ++ .../synchronized_pool_resource.h | 2 ++ .../unsynchronized_pool_resource.h | 2 ++ libcxx/include/__mutex/mutex.h | 2 ++ libcxx/include/__mutex/once_flag.h | 2 ++ libcxx/include/__new/exceptions.h | 3 +++ libcxx/include/__new/new_handler.h | 4 ++++ libcxx/include/__ostream/basic_ostream.h | 2 ++ libcxx/include/__ostream/print.h | 2 ++ libcxx/include/__pstl/backends/libdispatch.h | 2 ++ libcxx/include/__random/random_device.h | 2 ++ .../include/__system_error/error_category.h | 2 ++ libcxx/include/__system_error/error_code.h | 2 ++ .../include/__system_error/error_condition.h | 2 ++ libcxx/include/__system_error/system_error.h | 2 ++ .../__system_error/throw_system_error.h | 2 ++ libcxx/include/__thread/support/windows.h | 2 ++ libcxx/include/__thread/this_thread.h | 2 ++ libcxx/include/__thread/thread.h | 2 ++ libcxx/include/__verbose_abort | 2 ++ libcxx/include/any | 4 ++++ libcxx/include/barrier | 2 ++ libcxx/include/codecvt | 2 ++ libcxx/include/condition_variable | 2 ++ libcxx/include/fstream | 2 ++ libcxx/include/future | 2 ++ libcxx/include/ios | 2 ++ libcxx/include/istream | 2 ++ libcxx/include/module.modulemap.in | 1 + libcxx/include/mutex | 2 ++ libcxx/include/print | 2 ++ libcxx/include/regex | 2 ++ libcxx/include/shared_mutex | 2 ++ libcxx/include/sstream | 2 ++ libcxx/include/stdexcept | 2 ++ libcxx/include/streambuf | 2 ++ libcxx/include/string | 2 ++ libcxx/include/strstream | 2 ++ libcxx/include/valarray | 2 ++ libcxx/include/variant | 2 ++ libcxx/src/algorithm.cpp | 2 ++ libcxx/src/any.cpp | 2 ++ libcxx/src/atomic.cpp | 2 ++ libcxx/src/barrier.cpp | 2 ++ 
libcxx/src/call_once.cpp | 2 ++ libcxx/src/charconv.cpp | 3 +++ libcxx/src/chrono.cpp | 2 ++ libcxx/src/condition_variable.cpp | 2 ++ libcxx/src/condition_variable_destructor.cpp | 2 ++ libcxx/src/error_category.cpp | 2 ++ libcxx/src/expected.cpp | 4 ++++ libcxx/src/experimental/chrono_exception.cpp | 2 ++ .../experimental/include/tzdb/tzdb_private.h | 2 ++ .../experimental/log_hardening_failure.cpp | 2 ++ libcxx/src/experimental/time_zone.cpp | 2 ++ libcxx/src/experimental/tzdb.cpp | 2 ++ libcxx/src/experimental/tzdb_list.cpp | 2 ++ libcxx/src/filesystem/directory_entry.cpp | 2 ++ libcxx/src/filesystem/directory_iterator.cpp | 2 ++ libcxx/src/filesystem/filesystem_clock.cpp | 2 ++ libcxx/src/filesystem/filesystem_error.cpp | 2 ++ libcxx/src/filesystem/operations.cpp | 2 ++ libcxx/src/filesystem/path.cpp | 2 ++ libcxx/src/fstream.cpp | 2 ++ libcxx/src/functional.cpp | 2 ++ libcxx/src/future.cpp | 2 ++ libcxx/src/hash.cpp | 2 ++ libcxx/src/ios.cpp | 2 ++ libcxx/src/iostream.cpp | 2 ++ libcxx/src/locale.cpp | 2 ++ libcxx/src/memory.cpp | 2 ++ libcxx/src/memory_resource.cpp | 2 ++ libcxx/src/mutex.cpp | 2 ++ libcxx/src/mutex_destructor.cpp | 2 ++ libcxx/src/optional.cpp | 2 ++ libcxx/src/ostream.cpp | 2 ++ libcxx/src/print.cpp | 2 ++ libcxx/src/pstl/libdispatch.cpp | 2 ++ libcxx/src/random.cpp | 2 ++ libcxx/src/random_shuffle.cpp | 2 ++ libcxx/src/regex.cpp | 2 ++ libcxx/src/shared_mutex.cpp | 2 ++ libcxx/src/std_stream.h | 2 ++ libcxx/src/stdexcept.cpp | 2 ++ libcxx/src/string.cpp | 2 ++ libcxx/src/strstream.cpp | 2 ++ libcxx/src/support/win32/locale_win32.cpp | 2 ++ libcxx/src/support/win32/thread_win32.cpp | 2 ++ libcxx/src/system_error.cpp | 2 ++ libcxx/src/thread.cpp | 2 ++ libcxx/src/valarray.cpp | 2 ++ libcxx/src/vector.cpp | 2 ++ libcxx/src/verbose_abort.cpp | 2 ++ libcxx/test/support/test_tzdb.h | 2 ++ 144 files changed, 331 insertions(+), 5 deletions(-) create mode 100644 libcxx/include/__configuration/utility.h diff --git a/libcxx/include/CMakeLists.txt 
b/libcxx/include/CMakeLists.txt index e27fb7602430f..5c2520de90dcc 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -341,6 +341,7 @@ set(files __configuration/language.h __configuration/namespace.h __configuration/platform.h + __configuration/utility.h __coroutine/coroutine_handle.h __coroutine/coroutine_traits.h __coroutine/noop_coroutine_handle.h diff --git a/libcxx/include/__algorithm/shuffle.h b/libcxx/include/__algorithm/shuffle.h index 7177fbb469ba7..c0707f17b5b33 100644 --- a/libcxx/include/__algorithm/shuffle.h +++ b/libcxx/include/__algorithm/shuffle.h @@ -28,6 +28,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI __libcpp_debug_randomizer { public: _LIBCPP_HIDE_FROM_ABI __libcpp_debug_randomizer() { @@ -160,6 +161,7 @@ shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last, _UniformRan std::move(__first), std::move(__last), std::forward<_UniformRandomNumberGenerator>(__g)); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h index 8aa894e9228c6..0a936db78e5fd 100644 --- a/libcxx/include/__algorithm/sort.h +++ b/libcxx/include/__algorithm/sort.h @@ -828,6 +828,7 @@ void __introsort(_RandomAccessIterator __first, } } +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template void __sort(_RandomAccessIterator, _RandomAccessIterator, _Comp); @@ -856,6 +857,7 @@ extern template _LIBCPP_EXPORTED_FROM_ABI void __sort<__less&, float*>(fl extern template _LIBCPP_EXPORTED_FROM_ABI void __sort<__less&, double*>(double*, double*, __less&); extern template _LIBCPP_EXPORTED_FROM_ABI void __sort<__less&, long double*>(long double*, long double*, __less&); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h 
index 12bd1b96a0269..b9b67d9092c64 100644 --- a/libcxx/include/__atomic/atomic_sync.h +++ b/libcxx/include/__atomic/atomic_sync.h @@ -28,6 +28,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 # if _LIBCPP_HAS_THREADS +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + # if !_LIBCPP_AVAILABILITY_HAS_NEW_SYNC // old dylib interface kept for backwards compatibility @@ -73,6 +75,8 @@ _LIBCPP_AVAILABILITY_NEW_SYNC _LIBCPP_EXPORTED_FROM_ABI void __atomic_notify_one template _LIBCPP_AVAILABILITY_NEW_SYNC _LIBCPP_EXPORTED_FROM_ABI void __atomic_notify_all_native(const void*) _NOEXCEPT; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS + # if _LIBCPP_AVAILABILITY_HAS_NEW_SYNC template diff --git a/libcxx/include/__atomic/atomic_sync_timed.h b/libcxx/include/__atomic/atomic_sync_timed.h index f3bf780b0e364..7cb24151fde7c 100644 --- a/libcxx/include/__atomic/atomic_sync_timed.h +++ b/libcxx/include/__atomic/atomic_sync_timed.h @@ -30,6 +30,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 # if _LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_NEW_SYNC +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_AVAILABILITY_NEW_SYNC _LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __atomic_monitor_global(void const* __address) _NOEXCEPT; @@ -41,6 +42,7 @@ _LIBCPP_AVAILABILITY_NEW_SYNC _LIBCPP_EXPORTED_FROM_ABI void __atomic_wait_globa template _LIBCPP_AVAILABILITY_NEW_SYNC _LIBCPP_EXPORTED_FROM_ABI void __atomic_wait_native_with_timeout(void const* __address, void const* __old_value, uint64_t __timeout_ns) _NOEXCEPT; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template struct __atomic_wait_timed_backoff_impl { diff --git a/libcxx/include/__charconv/from_chars_floating_point.h b/libcxx/include/__charconv/from_chars_floating_point.h index 811e518a81db7..ed7c54ae58261 100644 --- a/libcxx/include/__charconv/from_chars_floating_point.h +++ b/libcxx/include/__charconv/from_chars_floating_point.h @@ -35,6 +35,7 @@ struct __from_chars_result { errc __ec; }; +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template 
_LIBCPP_EXPORTED_FROM_ABI __from_chars_result<_Fp> __from_chars_floating_point( _LIBCPP_NOESCAPE const char* __first, _LIBCPP_NOESCAPE const char* __last, chars_format __fmt); @@ -44,6 +45,7 @@ extern template __from_chars_result __from_chars_floating_point( extern template __from_chars_result __from_chars_floating_point( _LIBCPP_NOESCAPE const char* __first, _LIBCPP_NOESCAPE const char* __last, chars_format __fmt); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template _LIBCPP_HIDE_FROM_ABI from_chars_result diff --git a/libcxx/include/__charconv/to_chars_floating_point.h b/libcxx/include/__charconv/to_chars_floating_point.h index 118f316b21a10..ca997f85526f9 100644 --- a/libcxx/include/__charconv/to_chars_floating_point.h +++ b/libcxx/include/__charconv/to_chars_floating_point.h @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_STD_VER >= 17 @@ -50,6 +51,7 @@ _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT _LIBCPP_EXPORTED_FROM_ABI to_chars_ to_chars(char* __first, char* __last, long double __value, chars_format __fmt, int __precision); #endif // _LIBCPP_STD_VER >= 17 +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H diff --git a/libcxx/include/__chrono/exception.h b/libcxx/include/__chrono/exception.h index 1eb5b1b62d92c..27fbfd6b57a2b 100644 --- a/libcxx/include/__chrono/exception.h +++ b/libcxx/include/__chrono/exception.h @@ -31,6 +31,7 @@ # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_STD_VER >= 20 @@ -128,6 +129,7 @@ template # endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB diff --git a/libcxx/include/__chrono/file_clock.h b/libcxx/include/__chrono/file_clock.h index 968f652f796d2..7ded6162fada6 100644 --- a/libcxx/include/__chrono/file_clock.h +++ b/libcxx/include/__chrono/file_clock.h @@ -46,6 +46,8 @@ 
_LIBCPP_END_NAMESPACE_STD #ifndef _LIBCPP_CXX03_LANG _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + struct _FilesystemClock { # if _LIBCPP_HAS_INT128 typedef __int128_t rep; @@ -76,6 +78,8 @@ struct _FilesystemClock { } # endif // _LIBCPP_STD_VER >= 20 }; + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM #endif // !_LIBCPP_CXX03_LANG diff --git a/libcxx/include/__chrono/steady_clock.h b/libcxx/include/__chrono/steady_clock.h index 8e68c9a3c20f2..ed530815dc4cb 100644 --- a/libcxx/include/__chrono/steady_clock.h +++ b/libcxx/include/__chrono/steady_clock.h @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -37,6 +38,7 @@ class _LIBCPP_EXPORTED_FROM_ABI steady_clock { } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___CHRONO_STEADY_CLOCK_H diff --git a/libcxx/include/__chrono/system_clock.h b/libcxx/include/__chrono/system_clock.h index e3ef75ae50fa6..449514b051492 100644 --- a/libcxx/include/__chrono/system_clock.h +++ b/libcxx/include/__chrono/system_clock.h @@ -20,6 +20,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -47,6 +48,7 @@ using sys_days = sys_time; } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___CHRONO_SYSTEM_CLOCK_H diff --git a/libcxx/include/__chrono/time_zone.h b/libcxx/include/__chrono/time_zone.h index d18d59d2736bf..20f3297f772b9 100644 --- a/libcxx/include/__chrono/time_zone.h +++ b/libcxx/include/__chrono/time_zone.h @@ -36,6 +36,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION @@ -176,6 +177,7 @@ operator<=>(const time_zone& __x, const time_zone& __y) noexcept { # endif // 
_LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && // _LIBCPP_HAS_LOCALIZATION +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__chrono/tzdb.h b/libcxx/include/__chrono/tzdb.h index fb85f66b01968..2fa04604b91f7 100644 --- a/libcxx/include/__chrono/tzdb.h +++ b/libcxx/include/__chrono/tzdb.h @@ -35,6 +35,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION @@ -87,6 +88,7 @@ struct tzdb { # endif // _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && // _LIBCPP_HAS_LOCALIZATION +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h index 2b83a6df1daf8..997599ac74964 100644 --- a/libcxx/include/__chrono/tzdb_list.h +++ b/libcxx/include/__chrono/tzdb_list.h @@ -28,6 +28,7 @@ # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION @@ -101,6 +102,7 @@ _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const tzdb& reload_tzdb(); # endif // _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && // _LIBCPP_HAS_LOCALIZATION +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB diff --git a/libcxx/include/__condition_variable/condition_variable.h b/libcxx/include/__condition_variable/condition_variable.h index b7151930e9226..3fa40287c3600 100644 --- a/libcxx/include/__condition_variable/condition_variable.h +++ b/libcxx/include/__condition_variable/condition_variable.h @@ -32,6 +32,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_HAS_THREADS @@ -225,6 +226,7 @@ inline void condition_variable::__do_timed_wait(unique_lock& __lk, #endif // _LIBCPP_HAS_THREADS +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__config b/libcxx/include/__config index 6cf9652327236..a34c6ee502bb2 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -40,9 +40,6 @@ # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) # define _LIBCPP_CONCAT3(X, Y, Z) _LIBCPP_CONCAT(X, _LIBCPP_CONCAT(Y, Z)) -# define _LIBCPP_TOSTRING2(x) #x -# define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x) - # ifndef __has_constexpr_builtin # define __has_constexpr_builtin(x) 0 # endif diff --git a/libcxx/include/__configuration/attributes.h b/libcxx/include/__configuration/attributes.h index 39683b7688d76..cc828466482fd 100644 --- a/libcxx/include/__configuration/attributes.h +++ b/libcxx/include/__configuration/attributes.h @@ -12,6 +12,7 @@ #include <__config_site> #include <__configuration/hardening.h> #include <__configuration/language.h> +#include <__configuration/utility.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__configuration/diagnostic_suppression.h b/libcxx/include/__configuration/diagnostic_suppression.h index 25d43449a6a77..cd9ffa06d1202 100644 --- a/libcxx/include/__configuration/diagnostic_suppression.h +++ b/libcxx/include/__configuration/diagnostic_suppression.h @@ -12,6 +12,7 @@ #include <__config_site> #include <__configuration/compiler.h> +#include <__configuration/utility.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__configuration/namespace.h b/libcxx/include/__configuration/namespace.h index 740baad567af7..befb64ac5903e 100644 --- a/libcxx/include/__configuration/namespace.h +++ b/libcxx/include/__configuration/namespace.h @@ -13,6 +13,7 @@ #include 
<__config_site> #include <__configuration/attributes.h> #include <__configuration/diagnostic_suppression.h> +#include <__configuration/utility.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header @@ -39,6 +40,17 @@ # define _LIBCPP_POP_EXTENSION_DIAGNOSTICS #endif +#define _LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS \ + _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wpragma-clang-attribute") \ + _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wignored-attributes") \ + _Pragma(_LIBCPP_TOSTRING(clang attribute _LibcxxExplicitABIAnnotations.push( \ + __attribute__((__exclude_from_explicit_instantiation__, \ + __visibility__("hidden"), \ + __abi_tag__(_LIBCPP_TOSTRING(_LIBCPP_ODR_SIGNATURE)))), \ + apply_to = function))) _LIBCPP_DIAGNOSTIC_POP + +#define _LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _Pragma("clang attribute _LibcxxExplicitABIAnnotations.pop") + // clang-format off // The unversioned namespace is used when we want to be ABI compatible with other standard libraries in some way. There @@ -50,9 +62,9 @@ // If it's not clear whether using the unversioned namespace is the correct thing to do, it's not. The versioned // namespace (_LIBCPP_BEGIN_NAMESPACE_STD) should almost always be used. 
# define _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD \ - _LIBCPP_PUSH_EXTENSION_DIAGNOSTICS namespace _LIBCPP_NAMESPACE_VISIBILITY std { + _LIBCPP_PUSH_EXTENSION_DIAGNOSTICS _LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS namespace _LIBCPP_NAMESPACE_VISIBILITY std { -# define _LIBCPP_END_UNVERSIONED_NAMESPACE_STD } _LIBCPP_POP_EXTENSION_DIAGNOSTICS +# define _LIBCPP_END_UNVERSIONED_NAMESPACE_STD } _LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_POP_EXTENSION_DIAGNOSTICS # define _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD inline namespace _LIBCPP_ABI_NAMESPACE { # define _LIBCPP_END_NAMESPACE_STD } _LIBCPP_END_UNVERSIONED_NAMESPACE_STD diff --git a/libcxx/include/__configuration/utility.h b/libcxx/include/__configuration/utility.h new file mode 100644 index 0000000000000..81e91887614d3 --- /dev/null +++ b/libcxx/include/__configuration/utility.h @@ -0,0 +1,22 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONFIGURATION_UTILITY_H +#define _LIBCPP___CONFIGURATION_UTILITY_H + +#include <__config_site> + +#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER +# pragma GCC system_header +#endif + +#define _LIBCPP_TOSTRING2(x) #x +#define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x) + +#endif // _LIBCPP___CONFIGURATION_UTILITY_H diff --git a/libcxx/include/__exception/exception.h b/libcxx/include/__exception/exception.h index ddc34b0fa8fa1..c5f37d5fd1611 100644 --- a/libcxx/include/__exception/exception.h +++ b/libcxx/include/__exception/exception.h @@ -22,6 +22,7 @@ #endif _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if defined(_LIBCPP_ABI_VCRUNTIME) && (!defined(_HAS_EXCEPTIONS) || _HAS_EXCEPTIONS != 0) // The std::exception class was already included above, but we're explicit about this condition here for clarity. 
@@ -91,6 +92,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_exception : public exception { }; #endif // !_LIBCPP_ABI_VCRUNTIME +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD #endif // _LIBCPP___EXCEPTION_EXCEPTION_H diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index d3c137d530411..f7ed00d555836 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -60,6 +60,7 @@ _LIBCPP_OVERRIDABLE_FUNC_VIS __cxa_exception* __cxa_init_primary_exception( #endif // !defined(_LIBCPP_ABI_MICROSOFT) _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #ifndef _LIBCPP_ABI_MICROSOFT @@ -219,6 +220,8 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { } #endif // defined(_LIBCPP_ABI_MICROSOFT) + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index dd84efbccde88..f8abe7db50de7 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -28,6 +28,7 @@ #endif _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI nested_exception { exception_ptr __ptr_; @@ -95,6 +96,7 @@ inline _LIBCPP_HIDE_FROM_ABI void rethrow_if_nested(const _Ep& __e) { template ::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI void rethrow_if_nested(const _Ep&) {} +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD #endif // _LIBCPP___EXCEPTION_NESTED_EXCEPTION_H diff --git a/libcxx/include/__exception/operations.h b/libcxx/include/__exception/operations.h index 2b93ad260c30b..83507a7393662 100644 --- a/libcxx/include/__exception/operations.h +++ b/libcxx/include/__exception/operations.h @@ -16,6 +16,8 @@ #endif _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD 
+_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + #if _LIBCPP_STD_VER <= 14 || defined(_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS) || \ defined(_LIBCPP_BUILDING_LIBRARY) using unexpected_handler = void (*)(); @@ -37,6 +39,8 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr; [[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; [[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD #endif // _LIBCPP___EXCEPTION_OPERATIONS_H diff --git a/libcxx/include/__exception/terminate.h b/libcxx/include/__exception/terminate.h index 955a49c2b00c3..8f2f584b18d35 100644 --- a/libcxx/include/__exception/terminate.h +++ b/libcxx/include/__exception/terminate.h @@ -16,7 +16,11 @@ #endif _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + [[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD #endif // _LIBCPP___EXCEPTION_TERMINATE_H diff --git a/libcxx/include/__expected/bad_expected_access.h b/libcxx/include/__expected/bad_expected_access.h index b1958101d5178..d0ba175c04c92 100644 --- a/libcxx/include/__expected/bad_expected_access.h +++ b/libcxx/include/__expected/bad_expected_access.h @@ -23,6 +23,7 @@ _LIBCPP_PUSH_MACROS #if _LIBCPP_STD_VER >= 23 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template class bad_expected_access; @@ -66,6 +67,7 @@ class bad_expected_access : public bad_expected_access { _Err __unex_; }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h index fab400b439429..d741984b34fee 100644 --- a/libcxx/include/__filesystem/directory_entry.h +++ b/libcxx/include/__filesystem/directory_entry.h @@ -39,6 +39,7 @@ _LIBCPP_PUSH_MACROS 
#if _LIBCPP_STD_VER >= 17 && _LIBCPP_HAS_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class directory_entry { typedef filesystem::path _Path; @@ -465,6 +466,7 @@ class __dir_element_proxy { directory_entry __elem_; }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM #endif // _LIBCPP_STD_VER >= 17 && _LIBCPP_HAS_FILESYSTEM diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h index b62129807b567..52a5bf59a2c80 100644 --- a/libcxx/include/__filesystem/directory_iterator.h +++ b/libcxx/include/__filesystem/directory_iterator.h @@ -33,6 +33,7 @@ _LIBCPP_PUSH_MACROS #if _LIBCPP_STD_VER >= 17 && _LIBCPP_HAS_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_HIDDEN __dir_stream; class directory_iterator { @@ -129,6 +130,7 @@ operator!=(const directory_iterator& __lhs, const directory_iterator& __rhs) noe return directory_iterator(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM # if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h index 6f1daf866a504..054e38111b415 100644 --- a/libcxx/include/__filesystem/filesystem_error.h +++ b/libcxx/include/__filesystem/filesystem_error.h @@ -26,6 +26,7 @@ #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI filesystem_error : public system_error { public: @@ -80,6 +81,7 @@ template } # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__filesystem/operations.h b/libcxx/include/__filesystem/operations.h index f536a1a9d4466..77b6e1908066d 100644 --- a/libcxx/include/__filesystem/operations.h +++ b/libcxx/include/__filesystem/operations.h @@ -31,6 +31,7 @@ _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM 
+_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI path __absolute(const path&, error_code* __ec = nullptr); _LIBCPP_EXPORTED_FROM_ABI path __canonical(const path&, error_code* __ec = nullptr); _LIBCPP_EXPORTED_FROM_ABI bool @@ -67,6 +68,7 @@ _LIBCPP_EXPORTED_FROM_ABI path __temp_directory_path(error_code* __ec = nullptr) _LIBCPP_EXPORTED_FROM_ABI bool __fs_is_empty(const path& __p, error_code* __ec = nullptr); _LIBCPP_EXPORTED_FROM_ABI void __permissions(const path&, perms, perm_options, error_code* = nullptr); _LIBCPP_EXPORTED_FROM_ABI space_info __space(const path&, error_code* __ec = nullptr); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS [[nodiscard]] inline _LIBCPP_HIDE_FROM_ABI path absolute(const path& __p) { return __absolute(__p); } [[nodiscard]] inline _LIBCPP_HIDE_FROM_ABI path absolute(const path& __p, error_code& __ec) { diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index 47ecf1ce19782..a63c4ee611ebf 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -42,6 +42,7 @@ _LIBCPP_PUSH_MACROS #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template struct __can_convert_char { @@ -899,6 +900,7 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(path& __lhs, path& __rhs) noexcept { __lh [[nodiscard]] _LIBCPP_EXPORTED_FROM_ABI size_t hash_value(const path& __p) noexcept; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__filesystem/path_iterator.h b/libcxx/include/__filesystem/path_iterator.h index dd408a76ca597..30a9a5a059af7 100644 --- a/libcxx/include/__filesystem/path_iterator.h +++ b/libcxx/include/__filesystem/path_iterator.h @@ -22,6 +22,7 @@ #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI path::iterator { public: @@ -103,6 +104,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool 
operator!=(const path::iterator& __lhs, const return !(__lhs == __rhs); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h index 18165b0031e58..99665d017a101 100644 --- a/libcxx/include/__filesystem/recursive_directory_iterator.h +++ b/libcxx/include/__filesystem/recursive_directory_iterator.h @@ -32,6 +32,7 @@ _LIBCPP_PUSH_MACROS #if _LIBCPP_STD_VER >= 17 && _LIBCPP_HAS_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class recursive_directory_iterator { public: @@ -139,6 +140,7 @@ begin(recursive_directory_iterator __iter) noexcept { return recursive_directory_iterator(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM # if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__format/format_error.h b/libcxx/include/__format/format_error.h index b92e6d1de00e2..853b4c1c3e75b 100644 --- a/libcxx/include/__format/format_error.h +++ b/libcxx/include/__format/format_error.h @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_STD_VER >= 20 @@ -45,6 +46,7 @@ _LIBCPP_DIAGNOSTIC_POP #endif // _LIBCPP_STD_VER >= 20 +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___FORMAT_FORMAT_ERROR_H diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index d3a978ac862db..3185a1942455a 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -44,6 +44,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // bad_function_call _LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if !_LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wweak-vtables") # endif @@ -65,6 +66,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { const 
char* what() const _NOEXCEPT override; # endif }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_DIAGNOSTIC_POP [[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h index fa7181984f163..c794f57356ae7 100644 --- a/libcxx/include/__functional/hash.h +++ b/libcxx/include/__functional/hash.h @@ -249,7 +249,9 @@ struct __murmur2_or_cityhash<_Size, 64> { }; #if _LIBCPP_AVAILABILITY_HAS_HASH_MEMORY +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[__gnu__::__pure__]] _LIBCPP_EXPORTED_FROM_ABI size_t __hash_memory(_LIBCPP_NOESCAPE const void*, size_t) _NOEXCEPT; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS #else _LIBCPP_HIDE_FROM_ABI inline size_t __hash_memory(const void* __ptr, size_t __size) _NOEXCEPT { return __murmur2_or_cityhash()(__ptr, __size); diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 4b1e729c1d459..e1264703f6b18 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -74,7 +74,9 @@ struct __is_hash_value_type : false_type {}; template struct __is_hash_value_type<_One> : __is_hash_value_type_imp<__remove_cvref_t<_One> > {}; +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI size_t __next_prime(size_t __n); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template struct __hash_node_base { diff --git a/libcxx/include/__locale b/libcxx/include/__locale index 8b6b48ccd6b59..1d5f12431588c 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -42,6 +42,7 @@ # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI locale; @@ -1497,6 +1498,7 @@ protected: }; # endif // _LIBCPP_HAS_WIDE_CHARACTERS +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_LOCALIZATION diff --git a/libcxx/include/__locale_dir/check_grouping.h b/libcxx/include/__locale_dir/check_grouping.h index 93e9e404bb5f3..fe0f4f4b1780a 100644 --- 
a/libcxx/include/__locale_dir/check_grouping.h +++ b/libcxx/include/__locale_dir/check_grouping.h @@ -20,10 +20,12 @@ # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI void __check_grouping(const string& __grouping, unsigned* __g, unsigned* __g_end, ios_base::iostate& __err); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_LOCALIZATION diff --git a/libcxx/include/__locale_dir/get_c_locale.h b/libcxx/include/__locale_dir/get_c_locale.h index e8bac9a87095b..677827f2d8f1d 100644 --- a/libcxx/include/__locale_dir/get_c_locale.h +++ b/libcxx/include/__locale_dir/get_c_locale.h @@ -29,7 +29,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD # else # define _LIBCPP_GET_C_LOCALE __cloc() // Get the C locale object +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI __locale::__locale_t __cloc(); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS # define __cloc_defined # endif diff --git a/libcxx/include/__locale_dir/messages.h b/libcxx/include/__locale_dir/messages.h index 686f472840c22..4023d2d5688d0 100644 --- a/libcxx/include/__locale_dir/messages.h +++ b/libcxx/include/__locale_dir/messages.h @@ -33,6 +33,7 @@ # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI messages_base { public: @@ -136,6 +137,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS messages_byname; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS messages_byname; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_LOCALIZATION diff --git a/libcxx/include/__locale_dir/money.h b/libcxx/include/__locale_dir/money.h index 12ba38467d805..bb67f95ca2e0b 100644 --- a/libcxx/include/__locale_dir/money.h +++ b/libcxx/include/__locale_dir/money.h @@ -32,6 +32,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // money_base @@ -864,6 +865,7 @@ extern template 
class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS money_put; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS money_put; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h index b7ea02e7cb7f7..8af427ca37dbe 100644 --- a/libcxx/include/__locale_dir/num.h +++ b/libcxx/include/__locale_dir/num.h @@ -42,6 +42,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS struct _LIBCPP_EXPORTED_FROM_ABI __num_get_base { static const int __num_get_buf_sz = 40; @@ -1008,6 +1009,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_put; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_put; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__locale_dir/support/windows.h b/libcxx/include/__locale_dir/support/windows.h index bacad6fa52e0c..c9ea500bc0b12 100644 --- a/libcxx/include/__locale_dir/support/windows.h +++ b/libcxx/include/__locale_dir/support/windows.h @@ -27,6 +27,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace __locale { using __lconv_t _LIBCPP_NODEBUG = std::lconv; @@ -304,6 +305,7 @@ struct __locale_guard { #endif // _LIBCPP_BUILDING_LIBRARY } // namespace __locale +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___LOCALE_DIR_SUPPORT_WINDOWS_H diff --git a/libcxx/include/__locale_dir/time.h b/libcxx/include/__locale_dir/time.h index 78698e9651918..7db1cc660cd30 100644 --- a/libcxx/include/__locale_dir/time.h +++ b/libcxx/include/__locale_dir/time.h @@ -22,6 +22,7 @@ # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template _LIBCPP_HIDE_FROM_ABI int __get_up_to_n_digits( @@ -755,6 +756,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_put_byname; extern template class 
_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_put_byname; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_LOCALIZATION diff --git a/libcxx/include/__log_hardening_failure b/libcxx/include/__log_hardening_failure index d1805306f6b6e..2b6065e9a19fb 100644 --- a/libcxx/include/__log_hardening_failure +++ b/libcxx/include/__log_hardening_failure @@ -24,7 +24,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through the // `_LIBCPP_LOG_HARDENING_FAILURE` macro. +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[__gnu__::__cold__]] _LIBCPP_EXPORTED_FROM_ABI void __log_hardening_failure(const char* __message) noexcept; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS // _LIBCPP_LOG_HARDENING_FAILURE(message) // diff --git a/libcxx/include/__math/gamma.h b/libcxx/include/__math/gamma.h index 6c82cbf0cae93..a742ea42d9009 100644 --- a/libcxx/include/__math/gamma.h +++ b/libcxx/include/__math/gamma.h @@ -68,11 +68,13 @@ inline _LIBCPP_HIDE_FROM_ABI double __lgamma_r(double __d) _NOEXCEPT { return __ #else +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if defined(_LIBCPP_OBJECT_FORMAT_MACHO) double __lgamma_r_shim(double, int*) _NOEXCEPT __asm__("_lgamma_r"); # else double __lgamma_r_shim(double, int*) _NOEXCEPT __asm__("lgamma_r"); # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS inline _LIBCPP_HIDE_FROM_ABI double __lgamma_r(double __d) _NOEXCEPT { int __sign; diff --git a/libcxx/include/__memory/shared_count.h b/libcxx/include/__memory/shared_count.h index b40d8c9cf77d1..4fd81287d854c 100644 --- a/libcxx/include/__memory/shared_count.h +++ b/libcxx/include/__memory/shared_count.h @@ -18,6 +18,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // NOTE: Relaxed and acq/rel atomics (for increment and decrement respectively) // should be sufficient for thread safety. 
@@ -111,6 +112,7 @@ class _LIBCPP_EXPORTED_FROM_ABI __shared_weak_count : private __shared_count { virtual void __on_zero_shared_weak() _NOEXCEPT = 0; }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___MEMORY_SHARED_COUNT_H diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 4c86eb160ef1a..bbf4a5553f78d 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -70,6 +70,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI bad_weak_ptr : public std::exception { public: @@ -1536,6 +1537,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool atomic_compare_exchange_weak_explicit( #endif // _LIBCPP_HAS_THREADS +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__memory_resource/memory_resource.h b/libcxx/include/__memory_resource/memory_resource.h index 5b42ae54890b1..c5726bd2bcdb5 100644 --- a/libcxx/include/__memory_resource/memory_resource.h +++ b/libcxx/include/__memory_resource/memory_resource.h @@ -21,6 +21,7 @@ #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace pmr { @@ -84,6 +85,7 @@ null_memory_resource() noexcept; } // namespace pmr +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__memory_resource/monotonic_buffer_resource.h b/libcxx/include/__memory_resource/monotonic_buffer_resource.h index 9c7b07df52f8a..a6d292386169b 100644 --- a/libcxx/include/__memory_resource/monotonic_buffer_resource.h +++ b/libcxx/include/__memory_resource/monotonic_buffer_resource.h @@ -21,6 +21,7 @@ #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace pmr { @@ -112,6 +113,7 @@ class _LIBCPP_AVAILABILITY_PMR _LIBCPP_EXPORTED_FROM_ABI 
monotonic_buffer_resour } // namespace pmr +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__memory_resource/synchronized_pool_resource.h b/libcxx/include/__memory_resource/synchronized_pool_resource.h index 1c929675bb3b3..97a0f28ad4129 100644 --- a/libcxx/include/__memory_resource/synchronized_pool_resource.h +++ b/libcxx/include/__memory_resource/synchronized_pool_resource.h @@ -24,6 +24,7 @@ #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace pmr { @@ -88,6 +89,7 @@ class _LIBCPP_AVAILABILITY_PMR _LIBCPP_EXPORTED_FROM_ABI synchronized_pool_resou } // namespace pmr +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h index 89198a1b7c96e..589afaf2faa22 100644 --- a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h +++ b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h @@ -22,6 +22,7 @@ #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace pmr { @@ -99,6 +100,7 @@ class _LIBCPP_AVAILABILITY_PMR _LIBCPP_EXPORTED_FROM_ABI unsynchronized_pool_res } // namespace pmr +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__mutex/mutex.h b/libcxx/include/__mutex/mutex.h index e9cedf8db1cca..d0addacc2893f 100644 --- a/libcxx/include/__mutex/mutex.h +++ b/libcxx/include/__mutex/mutex.h @@ -20,6 +20,7 @@ #if _LIBCPP_HAS_THREADS _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_CAPABILITY("mutex") mutex { __libcpp_mutex_t __m_ = _LIBCPP_MUTEX_INITIALIZER; @@ -46,6 +47,7 @@ class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_CAPABILITY("mutex") mutex { 
static_assert(is_nothrow_default_constructible::value, "the default constructor for std::mutex must be nothrow"); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_THREADS diff --git a/libcxx/include/__mutex/once_flag.h b/libcxx/include/__mutex/once_flag.h index ad15b2eb6df68..9999852b8b78a 100644 --- a/libcxx/include/__mutex/once_flag.h +++ b/libcxx/include/__mutex/once_flag.h @@ -113,7 +113,9 @@ void _LIBCPP_HIDE_FROM_ABI __call_once_proxy(void* __vp) { (*__p)(); } +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI void __call_once(volatile once_flag::_State_type&, void*, void (*)(void*)); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template inline _LIBCPP_HIDE_FROM_ABI _ValueType __libcpp_acquire_load(_ValueType const* __value) { diff --git a/libcxx/include/__new/exceptions.h b/libcxx/include/__new/exceptions.h index 1aadc23120cbb..28561187fb555 100644 --- a/libcxx/include/__new/exceptions.h +++ b/libcxx/include/__new/exceptions.h @@ -24,6 +24,8 @@ #endif _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + #if !defined(_LIBCPP_ABI_VCRUNTIME) class _LIBCPP_EXPORTED_FROM_ABI bad_alloc : public exception { @@ -74,6 +76,7 @@ class bad_array_new_length : public bad_alloc { _LIBCPP_VERBOSE_ABORT("bad_array_new_length was thrown in -fno-exceptions mode"); #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD #endif // _LIBCPP___NEW_EXCEPTIONS_H diff --git a/libcxx/include/__new/new_handler.h b/libcxx/include/__new/new_handler.h index 05f4e846c3ef9..fc6aabec61b64 100644 --- a/libcxx/include/__new/new_handler.h +++ b/libcxx/include/__new/new_handler.h @@ -19,9 +19,13 @@ # include #else _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + typedef void (*new_handler)(); _LIBCPP_EXPORTED_FROM_ABI new_handler set_new_handler(new_handler) _NOEXCEPT; _LIBCPP_EXPORTED_FROM_ABI new_handler get_new_handler() _NOEXCEPT; + 
+_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD #endif // _LIBCPP_ABI_VCRUNTIME diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h index 62770be72f983..43c9399758bf1 100644 --- a/libcxx/include/__ostream/basic_ostream.h +++ b/libcxx/include/__ostream/basic_ostream.h @@ -41,6 +41,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template class basic_ostream : virtual public basic_ios<_CharT, _Traits> { @@ -672,6 +673,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostream; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostream; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__ostream/print.h b/libcxx/include/__ostream/print.h index 8ed52301403e1..5b4ab60a45eea 100644 --- a/libcxx/include/__ostream/print.h +++ b/libcxx/include/__ostream/print.h @@ -82,7 +82,9 @@ _LIBCPP_HIDE_FROM_ABI inline void vprint_nonunicode(ostream& __os, string_view _ // native Unicode API; // Whether the returned FILE* is "a terminal capable of displaying Unicode" // is determined in the same way as the print(FILE*, ...) overloads. +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_HAS_UNICODE template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). 
diff --git a/libcxx/include/__pstl/backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h index 88d4231d29a0a..33fd5de66fbb1 100644 --- a/libcxx/include/__pstl/backends/libdispatch.h +++ b/libcxx/include/__pstl/backends/libdispatch.h @@ -48,6 +48,7 @@ _LIBCPP_PUSH_MACROS #if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace __pstl { namespace __libdispatch { @@ -393,6 +394,7 @@ struct __fill<__libdispatch_backend_tag, _ExecutionPolicy> : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {}; } // namespace __pstl +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__random/random_device.h b/libcxx/include/__random/random_device.h index 33ec585cc0efb..17ca2fc7499d0 100644 --- a/libcxx/include/__random/random_device.h +++ b/libcxx/include/__random/random_device.h @@ -20,6 +20,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_HAS_RANDOM_DEVICE @@ -74,6 +75,7 @@ class _LIBCPP_EXPORTED_FROM_ABI random_device { #endif // _LIBCPP_HAS_RANDOM_DEVICE +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__system_error/error_category.h b/libcxx/include/__system_error/error_category.h index 7f7c7355c7e7f..f7b9233f6ef9b 100644 --- a/libcxx/include/__system_error/error_category.h +++ b/libcxx/include/__system_error/error_category.h @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class error_condition; class _LIBCPP_EXPORTED_FROM_ABI error_code; @@ -70,6 +71,7 @@ class _LIBCPP_HIDDEN __do_message : public error_category { [[__gnu__::__const__]] [[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI const error_category& generic_category() _NOEXCEPT; [[__gnu__::__const__]] [[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI const error_category& system_category() 
_NOEXCEPT; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SYSTEM_ERROR_ERROR_CATEGORY_H diff --git a/libcxx/include/__system_error/error_code.h b/libcxx/include/__system_error/error_code.h index e904376939753..1b4fdd8eabb77 100644 --- a/libcxx/include/__system_error/error_code.h +++ b/libcxx/include/__system_error/error_code.h @@ -24,6 +24,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template struct is_error_code_enum : public false_type {}; @@ -137,6 +138,7 @@ struct hash : public __unary_function { } }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SYSTEM_ERROR_ERROR_CODE_H diff --git a/libcxx/include/__system_error/error_condition.h b/libcxx/include/__system_error/error_condition.h index be7deaba0444c..47d87be8d7654 100644 --- a/libcxx/include/__system_error/error_condition.h +++ b/libcxx/include/__system_error/error_condition.h @@ -23,6 +23,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template struct is_error_condition_enum : public false_type {}; @@ -124,6 +125,7 @@ struct hash : public __unary_function } }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SYSTEM_ERROR_ERROR_CONDITION_H diff --git a/libcxx/include/__system_error/system_error.h b/libcxx/include/__system_error/system_error.h index 74427d8f0bf9b..435b8f7f6aa13 100644 --- a/libcxx/include/__system_error/system_error.h +++ b/libcxx/include/__system_error/system_error.h @@ -22,6 +22,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI system_error : public runtime_error { error_code __ec_; @@ -52,6 +53,7 @@ class _LIBCPP_EXPORTED_FROM_ABI system_error : public runtime_error { #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SYSTEM_ERROR_SYSTEM_ERROR_H diff --git a/libcxx/include/__system_error/throw_system_error.h 
b/libcxx/include/__system_error/throw_system_error.h index e4605b6f014dc..abcfaeb7fbed2 100644 --- a/libcxx/include/__system_error/throw_system_error.h +++ b/libcxx/include/__system_error/throw_system_error.h @@ -17,9 +17,11 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SYSTEM_ERROR_THROW_SYSTEM_ERROR_H diff --git a/libcxx/include/__thread/support/windows.h b/libcxx/include/__thread/support/windows.h index 558b5c81dc191..558e8b5050e3e 100644 --- a/libcxx/include/__thread/support/windows.h +++ b/libcxx/include/__thread/support/windows.h @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS using __libcpp_timespec_t = ::timespec; @@ -126,6 +127,7 @@ _LIBCPP_EXPORTED_FROM_ABI void* __libcpp_tls_get(__libcpp_tls_key __key); _LIBCPP_EXPORTED_FROM_ABI int __libcpp_tls_set(__libcpp_tls_key __key, void* __p); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___THREAD_SUPPORT_WINDOWS_H diff --git a/libcxx/include/__thread/this_thread.h b/libcxx/include/__thread/this_thread.h index 4df137711a7fd..5a93e4d605027 100644 --- a/libcxx/include/__thread/this_thread.h +++ b/libcxx/include/__thread/this_thread.h @@ -32,7 +32,9 @@ namespace this_thread { #if _LIBCPP_HAS_THREADS +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI void sleep_for(const chrono::nanoseconds& __ns); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template _LIBCPP_HIDE_FROM_ABI void sleep_for(const chrono::duration<_Rep, _Period>& __d) { diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h index 3b1f45e45e35c..ee7a65212f31d 100644 --- a/libcxx/include/__thread/thread.h +++ b/libcxx/include/__thread/thread.h @@ -45,6 +45,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_HAS_THREADS @@ -261,6 +262,7 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(thread& __x, thread& __y) _NOEXCEPT { __x #endif // _LIBCPP_HAS_THREADS +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index f8b696733e2b3..22df0e14463d5 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -20,8 +20,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through // the _LIBCPP_VERBOSE_ABORT macro. +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[__noreturn__]] _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_ATTRIBUTE_FORMAT( __printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...) _NOEXCEPT; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS // _LIBCPP_VERBOSE_ABORT(format, args...) // diff --git a/libcxx/include/any b/libcxx/include/any index d9368df75296e..740bcec93835c 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -120,10 +120,14 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + class _LIBCPP_EXPORTED_FROM_ABI bad_any_cast : public bad_cast { public: const char* what() const _NOEXCEPT override; }; + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 428a39a44e095..0bd0e3da0ca18 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -95,6 +95,7 @@ using __barrier_phase_t _LIBCPP_NODEBUG = uint8_t; class __barrier_algorithm_base; +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[__gnu__::__returns_nonnull__, __gnu__::__malloc__]] _LIBCPP_EXPORTED_FROM_ABI __barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected); @@ -104,6 
+105,7 @@ __arrive_barrier_algorithm_base([[__gnu__::__nonnull__]] _LIBCPP_NOESCAPE __barr _LIBCPP_EXPORTED_FROM_ABI void __destroy_barrier_algorithm_base( [[__gnu__::__nonnull__]] _LIBCPP_NOESCAPE __barrier_algorithm_base* __barrier) noexcept; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS template class __barrier_base { diff --git a/libcxx/include/codecvt b/libcxx/include/codecvt index 33ade1d298a7e..00f3301a60f55 100644 --- a/libcxx/include/codecvt +++ b/libcxx/include/codecvt @@ -71,6 +71,7 @@ class codecvt_utf8_utf16 # if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT) _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS enum _LIBCPP_DEPRECATED_IN_CXX17 codecvt_mode { consume_header = 4, generate_header = 2, little_endian = 1 }; @@ -579,6 +580,7 @@ public: }; _LIBCPP_SUPPRESS_DEPRECATED_POP +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD # endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT) diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index d42a4802b1792..a5dc060fbfa38 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -146,6 +146,7 @@ _LIBCPP_PUSH_MACROS # if _LIBCPP_HAS_THREADS _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template struct __unlock_guard { @@ -342,6 +343,7 @@ bool condition_variable_any::wait_for( _LIBCPP_EXPORTED_FROM_ABI void notify_all_at_thread_exit(condition_variable&, unique_lock); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD # endif // _LIBCPP_HAS_THREADS diff --git a/libcxx/include/fstream b/libcxx/include/fstream index 4000be8153731..7edecbb935a0b 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -220,6 +220,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_STD_VER >= 23 && 
defined(_LIBCPP_WIN32API) _LIBCPP_EXPORTED_FROM_ABI void* __filebuf_windows_native_handle(FILE* __file) noexcept; @@ -1616,6 +1617,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ofstream; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_filebuf; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/future b/libcxx/include/future index 6bd836afa04e8..9c71dfb89087d 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -420,6 +420,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // enum class future_errc _LIBCPP_DECLARE_STRONG_ENUM(future_errc){ @@ -2057,6 +2058,7 @@ inline shared_future<_Rp&> future<_Rp&>::share() _NOEXCEPT { inline shared_future future::share() _NOEXCEPT { return shared_future(std::move(*this)); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/ios b/libcxx/include/ios index 9c03f56a230b5..d1ec14cba37d1 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -247,6 +247,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS typedef ptrdiff_t streamsize; @@ -878,6 +879,7 @@ _LIBCPP_HIDE_FROM_ABI inline ios_base& defaultfloat(ios_base& __str) { return __str; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/istream b/libcxx/include/istream index dfa22e9f3bfb7..c4b2de9caf011 100644 --- a/libcxx/include/istream +++ b/libcxx/include/istream @@ -190,6 +190,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template class basic_istream : virtual public basic_ios<_CharT, _Traits> { @@ -1405,6 +1406,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_istream; # endif extern template class 
_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_iostream; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 44d8e21b2fba3..b5aa319d0aca2 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -13,6 +13,7 @@ module std_config [system] { textual header "__configuration/language.h" textual header "__configuration/namespace.h" textual header "__configuration/platform.h" + textual header "__configuration/utility.h" textual header "version" } diff --git a/libcxx/include/mutex b/libcxx/include/mutex index bec0185ede21a..a3f358afe2acd 100644 --- a/libcxx/include/mutex +++ b/libcxx/include/mutex @@ -215,6 +215,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS # if _LIBCPP_HAS_THREADS @@ -497,6 +498,7 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(scoped_lock); # endif // _LIBCPP_STD_VER >= 17 # endif // _LIBCPP_HAS_THREADS +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/print b/libcxx/include/print index 19a0117a90410..cfa844e6ab607 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -55,6 +55,7 @@ namespace std { _LIBCPP_BEGIN_NAMESPACE_STD # ifdef _LIBCPP_WIN32API +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream); # if _LIBCPP_HAS_WIDE_CHARACTERS @@ -69,6 +70,7 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream); // Note the function is only implemented on the Windows platform. 
_LIBCPP_EXPORTED_FROM_ABI void __write_to_windows_console(FILE* __stream, wstring_view __view); # endif // _LIBCPP_HAS_WIDE_CHARACTERS +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS # endif // _LIBCPP_WIN32API # if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/regex b/libcxx/include/regex index ae6ffcbb55481..695a0f21754c4 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -841,6 +841,7 @@ _LIBCPP_PUSH_MACROS # define _LIBCPP_REGEX_COMPLEXITY_FACTOR 4096 _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace regex_constants { @@ -5808,6 +5809,7 @@ regex_replace(const _CharT* __s, return __r; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD # if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex index 028bbf5650254..c1ba1c3b7a77b 100644 --- a/libcxx/include/shared_mutex +++ b/libcxx/include/shared_mutex @@ -153,6 +153,7 @@ _LIBCPP_PUSH_MACROS # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS struct _LIBCPP_EXPORTED_FROM_ABI __shared_mutex_base { mutex __mut_; @@ -431,6 +432,7 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(shared_lock<_Mutex>& __x, shared_lock<_Mu __x.swap(__y); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD # endif // _LIBCPP_STD_VER >= 14 diff --git a/libcxx/include/sstream b/libcxx/include/sstream index a42e8fbc9b72e..e40e65dbde2b9 100644 --- a/libcxx/include/sstream +++ b/libcxx/include/sstream @@ -338,6 +338,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // Class template basic_stringbuf [stringbuf] @@ -1288,6 +1289,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostringstream extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_istringstream; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept index 
d01de5c46211c..3c9c03c0ca27c 100644 --- a/libcxx/include/stdexcept +++ b/libcxx/include/stdexcept @@ -212,7 +212,9 @@ public: _LIBCPP_BEGIN_NAMESPACE_STD // in the dylib +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_runtime_error(const char*); +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS [[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) { # if _LIBCPP_HAS_EXCEPTIONS diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf index 978ecb3538532..bffdeacfa0459 100644 --- a/libcxx/include/streambuf +++ b/libcxx/include/streambuf @@ -133,6 +133,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template class basic_streambuf { @@ -430,6 +431,7 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_streambuf; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_streambuf; # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/string b/libcxx/include/string index 5bd168f16eb47..2455938a92d9c 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -694,6 +694,7 @@ _LIBCPP_PUSH_MACROS # endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // basic_string @@ -3966,6 +3967,7 @@ inline constexpr bool __format::__enable_insertable> # endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/strstream b/libcxx/include/strstream index b33977ff66e21..cd1eb286b138c 100644 --- a/libcxx/include/strstream +++ b/libcxx/include/strstream @@ -151,6 +151,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_DEPRECATED _LIBCPP_EXPORTED_FROM_ABI strstreambuf : public streambuf { public: @@ -349,6 +350,7 @@ private: strstreambuf __sb_; // exposition only }; 
+_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/valarray b/libcxx/include/valarray index 58287b60dd898..c77656f2d1000 100644 --- a/libcxx/include/valarray +++ b/libcxx/include/valarray @@ -381,6 +381,7 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template class valarray; @@ -3300,6 +3301,7 @@ template return __v.__end_; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/variant b/libcxx/include/variant index 9b2c4ee23ddcf..45462ee8a7713 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -285,12 +285,14 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI bad_variant_access : public exception { public: [[__nodiscard__]] const char* what() const _NOEXCEPT override; }; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_UNVERSIONED_NAMESPACE_STD _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/src/algorithm.cpp b/libcxx/src/algorithm.cpp index 8157be6f7406e..bbfbb3885f1c3 100644 --- a/libcxx/src/algorithm.cpp +++ b/libcxx/src/algorithm.cpp @@ -10,6 +10,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS template void __sort(RandomAccessIterator first, RandomAccessIterator last, Comp comp) { @@ -47,4 +48,5 @@ template void __sort<__less&, double*>(double*, double*, __less& template void __sort<__less&, long double*>(long double*, long double*, __less&); // clang-format on +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/any.cpp b/libcxx/src/any.cpp index fb9dd33b3ad7e..1482dcc1dac01 100644 --- a/libcxx/src/any.cpp +++ b/libcxx/src/any.cpp @@ -19,6 +19,7 @@ const char* bad_any_cast::what() const noexcept { return "bad any cast"; } // Preserve std::experimental::any_bad_cast for ABI 
compatibility // Even though it no longer exists in a header file _LIBCPP_BEGIN_NAMESPACE_LFTS +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI bad_any_cast : public bad_cast { public: @@ -27,6 +28,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_any_cast : public bad_cast { const char* bad_any_cast::what() const noexcept { return "bad any cast"; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_LFTS #endif diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp index 4d3064a2c9b79..b661714cff6ec 100644 --- a/libcxx/src/atomic.cpp +++ b/libcxx/src/atomic.cpp @@ -65,6 +65,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS struct NoTimeout {}; @@ -523,6 +524,7 @@ __libcpp_atomic_monitor(__cxx_atomic_contention_t const volatile* __location) no _LIBCPP_DIAGNOSTIC_POP +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/barrier.cpp b/libcxx/src/barrier.cpp index 72c29b49ada39..b213ea26dc09d 100644 --- a/libcxx/src/barrier.cpp +++ b/libcxx/src/barrier.cpp @@ -10,6 +10,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class __barrier_algorithm_base { public: @@ -69,4 +70,5 @@ __destroy_barrier_algorithm_base(_LIBCPP_NOESCAPE __barrier_algorithm_base* __ba delete __barrier; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp index 237969aacbab9..015b874cffa44 100644 --- a/libcxx/src/call_once.cpp +++ b/libcxx/src/call_once.cpp @@ -17,6 +17,7 @@ #include "include/atomic_support.h" _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // If dispatch_once_f ever handles C++ exceptions, and if one can get to it // without illegal macros (unexpected macros not beginning with _UpperCase or @@ -68,4 +69,5 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)( #endif // 
!_LIBCPP_HAS_THREADS } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/charconv.cpp b/libcxx/src/charconv.cpp index 148068b07e8e4..026258bdddc9c 100644 --- a/libcxx/src/charconv.cpp +++ b/libcxx/src/charconv.cpp @@ -13,6 +13,7 @@ #include "include/to_chars_floating_point.h" _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_AVAILABILITY_MINIMUM_HEADER_VERSION < 15 @@ -89,4 +90,6 @@ template __from_chars_result __from_chars_floating_point( template __from_chars_result __from_chars_floating_point( _LIBCPP_NOESCAPE const char* __first, _LIBCPP_NOESCAPE const char* __last, chars_format __fmt); + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp index 77010a2bead6d..af5fe7c01fb8b 100644 --- a/libcxx/src/chrono.cpp +++ b/libcxx/src/chrono.cpp @@ -63,6 +63,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -261,4 +262,5 @@ steady_clock::time_point steady_clock::now() noexcept { return __libcpp_steady_c } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/condition_variable.cpp b/libcxx/src/condition_variable.cpp index a87399d1b71b1..8248719ee859d 100644 --- a/libcxx/src/condition_variable.cpp +++ b/libcxx/src/condition_variable.cpp @@ -23,6 +23,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // ~condition_variable is defined elsewhere. 
@@ -72,6 +73,7 @@ void notify_all_at_thread_exit(condition_variable& cond, unique_lock lk) __thread_local_data()->notify_all_at_thread_exit(&cond, lk.release()); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/condition_variable_destructor.cpp b/libcxx/src/condition_variable_destructor.cpp index fc4b4a601d964..29724e65a4f56 100644 --- a/libcxx/src/condition_variable_destructor.cpp +++ b/libcxx/src/condition_variable_destructor.cpp @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #ifdef NEEDS_CONDVAR_DESTRUCTOR @@ -37,4 +38,5 @@ class _LIBCPP_EXPORTED_FROM_ABI condition_variable { condition_variable::~condition_variable() { __libcpp_condvar_destroy(&__cv_); } #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/error_category.cpp b/libcxx/src/error_category.cpp index 9c0ca6a04a523..941ca659b9d09 100644 --- a/libcxx/src/error_category.cpp +++ b/libcxx/src/error_category.cpp @@ -17,6 +17,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // class error_category @@ -36,4 +37,5 @@ bool error_category::equivalent(const error_code& code, int condition) const noe return *this == code.category() && code.value() == condition; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/expected.cpp b/libcxx/src/expected.cpp index f30efb5164796..19542b199e85b 100644 --- a/libcxx/src/expected.cpp +++ b/libcxx/src/expected.cpp @@ -9,5 +9,9 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS + const char* bad_expected_access::what() const noexcept { return "bad access to std::expected"; } + +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/experimental/chrono_exception.cpp b/libcxx/src/experimental/chrono_exception.cpp index bea2ad110310a..0d77f2a7b667d 100644 --- a/libcxx/src/experimental/chrono_exception.cpp 
+++ b/libcxx/src/experimental/chrono_exception.cpp @@ -9,6 +9,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -19,4 +20,5 @@ _LIBCPP_EXPORTED_FROM_ABI ambiguous_local_time::~ambiguous_local_time() = defaul } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/experimental/include/tzdb/tzdb_private.h b/libcxx/src/experimental/include/tzdb/tzdb_private.h index 8ec3f890ef65c..c8d39acf3155b 100644 --- a/libcxx/src/experimental/include/tzdb/tzdb_private.h +++ b/libcxx/src/experimental/include/tzdb/tzdb_private.h @@ -16,6 +16,7 @@ #include "types_private.h" _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -23,6 +24,7 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules); } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_SRC_INCLUDE_TZDB_TZ_PRIVATE_H diff --git a/libcxx/src/experimental/log_hardening_failure.cpp b/libcxx/src/experimental/log_hardening_failure.cpp index f836c15452249..10ae969c39a7d 100644 --- a/libcxx/src/experimental/log_hardening_failure.cpp +++ b/libcxx/src/experimental/log_hardening_failure.cpp @@ -15,6 +15,7 @@ #endif // __BIONIC__ _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS void __log_hardening_failure(const char* message) noexcept { // Always log the message to `stderr` in case the platform-specific system calls fail. 
@@ -28,4 +29,5 @@ void __log_hardening_failure(const char* message) noexcept { #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/experimental/time_zone.cpp b/libcxx/src/experimental/time_zone.cpp index d954932ffa37f..fc7a9c26c91b2 100644 --- a/libcxx/src/experimental/time_zone.cpp +++ b/libcxx/src/experimental/time_zone.cpp @@ -55,6 +55,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #ifdef PRINT template <> @@ -1055,4 +1056,5 @@ time_zone::__get_info(local_seconds __local_time) const { } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/experimental/tzdb.cpp b/libcxx/src/experimental/tzdb.cpp index fd976ba4bd799..ca4aa1cabd769 100644 --- a/libcxx/src/experimental/tzdb.cpp +++ b/libcxx/src/experimental/tzdb.cpp @@ -46,6 +46,7 @@ // TODO TZDB Implement the Windows mapping in tzdb::current_zone _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -824,4 +825,5 @@ _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI string remote_version() { } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/experimental/tzdb_list.cpp b/libcxx/src/experimental/tzdb_list.cpp index b99c30a9b9e6e..2026808e65388 100644 --- a/libcxx/src/experimental/tzdb_list.cpp +++ b/libcxx/src/experimental/tzdb_list.cpp @@ -13,6 +13,7 @@ #include "include/tzdb/tzdb_list_private.h" _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -40,4 +41,5 @@ _LIBCPP_EXPORTED_FROM_ABI tzdb_list::const_iterator tzdb_list::__erase_after(con } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/filesystem/directory_entry.cpp b/libcxx/src/filesystem/directory_entry.cpp index 152715243906c..0c7f595569a07 100644 --- a/libcxx/src/filesystem/directory_entry.cpp +++ 
b/libcxx/src/filesystem/directory_entry.cpp @@ -16,6 +16,7 @@ #include "time_utils.h" _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS error_code directory_entry::__do_refresh() noexcept { __data_.__reset(); @@ -70,4 +71,5 @@ error_code directory_entry::__do_refresh() noexcept { return failure_ec; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/directory_iterator.cpp b/libcxx/src/filesystem/directory_iterator.cpp index 7d00c4933fc3e..5d682aee2b839 100644 --- a/libcxx/src/filesystem/directory_iterator.cpp +++ b/libcxx/src/filesystem/directory_iterator.cpp @@ -26,6 +26,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS using detail::ErrorHandler; @@ -320,4 +321,5 @@ bool recursive_directory_iterator::__try_recursion(error_code* ec) { return false; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/filesystem_clock.cpp b/libcxx/src/filesystem/filesystem_clock.cpp index 865a1018871f2..96a1336d4cbca 100644 --- a/libcxx/src/filesystem/filesystem_clock.cpp +++ b/libcxx/src/filesystem/filesystem_clock.cpp @@ -42,6 +42,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated") @@ -76,4 +77,5 @@ _FilesystemClock::time_point _FilesystemClock::now() noexcept { #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/filesystem_error.cpp b/libcxx/src/filesystem/filesystem_error.cpp index 0d8185fb5c3aa..43752aa7ac3bd 100644 --- a/libcxx/src/filesystem/filesystem_error.cpp +++ b/libcxx/src/filesystem/filesystem_error.cpp @@ -15,6 +15,7 @@ #include "format_string.h" _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS filesystem_error::~filesystem_error() {} @@ -37,4 +38,5 @@ void filesystem_error::__create_what(int 
__num_paths) { }(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index 745db87ce3736..3d358dd55e8b8 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -69,6 +69,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS using detail::capture_errno; using detail::ErrorHandler; @@ -1077,4 +1078,5 @@ path __weakly_canonical(const path& p, error_code* ec) { return result.lexically_normal(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/filesystem/path.cpp b/libcxx/src/filesystem/path.cpp index 400b6e8988569..12a698da901a4 100644 --- a/libcxx/src/filesystem/path.cpp +++ b/libcxx/src/filesystem/path.cpp @@ -14,6 +14,7 @@ #include "path_parser.h" _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS using detail::ErrorHandler; using parser::createView; @@ -444,4 +445,5 @@ size_t __char_to_wide(const string& str, wchar_t* out, size_t outlen) { } #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_FILESYSTEM diff --git a/libcxx/src/fstream.cpp b/libcxx/src/fstream.cpp index 55a4442b9c782..2975088eb6255 100644 --- a/libcxx/src/fstream.cpp +++ b/libcxx/src/fstream.cpp @@ -18,6 +18,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if defined(_LIBCPP_WIN32API) @@ -34,4 +35,5 @@ _LIBCPP_EXPORTED_FROM_ABI void* __filebuf_windows_native_handle(FILE* __file) no #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/functional.cpp b/libcxx/src/functional.cpp index 59dab16bb03f9..737b6e0931ad6 100644 --- a/libcxx/src/functional.cpp +++ b/libcxx/src/functional.cpp @@ -9,6 +9,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS bad_function_call::~bad_function_call() noexcept {} @@ -18,4 +19,5 @@ size_t 
__hash_memory(_LIBCPP_NOESCAPE const void* ptr, size_t size) noexcept { return __murmur2_or_cityhash()(ptr, size); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/future.cpp b/libcxx/src/future.cpp index 7bba635e9006f..284b68828c7fd 100644 --- a/libcxx/src/future.cpp +++ b/libcxx/src/future.cpp @@ -10,6 +10,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_HIDDEN __future_error_category : public __do_message { public: @@ -194,4 +195,5 @@ shared_future& shared_future::operator=(const shared_future& __rhs) return *this; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp index b04cf0a8948c3..dd6b603d2d6d4 100644 --- a/libcxx/src/hash.cpp +++ b/libcxx/src/hash.cpp @@ -13,6 +13,7 @@ _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wtautological-constant-out-of-range-compare") _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace { @@ -127,4 +128,5 @@ size_t __next_prime(size_t n) { } } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/ios.cpp b/libcxx/src/ios.cpp index 5baff89a86410..3a8147a0f9d13 100644 --- a/libcxx/src/ios.cpp +++ b/libcxx/src/ios.cpp @@ -22,6 +22,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_HIDDEN __iostream_category : public __do_message { public: @@ -375,6 +376,7 @@ bool ios_base::sync_with_stdio(bool sync) { return r; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/iostream.cpp b/libcxx/src/iostream.cpp index b216c6ad35daf..0393982361456 100644 --- a/libcxx/src/iostream.cpp +++ b/libcxx/src/iostream.cpp @@ -15,6 +15,7 @@ #define ABI_NAMESPACE_STR _LIBCPP_TOSTRING(_LIBCPP_ABI_NAMESPACE) _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // This file implements the various stream 
objects provided inside . We're doing some ODR violations in here, // so this quite fragile. Specifically, the size of the stream objects (i.e. cout, cin etc.) needs to stay the same. @@ -154,4 +155,5 @@ ios_base::Init::Init() { ios_base::Init::~Init() {} +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp index 4d5e681e2556a..b12ca290e7090 100644 --- a/libcxx/src/locale.cpp +++ b/libcxx/src/locale.cpp @@ -45,6 +45,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS struct __libcpp_unique_locale { __libcpp_unique_locale(const char* nm) : __loc_(__locale::__newlocale(_LIBCPP_ALL_MASK, nm, 0)) {} @@ -5655,6 +5656,7 @@ template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS codecvt_byname; #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/memory.cpp b/libcxx/src/memory.cpp index 764dfc64bf0b5..94a90cd3b9f0f 100644 --- a/libcxx/src/memory.cpp +++ b/libcxx/src/memory.cpp @@ -26,6 +26,7 @@ #include "include/atomic_support.h" _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS bad_weak_ptr::~bad_weak_ptr() noexcept {} @@ -144,4 +145,5 @@ _LIBCPP_DIAGNOSTIC_POP #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index 22b5493427f0f..00307e107faa6 100644 --- a/libcxx/src/memory_resource.cpp +++ b/libcxx/src/memory_resource.cpp @@ -20,6 +20,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace pmr { @@ -497,4 +498,5 @@ void* monotonic_buffer_resource::do_allocate(size_t bytes, size_t align) { } // namespace pmr +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/mutex.cpp b/libcxx/src/mutex.cpp index 5b1e7da121fc4..303001a3b9623 100644 --- a/libcxx/src/mutex.cpp +++ b/libcxx/src/mutex.cpp @@ 
-23,6 +23,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // ~mutex is defined elsewhere @@ -141,6 +142,7 @@ void recursive_timed_mutex::unlock() noexcept { } } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/mutex_destructor.cpp b/libcxx/src/mutex_destructor.cpp index 4c63ea0da74da..308c6ea05c316 100644 --- a/libcxx/src/mutex_destructor.cpp +++ b/libcxx/src/mutex_destructor.cpp @@ -24,6 +24,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #ifdef NEEDS_MUTEX_DESTRUCTOR class _LIBCPP_EXPORTED_FROM_ABI mutex { @@ -39,4 +40,5 @@ class _LIBCPP_EXPORTED_FROM_ABI mutex { mutex::~mutex() noexcept { __libcpp_mutex_destroy(&__m_); } #endif // !NEEDS_MUTEX_DESTRUCTOR +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp index 3b92580565bfc..905091cb43aa3 100644 --- a/libcxx/src/optional.cpp +++ b/libcxx/src/optional.cpp @@ -24,6 +24,7 @@ const char* bad_optional_access::what() const noexcept { return "bad_optional_ac // Preserve std::experimental::bad_optional_access for ABI compatibility // Even though it no longer exists in a header file _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS class _LIBCPP_EXPORTED_FROM_ABI bad_optional_access : public std::logic_error { public: @@ -35,6 +36,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_optional_access : public std::logic_error { bad_optional_access::~bad_optional_access() noexcept = default; +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_EXPERIMENTAL #endif // _LIBCPP_AVAILABILITY_MINIMUM_HEADER_VERSION < 7 diff --git a/libcxx/src/ostream.cpp b/libcxx/src/ostream.cpp index e0f14a8917927..ae3afff7a9f5a 100644 --- a/libcxx/src/ostream.cpp +++ b/libcxx/src/ostream.cpp @@ -15,6 +15,7 @@ #include "std_stream.h" _LIBCPP_BEGIN_NAMESPACE_STD 
+_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os) { // dynamic_cast requires RTTI, this only affects users whose vendor builds @@ -38,4 +39,5 @@ _LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os) { return nullptr; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp index a5edcc4632195..21d868e8c6a7a 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -33,6 +33,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if defined(_LIBCPP_WIN32API) @@ -72,4 +73,5 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isat _LIBCPP_DIAGNOSTIC_POP #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp index 3dca702341c85..528018358eeb0 100644 --- a/libcxx/src/pstl/libdispatch.cpp +++ b/libcxx/src/pstl/libdispatch.cpp @@ -12,6 +12,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace __pstl::__libdispatch { void __dispatch_apply(size_t chunk_count, void* context, void (*func)(void* context, size_t chunk)) noexcept { @@ -29,4 +30,5 @@ __chunk_partitions __partition_chunks(ptrdiff_t element_count) noexcept { } } // namespace __pstl::__libdispatch +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/random.cpp b/libcxx/src/random.cpp index 79815aadc7323..1e6308cc8c49e 100644 --- a/libcxx/src/random.cpp +++ b/libcxx/src/random.cpp @@ -36,6 +36,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if defined(_LIBCPP_USING_GETENTROPY) @@ -153,4 +154,5 @@ double random_device::entropy() const noexcept { #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/random_shuffle.cpp b/libcxx/src/random_shuffle.cpp index 4f2669a6c7fa5..aa0f78fafe06e 100644 
--- a/libcxx/src/random_shuffle.cpp +++ b/libcxx/src/random_shuffle.cpp @@ -17,6 +17,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_HAS_THREADS static constinit __libcpp_mutex_t __rs_mut = _LIBCPP_MUTEX_INITIALIZER; @@ -48,4 +49,5 @@ __rs_default::result_type __rs_default::operator()() { __rs_default __rs_get() { return __rs_default(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/regex.cpp b/libcxx/src/regex.cpp index 6d9f06e213466..9d7d1699ae266 100644 --- a/libcxx/src/regex.cpp +++ b/libcxx/src/regex.cpp @@ -11,6 +11,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS static const char* make_error_type_string(regex_constants::error_type ecode) { switch (ecode) { @@ -396,4 +397,5 @@ void __match_any_but_newline::__exec(__state& __s) const { } } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/shared_mutex.cpp b/libcxx/src/shared_mutex.cpp index 6180833736956..4c5e45f668d66 100644 --- a/libcxx/src/shared_mutex.cpp +++ b/libcxx/src/shared_mutex.cpp @@ -13,6 +13,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS // Shared Mutex Base __shared_mutex_base::__shared_mutex_base() : __state_(0) {} @@ -96,4 +97,5 @@ void shared_timed_mutex::lock_shared() { return __base_.lock_shared(); } bool shared_timed_mutex::try_lock_shared() { return __base_.try_lock_shared(); } void shared_timed_mutex::unlock_shared() { return __base_.unlock_shared(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/std_stream.h b/libcxx/src/std_stream.h index 772e8b91ae349..4b9d3a34b2441 100644 --- a/libcxx/src/std_stream.h +++ b/libcxx/src/std_stream.h @@ -24,6 +24,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS static const int __limit = 8; @@ -380,6 +381,7 @@ void __stdoutbuf<_CharT>::imbue(const 
locale& __loc) { __always_noconv_ = __cv_->always_noconv(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/stdexcept.cpp b/libcxx/src/stdexcept.cpp index 0ee438bef02e1..31f60d5a5ac5b 100644 --- a/libcxx/src/stdexcept.cpp +++ b/libcxx/src/stdexcept.cpp @@ -18,6 +18,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS void __throw_runtime_error(const char* msg) { #if _LIBCPP_HAS_EXCEPTIONS @@ -27,4 +28,5 @@ void __throw_runtime_error(const char* msg) { #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp index 178ef710f0bcf..b55f9ff5c9a43 100644 --- a/libcxx/src/string.cpp +++ b/libcxx/src/string.cpp @@ -19,6 +19,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_AVAILABILITY_MINIMUM_HEADER_VERSION < 14 @@ -367,4 +368,5 @@ wstring to_wstring(double val) { return as_string(get_swprintf(), initial_string wstring to_wstring(long double val) { return as_string(get_swprintf(), initial_string()(), L"%Lf", val); } #endif +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/strstream.cpp b/libcxx/src/strstream.cpp index 70374191c6aba..667740f5c5dd4 100644 --- a/libcxx/src/strstream.cpp +++ b/libcxx/src/strstream.cpp @@ -18,6 +18,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS strstreambuf::strstreambuf(streamsize __alsize) : __strmode_(__dynamic), __alsize_(__alsize), __palloc_(nullptr), __pfree_(nullptr) {} @@ -253,6 +254,7 @@ ostrstream::~ostrstream() {} strstream::~strstream() {} +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/src/support/win32/locale_win32.cpp b/libcxx/src/support/win32/locale_win32.cpp index 26722e6e47a73..e7c7a114e9ce6 100644 --- a/libcxx/src/support/win32/locale_win32.cpp +++ 
b/libcxx/src/support/win32/locale_win32.cpp @@ -16,6 +16,7 @@ #include // wide char manipulation _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace __locale { // @@ -182,4 +183,5 @@ int __asprintf(char** ret, __locale_t loc, const char* format, ...) { } } // namespace __locale +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/support/win32/thread_win32.cpp b/libcxx/src/support/win32/thread_win32.cpp index 3a67d759f0f5e..606104e32b453 100644 --- a/libcxx/src/support/win32/thread_win32.cpp +++ b/libcxx/src/support/win32/thread_win32.cpp @@ -16,6 +16,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS static_assert(sizeof(__libcpp_mutex_t) == sizeof(SRWLOCK), ""); static_assert(alignof(__libcpp_mutex_t) == alignof(SRWLOCK), ""); @@ -211,4 +212,5 @@ int __libcpp_tls_set(__libcpp_tls_key __key, void* __p) { return 0; } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/system_error.cpp b/libcxx/src/system_error.cpp index 6397a94932b63..cf3d2c2926861 100644 --- a/libcxx/src/system_error.cpp +++ b/libcxx/src/system_error.cpp @@ -27,6 +27,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if defined(_LIBCPP_WIN32API) @@ -368,4 +369,5 @@ void __throw_system_error(int ev, const char* what_arg) { #endif } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/thread.cpp b/libcxx/src/thread.cpp index e494574ec21dd..d2aff1ee0c3fe 100644 --- a/libcxx/src/thread.cpp +++ b/libcxx/src/thread.cpp @@ -33,6 +33,7 @@ #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS thread::~thread() { if (!__libcpp_thread_isnull(&__t_)) @@ -170,4 +171,5 @@ void __thread_struct::notify_all_at_thread_exit(condition_variable* cv, mutex* m void __thread_struct::__make_ready_at_thread_exit(__assoc_sub_state* __s) { __p_->__make_ready_at_thread_exit(__s); } 
+_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/valarray.cpp b/libcxx/src/valarray.cpp index 3d3a9ac30ebd0..431598eec5ca2 100644 --- a/libcxx/src/valarray.cpp +++ b/libcxx/src/valarray.cpp @@ -9,6 +9,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_AVAILABILITY_MINIMUM_HEADER_VERSION < 9 template _LIBCPP_EXPORTED_FROM_ABI valarray::valarray(size_t); @@ -45,4 +46,5 @@ void gslice::__init(size_t __start) { } } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/vector.cpp b/libcxx/src/vector.cpp index 77a028a48077d..ec1da7961f54a 100644 --- a/libcxx/src/vector.cpp +++ b/libcxx/src/vector.cpp @@ -9,6 +9,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS #if _LIBCPP_AVAILABILITY_MINIMUM_HEADER_VERSION < 15 @@ -27,4 +28,5 @@ void __vector_base_common::__throw_out_of_range() const { std::__throw_out #endif // _LIBCPP_AVAILABILITY_MINIMUM_HEADER_VERSION < 15 +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp index afe20c3cd0b16..306ebbe10c601 100644 --- a/libcxx/src/verbose_abort.cpp +++ b/libcxx/src/verbose_abort.cpp @@ -22,6 +22,7 @@ extern "C" void android_set_abort_message(const char* msg); #endif _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS [[gnu::weak]] void __libcpp_verbose_abort(char const* format, ...) noexcept { // Write message to stderr. 
We do this before formatting into a @@ -62,4 +63,5 @@ _LIBCPP_BEGIN_NAMESPACE_STD std::abort(); } +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/support/test_tzdb.h b/libcxx/test/support/test_tzdb.h index 4a05993dbce55..bb81a46ae420b 100644 --- a/libcxx/test/support/test_tzdb.h +++ b/libcxx/test/support/test_tzdb.h @@ -15,6 +15,7 @@ #if defined(_LIBCPP_VERSION) _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_EXPLICIT_ABI_ANNOTATIONS namespace chrono { @@ -24,6 +25,7 @@ _LIBCPP_AVAILABILITY_TZDB _LIBCPP_OVERRIDABLE_FUNC_VIS string_view __libcpp_tzdb } // namespace chrono +_LIBCPP_END_EXPLICIT_ABI_ANNOTATIONS _LIBCPP_END_NAMESPACE_STD #endif From 4899e71b0431a2c0ff5ca8745ed60b71f26da903 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 8 May 2026 11:38:03 +0100 Subject: [PATCH 023/538] [AMDGPU] Increment VA_VDST twice for each VOP3PX2 instruction (#196353) In expert scheduling mode, change the VA_VDST counts to match the hardware implementation. The inserted waits were conservatively correct before. This just makes them more precise in some cases. 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 ++++- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 5 ++++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 +++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 11 +++---- .../AMDGPU/expert_scheduling_gfx1250.mir | 29 +++++++++++++++++++ 5 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx1250.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 12ba73cae0ce2..aaa01ee2e549a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1113,7 +1113,13 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { assert(T < Context->MaxCounter); unsigned UB = getScoreUB(T); - unsigned CurrScore = UB + 1; + unsigned Increment = 1; + if (T == AMDGPU::VA_VDST && AMDGPU::getHasMatrixScale(Inst.getOpcode())) { + // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as + // two VOP3P instructions and increments VA_VDST twice. + Increment = 2; + } + unsigned CurrScore = UB + Increment; if (CurrScore == 0) report_fatal_error("InsertWaitcnt score wraparound"); // PendingEvents and ScoreUB need to be update regardless if this event diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1c145359ccc61..b13aed2432602 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -588,6 +588,11 @@ bool getWMMAIsXDL(unsigned Opc) { return Info ? 
Info->is_wmma_xdl : false; } +bool getHasMatrixScale(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info && Info->HasMatrixScale; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f6b86a59b7b1d..49373f09ee460 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -126,6 +126,7 @@ struct True16D16Info { struct WMMAInstInfo { uint32_t Opcode; bool is_wmma_xdl; + bool HasMatrixScale; }; #define GET_MIMGBaseOpcode_DECL @@ -618,6 +619,9 @@ bool getMAIIsGFX940XDL(unsigned Opc); LLVM_READONLY bool getWMMAIsXDL(unsigned Opc); +LLVM_READONLY +bool getHasMatrixScale(unsigned Opc); + // Get an equivalent BitOp3 for a binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7ae93b30e7f03..d95c9eb788700 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1752,15 +1752,16 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, def WMMAInstInfoTable : GenericTable { let FilterClass = "WMMAInstInfo"; let CppTypeName = "WMMAInstInfo"; - let Fields = ["Opcode", "is_wmma_xdl"]; + let Fields = ["Opcode", "is_wmma_xdl", "HasMatrixScale"]; let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getWMMAInstInfoHelper"; } -class WMMAInstInfo { +class WMMAInstInfo { Instruction Opcode = !cast(NAME); bit is_wmma_xdl = 0; + bit HasMatrixScale = WMMAProfile.HasMatrixScale; } multiclass WMMAInstGFX12 { @@ -1770,14 +1771,14 @@ multiclass WMMAInstGFX12, WMMAInstInfo { + def _twoaddr : VOP3P_Pseudo, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; let FixedSize = WMMAProfile.HasMatrixScale; let Size = !if(WMMAProfile.HasMatrixScale, 16, 8); } let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in - def _threeaddr : VOP3P_Pseudo, WMMAInstInfo { + def _threeaddr : VOP3P_Pseudo, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; let FixedSize = WMMAProfile.HasMatrixScale; let Size = !if(WMMAProfile.HasMatrixScale, 16, 8); @@ -1789,7 +1790,7 @@ multiclass WMMAInstGFX12 { - def _twoaddr : VOP3P_Pseudo, WMMAInstInfo { + def _twoaddr : VOP3P_Pseudo, WMMAInstInfo { let Mnemonic = Instr; let PseudoInstr = Instr#PseudoInstrSuffix; let mayRaiseFPException = 0; diff --git a/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx1250.mir b/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx1250.mir new file mode 100644 index 0000000000000..b149660c814a7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/expert_scheduling_gfx1250.mir @@ -0,0 +1,29 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn 
-mcpu=gfx1250 -amdgpu-expert-scheduling-mode -run-pass=si-insert-waitcnts %s -o - | FileCheck %s + +--- +name: wmma_scale +body: | + bb.0: + ; CHECK-LABEL: name: wmma_scale + ; CHECK: S_SETREG_IMM32_B32 2, 2074, implicit-def $mode, implicit $mode + ; CHECK-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; CHECK-NEXT: S_WAIT_KMCNT 0 + ; CHECK-NEXT: early-clobber $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr16, $vgpr16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_WAITCNT_DEPCTR .VaVdst_3 + ; CHECK-NEXT: $vgpr64 = GLOBAL_LOAD_DWORD $vgpr8_vgpr9, 0, 0, implicit $exec + ; CHECK-NEXT: S_WAIT_LOADCNT 0 + ; CHECK-NEXT: S_WAITCNT_DEPCTR .VaVdst_1 + ; CHECK-NEXT: $vgpr64 = GLOBAL_LOAD_DWORD $vgpr16_vgpr17, 0, 0, implicit $exec + ; CHECK-NEXT: S_WAIT_LOADCNT 0 + ; CHECK-NEXT: S_WAITCNT_DEPCTR .VaVdst_0 + ; CHECK-NEXT: $vgpr64 = GLOBAL_LOAD_DWORD $vgpr40_vgpr41, 0, 0, implicit $exec + $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr16, $vgpr16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, 0, implicit $exec + $vgpr64 = GLOBAL_LOAD_DWORD $vgpr8_vgpr9, 0, 0, implicit $exec + $vgpr64 = GLOBAL_LOAD_DWORD $vgpr16_vgpr17, 0, 0, implicit $exec + $vgpr64 = GLOBAL_LOAD_DWORD $vgpr40_vgpr41, 0, 0, implicit $exec +... From d5f0b9eb6766a31d17e1c93e024b1c246e90c8db Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 8 May 2026 11:38:22 +0100 Subject: [PATCH 024/538] [LLVM][LICM] Skip unrelated accesses when looking for hoist/sink conflicting instructions. (#195132) Essentially uses ModRef analysis in place of getClobberingMemoryAccess() because the former has more accurate information as to how in loop accesses and the hoist/sink target relate. 
--- llvm/include/llvm/Analysis/AliasAnalysis.h | 3 ++ llvm/lib/Transforms/Scalar/LICM.cpp | 51 ++++--------------- llvm/test/Transforms/LICM/call-hoisting.ll | 3 +- .../LICM/hoist-inaccesiblemem-call.ll | 41 +++++++++++++-- llvm/test/Transforms/LICM/pr54495.ll | 2 +- 5 files changed, 53 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index 23fa20f17d92f..4997c41f37273 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -691,6 +691,9 @@ class BatchAAResults { ModRefInfo getModRefInfo(const Instruction *I, const CallBase *Call2) { return AA.getModRefInfo(I, Call2, AAQI); } + ModRefInfo getModRefInfo(const Instruction *I, const Instruction *I2) { + return AA.getModRefInfo(I, I2, AAQI); + } ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) { return AA.getArgModRefInfo(Call, ArgIdx); } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 9f3535a506b7e..2ceadb1ac30e2 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2319,9 +2319,7 @@ static bool noConflictingReadWrites(Instruction *I, MemorySSA *MSSA, if (!MSSA->isLiveOnEntryDef(Source) && CurLoop->contains(Source->getBlock())) return false; - // If there are interfering Uses (i.e. their defining access is in the - // loop), or ordered loads (stored as Defs!), don't move this store. - // Could do better here, but this is conservatively correct. + // If there are interfering Uses don't move this store. // TODO: Cache set of Uses on the first walk in runOnLoop, update when // moving accesses. Can also extend to dominating uses. for (auto *BB : CurLoop->getBlocks()) { @@ -2329,45 +2327,16 @@ static bool noConflictingReadWrites(Instruction *I, MemorySSA *MSSA, if (!Accesses) continue; for (const auto &MA : *Accesses) { + // Accesses are ordered. 
If we find one that I dominates we can stop. if (!Flags.getIsSink() && MSSA->dominates(IMD, &MA)) - continue; - if (const auto *MU = dyn_cast(&MA)) { - auto *MD = getClobberingMemoryAccess(*MSSA, BAA, Flags, - const_cast(MU)); - if (!MSSA->isLiveOnEntryDef(MD) && CurLoop->contains(MD->getBlock())) - return false; - // Disable hoisting past potentially interfering loads. Optimized - // Uses may point to an access outside the loop, as getClobbering - // checks the previous iteration when walking the backedge. - // FIXME: More precise: no Uses that alias I. - if (!Flags.getIsSink() && !MSSA->dominates(IMD, MU)) - return false; - } else if (const auto *MD = dyn_cast(&MA)) { - if (auto *LI = dyn_cast(MD->getMemoryInst())) { - (void)LI; // Silence warning. - assert(!LI->isUnordered() && "Expected unordered load"); - return false; - } - // Any call, while it may not be clobbering I, it may be a use. - if (auto *CI = dyn_cast(MD->getMemoryInst())) { - // Check if the call may read from the memory location written - // to by I. Check CI's attributes and arguments; the number of - // such checks performed is limited above by NoOfMemAccTooLarge. - if (auto *SI = dyn_cast(I)) { - ModRefInfo MRI = BAA.getModRefInfo(CI, MemoryLocation::get(SI)); - if (isModOrRefSet(MRI)) - return false; - } else { - auto *SCI = cast(I); - // If the instruction we are wanting to hoist is also a call - // instruction then we need not check mod/ref info with itself - if (SCI == CI) - continue; - ModRefInfo MRI = BAA.getModRefInfo(CI, SCI); - if (isModOrRefSet(MRI)) - return false; - } - } + break; + + if (const auto *MemUseOrDef = dyn_cast(&MA)) { + // Skip unrelated accesses. 
+ if (isNoModRef(BAA.getModRefInfo(MemUseOrDef->getMemoryInst(), I))) + continue; + + return false; } } } diff --git a/llvm/test/Transforms/LICM/call-hoisting.ll b/llvm/test/Transforms/LICM/call-hoisting.ll index 748810be3a709..2520e2277cf56 100644 --- a/llvm/test/Transforms/LICM/call-hoisting.ll +++ b/llvm/test/Transforms/LICM/call-hoisting.ll @@ -277,17 +277,16 @@ exit: ret i32 %val } -; FIXME: It's safe to hoist @store(), because @load() does not alias. define i32 @unrelated_read(ptr noalias %loc, ptr noalias %otherloc) { ; CHECK-LABEL: define i32 @unrelated_read( ; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[OTHERLOC:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @store(i32 0, ptr [[LOC]]) ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[OTHERLOC_GEP:%.*]] = getelementptr i32, ptr [[OTHERLOC]], i32 [[IV]] ; CHECK-NEXT: [[VAL:%.*]] = call i32 @load(ptr [[OTHERLOC_GEP]]) -; CHECK-NEXT: call void @store(i32 0, ptr [[LOC]]) ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 200 ; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll b/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll index fd73080365993..89d3cc9d37033 100644 --- a/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll +++ b/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll @@ -4,18 +4,16 @@ ;; It should hoist fn_write_inaccessible_mem ;; because there is no conflict between inaccessible memory ;; fn_read_inaccessible_mem is a nice side effect -; FIXME: fn_write_inaccessible_mem is currently not hoisted due to the preceding -; load, even though it does not alias. 
define i32 @loop_alias(i64 %x, ptr %start) { ; CHECK-LABEL: define i32 @loop_alias( ; CHECK-SAME: i64 [[X:%.*]], ptr [[START:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @fn_write_inaccessible_mem() ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[GEP:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[PHI]], align 4 ; CHECK-NEXT: [[VAL:%.*]] = call i32 @fn_args(i32 [[LOAD]]) -; CHECK-NEXT: call void @fn_write_inaccessible_mem() ; CHECK-NEXT: call void @fn_read_inaccessible_mem(i32 [[LOAD]]) ; CHECK-NEXT: [[GEP]] = getelementptr inbounds nuw i32, ptr [[PHI]], i64 [[X]] ; CHECK-NEXT: [[ACC:%.*]] = add nuw nsw i32 [[VAL]], 1 @@ -158,6 +156,43 @@ loop: br label %loop } +define i32 @non_dominated_load_that_does_not_alias_inaccessible_mem(ptr %dst, i64 %x, ptr %start) { +; CHECK-LABEL: define i32 @non_dominated_load_that_does_not_alias_inaccessible_mem( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]], ptr [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[GEP:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[PHI]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = call i32 @fn_args(i32 [[LOAD]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @fn_read_inaccessible_mem_2(i32 [[LOAD]]) +; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 +; CHECK-NEXT: [[GEP]] = getelementptr inbounds nuw i32, ptr [[PHI]], i64 [[X]] +; CHECK-NEXT: [[ACC:%.*]] = add nuw nsw i32 [[VAL]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ACC]], 10 +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[AFTER_LOOP:.*]] +; CHECK: [[AFTER_LOOP]]: +; CHECK-NEXT: [[ACC_LCSSA:%.*]] = phi i32 [ [[ACC]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[ACC_LCSSA]] +; +entry: + br label %loop +loop: + %phi = phi ptr [ %start, %entry ], [ %gep, %loop ] + 
%load = load i32, ptr %phi + %val = call i32 @fn_args(i32 %load) + call void @fn_write_inaccessible_mem() + %res = call i32 @fn_read_inaccessible_mem_2(i32 %load) + store i32 %res, ptr %dst + %gep = getelementptr inbounds nuw i32, ptr %phi, i64 %x + %acc = add nuw nsw i32 %val, 1 + %cmp = icmp ult i32 %acc, 10 + br i1 %cmp, label %loop, label %after_loop +after_loop: + ret i32 %acc +} + declare i32 @fn_args(i32) nounwind willreturn memory(argmem: read) declare i32 @fn_read_inaccessible_mem_2(i32) nounwind willreturn memory(inaccessiblemem: read) declare void @fn_write_inaccessible_mem() nounwind memory(inaccessiblemem: write) diff --git a/llvm/test/Transforms/LICM/pr54495.ll b/llvm/test/Transforms/LICM/pr54495.ll index d01ca69d55242..5e66758257ef0 100644 --- a/llvm/test/Transforms/LICM/pr54495.ll +++ b/llvm/test/Transforms/LICM/pr54495.ll @@ -6,6 +6,7 @@ define void @test(ptr %p1, ptr %p2, ptr noalias %p3) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: +; CHECK-NEXT: store ptr [[P3:%.*]], ptr [[P3]], align 8 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[P1:%.*]], [[ENTRY:%.*]] ], [ [[P2:%.*]], [[LOOP]] ] @@ -13,7 +14,6 @@ define void @test(ptr %p1, ptr %p2, ptr noalias %p3) { ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[V]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[LOOP_EXIT:%.*]] ; CHECK: loop.exit: -; CHECK-NEXT: store ptr [[P3:%.*]], ptr [[P3]], align 8 ; CHECK-NEXT: ret void ; entry: From 7f2a665d96d3e91f30b5cf1b138adee2b2938bee Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 07:32:19 -0400 Subject: [PATCH 025/538] [libc][math] Fix -Wshadow warnings in FMod.h (#196346) The using statement inside the lambda is redundant with the same using 4 lines up. No behavior change. 
--- libc/src/__support/FPUtil/generic/FMod.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h index b74b8b7d1b18d..1a321231a8eb9 100644 --- a/libc/src/__support/FPUtil/generic/FMod.h +++ b/libc/src/__support/FPUtil/generic/FMod.h @@ -171,7 +171,6 @@ class FMod { using StorageType = typename FPB::StorageType; LIBC_INLINE static constexpr bool pre_check(T x, T y, T &out) { - using FPB = fputil::FPBits; const T quiet_nan = FPB::quiet_nan().get_val(); FPB sx(x), sy(y); if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() && From 27c5421df21b05def2a297028bfccc9a6ba16396 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 8 May 2026 13:49:27 +0200 Subject: [PATCH 026/538] [OpenMP] Fix set-but-unused-var warning in omptest (#196069) This fixes a warning in omptest about a set but unused variable. The var was intended to control whether colored logging output is created. That logic has been moved into the `Logger` itself. --- openmp/tools/omptest/include/EnvHelper.h | 38 ++++++++++++++++++++++++ openmp/tools/omptest/src/Logging.cpp | 4 +++ openmp/tools/omptest/src/OmptTester.cpp | 31 ++++++++----------- 3 files changed, 54 insertions(+), 19 deletions(-) create mode 100644 openmp/tools/omptest/include/EnvHelper.h diff --git a/openmp/tools/omptest/include/EnvHelper.h b/openmp/tools/omptest/include/EnvHelper.h new file mode 100644 index 0000000000000..2f061c904ddd0 --- /dev/null +++ b/openmp/tools/omptest/include/EnvHelper.h @@ -0,0 +1,38 @@ +//===- EnvHelper.h - General logging class ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Provides environment helpers shared between a couple of places. +/// +//===----------------------------------------------------------------------===// + +#include + +#ifndef OPENMP_TOOLS_OMPTEST_INCLUDE_ENVHELPER_H +#define OPENMP_TOOLS_OMPTEST_INCLUDE_ENVHELPER_H + +namespace omptest { +/// Load the value of a given boolean environmental variable. Return +/// std::nullopt if not specified in the environment. +inline std::optional +getBoolEnvironmentVariable(const char *VariableName) { + if (VariableName == nullptr) + return std::nullopt; + if (const char *EnvValue = std::getenv(VariableName)) { + std::string S{EnvValue}; + for (auto &C : S) + C = (char)std::tolower(C); + if (S == "1" || S == "on" || S == "true" || S == "yes") + return true; + if (S == "0" || S == "off" || S == "false" || S == "no") + return false; + } + return std::nullopt; +} +} // namespace omptest +#endif // OPENMP_TOOLS_OMPTEST_INCLUDE_ENVHELPER_H diff --git a/openmp/tools/omptest/src/Logging.cpp b/openmp/tools/omptest/src/Logging.cpp index c609b6ee83aec..2132cf8c62efe 100644 --- a/openmp/tools/omptest/src/Logging.cpp +++ b/openmp/tools/omptest/src/Logging.cpp @@ -12,12 +12,16 @@ //===----------------------------------------------------------------------===// #include "Logging.h" +#include "EnvHelper.h" using namespace omptest; using namespace logging; Logger::Logger(Level LogLevel, std::ostream &OutStream, bool FormatOutput) : LoggingLevel(LogLevel), OutStream(OutStream), FormatOutput(FormatOutput) { + if (auto EnvVal = getBoolEnvironmentVariable("OMPTEST_LOG_COLORED"); + EnvVal.has_value()) + FormatOutput = EnvVal.value(); // Flush any buffered output OutStream << std::flush; } diff --git a/openmp/tools/omptest/src/OmptTester.cpp b/openmp/tools/omptest/src/OmptTester.cpp index 1a83f621173ad..601f7c6089e8d 100644 --- 
a/openmp/tools/omptest/src/OmptTester.cpp +++ b/openmp/tools/omptest/src/OmptTester.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "OmptTester.h" +#include "EnvHelper.h" #include #include @@ -49,7 +50,6 @@ static std::atomic NextOpId{0x8000000000000001}; static bool UseEMICallbacks = false; static bool UseTracing = false; static bool RunAsTestSuite = false; -static bool ColoredLog = false; // OMPT entry point handles static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0; @@ -358,20 +358,6 @@ static void on_ompt_callback_target_map_emi(ompt_data_t *target_data, assert(0 && "Target map emi callback is unimplemented"); } -/// Load the value of a given boolean environmental variable. -bool getBoolEnvironmentVariable(const char *VariableName) { - if (VariableName == nullptr) - return false; - if (const char *EnvValue = std::getenv(VariableName)) { - std::string S{EnvValue}; - for (auto &C : S) - C = (char)std::tolower(C); - if (S == "1" || S == "on" || S == "true" || S == "yes") - return true; - } - return false; -} - /// Called by the OMP runtime to initialize the OMPT int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data) { @@ -380,10 +366,17 @@ int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, if (!ompt_set_callback) return 0; // failure - UseEMICallbacks = getBoolEnvironmentVariable("OMPTEST_USE_OMPT_EMI"); - UseTracing = getBoolEnvironmentVariable("OMPTEST_USE_OMPT_TRACING"); - RunAsTestSuite = getBoolEnvironmentVariable("OMPTEST_RUN_AS_TESTSUITE"); - ColoredLog = getBoolEnvironmentVariable("OMPTEST_LOG_COLORED"); + if (auto EmiCallbacksEnvVal = + getBoolEnvironmentVariable("OMPTEST_USE_OMPT_EMI")) + UseEMICallbacks = EmiCallbacksEnvVal.value(); + + if (auto TracingEnvVal = + getBoolEnvironmentVariable("OMPTEST_USE_OMPT_TRACING")) + UseTracing = TracingEnvVal.value(); + + if (auto RunAsTestSuiteEnvVal = + 
getBoolEnvironmentVariable("OMPTEST_RUN_AS_TESTSUITE")) + RunAsTestSuite = RunAsTestSuiteEnvVal.value(); register_ompt_callback(ompt_callback_thread_begin); register_ompt_callback(ompt_callback_thread_end); From 51e82b849cde71a4f9008cd0d48f202bb3105bbf Mon Sep 17 00:00:00 2001 From: Shanzhi Chen Date: Fri, 8 May 2026 20:07:09 +0800 Subject: [PATCH 027/538] [BOLT][AArch64] Add support for LDR relaxation on LDRSWl (#196051) BOLT currently supports LDR relaxation for LDRXl and LDRWl. Add support for LDR relaxation on LDRSWl. --- bolt/include/bolt/Core/MCPlusBuilder.h | 7 +-- bolt/lib/Passes/AArch64RelaxationPass.cpp | 2 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 50 ++++++++++++------- bolt/test/AArch64/ldr-relaxation.s | 23 +++++++++ 4 files changed, 58 insertions(+), 24 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index f9acaa4ee55a3..3a840615bec8f 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -861,12 +861,7 @@ class MCPlusBuilder { return false; } - virtual bool isLDRWl(const MCInst &Inst) const { - llvm_unreachable("not implemented"); - return false; - } - - virtual bool isLDRXl(const MCInst &Inst) const { + virtual bool isLoadLiteralGPR(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; } diff --git a/bolt/lib/Passes/AArch64RelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp index 610adad58cfcb..2b7384dc848dd 100644 --- a/bolt/lib/Passes/AArch64RelaxationPass.cpp +++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp @@ -46,7 +46,7 @@ void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) { bool IsADR = BC.MIB->isADR(Inst); // TODO: Handle other types of LDR (literal, PC-relative) instructions. - if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst)) + if (!IsADR && !BC.MIB->isLoadLiteralGPR(Inst)) continue; const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 
0 : 1); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index b091378682731..6695198802006 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -715,12 +715,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } - bool isLDRWl(const MCInst &Inst) const override { - return Inst.getOpcode() == AArch64::LDRWl; - } - - bool isLDRXl(const MCInst &Inst) const override { - return Inst.getOpcode() == AArch64::LDRXl; + bool isLoadLiteralGPR(const MCInst &Inst) const override { + unsigned OpCode = Inst.getOpcode(); + return OpCode == AArch64::LDRWl || OpCode == AArch64::LDRXl || + OpCode == AArch64::LDRSWl; } MCPhysReg getADRReg(const MCInst &Inst) const { @@ -744,16 +742,36 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType createAdrpLdr(const MCInst &LDRInst, MCContext *Ctx) const override { - assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && - "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(isLoadLiteralGPR(LDRInst) && + "LDR (literal) or LDRSW (literal) expected"); assert(LDRInst.getOperand(0).isReg() && "unexpected operand in LDR instruction"); const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); - const MCPhysReg AddrReg = - isLDRXl(LDRInst) ? 
DataReg - : (MCPhysReg)RegInfo->getMatchingSuperReg( - DataReg, AArch64::sub_32, - &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + MCPhysReg AddrReg; + unsigned OpCode; + uint32_t RelType; + switch (LDRInst.getOpcode()) { + case AArch64::LDRWl: + AddrReg = (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + OpCode = AArch64::LDRWui; + RelType = ELF::R_AARCH64_LDST32_ABS_LO12_NC; + break; + case AArch64::LDRXl: + AddrReg = DataReg; + OpCode = AArch64::LDRXui; + RelType = ELF::R_AARCH64_LDST64_ABS_LO12_NC; + break; + case AArch64::LDRSWl: + AddrReg = DataReg; + OpCode = AArch64::LDRSWui; + RelType = ELF::R_AARCH64_LDST64_ABS_LO12_NC; + break; + default: + llvm_unreachable("LDR (literal) or LDRSW (literal) expected"); + } + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); assert(Target && "missing target symbol in LDR instruction"); @@ -764,15 +782,13 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Insts[0].addOperand(MCOperand::createImm(0)); setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, ELF::R_AARCH64_NONE); - Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].setOpcode(OpCode); Insts[1].clear(); Insts[1].addOperand(MCOperand::createReg(DataReg)); Insts[1].addOperand(MCOperand::createReg(AddrReg)); Insts[1].addOperand(MCOperand::createImm(0)); Insts[1].addOperand(MCOperand::createImm(0)); - setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, - isLDRXl(LDRInst) ? 
ELF::R_AARCH64_LDST64_ABS_LO12_NC - : ELF::R_AARCH64_LDST32_ABS_LO12_NC); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, RelType); return Insts; } diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s index 7632504a01635..58ffeb034c253 100644 --- a/bolt/test/AArch64/ldr-relaxation.s +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -112,6 +112,29 @@ _start: ret .cfi_endproc .size _start, .-_start +.endif + +## Check LDR relaxation works on LDRSW (literal) + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE_LDRSW=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX_LDRSW + +# RELAX_LDRSW: adrp +# RELAX_LDRSW-NEXT: ldrsw + +.ifdef RELAX_SIMPLE_LDRSW + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldrsw x0, _foo + ret + .cfi_endproc +.size _start, .-_start .endif .section .text_cold From 1bba0bfe1c2957ed21b712ffddeb9fd0348ff06a Mon Sep 17 00:00:00 2001 From: sujianIBM <98488060+sujianIBM@users.noreply.github.com> Date: Fri, 8 May 2026 08:20:04 -0400 Subject: [PATCH 028/538] Exclude unsupported compiler-rt tests on z/OS. (#194437) This PR excludes unsupported part (NAN, -NAN, INFINITY, -INFINITY) from the following 2 compiler-rt tests on z/OS. 
``` compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c ``` --- compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c | 9 +++++++-- .../test/builtins/Unit/compiler_rt_scalbnl_test.c | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c index f49ce710b0443..437600a14c3b6 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_logbl_test.c @@ -34,8 +34,13 @@ int test__compiler_rt_logbl(fp_t x) { } fp_t cases[] = { - 1.e-6, -1.e-6, NAN, -NAN, INFINITY, -INFINITY, -1, - -0.0, 0.0, 1, -2, 2, -0.5, 0.5, + 1.e-6, -1.e-6, +// The logbl() function's behavior on the following values on z/OS +// differs from the C++ standard specification. +# ifndef __MVS__ + NAN, -NAN, INFINITY, -INFINITY, +# endif + -1, -0.0, 0.0, 1, -2, 2, -0.5, 0.5, }; int main() { diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c index 0d9bbdfd68e4a..fd14e1ae7009d 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_scalbnl_test.c @@ -39,10 +39,14 @@ int test__compiler_rt_scalbnl(const char *mode, fp_t x, int y) { } fp_t cases[] = { +// The scalbnl() function's behavior on the following values on z/OS +// differs from the C++ standard specification. +# ifndef __MVS__ -NAN, NAN, -INFINITY, INFINITY, +# endif -0.0, 0.0, -1, From 1b38e21077dc469b0c67360440e4d19710ef053e Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Fri, 8 May 2026 12:23:28 +0000 Subject: [PATCH 029/538] Syscall migrations of stdio and unistd (#196403) Added ErrorOr-returning syscall wrappers for access, chdir, dup, dup2, dup3, faccessat, fchdir, fsync, lseek, readlink, readlinkat, rename, rmdir, and unlinkat. 
Migrated the Linux entrypoint implementations in src/unistd/linux/ and src/stdio/linux/rename.cpp to use them. Replaced internal::lseekimpl() with linux_syscalls::lseek() in the File infrastructure and deleted the now-unused lseekImpl.h. Assisted-by: Automated tooling, human reviewed. --- libc/src/__support/File/linux/CMakeLists.txt | 4 +- libc/src/__support/File/linux/file.cpp | 7 +- libc/src/__support/File/linux/lseekImpl.h | 52 ----- .../linux/syscall_wrappers/CMakeLists.txt | 178 ++++++++++++++++++ .../OSUtil/linux/syscall_wrappers/access.h | 43 +++++ .../OSUtil/linux/syscall_wrappers/chdir.h | 36 ++++ .../OSUtil/linux/syscall_wrappers/dup.h | 36 ++++ .../OSUtil/linux/syscall_wrappers/dup2.h | 55 ++++++ .../OSUtil/linux/syscall_wrappers/dup3.h | 36 ++++ .../OSUtil/linux/syscall_wrappers/faccessat.h | 43 +++++ .../OSUtil/linux/syscall_wrappers/fchdir.h | 36 ++++ .../OSUtil/linux/syscall_wrappers/fsync.h | 36 ++++ .../OSUtil/linux/syscall_wrappers/lseek.h | 55 ++++++ .../OSUtil/linux/syscall_wrappers/readlink.h | 46 +++++ .../linux/syscall_wrappers/readlinkat.h | 38 ++++ .../OSUtil/linux/syscall_wrappers/rename.h | 47 +++++ .../OSUtil/linux/syscall_wrappers/rmdir.h | 43 +++++ .../OSUtil/linux/syscall_wrappers/unlinkat.h | 36 ++++ libc/src/stdio/linux/CMakeLists.txt | 4 +- libc/src/stdio/linux/rename.cpp | 17 +- libc/src/unistd/linux/CMakeLists.txt | 65 ++----- libc/src/unistd/linux/access.cpp | 19 +- libc/src/unistd/linux/chdir.cpp | 10 +- libc/src/unistd/linux/dup.cpp | 12 +- libc/src/unistd/linux/dup2.cpp | 38 +--- libc/src/unistd/linux/dup3.cpp | 16 +- libc/src/unistd/linux/faccessat.cpp | 17 +- libc/src/unistd/linux/fchdir.cpp | 10 +- libc/src/unistd/linux/fsync.cpp | 12 +- libc/src/unistd/linux/lseek.cpp | 13 +- libc/src/unistd/linux/readlink.cpp | 21 +-- libc/src/unistd/linux/readlinkat.cpp | 14 +- libc/src/unistd/linux/rmdir.cpp | 19 +- libc/src/unistd/linux/unlink.cpp | 18 +- libc/src/unistd/linux/unlinkat.cpp | 16 +- 35 files changed, 853 insertions(+), 
295 deletions(-) delete mode 100644 libc/src/__support/File/linux/lseekImpl.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/access.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/chdir.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/dup.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/dup2.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/dup3.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/faccessat.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/fchdir.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/fsync.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/lseek.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/readlink.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/readlinkat.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/rename.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/rmdir.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/unlinkat.h diff --git a/libc/src/__support/File/linux/CMakeLists.txt b/libc/src/__support/File/linux/CMakeLists.txt index c046dd4066900..2becb385e44d9 100644 --- a/libc/src/__support/File/linux/CMakeLists.txt +++ b/libc/src/__support/File/linux/CMakeLists.txt @@ -5,17 +5,15 @@ add_object_library( file.cpp HDRS file.h - lseekImpl.h DEPENDS libc.hdr.fcntl_macros libc.hdr.stdio_macros - libc.hdr.stdint_proxy - libc.hdr.types.off_t libc.hdr.types.FILE libc.include.sys_syscall libc.include.sys_stat libc.src.__support.CPP.new libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.lseek libc.src.__support.error_or libc.src.__support.File.file libc.src.errno.errno diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp index 13065300e5a34..10a6f2a97dc41 100644 --- 
a/libc/src/__support/File/linux/file.cpp +++ b/libc/src/__support/File/linux/file.cpp @@ -12,8 +12,8 @@ #include "hdr/types/off_t.h" #include "src/__support/CPP/new.h" #include "src/__support/File/file.h" -#include "src/__support/File/linux/lseekImpl.h" #include "src/__support/OSUtil/fcntl.h" +#include "src/__support/OSUtil/linux/syscall_wrappers/lseek.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/alloc-checker.h" #include "src/__support/libc_errno.h" // For error macros @@ -47,10 +47,7 @@ FileIOResult linux_file_read(File *f, void *buf, size_t size) { ErrorOr linux_file_seek(File *f, off_t offset, int whence) { auto *lf = reinterpret_cast(f); - auto result = internal::lseekimpl(lf->get_fd(), offset, whence); - if (!result.has_value()) - return result.error(); - return result.value(); + return linux_syscalls::lseek(lf->get_fd(), offset, whence); } int linux_file_close(File *f) { diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h deleted file mode 100644 index 47df99ae84b90..0000000000000 --- a/libc/src/__support/File/linux/lseekImpl.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- Linux implementation of lseek -------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_FILE_LINUX_LSEEKIMPL_H -#define LLVM_LIBC_SRC___SUPPORT_FILE_LINUX_LSEEKIMPL_H - -#include "hdr/stdint_proxy.h" // For uint64_t. -#include "hdr/types/off_t.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
-#include "src/__support/common.h" -#include "src/__support/error_or.h" -#include "src/__support/libc_errno.h" -#include "src/__support/macros/config.h" - -#include // For syscall numbers. - -namespace LIBC_NAMESPACE_DECL { -namespace internal { - -LIBC_INLINE ErrorOr lseekimpl(int fd, off_t offset, int whence) { - off_t result; -#ifdef SYS_lseek - result = LIBC_NAMESPACE::syscall_impl(SYS_lseek, fd, offset, whence); - if (result < 0) - return Error(-static_cast(result)); -#elif defined(SYS_llseek) || defined(SYS__llseek) - static_assert(sizeof(size_t) == 4, "size_t must be 32 bits."); -#ifdef SYS_llseek - constexpr long LLSEEK_SYSCALL_NO = SYS_llseek; -#elif defined(SYS__llseek) - constexpr long LLSEEK_SYSCALL_NO = SYS__llseek; -#endif - off_t offset_64 = offset; - int ret = LIBC_NAMESPACE::syscall_impl( - LLSEEK_SYSCALL_NO, fd, offset_64 >> 32, offset_64, &result, whence); - if (ret < 0) - return Error(-ret); -#else -#error "lseek, llseek and _llseek syscalls not available." -#endif - return result; -} - -} // namespace internal -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_FILE_LINUX_LSEEKIMPL_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt b/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt index 6418d6f83dbfa..bbe76fece3bdd 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt @@ -365,3 +365,181 @@ add_header_library( libc.hdr.fcntl_macros libc.include.sys_syscall ) + +add_header_library( + unlinkat + HDRS + unlinkat.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + +add_header_library( + rmdir + HDRS + rmdir.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.hdr.fcntl_macros + 
libc.include.sys_syscall +) + +add_header_library( + chdir + HDRS + chdir.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + +add_header_library( + fchdir + HDRS + fchdir.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + +add_header_library( + fsync + HDRS + fsync.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + + +add_header_library( + dup + HDRS + dup.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + +add_header_library( + dup2 + HDRS + dup2.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.hdr.fcntl_macros + libc.include.sys_syscall +) + +add_header_library( + dup3 + HDRS + dup3.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + +add_header_library( + access + HDRS + access.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.hdr.fcntl_macros + libc.include.sys_syscall +) + +add_header_library( + faccessat + HDRS + faccessat.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + +add_header_library( + readlink + HDRS + readlink.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + 
libc.src.__support.macros.config + libc.hdr.fcntl_macros + libc.hdr.types.ssize_t + libc.include.sys_syscall +) + +add_header_library( + readlinkat + HDRS + readlinkat.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.hdr.types.ssize_t + libc.include.sys_syscall +) + +add_header_library( + rename + HDRS + rename.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.hdr.fcntl_macros + libc.include.sys_syscall +) + +add_header_library( + lseek + HDRS + lseek.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.hdr.stdint_proxy + libc.hdr.types.off_t + libc.include.sys_syscall +) diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/access.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/access.h new file mode 100644 index 0000000000000..3f3a3bd343901 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/access.h @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for access. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_ACCESS_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_ACCESS_H + +#include "hdr/fcntl_macros.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr access(const char *path, int mode) { +#ifdef SYS_access + int ret = syscall_impl(SYS_access, path, mode); +#elif defined(SYS_faccessat) + int ret = syscall_impl(SYS_faccessat, AT_FDCWD, path, mode, 0); +#else +#error "access and faccessat syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_ACCESS_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/chdir.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/chdir.h new file mode 100644 index 0000000000000..3bd87ca0f488f --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/chdir.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for chdir. 
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_CHDIR_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_CHDIR_H
+
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace linux_syscalls {
+
+LIBC_INLINE ErrorOr<int> chdir(const char *path) {
+  int ret = syscall_impl<int>(SYS_chdir, path);
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+} // namespace linux_syscalls
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_CHDIR_H
diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/dup.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/dup.h
new file mode 100644
index 0000000000000..3d54b684bd66e
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/dup.h
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for dup.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP_H
+
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace linux_syscalls {
+
+LIBC_INLINE ErrorOr<int> dup(int fd) {
+  int ret = syscall_impl<int>(SYS_dup, fd);
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+} // namespace linux_syscalls
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP_H
diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/dup2.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/dup2.h
new file mode 100644
index 0000000000000..49f37134bd5f5
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/dup2.h
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for dup2.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP2_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP2_H + +#include "hdr/fcntl_macros.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr dup2(int oldfd, int newfd) { +#ifdef SYS_dup2 + int ret = syscall_impl(SYS_dup2, oldfd, newfd); +#elif defined(SYS_dup3) + if (oldfd == newfd) { +#if defined(SYS_fcntl) + int ret = syscall_impl(SYS_fcntl, oldfd, F_GETFD); +#elif defined(SYS_fcntl64) + int ret = syscall_impl(SYS_fcntl64, oldfd, F_GETFD); +#else +#error "SYS_fcntl and SYS_fcntl64 syscalls not available." +#endif + if (ret >= 0) + return oldfd; + return Error(-ret); + } + int ret = syscall_impl(SYS_dup3, oldfd, newfd, 0); +#else +#error "dup2 and dup3 syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP2_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/dup3.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/dup3.h new file mode 100644 index 0000000000000..3be0ad4526e90 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/dup3.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for dup3. 
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP3_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP3_H
+
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace linux_syscalls {
+
+LIBC_INLINE ErrorOr<int> dup3(int oldfd, int newfd, int flags) {
+  int ret = syscall_impl<int>(SYS_dup3, oldfd, newfd, flags);
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+} // namespace linux_syscalls
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_DUP3_H
diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/faccessat.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/faccessat.h
new file mode 100644
index 0000000000000..83fd7e617fb8a
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/faccessat.h
@@ -0,0 +1,43 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for faccessat.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FACCESSAT_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FACCESSAT_H + +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr faccessat(int dfd, const char *path, int mode, + int flags) { +#ifdef SYS_faccessat2 + int ret = syscall_impl(SYS_faccessat2, dfd, path, mode, flags); +#elif defined(SYS_faccessat) + int ret = syscall_impl(SYS_faccessat, dfd, path, mode, flags); +#else +#error "faccessat2 and faccessat syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FACCESSAT_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/fchdir.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/fchdir.h new file mode 100644 index 0000000000000..2830bb763a86a --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/fchdir.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for fchdir. 
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FCHDIR_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FCHDIR_H
+
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace linux_syscalls {
+
+LIBC_INLINE ErrorOr<int> fchdir(int fd) {
+  int ret = syscall_impl<int>(SYS_fchdir, fd);
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+} // namespace linux_syscalls
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FCHDIR_H
diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/fsync.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/fsync.h
new file mode 100644
index 0000000000000..1ebbca23f3019
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/fsync.h
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for fsync.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FSYNC_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FSYNC_H
+
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace linux_syscalls {
+
+LIBC_INLINE ErrorOr<int> fsync(int fd) {
+  int ret = syscall_impl<int>(SYS_fsync, fd);
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+} // namespace linux_syscalls
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_FSYNC_H
diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/lseek.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/lseek.h
new file mode 100644
index 0000000000000..861a1174d367f
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/lseek.h
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for lseek.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_LSEEK_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_LSEEK_H + +#include "hdr/stdint_proxy.h" +#include "hdr/types/off_t.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr lseek(int fd, off_t offset, int whence) { + off_t result; +#ifdef SYS_lseek + result = syscall_impl(SYS_lseek, fd, offset, whence); + if (result < 0) + return Error(-static_cast(result)); +#elif defined(SYS_llseek) || defined(SYS__llseek) +#ifdef SYS_llseek + constexpr long LLSEEK_SYSCALL_NO = SYS_llseek; +#elif defined(SYS__llseek) + constexpr long LLSEEK_SYSCALL_NO = SYS__llseek; +#endif + uint64_t offset_64 = static_cast(offset); + int ret = syscall_impl(LLSEEK_SYSCALL_NO, fd, + static_cast(offset_64 >> 32), + static_cast(offset_64), &result, whence); + if (ret < 0) + return Error(-ret); +#else +#error "lseek, llseek and _llseek syscalls not available." +#endif + return result; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_LSEEK_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/readlink.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/readlink.h new file mode 100644 index 0000000000000..7c6a0c8db0fee --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/readlink.h @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for readlink. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_READLINK_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_READLINK_H + +#include "hdr/fcntl_macros.h" +#include "hdr/types/ssize_t.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr readlink(const char *path, char *buf, + size_t bufsiz) { +#ifdef SYS_readlink + ssize_t ret = syscall_impl(SYS_readlink, path, buf, bufsiz); +#elif defined(SYS_readlinkat) + ssize_t ret = + syscall_impl(SYS_readlinkat, AT_FDCWD, path, buf, bufsiz); +#else +#error "readlink and readlinkat syscalls not available." +#endif + if (ret < 0) + return Error(-static_cast(ret)); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_READLINK_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/readlinkat.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/readlinkat.h new file mode 100644 index 0000000000000..d65573c9d8aee --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/readlinkat.h @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for readlinkat.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_READLINKAT_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_READLINKAT_H
+
+#include "hdr/types/ssize_t.h"
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace linux_syscalls {
+
+LIBC_INLINE ErrorOr<ssize_t> readlinkat(int dfd, const char *path, char *buf,
+                                        size_t bufsiz) {
+  ssize_t ret = syscall_impl<ssize_t>(SYS_readlinkat, dfd, path, buf, bufsiz);
+  if (ret < 0)
+    return Error(-static_cast<int>(ret));
+  return ret;
+}
+
+} // namespace linux_syscalls
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_READLINKAT_H
diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/rename.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/rename.h
new file mode 100644
index 0000000000000..ec0be35b27ab8
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/rename.h
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Syscall wrapper for rename.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_RENAME_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_RENAME_H + +#include "hdr/fcntl_macros.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr rename(const char *oldpath, const char *newpath) { +#ifdef SYS_renameat2 + int ret = + syscall_impl(SYS_renameat2, AT_FDCWD, oldpath, AT_FDCWD, newpath, 0); +#elif defined(SYS_renameat) + int ret = + syscall_impl(SYS_renameat, AT_FDCWD, oldpath, AT_FDCWD, newpath); +#elif defined(SYS_rename) + int ret = syscall_impl(SYS_rename, oldpath, newpath); +#else +#error "rename, renameat and renameat2 syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_RENAME_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/rmdir.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/rmdir.h new file mode 100644 index 0000000000000..cf8f03ce9fdfb --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/rmdir.h @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for rmdir. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_RMDIR_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_RMDIR_H + +#include "hdr/fcntl_macros.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr rmdir(const char *path) { +#ifdef SYS_rmdir + int ret = syscall_impl(SYS_rmdir, path); +#elif defined(SYS_unlinkat) + int ret = syscall_impl(SYS_unlinkat, AT_FDCWD, path, AT_REMOVEDIR); +#else +#error "rmdir and unlinkat syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_RMDIR_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/unlinkat.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/unlinkat.h new file mode 100644 index 0000000000000..ffba729333fbd --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/unlinkat.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for unlinkat. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_UNLINKAT_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_UNLINKAT_H + +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr unlinkat(int dfd, const char *path, int flags) { + int ret = syscall_impl(SYS_unlinkat, dfd, path, flags); + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_UNLINKAT_H diff --git a/libc/src/stdio/linux/CMakeLists.txt b/libc/src/stdio/linux/CMakeLists.txt index 5b7bd84f1ceea..1552060c52550 100644 --- a/libc/src/stdio/linux/CMakeLists.txt +++ b/libc/src/stdio/linux/CMakeLists.txt @@ -19,10 +19,8 @@ add_entrypoint_object( HDRS ../rename.h DEPENDS - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.rename libc.src.errno.errno - libc.hdr.fcntl_macros ) add_entrypoint_object( diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp index 426c8698e557d..7f8515b705386 100644 --- a/libc/src/stdio/linux/rename.cpp +++ b/libc/src/stdio/linux/rename.cpp @@ -7,23 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/stdio/rename.h" -#include "hdr/fcntl_macros.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/rename.h" #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, rename, (const char *oldpath, const char *newpath)) { - int ret = LIBC_NAMESPACE::syscall_impl(SYS_renameat2, AT_FDCWD, oldpath, - AT_FDCWD, newpath, 0); - - if (ret >= 0) - return 0; - libc_errno = -ret; - return -1; + auto result = linux_syscalls::rename(oldpath, newpath); + if (!result) { + libc_errno = result.error(); + return -1; + } + return 0; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index f979ba0669872..8153196b0a21c 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -5,10 +5,7 @@ add_entrypoint_object( HDRS ../access.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.access libc.src.errno.errno ) @@ -19,9 +16,7 @@ add_entrypoint_object( HDRS ../chdir.h DEPENDS - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.chdir libc.src.errno.errno ) @@ -59,10 +54,7 @@ add_entrypoint_object( HDRS ../dup.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.dup libc.src.errno.errno ) @@ -73,10 +65,7 @@ add_entrypoint_object( HDRS ../dup2.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.dup2 libc.src.errno.errno ) @@ -87,10 +76,7 @@ add_entrypoint_object( HDRS ../dup3.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.dup3 libc.src.errno.errno ) @@ -102,8 +88,7 @@ add_entrypoint_object( ../faccessat.h DEPENDS libc.hdr.fcntl_macros - 
libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.faccessat libc.src.errno.errno ) @@ -114,9 +99,7 @@ add_entrypoint_object( HDRS ../fchdir.h DEPENDS - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.fchdir libc.src.errno.errno ) @@ -198,9 +181,7 @@ add_entrypoint_object( HDRS ../fsync.h DEPENDS - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.fsync libc.src.errno.errno ) @@ -393,10 +374,7 @@ add_entrypoint_object( ../lseek.h DEPENDS libc.hdr.types.off_t - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.lseek libc.src.errno.errno ) @@ -514,10 +492,7 @@ add_entrypoint_object( HDRS ../rmdir.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.rmdir libc.src.errno.errno ) @@ -530,10 +505,7 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.types.ssize_t - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.readlink libc.src.errno.errno ) @@ -546,10 +518,7 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.types.ssize_t - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.readlinkat libc.src.errno.errno ) @@ -643,10 +612,7 @@ add_entrypoint_object( HDRS ../unlink.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.unlink libc.src.errno.errno ) @@ -657,10 +623,7 @@ 
add_entrypoint_object( HDRS ../unlinkat.h DEPENDS - libc.hdr.fcntl_macros - libc.include.unistd - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil + libc.src.__support.OSUtil.linux.syscall_wrappers.unlinkat libc.src.errno.errno ) diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp index f06eec5a8db6a..570ccf6a78eb2 100644 --- a/libc/src/unistd/linux/access.cpp +++ b/libc/src/unistd/linux/access.cpp @@ -8,28 +8,17 @@ #include "src/unistd/access.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/access.h" #include "src/__support/common.h" - -#include "hdr/fcntl_macros.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, access, (const char *path, int mode)) { -#ifdef SYS_access - int ret = LIBC_NAMESPACE::syscall_impl(SYS_access, path, mode); -#elif defined(SYS_faccessat) - int ret = - LIBC_NAMESPACE::syscall_impl(SYS_faccessat, AT_FDCWD, path, mode); -#else -#error "access and faccessat syscalls not available." -#endif - - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::access(path, mode); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; diff --git a/libc/src/unistd/linux/chdir.cpp b/libc/src/unistd/linux/chdir.cpp index 04ba509b49a56..e7a668b620f94 100644 --- a/libc/src/unistd/linux/chdir.cpp +++ b/libc/src/unistd/linux/chdir.cpp @@ -8,19 +8,17 @@ #include "src/unistd/chdir.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/chdir.h" #include "src/__support/common.h" - #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, chdir, (const char *path)) { - int ret = LIBC_NAMESPACE::syscall_impl(SYS_chdir, path); - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::chdir(path); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; diff --git a/libc/src/unistd/linux/dup.cpp b/libc/src/unistd/linux/dup.cpp index 81d30c6cdbc4c..b9a8f9d0498df 100644 --- a/libc/src/unistd/linux/dup.cpp +++ b/libc/src/unistd/linux/dup.cpp @@ -8,22 +8,20 @@ #include "src/unistd/dup.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/dup.h" #include "src/__support/common.h" - #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, dup, (int fd)) { - int ret = LIBC_NAMESPACE::syscall_impl(SYS_dup, fd); - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::dup(fd); + if (!ret) { + libc_errno = ret.error(); return -1; } - return ret; + return ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp index 0a0e86573b34e..f081e1e685878 100644 --- a/libc/src/unistd/linux/dup2.cpp +++ b/libc/src/unistd/linux/dup2.cpp @@ -8,48 +8,20 @@ #include "src/unistd/dup2.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/dup2.h" #include "src/__support/common.h" - -#include "hdr/fcntl_macros.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, dup2, (int oldfd, int newfd)) { -#ifdef SYS_dup2 - // If dup2 syscall is available, we make use of directly. 
- int ret = LIBC_NAMESPACE::syscall_impl(SYS_dup2, oldfd, newfd); -#elif defined(SYS_dup3) - // If dup2 syscall is not available, we try using the dup3 syscall. However, - // dup3 fails if oldfd is the same as newfd. So, we handle that case - // separately before making the dup3 syscall. - if (oldfd == newfd) { - // Check if oldfd is actually a valid file descriptor. -#if SYS_fcntl - int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, oldfd, F_GETFD); -#elif defined(SYS_fcntl64) - // Same as fcntl but can handle large offsets - int ret = LIBC_NAMESPACE::syscall_impl(SYS_fcntl64, oldfd, F_GETFD); -#else -#error "SYS_fcntl and SYS_fcntl64 syscalls not available." -#endif - if (ret >= 0) - return oldfd; - libc_errno = -ret; - return -1; - } - int ret = LIBC_NAMESPACE::syscall_impl(SYS_dup3, oldfd, newfd, 0); -#else -#error "dup2 and dup3 syscalls not available." -#endif - if (ret < 0) { - libc_errno = -ret; + auto result = linux_syscalls::dup2(oldfd, newfd); + if (!result) { + libc_errno = result.error(); return -1; } - return ret; + return result.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/dup3.cpp b/libc/src/unistd/linux/dup3.cpp index 770fb73515b21..ce705f63373d1 100644 --- a/libc/src/unistd/linux/dup3.cpp +++ b/libc/src/unistd/linux/dup3.cpp @@ -8,22 +8,20 @@ #include "src/unistd/dup3.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/dup3.h" #include "src/__support/common.h" - #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, dup3, (int oldfd, int newfd, int flags)) { - // If dup2 syscall is available, we make use of directly. 
- int ret = LIBC_NAMESPACE::syscall_impl(SYS_dup3, oldfd, newfd, flags); - if (ret >= 0) - return ret; - libc_errno = -ret; - return -1; + ErrorOr ret = linux_syscalls::dup3(oldfd, newfd, flags); + if (!ret) { + libc_errno = ret.error(); + return -1; + } + return ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/faccessat.cpp b/libc/src/unistd/linux/faccessat.cpp index 7a2a29cb0e901..41ad0f5a10f26 100644 --- a/libc/src/unistd/linux/faccessat.cpp +++ b/libc/src/unistd/linux/faccessat.cpp @@ -8,27 +8,18 @@ #include "src/unistd/faccessat.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/faccessat.h" #include "src/__support/common.h" - -#include "hdr/fcntl_macros.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, faccessat, (int fd, const char *path, int amode, int flag)) { -#ifdef SYS_faccessat2 - int ret = - LIBC_NAMESPACE::syscall_impl(SYS_faccessat2, fd, path, amode, flag); -#else -#error "faccessat2 syscall is not available." -#endif - - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::faccessat(fd, path, amode, flag); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; diff --git a/libc/src/unistd/linux/fchdir.cpp b/libc/src/unistd/linux/fchdir.cpp index f7a7422363e6e..f49545b4bd3e5 100644 --- a/libc/src/unistd/linux/fchdir.cpp +++ b/libc/src/unistd/linux/fchdir.cpp @@ -8,19 +8,17 @@ #include "src/unistd/fchdir.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/fchdir.h" #include "src/__support/common.h" - #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, fchdir, (int fd)) { - int ret = LIBC_NAMESPACE::syscall_impl(SYS_fchdir, fd); - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::fchdir(fd); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; diff --git a/libc/src/unistd/linux/fsync.cpp b/libc/src/unistd/linux/fsync.cpp index fe08aed61e250..bf5796d793d70 100644 --- a/libc/src/unistd/linux/fsync.cpp +++ b/libc/src/unistd/linux/fsync.cpp @@ -8,22 +8,20 @@ #include "src/unistd/fsync.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/fsync.h" #include "src/__support/common.h" - #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, fsync, (int fd)) { - int ret = LIBC_NAMESPACE::syscall_impl(SYS_fsync, fd); - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::fsync(fd); + if (!ret) { + libc_errno = ret.error(); return -1; } - return ret; + return 0; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/lseek.cpp b/libc/src/unistd/linux/lseek.cpp index 26a08269fd8de..02ef338ecb45e 100644 --- a/libc/src/unistd/linux/lseek.cpp +++ b/libc/src/unistd/linux/lseek.cpp @@ -7,21 +7,16 @@ //===----------------------------------------------------------------------===// #include "src/unistd/lseek.h" +#include "src/__support/OSUtil/linux/syscall_wrappers/lseek.h" +#include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/File/linux/lseekImpl.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. -#include "src/__support/common.h" - -#include "hdr/types/off_t.h" -#include // For syscall numbers. 
- namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(off_t, lseek, (int fd, off_t offset, int whence)) { - auto result = internal::lseekimpl(fd, offset, whence); - if (!result.has_value()) { + ErrorOr result = linux_syscalls::lseek(fd, offset, whence); + if (!result) { libc_errno = result.error(); return -1; } diff --git a/libc/src/unistd/linux/readlink.cpp b/libc/src/unistd/linux/readlink.cpp index b297a41ca37bd..c029805d90224 100644 --- a/libc/src/unistd/linux/readlink.cpp +++ b/libc/src/unistd/linux/readlink.cpp @@ -8,33 +8,22 @@ #include "src/unistd/readlink.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/readlink.h" #include "src/__support/common.h" - -#include "hdr/fcntl_macros.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, readlink, (const char *__restrict path, char *__restrict buf, size_t bufsize)) { -#ifdef SYS_readlink - ssize_t ret = - LIBC_NAMESPACE::syscall_impl(SYS_readlink, path, buf, bufsize); -#elif defined(SYS_readlinkat) - ssize_t ret = LIBC_NAMESPACE::syscall_impl(SYS_readlinkat, AT_FDCWD, - path, buf, bufsize); -#else -#error "readlink or readlinkat syscalls not available." -#endif - if (ret < 0) { - libc_errno = static_cast(-ret); + auto result = linux_syscalls::readlink(path, buf, bufsize); + if (!result) { + libc_errno = result.error(); return -1; } - return ret; + return result.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/readlinkat.cpp b/libc/src/unistd/linux/readlinkat.cpp index cd0dcb8e0ff02..2c97d2f7b109a 100644 --- a/libc/src/unistd/linux/readlinkat.cpp +++ b/libc/src/unistd/linux/readlinkat.cpp @@ -8,26 +8,22 @@ #include "src/unistd/readlinkat.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
+#include "src/__support/OSUtil/linux/syscall_wrappers/readlinkat.h" #include "src/__support/common.h" - -#include "hdr/fcntl_macros.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, readlinkat, (int fd, const char *__restrict path, char *__restrict buf, size_t bufsize)) { - ssize_t ret = LIBC_NAMESPACE::syscall_impl(SYS_readlinkat, fd, path, - buf, bufsize); - if (ret < 0) { - libc_errno = static_cast(-ret); + auto result = linux_syscalls::readlinkat(fd, path, buf, bufsize); + if (!result) { + libc_errno = result.error(); return -1; } - return ret; + return result.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/rmdir.cpp b/libc/src/unistd/linux/rmdir.cpp index eca6e954ef898..b7473a60c3fd2 100644 --- a/libc/src/unistd/linux/rmdir.cpp +++ b/libc/src/unistd/linux/rmdir.cpp @@ -8,28 +8,17 @@ #include "src/unistd/rmdir.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/rmdir.h" #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "hdr/fcntl_macros.h" -#include // For syscall numbers. - namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, rmdir, (const char *path)) { -#ifdef SYS_rmdir - int ret = LIBC_NAMESPACE::syscall_impl(SYS_rmdir, path); -#elif defined(SYS_unlinkat) - int ret = LIBC_NAMESPACE::syscall_impl(SYS_unlinkat, AT_FDCWD, path, - AT_REMOVEDIR); -#else -#error "rmdir and unlinkat syscalls not available." 
-#endif - - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::rmdir(path); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; diff --git a/libc/src/unistd/linux/unlink.cpp b/libc/src/unistd/linux/unlink.cpp index 5fde2600937b2..bb1b5400042e7 100644 --- a/libc/src/unistd/linux/unlink.cpp +++ b/libc/src/unistd/linux/unlink.cpp @@ -8,27 +8,17 @@ #include "src/unistd/unlink.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/unlink.h" #include "src/__support/common.h" - -#include "hdr/fcntl_macros.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, unlink, (const char *path)) { -#ifdef SYS_unlink - int ret = LIBC_NAMESPACE::syscall_impl(SYS_unlink, path); -#elif defined(SYS_unlinkat) - int ret = LIBC_NAMESPACE::syscall_impl(SYS_unlinkat, AT_FDCWD, path, 0); -#else -#error "unlink and unlinkat syscalls not available." -#endif - - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::unlink(path); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; diff --git a/libc/src/unistd/linux/unlinkat.cpp b/libc/src/unistd/linux/unlinkat.cpp index b2012c52b8854..acb2e6d1936bb 100644 --- a/libc/src/unistd/linux/unlinkat.cpp +++ b/libc/src/unistd/linux/unlinkat.cpp @@ -8,25 +8,17 @@ #include "src/unistd/unlinkat.h" -#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/OSUtil/linux/syscall_wrappers/unlinkat.h" #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "hdr/fcntl_macros.h" -#include // For syscall numbers. 
- namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, unlinkat, (int dfd, const char *path, int flags)) { -#ifdef SYS_unlinkat - int ret = LIBC_NAMESPACE::syscall_impl(SYS_unlinkat, dfd, path, flags); -#else -#error "unlinkat syscalls not available." -#endif - - if (ret < 0) { - libc_errno = -ret; + ErrorOr ret = linux_syscalls::unlinkat(dfd, path, flags); + if (!ret) { + libc_errno = ret.error(); return -1; } return 0; From 39dc4b0f32dd898a41f9e51fa182f4021b983699 Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Fri, 8 May 2026 14:26:39 +0200 Subject: [PATCH 030/538] [clang][OpenMP][SPIRV] Use the right calling convention for reduction helpers (#195911) This is a follow-up to #194879 to ensure that the helpers for reduction use the right calling convention (in particular that they are marked as spir_func for SPIRV). Assisted by Claude Sonnet 4.5. --- clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c | 6 ++++-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c b/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c index bddd5548b9b8b..7217ef9400a6b 100644 --- a/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c +++ b/clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c @@ -18,8 +18,10 @@ // Verify __kmpc_free_shared is called. // CHECK: call spir_func addrspace(9) void @__kmpc_free_shared(ptr addrspace(4) -// Verify the reduction function is generated. -// CHECK: define internal void @{{.*}}reduction{{.*}}func +// Verify the reduction helper functions are generated. 
+// CHECK: define internal spir_func void @{{.*}}reduction{{.*}}func +// CHECK: define internal spir_func void @{{.*}}shuffle_and_reduce_func +// CHECK: define internal spir_func void @{{.*}}inter_warp_copy_func int main() { int x = 0; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index e3d5bf0663490..19bfff7a7a4e0 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3430,6 +3430,7 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( Function *WcFunc = Function::Create(FuncTy, GlobalVariable::InternalLinkage, "_omp_reduction_inter_warp_copy_func", &M); + WcFunc->setCallingConv(Config.getRuntimeCC()); WcFunc->setAttributes(FuncAttrs); WcFunc->addParamAttr(0, Attribute::NoUndef); WcFunc->addParamAttr(1, Attribute::NoUndef); @@ -3690,6 +3691,7 @@ Expected OpenMPIRBuilder::emitShuffleAndReduceFunction( Function *SarFunc = Function::Create(FuncTy, GlobalVariable::InternalLinkage, "_omp_reduction_shuffle_and_reduce_func", &M); + SarFunc->setCallingConv(Config.getRuntimeCC()); SarFunc->setAttributes(FuncAttrs); SarFunc->addParamAttr(0, Attribute::NoUndef); SarFunc->addParamAttr(1, Attribute::NoUndef); @@ -4386,6 +4388,7 @@ Expected OpenMPIRBuilder::createReductionFunction( std::string Name = getReductionFuncName(ReducerName); Function *ReductionFunc = Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M); + ReductionFunc->setCallingConv(Config.getRuntimeCC()); ReductionFunc->setAttributes(FuncAttrs); ReductionFunc->addParamAttr(0, Attribute::NoUndef); ReductionFunc->addParamAttr(1, Attribute::NoUndef); From 03bd38f411de317422ced4a9527af11294196023 Mon Sep 17 00:00:00 2001 From: Ryan Mansfield Date: Fri, 8 May 2026 08:36:59 -0400 Subject: [PATCH 031/538] [llvm-otool] Add -m flag and archive(member) input syntax (#194234) Support classic otool's archive(member) input syntax where a filename like 'foo.a(bar.o)' extracts and processes only the named 
member from the archive. The -m flag disables this syntax parsing, treating the entire string as a literal filename. Fixes #126272 --- llvm/docs/CommandGuide/llvm-otool.rst | 4 ++ .../MachO/otool-archive-member.test | 48 +++++++++++++++ llvm/tools/llvm-objdump/MachODump.cpp | 61 +++++++++++++++++++ llvm/tools/llvm-objdump/MachODump.h | 1 + llvm/tools/llvm-objdump/OtoolOpts.td | 4 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 2 + 6 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-objdump/MachO/otool-archive-member.test diff --git a/llvm/docs/CommandGuide/llvm-otool.rst b/llvm/docs/CommandGuide/llvm-otool.rst index ab92673092c9c..17aa9089b3875 100644 --- a/llvm/docs/CommandGuide/llvm-otool.rst +++ b/llvm/docs/CommandGuide/llvm-otool.rst @@ -83,6 +83,10 @@ OPTIONS Print load commands. +.. option:: -m + + Don't use archive(member) syntax. + .. option:: -mcpu= Select cpu for disassembly. diff --git a/llvm/test/tools/llvm-objdump/MachO/otool-archive-member.test b/llvm/test/tools/llvm-objdump/MachO/otool-archive-member.test new file mode 100644 index 0000000000000..3aaa2028b98ca --- /dev/null +++ b/llvm/test/tools/llvm-objdump/MachO/otool-archive-member.test @@ -0,0 +1,48 @@ +# Test llvm-otool archive(member) input syntax and -m flag. + +RUN: mkdir -p %t.dir +RUN: cp %p/Inputs/hello.obj.macho-x86_64 %t.dir/foo.o +RUN: cp %p/Inputs/ObjC.obj.macho-x86_64 %t.dir/bar.o +RUN: llvm-ar rcs %t.dir/lib.a %t.dir/foo.o %t.dir/bar.o + +# archive(member) syntax extracts only the named member. +RUN: llvm-otool -l "%t.dir/lib.a(bar.o)" \ +RUN: | FileCheck %s --check-prefix=MEMBER --implicit-check-not="foo.o" + +MEMBER: bar.o + +# Without archive(member), both members are shown. +RUN: llvm-otool -l %t.dir/lib.a \ +RUN: | FileCheck %s --check-prefix=BOTH + +BOTH: foo.o +BOTH: bar.o + +# -m disables archive(member) parsing, treating the filename literally. 
+RUN: not llvm-otool -mh "%t.dir/lib.a(bar.o)" 2>&1 \ +RUN: | FileCheck %s --check-prefix=NOPAREN -DMSG=%errc_ENOENT -DPATH=%t.dir + +NOPAREN: error: '[[PATH]]/lib.a(bar.o)': [[MSG]] + +# Non-existent member gives an error. +RUN: not llvm-otool -h "%t.dir/lib.a(nonexistent)" 2>&1 \ +RUN: | FileCheck %s --check-prefix=BADMEMBER -DPATH=%t.dir + +BADMEMBER: error: '[[PATH]]/lib.a': archive does not contain a member named: nonexistent + +# archive(member) on a non-archive file gives an error. +RUN: not llvm-otool -h "%p/Inputs/hello.obj.macho-x86_64(foo)" 2>&1 \ +RUN: | FileCheck %s --check-prefix=NOTARCHIVE + +NOTARCHIVE: error: '{{.*}}hello.obj.macho-x86_64': not an archive (cannot extract member: foo) + +# Handle multiple inputs with archive(member) syntax. +RUN: llvm-otool -h "%t.dir/lib.a(foo.o)" "%t.dir/lib.a(bar.o)" \ +RUN: | FileCheck %s --check-prefix=MULTI + +MULTI: Archive : +MULTI-NEXT: Mach header +MULTI: 496 +MULTI-NEXT: Archive : +MULTI-NEXT: Mach header +MULTI: 896 diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index 1676b17e14593..8b5a56b0db095 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -81,10 +81,12 @@ bool objdump::Verbose; bool objdump::ObjcMetaData; std::string objdump::DisSymName; bool objdump::IsOtool; +bool objdump::UseMemberSyntax; bool objdump::SymbolicOperands; std::vector objdump::ArchFlags; static bool ArchAll = false; +static std::string ArchiveMemberFilter; static std::string ThumbTripleName; static StringRef ordinalName(const object::MachOObjectFile *, int); @@ -2545,6 +2547,18 @@ static bool ValidateArchFlags() { return true; } +static bool skipArchiveMember(const object::Archive::Child &C, + StringRef Filename) { + if (ArchiveMemberFilter.empty()) + return false; + Expected NameOrErr = C.getName(); + if (!NameOrErr) { + reportError(NameOrErr.takeError(), Filename); + return true; + } + return *NameOrErr != ArchiveMemberFilter; +} + // 
ParseInputMachO() parses the named Mach-O file in Filename and handles the // -arch flags selecting just those slices as specified by them and also parses // archive files. Then for each individual Mach-O file ProcessMachO() is @@ -2553,6 +2567,19 @@ void objdump::parseInputMachO(StringRef Filename) { if (!ValidateArchFlags()) return; + // In otool mode, support archive(member) syntax: if the filename ends + // with ')' and contains '(', split it into the archive path and member + // name. The -m option disables this parsing. + ArchiveMemberFilter.clear(); + if (IsOtool && UseMemberSyntax && !Filename.empty() && + Filename.back() == ')') { + auto Pos = Filename.rfind('('); + if (Pos != StringRef::npos && Pos > 0) { + ArchiveMemberFilter = Filename.substr(Pos + 1).drop_back().str(); + Filename = Filename.substr(0, Pos); + } + } + // Attempt to open the binary. Expected> BinaryOrErr = createBinary(Filename); if (!BinaryOrErr) { @@ -2571,8 +2598,12 @@ void objdump::parseInputMachO(StringRef Filename) { Error Err = Error::success(); unsigned I = -1; + bool FoundMember = false; for (auto &C : A->children(Err)) { ++I; + if (skipArchiveMember(C, Filename)) + continue; + FoundMember = true; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) @@ -2587,12 +2618,20 @@ void objdump::parseInputMachO(StringRef Filename) { } if (Err) reportError(std::move(Err), Filename); + if (!FoundMember && !ArchiveMemberFilter.empty()) + reportError(Filename, "archive does not contain a member named: " + + ArchiveMemberFilter); return; } if (MachOUniversalBinary *UB = dyn_cast(&Bin)) { parseInputMachO(UB); return; } + if (!ArchiveMemberFilter.empty()) { + reportError(Filename, "not an archive (cannot extract member: " + + ArchiveMemberFilter + ")"); + return; + } if (ObjectFile *O = dyn_cast(&Bin)) { if (!checkMachOAndArchFlags(O, Filename)) return; @@ -2652,8 +2691,12 @@ void 
objdump::parseInputMachO(MachOUniversalBinary *UB) { ArchiveMemberOffsets, ArchitectureName); Error Err = Error::success(); unsigned I = -1; + bool FoundMember = false; for (auto &C : A->children(Err)) { ++I; + if (skipArchiveMember(C, Filename)) + continue; + FoundMember = true; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = @@ -2668,6 +2711,10 @@ void objdump::parseInputMachO(MachOUniversalBinary *UB) { } if (Err) reportError(std::move(Err), Filename); + if (!FoundMember && !ArchiveMemberFilter.empty()) + reportError(Filename, + "archive does not contain a member named: " + + ArchiveMemberFilter); } else { consumeError(AOrErr.takeError()); reportError(Filename, @@ -2714,8 +2761,12 @@ void objdump::parseInputMachO(MachOUniversalBinary *UB) { ArchiveMemberOffsets); Error Err = Error::success(); unsigned I = -1; + bool FoundMember = false; for (auto &C : A->children(Err)) { ++I; + if (skipArchiveMember(C, Filename)) + continue; + FoundMember = true; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = @@ -2729,6 +2780,9 @@ void objdump::parseInputMachO(MachOUniversalBinary *UB) { } if (Err) reportError(std::move(Err), Filename); + if (!FoundMember && !ArchiveMemberFilter.empty()) + reportError(Filename, "archive does not contain a member named: " + + ArchiveMemberFilter); } else { consumeError(AOrErr.takeError()); reportError(Filename, "Mach-O universal file for architecture " + @@ -2767,8 +2821,12 @@ void objdump::parseInputMachO(MachOUniversalBinary *UB) { ArchitectureName); Error Err = Error::success(); unsigned I = -1; + bool FoundMember = false; for (auto &C : A->children(Err)) { ++I; + if (skipArchiveMember(C, Filename)) + continue; + FoundMember = true; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) @@ -2783,6 +2841,9 @@ void objdump::parseInputMachO(MachOUniversalBinary *UB) { } if (Err) reportError(std::move(Err), Filename); + if 
(!FoundMember && !ArchiveMemberFilter.empty()) + reportError(Filename, "archive does not contain a member named: " + + ArchiveMemberFilter); } else { consumeError(AOrErr.takeError()); reportError(Filename, "Mach-O universal file for architecture " + diff --git a/llvm/tools/llvm-objdump/MachODump.h b/llvm/tools/llvm-objdump/MachODump.h index 81fd8775e1d20..b9a316a4deb16 100644 --- a/llvm/tools/llvm-objdump/MachODump.h +++ b/llvm/tools/llvm-objdump/MachODump.h @@ -59,6 +59,7 @@ extern bool Rebase; extern bool Rpaths; extern bool SymbolicOperands; extern bool UniversalHeaders; +extern bool UseMemberSyntax; extern bool Verbose; extern bool WeakBind; extern std::vector ArchFlags; diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td index 8cc70d3207245..e14c09aee42f4 100644 --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -43,9 +43,11 @@ def chained_fixups : Flag<["-"], "chained_fixups">, def dyld_info : Flag<["-"], "dyld_info">, HelpText<"print bind and rebase information">; +def m : Flag<["-"], "m">, + HelpText<"don't use archive(member) syntax">; + // Not (yet?) 
implemented: // -c print argument strings of a core file -// -m don't use archive(member) syntax // -dyld_opcodes // -addr_slide=arg // -function_offsets diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 9cc658cbb341c..8c090ba1e29e7 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -3692,6 +3692,8 @@ static void parseOtoolOptions(const llvm::opt::InputArgList &InputArgs) { ChainedFixups = InputArgs.hasArg(OTOOL_chained_fixups); DyldInfo = InputArgs.hasArg(OTOOL_dyld_info); + UseMemberSyntax = !InputArgs.hasArg(OTOOL_m); + InputFilenames = InputArgs.getAllArgValues(OTOOL_INPUT); if (InputFilenames.empty()) reportCmdLineError("no input file"); From c48b74fbcf604c14f4e53d284c3f3da4076db427 Mon Sep 17 00:00:00 2001 From: Jan Schultke Date: Fri, 8 May 2026 14:37:21 +0200 Subject: [PATCH 032/538] [clang] Deduce _BitInt(N) template parameter as size_t (#195534) Update template argument deduction to deduce the `N` in `_BitInt(N)` as `size_t` rather than `int`. This increases consistency with deduction of array sizes, and matches the behavior proposed in P3666. Fixes #195033 --- clang/docs/ReleaseNotes.rst | 5 ++ clang/lib/Sema/SemaTemplateDeduction.cpp | 6 +- clang/test/SemaCXX/ext-int.cpp | 109 ++++++++++++++--------- 3 files changed, 75 insertions(+), 45 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ac462e3bf4732..aea89aebd0c31 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -49,6 +49,11 @@ C++ Specific Potentially Breaking Changes - Clang now correctly rejects ``export`` declarations in module implementation partitions. (#GH107602) +- Template argument deduction now treats the ``N`` in ``_BitInt(N)`` + as being of type ``std::size_t`` instead of ``int``, + matching the deduction of array sizes from ``int(&)[N]``. 
+ This is a breaking change for code that depended on the previously deduced type. (#GH195033) + ABI Changes in This Version --------------------------- diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index c71c40526ccdc..defdd9ca6968a 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2511,11 +2511,13 @@ static TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch( if (!NTTP) return TemplateDeductionResult::Success; - llvm::APSInt ArgSize(S.Context.getTypeSize(S.Context.IntTy), false); + // Deduce the size parameter of _BitInt as std::size_t + QualType T = S.Context.getSizeType(); + llvm::APSInt ArgSize(S.Context.getTypeSize(T), /*IsUnsigned=*/true); ArgSize = IA->getNumBits(); return DeduceNonTypeTemplateArgument( - S, TemplateParams, NTTP, ArgSize, S.Context.IntTy, true, Info, + S, TemplateParams, NTTP, ArgSize, T, true, Info, POK != PartialOrderingKind::None, Deduced, HasDeducedAnyParam); } diff --git a/clang/test/SemaCXX/ext-int.cpp b/clang/test/SemaCXX/ext-int.cpp index 5c566dafed931..281ae3d3c1779 100644 --- a/clang/test/SemaCXX/ext-int.cpp +++ b/clang/test/SemaCXX/ext-int.cpp @@ -61,6 +61,29 @@ template void deduced_whole_type(T){} template void deduced_bound(_BitInt(I)){} +template +void deduced_bound_unsigned(unsigned _BitInt(I)){} + +using size_t = decltype(sizeof(0)); + +#if __cplusplus >= 201703L +template void deduced_bound_auto(_BitInt(X)) { + static_assert(__is_same(decltype(X), size_t), ""); + static_assert(X == 9, ""); +} +template void deduced_bound_auto_unsigned(unsigned _BitInt(X)) { + static_assert(__is_same(decltype(X), size_t), ""); + static_assert(X == 11, ""); +} +template void deduced_bound_dependent(_BitInt(V)) { + static_assert(__is_same(T, size_t), ""); + static_assert(V == 9, ""); +} +template void deduced_bound_dependent_unsigned(unsigned _BitInt(V)) { + static_assert(__is_same(T, size_t), ""); + static_assert(V == 11, ""); +} 
+#endif // Ensure ext-int can be used in template places. void Templates() { @@ -69,20 +92,20 @@ void Templates() { ExtIntTemplParam c; constexpr _BitInt(9) d = 1; ExtIntTemplParam e; + constexpr unsigned _BitInt(11) f = 1; deduced_whole_type(b); + deduced_whole_type(f); deduced_bound(b); + deduced_bound_unsigned(f); +#if __cplusplus >= 201703L + deduced_bound_auto(d); + deduced_bound_auto_unsigned(f); + deduced_bound_dependent(d); + deduced_bound_dependent_unsigned(f); +#endif } -template -struct is_same { - static constexpr bool value = false; -}; -template -struct is_same { - static constexpr bool value = true; -}; - // Reject vector types: // expected-error@+1{{'_BitInt' vector element width must be a power of 2}} typedef _BitInt(5) __attribute__((vector_size(16))) VecTy3; @@ -143,16 +166,16 @@ void Ops() { x4_u - b; x43_s + b; x43_u - b; - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); + static_assert(__is_same(decltype(x43_s + x_int), _BitInt(43)), ""); + static_assert(__is_same(decltype(x43_u + x_int), unsigned _BitInt(43)), ""); + static_assert(__is_same(decltype(x32_s + x_int), int), ""); + static_assert(__is_same(decltype(x32_u + x_int), unsigned int), ""); + static_assert(__is_same(decltype(x32_s + x_uint), unsigned int), ""); + static_assert(__is_same(decltype(x32_u + x_uint), unsigned int), ""); + static_assert(__is_same(decltype(x4_s + x_int), int), ""); + static_assert(__is_same(decltype(x4_u + x_int), int), ""); + static_assert(__is_same(decltype(x4_s + x_uint), unsigned int), ""); + static_assert(__is_same(decltype(x4_u + x_uint), unsigned int), ""); // Bitwise checks. 
x43_s % y4_u; @@ -168,28 +191,28 @@ void Ops() { x4_s > 33; // expected-warning {{result of comparison of constant 33 with expression of type '_BitInt(4)' is always false}} // Same size/sign ops don't change type. - static_assert(is_same::value,""); - static_assert(is_same::value,""); - static_assert(is_same::value,""); - static_assert(is_same::value,""); + static_assert(__is_same(decltype(x43_s + y43_s), _BitInt(43)),""); + static_assert(__is_same(decltype(x4_s - y4_s), _BitInt(4)),""); + static_assert(__is_same(decltype(x43_u * y43_u), unsigned _BitInt(43)),""); + static_assert(__is_same(decltype(x4_u / y4_u), unsigned _BitInt(4)),""); // Unary ops shouldn't go through integer promotions. - static_assert(is_same::value,""); - static_assert(is_same::value,""); - static_assert(is_same::value,""); - static_assert(is_same::value,""); - static_assert(is_same::value,""); - static_assert(is_same::value,""); + static_assert(__is_same(decltype(~x43_s), _BitInt(43)),""); + static_assert(__is_same(decltype(~x4_s), _BitInt(4)),""); + static_assert(__is_same(decltype(+x43_s), _BitInt(43)),""); + static_assert(__is_same(decltype(+x4_s), _BitInt(4)),""); + static_assert(__is_same(decltype(-x43_u), unsigned _BitInt(43)),""); + static_assert(__is_same(decltype(-x4_u), unsigned _BitInt(4)),""); // expected-warning@+1{{expression with side effects has no effect in an unevaluated context}} - static_assert(is_same::value,""); + static_assert(__is_same(decltype(++x43_s), _BitInt(43)&),""); // expected-warning@+1{{expression with side effects has no effect in an unevaluated context}} - static_assert(is_same::value,""); + static_assert(__is_same(decltype(--x4_s), _BitInt(4)&),""); // expected-warning@+1{{expression with side effects has no effect in an unevaluated context}} - static_assert(is_same::value,""); + static_assert(__is_same(decltype(x43_s--), _BitInt(43)),""); // expected-warning@+1{{expression with side effects has no effect in an unevaluated context}} - 
static_assert(is_same::value,""); - static_assert(is_same> 1), _BitInt(4)>::value,""); - static_assert(is_same::value,""); + static_assert(__is_same(decltype(x4_s++), _BitInt(4)),""); + static_assert(__is_same(decltype(x4_s >> 1), _BitInt(4)),""); + static_assert(__is_same(decltype(x4_u << 1), unsigned _BitInt(4)),""); static_assert(sizeof(x43_s) == 8, ""); static_assert(sizeof(x4_s) == 1, ""); @@ -202,7 +225,7 @@ constexpr int func() { return 42;} void ConstexprBitsize() { _BitInt(func()) F; - static_assert(is_same::value, ""); + static_assert(__is_same(decltype(F), _BitInt(42)), ""); } // Not useable as an underlying type. @@ -273,9 +296,9 @@ void Ternary(_BitInt(30) s30, _BitInt(31) s31a, _BitInt(31) s31b, (void)(b ? s31a : s31b); (void)(s30 ? s31a : s31b); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); + static_assert(__is_same(decltype(b ? s30 : s31a), _BitInt(31)), ""); + static_assert(__is_same(decltype(b ? s32 : s30), _BitInt(32)), ""); + static_assert(__is_same(decltype(b ? 
s30 : 0), int), ""); } void FromPaper1() { @@ -285,11 +308,11 @@ void FromPaper1() { _BitInt(33) a33 = 1; char c = 3; - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); - static_assert(is_same::value, ""); + static_assert(__is_same(decltype(a2 * a3), _BitInt(3)), ""); + static_assert(__is_same(decltype(a2 * c), int), ""); + static_assert(__is_same(decltype(a33 * c), _BitInt(33)), ""); } void FromPaper2(_BitInt(8) a1, _BitInt(24) a2) { - static_assert(is_same::value, ""); + static_assert(__is_same(decltype(a1 * (_BitInt(32))a2), _BitInt(32)), ""); } From b206f70af15ff38c2826e39de2feeb84ad4c540c Mon Sep 17 00:00:00 2001 From: Oleksandr Tarasiuk Date: Fri, 8 May 2026 15:48:44 +0300 Subject: [PATCH 033/538] Revert "[Clang] disallow selectany on non-global-variable declarations" (#196511) Reverts llvm/llvm-project#189641 --- clang/docs/ReleaseNotes.rst | 1 - clang/include/clang/Basic/Attr.td | 2 -- .../clang/Basic/DiagnosticSemaKinds.td | 4 +-- clang/lib/Sema/SemaDecl.cpp | 30 +++++++++--------- ...a-attribute-supported-attributes-list.test | 1 - clang/test/Sema/attr-selectany.c | 6 +--- clang/test/SemaCXX/attr-selectany.cpp | 31 +++++-------------- clang/test/SemaCXX/declspec-selectany.cpp | 4 +-- 8 files changed, 26 insertions(+), 53 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index aea89aebd0c31..d700af6a82290 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -361,7 +361,6 @@ Attribute Changes in Clang usage. - Clang now allows GNU attributes between a member declarator and bit-field width. (#GH184954) -- Clang now disallows use of the ``selectany`` attribute on non-global-variable declarations. 
(#GH189141) Improvements to Clang's diagnostics ----------------------------------- diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index ffa6a17f51362..70b5773f95b08 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4510,8 +4510,6 @@ def DLLImportStaticLocal : InheritableAttr, TargetSpecificAttr, GCC<"selectany">]; - let Subjects = SubjectList<[NonParmVar], ErrorDiag, - "variable declarations with external linkage">; let Documentation = [SelectAnyDocs]; let SimpleHandler = 1; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c69b2ce3648f8..c15a9ec1ff0f6 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3904,8 +3904,8 @@ def warn_cmse_nonsecure_union : Warning< InGroup>; def err_attribute_weak_static : Error< "weak declaration cannot have internal linkage">; -def err_attribute_selectany_non_extern_var : Error< - "'selectany' can only be applied to variables with external linkage">; +def err_attribute_selectany_non_extern_data : Error< + "'selectany' can only be applied to data items with external linkage">; def warn_attribute_hybrid_patchable_non_extern : Warning< "'hybrid_patchable' is ignored on functions without external linkage">, InGroup; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index be9654078940f..eb5b6d65b4d58 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3140,16 +3140,15 @@ static void checkNewAttributesAfterDef(Sema &S, Decl *New, const Decl *Old) { --E; continue; } - } else if (isa(NewAttribute)) { + } else if (isa(NewAttribute) && + cast(New)->isInline() && + !cast(New)->isInlineSpecified()) { // Don't warn about applying selectany to implicitly inline variables. 
// Older compilers and language modes would require the use of selectany // to make such variables inline, and it would have no effect if we // honored it. - if (const auto *VD = dyn_cast(New); - VD && VD->isInline() && !VD->isInlineSpecified()) { - ++I; - continue; - } + ++I; + continue; } else if (isa(NewAttribute)) { // We allow to add OMP[Begin]DeclareVariantAttr to be added to // declarations after definitions. @@ -7120,16 +7119,15 @@ static void checkAliasAttr(Sema &S, NamedDecl &ND) { } static void checkSelectAnyAttr(Sema &S, NamedDecl &ND) { - SelectAnyAttr *Attr = ND.getAttr(); - if (!Attr) - return; - - if (const auto *VD = dyn_cast(&ND); - VD && !VD->isStaticDataMember() && VD->isExternallyVisible()) - return; - - S.Diag(Attr->getLocation(), diag::err_attribute_selectany_non_extern_var); - ND.dropAttr(); + // 'selectany' only applies to externally visible variable declarations. + // It does not apply to functions. + if (SelectAnyAttr *Attr = ND.getAttr()) { + if (isa(ND) || !ND.isExternallyVisible()) { + S.Diag(Attr->getLocation(), + diag::err_attribute_selectany_non_extern_data); + ND.dropAttr(); + } + } } static void checkHybridPatchableAttr(Sema &S, NamedDecl &ND) { diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index a5f157c18c57d..03b9a77ec1814 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -201,7 +201,6 @@ // CHECK-NEXT: SYCLSpecialClass (SubjectMatchRule_record) // CHECK-NEXT: ScopedLockable (SubjectMatchRule_record) // CHECK-NEXT: Section (SubjectMatchRule_function, SubjectMatchRule_variable_is_global, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property) -// CHECK-NEXT: SelectAny (SubjectMatchRule_variable_not_is_parameter) // CHECK-NEXT: SetTypestate (SubjectMatchRule_function_is_member) // CHECK-NEXT: SpeculativeLoadHardening 
(SubjectMatchRule_function, SubjectMatchRule_objc_method) // CHECK-NEXT: StackProtectorIgnore (SubjectMatchRule_variable_is_local) diff --git a/clang/test/Sema/attr-selectany.c b/clang/test/Sema/attr-selectany.c index d8a0baf4edc0b..1078695c26abc 100644 --- a/clang/test/Sema/attr-selectany.c +++ b/clang/test/Sema/attr-selectany.c @@ -8,8 +8,4 @@ extern __declspec(selectany) const int x1 = 1; // no warning, const means we nee // Should we really warn on this? extern __declspec(selectany) int x2 = 1; // expected-warning {{'extern' variable has an initializer}} -__declspec(selectany) void x3(void) { } // expected-error {{'selectany' attribute only applies to variable declarations with external linkage}} - -void t() { - __declspec(selectany) extern int i; -} +__declspec(selectany) void foo(void) { } // expected-error{{'selectany' can only be applied to data items with external linkage}} diff --git a/clang/test/SemaCXX/attr-selectany.cpp b/clang/test/SemaCXX/attr-selectany.cpp index 70f40618af8f3..4afcb8130a14c 100644 --- a/clang/test/SemaCXX/attr-selectany.cpp +++ b/clang/test/SemaCXX/attr-selectany.cpp @@ -1,14 +1,14 @@ -// RUN: %clang_cc1 -triple x86_64-win32 -fms-compatibility -fms-extensions -fsyntax-only -verify=expected -std=c++11 %s -// RUN: %clang_cc1 -triple x86_64-unknown-linux -fms-compatibility -fms-extensions -fsyntax-only -verify=expected -std=c++11 %s -// RUN: %clang_cc1 -triple x86_64-win32-macho -fms-compatibility -fms-extensions -fsyntax-only -verify=expected,win23-macho -std=c++11 %s +// RUN: %clang_cc1 -triple x86_64-win32 -fms-compatibility -fms-extensions -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux -fms-compatibility -fms-extensions -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -triple x86_64-win32-macho -fms-compatibility -fms-extensions -fsyntax-only -verify -std=c++11 %s // MSVC produces similar diagnostics. 
-__declspec(selectany) void foo() { } // expected-error{{'selectany' attribute only applies to variable declarations with external linkage}} +__declspec(selectany) void foo() { } // expected-error{{'selectany' can only be applied to data items with external linkage}} __declspec(selectany) int x1 = 1; -const __declspec(selectany) int x2 = 2; // expected-error{{'selectany' can only be applied to variables with external linkage}} +const __declspec(selectany) int x2 = 2; // expected-error{{'selectany' can only be applied to data items with external linkage}} extern const __declspec(selectany) int x3 = 3; @@ -18,7 +18,7 @@ const __declspec(selectany) int x4 = 4; // MSDN says this is incorrect, but MSVC doesn't diagnose it. extern __declspec(selectany) int x5; -static __declspec(selectany) int x6 = 2; // expected-error{{'selectany' can only be applied to variables with external linkage}} +static __declspec(selectany) int x6 = 2; // expected-error{{'selectany' can only be applied to data items with external linkage}} // FIXME: MSVC accepts this and makes x7 externally visible and comdat, but keep // it as internal and not weak/linkonce. @@ -36,7 +36,7 @@ class X { __declspec(selectany) X x(1); namespace { class Internal {}; } -__declspec(selectany) auto x8 = Internal(); // expected-error {{'selectany' can only be applied to variables with external linkage}} +__declspec(selectany) auto x8 = Internal(); // expected-error {{'selectany' can only be applied to data items with external linkage}} // The D3D11 headers do something like this. MSVC doesn't error on this at @@ -53,20 +53,3 @@ extern const SomeStruct some_struct; // Without selectany, this should stay an error. 
const SomeStruct some_struct2; // expected-error {{default initialization of an object of const type 'const SomeStruct' without a user-provided default constructor}} - -struct __declspec(selectany) S1 {}; // expected-error {{'selectany' attribute only applies to variable declarations with external linkage}} -__declspec(selectany) struct S1 s1; - -void t() { - __declspec(selectany) int a; // expected-error {{'selectany' can only be applied to variables with external linkage}} - __declspec(selectany) extern int b; - __declspec(selectany) static int c; // expected-error {{'selectany' can only be applied to variables with external linkage}} - __declspec(selectany) thread_local int d; // expected-error {{'selectany' can only be applied to variables with external linkage}} win23-macho-error {{thread-local storage is not supported for the current target}} -} - -struct S2 {}; -struct __declspec(selectany) S2 s2; // expected-error {{'selectany' attribute only applies to variable declarations with external linkage}} - -struct S3 { - __declspec(selectany) static int a; // expected-error {{'selectany' can only be applied to variables with external linkage}} -}; diff --git a/clang/test/SemaCXX/declspec-selectany.cpp b/clang/test/SemaCXX/declspec-selectany.cpp index 9e9c906caa008..7e64a2924c99a 100644 --- a/clang/test/SemaCXX/declspec-selectany.cpp +++ b/clang/test/SemaCXX/declspec-selectany.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -std=c++14 %s -triple x86_64-scei-ps4 -fdeclspec -verify // MSVC emits this error too. -const int __declspec(selectany) test1 = 0; // expected-error {{'selectany' can only be applied to variables with external linkage}} +const int __declspec(selectany) test1 = 0; // expected-error {{'selectany' can only be applied to data items with external linkage}} extern const int test2; const int test2 = 42; // expected-note {{previous definition is here}} @@ -15,4 +15,4 @@ const int __declspec(selectany) test3 = 42; // Standard usage. 
struct Test4 { static constexpr int sdm = 0; }; -__declspec(selectany) constexpr int Test4::sdm; // expected-error {{'selectany' can only be applied to variables with external linkage}} +__declspec(selectany) constexpr int Test4::sdm; // no warning From 546aef6871cefdab1de1b3ee90eb825892be969e Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 8 May 2026 05:49:48 -0700 Subject: [PATCH 034/538] [LLVM] Add `Type::getTruncatedType()` and use it in Intrinsics.cpp (#196239) --- llvm/include/llvm/IR/DerivedTypes.h | 32 +++++++++++++++++------- llvm/include/llvm/IR/Type.h | 4 +++ llvm/lib/IR/Intrinsics.cpp | 38 +++++------------------------ 3 files changed, 33 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index c272f091e31a4..0c686cc5c32d7 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -67,7 +67,15 @@ class IntegerType : public Type { /// Returns type twice as wide the input type. IntegerType *getExtendedType() const { - return Type::getIntNTy(getContext(), 2 * getScalarSizeInBits()); + return Type::getIntNTy(getContext(), 2 * getBitWidth()); + } + + /// Returns type half as wide the input type. + IntegerType *getTruncatedType() const { + unsigned BitWidth = getBitWidth(); + assert((BitWidth & 1) == 0 && + "Cannot truncate integer type with odd bit-width"); + return Type::getIntNTy(getContext(), BitWidth / 2); } /// Get the number of bits in this IntegerType @@ -542,9 +550,9 @@ class VectorType : public Type { // the input type, and the element type is an integer or float type which // is half as wide as the elements in the input type. 
static VectorType *getTruncatedElementVectorType(VectorType *VTy) { - Type *EltTy; - if (VTy->getElementType()->isFloatingPointTy()) { - switch(VTy->getElementType()->getTypeID()) { + Type *EltTy = VTy->getElementType(); + if (EltTy->isFloatingPointTy()) { + switch (EltTy->getTypeID()) { case DoubleTyID: EltTy = Type::getFloatTy(VTy->getContext()); break; @@ -555,11 +563,7 @@ class VectorType : public Type { llvm_unreachable("Cannot create narrower fp vector element type"); } } else { - unsigned EltBits = - VTy->getElementType()->getPrimitiveSizeInBits().getFixedValue(); - assert((EltBits & 1) == 0 && - "Cannot truncate vector element with odd bit-width"); - EltTy = IntegerType::get(VTy->getContext(), EltBits / 2); + EltTy = cast(EltTy)->getTruncatedType(); } return VectorType::get(EltTy, VTy->getElementCount()); } @@ -801,6 +805,16 @@ Type *Type::getExtendedType() const { return cast(this)->getExtendedType(); } +Type *Type::getTruncatedType() const { + assert( + isIntOrIntVectorTy() && + "Original type expected to be a vector of integers or a scalar integer."); + if (auto *VTy = dyn_cast(this)) + return VectorType::getTruncatedElementVectorType( + const_cast(VTy)); + return cast(this)->getTruncatedType(); +} + Type *Type::getWithNewType(Type *EltTy) const { if (auto *VTy = dyn_cast(this)) return VectorType::get(EltTy, VTy->getElementCount()); diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index 4217d797cdf28..7d40e5dac9463 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -444,6 +444,10 @@ class Type { /// wide as in the original type. For vectors, preserves element count. LLVM_ABI inline Type *getExtendedType() const; + /// Given scalar/vector integer type, returns a type with elements half as + /// wide as in the original type. For vectors, preserves element count. + LLVM_ABI inline Type *getTruncatedType() const; + /// Get the address space of this pointer or pointer vector type. 
LLVM_ABI inline unsigned getPointerAddressSpace() const; diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 00cc17b3cabf5..ff57d335c9a13 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -554,22 +554,10 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::Overloaded: case IITDescriptor::VecOfAnyPtrsToElt: return OverloadTys[D.getOverloadIndex()]; - case IITDescriptor::Extend: { - Type *Ty = OverloadTys[D.getOverloadIndex()]; - if (VectorType *VTy = dyn_cast(Ty)) - return VectorType::getExtendedElementVectorType(VTy); - - return IntegerType::get(Context, 2 * cast(Ty)->getBitWidth()); - } - case IITDescriptor::Trunc: { - Type *Ty = OverloadTys[D.getOverloadIndex()]; - if (VectorType *VTy = dyn_cast(Ty)) - return VectorType::getTruncatedElementVectorType(VTy); - - IntegerType *ITy = cast(Ty); - assert(ITy->getBitWidth() % 2 == 0); - return IntegerType::get(Context, ITy->getBitWidth() / 2); - } + case IITDescriptor::Extend: + return OverloadTys[D.getOverloadIndex()]->getExtendedType(); + case IITDescriptor::Trunc: + return OverloadTys[D.getOverloadIndex()]->getTruncatedType(); case IITDescriptor::Subdivide2: case IITDescriptor::Subdivide4: { Type *Ty = OverloadTys[D.getOverloadIndex()]; @@ -960,14 +948,7 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, if (D.getOverloadIndex() >= OverloadTys.size()) return IsDeferredCheck || DeferCheck(Ty); - Type *NewTy = OverloadTys[D.getOverloadIndex()]; - if (VectorType *VTy = dyn_cast(NewTy)) - NewTy = VectorType::getExtendedElementVectorType(VTy); - else if (IntegerType *ITy = dyn_cast(NewTy)) - NewTy = IntegerType::get(ITy->getContext(), 2 * ITy->getBitWidth()); - else - return true; - + Type *NewTy = OverloadTys[D.getOverloadIndex()]->getExtendedType(); return Ty != NewTy; } case IITDescriptor::Trunc: { @@ -975,14 +956,7 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, if (D.getOverloadIndex() >= OverloadTys.size()) return IsDeferredCheck || DeferCheck(Ty); - 
Type *NewTy = OverloadTys[D.getOverloadIndex()]; - if (VectorType *VTy = dyn_cast(NewTy)) - NewTy = VectorType::getTruncatedElementVectorType(VTy); - else if (IntegerType *ITy = dyn_cast(NewTy)) - NewTy = IntegerType::get(ITy->getContext(), ITy->getBitWidth() / 2); - else - return true; - + Type *NewTy = OverloadTys[D.getOverloadIndex()]->getTruncatedType(); return Ty != NewTy; } case IITDescriptor::OneNthEltsVec: { From aa5d182da16125577757d6c49a9fa510f680743a Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Fri, 8 May 2026 13:57:03 +0100 Subject: [PATCH 035/538] [AMDGPU] Make VALU instructions defining SGPR non-ignorable (#195270) This fixes an issue where CSE would incorrectly eliminate an instruction that produces a lane mask. For example, the second V_CMP_GT in the code below cannot be replaced with %3, despite both having the same operands as it would cause an incorrect exec mask being calculated in %6: ``` bb.1 %3:sreg_64 = V_CMP_GT_U32_e64 %0:vgpr_32, %1:sreg_32, implicit $exec %4:sreg_64 = SI_IF_BREAK killed %3:sreg_64, %2:sreg_64, implicit-def dead $scc SI_LOOP %4:sreg_64, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 bb.2: SI_END_CF %4:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %5:sreg_64 = V_CMP_GT_U32_e64 %0:vgpr_32, %1:sreg_32, implicit $exec %6:sreg_64 = S_AND_B64 %5:sreg_64, $exec, implicit-def $scc ``` This is submitted as a preferable solution when compared to both #194863 (large number of diffs in tests) and #193522 (more of a workaround than an actual fix). 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 49 +-- .../CodeGen/AMDGPU/coalescer_distribute.ll | 4 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 38 +- ...-phi-regression-issue130646-issue130119.ll | 38 +- llvm/test/CodeGen/AMDGPU/licm-valu.mir | 2 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll | 23 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 148 ++++--- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 108 ++--- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 108 ++--- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 148 ++++--- .../ran-out-of-sgprs-allocation-failure.mir | 16 +- llvm/test/CodeGen/AMDGPU/structurize-hoist.ll | 59 ++- .../AMDGPU/tuple-allocation-failure.ll | 394 ++++++++---------- .../CodeGen/AMDGPU/v-cmp-cse-across-loop.mir | 67 +++ ...r-descriptor-waterfall-loop-idom-update.ll | 6 +- 15 files changed, 621 insertions(+), 587 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/v-cmp-cse-across-loop.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 65137fa09e209..3efae655b311e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -150,46 +150,23 @@ bool SIInstrInfo::isReMaterializableImpl( return TargetInstrInfo::isReMaterializableImpl(MI); } -// Returns true if the scalar result of a VALU instruction depends on exec. +// Returns true if the result of a VALU instruction depends on exec. bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const { - // Ignore comparisons which are only used masked with exec. - // This allows some hoisting/sinking of VALU comparisons. 
- if (MI.isCompare()) { - const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst); - if (!Dst) - return true; - - Register DstReg = Dst->getReg(); - if (!DstReg.isVirtual()) - return true; + assert(isVALU(MI)); - const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { - switch (Use.getOpcode()) { - case AMDGPU::S_AND_SAVEEXEC_B32: - case AMDGPU::S_AND_SAVEEXEC_B64: - break; - case AMDGPU::S_AND_B32: - case AMDGPU::S_AND_B64: - if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr)) - return true; - break; - default: - return true; - } - } - return false; - } + // If it is convergent it depends on EXEC. + if (MI.isConvergent()) + return true; - // If it is not convergent it does not depend on EXEC. - if (!MI.isConvergent()) - return false; + // If it defines SGPR it depends on EXEC + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + for (const MachineOperand &Def : MI.defs()) { + if (!Def.isReg()) + continue; - switch (MI.getOpcode()) { - default: - break; - case AMDGPU::V_READFIRSTLANE_B32: - return true; + Register Reg = Def.getReg(); + if (Reg && RI.isSGPRReg(MRI, Reg)) + return true; } return false; diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll index d07cc84865bea..94085ad99ccda 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll @@ -18,11 +18,9 @@ define amdgpu_kernel void @hoge(i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK-NEXT: s_bitcmp1_b32 s2, 24 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: .LBB0_1: ; %bb25 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; 
CHECK-NEXT: ; %bb.2: ; %bb30 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 2f287d269e4d8..c554e26a43965 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -14,26 +14,26 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 ; GCN-NEXT: s_and_b64 exec, exec, vcc ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4 ; GCN-NEXT: .LBB0_3: ; %bb.outer.end ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -376,18 +376,18 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: 
s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_5 -; GCN-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GCN-NEXT: s_cbranch_execz .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.else ; GCN-NEXT: s_mov_b32 s6, 0 @@ -398,7 +398,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8 ; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GCN-NEXT: .LBB2_3: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GCN-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GCN-NEXT: s_cbranch_execz .LBB2_5 ; GCN-NEXT: ; %bb.4: ; %bb.then ; GCN-NEXT: s_mov_b32 s6, 0 @@ -409,7 +409,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4 ; GCN-NEXT: .LBB2_5: ; %bb.outer.end -; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: s_mov_b32 m0, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll index d03d53a8cbbaa..0c25ffd80dbad 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll @@ -73,43 +73,41 @@ define amdgpu_cs void @issue130119(i1 %arg) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: s_mov_b32 s16, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_branch .LBB1_2 ; CHECK-NEXT: .LBB1_1: ; %Flow2 ; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; CHECK-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_2: ; %bb1 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB1_4 Depth 2 -; CHECK-NEXT: s_and_b32 s2, s16, 1 -; CHECK-NEXT: s_cmp_eq_u32 s2, 0 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_eq_u32 s2, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 +; CHECK-NEXT: s_and_b32 s6, s16, 1 +; CHECK-NEXT: s_cmp_eq_u32 s6, 0 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_cmp_eq_u32 s6, 1 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 -; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9 +; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: s_branch .LBB1_4 ; CHECK-NEXT: .LBB1_3: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; CHECK-NEXT: s_xor_b64 s[14:15], s[14:15], -1 ; CHECK-NEXT: s_and_b64 s[12:13], exec, s[12:13] ; CHECK-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; CHECK-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; CHECK-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; CHECK-NEXT: s_and_b64 s[12:13], 
s[14:15], exec -; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_cbranch_execz .LBB1_8 ; CHECK-NEXT: .LBB1_4: ; %bb3 ; CHECK-NEXT: ; Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] -; CHECK-NEXT: s_mov_b64 s[14:15], s[6:7] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_mov_b64 s[14:15], s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 ; CHECK-NEXT: ; %bb.5: ; %bb7 ; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2 @@ -128,14 +126,14 @@ define amdgpu_cs void @issue130119(i1 %arg) { ; CHECK-NEXT: .LBB1_8: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] -; CHECK-NEXT: s_mov_b64 s[2:3], -1 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_mov_b64 s[4:5], -1 +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; CHECK-NEXT: s_cbranch_execz .LBB1_1 ; CHECK-NEXT: ; %bb.9: ; %bb10 ; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: s_or_b32 s16, s16, 1 -; CHECK-NEXT: s_xor_b64 s[2:3], exec, -1 +; CHECK-NEXT: s_xor_b64 s[4:5], exec, -1 ; CHECK-NEXT: s_branch .LBB1_1 ; CHECK-NEXT: .LBB1_10: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/licm-valu.mir b/llvm/test/CodeGen/AMDGPU/licm-valu.mir index 0020e89580a14..a27153867408b 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-valu.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-valu.mir @@ -74,12 +74,12 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 1, 2, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 1, 2, implicit $exec ; GCN-NEXT: $exec = S_AND_B64 $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll index af81d95973452..fe7adab59a23c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -1527,14 +1527,12 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; SI-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-NEXT: s_cbranch_vccnz .LBB26_5 ; SI-NEXT: ; %bb.1: ; %.preheader1.preheader -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_mov_b32_e32 v0, 0x3fc00000 -; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 ; SI-NEXT: .LBB26_2: ; %bb ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; SI-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 ; SI-NEXT: s_cbranch_vccnz .LBB26_2 ; SI-NEXT: ; %bb.3: ; %bb33 @@ -1559,15 +1557,13 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX10-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX10-NEXT: s_cbranch_vccnz .LBB26_5 ; GFX10-NEXT: ; %bb.1: ; %.preheader1.preheader -; GFX10-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3fc00000 +; GFX10-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 ; GFX10-NEXT: s_mov_b64 s[2:3], exec -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GFX10-NEXT: .LBB26_2: ; %bb ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 -; GFX10-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX10-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX10-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX10-NEXT: ; %bb.3: 
; %bb33 ; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], exec @@ -1591,16 +1587,13 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX11-NEXT: s_cbranch_vccnz .LBB26_5 ; GFX11-NEXT: ; %bb.1: ; %.preheader1.preheader -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3fc00000 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 ; GFX11-NEXT: s_mov_b64 s[2:3], exec -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 -; GFX11-NEXT: s_waitcnt_depctr depctr_va_sdst(0) ; GFX11-NEXT: .LBB26_2: ; %bb ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 -; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[0:1] ; GFX11-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX11-NEXT: ; %bb.3: ; %bb33 ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec @@ -1630,12 +1623,10 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, ; GFX12-NEXT: s_mov_b64 s[2:3], exec ; GFX12-NEXT: s_mov_b32 s4, 0x3fc00000 ; GFX12-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX12-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; GFX12-NEXT: .LBB26_2: ; %bb ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_add_f32 s4, s4, 0x3e800000 -; GFX12-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX12-NEXT: s_and_not1_b64 vcc, exec, s[0:1] ; GFX12-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX12-NEXT: ; %bb.3: ; %bb33 ; GFX12-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 899f924ac06bd..b0da72184f050 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -225,21 +225,23 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, float 4.0 seq_cst @@ -451,15 +453,16 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-LABEL: local_atomic_fadd_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffc, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] @@ -837,9 +840,10 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-LABEL: local_atomic_fadd_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfff8, v2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -847,7 +851,8 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_mov_b32_e32 v4, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfff8, v2 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v5, v[3:4], v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -1203,20 +1208,21 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-LABEL: local_atomic_fadd_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfff8, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 +; GFX6-NEXT: ds_read_b64 v[1:2], v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfff8, v0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v5, v[1:2], v[3:4] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 
v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3025,25 +3031,27 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -3306,19 +3314,20 @@ 
define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fadd_noret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -5491,25 +5500,27 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: 
v_add_i32_e32 v1, vcc, 0xfffe, v0 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -5848,9 +5859,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fadd_noret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5859,8 +5870,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -6270,9 +6282,9 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; 
GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6290,9 +6302,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -6679,9 +6692,9 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6699,9 +6712,10 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -7399,9 +7413,9 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) 
%ptr, ; GFX6-LABEL: local_atomic_fadd_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -7416,15 +7430,16 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 @@ -8093,9 +8108,9 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 
0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -8110,15 +8125,16 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index b7ddb8321c68e..43d77d0afa8c6 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -2616,25 +2616,27 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; 
GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffe, v0 +; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -2908,19 +2910,20 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fmax_noret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -5102,26 +5105,28 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -5461,9 +5466,9 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fmax_noret_bf16__offset__align4: ; GFX6: ; %bb.0: 
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5473,8 +5478,9 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -5980,9 +5986,9 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6000,9 +6006,10 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -6483,9 
+6490,9 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6503,9 +6510,10 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -7449,9 +7457,9 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 @@ -7466,15 +7474,16 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; 
GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 @@ -8375,9 +8384,9 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 @@ -8392,15 +8401,16 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 
v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 81dcd95a64bf6..1759a212c6ea3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -2616,25 +2616,27 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffe, v0 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -2908,19 +2910,20 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fmin_noret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -5102,26 +5105,28 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; 
GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -5461,9 +5466,9 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fmin_noret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5473,8 +5478,9 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -5980,9 +5986,9 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6000,9 +6006,10 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -6483,9 +6490,9 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6503,9 +6510,10 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, 
v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -7449,9 +7457,9 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 @@ -7466,15 +7474,16 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 @@ -8375,9 +8384,9 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; 
GFX6-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 @@ -8392,15 +8401,16 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 7bb36262cf389..5feb76edf3d17 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -417,21 +417,23 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 
0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, float 4.0 seq_cst @@ -817,15 +819,16 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-LABEL: local_atomic_fsub_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffc, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -1253,9 +1256,10 @@ define double 
@local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-LABEL: local_atomic_fsub_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfff8, v2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1263,7 +1267,8 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: v_mov_b32_e32 v4, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], -4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfff8, v2 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v5, v[3:4], v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -1665,20 +1670,21 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-LABEL: local_atomic_fsub_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfff8, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 +; GFX6-NEXT: ds_read_b64 v[1:2], v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfff8, v0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v5, v[1:2], v[3:4] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 
v0, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3487,25 +3493,27 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v3, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -3768,19 +3776,20 @@ define void 
@local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fsub_noret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -5953,25 +5962,27 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_i32_e32 v1, 
vcc, 0xfffe, v0 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 @@ -6310,9 +6321,9 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-LABEL: local_atomic_fsub_noret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6321,8 +6332,9 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffe, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v4, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -6794,9 +6806,9 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -6814,9 +6826,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -7259,9 +7272,9 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: ds_read_b32 v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -7279,9 +7292,10 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0xfffc, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v7, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -8225,9 +8239,9 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; 
GFX6-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -8242,15 +8256,16 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 @@ -9151,9 +9166,9 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -9168,15 +9183,16 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0xfffc, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v5, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 13b1d76e14a00..e7eefafe31203 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -44,10 +44,14 @@ body: | ; CHECK-NEXT: SI_SPILL_S32_SAVE $sgpr15, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) ; CHECK-NEXT: SI_SPILL_S32_SAVE $sgpr14, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) ; CHECK-NEXT: renamable $sgpr14_sgpr15 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: 
renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.5, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.5, align 4, addrspace 5) ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 @@ -163,7 +167,7 @@ body: | ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000) ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr12_sgpr13 
= S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 @@ -211,13 +215,13 @@ body: | ; CHECK-NEXT: $sgpr13 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; CHECK-NEXT: renamable $sgpr84 = COPY killed renamable $sgpr8 ; CHECK-NEXT: renamable $sgpr33 = COPY killed renamable $sgpr16 - ; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY killed renamable $sgpr14_sgpr15 + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr14_sgpr15 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 - ; CHECK-NEXT: renamable $sgpr14_sgpr15 = COPY killed renamable $sgpr36_sgpr37 + ; CHECK-NEXT: renamable $sgpr14_sgpr15 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr16 = COPY killed renamable $sgpr33 ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr68_sgpr69 ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr70_sgpr71 @@ -253,7 +257,7 @@ body: | ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = 
V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.5, align 4, addrspace 5) ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 @@ -266,7 +270,7 @@ body: | ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll index b4036517cc0d5..ade922e8af166 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll @@ -85,58 +85,57 @@ merge: define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { ; GFX900-LABEL: test_loop_with_if: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: s_mov_b64 s[4:5], 0 -; GFX900-NEXT: s_movk_i32 s10, 0xfe +; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_movk_i32 s8, 0xfe ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_bitcmp1_b32 s2, 0 -; 
GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v2, s1 -; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX900-NEXT: v_mov_b32_e32 v1, s0 -; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3 +; GFX900-NEXT: s_bitcmp1_b32 s0, 0 +; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], -1 +; GFX900-NEXT: v_mov_b32_e32 v2, s7 ; GFX900-NEXT: s_branch .LBB2_2 ; GFX900-NEXT: .LBB2_1: ; %latch ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v5, 20, v3 -; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s10, v5 -; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_add_u32_e32 v6, 20, v3 +; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s8, v6 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: flat_store_dword v[1:2], v3 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execz .LBB2_8 ; GFX900-NEXT: .LBB2_2: ; %loop ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: flat_load_dwordx2 v[3:4], v[1:2] -; GFX900-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX900-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v5 +; GFX900-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX900-NEXT: s_mov_b64 s[4:5], 0 ; GFX900-NEXT: s_cbranch_vccnz .LBB2_4 ; GFX900-NEXT: ; %bb.3: ; %if ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v5 -; GFX900-NEXT: s_andn2_b64 s[8:9], s[2:3], exec -; GFX900-NEXT: s_and_b64 s[12:13], vcc, exec -; GFX900-NEXT: s_mov_b64 s[6:7], -1 -; GFX900-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v6 +; GFX900-NEXT: s_andn2_b64 s[6:7], s[0:1], 
exec +; GFX900-NEXT: s_and_b64 s[10:11], vcc, exec +; GFX900-NEXT: s_mov_b64 s[4:5], -1 +; GFX900-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX900-NEXT: .LBB2_4: ; %Flow ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] -; GFX900-NEXT: s_xor_b64 s[8:9], exec, s[12:13] +; GFX900-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] +; GFX900-NEXT: s_xor_b64 s[6:7], exec, s[10:11] ; GFX900-NEXT: s_cbranch_execz .LBB2_6 ; GFX900-NEXT: ; %bb.5: ; %else ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX900-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GFX900-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX900-NEXT: .LBB2_6: ; %Flow1 ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX900-NEXT: s_cbranch_execz .LBB2_1 ; GFX900-NEXT: ; %bb.7: ; %then ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 2221647f580cb..64f141908895d 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,54 +32,43 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[68:71], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, 
s[52:53] +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s54, 0 +; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[50:51], s[4:5], v[0:1] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr60 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s70, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s4, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s5, 1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: s_xor_b64 s[52:53], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v40, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: s_xor_b64 s[64:65], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: 
s_xor_b64 s[66:67], s[4:5], -1 +; GLOBALNESS1-NEXT: s_mov_b32 s84, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[48:49], s[8:9] -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 -; GLOBALNESS1-NEXT: s_mov_b32 s82, s16 -; GLOBALNESS1-NEXT: s_mov_b32 s83, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s84, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s85, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s86, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0 @@ -87,32 +76,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7 -; GLOBALNESS1-NEXT: 
v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s4, 2 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s5, 3 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s5, 5 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[54:55], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s4, 6 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s5, 7 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v60, 6 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v60, 7 +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -126,10 +105,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47] +; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47] ; GLOBALNESS1-NEXT: s_add_u32 s8, s48, 40 ; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47] +; GLOBALNESS1-NEXT: flat_load_dword v57, v[46:47] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s49, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -138,14 +117,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 
s[4:5], s[38:39] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr15 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[70:71] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 1, v40 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 @@ -153,12 +132,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s55, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s71, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s71, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_7: ; %Flow26 @@ -167,7 +146,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s71, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -181,95 +160,94 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) 
%arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[68:69], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[98:99], s[68:69] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11 -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s8, 8 +; GLOBALNESS1-NEXT: v_writelane_b32 v60, s9, 9 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v60, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v60, 3 +; GLOBALNESS1-NEXT: s_mov_b32 s87, s71 +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v56, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[70:71], 0, v2 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 +; 
GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[80:81], 0, v[0:1] ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[68:69] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[64:65] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v60, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v60, 1 +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[98:99] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[80:81] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: 
s_and_b64 vcc, exec, s[80:81] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s70, s48, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s71, s49, 0 +; GLOBALNESS1-NEXT: s_add_u32 s82, s48, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s83, s49, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx2 s[96:97], s[4:5], 0x0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[82:83] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[96:97] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[82:83] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr15 -; GLOBALNESS1-NEXT: 
s_swappc_b64 s[30:31], s[54:55] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[96:97] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[70:71] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -277,24 +255,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_branch .LBB1_14 ; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[48:49], 0x0 -; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 +; GLOBALNESS1-NEXT: v_readlane_b32 s8, v60, 8 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 -; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 -; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 +; GLOBALNESS1-NEXT: s_mov_b32 s71, s87 +; GLOBALNESS1-NEXT: v_readlane_b32 s9, v60, 9 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[98:99] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[68:69] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v60, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v60, 5 +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -317,9 +291,9 @@ define amdgpu_kernel void 
@kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr15 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -336,9 +310,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr15 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -347,54 +321,43 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[68:71], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[68:69] ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: 
s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s54, 0 +; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[50:51], s[4:5], v[0:1] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr60 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s70, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s4, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s5, 1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: s_xor_b64 s[52:53], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v40, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[64:65], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[66:67], s[4:5], -1 +; GLOBALNESS0-NEXT: s_mov_b32 s82, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[8:9] -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; 
GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 -; GLOBALNESS0-NEXT: s_mov_b32 s71, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s82, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s83, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s84, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0 @@ -402,32 +365,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, 
v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s4, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s5, 3 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s5, 5 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[54:55], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s4, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s5, 7 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v60, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v60, 7 +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -441,10 +394,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47] +; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47] ; GLOBALNESS0-NEXT: s_add_u32 s8, s48, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47] +; GLOBALNESS0-NEXT: flat_load_dword v57, v[46:47] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s49, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -453,14 +406,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: 
s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr15 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[84:85] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 1, v40 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 @@ -468,12 +421,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s55, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s71, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s71, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_7: ; %Flow26 @@ -482,7 +435,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s71, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 @@ -496,96 +449,93 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[68:69], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[98:99], s[68:69] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3 -; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s8, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v60, s9, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v60, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v60, 3 +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v56, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[70:71], 0, v2 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[80:81], 0, v[0:1] ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; 
GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[68:69] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[64:65] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v60, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v60, 1 +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[98:99] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[80:81] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[80:81] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; 
in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s84, s48, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s85, s49, 0 +; GLOBALNESS0-NEXT: s_add_u32 s86, s48, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s87, s49, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx2 s[96:97], s[4:5], 0x0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[86:87] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[96:97] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[86:87] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr15 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[96:97] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], 
s[70:71] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -593,22 +543,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v60, 8 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s9, v60, 9 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[98:99] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[68:69] +; GLOBALNESS0-NEXT: s_load_dwordx4 s[68:71], s[48:49], 0x0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v60, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v60, 5 +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -631,9 +579,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, 
s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr15 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -650,9 +598,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[38:39] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr15 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/v-cmp-cse-across-loop.mir b/llvm/test/CodeGen/AMDGPU/v-cmp-cse-across-loop.mir new file mode 100644 index 0000000000000..3e366143958eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v-cmp-cse-across-loop.mir @@ -0,0 +1,67 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=machine-cse -verify-machineinstrs %s -o - | FileCheck %s +# Check that V_CMP is not eliminated across the loop. 
+ +--- +name: v_cmp_cse_across_loop +tracksRegLiveness: true +legalized: true +regBankSelected: true +selected: true +body: | + ; CHECK-LABEL: name: v_cmp_cse_across_loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_GT_U32_e64_]], [[COPY2]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_CMP_GT_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_GT_U32_e64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $sgpr0, $sgpr2_sgpr3 + + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = COPY $sgpr0 + %2:sreg_64 = COPY $sgpr2_sgpr3 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %3:sreg_64 = V_CMP_GT_U32_e64 %0:vgpr_32, %1:sreg_32, implicit $exec + %4:sreg_64 = SI_IF_BREAK killed %3:sreg_64, %2:sreg_64, implicit-def dead $scc + SI_LOOP %4:sreg_64, %bb.1, 
implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x80000000) + + SI_END_CF %4:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64 = V_CMP_GT_U32_e64 %0:vgpr_32, %1:sreg_32, implicit $exec + %6:sreg_64 = S_AND_B64 %5:sreg_64, $exec, implicit-def $scc + S_BRANCH %bb.3 + + bb.3: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index a69dba90a9640..950a1252a3e06 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: .LBB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 +; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) From 11e91bdcc6e1ddc81d95fa592e09728d4b346f2b Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Fri, 8 May 2026 06:03:34 -0700 Subject: [PATCH 036/538] [mlir][core] in -mlir-print-ir-*, dump the pass options as well (#195198) This change modifies the header comment to IR dumped by `-mlir-print-ir-*` flags. The new comment contains the exact pass pipeline run for the pass in question. 
This is useful when using `mlir-print-ir-tree-dir`, as it provides the exact reproducing pass pipeline that can be used on the dumped IR. For example, when using --mlir-print-ir-before-all when triaging a stack trace, the last dumped IR (along with this new comment) can be used to reproduce the failure with a single pass. Before: ``` // -----// IR Dump Before CanonicalizerPass (canonicalize) //----- // ``` After: ``` // -----// IR Dump Before CanonicalizerPass: canonicalize{cse-between-iterations=false max-iterations=5 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true} //----- // ``` --------- Co-authored-by: Jeremy Kun --- mlir/lib/Pass/IRPrinting.cpp | 40 ++++++------ mlir/test/Pass/dynamic-pipeline-nested.mlir | 8 +-- mlir/test/Pass/dynamic-pipeline.mlir | 16 ++--- mlir/test/Pass/ir-printing.mlir | 67 +++++++++++---------- mlir/test/Pass/run-reproducer.mlir | 10 +-- mlir/test/python/pass_manager.py | 6 +- 6 files changed, 77 insertions(+), 70 deletions(-) diff --git a/mlir/lib/Pass/IRPrinting.cpp b/mlir/lib/Pass/IRPrinting.cpp index c11dbc627c0be..032d4f7e2d67d 100644 --- a/mlir/lib/Pass/IRPrinting.cpp +++ b/mlir/lib/Pass/IRPrinting.cpp @@ -48,18 +48,6 @@ class IRPrinterInstrumentation : public PassInstrumentation { static void printIR(Operation *op, bool printModuleScope, raw_ostream &out, OpPrintingFlags flags) { - // Otherwise, check to see if we are not printing at module scope. - if (!printModuleScope) - return op->print(out << " //----- //\n", - op->getBlock() ? flags.useLocalScope() : flags); - - // Otherwise, we are printing at module scope. - out << " ('" << op->getName() << "' operation"; - if (auto symbolName = - op->getAttrOfType(SymbolTable::getSymbolAttrName())) - out << ": @" << symbolName.getValue(); - out << ") //----- //\n"; - // Find the top-level operation. 
auto *topLevelOp = op; while (auto *parentOp = topLevelOp->getParentOp()) @@ -67,6 +55,24 @@ static void printIR(Operation *op, bool printModuleScope, raw_ostream &out, topLevelOp->print(out, flags); } +static void printIRHeader(raw_ostream &out, StringRef title, Pass *pass, + Operation *op, bool printModuleScope, + bool failed = false) { + out << "// -----// IR Dump " << title << " " << pass->getName(); + if (failed) + out << " Failed"; + out << ": "; + pass->printAsTextualPipeline(out); + if (printModuleScope) { + out << " ('" << op->getName() << "' operation"; + if (auto symbolName = + op->getAttrOfType(SymbolTable::getSymbolAttrName())) + out << ": @" << symbolName.getValue(); + out << ")"; + } + out << " //----- //\n"; +} + /// Instrumentation hooks. void IRPrinterInstrumentation::runBeforePass(Pass *pass, Operation *op) { if (isa(pass)) @@ -76,8 +82,7 @@ void IRPrinterInstrumentation::runBeforePass(Pass *pass, Operation *op) { beforePassFingerPrints.try_emplace(pass, op); config->printBeforeIfEnabled(pass, op, [&](raw_ostream &out) { - out << "// -----// IR Dump Before " << pass->getName() << " (" - << pass->getArgument() << ")"; + printIRHeader(out, "Before", pass, op, config->shouldPrintAtModuleScope()); printIR(op, config->shouldPrintAtModuleScope(), out, config->getOpPrintingFlags()); out << "\n\n"; @@ -107,8 +112,7 @@ void IRPrinterInstrumentation::runAfterPass(Pass *pass, Operation *op) { } config->printAfterIfEnabled(pass, op, [&](raw_ostream &out) { - out << "// -----// IR Dump After " << pass->getName() << " (" - << pass->getArgument() << ")"; + printIRHeader(out, "After", pass, op, config->shouldPrintAtModuleScope()); printIR(op, config->shouldPrintAtModuleScope(), out, config->getOpPrintingFlags()); out << "\n\n"; @@ -122,8 +126,8 @@ void IRPrinterInstrumentation::runAfterPassFailed(Pass *pass, Operation *op) { beforePassFingerPrints.erase(pass); config->printAfterIfEnabled(pass, op, [&](raw_ostream &out) { - out << formatv("// -----// IR Dump 
After {0} Failed ({1})", pass->getName(), - pass->getArgument()); + printIRHeader(out, "After", pass, op, config->shouldPrintAtModuleScope(), + /*failed=*/true); printIR(op, config->shouldPrintAtModuleScope(), out, config->getOpPrintingFlags()); out << "\n\n"; diff --git a/mlir/test/Pass/dynamic-pipeline-nested.mlir b/mlir/test/Pass/dynamic-pipeline-nested.mlir index ac2fdd3265b63..5f480e42b084f 100644 --- a/mlir/test/Pass/dynamic-pipeline-nested.mlir +++ b/mlir/test/Pass/dynamic-pipeline-nested.mlir @@ -10,7 +10,7 @@ func.func @f() { // CHECK: IR Dump Before // CHECK-SAME: TestDynamicPipelinePass -// CHECK-NEXT: module @inner_mod1 +// CHECK: module @inner_mod1 module @inner_mod1 { // We use the mlir-print-ir-after-all dumps to check the granularity of the // scheduling: if we are nesting we expect to see to individual "Dump Before @@ -18,11 +18,11 @@ module @inner_mod1 { // the CSE pass to run on the `inner_mod1` module directly. // CHECK: Dump Before CSE -// NOTNESTED-NEXT: @inner_mod1 -// NESTED-NEXT: @foo +// NOTNESTED: @inner_mod1 +// NESTED: @foo module @foo {} // Only in the nested case we have a second run of the pass here. 
// NESTED: Dump Before CSE -// NESTED-NEXT: @baz +// NESTED: @baz module @baz {} } diff --git a/mlir/test/Pass/dynamic-pipeline.mlir b/mlir/test/Pass/dynamic-pipeline.mlir index 5e31ba476aeb0..7cae9284f44d2 100644 --- a/mlir/test/Pass/dynamic-pipeline.mlir +++ b/mlir/test/Pass/dynamic-pipeline.mlir @@ -10,20 +10,20 @@ func.func @f() { // CHECK: IR Dump Before // CHECK-SAME: TestDynamicPipelinePass -// CHECK-NEXT: module @inner_mod1 +// CHECK: module @inner_mod1 // MOD2-ONLY: dynamic-pipeline skip op name: inner_mod1 module @inner_mod1 { // MOD1: Dump Before CSE -// MOD1-NEXT: @foo +// MOD1: @foo // MOD1: Dump Before Canonicalizer -// MOD1-NEXT: @foo +// MOD1: @foo func.func @foo() { return } // MOD1: Dump Before CSE -// MOD1-NEXT: @baz +// MOD1: @baz // MOD1: Dump Before Canonicalizer -// MOD1-NEXT: @baz +// MOD1: @baz func.func @baz() { return } @@ -31,13 +31,13 @@ module @inner_mod1 { // CHECK: IR Dump Before // CHECK-SAME: TestDynamicPipelinePass -// CHECK-NEXT: module @inner_mod2 +// CHECK: module @inner_mod2 // MOD1-ONLY: dynamic-pipeline skip op name: inner_mod2 module @inner_mod2 { // MOD2: Dump Before CSE -// MOD2-NEXT: @foo +// MOD2: @foo // MOD2: Dump Before Canonicalizer -// MOD2-NEXT: @foo +// MOD2: @foo func.func @foo() { return } diff --git a/mlir/test/Pass/ir-printing.mlir b/mlir/test/Pass/ir-printing.mlir index 467d76fdaa7f6..360b347043722 100644 --- a/mlir/test/Pass/ir-printing.mlir +++ b/mlir/test/Pass/ir-printing.mlir @@ -5,6 +5,7 @@ // RUN: mlir-opt %s -mlir-disable-threading=true -pass-pipeline='builtin.module(func.func(cse,canonicalize))' -mlir-print-ir-before=cse -mlir-print-ir-module-scope -o /dev/null 2>&1 | FileCheck -check-prefix=BEFORE_MODULE %s // RUN: mlir-opt %s -mlir-disable-threading=true -pass-pipeline='builtin.module(func.func(cse,cse))' -mlir-print-ir-after-all -mlir-print-ir-after-change -o /dev/null 2>&1 | FileCheck -check-prefix=AFTER_ALL_CHANGE %s // RUN: not mlir-opt %s -mlir-disable-threading=true 
-pass-pipeline='builtin.module(func.func(cse,test-pass-failure))' -mlir-print-ir-after-failure -o /dev/null 2>&1 | FileCheck -check-prefix=AFTER_FAILURE %s +// RUN: mlir-opt %s -mlir-disable-threading=true -pass-pipeline='builtin.module(func.func(canonicalize{max-iterations=5}))' -mlir-print-ir-before=canonicalize -o /dev/null 2>&1 | FileCheck -check-prefix=OPTIONS %s func.func @foo() { %0 = arith.constant 0 : i32 @@ -15,53 +16,55 @@ func.func @bar() { return } -// BEFORE: // -----// IR Dump Before{{.*}}CSEPass (cse) //----- // -// BEFORE-NEXT: func @foo() -// BEFORE: // -----// IR Dump Before{{.*}}CSEPass (cse) //----- // -// BEFORE-NEXT: func @bar() -// BEFORE-NOT: // -----// IR Dump Before{{.*}}CanonicalizerPass (canonicalize) //----- // +// BEFORE: // -----// IR Dump Before{{.*}}CSEPass: cse //----- // +// BEFORE: func @foo() +// BEFORE: // -----// IR Dump Before{{.*}}CSEPass: cse //----- // +// BEFORE: func @bar() +// BEFORE-NOT: // -----// IR Dump Before{{.*}}CanonicalizerPass: canonicalize //----- // // BEFORE-NOT: // -----// IR Dump After -// BEFORE_ALL: // -----// IR Dump Before{{.*}}CSEPass (cse) //----- // -// BEFORE_ALL-NEXT: func @foo() -// BEFORE_ALL: // -----// IR Dump Before{{.*}}CanonicalizerPass (canonicalize) //----- // -// BEFORE_ALL-NEXT: func @foo() -// BEFORE_ALL: // -----// IR Dump Before{{.*}}CSEPass (cse) //----- // -// BEFORE_ALL-NEXT: func @bar() -// BEFORE_ALL: // -----// IR Dump Before{{.*}}CanonicalizerPass (canonicalize) //----- // -// BEFORE_ALL-NEXT: func @bar() +// BEFORE_ALL: // -----// IR Dump Before{{.*}}CSEPass: cse //----- // +// BEFORE_ALL: func @foo() +// BEFORE_ALL: // -----// IR Dump Before{{.*}}CanonicalizerPass: canonicalize{{.*}} //----- // +// BEFORE_ALL: func @foo() +// BEFORE_ALL: // -----// IR Dump Before{{.*}}CSEPass: cse //----- // +// BEFORE_ALL: func @bar() +// BEFORE_ALL: // -----// IR Dump Before{{.*}}CanonicalizerPass: canonicalize{{.*}} //----- // +// BEFORE_ALL: func @bar() // BEFORE_ALL-NOT: // -----// IR 
Dump After // AFTER-NOT: // -----// IR Dump Before -// AFTER: // -----// IR Dump After{{.*}}CSEPass (cse) //----- // -// AFTER-NEXT: func @foo() -// AFTER: // -----// IR Dump After{{.*}}CSEPass (cse) //----- // -// AFTER-NEXT: func @bar() -// AFTER-NOT: // -----// IR Dump After{{.*}}CanonicalizerPass (canonicalize) //----- // +// AFTER: // -----// IR Dump After{{.*}}CSEPass: cse //----- // +// AFTER: func @foo() +// AFTER: // -----// IR Dump After{{.*}}CSEPass: cse //----- // +// AFTER: func @bar() +// AFTER-NOT: // -----// IR Dump After{{.*}}CanonicalizerPass: canonicalize{{.*}} //----- // // AFTER_ALL-NOT: // -----// IR Dump Before -// AFTER_ALL: // -----// IR Dump After{{.*}}CSEPass (cse) //----- // -// AFTER_ALL-NEXT: func @foo() -// AFTER_ALL: // -----// IR Dump After{{.*}}CanonicalizerPass (canonicalize) //----- // -// AFTER_ALL-NEXT: func @foo() -// AFTER_ALL: // -----// IR Dump After{{.*}}CSEPass (cse) //----- // -// AFTER_ALL-NEXT: func @bar() -// AFTER_ALL: // -----// IR Dump After{{.*}}CanonicalizerPass (canonicalize) //----- // -// AFTER_ALL-NEXT: func @bar() +// AFTER_ALL: // -----// IR Dump After{{.*}}CSEPass: cse //----- // +// AFTER_ALL: func @foo() +// AFTER_ALL: // -----// IR Dump After{{.*}}CanonicalizerPass: canonicalize{{.*}} //----- // +// AFTER_ALL: func @foo() +// AFTER_ALL: // -----// IR Dump After{{.*}}CSEPass: cse //----- // +// AFTER_ALL: func @bar() +// AFTER_ALL: // -----// IR Dump After{{.*}}CanonicalizerPass: canonicalize{{.*}} //----- // +// AFTER_ALL: func @bar() -// BEFORE_MODULE: // -----// IR Dump Before{{.*}}CSEPass (cse) ('func.func' operation: @foo) //----- // +// BEFORE_MODULE: // -----// IR Dump Before{{.*}}CSEPass: cse ('func.func' operation: @foo) //----- // // BEFORE_MODULE: func @foo() // BEFORE_MODULE: func @bar() -// BEFORE_MODULE: // -----// IR Dump Before{{.*}}CSEPass (cse) ('func.func' operation: @bar) //----- // +// BEFORE_MODULE: // -----// IR Dump Before{{.*}}CSEPass: cse ('func.func' operation: @bar) //----- // 
// BEFORE_MODULE: func @foo() // BEFORE_MODULE: func @bar() -// AFTER_ALL_CHANGE: // -----// IR Dump After{{.*}}CSEPass (cse) //----- // -// AFTER_ALL_CHANGE-NEXT: func @foo() -// AFTER_ALL_CHANGE-NOT: // -----// IR Dump After{{.*}}CSEPass (cse) //----- // +// AFTER_ALL_CHANGE: // -----// IR Dump After{{.*}}CSEPass: cse //----- // +// AFTER_ALL_CHANGE: func @foo() +// AFTER_ALL_CHANGE-NOT: // -----// IR Dump After{{.*}}CSEPass: cse //----- // // We expect that only 'foo' changed during CSE, and the second run of CSE did // nothing. // AFTER_FAILURE-NOT: // -----// IR Dump After{{.*}}CSE -// AFTER_FAILURE: // -----// IR Dump After{{.*}}TestFailurePass Failed (test-pass-failure) //----- // +// AFTER_FAILURE: // -----// IR Dump After{{.*}}TestFailurePass Failed: test-pass-failure{{.*}} //----- // // AFTER_FAILURE: func @foo() + +// OPTIONS: // -----// IR Dump Before{{.*}}CanonicalizerPass: canonicalize{cse-between-iterations=false{{[[:space:]]*}}max-iterations=5 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true} //----- // diff --git a/mlir/test/Pass/run-reproducer.mlir b/mlir/test/Pass/run-reproducer.mlir index 68f634d2038fc..5b3b21090d169 100644 --- a/mlir/test/Pass/run-reproducer.mlir +++ b/mlir/test/Pass/run-reproducer.mlir @@ -26,9 +26,9 @@ func.func @bar() { } #-} -// BEFORE: // -----// IR Dump Before{{.*}}CSEPass (cse) //----- // -// BEFORE-NEXT: func @foo() -// BEFORE: // -----// IR Dump Before{{.*}}CSEPass (cse) //----- // -// BEFORE-NEXT: func @bar() -// BEFORE-NOT: // -----// IR Dump Before{{.*}}CanonicalizerPass (canonicalize) //----- // +// BEFORE: // -----// IR Dump Before{{.*}}CSEPass: cse //----- // +// BEFORE: func @foo() +// BEFORE: // -----// IR Dump Before{{.*}}CSEPass: cse //----- // +// BEFORE: func @bar() +// BEFORE-NOT: // -----// IR Dump Before{{.*}}CanonicalizerPass: canonicalize //----- // // BEFORE-NOT: // -----// IR Dump After diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py 
index a097af92d1f0a..39b57d321033a 100644 --- a/mlir/test/python/pass_manager.py +++ b/mlir/test/python/pass_manager.py @@ -275,7 +275,7 @@ def testPrintIrAfterAll(): pm = PassManager.parse("builtin.module(canonicalize)") ctx.enable_multithreading(False) pm.enable_ir_printing() - # CHECK: // -----// IR Dump After CanonicalizerPass (canonicalize) //----- // + # CHECK: // -----// IR Dump After CanonicalizerPass: canonicalize{{.*}} //----- // # CHECK: module { # CHECK: func.func @main() { # CHECK: return @@ -301,14 +301,14 @@ def testPrintIrBeforeAndAfterAll(): pm = PassManager.parse("builtin.module(canonicalize)") ctx.enable_multithreading(False) pm.enable_ir_printing(print_before_all=True, print_after_all=True) - # CHECK: // -----// IR Dump Before CanonicalizerPass (canonicalize) //----- // + # CHECK: // -----// IR Dump Before CanonicalizerPass: canonicalize{{.*}} //----- // # CHECK: module { # CHECK: func.func @main() { # CHECK: %[[C10:.*]] = arith.constant 10 : i64 # CHECK: return # CHECK: } # CHECK: } - # CHECK: // -----// IR Dump After CanonicalizerPass (canonicalize) //----- // + # CHECK: // -----// IR Dump After CanonicalizerPass: canonicalize{{.*}} //----- // # CHECK: module { # CHECK: func.func @main() { # CHECK: return From a058bae4d2e2a3aadf0aa77c8ab2de7b4005efae Mon Sep 17 00:00:00 2001 From: sstwcw Date: Fri, 8 May 2026 13:06:25 +0000 Subject: [PATCH 037/538] [clang-format][NFC] Format with the new formatter (#196523) https://github.com/llvm/llvm-project/pull/173152#issuecomment-4403491044 --- clang/lib/Format/BreakableToken.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 9571a64797a2d..98d357c78bb7c 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -680,7 +680,7 @@ const llvm::StringSet<> BreakableBlockComment::ContentIndentingJavadocAnnotations = { "@param", "@return", "@returns", "@throws", "@type", 
"@template", "@see", "@deprecated", "@define", "@exports", "@mods", "@private", -}; + }; unsigned BreakableBlockComment::getContentIndent(unsigned LineIndex) const { if (!Style.isJava() && !Style.isJavaScript()) From 30778f83dcfc21fb15b18fb4e258269a37cdff4b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 8 May 2026 14:08:18 +0100 Subject: [PATCH 038/538] [X86] Add test coverage showing failure to fold freeze(fnearbyint(x)) -> fnearbyint(freeze(x)) (#196521) Use ftrunc + fnearbyint/fround/froundeven/frint/ftrunc/ffloor/fceil fold to show failure --- llvm/test/CodeGen/X86/freeze-unary.ll | 180 ++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll index bc9e29957c74a..a2c8be0d91e4a 100644 --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -461,3 +461,183 @@ define <16 x i8> @freeze_parity_vec(<16 x i8> %a0) nounwind { %w = and <16 x i8> %z, ret <16 x i8> %z } + +define float @ftrunc_freeze_fnearbyint(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_fnearbyint: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll nearbyintf +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_fnearbyint: +; X64: # %bb.0: +; X64-NEXT: roundss $12, %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.nearbyint.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} + +define float @ftrunc_freeze_fround(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_fround: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, 
(%esp) +; X86-NEXT: calll roundf +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_fround: +; X64: # %bb.0: +; X64-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: andps %xmm0, %xmm1 +; X64-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: addss %xmm0, %xmm1 +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm1, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.round.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} + +define float @ftrunc_freeze_froundeven(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_froundeven: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll roundevenf +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_froundeven: +; X64: # %bb.0: +; X64-NEXT: roundss $8, %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.roundeven.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} + +define float @ftrunc_freeze_frint(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_frint: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll rintf +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_frint: +; X64: # 
%bb.0: +; X64-NEXT: roundss $4, %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.rint.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} + +define float @ftrunc_freeze_ftrunc(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_ftrunc: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_ftrunc: +; X64: # %bb.0: +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.trunc.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} + +define float @ftrunc_freeze_ffloor(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_ffloor: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll floorf +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_ffloor: +; X64: # %bb.0: +; X64-NEXT: roundss $9, %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.floor.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} + +define float @ftrunc_freeze_fceil(float %a0) nounwind { +; X86-LABEL: ftrunc_freeze_fceil: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll ceilf +; X86-NEXT: fstps 
{{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll truncf +; X86-NEXT: addl $8, %esp +; X86-NEXT: retl +; +; X64-LABEL: ftrunc_freeze_fceil: +; X64: # %bb.0: +; X64-NEXT: roundss $10, %xmm0, %xmm0 +; X64-NEXT: roundss $11, %xmm0, %xmm0 +; X64-NEXT: retq + %f0 = call float @llvm.ceil.f32(float %a0) + %fr = freeze float %f0 + %ft = call float @llvm.trunc.f32(float %fr) + ret float %ft +} From 6b4f02426d53578871a11b6ce94553bc9931223f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 May 2026 09:10:25 -0400 Subject: [PATCH 039/538] [SLP] Account for GEP pointer-chain cost when root scalars feed load/store indices When every external use of the root TreeEntry's scalars is a GEP with a single load or store user (sharing one access type) and all lanes are consumed this way, charge the delta between the vector (unknown stride) and scalar (unit stride) pointer-chain costs once via TTI::getPointersChainCost, scaled for the root entry. Vectorizing such a root forces lane extracts or a vector GEP to drive address computation, which is typically more expensive than keeping the indices scalar in a unit-stride address chain. Reviewers: hiraditya, bababuck, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/192726 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 58 +++++++++++++++++++ .../SLPVectorizer/X86/minimum-sizes.ll | 30 ++++------ 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9cfeae0b8bee7..eec6499c7b724 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19173,9 +19173,32 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, // On AArch64, this helps in fusing a mov instruction, associated with // extractelement, with fmul in the backend so that extractelement is free. 
SmallVector, 4> ScalarUserAndIdx; + bool AllUsersGEPSWithStoresLoads = true; + SmallBitVector UsedLanes(VectorizableTree.front()->getVectorFactor()); + SmallVector Pointers; + Type *UserScalarTy = nullptr; for (ExternalUser &EU : ExternalUses) { ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane); + if (EU.E.Idx == 0) { + UsedLanes.set(EU.Lane); + auto *User = dyn_cast_if_present(EU.User); + if (User && User->hasOneUse() && + isa(User->user_back())) { + Type *LocalTy = getValueType(User->user_back()); + if (!UserScalarTy) { + UserScalarTy = LocalTy; + } else if (UserScalarTy != LocalTy) { + AllUsersGEPSWithStoresLoads = false; + break; + } + Pointers.push_back(User); + } else { + AllUsersGEPSWithStoresLoads = false; + break; + } + } } + AllUsersGEPSWithStoresLoads &= UsedLanes.all(); SmallDenseSet, 8> CheckedScalarUser; for (ExternalUser &EU : ExternalUses) { LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry " @@ -19447,6 +19470,41 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, ExtractCost += ExtraCost; } + // Charge the pointer-chain cost difference once for the root entry when + // every external use of its scalars is a GEP feeding a single load/store + // (see the detection loop above). Vectorizing the root in this pattern + // forces lane extracts (or a vector GEP with unknown stride) to drive the + // address computation, which is typically more expensive than keeping the + // indices scalar in a unit-stride address chain. Add the delta once rather + // than per external use. 
+ if (AllUsersGEPSWithStoresLoads && !Pointers.empty()) { + const TreeEntry &RootEntry = *VectorizableTree.front(); + const bool AnyRootKeptAsScalar = any_of(RootEntry.Scalars, [&](Value *V) { + return ExternalUsesAsOriginalScalar.contains(V); + }); + const Value *CommonBase = nullptr; + bool HaveCommonBase = true; + for (const Value *P : Pointers) { + const Value *Op = getUnderlyingObject(P); + if (!CommonBase) + CommonBase = Op; + else if (CommonBase != Op) { + HaveCommonBase = false; + break; + } + } + if (!AnyRootKeptAsScalar && HaveCommonBase) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto *VecTy = getWidenedType(UserScalarTy, RootEntry.Scalars.size()); + InstructionCost ScalarGEPCost = TTI->getPointersChainCost( + Pointers, CommonBase, TTI::PointersChainInfo::getUnitStride(), + UserScalarTy, CostKind); + InstructionCost VectorGEPCost = TTI->getPointersChainCost( + Pointers, CommonBase, TTI::PointersChainInfo::getUnknownStride(), + VecTy, CostKind); + ExtractCost += ScaleCost(VectorGEPCost - ScalarGEPCost, RootEntry); + } + } // Insert externals for extract of operands of casts to be emitted as scalars // instead of extractelement. 
for (Value *V : ScalarOpsFromCasts) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll index 58fb5f772207d..3b085ee821001 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -17,15 +17,12 @@ target triple = "x86_64-unknown-linux-gnu" define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; SSE-LABEL: @PR31243_zext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1) -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 -; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]] +; SSE-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3:%.*]] to i32 +; SSE-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5:%.*]] to i32 +; SSE-NEXT: [[T2:%.*]] = or i32 [[TMP4]], 1 +; SSE-NEXT: [[T3:%.*]] = or i32 [[TMP6]], 1 +; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[T2]] +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[T3]] ; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] @@ -76,15 +73,12 @@ entry: define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) { ; SSE-LABEL: @PR31243_sext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = or <2 x i8> 
[[TMP1]], splat (i8 1) -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3]] to i32 -; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5]] to i32 -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]] +; SSE-NEXT: [[TMP4:%.*]] = sext i8 [[TMP3:%.*]] to i32 +; SSE-NEXT: [[TMP6:%.*]] = sext i8 [[TMP5:%.*]] to i32 +; SSE-NEXT: [[T2:%.*]] = or i32 [[TMP4]], 1 +; SSE-NEXT: [[T3:%.*]] = or i32 [[TMP6]], 1 +; SSE-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[T2]] +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[T3]] ; SSE-NEXT: [[T6:%.*]] = load i8, ptr [[T4]], align 1 ; SSE-NEXT: [[T7:%.*]] = load i8, ptr [[T5]], align 1 ; SSE-NEXT: [[T8:%.*]] = add i8 [[T6]], [[T7]] From 6d9c57ba52c9511eb82722c38bc23f7cd6fea6a3 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 8 May 2026 14:11:45 +0100 Subject: [PATCH 040/538] [LLVM][InstSimplify] Refactor simplifyBinaryIntrinsic to remove Call operand. 
(#196309) --- .../llvm/Analysis/InstSimplifyFolder.h | 6 ++-- .../llvm/Analysis/InstructionSimplify.h | 4 +-- llvm/lib/Analysis/InstructionSimplify.cpp | 35 ++++++++++--------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/Analysis/InstSimplifyFolder.h b/llvm/include/llvm/Analysis/InstSimplifyFolder.h index 2832beb9e337c..e1f102fee9259 100644 --- a/llvm/include/llvm/Analysis/InstSimplifyFolder.h +++ b/llvm/include/llvm/Analysis/InstSimplifyFolder.h @@ -121,8 +121,10 @@ class LLVM_ABI InstSimplifyFolder final : public IRBuilderFolder { Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty, Instruction *FMFSource = nullptr) const override { - return simplifyBinaryIntrinsic(ID, Ty, LHS, RHS, SQ, - dyn_cast_if_present(FMFSource)); + FastMathFlags FMF; + if (auto *FPMO = dyn_cast_if_present(FMFSource)) + FMF = FPMO->getFastMathFlags(); + return simplifyBinaryIntrinsic(ID, Ty, LHS, RHS, FMF, SQ); } //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index b5fdd8422ab9a..488adad5cc1cf 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -198,8 +198,8 @@ LLVM_ABI Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, /// The \p `Call` argument is optional and may be null. LLVM_ABI Value *simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, Value *Op0, Value *Op1, - const SimplifyQuery &Q, - const CallBase *Call); + FastMathFlags FMF, + const SimplifyQuery &Q); /// Given operands for a ShuffleVectorInst, fold the result or return null. /// See class ShuffleVectorInst for a description of the mask representation. 
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 2bd0eb269510f..c842bdf8ff492 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -6722,7 +6722,7 @@ enum class MinMaxOptResult { // quieted), or to choose either option in the case of undef/poison. static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst, const Intrinsic::ID IID, - const CallBase *Call, + FastMathFlags FMF, Constant **OutNewConstVal) { assert(OutNewConstVal != nullptr); @@ -6758,15 +6758,14 @@ static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst, return MinMaxOptResult::UseOtherVal; } - if (CAPF.isInfinity() || (Call && Call->hasNoInfs() && CAPF.isLargest())) { + if (CAPF.isInfinity() || (FMF.noInfs() && CAPF.isLargest())) { // minnum(X, -inf) -> -inf (ignoring sNaN -> qNaN propagation) // maxnum(X, +inf) -> +inf (ignoring sNaN -> qNaN propagation) // minimum(X, -inf) -> -inf if nnan // maximum(X, +inf) -> +inf if nnan // minimumnum(X, -inf) -> -inf // maximumnum(X, +inf) -> +inf - if (CAPF.isNegative() == IsMin && - (!PropagateNaN || (Call && Call->hasNoNaNs()))) { + if (CAPF.isNegative() == IsMin && (!PropagateNaN || FMF.noNaNs())) { *OutNewConstVal = const_cast(RHSConst); return MinMaxOptResult::UseNewConstVal; } @@ -6777,8 +6776,7 @@ static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst, // maximum(X, -inf) -> X (ignoring quieting of sNaNs) // minimumnum(X, +inf) -> X if nnan // maximumnum(X, -inf) -> X if nnan - if (CAPF.isNegative() != IsMin && - (PropagateNaN || (Call && Call->hasNoNaNs()))) + if (CAPF.isNegative() != IsMin && (PropagateNaN || FMF.noNaNs())) return MinMaxOptResult::UseOtherVal; } return MinMaxOptResult::CannotOptimize; @@ -6841,19 +6839,18 @@ static Value *simplifySVEIntReduction(Intrinsic::ID IID, Type *ReturnType, } Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, - Value *Op0, Value *Op1, - const 
SimplifyQuery &Q, - const CallBase *Call) { + Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { unsigned BitWidth = ReturnType->getScalarSizeInBits(); switch (IID) { case Intrinsic::get_active_lane_mask: { if (match(Op1, m_Zero())) return ConstantInt::getFalse(ReturnType); - if (!Call) + if (!Q.CxtI) break; - const Function *F = Call->getFunction(); + const Function *F = Q.CxtI->getFunction(); auto *ScalableTy = dyn_cast(ReturnType); Attribute Attr = F->getFnAttribute(Attribute::VScaleRange); if (ScalableTy && Attr.isValid()) { @@ -7138,7 +7135,7 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, if (Constant *SplatVal = C->getSplatValue()) { // Handle splat vectors (including scalable vectors) - OptResult = OptimizeConstMinMax(SplatVal, IID, Call, &NewConst); + OptResult = OptimizeConstMinMax(SplatVal, IID, FMF, &NewConst); if (OptResult == MinMaxOptResult::UseNewConstVal) NewConst = ConstantVector::getSplat(ElemCount, NewConst); @@ -7157,7 +7154,7 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, OptResult = MinMaxOptResult::CannotOptimize; break; } - auto ElemResult = OptimizeConstMinMax(Elt, IID, Call, &NewConst); + auto ElemResult = OptimizeConstMinMax(Elt, IID, FMF, &NewConst); if (ElemResult == MinMaxOptResult::CannotOptimize || (ElemResult != OptResult && OptResult != MinMaxOptResult::UseEither && @@ -7174,7 +7171,7 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, } } else { // Handle scalar inputs - OptResult = OptimizeConstMinMax(C, IID, Call, &NewConst); + OptResult = OptimizeConstMinMax(C, IID, FMF, &NewConst); } if (OptResult == MinMaxOptResult::UseOtherVal || @@ -7253,9 +7250,13 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee, if (NumOperands == 1) return simplifyUnaryIntrinsic(F, Args[0], Q, Call); - if (NumOperands == 2) - return simplifyBinaryIntrinsic(IID, F->getReturnType(), Args[0], Args[1], Q, - Call); + if (NumOperands == 2) { 
+ FastMathFlags FMF; + if (auto *FPMO = dyn_cast(Call)) + FMF = FPMO->getFastMathFlags(); + return simplifyBinaryIntrinsic(IID, F->getReturnType(), Args[0], Args[1], + FMF, Q.getWithInstruction(Call)); + } // Handle intrinsics with 3 or more arguments. switch (IID) { From b1b1e19b03c83c3b1588d86c2272ab8effc85ef6 Mon Sep 17 00:00:00 2001 From: "forking-google-bazel-bot[bot]" <265904573+forking-google-bazel-bot[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 08:41:41 -0500 Subject: [PATCH 041/538] [Bazel] Fixes 1b38e21 (#196524) This fixes 1b38e21077dc469b0c67360440e4d19710ef053e. Co-authored-by: Google Bazel Bot --- .../llvm-project-overlay/libc/BUILD.bazel | 233 +++++++++++++++++- 1 file changed, 226 insertions(+), 7 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 3457e51a9cdd0..1143fb6f681ad 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1483,7 +1483,6 @@ libc_support_library( ], hdrs = [ "src/__support/File/linux/file.h", - "src/__support/File/linux/lseekImpl.h", ], deps = [ ":__support_alloc_checker", @@ -1494,6 +1493,7 @@ libc_support_library( ":__support_libc_errno", ":__support_macros_config", ":__support_osutil_fcntl", + ":__support_osutil_linux_syscall_wrappers_lseek", ":__support_osutil_syscall", ":hdr_fcntl_macros", ":hdr_stdint_proxy", @@ -1517,17 +1517,223 @@ libc_support_library( ) libc_support_library( - name = "__support_file_linux_lseekimpl", - hdrs = ["src/__support/File/linux/lseekImpl.h"], + name = "__support_osutil_linux_syscall_wrappers_chdir", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/chdir.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ], +) + 
+libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_dup", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/dup.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_dup2", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/dup2.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_dup3", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/dup3.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_fchdir", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/fchdir.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_fsync", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/fsync.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + 
":__support_macros_config", + ":__support_osutil_syscall", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_readlink", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/readlink.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ":types_ssize_t", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_readlinkat", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/readlinkat.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ":types_ssize_t", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_rename", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/rename.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_rmdir", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/rmdir.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_unlink", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/unlink.h"], + 
target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_unlinkat", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/unlinkat.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_access", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/access.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":__support_common", + ":__support_error_or", + ":__support_macros_config", + ":__support_osutil_syscall", + ":hdr_fcntl_macros", + ], +) + +libc_support_library( + name = "__support_osutil_linux_syscall_wrappers_lseek", + hdrs = ["src/__support/OSUtil/linux/syscall_wrappers/lseek.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), deps = [ ":__support_common", ":__support_error_or", - ":__support_libc_errno", ":__support_macros_config", ":__support_osutil_syscall", - ":errno", ":hdr_stdint_proxy", - ":hdr_stdio_overlay", ":types_off_t", ], ) @@ -13553,6 +13759,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_access", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -13567,6 +13774,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + 
":__support_osutil_linux_syscall_wrappers_chdir", ":__support_osutil_syscall", ":errno", ], @@ -13593,6 +13801,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_dup", ":__support_osutil_syscall", ":errno", ":hdr_unistd_macros", @@ -13607,6 +13816,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_dup2", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -13626,6 +13836,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_dup3", ":__support_osutil_syscall", ":errno", ":hdr_unistd_macros", @@ -13652,6 +13863,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_fchdir", ":__support_osutil_syscall", ":errno", ], @@ -13665,6 +13877,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_fsync", ":__support_osutil_syscall", ":errno", ], @@ -13832,9 +14045,9 @@ libc_function( hdrs = ["src/unistd/lseek.h"], deps = [ ":__support_common", - ":__support_file_linux_lseekimpl", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_lseek", ":__support_osutil_syscall", ":errno", ":hdr_unistd_macros", @@ -13912,6 +14125,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_readlink", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -13929,6 +14143,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_readlinkat", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -13946,6 +14161,7 @@ libc_function( 
":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_rmdir", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -14035,6 +14251,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_unlink", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -14049,6 +14266,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_unlinkat", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", @@ -14676,6 +14894,7 @@ libc_function( ":__support_common", ":__support_libc_errno", ":__support_macros_config", + ":__support_osutil_linux_syscall_wrappers_rename", ":__support_osutil_syscall", ":errno", ":hdr_fcntl_macros", From 6ef96710068e2fe1d1c4ebaa957ec215ff42c2c2 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 8 May 2026 08:41:58 -0500 Subject: [PATCH 042/538] [libclc] Canonicalize 'clspv' to the 'spirv-unknown-vulkan' triple (#196351) Summary: The libclc project has clspv support for exporting OpenCL standard library utilities to Vulkan consumers. This was previously exposed as a hack into the build system that renamed the triple and relied on macro defines. Recent changes allowed us to use `vulkan` as an OS for the spir-V target. This should make the intention more clear and allow the system to inherit the same triple handling the other targets use. Tested the build, but I will need @rjodinchr and @alan-baker to verify. 
--- libclc/CMakeLists.txt | 92 +++++++++---------- libclc/README.md | 7 +- libclc/clc/include/clc/clcfunc.h | 2 +- .../atomic/clc_atomic_compare_exchange.inc | 6 +- .../clc/lib/generic/atomic/clc_atomic_def.inc | 6 +- .../clc/lib/{clspv => vulkan}/CMakeLists.txt | 2 +- .../{clspv => vulkan}/integer/clc_mul_hi.cl | 4 +- .../lib/{clspv => vulkan}/math/clc_sw_fma.cl | 0 libclc/cmake/modules/AddLibclc.cmake | 4 +- .../lib/{clspv => vulkan}/CMakeLists.txt | 6 +- .../conversion/convert_float.inc | 0 .../conversion/convert_float2float.cl | 0 .../conversion/convert_float2int.cl | 0 .../conversion/convert_int2float.cl | 0 .../conversion/convert_integer.cl | 0 .../conversion/convert_integer.inc | 4 +- .../opencl/lib/{clspv => vulkan}/math/fma.cl | 0 .../{clspv => vulkan}/shared/vstore_half.cl | 0 .../{clspv => vulkan}/shared/vstore_half.inc | 0 libclc/test/CMakeLists.txt | 2 +- 20 files changed, 65 insertions(+), 70 deletions(-) rename libclc/clc/lib/{clspv => vulkan}/CMakeLists.txt (61%) rename libclc/clc/lib/{clspv => vulkan}/integer/clc_mul_hi.cl (77%) rename libclc/clc/lib/{clspv => vulkan}/math/clc_sw_fma.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/CMakeLists.txt (88%) rename libclc/opencl/lib/{clspv => vulkan}/conversion/convert_float.inc (100%) rename libclc/opencl/lib/{clspv => vulkan}/conversion/convert_float2float.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/conversion/convert_float2int.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/conversion/convert_int2float.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/conversion/convert_integer.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/conversion/convert_integer.inc (93%) rename libclc/opencl/lib/{clspv => vulkan}/math/fma.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/shared/vstore_half.cl (100%) rename libclc/opencl/lib/{clspv => vulkan}/shared/vstore_half.inc (100%) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index cb05fb7b662d8..cf1834a7ece63 100644 --- 
a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -23,7 +23,7 @@ option( ) # List of all supported architectures. -set( LIBCLC_ARCHS_ALL amdgpu amdgcn clspv clspv64 nvptx64 spirv spirv64 ) +set( LIBCLC_ARCHS_ALL amdgpu amdgcn nvptx64 spirv spirv32 spirv64 ) set(LIBCLC_TARGET ${LLVM_DEFAULT_TARGET_TRIPLE}) @@ -100,8 +100,8 @@ string( REPLACE "-" ";" TRIPLE ${LIBCLC_TARGET} ) list(GET TRIPLE 0 ARCH) list(GET TRIPLE 2 OS) -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv64) - if(NOT LIBCLC_USE_SPIRV_BACKEND AND NOT llvm-spirv_exe) +if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) + if(NOT OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND AND NOT llvm-spirv_exe) message(FATAL_ERROR "SPIR-V backend or llvm-spirv is required for libclc ${LIBCLC_TARGET}") endif() endif() @@ -120,12 +120,14 @@ if(ARCH STREQUAL amdgcn) add_subdirectory(opencl/lib/amdgpu) elseif(ARCH STREQUAL nvptx64) add_subdirectory(clc/lib/ptx-nvidiacl) -elseif(ARCH STREQUAL spirv OR ARCH STREQUAL spirv64) - add_subdirectory(clc/lib/spirv) - add_subdirectory(opencl/lib/spirv) -elseif(ARCH STREQUAL clspv OR ARCH STREQUAL clspv64) - add_subdirectory(clc/lib/clspv) - add_subdirectory(opencl/lib/clspv) +elseif(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) + if(OS STREQUAL vulkan) + add_subdirectory(clc/lib/vulkan) + add_subdirectory(opencl/lib/vulkan) + else() + add_subdirectory(clc/lib/spirv) + add_subdirectory(opencl/lib/spirv) + endif() endif() add_custom_target( libclc ALL ) @@ -133,30 +135,18 @@ add_custom_target( libclc ALL ) add_custom_target( libclc-opencl-builtins COMMENT "Build libclc OpenCL builtins" ) add_dependencies( libclc libclc-opencl-builtins ) -# Determine the clang target triple. +# Determine the clang target triple. Vulkan and SPIR-V backend targets use the +# triple directly; other SPIR-V targets fall back to the legacy SPIR target. 
set(clang_triple ${LIBCLC_TARGET}) -if(ARCH STREQUAL spirv AND LIBCLC_USE_SPIRV_BACKEND) - set(clang_triple spirv32--) -elseif(ARCH STREQUAL spirv64 AND LIBCLC_USE_SPIRV_BACKEND) - set(clang_triple spirv64--) -elseif(ARCH STREQUAL spirv OR ARCH STREQUAL clspv) - set(clang_triple spir--) -elseif(ARCH STREQUAL spirv64 OR ARCH STREQUAL clspv64) - set(clang_triple spir64--) -endif() - -# Determine the preprocessor identifier for this target. -set(MACRO_ARCH ${ARCH}) -if(ARCH STREQUAL spirv) - set(MACRO_ARCH SPIRV32) -elseif(ARCH STREQUAL spirv64) - set(MACRO_ARCH SPIRV64) -elseif(ARCH STREQUAL clspv) - set(MACRO_ARCH CLSPV32) -elseif(ARCH STREQUAL clspv64) - set(MACRO_ARCH CLSPV64) +if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) + if(NOT OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND) + if(ARCH STREQUAL spirv) + set(clang_triple spir--) + else() + set(clang_triple spir64--) + endif() + endif() endif() -string(TOUPPER "CLC_${MACRO_ARCH}" target_define) # Address space values. 
set(private_addrspace_val 0) @@ -164,7 +154,7 @@ set(generic_addrspace_val 0) if(ARCH STREQUAL amdgcn) set(private_addrspace_val 5) endif() -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv64) +if((ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) AND NOT OS STREQUAL vulkan) set(generic_addrspace_val 4) endif() @@ -173,13 +163,14 @@ set(target_compile_flags) set(target_extra_defines) set(opt_flags -O3) -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv64) - list(APPEND target_compile_flags -O0 -finline-hint-functions) - list(APPEND target_extra_defines CLC_SPIRV) - set(opt_flags) -elseif(ARCH STREQUAL clspv OR ARCH STREQUAL clspv64) - list(APPEND target_compile_flags -Wno-unknown-assumption -U__opencl_c_int64) - list(APPEND target_extra_defines CLC_CLSPV) +if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) + if(OS STREQUAL vulkan) + list(APPEND target_compile_flags -Wno-unknown-assumption -U__opencl_c_int64) + else() + list(APPEND target_compile_flags -O0 -finline-hint-functions) + list(APPEND target_extra_defines CLC_SPIRV) + set(opt_flags) + endif() elseif(ARCH STREQUAL amdgcn) list(APPEND target_compile_flags "SHELL:-Xclang -mcode-object-version=none") endif() @@ -190,19 +181,23 @@ if(ARCH STREQUAL amdgcn) list(APPEND _clc_overrides ${CLC_AMDGPU_SOURCES}) elseif(ARCH STREQUAL nvptx64 AND (OS STREQUAL nvidiacl OR OS STREQUAL cuda)) list(APPEND _clc_overrides ${CLC_PTX_NVIDIACL_SOURCES}) -elseif(ARCH STREQUAL spirv OR ARCH STREQUAL spirv64) - list(APPEND _clc_overrides ${CLC_SPIRV_SOURCES}) -elseif(ARCH STREQUAL clspv OR ARCH STREQUAL clspv64) - list(APPEND _clc_overrides ${CLC_CLSPV_SOURCES}) +elseif(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) + if(OS STREQUAL vulkan) + list(APPEND _clc_overrides ${CLC_VULKAN_SOURCES}) + else() + list(APPEND _clc_overrides ${CLC_SPIRV_SOURCES}) + endif() endif() libclc_merge_sources(clc_sources ${CLC_GENERIC_SOURCES} ${_clc_overrides}) -# Collect OpenCL sources. 
SPIR-V and Clspv targets use self-contained +# Collect OpenCL sources. SPIR-V and Vulkan targets use self-contained # subsets while others merge with target-specific overrides. -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv64) - set(opencl_sources ${OPENCL_SPIRV_SOURCES}) -elseif(ARCH STREQUAL clspv OR ARCH STREQUAL clspv64) - set(opencl_sources ${OPENCL_CLSPV_SOURCES}) +if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) + if(OS STREQUAL vulkan) + set(opencl_sources ${OPENCL_VULKAN_SOURCES}) + else() + set(opencl_sources ${OPENCL_SPIRV_SOURCES}) + endif() else() set(_opencl_overrides) if(ARCH STREQUAL amdgcn) @@ -231,7 +226,6 @@ set(compile_flags ) set(_common_defs - ${target_define} ${target_extra_defines} __CLC_PRIVATE_ADDRSPACE_VAL=${private_addrspace_val} __CLC_GENERIC_ADDRSPACE_VAL=${generic_addrspace_val} diff --git a/libclc/README.md b/libclc/README.md index e096a070ee878..169d59b75c5c6 100644 --- a/libclc/README.md +++ b/libclc/README.md @@ -50,12 +50,11 @@ cmake ../llvm -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release -DLLVM_RUNTIME_TARGETS="nvptx64-nvidia-cuda" ``` -#### Configure for CLSPV targets +#### Configure for Vulkan (clspv) targets ``` cmake ../llvm -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release \ - -DRUNTIMES_clspv--_LLVM_ENABLE_RUNTIMES=libclc \ - -DRUNTIMES_clspv64--_LLVM_ENABLE_RUNTIMES=libclc \ - -DLLVM_RUNTIME_TARGETS="clspv--;clspv64--" + -DRUNTIMES_spirv64-unknown-vulkan_LLVM_ENABLE_RUNTIMES=libclc \ + -DLLVM_RUNTIME_TARGETS="spirv64-unknown-vulkan" ``` #### Configure for SPIR-V targets diff --git a/libclc/clc/include/clc/clcfunc.h b/libclc/clc/include/clc/clcfunc.h index 5457a1892ac87..79b5f4230fac7 100644 --- a/libclc/clc/include/clc/clcfunc.h +++ b/libclc/clc/include/clc/clcfunc.h @@ -14,7 +14,7 @@ #define _CLC_INLINE inline #define _CLC_CONST __attribute__((const)) -#if defined(CLC_CLSPV) +#if defined(__VULKAN__) #define _CLC_DEF __attribute__((noinline)) 
__attribute__((clspv_libclc_builtin)) #else #define _CLC_DEF diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc index 07ef69d426768..d28e3c72e465e 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc @@ -8,16 +8,16 @@ #ifdef __CLC_SCALAR -#if defined(__SPIR32__) || defined(CLC_NVPTX) +#if defined(__SPIR32__) || defined(__NVPTX__) #if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) || \ (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32)) #define __CLC_HAS_ATOMIC #endif -#else // defined(__SPIR32__) || defined(CLC_NVPTX) +#else // defined(__SPIR32__) || defined(__NVPTX__) #if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) #define __CLC_HAS_ATOMIC #endif -#endif // defined(__SPIR32__) || defined(CLC_NVPTX) +#endif // defined(__SPIR32__) || defined(__NVPTX__) #ifdef __CLC_HAS_ATOMIC diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc index 3a313defe0c03..2af450ffbbe5c 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc +++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc @@ -8,16 +8,16 @@ #ifdef __CLC_SCALAR -#if defined(__SPIR32__) || defined(CLC_NVPTX) +#if defined(__SPIR32__) || defined(__NVPTX__) #if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) || \ (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32)) #define __CLC_HAS_ATOMIC #endif -#else // defined(__SPIR32__) || defined(CLC_NVPTX) +#else // defined(__SPIR32__) || defined(__NVPTX__) #if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) #define __CLC_HAS_ATOMIC #endif -#endif // defined(__SPIR32__) || defined(CLC_NVPTX) +#endif // defined(__SPIR32__) || defined(__NVPTX__) #ifdef __CLC_HAS_ATOMIC diff --git a/libclc/clc/lib/clspv/CMakeLists.txt b/libclc/clc/lib/vulkan/CMakeLists.txt similarity index 61% rename from libclc/clc/lib/clspv/CMakeLists.txt rename to 
libclc/clc/lib/vulkan/CMakeLists.txt index e54fccb10c3b7..172e3be32d65c 100644 --- a/libclc/clc/lib/clspv/CMakeLists.txt +++ b/libclc/clc/lib/vulkan/CMakeLists.txt @@ -1,4 +1,4 @@ -libclc_configure_source_list(CLC_CLSPV_SOURCES +libclc_configure_source_list(CLC_VULKAN_SOURCES ${CMAKE_CURRENT_SOURCE_DIR} integer/clc_mul_hi.cl math/clc_sw_fma.cl diff --git a/libclc/clc/lib/clspv/integer/clc_mul_hi.cl b/libclc/clc/lib/vulkan/integer/clc_mul_hi.cl similarity index 77% rename from libclc/clc/lib/clspv/integer/clc_mul_hi.cl rename to libclc/clc/lib/vulkan/integer/clc_mul_hi.cl index aab761736fff2..fdf4ac344341f 100644 --- a/libclc/clc/lib/clspv/integer/clc_mul_hi.cl +++ b/libclc/clc/lib/vulkan/integer/clc_mul_hi.cl @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// Opt-out of libclc mul_hi implementation for clspv. -// clspv has an internal implementation that does not required using a bigger +// Opt-out of libclc mul_hi implementation for Vulkan SPIR-V targets. +// clspv has an internal implementation that does not require using a bigger // data size. That implementation is based on OpMulExtended which is SPIR-V // specific, thus it cannot be written in OpenCL-C. 
diff --git a/libclc/clc/lib/clspv/math/clc_sw_fma.cl b/libclc/clc/lib/vulkan/math/clc_sw_fma.cl similarity index 100% rename from libclc/clc/lib/clspv/math/clc_sw_fma.cl rename to libclc/clc/lib/vulkan/math/clc_sw_fma.cl diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index e1e099d2c9869..41297e8eb1e92 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -111,7 +111,9 @@ function(link_libclc_builtin_library target_name) DEPENDS ${link_deps} ) - if(ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64) + string(REPLACE "-" ";" triple_parts "${ARG_TRIPLE}") + list(GET triple_parts 2 triple_os) + if((ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv32 OR ARG_ARCH STREQUAL spirv64) AND NOT triple_os STREQUAL vulkan) # SPIR-V targets produce a .spv file from the linked bitcode. set(builtins_lib ${library_dir}/${ARG_OUTPUT_FILENAME}.spv) if(LIBCLC_USE_SPIRV_BACKEND) diff --git a/libclc/opencl/lib/clspv/CMakeLists.txt b/libclc/opencl/lib/vulkan/CMakeLists.txt similarity index 88% rename from libclc/opencl/lib/clspv/CMakeLists.txt rename to libclc/opencl/lib/vulkan/CMakeLists.txt index 2b9aa33f6e178..5fa2e7a367678 100644 --- a/libclc/opencl/lib/clspv/CMakeLists.txt +++ b/libclc/opencl/lib/vulkan/CMakeLists.txt @@ -1,8 +1,8 @@ set(_gen ${CMAKE_CURRENT_SOURCE_DIR}/../generic) -# CLSPV uses a curated subset of generic builtins plus its own overrides, +# Vulkan uses a curated subset of generic builtins plus its own overrides, # so this list is self-contained rather than merged with the generic set. 
-libclc_configure_source_list(_clspv_sources +libclc_configure_source_list(_vulkan_sources ${CMAKE_CURRENT_SOURCE_DIR} conversion/convert_float2float.cl conversion/convert_float2int.cl @@ -85,4 +85,4 @@ libclc_configure_source_list(_gen_sources math/tgamma.cl ) -set(OPENCL_CLSPV_SOURCES ${_clspv_sources} ${_gen_sources} PARENT_SCOPE) +set(OPENCL_VULKAN_SOURCES ${_vulkan_sources} ${_gen_sources} PARENT_SCOPE) diff --git a/libclc/opencl/lib/clspv/conversion/convert_float.inc b/libclc/opencl/lib/vulkan/conversion/convert_float.inc similarity index 100% rename from libclc/opencl/lib/clspv/conversion/convert_float.inc rename to libclc/opencl/lib/vulkan/conversion/convert_float.inc diff --git a/libclc/opencl/lib/clspv/conversion/convert_float2float.cl b/libclc/opencl/lib/vulkan/conversion/convert_float2float.cl similarity index 100% rename from libclc/opencl/lib/clspv/conversion/convert_float2float.cl rename to libclc/opencl/lib/vulkan/conversion/convert_float2float.cl diff --git a/libclc/opencl/lib/clspv/conversion/convert_float2int.cl b/libclc/opencl/lib/vulkan/conversion/convert_float2int.cl similarity index 100% rename from libclc/opencl/lib/clspv/conversion/convert_float2int.cl rename to libclc/opencl/lib/vulkan/conversion/convert_float2int.cl diff --git a/libclc/opencl/lib/clspv/conversion/convert_int2float.cl b/libclc/opencl/lib/vulkan/conversion/convert_int2float.cl similarity index 100% rename from libclc/opencl/lib/clspv/conversion/convert_int2float.cl rename to libclc/opencl/lib/vulkan/conversion/convert_int2float.cl diff --git a/libclc/opencl/lib/clspv/conversion/convert_integer.cl b/libclc/opencl/lib/vulkan/conversion/convert_integer.cl similarity index 100% rename from libclc/opencl/lib/clspv/conversion/convert_integer.cl rename to libclc/opencl/lib/vulkan/conversion/convert_integer.cl diff --git a/libclc/opencl/lib/clspv/conversion/convert_integer.inc b/libclc/opencl/lib/vulkan/conversion/convert_integer.inc similarity index 93% rename from 
libclc/opencl/lib/clspv/conversion/convert_integer.inc rename to libclc/opencl/lib/vulkan/conversion/convert_integer.inc index dc0dad71ca445..072f36918831a 100644 --- a/libclc/opencl/lib/clspv/conversion/convert_integer.inc +++ b/libclc/opencl/lib/vulkan/conversion/convert_integer.inc @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -// Do not generate user-facing default conversions for clspv as they are handled -// natively. Do not generate user-facing "_rte" conversions for clspv as they +// Do not generate user-facing default conversions for Vulkan targets as they +// are handled natively. Do not generate user-facing "_rte" conversions as they // are handled natively. #define __CLC_DEFINE_FUNCTION_ROUNDING_MODE_SATURATION_IMPL(TYPE_SRC, MODE) \ diff --git a/libclc/opencl/lib/clspv/math/fma.cl b/libclc/opencl/lib/vulkan/math/fma.cl similarity index 100% rename from libclc/opencl/lib/clspv/math/fma.cl rename to libclc/opencl/lib/vulkan/math/fma.cl diff --git a/libclc/opencl/lib/clspv/shared/vstore_half.cl b/libclc/opencl/lib/vulkan/shared/vstore_half.cl similarity index 100% rename from libclc/opencl/lib/clspv/shared/vstore_half.cl rename to libclc/opencl/lib/vulkan/shared/vstore_half.cl diff --git a/libclc/opencl/lib/clspv/shared/vstore_half.inc b/libclc/opencl/lib/vulkan/shared/vstore_half.inc similarity index 100% rename from libclc/opencl/lib/clspv/shared/vstore_half.inc rename to libclc/opencl/lib/vulkan/shared/vstore_half.inc diff --git a/libclc/test/CMakeLists.txt b/libclc/test/CMakeLists.txt index 98566a517f604..4c54d93b26813 100644 --- a/libclc/test/CMakeLists.txt +++ b/libclc/test/CMakeLists.txt @@ -7,7 +7,7 @@ set(LIBCLC_TEST_DEPS umbrella_lit_testsuite_begin(check-libclc) # Testing unresolved symbols. -# Skip nvptx, clspv, spirv targets +# Skip nvptx and spirv targets. 
if(ARCH MATCHES amdgcn) foreach(tgt IN LISTS LIBCLC_UNRESOLVED_SYMBOL_TEST_TARGETS) set(target_file "$") From 944cc03a4d68fe0f0002040b5e85400a068a0e74 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 8 May 2026 14:44:07 +0100 Subject: [PATCH 043/538] [AArch64] Add sqneg tablegen patterns (#196265) This adds some tablegen patterns for sqneg instructions, largely copied from the equivalent MVE patterns. They perform a saturating negation, effectively just protecting against INT_MIN, which is equivalent to a `ssub_sat 0, R`. --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 18 ++++++++++++++++ llvm/test/CodeGen/AArch64/vqabs.ll | 24 +++++++-------------- llvm/test/CodeGen/AArch64/vqneg.ll | 24 +++++++-------------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index e0bfa5982b33d..6c20efffb1944 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6676,6 +6676,24 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", AArch64usqadd, int_aarch64_neon_usqadd>; +// ssub_sat(0, R) -> sqneg(R) +def : Pat<(v16i8 (ssubsat immAllZerosV, V128:$reg)), + (v16i8 (SQNEGv16i8 V128:$reg))>; +def : Pat<(v8i16 (ssubsat immAllZerosV, V128:$reg)), + (v8i16 (SQNEGv8i16 V128:$reg))>; +def : Pat<(v4i32 (ssubsat immAllZerosV, V128:$reg)), + (v4i32 (SQNEGv4i32 V128:$reg))>; +def : Pat<(v2i64 (ssubsat immAllZerosV, V128:$reg)), + (v2i64 (SQNEGv2i64 V128:$reg))>; +def : Pat<(v8i8 (ssubsat immAllZerosV, V64:$reg)), + (v8i8 (SQNEGv8i8 V64:$reg))>; +def : Pat<(v4i16 (ssubsat immAllZerosV, V64:$reg)), + (v4i16 (SQNEGv4i16 V64:$reg))>; +def : Pat<(v2i32 (ssubsat immAllZerosV, V64:$reg)), + (v2i32 (SQNEGv2i32 V64:$reg))>; +def : Pat<(v1i64 (ssubsat immAllZerosV, V64:$reg)), + (v1i64 (SQNEGv1i64 V64:$reg))>; + // Floating-point conversion patterns. 
multiclass IntegerToFPSIMDScalarPatterns { let Predicates = [HasFPRCVT] in { diff --git a/llvm/test/CodeGen/AArch64/vqabs.ll b/llvm/test/CodeGen/AArch64/vqabs.ll index a05bed23b14e2..7faba50d51e86 100644 --- a/llvm/test/CodeGen/AArch64/vqabs.ll +++ b/llvm/test/CodeGen/AArch64/vqabs.ll @@ -89,9 +89,8 @@ entry: define <16 x i8> @vqabs_sat_v16i8(<16 x i8> %A) { ; CHECK-LABEL: vqabs_sat_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.16b, v0.16b ; CHECK-NEXT: cmgt v2.16b, v0.16b, #0 -; CHECK-NEXT: sqsub v1.16b, v1.16b, v0.16b ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: @@ -104,9 +103,8 @@ entry: define <8 x i16> @vqabs_sat_v8i16(<8 x i16> %A) { ; CHECK-LABEL: vqabs_sat_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.8h, v0.8h ; CHECK-NEXT: cmgt v2.8h, v0.8h, #0 -; CHECK-NEXT: sqsub v1.8h, v1.8h, v0.8h ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: @@ -119,9 +117,8 @@ entry: define <4 x i32> @vqabs_sat_v4i32(<4 x i32> %A) { ; CHECK-LABEL: vqabs_sat_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.4s, v0.4s ; CHECK-NEXT: cmgt v2.4s, v0.4s, #0 -; CHECK-NEXT: sqsub v1.4s, v1.4s, v0.4s ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: @@ -134,9 +131,8 @@ entry: define <2 x i64> @vqabs_sat_v2i64(<2 x i64> %A) { ; CHECK-LABEL: vqabs_sat_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.2d, v0.2d ; CHECK-NEXT: cmgt v2.2d, v0.2d, #0 -; CHECK-NEXT: sqsub v1.2d, v1.2d, v0.2d ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: @@ -149,9 +145,8 @@ entry: define <8 x i8> @vqabs_sat_v8i8(<8 x i8> %A) { ; CHECK-LABEL: vqabs_sat_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.8b, v0.8b ; CHECK-NEXT: cmgt v2.8b, v0.8b, #0 -; CHECK-NEXT: 
sqsub v1.8b, v1.8b, v0.8b ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: @@ -164,9 +159,8 @@ entry: define <4 x i16> @vqabs_sat_v4i16(<4 x i16> %A) { ; CHECK-LABEL: vqabs_sat_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.4h, v0.4h ; CHECK-NEXT: cmgt v2.4h, v0.4h, #0 -; CHECK-NEXT: sqsub v1.4h, v1.4h, v0.4h ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: @@ -179,9 +173,8 @@ entry: define <2 x i32> @vqabs_sat_v2i32(<2 x i32> %A) { ; CHECK-LABEL: vqabs_sat_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sqneg v1.2s, v0.2s ; CHECK-NEXT: cmgt v2.2s, v0.2s, #0 -; CHECK-NEXT: sqsub v1.2s, v1.2s, v0.2s ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: @@ -194,9 +187,8 @@ entry: define <1 x i64> @vqabs_sat_v1i64(<1 x i64> %A) { ; CHECK-SD-LABEL: vqabs_sat_v1i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; CHECK-SD-NEXT: sqneg d1, d0 ; CHECK-SD-NEXT: cmgt d2, d0, #0 -; CHECK-SD-NEXT: sqsub d1, d1, d0 ; CHECK-SD-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/vqneg.ll b/llvm/test/CodeGen/AArch64/vqneg.ll index 7e91ea5a48289..351c8f4bb0db3 100644 --- a/llvm/test/CodeGen/AArch64/vqneg.ll +++ b/llvm/test/CodeGen/AArch64/vqneg.ll @@ -73,8 +73,7 @@ entry: define <16 x i8> @vqneg_sat_v16i8(<16 x i8> %A) { ; CHECK-LABEL: vqneg_sat_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sqneg v0.16b, v0.16b ; CHECK-NEXT: ret entry: %0 = tail call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> zeroinitializer, <16 x i8> %A) @@ -84,8 +83,7 @@ entry: define <8 x i16> @vqneg_sat_v8i16(<8 x i16> %A) { ; CHECK-LABEL: vqneg_sat_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h +; CHECK-NEXT: sqneg v0.8h, v0.8h ; 
CHECK-NEXT: ret entry: %0 = tail call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> zeroinitializer, <8 x i16> %A) @@ -95,8 +93,7 @@ entry: define <4 x i32> @vqneg_sat_v4i32(<4 x i32> %A) { ; CHECK-LABEL: vqneg_sat_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: sqneg v0.4s, v0.4s ; CHECK-NEXT: ret entry: %0 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> zeroinitializer, <4 x i32> %A) @@ -106,8 +103,7 @@ entry: define <2 x i64> @vqneg_sat_v2i64(<2 x i64> %A) { ; CHECK-LABEL: vqneg_sat_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: sqneg v0.2d, v0.2d ; CHECK-NEXT: ret entry: %0 = tail call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> zeroinitializer, <2 x i64> %A) @@ -117,8 +113,7 @@ entry: define <8 x i8> @vqneg_sat_v8i8(<8 x i8> %A) { ; CHECK-LABEL: vqneg_sat_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sqneg v0.8b, v0.8b ; CHECK-NEXT: ret entry: %0 = tail call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> zeroinitializer, <8 x i8> %A) @@ -128,8 +123,7 @@ entry: define <4 x i16> @vqneg_sat_v4i16(<4 x i16> %A) { ; CHECK-LABEL: vqneg_sat_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h +; CHECK-NEXT: sqneg v0.4h, v0.4h ; CHECK-NEXT: ret entry: %0 = tail call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> zeroinitializer, <4 x i16> %A) @@ -139,8 +133,7 @@ entry: define <2 x i32> @vqneg_sat_v2i32(<2 x i32> %A) { ; CHECK-LABEL: vqneg_sat_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s +; CHECK-NEXT: sqneg v0.2s, v0.2s ; CHECK-NEXT: ret entry: %0 = tail call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> zeroinitializer, <2 x i32> %A) @@ -150,8 +143,7 @@ entry: define <1 x i64> 
@vqneg_sat_v1i64(<1 x i64> %A) { ; CHECK-SD-LABEL: vqneg_sat_v1i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT: sqsub d0, d1, d0 +; CHECK-SD-NEXT: sqneg d0, d0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vqneg_sat_v1i64: From 854197d054e605bfda53395d533a7fd25bf01585 Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada Date: Fri, 8 May 2026 22:50:18 +0900 Subject: [PATCH 044/538] [llvm-objcopy] Add a missing dependency (#196531) --- llvm/tools/llvm-objcopy/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/tools/llvm-objcopy/CMakeLists.txt b/llvm/tools/llvm-objcopy/CMakeLists.txt index 10897769d2826..fa3105e82efcf 100644 --- a/llvm/tools/llvm-objcopy/CMakeLists.txt +++ b/llvm/tools/llvm-objcopy/CMakeLists.txt @@ -35,6 +35,7 @@ add_llvm_tool(llvm-objcopy ObjcopyOptsTableGen InstallNameToolOptsTableGen StripOptsTableGen + ExtractBundleEntryOptsTableGen GENERATE_DRIVER ) From 8ece18da53d9d8c4404b78ded5807c6e7109e37b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 8 May 2026 08:54:05 -0500 Subject: [PATCH 045/538] [LLVM] Remove FFI forwarding prefix for liboffload (#196518) Summary: The dependency on libffi was removed awhile back but neglected to remove this. 
--- llvm/runtimes/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 1e074526d0f7c..812cd387c6596 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -307,7 +307,6 @@ function(runtime_default_target) PASSTHROUGH_PREFIXES LLVM_ENABLE_RUNTIMES LLVM_USE_LINKER CUDA CMAKE_CUDA # For runtimes that may look for the CUDA compiler and/or SDK (libc, offload, flang-rt) - FFI # offload uses libffi FLANG_RUNTIME # Shared between Flang and Flang-RT ${ARG_PREFIXES} EXTRA_TARGETS ${extra_targets} From 90a2a8e6d0427804233b0aa58969b65bae87e425 Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Fri, 8 May 2026 10:06:51 -0400 Subject: [PATCH 046/538] [HLSL] Add type traits for ConstantBuffers templates (#195154) This commit adds the type traits to restrict the template type in a ConstantBuffer to structs or classes that do not contain a resource type. Assisted-by: Gemini ------------------------- - main - https://github.com/llvm/llvm-project/pull/195151 - https://github.com/llvm/llvm-project/pull/195152 - https://github.com/llvm/llvm-project/pull/195153 - https://github.com/llvm/llvm-project/pull/195154 :point_left: [Stack](https://www.git-town.com/how-to/proposal-breadcrumb.html) generated by [Git Town](https://github.com/git-town/git-town) --- clang/include/clang/Basic/TokenKinds.def | 1 + clang/include/clang/Sema/SemaHLSL.h | 1 + clang/lib/Sema/HLSLExternalSemaSource.cpp | 50 +++++++++++-- clang/lib/Sema/SemaHLSL.cpp | 13 ++++ clang/lib/Sema/SemaTypeTraits.cpp | 9 +++ .../SemaHLSL/BuiltIns/ConstantBuffers.hlsl | 35 --------- .../SemaHLSL/Resources/ConstantBuffers.hlsl | 74 +++++++++++++++++++ 7 files changed, 142 insertions(+), 41 deletions(-) delete mode 100644 clang/test/SemaHLSL/BuiltIns/ConstantBuffers.hlsl create mode 100644 clang/test/SemaHLSL/Resources/ConstantBuffers.hlsl diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def 
index 005d81b5b9282..f07d8ebb75035 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -691,6 +691,7 @@ KEYWORD(column_major , KEYHLSL) TYPE_TRAIT_2(__builtin_hlsl_is_scalarized_layout_compatible, IsScalarizedLayoutCompatible, KEYHLSL) TYPE_TRAIT_1(__builtin_hlsl_is_intangible, IsIntangibleType, KEYHLSL) TYPE_TRAIT_1(__builtin_hlsl_is_typed_resource_element_compatible, IsTypedResourceElementCompatible, KEYHLSL) +TYPE_TRAIT_1(__builtin_hlsl_is_constant_buffer_element_compatible, IsConstantBufferElementCompatible, KEYHLSL) // OpenMP Type Traits UNARY_EXPR_OR_TYPE_TRAIT(__builtin_omp_required_simd_align, OpenMPRequiredSimdAlign, KEYALL) diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 68c2f209976c4..e65de5d4aa4c3 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -216,6 +216,7 @@ class SemaHLSL : public SemaBase { // HLSL Type trait implementations bool IsScalarizedLayoutCompatible(QualType T1, QualType T2) const; bool IsTypedResourceElementCompatible(QualType T1); + bool IsConstantBufferElementCompatible(QualType T1); bool CheckCompatibleParameterABI(FunctionDecl *New, FunctionDecl *Old); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 9769eee10ae2f..449b32a215631 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -365,6 +365,32 @@ static Expr *constructTypedBufferConstraintExpr(Sema &S, SourceLocation NameLoc, return TypedResExpr; } +// This function is responsible for constructing the constraint expression for +// this concept: +// template concept is_constant_buffer_element_compatible = +// std::is_class_v && !__is_intangible(T); +static Expr *constructConstantBufferConstraintExpr(Sema &S, + SourceLocation NameLoc, + TemplateTypeParmDecl *T) { + ASTContext &Context = S.getASTContext(); + + // Obtain the QualType for 
'bool' + QualType BoolTy = Context.BoolTy; + + // Create a QualType that points to this TemplateTypeParmDecl + QualType TType = Context.getTypeDeclType(T); + + // Create a TypeSourceInfo for the template type parameter 'T' + TypeSourceInfo *TTypeSourceInfo = + Context.getTrivialTypeSourceInfo(TType, NameLoc); + + TypeTraitExpr *ResExpr = TypeTraitExpr::Create( + Context, BoolTy, NameLoc, UTT_IsConstantBufferElementCompatible, + {TTypeSourceInfo}, NameLoc, true); + + return ResExpr; +} + // This function is responsible for constructing the constraint expression for // this concept: // template concept is_structured_resource_element_compatible = @@ -415,8 +441,10 @@ static Expr *constructStructuredBufferConstraintExpr(Sema &S, return CombinedExpr; } +enum class HLSLBufferType { Typed, Structured, Constant }; + static ConceptDecl *constructBufferConceptDecl(Sema &S, NamespaceDecl *NSD, - bool isTypedBuffer) { + HLSLBufferType BT) { ASTContext &Context = S.getASTContext(); DeclContext *DC = NSD->getDeclContext(); SourceLocation DeclLoc = SourceLocation(); @@ -440,14 +468,22 @@ static ConceptDecl *constructBufferConceptDecl(Sema &S, NamespaceDecl *NSD, DeclarationName DeclName; Expr *ConstraintExpr = nullptr; - if (isTypedBuffer) { + switch (BT) { + case HLSLBufferType::Typed: DeclName = DeclarationName( &Context.Idents.get("__is_typed_resource_element_compatible")); ConstraintExpr = constructTypedBufferConstraintExpr(S, DeclLoc, T); - } else { + break; + case HLSLBufferType::Structured: DeclName = DeclarationName( &Context.Idents.get("__is_structured_resource_element_compatible")); ConstraintExpr = constructStructuredBufferConstraintExpr(S, DeclLoc, T); + break; + case HLSLBufferType::Constant: + DeclName = DeclarationName( + &Context.Idents.get("__is_constant_buffer_element_compatible")); + ConstraintExpr = constructConstantBufferConstraintExpr(S, DeclLoc, T); + break; } // Create a ConceptDecl @@ -468,12 +504,14 @@ void 
HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { ASTContext &AST = SemaPtr->getASTContext(); CXXRecordDecl *Decl; ConceptDecl *TypedBufferConcept = constructBufferConceptDecl( - *SemaPtr, HLSLNamespace, /*isTypedBuffer*/ true); + *SemaPtr, HLSLNamespace, HLSLBufferType::Typed); ConceptDecl *StructuredBufferConcept = constructBufferConceptDecl( - *SemaPtr, HLSLNamespace, /*isTypedBuffer*/ false); + *SemaPtr, HLSLNamespace, HLSLBufferType::Structured); + ConceptDecl *ConstantBufferConcept = constructBufferConceptDecl( + *SemaPtr, HLSLNamespace, HLSLBufferType::Constant); Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "ConstantBuffer") - .addSimpleTemplateParams({"element_type"}) + .addSimpleTemplateParams({"element_type"}, ConstantBufferConcept) .finalizeForwardDeclaration(); onCompletion(Decl, [this](CXXRecordDecl *Decl) { diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 594a18f0b8c78..4a7df5b4266f6 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -4700,6 +4700,19 @@ static void BuildFlattenedTypeList(QualType BaseTy, } } +bool SemaHLSL::IsConstantBufferElementCompatible(clang::QualType QT) { + if (QT.isNull()) + return false; + + // Must be a class/struct. + const auto *RD = QT->getAsCXXRecordDecl(); + if (!RD || RD->isUnion()) + return false; + + // Cannot be a resource type or contain one. + return !QT->isHLSLIntangibleType(); +} + bool SemaHLSL::IsTypedResourceElementCompatible(clang::QualType QT) { // null and array types are not allowed. 
if (QT.isNull() || QT->isArrayType()) diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index a94a59e8add7b..c79b3f7045ca6 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -367,6 +367,7 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, case UTT_IsCompound: case UTT_IsMemberPointer: case UTT_IsTypedResourceElementCompatible: + case UTT_IsConstantBufferElementCompatible: // Fall-through // These traits are modeled on type predicates in C++0x [meta.unary.prop] @@ -1131,6 +1132,14 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, return false; return Self.HLSL().IsTypedResourceElementCompatible(T); + + case UTT_IsConstantBufferElementCompatible: + assert(Self.getLangOpts().HLSL && + "constant buffer element compatible types are an HLSL-only feature"); + if (T->isIncompleteType()) + return false; + + return Self.HLSL().IsConstantBufferElementCompatible(T); } } diff --git a/clang/test/SemaHLSL/BuiltIns/ConstantBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/ConstantBuffers.hlsl deleted file mode 100644 index 10c65031b79f2..0000000000000 --- a/clang/test/SemaHLSL/BuiltIns/ConstantBuffers.hlsl +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -finclude-default-header -fsyntax-only -verify %s - -struct S { // expected-note 3 {{candidate constructor}} - float a; - int b; -}; - -struct Empty {}; - -struct ContainsResource { - Texture2D tex; -}; - -union U { - float a; - int b; -}; - -// Valid -ConstantBuffer cb; -ConstantBuffer cb_empty; - -void takes_inout_s(inout S s) {} - -void foo() { - // This case should fail because we cannot writeback to `cb` after the call. 
- // expected-error@+1 {{no viable constructor copying parameter of type 'const hlsl_constant S'}} - takes_inout_s(cb); -} - -void test_direct_assignment() { - // expected-error@+2 {{cannot assign to return value because function 'operator const hlsl_constant S &' returns a const value}} - // expected-note@* {{function 'operator const hlsl_constant S &' which returns const-qualified type 'const hlsl_constant S &' declared here}} - cb.a = 5.0; -} diff --git a/clang/test/SemaHLSL/Resources/ConstantBuffers.hlsl b/clang/test/SemaHLSL/Resources/ConstantBuffers.hlsl new file mode 100644 index 0000000000000..0ef3ada50c988 --- /dev/null +++ b/clang/test/SemaHLSL/Resources/ConstantBuffers.hlsl @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -finclude-default-header -fsyntax-only -verify %s + +struct S { // expected-note 3 {{candidate constructor}} + float a; + int b; +}; + +struct Empty {}; + +struct ContainsResource { + Texture2D tex; +}; + +union U { + float a; + int b; +}; + +// Valid +ConstantBuffer cb; +ConstantBuffer cb_empty; + +// Invalid: non-struct/class +// expected-error@+1 {{constraints not satisfied for class template 'ConstantBuffer'}} +ConstantBuffer cb_float; +// expected-note@* {{because 'float' does not satisfy '__is_constant_buffer_element_compatible'}} +// expected-note@* {{because '__builtin_hlsl_is_constant_buffer_element_compatible(float)' evaluated to false}} + +// expected-error@+1 {{constraints not satisfied for class template 'ConstantBuffer'}} +ConstantBuffer cb_float4; +// expected-note@* {{because 'float4' (aka 'vector') does not satisfy '__is_constant_buffer_element_compatible'}} +// expected-note@* {{because '__builtin_hlsl_is_constant_buffer_element_compatible(vector)' evaluated to false}} + +// expected-error@+1 {{constraints not satisfied for class template 'ConstantBuffer'}} +ConstantBuffer cb_array; +// expected-note@* {{because 'float[4]' does not satisfy '__is_constant_buffer_element_compatible'}} 
+// expected-note@* {{because '__builtin_hlsl_is_constant_buffer_element_compatible(float[4])' evaluated to false}} + +// Invalid: contains resource +// expected-error@+1 {{constraints not satisfied for class template 'ConstantBuffer'}} +ConstantBuffer cb_res; +// expected-note@* {{because 'ContainsResource' does not satisfy '__is_constant_buffer_element_compatible'}} +// expected-note@* {{because '__builtin_hlsl_is_constant_buffer_element_compatible(ContainsResource)' evaluated to false}} + +// Invalid: intangible type +// expected-error@+1 {{use of class template 'Texture2D' requires template arguments}} +ConstantBuffer cb_tex; +// expected-note@* {{template declaration from hidden source}} + +// Invalid: intangible type +// expected-error@+1 {{constraints not satisfied for class template 'ConstantBuffer'}} +ConstantBuffer> cb_tex; +// expected-note@* {{because 'Texture2D' does not satisfy '__is_constant_buffer_element_compatible'}} +// expected-note@*:* {{because '__builtin_hlsl_is_constant_buffer_element_compatible(hlsl::Texture2D)' evaluated to false}} + +// Invalid: union +// expected-error@+1 {{constraints not satisfied for class template 'ConstantBuffer'}} +ConstantBuffer cb_union; +// expected-note@* {{because 'U' does not satisfy '__is_constant_buffer_element_compatible'}} +// expected-note@* {{because '__builtin_hlsl_is_constant_buffer_element_compatible(U)' evaluated to false}} + +void takes_inout_s(inout S s) {} + +void foo() { + // This case should fail because we cannot writeback to `cb` after the call. 
+ // expected-error@+1 {{no viable constructor copying parameter of type 'const hlsl_constant S'}} + takes_inout_s(cb); +} + +void test_direct_assignment() { + // expected-error@+2 {{cannot assign to return value because function 'operator const hlsl_constant S &' returns a const value}} + // expected-note@* {{function 'operator const hlsl_constant S &' which returns const-qualified type 'const hlsl_constant S &' declared here}} + cb.a = 5.0; +} From fb04e8fbb5db10d3dedffbafa86d2b24234123cf Mon Sep 17 00:00:00 2001 From: StefanPaulet <65234821+StefanPaulet@users.noreply.github.com> Date: Fri, 8 May 2026 17:11:57 +0300 Subject: [PATCH 047/538] [clang] Improved diagnostics for explicit specialization/instantiation of closure type members (#192843) Follow-up from #191419: - Added separate diagnostics for explicit instantiation and explicit specialization of a closure type member. - Diagnostic for explicit instantiation is now issued at the point of instantiation and not at the lambda declaration. - Added note with `defined here` pointing to lambda declaration for both diagnostics. --------- Co-authored-by: Vlad Serebrennikov --- clang/docs/ReleaseNotes.rst | 2 ++ .../clang/Basic/DiagnosticSemaKinds.td | 4 +-- clang/lib/Sema/SemaDecl.cpp | 11 +++++--- clang/lib/Sema/SemaTemplate.cpp | 12 +++++++- clang/test/CXX/drs/cwg17xx.cpp | 28 +++++++++++++------ clang/test/SemaCXX/lambda-expressions.cpp | 14 +++++++--- 6 files changed, 51 insertions(+), 20 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d700af6a82290..700fbe4304141 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -165,6 +165,8 @@ C++17 Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Implemented `CWG1780 Explicit instantiation/specialization of generic lambda + operator() `_ - Clang now allows omitting ``typename`` before a template name in a conversion operator, implementing `CWG2413 `_. 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c15a9ec1ff0f6..9605c02b819d4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -8951,8 +8951,8 @@ let CategoryName = "Lambda Issue" in { def warn_cxx11_compat_generic_lambda : Warning< "generic lambdas are incompatible with C++11">, InGroup, DefaultIgnore; - def err_lambda_explicit_spec : Error< - "lambda call operator should not be explicitly specialized or instantiated">; + def err_lambda_explicit_temp_spec : Error< + "a member of a lambda should not be explicitly %select{specialized|instantiated}0">; // C++17 '*this' captures. def warn_cxx14_compat_star_this_lambda_capture : Warning< diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index eb5b6d65b4d58..7c5bcd56b346c 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16318,10 +16318,13 @@ Decl *Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Decl *D, // have the LSI properly restored. if (isGenericLambdaCallOperatorSpecialization(FD)) { // C++2c 7.5.5.2p17 A member of a closure type shall not be explicitly - // instantiated, explicitly specialized. - if (FD->getTemplateSpecializationInfo() - ->isExplicitInstantiationOrSpecialization()) { - Diag(FD->getLocation(), diag::err_lambda_explicit_spec); + // specialized. 
+ if (FD->getTemplateSpecializationInfo()->isExplicitSpecialization()) { + Diag(FD->getLocation(), diag::err_lambda_explicit_temp_spec) + << /*specialization*/ 0; + CXXRecordDecl *RD = cast(FD->getParent()); + Diag(RD->getLocation(), diag::note_defined_here) << RD; + FD->setInvalidDecl(); PushFunctionScope(); } else { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 8c6ae204d6d19..174f42caac506 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -11138,8 +11138,18 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S, // Let the ASTConsumer know that this function has been explicitly // instantiated now, and its linkage might have changed. Consumer.HandleTopLevelDecl(DeclGroupRef(Specialization)); - } else if (TSK == TSK_ExplicitInstantiationDefinition) + } else if (TSK == TSK_ExplicitInstantiationDefinition) { + // C++2c [expr.prim.lambda.closure]/19 A member of a closure type shall not + // be explicitly instantiated. 
+ if (const auto *RD = dyn_cast(Specialization->getParent()); + RD && RD->isLambda()) { + Diag(D.getBeginLoc(), diag::err_lambda_explicit_temp_spec) + << /*instantiation*/ 1; + Diag(RD->getLocation(), diag::note_defined_here) << RD; + return (Decl *)nullptr; + } InstantiateFunctionDefinition(D.getIdentifierLoc(), Specialization); + } // C++0x [temp.explicit]p2: // If the explicit instantiation is for a member function, a member class diff --git a/clang/test/CXX/drs/cwg17xx.cpp b/clang/test/CXX/drs/cwg17xx.cpp index d2ea5b6f2ae52..7466c0cfb6048 100644 --- a/clang/test/CXX/drs/cwg17xx.cpp +++ b/clang/test/CXX/drs/cwg17xx.cpp @@ -226,28 +226,38 @@ using Bind = Instantiate::template Bind, Argument>; namespace cwg1780 { // cwg1780: 23 #if __cplusplus >= 201103L +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#elif __cplusplus >= 201103L +#define CONSTEXPR +#endif + auto l = []() -> int { return 5; }; using L = decltype(l); class A { -#if __cplusplus >= 201703L - friend constexpr auto L::operator()() const -> int; // expected-error{{a member of a lambda should not be the target of a friend declaration}} -#else - friend auto L::operator()() const -> int; // expected-error{{a member of a lambda should not be the target of a friend declaration}} -#endif + friend CONSTEXPR auto L::operator()() const -> int; + // since-cxx11-error@-1 {{a member of a lambda should not be the target of a friend declaration}} }; +#undef CONSTEXPR + #if __cplusplus >= 201402L -auto gl = [](auto a) { return 5; }; +auto gl = [](auto a) { return 5; }; // #cwg1780-spec using GL = decltype(gl); template <> -auto GL::operator()(int a) const { // expected-error{{lambda call operator should not be explicitly specialized or instantiated}} +auto GL::operator()(int a) const { +// since-cxx11-error@-1 {{a member of a lambda should not be explicitly specialized}} +// since-cxx11-note-re@#cwg1780-spec {{{{'\(lambda at .+\)'}} defined here}} return 6; } -auto gll = [](auto a) { return 5; }; // 
expected-error{{lambda call operator should not be explicitly specialized or instantiated}} +auto gll = [](auto a) -> int { return 5; }; // #cwg1780-inst + using GLL = decltype(gll); -template auto GLL::operator()(int a) const; // expected-note{{in instantiation of function template specialization 'cwg1780::(lambda)::operator()' requested here}} +template auto GLL::operator()(int a) const -> int; +// since-cxx11-error@-1 {{a member of a lambda should not be explicitly instantiated}} +// since-cxx11-note-re@#cwg1780-inst {{{{'\(lambda at .+\)'}} defined here}} #endif #endif diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp index 6ae7ac9888b41..1b96005f8d442 100644 --- a/clang/test/SemaCXX/lambda-expressions.cpp +++ b/clang/test/SemaCXX/lambda-expressions.cpp @@ -788,18 +788,24 @@ void GH67492() { // FIXME: This currently causes clang to crash in C++11 mode. #if __cplusplus >= 201402L namespace GH83267 { -auto l = [](auto a) { return 1; }; +auto l = [](auto a) { return 1; }; // #l-gh83267 using type = decltype(l); template<> -auto type::operator()(int a) const { // expected-error{{lambda call operator should not be explicitly specialized or instantiated}} +auto type::operator()(int a) const { // expected-error {{a member of a lambda should not be explicitly specialized}} + // expected-note@#l-gh83267 {{defined here}} return c; // expected-error {{use of undeclared identifier 'c'}} } -auto ll = [](auto a) { return 1; }; // expected-error{{lambda call operator should not be explicitly specialized or instantiated}} +auto ll = [](auto a) -> int { return 1; }; // #ll-gh83267 using t = decltype(ll); -template auto t::operator()(int a) const; // expected-note {{in instantiation}} +template auto t::operator()(int a) const -> int; // expected-error {{a member of a lambda should not be explicitly instantiated}} + // expected-note@#ll-gh83267 {{defined here}} +template +using cll = int(*)(T); +template t::operator cll() const; // 
expected-error {{a member of a lambda should not be explicitly instantiated}} + // expected-note@#ll-gh83267 {{defined here}} } #endif From 52fbf34e35194d05951dfd76a45cc886cf3e4ba5 Mon Sep 17 00:00:00 2001 From: TPPPP Date: Fri, 8 May 2026 22:22:55 +0800 Subject: [PATCH 048/538] [Clang] Fix stack-use-after-return in TryArrayCopy by allocating OpaqueValueExpr on the ASTContext (#192080) Change the `OpaqueValueExpr` in `TryArrayCopy` from stack memory to heap memory to avoid stack-use-after-return. Fixes #192026 --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaInit.cpp | 11 +++++++---- clang/test/SemaCXX/gh192026.cpp | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 clang/test/SemaCXX/gh192026.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 700fbe4304141..2a7c315192f2d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -539,6 +539,8 @@ Bug Fixes in This Version - Clang now emits an error for friend declarations of lambda members. (#GH26540) - Fixed a crash caused by lambda capture handling in delayed default arguments. (#GH176534) - Fixed a crash when parsing invalid ``static_assert`` declarations with string-literal messages (#GH187690). 
+- Fixed a potential stack-use-after-return issue in Clang when copy-initializing + an array via an element-at-a-time copy loop (#GH192026) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 92fc73814deb8..ceac3722376fa 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4380,10 +4380,13 @@ static void TryArrayCopy(Sema &S, const InitializationKind &Kind, InitializedEntity::InitializeElement(S.Context, 0, Entity); QualType InitEltT = S.Context.getAsArrayType(Initializer->getType())->getElementType(); - OpaqueValueExpr OVE(Initializer->getExprLoc(), InitEltT, - Initializer->getValueKind(), - Initializer->getObjectKind()); - Expr *OVEAsExpr = &OVE; + + // FIXME: Here's a functional memory leak cuz we don't have a temporary + // allocator at the moment + OpaqueValueExpr *OVE = new (S.Context) OpaqueValueExpr( + Initializer->getExprLoc(), InitEltT, Initializer->getValueKind(), + Initializer->getObjectKind()); + Expr *OVEAsExpr = OVE; Sequence.InitializeFrom(S, Element, Kind, OVEAsExpr, /*TopLevelOfInitList*/ false, TreatUnavailableAsInvalid); diff --git a/clang/test/SemaCXX/gh192026.cpp b/clang/test/SemaCXX/gh192026.cpp new file mode 100644 index 0000000000000..3b179f8420119 --- /dev/null +++ b/clang/test/SemaCXX/gh192026.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +struct ControlSwitcher { bool b; }; + +class ComplexChain { + volatile union { + char flag_byte; + int ref_count; + } state_flags[5]; // expected-note {{copy constructor of 'ComplexChain' is implicitly deleted because field 'state_flags' has no copy constructor}} + + ControlSwitcher cs{true}; + + ComplexChain trigger_bug() { + return *this; // expected-error {{call to implicitly-deleted copy constructor of 'ComplexChain'}} + } +}; From b14eb2fc0db35080e6da8df26e329e70ebae4552 Mon Sep 17 00:00:00 2001 From: jumerckx <31353884+jumerckx@users.noreply.github.com> Date: Fri, 
8 May 2026 16:32:54 +0200 Subject: [PATCH 049/538] [mlir] Fix C++ name hiding bug in PDLPatternMatch for Op classes (#195554) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Native constraints with a named operation operand type failed because of name hiding in `ProcessPDLValue`. i.e. previously a constraint like: ``` Constraint TestConstraintWithNamedOpOperand(testOp: Op) [{ return success(); }]; ``` would fail with: ``` In file included from /home/jumerckx/llvm-project/mlir/include/mlir/IR/PatternMatch.h:814, from /home/jumerckx/llvm-project/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h:13, from /home/jumerckx/llvm-project/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h:14, from /home/jumerckx/llvm-project/mlir/test/lib/Tools/PDLL/../../Dialect/Test/TestDialect.h:21, from /home/jumerckx/llvm-project/mlir/test/lib/Tools/PDLL/TestPDLL.cpp:9: /home/jumerckx/llvm-project/mlir/include/mlir/IR/PDLPatternMatch.h.inc: In instantiation of ‘typename FnTraitsT::result_t mlir::detail::pdl_function_builder::processArgsAndInvokeConstraint(PDLFnT&, mlir::PatternRewriter&, llvm::ArrayRef, std::index_sequence) [with PDLFnT = llvm::LogicalResult (* const)(mlir::PatternRewriter&, test::OpA); long unsigned int ...I = {0}; FnTraitsT = llvm::function_traits; typename FnTraitsT::result_t = llvm::LogicalResult; std::index_sequence = std::integer_sequence]’: /home/jumerckx/llvm-project/mlir/include/mlir/IR/PDLPatternMatch.h.inc:733:42: required from ‘std::enable_if_t<(! std::is_convertible)> >::value), std::function)> > mlir::detail::pdl_function_builder::buildConstraintFn(ConstraintFnT&&) [with ConstraintFnT = llvm::LogicalResult (&)(mlir::PatternRewriter&, test::OpA); std::enable_if_t<(! 
std::is_convertible)> >::value), std::function)> > = std::function)>]’ /home/jumerckx/llvm-project/mlir/include/mlir/IR/PDLPatternMatch.h.inc:868:79: required from ‘void mlir::PDLPatternModule::registerConstraintFunction(llvm::StringRef, ConstraintFnT&&) [with ConstraintFnT = llvm::LogicalResult (&)(mlir::PatternRewriter&, test::OpA)]’ /home/jumerckx/llvm-project/build/tools/mlir/test/lib/Tools/PDLL/TestPDLLPatterns.h.inc:64:31: required from ‘{anonymous}::GeneratedPDLLPattern0::GeneratedPDLLPattern0(mlir::MLIRContext*, ConfigsT&& ...) [with ConfigsT = {}]’ /home/jumerckx/llvm-project/mlir/include/mlir/IR/PatternMatch.h:1017:25: required from ‘std::enable_if_t::value> mlir::RewritePatternSet:addImpl(llvm::ArrayRef, Args&& ...) [with T = {anonymous}::GeneratedPDLLPattern0; Args = {mlir::MLIRContext*}; std::enable_if_t::value> = void]’ /home/jumerckx/llvm-project/mlir/include/mlir/IR/PatternMatch.h:864:17: required from ‘mlir::RewritePatternSet& mlir::RewritePatternSet::add(ConstructorArg&&, ConstructorArgs&& ...) [with Ts = {{anonymous}::GeneratedPDLLPattern0}; ConstructorArg = mlir::MLIRContext*; ConstructorArgs = {}; = void]’ /home/jumerckx/llvm-project/build/tools/mlir/test/lib/Tools/PDLL/TestPDLLPatterns.h.inc:74:38: required from ‘void populateGeneratedPDLLPatterns(mlir::RewritePatternSet&, ConfigsT&& ...) 
[with ConfigsT = {}]’ /home/jumerckx/llvm-project/mlir/test/lib/Tools/PDLL/TestPDLL.cpp:36:34: required from here /home/jumerckx/llvm-project/mlir/include/mlir/IR/PDLPatternMatch.h.inc:702:80: error: cannot convert ‘const mlir::PDLValue’ to ‘mlir::Operation*’ 702 | (ProcessPDLValue>::processAsArg( | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ 703 | values[I]))...); | ~~~~~~~~~~~ /home/jumerckx/llvm-project/mlir/include/mlir/IR/PDLPatternMatch.h.inc:522:36: note: initializing argument 1 of ‘static T mlir::detail::pdl_function_builder::ProcessPDLValue::value, void>::type>::processAsArg(mlir::Operation*) [with T = test::OpA]’ 522 | static T processAsArg(Operation *value) { return cast(value); } | ~~~~~~~~~~~^~~~~ ``` This pr fixes that. --- mlir/include/mlir/IR/PDLPatternMatch.h.inc | 2 ++ mlir/test/lib/Tools/PDLL/TestPDLL.cpp | 1 + mlir/test/lib/Tools/PDLL/TestPDLL.pdll | 12 ++++++++++++ mlir/test/mlir-pdll/Integration/test-pdll.mlir | 7 +++++++ 4 files changed, 22 insertions(+) diff --git a/mlir/include/mlir/IR/PDLPatternMatch.h.inc b/mlir/include/mlir/IR/PDLPatternMatch.h.inc index 4afbcf2924965..aa74202178a9b 100644 --- a/mlir/include/mlir/IR/PDLPatternMatch.h.inc +++ b/mlir/include/mlir/IR/PDLPatternMatch.h.inc @@ -14,6 +14,7 @@ #if MLIR_ENABLE_PDL_IN_PATTERNMATCH #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/TypeSwitch.h" namespace mlir { //===----------------------------------------------------------------------===// @@ -519,6 +520,7 @@ template struct ProcessPDLValue::value>> : public ProcessDerivedPDLValue { static T processAsArg(Operation *value) { return cast(value); } + using ProcessDerivedPDLValue::processAsArg; }; //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Tools/PDLL/TestPDLL.cpp b/mlir/test/lib/Tools/PDLL/TestPDLL.cpp index f6b2b2b1c683f..d2c17fc63f178 100644 --- a/mlir/test/lib/Tools/PDLL/TestPDLL.cpp +++ 
b/mlir/test/lib/Tools/PDLL/TestPDLL.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "TestDialect.h" +#include "TestOps.h" #include "mlir/Dialect/PDL/IR/PDL.h" #include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" #include "mlir/Interfaces/CastInterfaces.h" diff --git a/mlir/test/lib/Tools/PDLL/TestPDLL.pdll b/mlir/test/lib/Tools/PDLL/TestPDLL.pdll index 9715b556bbe21..718ab95c4bab0 100644 --- a/mlir/test/lib/Tools/PDLL/TestPDLL.pdll +++ b/mlir/test/lib/Tools/PDLL/TestPDLL.pdll @@ -9,8 +9,20 @@ #include "TestOps.td" #include "mlir/Interfaces/CastInterfaces.td" +Constraint TestConstraintWithNamedOpOperand(testOp: Op) [{ + return success(); +}]; + + /// A simple pattern that matches and replaces an operation. Pattern TestSimplePattern => replace op with op; // Test the import of interfaces. Pattern TestInterface => replace _: CastOpInterface with op; + +// Test application of constraint. +Pattern { + let op = op(input: Value); + TestConstraintWithNamedOpOperand(op); + replace op with op; +} diff --git a/mlir/test/mlir-pdll/Integration/test-pdll.mlir b/mlir/test/mlir-pdll/Integration/test-pdll.mlir index baaffc74bf1f1..f9adcf5347da8 100644 --- a/mlir/test/mlir-pdll/Integration/test-pdll.mlir +++ b/mlir/test/mlir-pdll/Integration/test-pdll.mlir @@ -15,3 +15,10 @@ func.func @testImportedInterface() -> i1 { %value = "builtin.unrealized_conversion_cast"() : () -> (i1) return %value : i1 } + +// CHECK-LABEL: func @testWithConstraint +func.func @testWithConstraint(%a: i32) { + // CHECK: test.success + %b = "test.op_a"(%a) { attr = 0 : i32} : (i32) -> (i32) + return +} From 787048a6c1e68afd090cf8261ff948646d5f61c2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 8 May 2026 07:36:43 -0700 Subject: [PATCH 050/538] [Verifier] Use worklist to verify metadata (#196461) This patch switches from using recursive descent to verify metadata to using a worklist. 
This change is motivated by the fact that we ran into some stackoverflows in this code while loading some reasonably large bitcode modules from an internal server application (i.e., real world code) within fibers that have somewhat limited stack space. We of course can get around this by just increasing the stack limit of the fibers, but this seemed easy enough and the proper way to do things. This implementation is mine, but I did a preliminary implementation using Gemini inside of Windsurf. --- llvm/lib/IR/Verifier.cpp | 92 +++++++++++++++++------------- llvm/unittests/IR/VerifierTest.cpp | 21 +++++++ 2 files changed, 74 insertions(+), 39 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 2ea113fe665d9..e75ba813e4559 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -129,6 +129,7 @@ #include #include #include +#include #include #include @@ -1113,59 +1114,72 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) { } } -void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { +void Verifier::visitMDNode(const MDNode &BaseMD, + AreDebugLocsAllowed AllowLocs) { // Only visit each node once. Metadata can be mutually recursive, so this // avoids infinite recursion here, as well as being an optimization. 
- if (!MDNodes.insert(&MD).second) + if (!MDNodes.insert(&BaseMD).second) return; - Check(&MD.getContext() == &Context, - "MDNode context does not match Module context!", &MD); + std::queue Worklist; + Worklist.push(&BaseMD); - switch (MD.getMetadataID()) { - default: - llvm_unreachable("Invalid MDNode subclass"); - case Metadata::MDTupleKind: - break; + while (!Worklist.empty()) { + const MDNode *CurrentMD = Worklist.front(); + Worklist.pop(); + Check(&CurrentMD->getContext() == &Context, + "MDNode context does not match Module context!", CurrentMD); + + switch (CurrentMD->getMetadataID()) { + default: + llvm_unreachable("Invalid MDNode subclass"); + case Metadata::MDTupleKind: + break; #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) \ case Metadata::CLASS##Kind: \ - visit##CLASS(cast(MD)); \ + visit##CLASS(cast(*CurrentMD)); \ break; #include "llvm/IR/Metadata.def" - } + } - for (const Metadata *Op : MD.operands()) { - if (!Op) - continue; - Check(!isa(Op), "Invalid operand for global metadata!", - &MD, Op); - CheckDI(!isa(Op) || AllowLocs == AreDebugLocsAllowed::Yes, - "DILocation not allowed within this metadata node", &MD, Op); - if (auto *N = dyn_cast(Op)) { - visitMDNode(*N, AllowLocs); - continue; + for (const Metadata *Op : CurrentMD->operands()) { + if (!Op) + continue; + Check(!isa(Op), "Invalid operand for global metadata!", + CurrentMD, Op); + CheckDI(!isa(Op) || AllowLocs == AreDebugLocsAllowed::Yes, + "DILocation not allowed within this metadata node", CurrentMD, + Op); + if (auto *N = dyn_cast(Op)) { + if (MDNodes.insert(N).second) + Worklist.push(N); + continue; + } + if (auto *V = dyn_cast(Op)) { + visitValueAsMetadata(*V, nullptr); + continue; + } } - if (auto *V = dyn_cast(Op)) { - visitValueAsMetadata(*V, nullptr); - continue; + + // Check llvm.loop.estimated_trip_count. 
+ if (CurrentMD->getNumOperands() > 0 && + CurrentMD->getOperand(0).equalsStr(LLVMLoopEstimatedTripCount)) { + Check(CurrentMD->getNumOperands() == 2, "Expected two operands", + CurrentMD); + auto *Count = + dyn_cast_or_null(CurrentMD->getOperand(1)); + Check(Count && Count->getType()->isIntegerTy() && + cast(Count->getType())->getBitWidth() <= 32, + "Expected second operand to be an integer constant of type i32 or " + "smaller", + CurrentMD); } - } - // Check llvm.loop.estimated_trip_count. - if (MD.getNumOperands() > 0 && - MD.getOperand(0).equalsStr(LLVMLoopEstimatedTripCount)) { - Check(MD.getNumOperands() == 2, "Expected two operands", &MD); - auto *Count = dyn_cast_or_null(MD.getOperand(1)); - Check(Count && Count->getType()->isIntegerTy() && - cast(Count->getType())->getBitWidth() <= 32, - "Expected second operand to be an integer constant of type i32 or " - "smaller", - &MD); + // Check these last, so we diagnose problems in operands first. + Check(!CurrentMD->isTemporary(), "Expected no forward declarations!", + CurrentMD); + Check(CurrentMD->isResolved(), "All nodes should be resolved!", CurrentMD); } - - // Check these last, so we diagnose problems in operands first. - Check(!MD.isTemporary(), "Expected no forward declarations!", &MD); - Check(MD.isResolved(), "All nodes should be resolved!", &MD); } void Verifier::visitValueAsMetadata(const ValueAsMetadata &MD, Function *F) { diff --git a/llvm/unittests/IR/VerifierTest.cpp b/llvm/unittests/IR/VerifierTest.cpp index 2d93c57308bcf..1e31fc5e06f65 100644 --- a/llvm/unittests/IR/VerifierTest.cpp +++ b/llvm/unittests/IR/VerifierTest.cpp @@ -516,5 +516,26 @@ TEST(VerifierTest, GetElementPtrInst) { << Error; } +TEST(VerifierTest, DeeplyNested) { + LLVMContext Ctx; + Module M("M", Ctx); + + // Construct an extremely deeply nested metadata node that should cause + // a stack overflow on most platforms if recursion through the entire + // chain is performed. 
+ Metadata *CurrentMetadataNode = + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 0)); + for (int i = 0; i < 100000; ++i) { + CurrentMetadataNode = MDTuple::get(Ctx, {CurrentMetadataNode}); + } + + NamedMDNode *NamedMetadataNode = M.getOrInsertNamedMetadata("foo"); + NamedMetadataNode->addOperand(cast(CurrentMetadataNode)); + + std::string Error; + raw_string_ostream ErrorOS(Error); + EXPECT_FALSE(verifyModule(M, &ErrorOS)); +} + } // end anonymous namespace } // end namespace llvm From b9d50a8e952a6e9c48f884c41ea0c5d4aae854aa Mon Sep 17 00:00:00 2001 From: quic-k Date: Fri, 8 May 2026 20:18:46 +0530 Subject: [PATCH 051/538] [Clang][Hexagon] Add H2 as recognized OS in target triple (#195621) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit H2 (Hexagon Hypervisor) is an open-source lightweight OS/hypervisor for the Qualcomm Hexagon processor family that supports running multiple guest operating systems concurrently via hardware virtualization. It is available at: https://github.com/qualcomm/hexagon-hypervisor Since there is currently no open-source toolchain for Hexagon, H2 provides a path toward building one — making Clang/LLVM support for H2 an important step in enabling an open-source Hexagon development ecosystem. 
Signed-off-by: Kushal Pal --- clang/lib/Basic/Targets.cpp | 2 ++ clang/lib/Basic/Targets/OSTargets.h | 13 +++++++++++++ clang/test/Preprocessor/hexagon-predefines.c | 6 ++++++ llvm/include/llvm/TargetParser/Triple.h | 6 +++++- llvm/lib/TargetParser/Triple.cpp | 3 +++ 5 files changed, 29 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 4c6d900d88980..ed88ae7173bad 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -128,6 +128,8 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique>(Triple, Opts); if (Triple.isOSQurt()) return std::make_unique>(Triple, Opts); + if (Triple.isOSH2()) + return std::make_unique>(Triple, Opts); return std::make_unique(Triple, Opts); case llvm::Triple::lanai: diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 235004d69c5cc..0fc387c90b280 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -1099,6 +1099,19 @@ class LLVM_LIBRARY_VISIBILITY QURTTargetInfo : public OSTargetInfo { using OSTargetInfo::OSTargetInfo; }; +// H2 Target +template +class LLVM_LIBRARY_VISIBILITY H2TargetInfo : public OSTargetInfo { +protected: + void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, + MacroBuilder &Builder) const override { + Builder.defineMacro("__h2__"); + } + +public: + using OSTargetInfo::OSTargetInfo; +}; + // SerenityOS target template class LLVM_LIBRARY_VISIBILITY SerenityTargetInfo : public OSTargetInfo { diff --git a/clang/test/Preprocessor/hexagon-predefines.c b/clang/test/Preprocessor/hexagon-predefines.c index 7652e4169a63c..cb3e9492ea07e 100644 --- a/clang/test/Preprocessor/hexagon-predefines.c +++ b/clang/test/Preprocessor/hexagon-predefines.c @@ -255,3 +255,9 @@ // CHECK-QURT: #define __hexagon__ 1 // CHECK-QURT: #define __qurt__ 1 // CHECK-QURT-NOT: #define __linux__ + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-h2 %s 
| FileCheck \ +// RUN: %s -check-prefix CHECK-H2 +// CHECK-H2: #define __h2__ 1 +// CHECK-H2: #define __hexagon__ 1 +// CHECK-H2-NOT: #define __linux__ diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 6e26f14ab8448..d5a42d9646c18 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -260,7 +260,8 @@ class Triple { ChipStar, Firmware, QURT, - LastOSType = QURT + H2, + LastOSType = H2 }; enum EnvironmentType { UnknownEnvironment, @@ -774,6 +775,9 @@ class Triple { /// Tests whether the OS is QURT. bool isOSQurt() const { return getOS() == Triple::QURT; } + /// Tests whether the OS is H2. + bool isOSH2() const { return getOS() == Triple::H2; } + /// Tests whether the OS uses the ELF binary format. bool isOSBinFormatELF() const { return getObjectFormat() == Triple::ELF; } diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 80f7ef063c908..c6515425b7eb5 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -495,6 +495,8 @@ StringRef Triple::getOSTypeName(OSType Kind) { return "firmware"; case QURT: return "qurt"; + case H2: + return "h2"; } llvm_unreachable("Invalid OSType"); @@ -965,6 +967,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("chipstar", Triple::ChipStar) .StartsWith("firmware", Triple::Firmware) .StartsWith("qurt", Triple::QURT) + .StartsWith("h2", Triple::H2) .Default(Triple::UnknownOS); } From f15adb33f65a523b0a0aa478187db32667d37946 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 May 2026 11:06:15 -0400 Subject: [PATCH 052/538] [SLP][X86] Forward gather VL to getShuffleCost for broadcast-of-load ShuffleCostEstimator::finalize() emits the user-visible final shuffle on top of the gather node, but it called the local createShuffle helper without forwarding the gathered scalars. 
Consequently X86TTIImpl::getShuffleCost(), which folds vbroadcast{ss,sd} of a load into TCC_Free under AVX/AVX2, could not see the underlying load and overestimated the cost of the splat permutation by one unit, occasionally dropping a profitable vectorization. Also, modified the TTI cost estimation to handle correctly the cost for the scalar code before actual admission of the vector broadcast. Reviewers: hiraditya, RKSimon, bababuck Pull Request: https://github.com/llvm/llvm-project/pull/195859 --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 35 +++++++++++++- .../Transforms/Vectorize/SLPVectorizer.cpp | 47 +++++++++++++++---- .../SLPVectorizer/X86/broadcast-load-cost.ll | 2 +- .../X86/revec-scalar-insertelement.ll | 8 ++-- 4 files changed, 77 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 1bc0ff397e476..b2b77265b06a5 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1576,8 +1576,41 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, LT.first = 1; // If we're broadcasting a load then AVX/AVX2 can do this for free. + // If many-used-load whose every use is one of a small set of operations + // that SLP can rewrite into a single vector lane, codegen can fold it into + // the free broadcast. using namespace PatternMatch; - if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) && + auto IsBroadcastLoadFoldUser = [&](const User *U) { + if (isa(U) && U->getOperand(1) == Args[0]) + return true; + if (U->getType()->isVectorTy()) + return false; + // Terminators (return/branch/switch/indirectbr/resume/invoke EH) + // and phis carry the value across control flow. + if (const auto *I = dyn_cast(U)) + if (I->isTerminator() || + isa(I)) + return false; + // Only pure calls can be folded. 
+ if (const auto *CB = dyn_cast(U)) + return CB->doesNotAccessMemory() && !CB->mayHaveSideEffects(); + return true; + }; + auto IsFoldableSLPBroadcastLoad = [&]() { + if (!match(Args[0], m_Load(m_Value()))) + return false; + auto *FVT = dyn_cast(DstTy); + if (!FVT) + return false; + // getNumUses() counts each Use, matching the per-lane broadcast + // accounting (a use like `op %x, %x` consumes two broadcast lanes). + if (Args[0]->getNumUses() != FVT->getNumElements()) + return false; + return all_of(Args[0]->users(), IsBroadcastLoadFoldUser); + }; + if (!Args.empty() && + (match(Args[0], m_OneUse(m_Load(m_Value()))) || + IsFoldableSLPBroadcastLoad()) && (ST->hasAVX2() || (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32))) return TTI::TCC_Free; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index eec6499c7b724..f1a6eb2d7e8af 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13823,9 +13823,10 @@ class BaseShuffleAnalysis { /// Smart shuffle instruction emission, walks through shuffles trees and /// tries to find the best matching vector for the actual shuffle /// instruction. - template + template static T createShuffle(Value *V1, Value *V2, ArrayRef Mask, - ShuffleBuilderTy &Builder, Type *ScalarTy) { + ShuffleBuilderTy &Builder, Type *ScalarTy, + Args... 
Arguments) { assert(V1 && "Expected at least one vector value."); unsigned ScalarTyNumElements = getNumElements(ScalarTy); SmallVector NewMask(Mask); @@ -13940,7 +13941,7 @@ class BaseShuffleAnalysis { assert(V1 && "Expected non-null value after looking through shuffles."); if (!IsIdentity) - return Builder.createShuffleVector(V1, NewMask); + return Builder.createShuffleVector(V1, NewMask, Arguments...); return Builder.createIdentity(V1); } @@ -15204,6 +15205,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool IsFinalized = false; SmallVector CommonMask; SmallVector, 2> InVectors; + /// Captures the original scalar VL of a single, "clean" gather() call so + /// the values can be forwarded as the Args operand to getShuffleCost() for + /// the final permutation in finalize(). This lets the target cost model + /// recognize patterns such as broadcast-of-load (e.g. on X86, + /// vbroadcast{ss,sd} folds the broadcast and the load into one instruction + /// under AVX/AVX2 and is reported as TCC_Free by getShuffleCost). The + /// state machine is: + /// * engaged + empty: tracking active, no qualifying gather seen yet. + /// * engaged + non-empty: exactly one qualifying gather observed and its + /// VL still corresponds to InVectors.front(). + /// * disengaged: the cached VL is no longer trustworthy (multiple + /// gather() calls, or a state-mutating add() happened). + std::optional> BVValues = SmallVector(); const TargetTransformInfo &TTI; InstructionCost Cost = 0; SmallDenseSet VectorizedVals; @@ -15496,14 +15510,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, cast(V1->getType()), Mask); } - InstructionCost createShuffleVector(Value *V1, ArrayRef Mask) const { + InstructionCost createShuffleVector(Value *V1, ArrayRef Mask, + ArrayRef VL) const { // Empty mask or identity mask are free. 
unsigned VF = cast(V1->getType())->getElementCount().getKnownMinValue(); if (isEmptyOrIdentity(Mask, VF)) return TTI::TCC_Free; - return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, - cast(V1->getType()), Mask); + return ::getShuffleCost( + TTI, TTI::SK_PermuteSingleSrc, cast(V1->getType()), Mask, + TTI::TCK_RecipThroughput, /*Index=*/0, /*SubTp=*/nullptr, VL); } InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } InstructionCost createPoison(Type *Ty, unsigned VF) const { @@ -15518,7 +15534,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost createShuffle(const PointerUnion &P1, const PointerUnion &P2, - ArrayRef Mask) { + ArrayRef Mask, ArrayRef VL = {}) { ShuffleCostBuilder Builder(TTI); SmallVector CommonMask(Mask); Value *V1 = P1.dyn_cast(), *V2 = P2.dyn_cast(); @@ -15726,7 +15742,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (InVectors.size() == 2) InVectors.pop_back(); return ExtraCost + BaseShuffleAnalysis::createShuffle( - V1, V2, CommonMask, Builder, ScalarTy); + V1, V2, CommonMask, Builder, ScalarTy, VL); } public: @@ -15876,6 +15892,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SameNodesEstimated = true; } void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { + BVValues.reset(); if (&E1 == &E2) { assert(all_of(Mask, [&](int Idx) { @@ -15900,6 +15917,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); } void add(const TreeEntry &E1, ArrayRef Mask) { + BVValues.reset(); if (InVectors.empty()) { CommonMask.assign(Mask.begin(), Mask.end()); InVectors.assign(1, &E1); @@ -15935,6 +15953,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } /// Adds another one input vector and the mask for the shuffling. 
void add(Value *V1, ArrayRef Mask, bool ForExtracts = false) { + if (BVValues && !isa(V1)) + BVValues.reset(); if (InVectors.empty()) { assert(CommonMask.empty() && !ForExtracts && "Expected empty input mask/vectors."); @@ -15984,6 +16004,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Value *gather(ArrayRef VL, unsigned MaskVF = 0, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); + if (BVValues) { + if (BVValues->empty() && InVectors.empty()) + BVValues->assign(VL.begin(), VL.end()); + else + BVValues.reset(); + } if (!Root) { // FIXME: Need to find a way to avoid use of getNullValue here. SmallVector Vals; @@ -16113,10 +16139,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { assert(InVectors.size() == 1 && "Expected only one vector with no mask"); return Cost; } + ArrayRef VL; + if (BVValues) + VL = *BVValues; return Cost + createShuffle(InVectors.front(), InVectors.size() == 2 ? InVectors.back() : nullptr, - CommonMask); + CommonMask, VL); } ~ShuffleCostEstimator() { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast-load-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast-load-cost.ll index a0e6ffc4a250c..6ee5fc8adef27 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast-load-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast-load-cost.ll @@ -17,7 +17,7 @@ ; YAML-NEXT: Function: bcast_v2f64_load ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-1' +; YAML-NEXT: - Cost: '-2' define void @bcast_v2f64_load(ptr %A, double %y, double %z, ptr %S) { ; CHECK-LABEL: define void @bcast_v2f64_load( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-scalar-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-scalar-insertelement.ll index c714b24c3d85e..6a53cfa5ec1c8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-scalar-insertelement.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/revec-scalar-insertelement.ll @@ -6,10 +6,10 @@ define <8 x i16> @test(ptr %0) { ; CHECK-SAME: ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[VECTOR_PH:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> zeroinitializer, i16 [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP1]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP1]], i64 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP1]], i64 5 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> zeroinitializer, <8 x i16> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP5]] ; vector.ph: From 2de925076d0ea999d0c0d354271b13dc74e96140 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 11:08:24 -0400 Subject: [PATCH 053/538] [libc][math] Fix a -Wshadow warning in add_sub.h (#196337) y_bits on line 111 was shadowing y_bits on line 48. No behavior change. 
--- libc/src/__support/FPUtil/generic/add_sub.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/__support/FPUtil/generic/add_sub.h b/libc/src/__support/FPUtil/generic/add_sub.h index 6db5bcbb75020..a7f22865a6a00 100644 --- a/libc/src/__support/FPUtil/generic/add_sub.h +++ b/libc/src/__support/FPUtil/generic/add_sub.h @@ -108,10 +108,10 @@ add_or_sub(InType x, InType y) { if constexpr (cpp::is_same_v && cpp::is_same_v) { - OutFPBits y_bits(y); + OutFPBits out_y_bits(y); if constexpr (IsSub) - y_bits.set_sign(y_bits.sign().negate()); - return y_bits.get_val(); + out_y_bits.set_sign(out_y_bits.sign().negate()); + return out_y_bits.get_val(); } else { #ifdef LIBC_HAS_CONSTANT_EVALUATION From e59fe34ea2e931dfeb39e9facd0442ca96dfdc6d Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Fri, 8 May 2026 16:19:27 +0100 Subject: [PATCH 054/538] [lldb][windows] do not open a new windows when running a shell command (#196089) --- lldb/include/lldb/Host/ProcessLaunchInfo.h | 4 ++ lldb/source/Host/common/ProcessLaunchInfo.cpp | 12 +++++ .../Host/windows/ProcessLauncherWindows.cpp | 51 ++++++++++++++----- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/lldb/include/lldb/Host/ProcessLaunchInfo.h b/lldb/include/lldb/Host/ProcessLaunchInfo.h index 933616d1bf8ce..39f85205999de 100644 --- a/lldb/include/lldb/Host/ProcessLaunchInfo.h +++ b/lldb/include/lldb/Host/ProcessLaunchInfo.h @@ -85,6 +85,10 @@ class ProcessLaunchInfo : public ProcessInfo { const FileAction *GetFileActionForFD(int fd) const; + /// Returns true if fd has an explicit file action, or is the destination of a + /// duplicate action. 
+ bool IsFDRedirected(int fd) const; + Flags &GetFlags() { return m_flags; } const Flags &GetFlags() const { return m_flags; } diff --git a/lldb/source/Host/common/ProcessLaunchInfo.cpp b/lldb/source/Host/common/ProcessLaunchInfo.cpp index 571f6702649da..b5b82c7475822 100644 --- a/lldb/source/Host/common/ProcessLaunchInfo.cpp +++ b/lldb/source/Host/common/ProcessLaunchInfo.cpp @@ -138,6 +138,18 @@ const FileAction *ProcessLaunchInfo::GetFileActionForFD(int fd) const { return nullptr; } +bool ProcessLaunchInfo::IsFDRedirected(int fd) const { + if (GetFileActionForFD(fd)) + return true; + for (size_t i = 0; i < GetNumFileActions(); ++i) { + const FileAction *act = GetFileActionAtIndex(i); + if (act->GetAction() == FileAction::eFileActionDuplicate && + act->GetActionArgument() == fd) + return true; + } + return false; +} + const FileSpec &ProcessLaunchInfo::GetWorkingDirectory() const { return m_working_dir; } diff --git a/lldb/source/Host/windows/ProcessLauncherWindows.cpp b/lldb/source/Host/windows/ProcessLauncherWindows.cpp index 79e0faca64961..fadde17512530 100644 --- a/lldb/source/Host/windows/ProcessLauncherWindows.cpp +++ b/lldb/source/Host/windows/ProcessLauncherWindows.cpp @@ -198,15 +198,19 @@ ProcessLauncherWindows::LaunchProcess(const ProcessLaunchInfo &launch_info, startupinfoex.StartupInfo.wShowWindow = SW_HIDE; } - DWORD flags = CREATE_NEW_CONSOLE | CREATE_UNICODE_ENVIRONMENT | - EXTENDED_STARTUPINFO_PRESENT; + DWORD flags = CREATE_UNICODE_ENVIRONMENT | EXTENDED_STARTUPINFO_PRESENT; + const bool stdio_redirected = launch_info.IsFDRedirected(STDIN_FILENO) && + launch_info.IsFDRedirected(STDOUT_FILENO) && + launch_info.IsFDRedirected(STDERR_FILENO); + if (stdio_redirected) + flags |= CREATE_NO_WINDOW; + else if (!launch_info.GetFlags().Test(eLaunchFlagDisableSTDIO) && + pty_mode == PseudoConsole::Mode::None) + flags |= CREATE_NEW_CONSOLE; + if (launch_info.GetFlags().Test(eLaunchFlagDebug)) flags |= DEBUG_ONLY_THIS_PROCESS; - if 
(launch_info.GetFlags().Test(eLaunchFlagDisableSTDIO) || - pty_mode != PseudoConsole::Mode::None) - flags &= ~CREATE_NEW_CONSOLE; - std::vector environment = CreateEnvironmentBufferW(launch_info.GetEnvironment()); @@ -264,19 +268,38 @@ llvm::ErrorOr> ProcessLauncherWindows::GetInheritedHandles( HANDLE stdout_handle, HANDLE stderr_handle, HANDLE stdin_handle) { std::vector inherited_handles; - startupinfoex.StartupInfo.hStdError = - stderr_handle ? stderr_handle : GetStdHandle(STD_ERROR_HANDLE); startupinfoex.StartupInfo.hStdInput = stdin_handle ? stdin_handle : GetStdHandle(STD_INPUT_HANDLE); startupinfoex.StartupInfo.hStdOutput = stdout_handle ? stdout_handle : GetStdHandle(STD_OUTPUT_HANDLE); - if (startupinfoex.StartupInfo.hStdError) - inherited_handles.push_back(startupinfoex.StartupInfo.hStdError); - if (startupinfoex.StartupInfo.hStdInput) - inherited_handles.push_back(startupinfoex.StartupInfo.hStdInput); - if (startupinfoex.StartupInfo.hStdOutput) - inherited_handles.push_back(startupinfoex.StartupInfo.hStdOutput); + // eFileActionDuplicate stores the source fd in m_fd and the destination in + // m_arg. GetFileActionForFD searches by m_fd (source), so a + // AppendDuplicateFileAction(STDOUT, STDERR) won't be found when looking up + // STDERR. Scan for duplicate actions that target stderr explicitly. + HANDLE effective_stderr = stderr_handle; + if (!effective_stderr && launch_info) { + for (size_t i = 0; i < launch_info->GetNumFileActions(); ++i) { + const FileAction *act = launch_info->GetFileActionAtIndex(i); + if (act->GetAction() == FileAction::eFileActionDuplicate && + act->GetActionArgument() == STDERR_FILENO) { + effective_stderr = startupinfoex.StartupInfo.hStdOutput; + break; + } + } + } + startupinfoex.StartupInfo.hStdError = + effective_stderr ? effective_stderr : GetStdHandle(STD_ERROR_HANDLE); + + // PROC_THREAD_ATTRIBUTE_HANDLE_LIST requires unique entries. 
+ auto push_if_new = [&](HANDLE h) { + if (h && std::find(inherited_handles.begin(), inherited_handles.end(), h) == + inherited_handles.end()) + inherited_handles.push_back(h); + }; + push_if_new(startupinfoex.StartupInfo.hStdError); + push_if_new(startupinfoex.StartupInfo.hStdInput); + push_if_new(startupinfoex.StartupInfo.hStdOutput); if (launch_info) { for (size_t i = 0; i < launch_info->GetNumFileActions(); ++i) { From d88ddad05c0d91fa0a66ca81b7a19af186ae9992 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Fri, 8 May 2026 17:24:17 +0200 Subject: [PATCH 055/538] [SPIR-V] Fix dangling MachineFunction pointers in finalizeLowering (#196050) --- llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 3 +-- llvm/lib/Target/SPIRV/SPIRVISelLowering.h | 4 ---- .../Target/SPIRV/SPIRVGlobalRegistryTests.cpp | 22 +++++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp index 493192b068658..eb16c9a314a23 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp @@ -429,7 +429,7 @@ void validateAccessChain(const SPIRVSubtarget &STI, MachineRegisterInfo *MRI, void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const { // finalizeLowering() is called twice (see GlobalISel/InstructionSelect.cpp) // We'd like to avoid the needless second processing pass. 
- if (ProcessedMF.find(&MF) != ProcessedMF.end()) + if (MF.getRegInfo().reservedRegsFrozen()) return; MachineRegisterInfo *MRI = &MF.getRegInfo(); @@ -604,7 +604,6 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const { } } } - ProcessedMF.insert(&MF); TargetLowering::finalizeLowering(MF); } diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index 3561745c1b9e0..6af0bbcc9818d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -16,7 +16,6 @@ #include "SPIRVGlobalRegistry.h" #include "llvm/CodeGen/TargetLowering.h" -#include namespace llvm { class SPIRVSubtarget; @@ -24,9 +23,6 @@ class SPIRVSubtarget; class SPIRVTargetLowering : public TargetLowering { const SPIRVSubtarget &STI; - // Record of already processed machine functions - mutable std::set ProcessedMF; - public: explicit SPIRVTargetLowering(const TargetMachine &TM, const SPIRVSubtarget &ST); diff --git a/llvm/unittests/Target/SPIRV/SPIRVGlobalRegistryTests.cpp b/llvm/unittests/Target/SPIRV/SPIRVGlobalRegistryTests.cpp index c7d4072cbf32e..618edab01f91b 100644 --- a/llvm/unittests/Target/SPIRV/SPIRVGlobalRegistryTests.cpp +++ b/llvm/unittests/Target/SPIRV/SPIRVGlobalRegistryTests.cpp @@ -101,3 +101,25 @@ TEST_F(SPIRVGlobalRegistryTest, PrepareFunctionsClearsStalePointers) { EXPECT_EQ(GR->findDeducedElementType(F), nullptr); } + +TEST_F(SPIRVGlobalRegistryTest, FinalizeLoweringFreezesAcrossAliasedMFs) { + auto *STM = static_cast(TM.get()); + Function *F = Mod->getFunction("f"); + ASSERT_NE(F, nullptr); + const auto *Sub = STM->getSubtargetImpl(*F); + const SPIRVTargetLowering *TLI = Sub->getTargetLowering(); + + MF.reset(); + std::optional Slot; + + Slot.emplace(*F, *TM, *Sub, MMI->getContext(), 0); + Slot->push_back(Slot->CreateMachineBasicBlock()); + TLI->finalizeLowering(*Slot); + ASSERT_TRUE(Slot->getRegInfo().reservedRegsFrozen()); + + Slot.reset(); + Slot.emplace(*F, *TM, *Sub, 
MMI->getContext(), 0); + Slot->push_back(Slot->CreateMachineBasicBlock()); + TLI->finalizeLowering(*Slot); + EXPECT_TRUE(Slot->getRegInfo().reservedRegsFrozen()); +} From 23cd5566507da14ea3d687cb7290a4dbbecfdaf9 Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Fri, 8 May 2026 08:28:14 -0700 Subject: [PATCH 056/538] [llvm][Instrumentor] Fix expected error message (#196540) The test checks that passing the current directory for the stub runtime file path is detected an error. However, the exact error message may depend on the OS. --- llvm/test/Instrumentation/Instrumentor/generate_bad_rt.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/Instrumentation/Instrumentor/generate_bad_rt.ll b/llvm/test/Instrumentation/Instrumentor/generate_bad_rt.ll index 11b080b8b2e97..36c6c22a3a032 100644 --- a/llvm/test/Instrumentation/Instrumentor/generate_bad_rt.ll +++ b/llvm/test/Instrumentation/Instrumentor/generate_bad_rt.ll @@ -1,3 +1,3 @@ ; RUN: not opt < %s -passes=instrumentor -instrumentor-read-config-file=%S/bad_rt_config.json 2>&1 | FileCheck %s --ignore-case -; CHECK: error: failed to open instrumentor stub runtime file for writing: Is a directory +; CHECK: error: failed to open instrumentor stub runtime file for writing: {{.*}} From 0f23bffeae55ae29ee681e5aeedf4300f5700eb4 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Fri, 8 May 2026 11:34:21 -0400 Subject: [PATCH 057/538] Symbol names on arm64 can have an extra char on front of name (#194937) My previous change (https://github.com/llvm/llvm-project/pull/192132) didn't work with aarch64. The symbol name has the \x01 on the front and the greps fail. Update the expression for the grep to allow for that char. Sorry about that. The original grep with `@pipe` also fails on aarch64 & macos. The negative test those greps are doing will always pass on aarch64 & macos. That should be looking for the updated symbol name too. 
--- clang/test/CodeGen/2008-07-31-asm-labels.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/test/CodeGen/2008-07-31-asm-labels.c b/clang/test/CodeGen/2008-07-31-asm-labels.c index 71be9378233d4..f0ca5da8875da 100644 --- a/clang/test/CodeGen/2008-07-31-asm-labels.c +++ b/clang/test/CodeGen/2008-07-31-asm-labels.c @@ -1,26 +1,25 @@ -// RUN: %clang_cc1 -emit-llvm -o %t %s -// RUN: not grep "@pipe()" %t -// RUN: grep '_thisIsNotAPipe' %t | count 3 -// RUN: not grep '@g0' %t -// RUN: grep '_renamed' %t | count 2 -// RUN: %clang_cc1 -DUSE_DEF -emit-llvm -o %t %s -// RUN: not grep "@pipe()" %t -// RUN: grep '_thisIsNotAPipe' %t | count 3 +// RUN: %clang_cc1 -emit-llvm -o - %s |FileCheck %s -check-prefixes CHECK,CHECKREF +// RUN: %clang_cc1 -DUSE_DEF -emit-llvm -o - %s |FileCheck %s -check-prefixes CHECK,CHECKDEF // +//CHECK: _renamed{{.*}} = external {{.*}}global void pipe() asm("_thisIsNotAPipe"); void f0(void) { pipe(); +//CHECK: call {{.*}}_thisIsNotAPipe } void pipe(int); +//CHECKREF: declare {{.*}}_thisIsNotAPipe void f1(void) { pipe(1); +//CHECK: call {{.*}}_thisIsNotAPipe } #ifdef USE_DEF +//CHECKDEF: define {{.*}}_thisIsNotAPipe void pipe(int arg) { int x = 10; } @@ -30,4 +29,5 @@ void pipe(int arg) { extern int g0 asm("_renamed"); int f2(void) { return g0; +//CHECK: load {{.*}}_renamed } From 14a290ab3d013fa30e8efd767beb4c4f002c6175 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Fri, 8 May 2026 10:39:05 -0500 Subject: [PATCH 058/538] [mlir][vector] Implement IndexedAccessOpInterface for load, store, etc. (#196216) This commit adds simple (not trying to account for unit dimensions that could be cast away) implementations of IndexedAccessOpInterface to low-level vector operations like vector.load and vector.store, eliminating the need for the old-style code in FoldMemRefAliasOps.cpp. 
After this commit, it'll be possible to migrate all the other memref-rewriting passes (ExpandAddressComputation and FlattenMemRefs) to use the interface, taking a bunch of dialect dependencies off of memref/transforms. Assisted-By: GPT 5.5 (pulled in old code, wrote some new tests) --- .../MemRef/IR/MemoryAccessOpInterfaces.td | 6 +- .../mlir/Dialect/Vector/IR/VectorOps.td | 6 + .../Transforms/IndexedAccessOpInterfaceImpl.h | 21 ++ .../MemRef/Transforms/FoldMemRefAliasOps.cpp | 331 +----------------- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 4 + .../Dialect/Vector/Transforms/CMakeLists.txt | 1 + .../IndexedAccessOpInterfaceImpl.cpp | 101 ++++++ mlir/lib/RegisterAllDialects.cpp | 2 + .../Dialect/MemRef/fold-memref-alias-ops.mlir | 125 +++++++ 9 files changed, 278 insertions(+), 319 deletions(-) create mode 100644 mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h create mode 100644 mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td b/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td index 7fc69b4fabca6..0f1ef521afc57 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td @@ -53,7 +53,11 @@ def IndexedAccessOpInterface : OpInterface<"IndexedAccessOpInterface"> { InterfaceMethod< /*desc=*/[{ Return the shape of the portion of the memref that is being accessed by - this operation, if known, ignoring leading unit dimensions. + this operation, if known. This shape describes the access dimensions + whose strides are semantically important for this operation. + Implementations shall omit dimensions whose strides do not affect the + operation semantics. (In particular, if an operation will access one + element of the base memref, this method should return `{}`.) 
Reindexing transformations may not modify the *strides* of the trailing N dimensions, where N is the size returned value, and should ensure that diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 8f4fa5ca6a844..28a8109cb59c0 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1658,6 +1658,7 @@ def Vector_TransferWriteOp : let hasVerifier = 1; } +// Promises IndexedAccessOpInterface. def Vector_LoadOp : Vector_Op<"load", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, @@ -1776,6 +1777,7 @@ def Vector_LoadOp : Vector_Op<"load", [ "$base `[` $indices `]` attr-dict `:` type($base) `,` type($result)"; } +// Promises IndexedAccessOpInterface. def Vector_StoreOp : Vector_Op<"store", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, @@ -1883,6 +1885,7 @@ def Vector_StoreOp : Vector_Op<"store", [ "`:` type($base) `,` type($valueToStore)"; } +// Promises IndexedAccessOpInterface. def Vector_MaskedLoadOp : Vector_Op<"maskedload", [ DeclareOpInterfaceMethods, @@ -1978,6 +1981,7 @@ def Vector_MaskedLoadOp : ]; } +// Promises IndexedAccessOpInterface. def Vector_MaskedStoreOp : Vector_Op<"maskedstore", [ DeclareOpInterfaceMethods, @@ -2251,6 +2255,7 @@ def Vector_ScatterOp }]>]; } +// Promises IndexedAccessOpInterface. def Vector_ExpandLoadOp : Vector_Op<"expandload", [ DeclareOpInterfaceMethods, @@ -2342,6 +2347,7 @@ def Vector_ExpandLoadOp : ]; } +// Promises IndexedAccessOpInterface. 
def Vector_CompressStoreOp : Vector_Op<"compressstore", [ DeclareOpInterfaceMethods, diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h b/mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h new file mode 100644 index 0000000000000..57fe661ad81f0 --- /dev/null +++ b/mlir/include/mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h @@ -0,0 +1,21 @@ +//===- IndexedAccessOpInterfaceImpl.h ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_VECTOR_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H +#define MLIR_DIALECT_VECTOR_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H + +namespace mlir { + +class DialectRegistry; + +namespace vector { +void registerIndexedAccessOpInterfaceExternalModels(DialectRegistry ®istry); +} // namespace vector +} // namespace mlir + +#endif // MLIR_DIALECT_VECTOR_TRANSFORMS_INDEXEDACCESSOPINTERFACEIMPL_H diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp index e36ddfa063e11..de7662753d142 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -25,7 +25,6 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include @@ -52,6 +51,8 @@ hasTrivialReassociationSuffix(ArrayRef reassocs, int64_t n) { if (n <= 0) return true; + if (n > static_cast(reassocs.size())) + return false; return llvm::all_of( reassocs.take_back(n), [&](const ReassociationIndices &indices) { return indices.size() == 1; 
}); @@ -60,89 +61,17 @@ hasTrivialReassociationSuffix(ArrayRef reassocs, static bool hasTrailingUnitStrides(memref::SubViewOp subview, int64_t n) { if (n <= 0) return true; - return llvm::all_of(subview.getStaticStrides().take_back(n), - [](int64_t s) { return s == 1; }); + ArrayRef strides = subview.getStaticStrides(); + if (n > static_cast(strides.size())) + return false; + return llvm::all_of(strides.take_back(n), [](int64_t s) { return s == 1; }); } -/// Helpers to access the memref operand for each op. -template -static Value getMemRefOperand(LoadOrStoreOpTy op) { - return op.getMemref(); -} - -static Value getMemRefOperand(vector::LoadOp op) { return op.getBase(); } - -static Value getMemRefOperand(vector::StoreOp op) { return op.getBase(); } - -static Value getMemRefOperand(vector::MaskedLoadOp op) { return op.getBase(); } - -static Value getMemRefOperand(vector::MaskedStoreOp op) { return op.getBase(); } - //===----------------------------------------------------------------------===// // Patterns //===----------------------------------------------------------------------===// namespace { -/// Merges subview operation with load/transferRead operation. -template -class LoadOpOfSubViewOpFolder final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy loadOp, - PatternRewriter &rewriter) const override; -}; - -/// Merges expand_shape operation with load/transferRead operation. -template -class LoadOpOfExpandShapeOpFolder final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy loadOp, - PatternRewriter &rewriter) const override; -}; - -/// Merges collapse_shape operation with load/transferRead operation. 
-template -class LoadOpOfCollapseShapeOpFolder final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy loadOp, - PatternRewriter &rewriter) const override; -}; - -/// Merges subview operation with store/transferWriteOp operation. -template -class StoreOpOfSubViewOpFolder final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy storeOp, - PatternRewriter &rewriter) const override; -}; - -/// Merges expand_shape operation with store/transferWriteOp operation. -template -class StoreOpOfExpandShapeOpFolder final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy storeOp, - PatternRewriter &rewriter) const override; -}; - -/// Merges collapse_shape operation with store/transferWriteOp operation. -template -class StoreOpOfCollapseShapeOpFolder final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy storeOp, - PatternRewriter &rewriter) const override; -}; - /// Folds subview(subview(x)) to a single subview(x). 
class SubViewOfSubViewFolder : public OpRewritePattern { public: @@ -286,226 +215,6 @@ struct TransferOpOfCollapseShapeOpFolder final }; } // namespace -static LogicalResult preconditionsFoldSubViewOp(RewriterBase &rewriter, - Operation *op, - memref::SubViewOp subviewOp) { - return success(); -} - -template -LogicalResult LoadOpOfSubViewOpFolder::matchAndRewrite( - OpTy loadOp, PatternRewriter &rewriter) const { - auto subViewOp = - getMemRefOperand(loadOp).template getDefiningOp(); - - if (!subViewOp) - return rewriter.notifyMatchFailure(loadOp, "not a subview producer"); - - LogicalResult preconditionResult = - preconditionsFoldSubViewOp(rewriter, loadOp, subViewOp); - if (failed(preconditionResult)) - return preconditionResult; - - SmallVector sourceIndices; - affine::resolveIndicesIntoOpWithOffsetsAndStrides( - rewriter, loadOp.getLoc(), subViewOp.getMixedOffsets(), - subViewOp.getMixedStrides(), subViewOp.getDroppedDims(), - loadOp.getIndices(), sourceIndices); - - llvm::TypeSwitch(loadOp) - .Case([&](memref::LoadOp op) { - rewriter.replaceOpWithNewOp( - loadOp, subViewOp.getSource(), sourceIndices, op.getNontemporal()); - }) - .Case([&](vector::LoadOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), subViewOp.getSource(), sourceIndices); - }) - .Case([&](vector::MaskedLoadOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), subViewOp.getSource(), sourceIndices, - op.getMask(), op.getPassThru()); - }) - .DefaultUnreachable("unexpected operation"); - return success(); -} - -template -LogicalResult LoadOpOfExpandShapeOpFolder::matchAndRewrite( - OpTy loadOp, PatternRewriter &rewriter) const { - auto expandShapeOp = - getMemRefOperand(loadOp).template getDefiningOp(); - - if (!expandShapeOp) - return failure(); - - SmallVector sourceIndices; - // memref.load guarantees that indexes start inbounds while the vector - // operations don't. 
This impacts if our linearization is `disjoint` - resolveSourceIndicesExpandShape(loadOp.getLoc(), rewriter, expandShapeOp, - loadOp.getIndices(), sourceIndices, - isa(loadOp.getOperation())); - - return llvm::TypeSwitch(loadOp) - .Case([&](memref::LoadOp op) { - rewriter.replaceOpWithNewOp( - loadOp, expandShapeOp.getViewSource(), sourceIndices, - op.getNontemporal()); - return success(); - }) - .Case([&](vector::LoadOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), expandShapeOp.getViewSource(), sourceIndices, - op.getNontemporal()); - return success(); - }) - .Case([&](vector::MaskedLoadOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), expandShapeOp.getViewSource(), sourceIndices, - op.getMask(), op.getPassThru()); - return success(); - }) - .DefaultUnreachable("unexpected operation"); -} - -template -LogicalResult LoadOpOfCollapseShapeOpFolder::matchAndRewrite( - OpTy loadOp, PatternRewriter &rewriter) const { - auto collapseShapeOp = getMemRefOperand(loadOp) - .template getDefiningOp(); - - if (!collapseShapeOp) - return failure(); - - SmallVector sourceIndices; - resolveSourceIndicesCollapseShape(loadOp.getLoc(), rewriter, collapseShapeOp, - loadOp.getIndices(), sourceIndices); - llvm::TypeSwitch(loadOp) - .Case([&](memref::LoadOp op) { - rewriter.replaceOpWithNewOp( - loadOp, collapseShapeOp.getViewSource(), sourceIndices, - op.getNontemporal()); - }) - .Case([&](vector::LoadOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), collapseShapeOp.getViewSource(), sourceIndices, - op.getNontemporal()); - }) - .Case([&](vector::MaskedLoadOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), collapseShapeOp.getViewSource(), sourceIndices, - op.getMask(), op.getPassThru()); - }) - .DefaultUnreachable("unexpected operation"); - return success(); -} - -template -LogicalResult StoreOpOfSubViewOpFolder::matchAndRewrite( - OpTy storeOp, PatternRewriter &rewriter) const { - auto subViewOp = - getMemRefOperand(storeOp).template 
getDefiningOp(); - - if (!subViewOp) - return rewriter.notifyMatchFailure(storeOp, "not a subview producer"); - - LogicalResult preconditionResult = - preconditionsFoldSubViewOp(rewriter, storeOp, subViewOp); - if (failed(preconditionResult)) - return preconditionResult; - - SmallVector sourceIndices; - affine::resolveIndicesIntoOpWithOffsetsAndStrides( - rewriter, storeOp.getLoc(), subViewOp.getMixedOffsets(), - subViewOp.getMixedStrides(), subViewOp.getDroppedDims(), - storeOp.getIndices(), sourceIndices); - - llvm::TypeSwitch(storeOp) - .Case([&](memref::StoreOp op) { - rewriter.replaceOpWithNewOp( - op, op.getValue(), subViewOp.getSource(), sourceIndices, - op.getNontemporal()); - }) - .Case([&](vector::StoreOp op) { - rewriter.replaceOpWithNewOp( - op, op.getValueToStore(), subViewOp.getSource(), sourceIndices); - }) - .Case([&](vector::MaskedStoreOp op) { - rewriter.replaceOpWithNewOp( - op, subViewOp.getSource(), sourceIndices, op.getMask(), - op.getValueToStore()); - }) - .DefaultUnreachable("unexpected operation"); - return success(); -} - -template -LogicalResult StoreOpOfExpandShapeOpFolder::matchAndRewrite( - OpTy storeOp, PatternRewriter &rewriter) const { - auto expandShapeOp = - getMemRefOperand(storeOp).template getDefiningOp(); - - if (!expandShapeOp) - return failure(); - - SmallVector sourceIndices; - // memref.store guarantees that indexes start inbounds while the vector - // operations don't. 
This impacts if our linearization is `disjoint` - resolveSourceIndicesExpandShape(storeOp.getLoc(), rewriter, expandShapeOp, - storeOp.getIndices(), sourceIndices, - isa(storeOp.getOperation())); - llvm::TypeSwitch(storeOp) - .Case([&](memref::StoreOp op) { - rewriter.replaceOpWithNewOp( - storeOp, op.getValueToStore(), expandShapeOp.getViewSource(), - sourceIndices, op.getNontemporal()); - }) - .Case([&](vector::StoreOp op) { - rewriter.replaceOpWithNewOp( - op, op.getValueToStore(), expandShapeOp.getViewSource(), - sourceIndices, op.getNontemporal()); - }) - .Case([&](vector::MaskedStoreOp op) { - rewriter.replaceOpWithNewOp( - op, expandShapeOp.getViewSource(), sourceIndices, op.getMask(), - op.getValueToStore()); - }) - .DefaultUnreachable("unexpected operation"); - return success(); -} - -template -LogicalResult StoreOpOfCollapseShapeOpFolder::matchAndRewrite( - OpTy storeOp, PatternRewriter &rewriter) const { - auto collapseShapeOp = getMemRefOperand(storeOp) - .template getDefiningOp(); - - if (!collapseShapeOp) - return failure(); - - SmallVector sourceIndices; - resolveSourceIndicesCollapseShape(storeOp.getLoc(), rewriter, collapseShapeOp, - storeOp.getIndices(), sourceIndices); - llvm::TypeSwitch(storeOp) - .Case([&](memref::StoreOp op) { - rewriter.replaceOpWithNewOp( - storeOp, op.getValueToStore(), collapseShapeOp.getViewSource(), - sourceIndices, op.getNontemporal()); - }) - .Case([&](vector::StoreOp op) { - rewriter.replaceOpWithNewOp( - op, op.getValueToStore(), collapseShapeOp.getViewSource(), - sourceIndices, op.getNontemporal()); - }) - .Case([&](vector::MaskedStoreOp op) { - rewriter.replaceOpWithNewOp( - op, collapseShapeOp.getViewSource(), sourceIndices, op.getMask(), - op.getValueToStore()); - }) - .DefaultUnreachable("unexpected operation"); - return success(); -} - LogicalResult AccessOpOfSubViewOpFolder::matchAndRewrite(memref::IndexedAccessOpInterface op, PatternRewriter &rewriter) const { @@ -849,27 +558,13 @@ LogicalResult 
TransferOpOfCollapseShapeOpFolder::matchAndRewrite( } void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) { - patterns.add< - // Interface-based patterns to which we will be migrating. - AccessOpOfSubViewOpFolder, AccessOpOfExpandShapeOpFolder, - AccessOpOfCollapseShapeOpFolder, IndexedMemCopyOpOfSubViewOpFolder, - IndexedMemCopyOpOfExpandShapeOpFolder, - IndexedMemCopyOpOfCollapseShapeOpFolder, TransferOpOfSubViewOpFolder, - TransferOpOfExpandShapeOpFolder, TransferOpOfCollapseShapeOpFolder, - // The old way of doing things. Don't add more of these. - LoadOpOfSubViewOpFolder, - LoadOpOfSubViewOpFolder, - StoreOpOfSubViewOpFolder, - StoreOpOfSubViewOpFolder, - LoadOpOfExpandShapeOpFolder, - LoadOpOfExpandShapeOpFolder, - StoreOpOfExpandShapeOpFolder, - StoreOpOfExpandShapeOpFolder, - LoadOpOfCollapseShapeOpFolder, - LoadOpOfCollapseShapeOpFolder, - StoreOpOfCollapseShapeOpFolder, - StoreOpOfCollapseShapeOpFolder, - SubViewOfSubViewFolder>(patterns.getContext()); + patterns + .add(patterns.getContext()); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 780b0cbb36120..51be1e4431e70 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/UB/IR/UBMatchers.h" #include "mlir/Dialect/Utils/IndexingUtils.h" @@ -483,6 +484,9 @@ void VectorDialect::initialize() { addInterfaces(); + declarePromisedInterfaces(); declarePromisedInterfaces(); diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index 
4e0f07af95984..112a1db6fe93b 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRVectorTransforms BufferizableOpInterfaceImpl.cpp + IndexedAccessOpInterfaceImpl.cpp LowerVectorBitCast.cpp LowerVectorBroadcast.cpp LowerVectorContract.cpp diff --git a/mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp b/mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp new file mode 100644 index 0000000000000..c91ea97a2f965 --- /dev/null +++ b/mlir/lib/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.cpp @@ -0,0 +1,101 @@ +//===- IndexedAccessOpInterfaceImpl.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Implement IndexedAccessOpInterface on vector dialect operations with +// %memref[%i, %j, ...] operands so generic memref-dialect passes can rewrite +// their base/index pairs. Transfer ops keep their VectorTransferOpInterface +// patterns; gather/scatter have tensor-or-memref bases and index-vector +// operands that do not fit IndexedAccessOpInterface's rank-matched index +// contract. +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h" + +#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" + +using namespace mlir; +using namespace mlir::memref; + +namespace { +/// Return true if this op has the memref semantics expected by this model. 
+template +bool hasMemrefSemantics(Operation *op) { + return llvm::isa(cast(op).getBase().getType()); +} + +/// Return the vector shape whose access strides must be preserved, marking +/// scalable dimensions as dynamic. +SmallVector getAccessedVectorShape(VectorType vecTy) { + return llvm::map_to_vector( + llvm::zip_equal(vecTy.getShape(), vecTy.getScalableDims()), [](auto dim) { + auto [size, scalable] = dim; + return scalable ? ShapedType::kDynamic : size; + }); +} + +template +struct VectorLoadStoreLikeOpImpl final + : IndexedAccessOpInterface::ExternalModel< + VectorLoadStoreLikeOpImpl, LoadStoreOp> { + TypedValue getAccessedMemref(Operation *op) const { + return cast(op).getBase(); + } + + Operation::operand_range getIndices(Operation *op) const { + return cast(op).getIndices(); + } + + SmallVector getAccessedShape(Operation *op) const { + assert(hasMemrefSemantics(op) && + "expected vector op with memref semantics"); + return getAccessedVectorShape(cast(op).getVectorType()); + } + + std::optional> + updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref, + ValueRange newIndices) const { + assert(hasMemrefSemantics(op) && + "expected vector op with memref semantics"); + assert(llvm::isa(newMemref.getType()) && + "expected replacement memref"); + rewriter.modifyOpInPlace(op, [&]() { + auto concreteOp = cast(op); + concreteOp.getBaseMutable().assign(newMemref); + concreteOp.getIndicesMutable().assign(newIndices); + }); + return std::nullopt; + } + + // TODO: The various load and store operations, at the very least vector.load + // and vector.store, should be taught a starts-in-bounds attribute that would + // let us optimize index generation. 
+ bool hasInboundsIndices(Operation *op) const { + assert(hasMemrefSemantics(op) && + "expected vector op with memref semantics"); + return false; + } +}; + +template +static void attachAll(MLIRContext *ctx) { + (Ops::template attachInterface>(*ctx), ...); +} + +} // namespace + +void mlir::vector::registerIndexedAccessOpInterfaceExternalModels( + DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, vector::VectorDialect *dialect) { + attachAll(ctx); + }); +} diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp index 589730b785133..01a7401db4710 100644 --- a/mlir/lib/RegisterAllDialects.cpp +++ b/mlir/lib/RegisterAllDialects.cpp @@ -95,6 +95,7 @@ #include "mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Vector/Transforms/IndexedAccessOpInterfaceImpl.h" #include "mlir/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.h" #include "mlir/Dialect/WasmSSA/IR/WasmSSA.h" #include "mlir/Dialect/X86/X86Dialect.h" @@ -197,6 +198,7 @@ void mlir::registerAllDialects(DialectRegistry ®istry) { tensor::registerValueBoundsOpInterfaceExternalModels(registry); tosa::registerShardingInterfaceExternalModels(registry); vector::registerBufferizableOpInterfaceExternalModels(registry); + vector::registerIndexedAccessOpInterfaceExternalModels(registry); vector::registerSubsetOpInterfaceExternalModels(registry); vector::registerValueBoundsOpInterfaceExternalModels(registry); NVVM::registerNVVMTargetInterfaceExternalModels(registry); diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir index 50c7ebaff1e6a..6e2702d936ee0 100644 --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -642,6 +642,26 @@ func.func @fold_vector_load_subview(%src : memref<24x64xf32>, // ----- +// 
TODO: This should fold, but implementing IndexedAccessOpInterface on vector.load +// in a way that would allow the fold added complexity (emitting +// `vector.shape_cast`s) that people wanted to keep out of the initial +// implementation during previous discussions. (Note: this didn't work in the +// pre-interface version of the pass either.) +func.func @no_fold_scalar_equivalent_vector_load_subview( + %arg0 : memref<16xf32>, %off : index, %idx : index) -> vector<1xf32> { + %0 = memref.subview %arg0[%off][4][2] : memref<16xf32> to memref<4xf32, strided<[2], offset: ?>> + %1 = vector.load %0[%idx] : memref<4xf32, strided<[2], offset: ?>>, vector<1xf32> + return %1 : vector<1xf32> +} + +// CHECK-LABEL: func @no_fold_scalar_equivalent_vector_load_subview +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<16xf32> +// CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ARG0]] +// CHECK-NOT: vector.shape_cast +// CHECK: vector.load %[[SUBVIEW]] + +// ----- + func.func @fold_vector_maskedload_subview( %arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3: vector<32xi1>, %arg4: vector<32xf32>) -> vector<32xf32> { %0 = memref.subview %arg0[%arg1, %arg2][1, 1][1, 1] : memref<12x32xf32> to memref> @@ -659,6 +679,21 @@ func.func @fold_vector_maskedload_subview( // ----- +func.func @no_fold_vector_maskedload_subview_high_rank_vector( + %arg0 : memref<8xf32>, %idx : index, + %mask : vector<2x2x2xi1>, %pass : vector<2x2x2xf32>) -> vector<2x2x2xf32> { + %0 = memref.subview %arg0[%idx][1][1] : memref<8xf32> to memref<1xf32, strided<[1], offset: ?>> + %1 = vector.maskedload %0[%idx], %mask, %pass : memref<1xf32, strided<[1], offset: ?>>, vector<2x2x2xi1>, vector<2x2x2xf32> into vector<2x2x2xf32> + return %1 : vector<2x2x2xf32> +} + +// CHECK-LABEL: func @no_fold_vector_maskedload_subview_high_rank_vector +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<8xf32> +// CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ARG0]] +// CHECK: vector.maskedload %[[SUBVIEW]] + +// ----- + func.func 
@fold_vector_store_subview(%src : memref<24x64xf32>, %off1 : index, %off2 : index, @@ -723,6 +758,24 @@ func.func @fold_vector_load_expand_shape( // ----- +// Folding this would require changing the vector op rank. That is handled by +// vector drop-leading-unit-dim patterns, not by fold-memref-alias-ops. +func.func @no_fold_vector_load_expand_shape_leading_unit( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<1x8xf32> { + %c0 = arith.constant 0 : index + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.load %0[%arg1, %c0] : memref<4x8xf32>, vector<1x8xf32> + return %1 : vector<1x8xf32> +} + +// CHECK-LABEL: func @no_fold_vector_load_expand_shape_leading_unit +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: memref.expand_shape %[[ARG0]] +// CHECK-NOT: vector.shape_cast +// CHECK: vector.load + +// ----- + func.func @fold_vector_maskedload_expand_shape( %arg0 : memref<32xf32>, %arg1 : index, %arg3: vector<8xi1>, %arg4: vector<8xf32>) -> vector<8xf32> { %c0 = arith.constant 0 : index @@ -742,6 +795,22 @@ func.func @fold_vector_maskedload_expand_shape( // ----- +func.func @no_fold_vector_maskedload_expand_shape_high_rank_vector( + %arg0 : memref<32xf32>, %arg1 : index, + %mask : vector<2x2x2xi1>, %pass : vector<2x2x2xf32>) -> vector<2x2x2xf32> { + %c0 = arith.constant 0 : index + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.maskedload %0[%arg1, %c0], %mask, %pass : memref<4x8xf32>, vector<2x2x2xi1>, vector<2x2x2xf32> into vector<2x2x2xf32> + return %1 : vector<2x2x2xf32> +} + +// CHECK-LABEL: func @no_fold_vector_maskedload_expand_shape_high_rank_vector +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: %[[EXPAND:.*]] = memref.expand_shape %[[ARG0]] +// CHECK: vector.maskedload %[[EXPAND]] + +// ----- + func.func @fold_vector_store_expand_shape( %arg0 : memref<32xf32>, %arg1 : index, %val : 
vector<8xf32>) { %c0 = arith.constant 0 : index @@ -759,6 +828,24 @@ func.func @fold_vector_store_expand_shape( // ----- +// Folding this would require changing the vector op rank. That is handled by +// vector drop-leading-unit-dim patterns, not by fold-memref-alias-ops. +func.func @no_fold_vector_store_expand_shape_leading_unit( + %arg0 : memref<32xf32>, %arg1 : index, %val : vector<1x8xf32>) { + %c0 = arith.constant 0 : index + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + vector.store %val, %0[%arg1, %c0] : memref<4x8xf32>, vector<1x8xf32> + return +} + +// CHECK-LABEL: func @no_fold_vector_store_expand_shape_leading_unit +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: memref.expand_shape %[[ARG0]] +// CHECK-NOT: vector.shape_cast +// CHECK: vector.store + +// ----- + func.func @fold_vector_maskedstore_expand_shape( %arg0 : memref<32xf32>, %arg1 : index, %arg3: vector<8xi1>, %arg4: vector<8xf32>) { %c0 = arith.constant 0 : index @@ -778,6 +865,44 @@ func.func @fold_vector_maskedstore_expand_shape( // ----- +func.func @fold_vector_expandload_expand_shape( + %arg0 : memref<32xf32>, %arg1 : index, %arg3: vector<8xi1>, %arg4: vector<8xf32>) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.expandload %0[%arg1, %c0], %arg3, %arg4 : memref<4x8xf32>, vector<8xi1>, vector<8xf32> into vector<8xf32> + return %1 : vector<8xf32> +} + +// CHECK-LABEL: func @fold_vector_expandload_expand_shape +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: vector<8xi1> +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: vector<8xf32> +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (4, 8) +// CHECK: vector.expandload %[[ARG0]][%[[IDX]]], %[[ARG3]], %[[ARG4]] 
+ +// ----- + +func.func @fold_vector_compressstore_expand_shape( + %arg0 : memref<32xf32>, %arg1 : index, %arg3: vector<8xi1>, %arg4: vector<8xf32>) { + %c0 = arith.constant 0 : index + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + vector.compressstore %0[%arg1, %c0], %arg3, %arg4 : memref<4x8xf32>, vector<8xi1>, vector<8xf32> + return +} + +// CHECK-LABEL: func @fold_vector_compressstore_expand_shape +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: vector<8xi1> +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: vector<8xf32> +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (4, 8) +// CHECK: vector.compressstore %[[ARG0]][%[[IDX]]], %[[ARG3]], %[[ARG4]] + +// ----- + func.func @fold_vector_transfer_read_expand_shape( %arg0 : memref<32xf32>, %arg1 : index) -> vector<8xf32> { %c0 = arith.constant 0 : index From b42a2131f2110bf09fc544536af2918c4a6f2429 Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Fri, 8 May 2026 16:40:53 +0100 Subject: [PATCH 059/538] [lldb][windows] drain the ConPTY on process exit (#196371) --- .../Process/Windows/Common/ProcessWindows.cpp | 69 ++++++++++--------- .../Process/Windows/Common/ProcessWindows.h | 4 ++ 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 28b5554069c90..eb1c4acc0f2bb 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -655,6 +655,7 @@ void ProcessWindows::OnExitProcess(uint32_t exit_code) { LLDB_LOG(log, "Process {0} exited with code {1}", GetID(), exit_code); if (m_pty) { + DrainProcessStdout(); m_pty->SetStopping(true); m_pty->Close(); m_stdio_communication.InterruptRead(); @@ 
-675,6 +676,32 @@ void ProcessWindows::OnExitProcess(uint32_t exit_code) { ProcessDebugger::OnExitProcess(exit_code); } +void ProcessWindows::DrainProcessStdout() { + if (!m_stdio_communication.ReadThreadIsRunning()) + return; + m_stdio_communication.SynchronizeWithReadThread(); + if (!m_pty || m_pty->GetMode() != PseudoConsole::Mode::ConPTY) + return; + + HANDLE pipe = m_pty->GetSTDOUTHandle(); + for (int consec_empty = 0; consec_empty < 3;) { + if (!m_stdio_communication.ReadThreadIsRunning()) + break; + DWORD avail = 0; + // PeekNamedPipe is thread safe. + if (!::PeekNamedPipe(pipe, nullptr, 0, nullptr, &avail, nullptr)) + break; + if (avail > 0) { + consec_empty = 0; + m_stdio_communication.SynchronizeWithReadThread(); + } else { + ++consec_empty; + if (consec_empty < 3) + ::SleepEx(1, FALSE); + } + } +} + void ProcessWindows::OnDebuggerConnected(lldb::addr_t image_base) { DebuggerThreadSP debugger = m_session_data->m_debugger; Log *log = GetLog(WindowsLog::Process); @@ -741,41 +768,10 @@ ProcessWindows::OnDebugException(bool first_chance, return ExceptionResult::SendToApplication; } - // Drain any in-flight process output before announcing the stop. The I/O - // reader thread and this debug-event thread run concurrently. Without - // synchronization the eBroadcastBitStateChanged(Stopped) event can reach - // the Debugger event thread before the preceding eBroadcastBitSTDOUT - // events. - auto drain_stdout = [this] { - if (!m_stdio_communication.ReadThreadIsRunning()) - return; - m_stdio_communication.SynchronizeWithReadThread(); - if (!m_pty || m_pty->GetMode() != PseudoConsole::Mode::ConPTY) - return; - - HANDLE pipe = m_pty->GetSTDOUTHandle(); - for (int consec_empty = 0; consec_empty < 3;) { - if (!m_stdio_communication.ReadThreadIsRunning()) - break; - DWORD avail = 0; - // PeekNamedPipe is thread safe. 
- if (!::PeekNamedPipe(pipe, nullptr, 0, nullptr, &avail, nullptr)) - break; - if (avail > 0) { - consec_empty = 0; - m_stdio_communication.SynchronizeWithReadThread(); - } else { - ++consec_empty; - if (consec_empty < 3) - ::SleepEx(1, FALSE); - } - } - }; - if (!first_chance) { // Not any second chance exception is an application crash by definition. // It may be an expression evaluation crash. - drain_stdout(); + DrainProcessStdout(); SetPrivateState(eStateStopped); } @@ -796,12 +792,17 @@ ProcessWindows::OnDebugException(bool first_chance, LLDB_LOG(log, "Hit non-loader breakpoint at address {0:x}.", record.GetExceptionAddress()); } - drain_stdout(); + // Drain any in-flight process output before announcing the stop. The I/O + // reader thread and this debug-event thread run concurrently. Without + // synchronization the eBroadcastBitStateChanged(Stopped) event can reach + // the Debugger event thread before the preceding eBroadcastBitSTDOUT + // events. + DrainProcessStdout(); SetPrivateState(eStateStopped); break; case EXCEPTION_SINGLE_STEP: result = ExceptionResult::BreakInDebugger; - drain_stdout(); + DrainProcessStdout(); SetPrivateState(eStateStopped); break; default: diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index 228619d0e3d5e..31cf498a16d50 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -108,6 +108,10 @@ class ProcessWindows : public Process, public ProcessDebugger { void SetPseudoConsoleHandle() override; protected: + /// Block until the stdio read thread has surfaced everything currently + /// buffered in the ConPTY/pipe to the process's STDOUT cache. 
+ void DrainProcessStdout(); + ProcessWindows(lldb::TargetSP target_sp, lldb::ListenerSP listener_sp); Status DoGetMemoryRegionInfo(lldb::addr_t vm_addr, From f68d44dce63f1a94e60410e8817e0eed70fc0578 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Fri, 8 May 2026 08:43:58 -0700 Subject: [PATCH 060/538] [clang][modules] Deserialize submodules lazily (#194968) This PR implements on-demand deserialization of `Module` objects from PCM files. This is motivated by dependency scanning, where eager deserialization of submodules turns out to be very expensive and typically unnecessary. The core of this patch is the introduction of `ModuleRef`, which may either be a pointer to `Module`, a pointer to new `ExternalSubmoduleSource` (implemented by `ASTReader`) and `serialization::SubmoduleID`, both, or none (null). Dereferencing `ModuleRef` ensures the `Module` is deserialized if possible. This replaces `ASTReader::UnresolvedModuleRefs` and changes the structure of a PCM file a bit, most importantly introducing new `SUBMODULE_CHILD` record that enables hooking up the laziness into the qualified by-name lookup that uses `Module::SubModuleIndex`. This speeds up dependency scanning by ~5.5%. 
--- clang/include/clang/Basic/Module.h | 97 +++++- clang/include/clang/Lex/ModuleMap.h | 2 +- clang/include/clang/Lex/Preprocessor.h | 2 +- .../include/clang/Serialization/ASTBitCodes.h | 14 +- clang/include/clang/Serialization/ASTReader.h | 46 +-- .../include/clang/Serialization/ModuleFile.h | 16 + clang/lib/Basic/Module.cpp | 5 +- clang/lib/Lex/ModuleMap.cpp | 9 +- clang/lib/Lex/Preprocessor.cpp | 2 +- clang/lib/Sema/SemaLookup.cpp | 3 +- clang/lib/Sema/SemaModule.cpp | 4 +- clang/lib/Serialization/ASTReader.cpp | 284 +++++++++--------- clang/lib/Serialization/ASTWriter.cpp | 73 +++-- 13 files changed, 333 insertions(+), 224 deletions(-) diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index f83319db082d7..3fd6bfa063af4 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -48,9 +48,17 @@ namespace clang { class FileManager; class LangOptions; +class Module; class ModuleMap; class TargetInfo; +/// Interface for on-demand deserialization of submodules stored in a PCM file. +class ExternalSubmoduleSource { +public: + virtual Module *getSubmodule(uint32_t GlobalID) = 0; + virtual ~ExternalSubmoduleSource() = default; +}; + /// Describes the name of a module. using ModuleId = SmallVector, 2>; @@ -222,6 +230,62 @@ struct ModuleAttributes { NoUndeclaredIncludes(false) {} }; +/// Reference to a module that consists of either an existing/materialized +/// Module object, reference to a serialized submodule record, both, or +/// neither (null). +class ModuleRef { + /// The existing/materialized Module object. + mutable Module *Existing = nullptr; + + /// The external submodule source (i.e. \c ASTReader), and a boolean + /// signifying whether it's already been used to deserialize \c SubmoduleID. + mutable llvm::PointerIntPair + ExternalSource = {nullptr, false}; + + /// Identifier of the external submodule in \c ExternalSource. 
+ mutable uint64_t SubmoduleID = 0; + +public: + /// Create an empty reference. + ModuleRef() = default; + + /// Create reference to a materialized module. + ModuleRef(Module *M) : Existing(M) {} + + /// Create reference to a serialized submodule record. + ModuleRef(ExternalSubmoduleSource *ExtSrc, uint64_t SubmoduleID) + : ExternalSource(ExtSrc, false), SubmoduleID(SubmoduleID) {} + + /// Get the existing/materialized module, if there's any. + Module *getExisting() const { return Existing; } + /// Add the existing/materialized module. + void setExisting(Module *E) { Existing = E; } + + /// Add the serialized submodule record reference. + void setExternal(ExternalSubmoduleSource *ExtSrc, uint64_t ID) { + ExternalSource = {ExtSrc, false}; + SubmoduleID = ID; + } + + /// Check whether this is a non-empty reference. + operator bool() const { + return Existing || (ExternalSource.getPointer() && SubmoduleID); + } + + /// Get the existing/materialized module. Try materializing it on-demand from + /// the serialized submodule record if possible. + operator Module *() const { + if (!ExternalSource.getInt() && ExternalSource.getPointer() && + SubmoduleID) { + Existing = ExternalSource.getPointer()->getSubmodule(SubmoduleID); + ExternalSource.setInt(true); + } + return Existing; + } + + Module *operator->() const { return *this; } +}; + /// Required to construct a Module. /// /// This tag type is only constructible by ModuleMap, guaranteeing it ownership @@ -348,7 +412,7 @@ class alignas(8) Module { private: /// The submodules of this module, indexed by name. - std::vector SubModules; + std::vector SubModules; /// A mapping from the submodule name to the index into the /// \c SubModules vector at which that submodule resides. @@ -552,17 +616,17 @@ class alignas(8) Module { /// The set of modules imported by this module, and on which this /// module depends. 
- llvm::SmallSetVector Imports; + llvm::SmallVector Imports; /// The set of top-level modules that affected the compilation of this module, /// but were not imported. - llvm::SmallSetVector AffectingClangModules; + llvm::SmallVector AffectingClangModules; /// Describes an exported module. /// /// The pointer is the module being re-exported, while the bit will be true /// to indicate that this is a wildcard export. - using ExportDecl = std::pair; + using ExportDecl = std::pair; /// The set of export declarations. SmallVector Exports; @@ -640,7 +704,7 @@ class alignas(8) Module { /// A conflict between two modules. struct Conflict { /// The module that this module conflicts with. - Module *Other; + ModuleRef Other; /// The message provided to the user when there is a conflict. std::string Message; @@ -742,6 +806,23 @@ class alignas(8) Module { Parent->SubModules.push_back(this); } + /// Add a child submodule. + void addSubmodule(StringRef Name, Module *Submodule) { + auto [It, New] = SubModuleIndex.insert({Name, SubModules.size()}); + if (New) + SubModules.emplace_back(); + SubModules[It->second].setExisting(Submodule); + } + + /// Add the external part of a submodule ModuleRef. + void addSubmodule(StringRef Name, ExternalSubmoduleSource *ExternalSource, + uint64_t SubmoduleID) { + auto [It, New] = SubModuleIndex.insert({Name, SubModules.size()}); + if (New) + SubModules.emplace_back(); + SubModules[It->second].setExternal(ExternalSource, SubmoduleID); + } + /// Is this module have similar semantics as headers. bool isHeaderLikeModule() const { return isModuleMapModule() || isHeaderUnit(); @@ -913,7 +994,7 @@ class alignas(8) Module { /// Find the submodule with the given name. /// /// \returns The submodule if found, or NULL otherwise. - Module *findSubmodule(StringRef Name) const; + ModuleRef findSubmodule(StringRef Name) const; /// Get the Global Module Fragment (sub-module) for this module, it there is /// one. 
@@ -941,8 +1022,8 @@ class alignas(8) Module { unsigned getVisibilityID() const { return VisibilityID; } - using submodule_iterator = std::vector::iterator; - using submodule_const_iterator = std::vector::const_iterator; + using submodule_iterator = std::vector::iterator; + using submodule_const_iterator = std::vector::const_iterator; llvm::iterator_range submodules() { return llvm::make_range(SubModules.begin(), SubModules.end()); diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h index ed326a7fd545b..12f8dbb0b6090 100644 --- a/clang/include/clang/Lex/ModuleMap.h +++ b/clang/include/clang/Lex/ModuleMap.h @@ -548,7 +548,7 @@ class ModuleMap { /// null, we will look for a top-level module. /// /// \returns The named submodule, if known; otherwose, returns null. - Module *lookupModuleQualified(StringRef Name, Module *Context) const; + ModuleRef lookupModuleQualified(StringRef Name, Module *Context) const; /// Find a new module or submodule, or create it if it does not already /// exist. 
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8830294ea1658..8cba21539e48a 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -1535,7 +1535,7 @@ class Preprocessor { assert(M->isModuleMapModule()); if (!BuildingSubmoduleStack.empty()) { if (M != BuildingSubmoduleStack.back().M) - BuildingSubmoduleStack.back().M->AffectingClangModules.insert(M); + BuildingSubmoduleStack.back().M->AffectingClangModules.push_back(M); } else { AffectingClangModules.insert(M); } diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 9a41f9e89df98..3c8f3ba59a07e 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -44,7 +44,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. -const unsigned VERSION_MAJOR = 37; +const unsigned VERSION_MAJOR = 38; /// AST file minor version number supported by this version of /// Clang. @@ -751,6 +751,10 @@ enum ASTRecordTypes { /// Record code for extname-redefined undeclared identifiers. EXTNAME_UNDECLARED_IDENTIFIERS = 79, + + /// Record that encodes the number of submodules, their base ID in the AST + /// file, and for each module the relative bit offset into the stream. + SUBMODULE_METADATA = 80, }; /// Record types used within a source manager block. @@ -819,8 +823,8 @@ enum PreprocessorDetailRecordTypes { /// Record types used within a submodule description block. enum SubmoduleRecordTypes { - /// Metadata for submodules as a whole. - SUBMODULE_METADATA = 0, + /// Defines the end of a single submodule. Sentinel record without any data. + SUBMODULE_END = 0, /// Defines the major attributes of a submodule, including its /// name and parent. 
@@ -884,6 +888,10 @@ enum SubmoduleRecordTypes { /// Specifies affecting modules that were not imported. SUBMODULE_AFFECTING_MODULES = 18, + + /// Specifies a direct submodule by name and ID, enabling on-demand + /// deserialization of children without loading the entire submodule block. + SUBMODULE_CHILD = 19, }; /// Record types used within a comments block. diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 8394647885bd3..bedac9f8a540a 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -418,14 +418,13 @@ struct LookupBlockOffsets : VisibleLookupBlockOffsets { /// The AST reader provides lazy de-serialization of declarations, as /// required when traversing the AST. Only those AST nodes that are /// actually required will be de-serialized. -class ASTReader - : public ExternalPreprocessorSource, - public ExternalPreprocessingRecordSource, - public ExternalHeaderFileInfoSource, - public ExternalSemaSource, - public IdentifierInfoLookup, - public ExternalSLocEntrySource -{ +class ASTReader : public ExternalPreprocessorSource, + public ExternalPreprocessingRecordSource, + public ExternalHeaderFileInfoSource, + public ExternalSemaSource, + public IdentifierInfoLookup, + public ExternalSLocEntrySource, + public ExternalSubmoduleSource { public: /// Types of AST files. friend class ASTDeclMerger; @@ -820,32 +819,6 @@ class ASTReader /// declarations in that submodule that could be made visible. HiddenNamesMapType HiddenNamesMap; - /// A module import, export, or conflict that hasn't yet been resolved. - struct UnresolvedModuleRef { - /// The file in which this module resides. - ModuleFile *File; - - /// The module that is importing or exporting. - Module *Mod; - - /// The kind of module reference. - enum { Import, Export, Conflict, Affecting } Kind; - - /// The local ID of the module that is being exported. 
- unsigned ID; - - /// Whether this is a wildcard export. - LLVM_PREFERRED_TYPE(bool) - unsigned IsWildcard : 1; - - /// String data. - StringRef String; - }; - - /// The set of module imports and exports that still need to be - /// resolved. - SmallVector UnresolvedModuleRefs; - /// A vector containing selectors that have already been loaded. /// /// This vector is indexed by the Selector ID (-1). NULL selector @@ -1612,8 +1585,6 @@ class ASTReader ASTReadResult ReadModuleMapFileBlock(RecordData &Record, ModuleFile &F, const ModuleFile *ImportedBy, unsigned ClientLoadCapabilities); - llvm::Error ReadSubmoduleBlock(ModuleFile &F, - unsigned ClientLoadCapabilities); static bool ParseLanguageOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, @@ -2444,8 +2415,7 @@ class ASTReader unsigned LocalID) const; /// Retrieve the submodule that corresponds to a global submodule ID. - /// - Module *getSubmodule(serialization::SubmoduleID GlobalID); + Module *getSubmodule(uint32_t GlobalID) override; /// Retrieve the module that corresponds to the given module ID. /// diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h index 58f2fcba01e67..6c47040fde093 100644 --- a/clang/include/clang/Serialization/ModuleFile.h +++ b/clang/include/clang/Serialization/ModuleFile.h @@ -447,9 +447,25 @@ class ModuleFile { /// Base submodule ID for submodules local to this module. serialization::SubmoduleID BaseSubmoduleID = 0; + /// Base submodule ID for submodules local to this module within its own + /// address space. + unsigned LocalBaseSubmoduleID = 0; + + /// Local submodule ID of the top-level module. + unsigned LocalTopLevelSubmoduleID = 0; + /// Remapping table for submodule IDs in this module. ContinuousRangeMap SubmoduleRemap; + /// The cursor to the start of the submodules block. 
+ llvm::BitstreamCursor SubmodulesCursor; + + /// Absolute offset of the start of the submodules block. + uint64_t SubmodulesOffsetBase = 0; + + /// Relative offsets for all submodule entries in the AST file. + const llvm::support::unaligned_uint64_t *SubmoduleOffsets = nullptr; + // === Selectors === /// The number of selectors new to this file. diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index 66629baa6240b..d27abb1153c72 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -53,8 +53,7 @@ Module::Module(ModuleConstructorTag, StringRef Name, NoUndeclaredIncludes = Parent->NoUndeclaredIncludes; ModuleMapIsPrivate = Parent->ModuleMapIsPrivate; - Parent->SubModuleIndex[Name] = Parent->SubModules.size(); - Parent->SubModules.push_back(this); + Parent->addSubmodule(Name, this); } } @@ -348,7 +347,7 @@ void Module::markUnavailable(bool Unimportable) { } } -Module *Module::findSubmodule(StringRef Name) const { +ModuleRef Module::findSubmodule(StringRef Name) const { if (auto It = SubModuleIndex.find(Name); It != SubModuleIndex.end()) return SubModules[It->second]; diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index aaeea01bf775e..436b8e5620765 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -970,7 +970,8 @@ Module *ModuleMap::lookupModuleUnqualified(StringRef Name, return findModule(Name); } -Module *ModuleMap::lookupModuleQualified(StringRef Name, Module *Context) const{ +ModuleRef ModuleMap::lookupModuleQualified(StringRef Name, + Module *Context) const { if (!Context) return findModule(Name); @@ -982,8 +983,8 @@ std::pair ModuleMap::findOrCreateModule(StringRef Name, bool IsFramework, bool IsExplicit) { // Try to find an existing module with this name. 
- if (Module *Sub = lookupModuleQualified(Name, Parent)) - return std::make_pair(Sub, false); + if (ModuleRef Sub = lookupModuleQualified(Name, Parent); Sub.getExisting()) + return std::make_pair(Sub.getExisting(), false); // Create a new module with this name. Module *M = createModule(Name, Parent, IsFramework, IsExplicit); @@ -992,7 +993,7 @@ std::pair ModuleMap::findOrCreateModule(StringRef Name, Module *ModuleMap::createModule(StringRef Name, Module *Parent, bool IsFramework, bool IsExplicit) { - assert(lookupModuleQualified(Name, Parent) == nullptr && + assert(!lookupModuleQualified(Name, Parent).getExisting() && "Creating duplicate submodule"); Module *Result = new (ModulesAlloc.Allocate()) diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index b08459632aacb..761bf8e9af56b 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -1450,7 +1450,7 @@ void Preprocessor::makeModuleVisible(Module *M, SourceLocation Loc, // Add this module to the imports list of the currently-built submodule. if (!BuildingSubmoduleStack.empty() && M != BuildingSubmoduleStack.back().M) - BuildingSubmoduleStack.back().M->Imports.insert(M); + BuildingSubmoduleStack.back().M->Imports.push_back(M); } bool Preprocessor::FinishLexStringLiteral(Token &Result, std::string &String, diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index b96065f8619d2..e4e55bb7d0ac7 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -2032,7 +2032,8 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) { // Directly imported module are necessarily reachable. // Since we can't export import a module implementation partition unit, we // don't need to count for Exports here. 
- if (CurrentM && CurrentM->getTopLevelModule()->Imports.count(DeclTopModule)) + if (CurrentM && + llvm::is_contained(CurrentM->getTopLevelModule()->Imports, DeclTopModule)) return true; // Then we treat all module implementation partition unit as unreachable. diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 67f46b64cf047..caa61a99a6914 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -483,7 +483,7 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // Sequence initialization of the imported module before that of the current // module, if any. Context.addModuleInitializer(ModuleScopes.back().Module, Import); - Mod->Imports.insert(Interface); // As if we imported it. + Mod->Imports.push_back(Interface); // As if we imported it. // Also save this as a shortcut to checking for decls in the interface ThePrimaryInterface = Interface; // If we made an implicit import of the module interface, then return the @@ -710,7 +710,7 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, if (ExportLoc.isValid() || getEnclosingExportDecl(Import)) getCurrentModule()->Exports.emplace_back(Mod, false); else - getCurrentModule()->Imports.insert(Mod); + getCurrentModule()->Imports.push_back(Mod); } HadImportedNamedModules = true; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 7e8bb6509e84b..6b242f553c59d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -3762,8 +3762,13 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case SUBMODULE_BLOCK_ID: - if (llvm::Error Err = ReadSubmoduleBlock(F, ClientLoadCapabilities)) + F.SubmodulesCursor = Stream; + if (llvm::Error Err = Stream.SkipBlock()) + return Err; + if (llvm::Error Err = + ReadBlockAbbrevs(F.SubmodulesCursor, SUBMODULE_BLOCK_ID)) return Err; + F.SubmodulesOffsetBase = F.SubmodulesCursor.GetCurrentBitNo(); break; case 
COMMENTS_BLOCK_ID: { @@ -3815,6 +3820,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, case HEADER_SEARCH_TABLE: case IMPORTED_MODULES: case MACRO_OFFSET: + case SUBMODULE_METADATA: break; default: continue; @@ -3825,6 +3831,50 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, default: // Default behavior: ignore. break; + case SUBMODULE_METADATA: { + F.BaseSubmoduleID = getTotalNumSubmodules(); + F.LocalNumSubmodules = Record[0]; + F.LocalBaseSubmoduleID = Record[1]; + F.LocalTopLevelSubmoduleID = Record[2]; + F.SubmoduleOffsets = + (const llvm::support::unaligned_uint64_t *)Blob.data(); + if (F.LocalNumSubmodules > 0) { + // Introduce the global -> local mapping for submodules within this + // module. + GlobalSubmoduleMap.insert( + std::make_pair(getTotalNumSubmodules() + 1, &F)); + + // Introduce the local -> global mapping for submodules within this + // module. + F.SubmoduleRemap.insertOrReplace( + std::make_pair(F.LocalBaseSubmoduleID, + F.BaseSubmoduleID - F.LocalBaseSubmoduleID)); + + SubmodulesLoaded.resize(SubmodulesLoaded.size() + F.LocalNumSubmodules); + } + + auto ReadSubmodule = [&](unsigned LocalID) -> Module * { + return getSubmodule(getGlobalSubmoduleID(F, LocalID)); + }; + + if (PP.getHeaderSearchInfo().getModuleMap().findModule(F.ModuleName)) { + // If we already knew about this module, make sure to bring all + // submodules up to date. + for (unsigned Index = 0; Index != F.LocalNumSubmodules; ++Index) { + unsigned LocalID = + Index + F.LocalBaseSubmoduleID + NUM_PREDEF_SUBMODULE_IDS; + ReadSubmodule(LocalID); + } + } else { + // If we didn't know this module, we loaded it transitively. Deserialize + // just the top-level module to register it with ModuleMap, but load the + // rest lazily. 
+ ReadSubmodule(F.LocalTopLevelSubmoduleID); + } + + break; + } + case TYPE_OFFSET: { if (F.LocalNumTypes != 0) return llvm::createStringError( @@ -5103,41 +5153,6 @@ ASTReader::ASTReadResult ASTReader::ReadAST(ModuleFileName FileName, F.ImportLoc = TranslateSourceLocation(*M.ImportedBy, M.ImportLoc); } - // Resolve any unresolved module exports. - for (unsigned I = 0, N = UnresolvedModuleRefs.size(); I != N; ++I) { - UnresolvedModuleRef &Unresolved = UnresolvedModuleRefs[I]; - SubmoduleID GlobalID = getGlobalSubmoduleID(*Unresolved.File,Unresolved.ID); - Module *ResolvedMod = getSubmodule(GlobalID); - - switch (Unresolved.Kind) { - case UnresolvedModuleRef::Conflict: - if (ResolvedMod) { - Module::Conflict Conflict; - Conflict.Other = ResolvedMod; - Conflict.Message = Unresolved.String.str(); - Unresolved.Mod->Conflicts.push_back(Conflict); - } - continue; - - case UnresolvedModuleRef::Import: - if (ResolvedMod) - Unresolved.Mod->Imports.insert(ResolvedMod); - continue; - - case UnresolvedModuleRef::Affecting: - if (ResolvedMod) - Unresolved.Mod->AffectingClangModules.insert(ResolvedMod); - continue; - - case UnresolvedModuleRef::Export: - if (ResolvedMod || Unresolved.IsWildcard) - Unresolved.Mod->Exports.push_back(Module::ExportDecl( - ResolvedMod, static_cast(Unresolved.IsWildcard))); - continue; - } - } - UnresolvedModuleRefs.clear(); - // FIXME: How do we load the 'use'd modules? They may not be submodules. // Might be unnecessary as use declarations are only used to build the // module itself. @@ -6293,11 +6308,34 @@ bool ASTReader::isAcceptableASTFile( /*ValidateDiagnosticOptions=*/true); } -llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, - unsigned ClientLoadCapabilities) { - // Enter the submodule block. 
- if (llvm::Error Err = F.Stream.EnterSubBlock(SUBMODULE_BLOCK_ID)) - return Err; +Module *ASTReader::getSubmodule(uint32_t GlobalID) { + if (GlobalID < NUM_PREDEF_SUBMODULE_IDS) { + assert(GlobalID == 0 && "Unhandled global submodule ID"); + return nullptr; + } + + SubmoduleID GlobalIndex = GlobalID - NUM_PREDEF_SUBMODULE_IDS; + if (GlobalIndex >= SubmodulesLoaded.size()) { + Error("submodule ID out of range in AST file"); + return nullptr; + } + + if (SubmodulesLoaded[GlobalIndex]) + return SubmodulesLoaded[GlobalIndex]; + + GlobalSubmoduleMapType::iterator It = GlobalSubmoduleMap.find(GlobalID); + assert(It != GlobalSubmoduleMap.end()); + ModuleFile &F = *It->second; + unsigned Index = GlobalID - F.BaseSubmoduleID - NUM_PREDEF_SUBMODULE_IDS; + unsigned LocalID = Index + F.LocalBaseSubmoduleID + NUM_PREDEF_SUBMODULE_IDS; + + BitstreamCursor &Cursor = F.SubmodulesCursor; + SavedStreamPosition SavedPosition(Cursor); + unsigned Offset = F.SubmoduleOffsets[Index]; + if (llvm::Error Err = Cursor.JumpToBit(F.SubmodulesOffsetBase + Offset)) { + Error(std::move(Err)); + return nullptr; + } ModuleMap &ModMap = PP.getHeaderSearchInfo().getModuleMap(); bool KnowsTopLevelModule = ModMap.findModule(F.ModuleName) != nullptr; @@ -6308,23 +6346,24 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, ? &ModuleMap::createModule : &ModuleMap::findOrCreateModuleFirst; - bool First = true; Module *CurrentModule = nullptr; RecordData Record; while (true) { - Expected MaybeEntry = - F.Stream.advanceSkippingSubblocks(); - if (!MaybeEntry) - return MaybeEntry.takeError(); + Expected MaybeEntry = Cursor.advance(); + if (!MaybeEntry) { + Error(MaybeEntry.takeError()); + return nullptr; + } llvm::BitstreamEntry Entry = MaybeEntry.get(); switch (Entry.Kind) { - case llvm::BitstreamEntry::SubBlock: // Handled for us already. 
+ case llvm::BitstreamEntry::SubBlock: case llvm::BitstreamEntry::Error: - return llvm::createStringError(std::errc::illegal_byte_sequence, - "malformed block record in AST file"); - case llvm::BitstreamEntry::EndBlock: - return llvm::Error::success(); + case llvm::BitstreamEntry::EndBlock: { + Error(llvm::createStringError(std::errc::illegal_byte_sequence, + "malformed block record in AST file")); + return nullptr; + } case llvm::BitstreamEntry::Record: // The interesting case. break; @@ -6333,35 +6372,35 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, // Read a record. StringRef Blob; Record.clear(); - Expected MaybeKind = F.Stream.readRecord(Entry.ID, Record, &Blob); - if (!MaybeKind) - return MaybeKind.takeError(); - unsigned Kind = MaybeKind.get(); - - if ((Kind == SUBMODULE_METADATA) != First) - return llvm::createStringError( - std::errc::illegal_byte_sequence, - "submodule metadata record should be at beginning of block"); - First = false; - - // Submodule information is only valid if we have a current module. - // FIXME: Should we error on these cases? - if (!CurrentModule && Kind != SUBMODULE_METADATA && - Kind != SUBMODULE_DEFINITION) - continue; + Expected MaybeKind = Cursor.readRecord(Entry.ID, Record, &Blob); + if (!MaybeKind) { + Error(MaybeKind.takeError()); + return nullptr; + } + auto Kind = static_cast(MaybeKind.get()); switch (Kind) { - default: // Default behavior: ignore. 
- break; + case SUBMODULE_END: + if (!CurrentModule) { + Error(llvm::createStringError(std::errc::illegal_byte_sequence, + "malformed module definition")); + return nullptr; + } + return CurrentModule; case SUBMODULE_DEFINITION: { - if (Record.size() < 13) - return llvm::createStringError(std::errc::illegal_byte_sequence, - "malformed module definition"); + if (Record.size() < 13) { + Error(llvm::createStringError(std::errc::illegal_byte_sequence, + "malformed module definition")); + return nullptr; + } StringRef Name = Blob; unsigned Idx = 0; - SubmoduleID GlobalID = getGlobalSubmoduleID(F, Record[Idx++]); + unsigned ReadLocalID = Record[Idx++]; + assert(LocalID == ReadLocalID); + SubmoduleID ReadGlobalID = getGlobalSubmoduleID(F, ReadLocalID); + assert(GlobalID == ReadGlobalID); SubmoduleID Parent = getGlobalSubmoduleID(F, Record[Idx++]); Module::ModuleKind Kind = (Module::ModuleKind)Record[Idx++]; SourceLocation DefinitionLoc = ReadSourceLocation(F, Record[Idx++]); @@ -6378,18 +6417,15 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, bool NamedModuleHasInit = Record[Idx++]; Module *ParentModule = nullptr; - if (Parent) + if (Parent) { ParentModule = getSubmodule(Parent); + if (!ParentModule) + return nullptr; + } CurrentModule = std::invoke(CreateModule, &ModMap, Name, ParentModule, IsFramework, IsExplicit); - SubmoduleID GlobalIndex = GlobalID - NUM_PREDEF_SUBMODULE_IDS; - if (GlobalIndex >= SubmodulesLoaded.size() || - SubmodulesLoaded[GlobalIndex]) - return llvm::createStringError(std::errc::invalid_argument, - "too many submodules"); - if (!ParentModule) { if ([[maybe_unused]] const ModuleFileKey *CurFileKey = CurrentModule->getASTFileKey()) { @@ -6410,7 +6446,7 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, Diag(diag::note_module_file_conflict) << CurModMapFile->getName() << ModMapFile->getName(); - return llvm::make_error(); + return nullptr; } } @@ -6520,59 +6556,29 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, break; 
} - case SUBMODULE_METADATA: { - F.BaseSubmoduleID = getTotalNumSubmodules(); - F.LocalNumSubmodules = Record[0]; - unsigned LocalBaseSubmoduleID = Record[1]; - if (F.LocalNumSubmodules > 0) { - // Introduce the global -> local mapping for submodules within this - // module. - GlobalSubmoduleMap.insert(std::make_pair(getTotalNumSubmodules()+1,&F)); - - // Introduce the local -> global mapping for submodules within this - // module. - F.SubmoduleRemap.insertOrReplace( - std::make_pair(LocalBaseSubmoduleID, - F.BaseSubmoduleID - LocalBaseSubmoduleID)); - - SubmodulesLoaded.resize(SubmodulesLoaded.size() + F.LocalNumSubmodules); - } - break; - } - case SUBMODULE_IMPORTS: for (unsigned Idx = 0; Idx != Record.size(); ++Idx) { - UnresolvedModuleRef Unresolved; - Unresolved.File = &F; - Unresolved.Mod = CurrentModule; - Unresolved.ID = Record[Idx]; - Unresolved.Kind = UnresolvedModuleRef::Import; - Unresolved.IsWildcard = false; - UnresolvedModuleRefs.push_back(Unresolved); + SubmoduleID GlobalID = getGlobalSubmoduleID(F, Record[Idx]); + CurrentModule->Imports.push_back(ModuleRef(this, GlobalID)); } break; case SUBMODULE_AFFECTING_MODULES: for (unsigned Idx = 0; Idx != Record.size(); ++Idx) { - UnresolvedModuleRef Unresolved; - Unresolved.File = &F; - Unresolved.Mod = CurrentModule; - Unresolved.ID = Record[Idx]; - Unresolved.Kind = UnresolvedModuleRef::Affecting; - Unresolved.IsWildcard = false; - UnresolvedModuleRefs.push_back(Unresolved); + SubmoduleID GlobalID = getGlobalSubmoduleID(F, Record[Idx]); + CurrentModule->AffectingClangModules.push_back( + ModuleRef(this, GlobalID)); } break; case SUBMODULE_EXPORTS: for (unsigned Idx = 0; Idx + 1 < Record.size(); Idx += 2) { - UnresolvedModuleRef Unresolved; - Unresolved.File = &F; - Unresolved.Mod = CurrentModule; - Unresolved.ID = Record[Idx]; - Unresolved.Kind = UnresolvedModuleRef::Export; - Unresolved.IsWildcard = Record[Idx + 1]; - UnresolvedModuleRefs.push_back(Unresolved); + SubmoduleID GlobalID = 
getGlobalSubmoduleID(F, Record[Idx]); + bool IsWildcard = Record[Idx + 1]; + ModuleRef ExportedMod = + GlobalID ? ModuleRef(this, GlobalID) : ModuleRef(); + if (ExportedMod || IsWildcard) + CurrentModule->Exports.push_back({ExportedMod, IsWildcard}); } // Once we've loaded the set of exports, there's no reason to keep @@ -6596,14 +6602,11 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, break; case SUBMODULE_CONFLICT: { - UnresolvedModuleRef Unresolved; - Unresolved.File = &F; - Unresolved.Mod = CurrentModule; - Unresolved.ID = Record[0]; - Unresolved.Kind = UnresolvedModuleRef::Conflict; - Unresolved.IsWildcard = false; - Unresolved.String = Blob; - UnresolvedModuleRefs.push_back(Unresolved); + SubmoduleID GlobalID = getGlobalSubmoduleID(F, Record[0]); + Module::Conflict Conflict; + Conflict.Other = ModuleRef(this, GlobalID); + Conflict.Message = Blob.str(); + CurrentModule->Conflicts.push_back(Conflict); break; } @@ -6624,6 +6627,13 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, CurrentModule->ExportAsModule = Blob.str(); ModMap.addLinkAsDependency(CurrentModule); break; + + case SUBMODULE_CHILD: { + // Record a not-yet-loaded direct child for on-demand deserialization. 
+ SubmoduleID GlobalID = getGlobalSubmoduleID(F, Record[0]); + CurrentModule->addSubmodule(Blob, this, GlobalID); + break; + } } } } @@ -10061,20 +10071,6 @@ ASTReader::getGlobalSubmoduleID(ModuleFile &M, unsigned LocalID) const { return LocalID + I->second; } -Module *ASTReader::getSubmodule(SubmoduleID GlobalID) { - if (GlobalID < NUM_PREDEF_SUBMODULE_IDS) { - assert(GlobalID == 0 && "Unhandled global submodule ID"); - return nullptr; - } - - if (GlobalID > SubmodulesLoaded.size()) { - Error("submodule ID out of range in AST file"); - return nullptr; - } - - return SubmodulesLoaded[GlobalID - NUM_PREDEF_SUBMODULE_IDS]; -} - Module *ASTReader::getModule(unsigned ID) { return getSubmodule(ID); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index ba644fefc109a..1970ed86589b5 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -919,6 +919,7 @@ void ASTWriter::WriteBlockInfoBlock() { // AST Top-Level Block. BLOCK(AST_BLOCK); + RECORD(SUBMODULE_METADATA); RECORD(TYPE_OFFSET); RECORD(DECL_OFFSET); RECORD(IDENTIFIER_OFFSET); @@ -997,7 +998,7 @@ void ASTWriter::WriteBlockInfoBlock() { // Submodule Block. BLOCK(SUBMODULE_BLOCK); - RECORD(SUBMODULE_METADATA); + RECORD(SUBMODULE_END); RECORD(SUBMODULE_DEFINITION); RECORD(SUBMODULE_UMBRELLA_HEADER); RECORD(SUBMODULE_HEADER); @@ -1016,6 +1017,7 @@ void ASTWriter::WriteBlockInfoBlock() { RECORD(SUBMODULE_PRIVATE_TEXTUAL_HEADER); RECORD(SUBMODULE_INITIALIZERS); RECORD(SUBMODULE_EXPORT_AS); + RECORD(SUBMODULE_CHILD); // Comments Block. BLOCK(COMMENTS_BLOCK); @@ -2983,16 +2985,6 @@ unsigned ASTWriter::getSubmoduleID(Module *Mod) { return ID; } -/// Compute the number of modules within the given tree (including the -/// given module). 
-static unsigned getNumberOfModules(Module *Mod) { - unsigned ChildModules = 0; - for (Module *Submodule : Mod->submodules()) - ChildModules += getNumberOfModules(Submodule); - - return ChildModules + 1; -} - void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) { // Enter the submodule description block. Stream.EnterSubblock(SUBMODULE_BLOCK_ID, /*bits for abbreviations*/5); @@ -3088,11 +3080,16 @@ void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) { Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Macro name unsigned ExportAsAbbrev = Stream.EmitAbbrev(std::move(Abbrev)); - // Write the submodule metadata block. - RecordData::value_type Record[] = { - getNumberOfModules(WritingModule), - FirstSubmoduleID - NUM_PREDEF_SUBMODULE_IDS}; - Stream.EmitRecord(SUBMODULE_METADATA, Record); + Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_CHILD)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Child submodule ID + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Child name + unsigned ChildAbbrev = Stream.EmitAbbrev(std::move(Abbrev)); + + SmallVector SubmoduleOffsets; + uint64_t SubmoduleOffsetBase = Stream.GetCurrentBitNo(); + + unsigned TopLevelID = getSubmoduleID(WritingModule); // Write all of the submodules. std::queue Q; @@ -3101,6 +3098,19 @@ void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) { Module *Mod = Q.front(); Q.pop(); unsigned ID = getSubmoduleID(Mod); + if (ID < FirstSubmoduleID) { + assert(0 && "Loaded submodule entered WritingModule ?"); + continue; + } + + // Record the local offset of this submodule. 
+ unsigned Index = ID - FirstSubmoduleID; + if (Index >= SubmoduleOffsets.size()) + SubmoduleOffsets.resize(Index + 1); + + uint64_t Offset = Stream.GetCurrentBitNo() - SubmoduleOffsetBase; + assert((Offset >> 32) == 0 && "Submodule offset too large"); + SubmoduleOffsets[Index] = Offset; uint64_t ParentID = 0; if (Mod->Parent) { @@ -3259,6 +3269,20 @@ void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) { Stream.EmitRecordWithBlob(ExportAsAbbrev, Record, Mod->ExportAsModule); } + // Emit one SUBMODULE_CHILD record per direct child so the reader can + // populate PendingSubmodules and demand-load children by name. + for (Module *Child : Mod->submodules()) { + RecordData::value_type Record[] = {SUBMODULE_CHILD, + getSubmoduleID(Child)}; + Stream.EmitRecordWithBlob(ChildAbbrev, Record, Child->Name); + } + + // Emit the sentinel signifying the end of this submodule. + { + RecordData Record; + Stream.EmitRecord(SUBMODULE_END, Record); + } + // Queue up the submodules of this module. 
for (Module *M : Mod->submodules()) Q.push(M); @@ -3266,10 +3290,23 @@ void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) { Stream.ExitBlock(); - assert((NextSubmoduleID - FirstSubmoduleID == - getNumberOfModules(WritingModule)) && + assert((NextSubmoduleID - FirstSubmoduleID == SubmoduleOffsets.size()) && "Wrong # of submodules; found a reference to a non-local, " "non-imported submodule?"); + + Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_METADATA)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Submodule count + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Base submodule ID + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Top-level submod ID + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Submodule offsets + unsigned SubmoduleMetadataAbbrev = Stream.EmitAbbrev(std::move(Abbrev)); + + RecordData::value_type Record[] = { + SUBMODULE_METADATA, SubmoduleOffsets.size(), + FirstSubmoduleID - NUM_PREDEF_SUBMODULE_IDS, TopLevelID}; + Stream.EmitRecordWithBlob(SubmoduleMetadataAbbrev, Record, + bytes(SubmoduleOffsets)); } void ASTWriter::WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag, From 180409dc67570ef26b99d8a4dc7d1df6734152bc Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 08:47:37 -0700 Subject: [PATCH 061/538] [lldb] Add --changed option to `settings show` (#196390) Add a `--changed`/`-c` flag to `settings show` that restricts the output to settings whose current value differs from the default. This makes it easy to inspect what has been customized in a session or config without scrolling through the full property tree. 
One thing worth calling out is that this works as expected with explicit property paths, for example you can show only the modified settings belonging to `target`: ``` (lldb) set show -c target target.load-script-from-symbol-file (enum) = true (default: trusted) (lldb) ``` If nothing has been changed, the output is empty: ``` (lldb) sett show -c target.process (lldb) ``` rdar://176483441 --- lldb/include/lldb/Interpreter/OptionValue.h | 8 ++ .../lldb/Interpreter/OptionValueArch.h | 2 + .../lldb/Interpreter/OptionValueBoolean.h | 2 + .../lldb/Interpreter/OptionValueChar.h | 2 + .../lldb/Interpreter/OptionValueEnumeration.h | 2 + .../lldb/Interpreter/OptionValueFileSpec.h | 2 + .../lldb/Interpreter/OptionValueFormat.h | 2 + .../Interpreter/OptionValueFormatEntity.h | 4 + .../lldb/Interpreter/OptionValueLanguage.h | 2 + .../lldb/Interpreter/OptionValueProperties.h | 2 + .../lldb/Interpreter/OptionValueRegex.h | 4 + .../lldb/Interpreter/OptionValueSInt64.h | 2 + .../lldb/Interpreter/OptionValueString.h | 2 + .../lldb/Interpreter/OptionValueUInt64.h | 2 + .../source/Commands/CommandObjectSettings.cpp | 16 +++ lldb/source/Commands/Options.td | 3 + .../Interpreter/OptionValueProperties.cpp | 11 ++ .../API/commands/settings/TestSettings.py | 105 ++++++++++++++++++ 18 files changed, 173 insertions(+) diff --git a/lldb/include/lldb/Interpreter/OptionValue.h b/lldb/include/lldb/Interpreter/OptionValue.h index 9c992821251cb..7e48a675e2b7f 100644 --- a/lldb/include/lldb/Interpreter/OptionValue.h +++ b/lldb/include/lldb/Interpreter/OptionValue.h @@ -63,6 +63,7 @@ class OptionValue { eDumpOptionRaw = (1u << 4), eDumpOptionCommand = (1u << 5), eDumpOptionDefaultValue = (1u << 6), + eDumpOptionOnlyChanged = (1u << 7), eDumpGroupValue = (eDumpOptionName | eDumpOptionType | eDumpOptionValue), eDumpGroupHelp = (eDumpOptionName | eDumpOptionType | eDumpOptionDescription), @@ -249,6 +250,13 @@ class OptionValue { void SetOptionWasSet() { m_value_was_set = true; } + /// Return true if 
the current value equals the default value. + /// + /// Subclasses that store a default value should override this to compare + /// against it. The base implementation falls back to `OptionWasSet()`, which + /// is a reasonable approximation for types without an explicit default. + virtual bool IsDefault() const { return !OptionWasSet(); } + void SetParent(const lldb::OptionValueSP &parent_sp) { m_parent_wp = parent_sp; } diff --git a/lldb/include/lldb/Interpreter/OptionValueArch.h b/lldb/include/lldb/Interpreter/OptionValueArch.h index 3ba07b65dd618..8b6954f03dd29 100644 --- a/lldb/include/lldb/Interpreter/OptionValueArch.h +++ b/lldb/include/lldb/Interpreter/OptionValueArch.h @@ -49,6 +49,8 @@ class OptionValueArch : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + void AutoComplete(CommandInterpreter &interpreter, lldb_private::CompletionRequest &request) override; diff --git a/lldb/include/lldb/Interpreter/OptionValueBoolean.h b/lldb/include/lldb/Interpreter/OptionValueBoolean.h index 6d15dcd2fca5d..72c1ce446b8a0 100644 --- a/lldb/include/lldb/Interpreter/OptionValueBoolean.h +++ b/lldb/include/lldb/Interpreter/OptionValueBoolean.h @@ -45,6 +45,8 @@ class OptionValueBoolean : public Cloneable { void AutoComplete(CommandInterpreter &interpreter, CompletionRequest &request) override; + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions /// Convert to bool operator. 
diff --git a/lldb/include/lldb/Interpreter/OptionValueChar.h b/lldb/include/lldb/Interpreter/OptionValueChar.h index 2e2cf1ac1e08d..c1f83a3daf846 100644 --- a/lldb/include/lldb/Interpreter/OptionValueChar.h +++ b/lldb/include/lldb/Interpreter/OptionValueChar.h @@ -43,6 +43,8 @@ class OptionValueChar : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions const char &operator=(char c) { diff --git a/lldb/include/lldb/Interpreter/OptionValueEnumeration.h b/lldb/include/lldb/Interpreter/OptionValueEnumeration.h index 91ab454b2065e..e8566934d9fc5 100644 --- a/lldb/include/lldb/Interpreter/OptionValueEnumeration.h +++ b/lldb/include/lldb/Interpreter/OptionValueEnumeration.h @@ -52,6 +52,8 @@ class OptionValueEnumeration m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + void AutoComplete(CommandInterpreter &interpreter, CompletionRequest &request) override; diff --git a/lldb/include/lldb/Interpreter/OptionValueFileSpec.h b/lldb/include/lldb/Interpreter/OptionValueFileSpec.h index 66c5e328180f5..66f2b2a04ff53 100644 --- a/lldb/include/lldb/Interpreter/OptionValueFileSpec.h +++ b/lldb/include/lldb/Interpreter/OptionValueFileSpec.h @@ -53,6 +53,8 @@ class OptionValueFileSpec : public Cloneable { void AutoComplete(CommandInterpreter &interpreter, CompletionRequest &request) override; + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions FileSpec &GetCurrentValue() { return m_current_value; } diff --git a/lldb/include/lldb/Interpreter/OptionValueFormat.h b/lldb/include/lldb/Interpreter/OptionValueFormat.h index 5fd3192304573..661e8b507d64f 100644 --- a/lldb/include/lldb/Interpreter/OptionValueFormat.h +++ b/lldb/include/lldb/Interpreter/OptionValueFormat.h @@ -42,6 +42,8 @@ class OptionValueFormat m_value_was_set = false; } + bool IsDefault() const 
override { return m_current_value == m_default_value; } + // Subclass specific functions lldb::Format GetCurrentValue() const { return m_current_value; } diff --git a/lldb/include/lldb/Interpreter/OptionValueFormatEntity.h b/lldb/include/lldb/Interpreter/OptionValueFormatEntity.h index c10d56cbeb70b..bbc1f8c1eec43 100644 --- a/lldb/include/lldb/Interpreter/OptionValueFormatEntity.h +++ b/lldb/include/lldb/Interpreter/OptionValueFormatEntity.h @@ -34,6 +34,10 @@ class OptionValueFormatEntity void Clear() override; + bool IsDefault() const override { + return m_current_format == m_default_format; + } + void AutoComplete(CommandInterpreter &interpreter, CompletionRequest &request) override; diff --git a/lldb/include/lldb/Interpreter/OptionValueLanguage.h b/lldb/include/lldb/Interpreter/OptionValueLanguage.h index e1c1f85493ad6..41ddb2a13f15e 100644 --- a/lldb/include/lldb/Interpreter/OptionValueLanguage.h +++ b/lldb/include/lldb/Interpreter/OptionValueLanguage.h @@ -44,6 +44,8 @@ class OptionValueLanguage : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions lldb::LanguageType GetCurrentValue() const { return m_current_value; } diff --git a/lldb/include/lldb/Interpreter/OptionValueProperties.h b/lldb/include/lldb/Interpreter/OptionValueProperties.h index 21da8e584a7b4..d9b6c4764f4a4 100644 --- a/lldb/include/lldb/Interpreter/OptionValueProperties.h +++ b/lldb/include/lldb/Interpreter/OptionValueProperties.h @@ -46,6 +46,8 @@ class OptionValueProperties void DumpValue(const ExecutionContext *exe_ctx, Stream &strm, uint32_t dump_mask) override; + bool IsDefault() const override; + llvm::json::Value ToJSON(const ExecutionContext *exe_ctx) const override; llvm::StringRef GetName() const override { return m_name; } diff --git a/lldb/include/lldb/Interpreter/OptionValueRegex.h b/lldb/include/lldb/Interpreter/OptionValueRegex.h index b952cb2476012..2799fea1538dc 100644 
--- a/lldb/include/lldb/Interpreter/OptionValueRegex.h +++ b/lldb/include/lldb/Interpreter/OptionValueRegex.h @@ -41,6 +41,10 @@ class OptionValueRegex : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { + return m_regex.GetText() == m_default_regex_str; + } + // Subclass specific functions const RegularExpression *GetCurrentValue() const { return (m_regex.IsValid() ? &m_regex : nullptr); diff --git a/lldb/include/lldb/Interpreter/OptionValueSInt64.h b/lldb/include/lldb/Interpreter/OptionValueSInt64.h index c220ac29e461f..f19f3f8ab875e 100644 --- a/lldb/include/lldb/Interpreter/OptionValueSInt64.h +++ b/lldb/include/lldb/Interpreter/OptionValueSInt64.h @@ -48,6 +48,8 @@ class OptionValueSInt64 : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions const int64_t &operator=(int64_t value) { diff --git a/lldb/include/lldb/Interpreter/OptionValueString.h b/lldb/include/lldb/Interpreter/OptionValueString.h index 4ec98176b6f8b..e199443fa8b49 100644 --- a/lldb/include/lldb/Interpreter/OptionValueString.h +++ b/lldb/include/lldb/Interpreter/OptionValueString.h @@ -82,6 +82,8 @@ class OptionValueString : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions Flags &GetOptions() { return m_options; } diff --git a/lldb/include/lldb/Interpreter/OptionValueUInt64.h b/lldb/include/lldb/Interpreter/OptionValueUInt64.h index 087c1d3ee321a..2a87c19c54bbf 100644 --- a/lldb/include/lldb/Interpreter/OptionValueUInt64.h +++ b/lldb/include/lldb/Interpreter/OptionValueUInt64.h @@ -51,6 +51,8 @@ class OptionValueUInt64 : public Cloneable { m_value_was_set = false; } + bool IsDefault() const override { return m_current_value == m_default_value; } + // Subclass specific functions const uint64_t &operator=(uint64_t value) { diff --git 
a/lldb/source/Commands/CommandObjectSettings.cpp b/lldb/source/Commands/CommandObjectSettings.cpp index 126f57c738115..34a59d506da7f 100644 --- a/lldb/source/Commands/CommandObjectSettings.cpp +++ b/lldb/source/Commands/CommandObjectSettings.cpp @@ -264,6 +264,9 @@ class CommandObjectSettingsShow : public CommandObjectParsed { case 'd': m_include_defaults = true; break; + case 'c': + m_only_changed = true; + break; default: llvm_unreachable("Unimplemented option"); } @@ -272,6 +275,7 @@ class CommandObjectSettingsShow : public CommandObjectParsed { void OptionParsingStarting(ExecutionContext *execution_context) override { m_include_defaults = false; + m_only_changed = false; } llvm::ArrayRef GetDefinitions() override { @@ -279,6 +283,7 @@ class CommandObjectSettingsShow : public CommandObjectParsed { } bool m_include_defaults = false; + bool m_only_changed = false; }; protected: @@ -288,9 +293,20 @@ class CommandObjectSettingsShow : public CommandObjectParsed { uint32_t dump_mask = OptionValue::eDumpGroupValue; if (m_options.m_include_defaults) dump_mask |= OptionValue::eDumpOptionDefaultValue; + if (m_options.m_only_changed) { + dump_mask |= OptionValue::eDumpOptionOnlyChanged; + dump_mask |= OptionValue::eDumpOptionDefaultValue; + } if (!args.empty()) { for (const auto &arg : args) { + if (m_options.m_only_changed) { + Status lookup_error; + lldb::OptionValueSP value_sp = GetDebugger().GetPropertyValue( + &m_exe_ctx, arg.ref(), lookup_error); + if (value_sp && value_sp->IsDefault()) + continue; + } Status error(GetDebugger().DumpPropertyValue( &m_exe_ctx, result.GetOutputStream(), arg.ref(), dump_mask)); if (error.Success()) { diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index ba6781dcab04e..123ba7bdb257e 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -71,6 +71,9 @@ let Command = "settings clear" in { let Command = "settings show" in { def setshow_defaults : Option<"defaults", "d">, 
Desc<"Include ${d}efault values if defined.">; + def setshow_changed : Option<"changed", "c">, + Desc<"Only show settings whose value differs from the " + "default.">; } let Command = "breakpoint list" in { diff --git a/lldb/source/Interpreter/OptionValueProperties.cpp b/lldb/source/Interpreter/OptionValueProperties.cpp index def6cc462f76a..0034e79018727 100644 --- a/lldb/source/Interpreter/OptionValueProperties.cpp +++ b/lldb/source/Interpreter/OptionValueProperties.cpp @@ -342,6 +342,8 @@ void OptionValueProperties::DumpValue(const ExecutionContext *exe_ctx, if (property) { OptionValue *option_value = property->GetValue().get(); assert(option_value); + if ((dump_mask & eDumpOptionOnlyChanged) && option_value->IsDefault()) + continue; const bool transparent_value = option_value->ValueIsTransparent(); property->Dump(exe_ctx, strm, dump_mask); if (!transparent_value) @@ -350,6 +352,15 @@ void OptionValueProperties::DumpValue(const ExecutionContext *exe_ctx, } } +bool OptionValueProperties::IsDefault() const { + for (const Property &property : m_properties) { + if (OptionValue *value = property.GetValue().get()) + if (!value->IsDefault()) + return false; + } + return true; +} + llvm::json::Value OptionValueProperties::ToJSON(const ExecutionContext *exe_ctx) const { llvm::json::Object json_properties; diff --git a/lldb/test/API/commands/settings/TestSettings.py b/lldb/test/API/commands/settings/TestSettings.py index 8410befe399a3..1a93034d2707a 100644 --- a/lldb/test/API/commands/settings/TestSettings.py +++ b/lldb/test/API/commands/settings/TestSettings.py @@ -237,6 +237,111 @@ def test_set_auto_confirm(self): startstr="auto-confirm (boolean) = false", ) + def test_settings_show_changed(self): + """Test `settings show --changed` filters the listing to non-default values.""" + setting = "target.max-children-count" + + def cleanup(): + self.runCmd("settings clear %s" % setting, check=False) + + self.addTearDownHook(cleanup) + + # Ensure a clean slate for this setting. 
+ self.runCmd("settings clear %s" % setting) + + # With the setting at its default, it should not show up under --changed. + self.expect( + "settings show --changed", + matching=False, + substrs=[setting], + ) + + # After explicitly changing the setting, it should show up along with + # the default value. + self.runCmd("settings set %s 42" % setting) + self.expect( + "settings show --changed", + substrs=["%s (unsigned) = 42 (default: 24)" % setting], + ) + + # After clearing, it should no longer show up. + self.runCmd("settings clear %s" % setting) + self.expect( + "settings show --changed", + matching=False, + substrs=[setting], + ) + + # An explicit property path at its default prints nothing. + self.expect( + "settings show --changed %s" % setting, + matching=False, + substrs=[setting], + ) + + # When the value has been changed, the explicit path prints the + # current value and default. + self.runCmd("settings set %s 42" % setting) + self.expect( + "settings show --changed %s" % setting, + substrs=["%s (unsigned) = 42 (default: 24)" % setting], + ) + + def test_settings_show_changed_per_target(self): + """Test that `settings show --changed` reflects per-target values: a + per-target setting changed in target A should show as changed when A is + selected, and as unchanged when target B is selected.""" + setting = "target.max-children-count" + + def cleanup(): + self.runCmd("settings clear %s" % setting, check=False) + + self.addTearDownHook(cleanup) + self.runCmd("settings clear %s" % setting) + + target_a = self.dbg.CreateTarget("") + self.assertTrue(target_a.IsValid(), "Created target A") + target_b = self.dbg.CreateTarget("") + self.assertTrue(target_b.IsValid(), "Created target B") + + index_a = self.dbg.GetIndexOfTarget(target_a) + index_b = self.dbg.GetIndexOfTarget(target_b) + + # Select target A and override the per-target setting there. 
+ self.runCmd("target select %d" % index_a) + self.runCmd("settings set %s 42" % setting) + + # With A selected, the changed listing should include the override. + self.expect( + "settings show --changed", + substrs=["%s (unsigned) = 42 (default: 24)" % setting], + ) + self.expect( + "settings show --changed %s" % setting, + substrs=["%s (unsigned) = 42 (default: 24)" % setting], + ) + + # Switch to target B: the same setting is still at its default for B, + # so it must not appear in either form of the changed listing. + self.runCmd("target select %d" % index_b) + self.expect( + "settings show --changed", + matching=False, + substrs=[setting], + ) + self.expect( + "settings show --changed %s" % setting, + matching=False, + substrs=[setting], + ) + + # Sanity check: switching back to A still shows the override. + self.runCmd("target select %d" % index_a) + self.expect( + "settings show --changed %s" % setting, + substrs=["%s (unsigned) = 42 (default: 24)" % setting], + ) + @skipIf(archs=no_match(["x86_64", "i386", "i686"])) def test_disassembler_settings(self): """Test that user options for the disassembler take effect.""" From cd2b1a1979a353879dc2cc8790ef46eb8cab9c83 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 8 May 2026 08:50:10 -0700 Subject: [PATCH 062/538] [X86] Remove tests for non-existent intrinsics. NFC (#196237) There is no PSRAQ instruction until AVX512. The incorrect intrinsic names were just being interpreted as a call to an external function. 
--- llvm/test/CodeGen/X86/blend-of-shift.ll | 164 +--------------------- llvm/test/CodeGen/X86/shuffle-of-shift.ll | 100 +------------ 2 files changed, 8 insertions(+), 256 deletions(-) diff --git a/llvm/test/CodeGen/X86/blend-of-shift.ll b/llvm/test/CodeGen/X86/blend-of-shift.ll index cf382c7903bd7..b824e8ba78883 100644 --- a/llvm/test/CodeGen/X86/blend-of-shift.ll +++ b/llvm/test/CodeGen/X86/blend-of-shift.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X64,X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X64,X64-AVX2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X86,X86-SSE2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X86,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ;------------------------------ 32-bit shuffles -------------------------------; @@ -178,77 +178,6 @@ define <4 x i32> @shuffle_i32_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind { %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> ret <4 x i32> %i5 } -define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind { -; X64-SSE2-LABEL: shuffle_i32_of_ashr_i64: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: subq $40, %rsp -; X64-SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-SSE2-NEXT: movl $63, %edi -; X64-SSE2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; X64-SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-SSE2-NEXT: movl $63, %edi -; X64-SSE2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-SSE2-NEXT: shufps $27, (%rsp), %xmm0 # 16-byte Folded Reload -; X64-SSE2-NEXT: # xmm0 = xmm0[3,2],mem[1,0] -; X64-SSE2-NEXT: addq $40, %rsp -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: shuffle_i32_of_ashr_i64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: subq $40, %rsp -; X64-AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-AVX2-NEXT: movl $63, %edi -; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; X64-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-AVX2-NEXT: movl $63, %edi -; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vshufps $27, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; X64-AVX2-NEXT: # xmm0 = xmm0[3,2],mem[1,0] -; X64-AVX2-NEXT: addq $40, %rsp -; X64-AVX2-NEXT: retq -; -; X86-SSE2-LABEL: shuffle_i32_of_ashr_i64: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE2-NEXT: pushl $63 -; X86-SSE2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-SSE2-NEXT: addl $4, %esp -; X86-SSE2-NEXT: movups %xmm0, (%esp) # 16-byte Spill -; X86-SSE2-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-SSE2-NEXT: pushl $63 -; X86-SSE2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-SSE2-NEXT: addl $4, %esp -; X86-SSE2-NEXT: movups (%esp), %xmm1 # 16-byte Reload -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[1,0] -; X86-SSE2-NEXT: addl $32, %esp -; X86-SSE2-NEXT: retl -; -; X86-AVX2-LABEL: shuffle_i32_of_ashr_i64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: subl $32, %esp -; X86-AVX2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-AVX2-NEXT: pushl $63 -; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vmovups %xmm0, (%esp) # 16-byte Spill -; 
X86-AVX2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-AVX2-NEXT: pushl $63 -; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vshufps $27, (%esp), %xmm0, %xmm0 # 16-byte Folded Reload -; X86-AVX2-NEXT: # xmm0 = xmm0[3,2],mem[1,0] -; X86-AVX2-NEXT: addl $32, %esp -; X86-AVX2-NEXT: retl - %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) - %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63) - %i3 = bitcast <2 x i64> %i1 to <4 x i32> - %i4 = bitcast <2 x i64> %i2 to <4 x i32> - %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> - ret <4 x i32> %i5 -} ;------------------------------ 64-bit shuffles -------------------------------; @@ -430,88 +359,3 @@ define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind { %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> ret <2 x i64> %i5 } -define <2 x i64> @shuffle_i64_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind { -; X64-SSE2-LABEL: shuffle_i64_of_ashr_i64: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: subq $40, %rsp -; X64-SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-SSE2-NEXT: movl $63, %edi -; X64-SSE2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; X64-SSE2-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-SSE2-NEXT: movl $63, %edi -; X64-SSE2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-SSE2-NEXT: shufpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; X64-SSE2-NEXT: # xmm0 = xmm0[1],mem[0] -; X64-SSE2-NEXT: addq $40, %rsp -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: shuffle_i64_of_ashr_i64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: subq $40, %rsp -; X64-AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-AVX2-NEXT: movl $63, %edi -; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; X64-AVX2-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-AVX2-NEXT: movl $63, %edi -; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; X64-AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; X64-AVX2-NEXT: addq $40, %rsp -; X64-AVX2-NEXT: retq -; -; X86-SSE2-LABEL: shuffle_i64_of_ashr_i64: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE2-NEXT: pushl $63 -; X86-SSE2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-SSE2-NEXT: addl $4, %esp -; X86-SSE2-NEXT: movups %xmm0, (%esp) # 16-byte Spill -; X86-SSE2-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-SSE2-NEXT: pushl $63 -; X86-SSE2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-SSE2-NEXT: addl $4, %esp -; X86-SSE2-NEXT: movups (%esp), %xmm1 # 16-byte Reload -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] -; X86-SSE2-NEXT: addl $32, %esp -; X86-SSE2-NEXT: retl -; -; X86-AVX2-LABEL: shuffle_i64_of_ashr_i64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: subl $32, %esp -; X86-AVX2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-AVX2-NEXT: pushl $63 -; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vmovups %xmm0, (%esp) # 16-byte Spill -; X86-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-AVX2-NEXT: pushl $63 -; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vmovdqu (%esp), %xmm1 # 16-byte Reload -; X86-AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; X86-AVX2-NEXT: addl $32, %esp -; X86-AVX2-NEXT: retl - %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) - %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63) - %i3 = bitcast <2 x i64> %i1 to <2 x i64> - %i4 = bitcast <2 x i64> %i2 to <2 x i64> 
- %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> - ret <2 x i64> %i5 -} - -declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) -declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) -declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) -declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) -declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) -declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) -declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) -declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) -declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32) ; does not exist -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} -; X64: {{.*}} -; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/shuffle-of-shift.ll b/llvm/test/CodeGen/X86/shuffle-of-shift.ll index e2dc74d4e4df1..555c4cd41b242 100644 --- a/llvm/test/CodeGen/X86/shuffle-of-shift.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-shift.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X64,X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X64,X64-AVX2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X86,X86-SSE2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X86,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ;------------------------------ 32-bit shuffles -------------------------------; @@ -141,45 +141,6 
@@ define <4 x i32> @shuffle_i32_of_lshr_i64(<2 x i64> %x) nounwind { %i3 = shufflevector <4 x i32> %i2, <4 x i32> poison, <4 x i32> ret <4 x i32> %i3 } -define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x) nounwind { -; X64-SSE2-LABEL: shuffle_i32_of_ashr_i64: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $63, %edi -; X64-SSE2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X64-SSE2-NEXT: popq %rax -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: shuffle_i32_of_ashr_i64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: pushq %rax -; X64-AVX2-NEXT: movl $63, %edi -; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X64-AVX2-NEXT: popq %rax -; X64-AVX2-NEXT: retq -; -; X86-SSE2-LABEL: shuffle_i32_of_ashr_i64: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl $63 -; X86-SSE2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-SSE2-NEXT: addl $4, %esp -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X86-SSE2-NEXT: retl -; -; X86-AVX2-LABEL: shuffle_i32_of_ashr_i64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: pushl $63 -; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X86-AVX2-NEXT: retl - %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) - %i2 = bitcast <2 x i64> %i1 to <4 x i32> - %i3 = shufflevector <4 x i32> %i2, <4 x i32> poison, <4 x i32> - ret <4 x i32> %i3 -} ;------------------------------ 64-bit shuffles -------------------------------; @@ -321,56 +282,3 @@ define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x) nounwind { %i3 = shufflevector <2 x i64> %i2, <2 x i64> poison, <2 x i32> ret <2 x i64> %i3 } -define <2 x i64> @shuffle_i64_of_ashr_i64(<2 x i64> %x) nounwind { -; X64-SSE2-LABEL: shuffle_i64_of_ashr_i64: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $63, %edi -; X64-SSE2-NEXT: callq 
llvm.x86.sse2.psrai.q@PLT -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-SSE2-NEXT: popq %rax -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: shuffle_i64_of_ashr_i64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: pushq %rax -; X64-AVX2-NEXT: movl $63, %edi -; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: popq %rax -; X64-AVX2-NEXT: retq -; -; X86-SSE2-LABEL: shuffle_i64_of_ashr_i64: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl $63 -; X86-SSE2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-SSE2-NEXT: addl $4, %esp -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-SSE2-NEXT: retl -; -; X86-AVX2-LABEL: shuffle_i64_of_ashr_i64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: pushl $63 -; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT -; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: retl - %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) - %i2 = bitcast <2 x i64> %i1 to <2 x i64> - %i3 = shufflevector <2 x i64> %i2, <2 x i64> poison, <2 x i32> - ret <2 x i64> %i3 -} - -declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) -declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) -declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) -declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) -declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) -declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) -declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) -declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) -declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32) ; does not exist -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} -; X64: {{.*}} -; X86: {{.*}} From 435957ac1c05a9fe7c21cea947d04774d60484c9 Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Fri, 8 May 2026 11:51:14 -0400 Subject: [PATCH 063/538] [flang][docs] Removed HighLevelFIR transition plan section (#196227) Removed the "Transition Plan" section from flang/docs/HighLevelFIR.md, since the transition has completed a long time ago and the legacy lowering code is being removed now. --- flang/docs/HLFIRTransition.md | 55 +++++++++++++++++++++++++++++++++++ flang/docs/HighLevelFIR.md | 50 ------------------------------- flang/docs/index.md | 9 ++++++ 3 files changed, 64 insertions(+), 50 deletions(-) create mode 100644 flang/docs/HLFIRTransition.md diff --git a/flang/docs/HLFIRTransition.md b/flang/docs/HLFIRTransition.md new file mode 100644 index 0000000000000..b63af78fee1dd --- /dev/null +++ b/flang/docs/HLFIRTransition.md @@ -0,0 +1,55 @@ +# Transition of Lowering to HLFIR + +This section was extracted from [HighLevelFIR.md](High-Level Fortran IR (HLFIR)). +This information is no longer relevant to the current state of HLFIR +lowering, but could be useful as a historical reference. + +## Transition Plan + +The new higher-level steps proposed in this document will require significant +refactoring of lowering. Codegen should not be impacted since the current FIR +will remain untouched. + +A lot of the code in lowering generating Fortran features (like an intrinsic or +how to do assignments) is based on the fir::ExtendedValue concept. This +currently is a collection of mlir::Value that allows describing a Fortran object +(either a variable or an evaluated expression result). The variable and +expression concepts described above should allow to keep an interface very +similar to the fir::ExtendedValue, but having the fir::ExtendedValue wrap a +single value or mlir::Operation* from which all of the object entity +information can be inferred. 
+ +That way, all the helpers currently generating FIR from fir::ExtendedValue could +be kept and used with the new variable and expression concepts with as little +modification as possible. + +The proposed plan is to: +- 1. Introduce the new HLFIR operations. +- 2. Refactor fir::ExtendedValue so that it can work with the new variable and + expression concepts (requires part of 1.). +- 3. Introduce the new translation passes, using the fir::ExtendedValue helpers + (requires 1.). +- 3.b Introduce the new optimization passes (requires 1.). +- 4. Introduce the fir.declare and hlfir.finalize usage in lowering (requires 1. + and 2. and part of 3.). + +The following steps might have to be done in parallel of the current lowering, +to avoid disturbing the work on performance until the new lowering is complete +and on par. + +- 5. Introduce hlfir.designate and hlfir.associate usage in lowering. +- 6. Introduce lowering to hlfir.assign (with RHS that is not a hlfir.expr), + hlfir.ptr_assign. +- 7. Introduce lowering to hlfir.expr and related operations. +- 8. Introduce lowering to hlfir.forall. + +At that point, lowering using the high-level FIR should be in place, allowing +extensive testing. +- 9. Debugging correctness. +- 10. Debugging execution performance. + +The plan is to do these steps incrementally upstream, but for lowering this will +most likely be safer to do have the new expression lowering implemented in +parallel upstream, and to add an option to use the new lowering rather than to +directly modify the current expression lowering and have it step by step +equivalent functionally and performance wise. diff --git a/flang/docs/HighLevelFIR.md b/flang/docs/HighLevelFIR.md index 2399efcdeacd3..99a05c48b1e4e 100644 --- a/flang/docs/HighLevelFIR.md +++ b/flang/docs/HighLevelFIR.md @@ -960,56 +960,6 @@ LLVM. These high level optimization passes can be run any number of times in any order. 
-## Transition Plan - -The new higher-level steps proposed in this document will require significant -refactoring of lowering. Codegen should not be impacted since the current FIR -will remain untouched. - -A lot of the code in lowering generating Fortran features (like an intrinsic or -how to do assignments) is based on the fir::ExtendedValue concept. This -currently is a collection of mlir::Value that allows describing a Fortran object -(either a variable or an evaluated expression result). The variable and -expression concepts described above should allow to keep an interface very -similar to the fir::ExtendedValue, but having the fir::ExtendedValue wrap a -single value or mlir::Operation* from which all of the object entity -information can be inferred. - -That way, all the helpers currently generating FIR from fir::ExtendedValue could -be kept and used with the new variable and expression concepts with as little -modification as possible. - -The proposed plan is to: -- 1. Introduce the new HLFIR operations. -- 2. Refactor fir::ExtendedValue so that it can work with the new variable and - expression concepts (requires part of 1.). -- 3. Introduce the new translation passes, using the fir::ExtendedValue helpers - (requires 1.). -- 3.b Introduce the new optimization passes (requires 1.). -- 4. Introduce the fir.declare and hlfir.finalize usage in lowering (requires 1. - and 2. and part of 3.). - -The following steps might have to be done in parallel of the current lowering, -to avoid disturbing the work on performance until the new lowering is complete -and on par. - -- 5. Introduce hlfir.designate and hlfir.associate usage in lowering. -- 6. Introduce lowering to hlfir.assign (with RHS that is not a hlfir.expr), - hlfir.ptr_assign. -- 7. Introduce lowering to hlfir.expr and related operations. -- 8. Introduce lowering to hlfir.forall. - -At that point, lowering using the high-level FIR should be in place, allowing -extensive testing. -- 9. Debugging correctness. 
-- 10. Debugging execution performance. - -The plan is to do these steps incrementally upstream, but for lowering this will -most likely be safer to do have the new expression lowering implemented in -parallel upstream, and to add an option to use the new lowering rather than to -directly modify the current expression lowering and have it step by step -equivalent functionally and performance wise. - ## Examples ### Example 1: simple array assignment diff --git a/flang/docs/index.md b/flang/docs/index.md index 029f49a353d8f..a565b15528642 100644 --- a/flang/docs/index.md +++ b/flang/docs/index.md @@ -105,6 +105,15 @@ on how to get in touch with us and to learn more about the current status. fstack-arrays ``` +# Historical References + +```{eval-rst} +.. toctree:: + :titlesonly: + + HLFIRTransition +``` + # Indices and tables ```{eval-rst} From ee44ba80855a91b42c4a66f33c41abaa7e79b099 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 8 May 2026 18:02:52 +0200 Subject: [PATCH 064/538] Revert "[lldb] Do not refcount breakpoints in lldb-server" (#196561) Reverts llvm/llvm-project#195858 due to breakage on arm(32). 
--- .../lldb/Host/common/NativeProcessProtocol.h | 1 + lldb/source/Host/common/NativeProcessProtocol.cpp | 14 ++++++++++---- .../multi-breakpoint/TestMultiBreakpoint.py | 13 ++----------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/lldb/include/lldb/Host/common/NativeProcessProtocol.h b/lldb/include/lldb/Host/common/NativeProcessProtocol.h index 186a3d0f2f612..06b36c2cc9eb5 100644 --- a/lldb/include/lldb/Host/common/NativeProcessProtocol.h +++ b/lldb/include/lldb/Host/common/NativeProcessProtocol.h @@ -419,6 +419,7 @@ class NativeProcessProtocol { protected: struct SoftwareBreakpoint { + uint32_t ref_count; llvm::SmallVector saved_opcodes; llvm::ArrayRef breakpoint_opcodes; }; diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index dbffdc619ef42..196f54b93538d 100644 --- a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -344,8 +344,10 @@ Status NativeProcessProtocol::SetSoftwareBreakpoint(lldb::addr_t addr, LLDB_LOG(log, "addr = {0:x}, size_hint = {1}", addr, size_hint); auto it = m_software_breakpoints.find(addr); - if (it != m_software_breakpoints.end()) + if (it != m_software_breakpoints.end()) { + ++it->second.ref_count; return Status(); + } auto expected_bkpt = EnableSoftwareBreakpoint(addr, size_hint); if (!expected_bkpt) return Status::FromError(expected_bkpt.takeError()); @@ -360,10 +362,14 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { auto it = m_software_breakpoints.find(addr); if (it == m_software_breakpoints.end()) return Status::FromErrorString("Breakpoint not found."); + assert(it->second.ref_count > 0); + if (--it->second.ref_count > 0) + return Status(); // Remove the entry from m_software_breakpoints rightaway, so that we don't - // leave behind an entry in case one of the following conditions returns an - // error. The breakpoint is moved so that it can be accessed below. 
+ // leave behind an entry with ref_count == 0 in case one of the following + // conditions returns an error. The breakpoint is moved so that it can be + // accessed below. SoftwareBreakpoint bkpt = std::move(it->second); m_software_breakpoints.erase(it); @@ -497,7 +503,7 @@ NativeProcessProtocol::EnableSoftwareBreakpoint(lldb::addr_t addr, } LLDB_LOG(log, "addr = {0:x}: SUCCESS", addr); - return SoftwareBreakpoint{saved_opcode_bytes, *expected_trap}; + return SoftwareBreakpoint{1, saved_opcode_bytes, *expected_trap}; } llvm::Expected> diff --git a/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py b/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py index a4d6351e05d65..eb9e2952d5a49 100644 --- a/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py +++ b/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py @@ -17,8 +17,6 @@ # Runs on systems where we can always predict the software break size @skipIf(archs=no_match(["x86_64", "arm64", "aarch64"])) class TestMultiBreakpoint(TestBase): - NO_DEBUG_INFO_TESTCASE = True - def check_invalid_packet(self, packet_str): reply = lldbutil.send_packet_get_reply(self, packet_str) if reply.startswith("E"): @@ -62,9 +60,6 @@ def get_function_address(self, name): return f"{addr:x}" def test_multi_breakpoint(self): - # Debugserver uses refcounted breakpoints - breakpoints_are_refcounted = self.platformIsDarwin() - self.build() source_file = lldb.SBFileSpec("main.c") self.target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( @@ -156,9 +151,7 @@ def make_packet(array): # Clean up both. array = [f"z0,{addr_a},{bp_kind}", f"z0,{addr_a},{bp_kind}"] reply = self.send_packet(make_packet(array)) - self.assertMultiResponse( - reply, ["OK", "OK" if breakpoints_are_refcounted else "error"] - ) + self.assertMultiResponse(reply, ["OK", "OK"]) # --- Set the same breakpoint twice, but remove it thrice. 
array = [f"Z0,{addr_a},{bp_kind}", f"Z0,{addr_a},{bp_kind}"] @@ -170,9 +163,7 @@ def make_packet(array): f"z0,{addr_a},{bp_kind}", ] reply = self.send_packet(make_packet(array)) - self.assertMultiResponse( - reply, ["OK", "OK" if breakpoints_are_refcounted else "error", "error"] - ) + self.assertMultiResponse(reply, ["OK", "OK", "error"]) # --- Set and remove the same address in a single packet --- # The spec requires requests to be executed in order, so the set From 7c7f5be3560d4b4c9b6697d07d48436ff828833c Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> Date: Fri, 8 May 2026 18:30:13 +0200 Subject: [PATCH 065/538] [CodeGen][AMDGPU] Move boilerplate unit test code to base class (NFC) (#196547) This adds the `CodeGenTestBase` class to handle boilerplate code for codegen unit tests and makes use of it wherever possible, in particular in AMDGPU unit tests. Furthermore, this makes all AMDGPU unit tests rely on GoogleTest's API for "run once per test-suite" code, instead of re-implementing that behavior using a `std::once` flag. As a consequence all TEST(...) become TEST_F(...). 
--- llvm/unittests/CodeGen/CodeGenTestBase.h | 91 +++++++++++++++++++ .../CodeGen/MachineDomTreeUpdaterTest.cpp | 77 +--------------- llvm/unittests/CodeGen/RematerializerTest.cpp | 78 ++-------------- .../Target/AMDGPU/AMDGPUUnitTests.cpp | 40 ++++---- .../unittests/Target/AMDGPU/AMDGPUUnitTests.h | 21 ++++- llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 + llvm/unittests/Target/AMDGPU/CSETest.cpp | 2 +- .../Target/AMDGPU/DwarfRegMappings.cpp | 4 +- .../AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp | 2 +- llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp | 52 ++--------- llvm/unittests/Target/AMDGPU/PALMetadata.cpp | 21 +---- .../Target/AMDGPU/UniformityAnalysisTest.cpp | 20 +--- 12 files changed, 158 insertions(+), 251 deletions(-) create mode 100644 llvm/unittests/CodeGen/CodeGenTestBase.h diff --git a/llvm/unittests/CodeGen/CodeGenTestBase.h b/llvm/unittests/CodeGen/CodeGenTestBase.h new file mode 100644 index 0000000000000..0a3117779fc36 --- /dev/null +++ b/llvm/unittests/CodeGen/CodeGenTestBase.h @@ -0,0 +1,91 @@ +//===--- CodeGenTestBase.h - Utilities for codegen unit tests ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_CODEGEN_CODEGENTESTBASE_H +#define LLVM_UNITTESTS_CODEGEN_CODEGENTESTBASE_H + +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/CodeGen/MIRParser/MIRParser.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" + +namespace llvm { + +/// Boilerplate set-up for codegen tests. Sets up all analyses managers for a +/// given target and creates a module from an MIR string. +class CodeGenTestBase : public testing::Test { +public: + LLVMContext Context; + std::unique_ptr TM; + std::unique_ptr MMI; + std::unique_ptr MIR; + std::unique_ptr Mod; + + LoopAnalysisManager LAM; + MachineFunctionAnalysisManager MFAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + + MachineFunction &getMF(StringRef FuncName) { + return FAM.getResult(*Mod->getFunction(FuncName)) + .getMF(); + } + +protected: + /// Sets up the target machine and analyses managers. 
+ void setUpImpl(StringRef Triple, StringRef CPU, StringRef FS) { + llvm::Triple TT(Triple); + std::string Error; + const Target *T = TargetRegistry::lookupTarget("", TT, Error); + if (!T) + GTEST_SKIP(); + TargetOptions Options; + TM.reset(T->createTargetMachine(TT, CPU, FS, Options, std::nullopt)); + if (!TM) + GTEST_SKIP(); + MMI = std::make_unique(TM.get()); + + PassBuilder PB(TM.get()); + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.registerMachineFunctionAnalyses(MFAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); + MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); + } + + /// Parses \p MIRCode into a module. Returns whether parsing was successful. + bool parseMIR(StringRef MIRCode) { + SMDiagnostic Diagnostic; + std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); + MIR = createMIRParser(std::move(MBuffer), Context); + if (!MIR) + return false; + + Mod = MIR->parseIRModule(); + Mod->setDataLayout(TM->createDataLayout()); + if (MIR->parseMachineFunctions(*Mod, MAM)) { + Mod.reset(); + return false; + } + return true; + } +}; + +} // namespace llvm + +#endif // LLVM_UNITTESTS_CODEGEN_CODEGENTESTBASE_H diff --git a/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp b/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp index 85fdba0b3a9f7..c32565d875cdb 100644 --- a/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp +++ b/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp @@ -7,88 +7,23 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineDomTreeUpdater.h" -#include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/LoopAnalysisManager.h" +#include "CodeGenTestBase.h" #include "llvm/CodeGen/MIRParser/MIRParser.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachinePassManager.h" #include 
"llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" -#include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" #include "gtest/gtest.h" using namespace llvm; -class MachineDomTreeUpdaterTest : public testing::Test { +class MachineDomTreeUpdaterTest : public CodeGenTestBase { public: - LLVMContext Context; - std::unique_ptr TM; - std::unique_ptr M; - std::unique_ptr MMI; - std::unique_ptr MIR; - - LoopAnalysisManager LAM; - MachineFunctionAnalysisManager MFAM; - FunctionAnalysisManager FAM; - CGSCCAnalysisManager CGAM; - ModuleAnalysisManager MAM; - - ModulePassManager MPM; - FunctionPassManager FPM; - MachineFunctionPassManager MFPM; - static void SetUpTestCase() { InitializeAllTargets(); InitializeAllTargetMCs(); } - void SetUp() override { - Triple TargetTriple("x86_64-unknown-linux-gnu"); - std::string Error; - const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); - if (!T) - GTEST_SKIP(); - TargetOptions Options; - TM = std::unique_ptr( - T->createTargetMachine(TargetTriple, "", "", Options, std::nullopt)); - if (!TM) - GTEST_SKIP(); - MMI = std::make_unique(TM.get()); - - PassBuilder PB(TM.get()); - PB.registerModuleAnalyses(MAM); - PB.registerCGSCCAnalyses(CGAM); - PB.registerFunctionAnalyses(FAM); - PB.registerLoopAnalyses(LAM); - PB.registerMachineFunctionAnalyses(MFAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); - MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); - } - - bool parseMIR(StringRef MIRCode) { - SMDiagnostic Diagnostic; - std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); - MIR = createMIRParser(std::move(MBuffer), Context); - if (!MIR) - return false; - - M = MIR->parseIRModule(); - M->setDataLayout(TM->createDataLayout()); - - if (MIR->parseMachineFunctions(*M, 
MAM)) { - M.reset(); - return false; - } - - return true; - } + void SetUp() override { setUpImpl("x86_64-unknown-linux-gnu", "", ""); } }; TEST_F(MachineDomTreeUpdaterTest, EagerUpdateBasicOperations) { @@ -150,8 +85,7 @@ body: | ASSERT_TRUE(parseMIR(MIRString)); - auto &MF = - FAM.getResult(*M->getFunction("f0")).getMF(); + MachineFunction &MF = getMF("f0"); MachineDominatorTree DT(MF); MachinePostDominatorTree PDT(MF); @@ -240,8 +174,7 @@ body: | ASSERT_TRUE(parseMIR(MIRString)); - auto &MF = - FAM.getResult(*M->getFunction("f0")).getMF(); + MachineFunction &MF = getMF("f0"); MachineDominatorTree DT(MF); MachinePostDominatorTree PDT(MF); diff --git a/llvm/unittests/CodeGen/RematerializerTest.cpp b/llvm/unittests/CodeGen/RematerializerTest.cpp index 00316aa5f72be..f4724f652682d 100644 --- a/llvm/unittests/CodeGen/RematerializerTest.cpp +++ b/llvm/unittests/CodeGen/RematerializerTest.cpp @@ -7,96 +7,32 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Rematerializer.h" -#include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/CodeGen/MIRParser/MIRParser.h" -#include "llvm/CodeGen/MachineDomTreeUpdater.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachinePassManager.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Passes/PassBuilder.h" -#include "llvm/Support/SourceMgr.h" +#include "CodeGenTestBase.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" -#include "gtest/gtest.h" -#include using namespace llvm; using RegisterIdx = Rematerializer::RegisterIdx; -class 
RematerializerTest : public testing::Test { +class RematerializerTest : public CodeGenTestBase { public: - LLVMContext Context; - std::unique_ptr TM; - std::unique_ptr M; - std::unique_ptr MMI; - - std::unique_ptr MIR; std::unique_ptr> Regions; std::unique_ptr Remater; MachineFunction *MF; - LoopAnalysisManager LAM; - MachineFunctionAnalysisManager MFAM; - FunctionAnalysisManager FAM; - CGSCCAnalysisManager CGAM; - - ModulePassManager MPM; - FunctionPassManager FPM; - MachineFunctionPassManager MFPM; - ModuleAnalysisManager MAM; - static void SetUpTestCase() { InitializeAllTargets(); InitializeAllTargetMCs(); } - void SetUp() override { - Triple TargetTriple("amdgcn--"); - std::string Error; - const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); - if (!T) - GTEST_SKIP(); - TargetOptions Options; - TM = std::unique_ptr(T->createTargetMachine( - TargetTriple, "gfx950", "", Options, std::nullopt)); - if (!TM) - GTEST_SKIP(); - MMI = std::make_unique(TM.get()); - - PassBuilder PB(TM.get()); - PB.registerModuleAnalyses(MAM); - PB.registerCGSCCAnalyses(CGAM); - PB.registerFunctionAnalyses(FAM); - PB.registerLoopAnalyses(LAM); - PB.registerMachineFunctionAnalyses(MFAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); - MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); - } + void SetUp() override { setUpImpl("amdgcn--", "gfx950", ""); } bool parseMIRAndInit(StringRef MIRCode, StringRef FunName) { - SMDiagnostic Diagnostic; - std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); - MIR = createMIRParser(std::move(MBuffer), Context); - if (!MIR) + if (!parseMIR(MIRCode)) return false; - M = MIR->parseIRModule(); - M->setDataLayout(TM->createDataLayout()); - - if (MIR->parseMachineFunctions(*M, MAM)) { - M.reset(); - return false; - } - - MF = &FAM.getResult(*M->getFunction(FunName)) - .getMF(); + MF = &CodeGenTestBase::getMF(FunName); LiveIntervals &LIS = MFAM.getResult(*MF); // Create regions for the rematerializer. 
Both MBBs and terminator MIs diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp index 81982d0217f71..78cad1d9d65b6 100644 --- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUUnitTests.h" +#include "AMDGPUGenSubtargetInfo.inc" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "llvm/MC/TargetRegistry.h" @@ -14,24 +15,20 @@ #include "llvm/TargetParser/TargetParser.h" #include "gtest/gtest.h" -#include "AMDGPUGenSubtargetInfo.inc" - using namespace llvm; -std::once_flag flag; - -void InitializeAMDGPUTarget() { - std::call_once(flag, []() { - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - }); +static void initializeAMDGPUTarget() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); } -std::unique_ptr -llvm::createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { - InitializeAMDGPUTarget(); +void AMDGPUTestBase::SetUpTestSuite() { initializeAMDGPUTarget(); } + +void AMDGPUCodeGenTestBase::SetUpTestSuite() { initializeAMDGPUTarget(); } +std::unique_ptr +createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { Triple TT(TStr); std::string Error; const Target *T = TargetRegistry::lookupTarget(TT, Error); @@ -88,10 +85,11 @@ static bool checkMinMax(std::stringstream &OS, unsigned Occ, unsigned MinOcc, return MinValid && MaxValid && RangeValid; } -static const std::pair - EmptyFS = {"", ""}, - W32FS = {"+wavefrontsize32", "w32"}, - W64FS = {"+wavefrontsize64", "w64"}; +static const std::pair EmptyFS = {"", ""}, + W32FS = {"+wavefrontsize32", + "w32"}, + W64FS = {"+wavefrontsize64", + "w64"}; using TestFuncTy = function_ref; @@ -180,7 +178,7 @@ static void testDynamicVGPRLimits(StringRef 
CPUName, StringRef FS, testWithBlockSize(32); } -TEST(AMDGPU, TestVGPRLimitsPerOccupancy) { +TEST_F(AMDGPUTestBase, TestVGPRLimitsPerOccupancy) { auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize) { unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(DynamicVGPRBlockSize); @@ -240,7 +238,7 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS, EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS; } -TEST(AMDGPU, TestOccupancyAbsoluteLimits) { +TEST_F(AMDGPUTestBase, TestOccupancyAbsoluteLimits) { // CPUName, Features, DynamicVGPRBlockSize; Expected MinOcc, MaxOcc, MaxVGPRs testAbsoluteLimits("gfx1200", "+wavefrontsize32", 0, 1, 16, 256); testAbsoluteLimits("gfx1200", "+wavefrontsize32", 16, 1, 16, 128); @@ -251,7 +249,7 @@ static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) { return SubReg ? TRI.getSubRegIndexName(SubReg) : ""; } -TEST(AMDGPU, TestReverseComposeSubRegIndices) { +TEST_F(AMDGPUTestBase, TestReverseComposeSubRegIndices) { auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx900", ""); if (!TM) return; @@ -327,7 +325,7 @@ TEST(AMDGPU, TestReverseComposeSubRegIndices) { } } -TEST(AMDGPU, TestGetNamedOperandIdx) { +TEST_F(AMDGPUTestBase, TestGetNamedOperandIdx) { std::unique_ptr TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx900", ""); if (!TM) diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h index ccd2e32527042..c26ae98353ed7 100644 --- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h @@ -9,17 +9,30 @@ #ifndef LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H #define LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H +#include "AMDGPUGenSubtargetInfo.inc" +#include "AMDGPUTargetMachine.h" +#include "CodeGenTestBase.h" +#include "GCNSubtarget.h" #include #include namespace llvm { - class GCNTargetMachine; class StringRef; +} // end namespace llvm 
-std::unique_ptr -createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS); +std::unique_ptr +createAMDGPUTargetMachine(std::string TStr, llvm::StringRef CPU, + llvm::StringRef FS); -} // end namespace llvm +class AMDGPUTestBase : public testing::Test { +public: + static void SetUpTestSuite(); +}; + +class AMDGPUCodeGenTestBase : public llvm::CodeGenTestBase { +public: + static void SetUpTestSuite(); +}; #endif // LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt index 7760f694933c2..2425556ebe33f 100644 --- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt +++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt @@ -1,6 +1,7 @@ include_directories( ${PROJECT_SOURCE_DIR}/lib/Target/AMDGPU ${PROJECT_BINARY_DIR}/lib/Target/AMDGPU + ${PROJECT_SOURCE_DIR}/unittests/CodeGen ) set(LLVM_LINK_COMPONENTS diff --git a/llvm/unittests/Target/AMDGPU/CSETest.cpp b/llvm/unittests/Target/AMDGPU/CSETest.cpp index ff44ff184234c..382f23906e92d 100644 --- a/llvm/unittests/Target/AMDGPU/CSETest.cpp +++ b/llvm/unittests/Target/AMDGPU/CSETest.cpp @@ -15,7 +15,7 @@ using namespace llvm; -TEST(AMDGPU, TestCSEForRegisterClassOrBankAndLLT) { +TEST_F(AMDGPUTestBase, TestCSEForRegisterClassOrBankAndLLT) { auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx1100", ""); if (!TM) GTEST_SKIP(); diff --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp index 68a6b96997223..ec8ed7f15b35a 100644 --- a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp +++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp @@ -12,7 +12,7 @@ using namespace llvm; -TEST(AMDGPU, TestWave64DwarfRegMapping) { +TEST_F(AMDGPUTestBase, TestWave64DwarfRegMapping) { for (auto Triple : {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { auto TM = createAMDGPUTargetMachine(Triple, "gfx1010", "+wavefrontsize64"); @@ -52,7 +52,7 @@ TEST(AMDGPU, 
TestWave64DwarfRegMapping) { } } -TEST(AMDGPU, TestWave32DwarfRegMapping) { +TEST_F(AMDGPUTestBase, TestWave32DwarfRegMapping) { for (auto Triple : {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { auto TM = createAMDGPUTargetMachine(Triple, "gfx1010", "+wavefrontsize32"); diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp index 5ac4edae5f0df..8034bf6561d86 100644 --- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp +++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp @@ -13,7 +13,7 @@ using namespace llvm; -TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) { +TEST_F(AMDGPUTestBase, ExecMayBeModifiedBeforeAnyUse) { auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx906", ""); if (!TM) GTEST_SKIP(); diff --git a/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp b/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp index 95266dc853bfd..ec3d465891467 100644 --- a/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp +++ b/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp @@ -6,52 +6,22 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUTargetMachine.h" #include "AMDGPUUnitTests.h" #include "GCNSubtarget.h" #include "llvm/CodeGen/MIRParser/MIRParser.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/TargetParser/TargetParser.h" #include "gtest/gtest.h" #include "AMDGPUGenSubtargetInfo.inc" using namespace llvm; -// FIXME: Consolidate parseMIR and other common helpers (this one is copied from -// unittests/MIR/MachineMetadata.cpp). 
-std::unique_ptr parseMIR(LLVMContext &Context, const TargetMachine &TM, - StringRef MIRCode, const char *FnName, - MachineModuleInfo &MMI) { - SMDiagnostic Diagnostic; - std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); - auto MIR = createMIRParser(std::move(MBuffer), Context); - if (!MIR) - return nullptr; - - std::unique_ptr Mod = MIR->parseIRModule(); - if (!Mod) - return nullptr; - - Mod->setDataLayout(TM.createDataLayout()); - - if (MIR->parseMachineFunctions(*Mod, MMI)) { - return nullptr; - } - - return Mod; -} - -TEST(AMDGPULiveRegUnits, TestVGPRBlockLoadStore) { - auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx1200", ""); - ASSERT_TRUE(TM) << "No target machine"; - - GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), - std::string(TM->getTargetFeatureString()), *TM); +class LiveRegUnitsTest : public AMDGPUCodeGenTestBase { +public: + void SetUp() override { setUpImpl("amdgcn-amd-", "gfx1200", ""); } +}; +TEST_F(LiveRegUnitsTest, TestVGPRBlockLoadStore) { // Add a very simple MIR snippet that saves and restores a block of VGPRs. The // body of the function, represented by a S_NOP, clobbers one CSR (v42) and // one caller-saved register (v49), and reads one CSR (v61) and one @@ -76,17 +46,13 @@ body: | ... 
)MIR"; - LLVMContext Context; - MachineModuleInfo MMI(TM.get()); - auto M = parseMIR(Context, *TM, MIRString, "vgpr-block-insts", MMI); - - auto *MF = MMI.getMachineFunction(*M->getFunction("vgpr-block-insts")); - auto *MBB = MF->getBlockNumbered(0); - + ASSERT_TRUE(parseMIR(MIRString)); + MachineFunction &MF = getMF("vgpr-block-insts"); + auto *MBB = MF.getBlockNumbered(0); auto MIt = --MBB->instr_end(); LiveRegUnits LiveUnits; - LiveUnits.init(*ST.getRegisterInfo()); + LiveUnits.init(*MF.getSubtarget().getRegisterInfo()); LiveUnits.addLiveOuts(*MBB); LiveUnits.stepBackward(*MIt); diff --git a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp index e9392195fcc01..64537921ec4a6 100644 --- a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp +++ b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" +#include "AMDGPUUnitTests.h" #include "GCNSubtarget.h" #include "SIProgramInfo.h" #include "Utils/AMDGPUPALMetadata.h" @@ -17,13 +18,12 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "gtest/gtest.h" using namespace llvm; -class PALMetadata : public testing::Test { +class PALMetadata : public AMDGPUTestBase { protected: std::unique_ptr TM; std::unique_ptr Ctx; @@ -33,23 +33,8 @@ class PALMetadata : public testing::Test { std::unique_ptr M; AMDGPUPALMetadata MD; - static void SetUpTestSuite() { - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - } - PALMetadata() { - Triple TT("amdgcn--amdpal"); - StringRef CPU = "gfx1010"; - StringRef FS = ""; - - std::string Error; - const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error); - TargetOptions Options; - - TM.reset(static_cast(TheTarget->createTargetMachine( - TT, 
CPU, FS, Options, std::nullopt, std::nullopt))); + TM = createAMDGPUTargetMachine("amdgcn--amdpal", "gfx1010", ""); Ctx = std::make_unique(); M = std::make_unique("Module", *Ctx); diff --git a/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp b/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp index ae44d3ef6cbf2..da480e02e8c12 100644 --- a/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp +++ b/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/UniformityAnalysis.h" +#include "AMDGPUUnitTests.h" #include "llvm/ADT/GenericUniformityImpl.h" #include "llvm/Analysis/CycleAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -19,26 +20,12 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" -#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/TargetParser/Triple.h" #include "gtest/gtest.h" using namespace llvm; -static std::unique_ptr -createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { - Triple TT(TStr); - std::string Error; - const Target *T = TargetRegistry::lookupTarget(TT, Error); - if (!T) - return nullptr; - return std::unique_ptr( - T->createTargetMachine(TT, CPU, FS, {}, std::nullopt)); -} - static UniformityInfo computeUniformity(const TargetTransformInfo *TTI, Function *F) { DominatorTree DT(*F); @@ -50,10 +37,7 @@ static UniformityInfo computeUniformity(const TargetTransformInfo *TTI, return UI; } -TEST(UniformityAnalysis, NewValueIsConservativelyDivergent) { - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); +TEST_F(AMDGPUTestBase, NewValueIsConservativelyDivergent) { StringRef ModuleString = R"( target triple = "amdgcn-unknown-amdhsa" From 
4771770e15f9ec9d159631218f42499daecafbad Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 8 May 2026 17:41:58 +0100 Subject: [PATCH 066/538] [DAG] canCreateUndefOrPoison - ISD::FCEIL/FFLOOR/FTRUNC/FRINT/FNEARBYINT/FROUND/FROUNDEVEN can never create poison/undef (#196543) Also add missing fold support for ftrunc(fround(x)) -> fround(x) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 +++ llvm/test/CodeGen/X86/freeze-unary.ll | 63 +++++-------------- 3 files changed, 22 insertions(+), 49 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 15577af91ff03..5a467a5a5ba53 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20316,6 +20316,7 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) { case ISD::FRINT: case ISD::FTRUNC: case ISD::FNEARBYINT: + case ISD::FROUND: case ISD::FROUNDEVEN: case ISD::FFLOOR: case ISD::FCEIL: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 15fc74b6cc7c9..0b7d8b7946f99 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5900,6 +5900,13 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_PAIR: case ISD::SPLAT_VECTOR: case ISD::FABS: + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FROUNDEVEN: return false; case ISD::ABS: diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll index a2c8be0d91e4a..03b003f52ffd6 100644 --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -465,21 +465,16 @@ define <16 x i8> @freeze_parity_vec(<16 x i8> %a0) nounwind { define float @ftrunc_freeze_fnearbyint(float %a0) nounwind { ; 
X86-LABEL: ftrunc_freeze_fnearbyint: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll nearbyintf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_fnearbyint: ; X64: # %bb.0: ; X64-NEXT: roundss $12, %xmm0, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.nearbyint.f32(float %a0) %fr = freeze float %f0 @@ -490,15 +485,11 @@ define float @ftrunc_freeze_fnearbyint(float %a0) nounwind { define float @ftrunc_freeze_fround(float %a0) nounwind { ; X86-LABEL: ftrunc_freeze_fround: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll roundf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_fround: @@ -509,7 +500,6 @@ define float @ftrunc_freeze_fround(float %a0) nounwind { ; X64-NEXT: addss %xmm0, %xmm1 ; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: roundss $11, %xmm1, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.round.f32(float %a0) %fr = freeze float %f0 @@ -520,21 +510,16 @@ define float @ftrunc_freeze_fround(float %a0) nounwind { define float @ftrunc_freeze_froundeven(float %a0) nounwind { ; X86-LABEL: ftrunc_freeze_froundeven: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll roundevenf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_froundeven: ; X64: # %bb.0: ; X64-NEXT: roundss $8, %xmm0, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.roundeven.f32(float %a0) %fr = freeze float %f0 @@ -545,21 +530,16 @@ define float @ftrunc_freeze_froundeven(float %a0) nounwind { define float @ftrunc_freeze_frint(float %a0) nounwind { ; X86-LABEL: ftrunc_freeze_frint: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll rintf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_frint: ; X64: # %bb.0: ; X64-NEXT: roundss $4, %xmm0, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.rint.f32(float %a0) %fr = freeze float %f0 @@ -570,21 +550,16 @@ define float @ftrunc_freeze_frint(float %a0) nounwind { define float @ftrunc_freeze_ftrunc(float %a0) nounwind { ; X86-LABEL: ftrunc_freeze_ftrunc: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll truncf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_ftrunc: ; X64: # %bb.0: ; X64-NEXT: roundss $11, %xmm0, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.trunc.f32(float %a0) %fr = freeze float %f0 @@ -595,21 +570,16 @@ define float 
@ftrunc_freeze_ftrunc(float %a0) nounwind { define float @ftrunc_freeze_ffloor(float %a0) nounwind { ; X86-LABEL: ftrunc_freeze_ffloor: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll floorf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_ffloor: ; X64: # %bb.0: ; X64-NEXT: roundss $9, %xmm0, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.floor.f32(float %a0) %fr = freeze float %f0 @@ -620,21 +590,16 @@ define float @ftrunc_freeze_ffloor(float %a0) nounwind { define float @ftrunc_freeze_fceil(float %a0) nounwind { ; X86-LABEL: ftrunc_freeze_fceil: ; X86: # %bb.0: -; X86-NEXT: subl $8, %esp +; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll ceilf -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll truncf -; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %eax ; X86-NEXT: retl ; ; X64-LABEL: ftrunc_freeze_fceil: ; X64: # %bb.0: ; X64-NEXT: roundss $10, %xmm0, %xmm0 -; X64-NEXT: roundss $11, %xmm0, %xmm0 ; X64-NEXT: retq %f0 = call float @llvm.ceil.f32(float %a0) %fr = freeze float %f0 From 003846bf173a7ea78395f57f134d4a0500016fe1 Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Fri, 8 May 2026 16:44:05 +0000 Subject: [PATCH 067/538] [libc] Fix op_tests Memcmp guard to require SSE4.1 (#196572) The is_vector<__m128i> specialisation in op_x86.h is gated on __SSE4_1__, but op_tests.cpp included generic::Memcmp<__m128i> under the weaker __SSE2__ guard. 
On baseline x86-64 (where __SSE2__ is always defined but __SSE4_1__ may not be), this caused a static_assert failure in is_element_type_v. Changed the guard from __SSE2__ to __SSE4_1__ to match the specialisation requirement, consistent with how BcmpImplementations already guards its __m128i entry. Assisted-by: Automated tooling, human reviewed. --- libc/test/src/string/memory_utils/op_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp index 06089a89d9aac..9d26056a720f3 100644 --- a/libc/test/src/string/memory_utils/op_tests.cpp +++ b/libc/test/src/string/memory_utils/op_tests.cpp @@ -299,7 +299,7 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) { using MemcmpImplementations = testing::TypeList< #if defined(LIBC_TARGET_ARCH_IS_X86_64) && !defined(LIBC_TARGET_OS_IS_WINDOWS) -#ifdef __SSE2__ +#ifdef __SSE4_1__ generic::Memcmp<__m128i>, // #endif #ifdef __AVX2__ From 64f9bb5fae806e8e430144c67d9020f32829a4be Mon Sep 17 00:00:00 2001 From: KIM SO JUNG Date: Sat, 9 May 2026 01:48:00 +0900 Subject: [PATCH 068/538] [Object][Wasm] Fix off-by-one in data segment name index validation (#196338) The check `Index > DataSegments.size()` in `parseNameSection()` allows `Index == DataSegments.size()`, which is an out-of-bounds access. In an assertions-disabled ASan build, a malformed wasm object with one data segment and a data segment name entry using index 1 triggers a heap-buffer-overflow READ in `WasmObjectFile::parseNameSection()`. Fix by checking `Index >= DataSegments.size()` instead. Also add a regression test that verifies the malformed input is rejected with "invalid data segment name entry". 
--- llvm/lib/Object/WasmObjectFile.cpp | 4 +-- .../wasm/invalid-data-segment-name-index.test | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-readobj/wasm/invalid-data-segment-name-index.test diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 5f125ffb10198..98f60bd710c7e 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -600,7 +600,7 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { if (!SeenSegments.insert(Index).second) return make_error( "segment named more than once", object_error::parse_failed); - if (Index > DataSegments.size()) + if (Index >= DataSegments.size()) return make_error("invalid data segment name entry", object_error::parse_failed); nameType = wasm::NameType::DATA_SEGMENT; @@ -833,7 +833,7 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) { auto Offset = readVaruint64(Ctx); auto Size = readVaruint64(Ctx); if (!(Info.Flags & wasm::WASM_SYMBOL_ABSOLUTE)) { - if (static_cast(Index) >= DataSegments.size()) + if (Index >= DataSegments.size()) return make_error( "invalid data segment index: " + Twine(Index), object_error::parse_failed); diff --git a/llvm/test/tools/llvm-readobj/wasm/invalid-data-segment-name-index.test b/llvm/test/tools/llvm-readobj/wasm/invalid-data-segment-name-index.test new file mode 100644 index 0000000000000..b32c0feaef7ec --- /dev/null +++ b/llvm/test/tools/llvm-readobj/wasm/invalid-data-segment-name-index.test @@ -0,0 +1,25 @@ +# RUN: yaml2obj %s -o %t.wasm +# RUN: not llvm-readobj --symbols %t.wasm 2>&1 | FileCheck %s + +# CHECK: error: '{{.*}}': invalid data segment name entry + +--- !WASM +FileHeader: + Version: 0x1 +Sections: + - Type: MEMORY + Memories: + - Minimum: 0x1 + - Type: DATA + Segments: + - SectionOffset: 0 + InitFlags: 0 + Offset: + Opcode: I32_CONST + Value: 0 + Content: '' + - Type: CUSTOM + Name: name + DataSegmentNames: + - Index: 1 
+ Name: invalid_data_segment_name From 6c083a67521c49b21a92b742cde4c648f535ab0c Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 8 May 2026 18:48:32 +0200 Subject: [PATCH 069/538] [AMDGPU] Add VOP3P encoding to gfx13 (#196252) Co-authored-by: Ivan Kosarev --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 102 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s | 1 + llvm/test/MC/AMDGPU/gfx13_asm_vop3p.s | 1608 +++++++++++++++++ llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp16.s | 18 + llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp8.s | 34 + .../test/MC/AMDGPU/gfx13_asm_vop3p_features.s | 125 ++ 6 files changed, 1846 insertions(+), 42 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3p.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp8.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3p_features.s diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index d95c9eb788700..9e6b94752a445 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -2569,20 +2569,27 @@ defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x26>; defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x27>; //===----------------------------------------------------------------------===// -// GFX12 +// GFX12, GFX13. 
//===----------------------------------------------------------------------===// multiclass VOP3P_Real_gfx12 op> : VOP3P_Real_Base; multiclass VOP3P_Real_gfx1250 op> : VOP3P_Real_Base; -multiclass VOP3P_Real_with_name_gfx12 op, +multiclass VOP3P_Real_gfx1250_gfx13 op> : + VOP3P_Real_gfx1250, VOP3P_Real_Base; + +multiclass VOP3P_Real_with_name_gfx12_gfx13 op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> : - VOP3P_Real_with_name; + VOP3P_Real_with_name, + VOP3P_Real_with_name; + +multiclass VOP3P_Realtriple_gfx1250_gfx13 op> : + VOP3P_Realtriple, VOP3P_Realtriple; -defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; -defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; +defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12_gfx13<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; +defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12_gfx13<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>; defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>; @@ -2596,19 +2603,19 @@ defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>; defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>; defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>; defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>; -defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>; -defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>; -defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>; -defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>; -defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>; +defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250_gfx13<0x11>; +defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250_gfx13<0x23>; +defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250_gfx13<0x2a>; +defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250_gfx13<0x2b>; +defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250_gfx13<0x2c>; defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>; defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; defm V_PK_MAX3_NUM_F16 : 
VOP3P_Real_gfx1250<0x39>; -defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple; -defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple; -defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple; +defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple_gfx1250_gfx13<0x3d>; +defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple_gfx1250_gfx13<0x3e>; +defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple_gfx1250_gfx13<0x3f>; let PostEncoderMethod = "postEncodeVOP3" in { defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; @@ -2633,15 +2640,18 @@ multiclass VOP3P_Real_with_name_gfx1170 op, defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx1170<0x11, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx1170<0x12, "V_PK_MAX_F16", "v_pk_max_num_f16">; -defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx11_gfx12<0x1d>; -defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx11_gfx12<0x1e>; - //===----------------------------------------------------------------------===// -// GFX11 +// GFX11, GFX12, GFX13. //===----------------------------------------------------------------------===// +multiclass VOP3P_Real_gfx11_gfx12_gfx13 op> : + VOP3P_Real_gfx11_gfx12, VOP3P_Real_Base; + +defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx11_gfx12_gfx13<0x1d>; +defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx11_gfx12_gfx13<0x1e>; + +defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11_gfx12_gfx13<0x16>; +defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11_gfx12_gfx13<0x18>; -defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11_gfx12<0x16>; -defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11_gfx12<0x18>; defm V_DOT2_F32_BF16 : VOP3P_Realtriple_gfx11_gfx12<0x1a>; let AssemblerPredicate = isGFX11Plus in { @@ -3007,7 +3017,7 @@ defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>; defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>; //===----------------------------------------------------------------------===// -// GFX10. +// GFX10, GFX11, GFX12, GFX13. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in { @@ -3020,6 +3030,10 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in { multiclass VOP3P_Real_gfx10_gfx11 op> : VOP3P_Real_gfx10, VOP3P_Real_Base; +multiclass VOP3P_Real_gfx10_gfx11_gfx12_gfx13 op> : + VOP3P_Real_gfx10_gfx11, VOP3P_Real_Base, + VOP3P_Real_Base; + multiclass VOP3P_Real_gfx10_gfx11_not_gfx1170 op> : VOP3P_Real_gfx10, VOP3P_Real_Base; @@ -3030,35 +3044,39 @@ multiclass VOP3P_Real_gfx10_gfx11_gfx12_Triple op> : VOP3P_Real_gfx10, VOP3P_Realtriple, VOP3P_Realtriple; -defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x00>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x01>; -defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x02>; -defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x03>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x04>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x05>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x06>; -defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x07>; -defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x08>; -defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x09>; -defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0a>; -defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0b>; -defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0c>; -defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0d>; -defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0e>; -defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0f>; -defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x10>; +multiclass VOP3P_Real_gfx10_gfx11_gfx12_gfx13_Triple op> : + VOP3P_Real_gfx10_gfx11_gfx12_Triple, + VOP3P_Realtriple; + +defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x02>; +defm V_PK_SUB_I16 : 
VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x09>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x10>; defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11_not_gfx1170<0x11>; defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11_not_gfx1170<0x12>; -defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x20>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x21>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x22>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13_Triple<0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13_Triple<0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13_Triple<0x22>; defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>; defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x13>; -defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11_gfx12<0x17>; -defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11_gfx12<0x19>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x17>; +defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11_gfx12_gfx13<0x19>; defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>; defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>; diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s index f7ee690c79211..566978544c596 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --sort --version 6 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_dot4_i32_i8 v5, v1, v2, s3 // GFX12: v_dot4_i32_iu8 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x16,0xcc,0x01,0x05,0x0e,0x18] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p.s new file mode 100644 index 0000000000000..3055a0fe43206 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p.s @@ -0,0 +1,1608 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 %s | FileCheck --check-prefix=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 %s | %extract-encodings | llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 -disassemble | FileCheck --check-prefix=GFX13 %s + +v_dot4_i32_iu8 v255, 0xaf123456, vcc_hi, null neg_lo:[0,0,0] +// GFX13: v_dot4_i32_iu8 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x40,0x16,0xcc,0xff,0xd6,0xf0,0x19,0x56,0x34,0x12,0xaf] + +v_dot4_i32_iu8 v5, -1, exec_hi, src_scc +// GFX13: v_dot4_i32_iu8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x40,0x16,0xcc,0xc1,0xfe,0xf4,0x1b] + +v_dot4_i32_iu8 v5, 0.5, m0, 0.5 neg_lo:[1,0,0] +// GFX13: v_dot4_i32_iu8 v5, 0.5, m0, 0.5 neg_lo:[1,0,0] ; encoding: [0x05,0x40,0x16,0xcc,0xf0,0xfa,0xc0,0x3b] + +v_dot4_i32_iu8 v5, exec_hi, null, vcc_lo +// GFX13: v_dot4_i32_iu8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x40,0x16,0xcc,0x7f,0xf8,0xa8,0x19] + +v_dot4_i32_iu8 v5, exec_lo, 
-1, vcc_hi +// GFX13: v_dot4_i32_iu8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x40,0x16,0xcc,0x7e,0x82,0xad,0x19] + +v_dot4_i32_iu8 v5, m0, 0.5, m0 +// GFX13: v_dot4_i32_iu8 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x16,0xcc,0x7d,0xe0,0xf5,0x19] + +v_dot4_i32_iu8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_dot4_i32_iu8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x40,0x16,0xcc,0x7c,0xfc,0xfc,0x1b,0x56,0x34,0x12,0xaf] + +v_dot4_i32_iu8 v5, s1, v255, exec_hi +// GFX13: v_dot4_i32_iu8 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x16,0xcc,0x01,0xfe,0xff,0x19] + +v_dot4_i32_iu8 v5, s105, s105, exec_lo +// GFX13: v_dot4_i32_iu8 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x16,0xcc,0x69,0xd2,0xf8,0x19] + +v_dot4_i32_iu8 v5, src_scc, vcc_lo, -1 neg_lo:[0,1,0] +// GFX13: v_dot4_i32_iu8 v5, src_scc, vcc_lo, -1 neg_lo:[0,1,0] ; encoding: [0x05,0x40,0x16,0xcc,0xfd,0xd4,0x04,0x5b] + +v_dot4_i32_iu8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_dot4_i32_iu8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x16,0xcc,0x7b,0xfa,0xed,0x19] + +v_dot4_i32_iu8 v5, v1, v2, s3 +// GFX13: v_dot4_i32_iu8 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x16,0xcc,0x01,0x05,0x0e,0x18] + +v_dot4_i32_iu8 v5, v255, s2, s105 +// GFX13: v_dot4_i32_iu8 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x16,0xcc,0xff,0x05,0xa4,0x19] + +v_dot4_i32_iu8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_dot4_i32_iu8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x40,0x16,0xcc,0x6b,0xfe,0xfd,0x1f,0x56,0x34,0x12,0xaf] + +v_dot4_i32_iu8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_dot4_i32_iu8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x16,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_dot4_u32_u8 v255, 0xaf123456, vcc_hi, null +// GFX13: v_dot4_u32_u8 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x40,0x17,0xcc,0xff,0xd6,0xf0,0x19,0x56,0x34,0x12,0xaf] + +v_dot4_u32_u8 v5, -1, exec_hi, src_scc +// GFX13: v_dot4_u32_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x40,0x17,0xcc,0xc1,0xfe,0xf4,0x1b] + +v_dot4_u32_u8 v5, 0.5, m0, 0.5 +// GFX13: 
v_dot4_u32_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x40,0x17,0xcc,0xf0,0xfa,0xc0,0x1b] + +v_dot4_u32_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_dot4_u32_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x40,0x17,0xcc,0x7f,0xf8,0xa8,0x19] + +v_dot4_u32_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_dot4_u32_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x40,0x17,0xcc,0x7e,0x82,0xad,0x19] + +v_dot4_u32_u8 v5, m0, 0.5, m0 +// GFX13: v_dot4_u32_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x17,0xcc,0x7d,0xe0,0xf5,0x19] + +v_dot4_u32_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_dot4_u32_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x40,0x17,0xcc,0x7c,0xfc,0xfc,0x1b,0x56,0x34,0x12,0xaf] + +v_dot4_u32_u8 v5, s1, v255, exec_hi +// GFX13: v_dot4_u32_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x17,0xcc,0x01,0xfe,0xff,0x19] + +v_dot4_u32_u8 v5, s105, s105, exec_lo +// GFX13: v_dot4_u32_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x17,0xcc,0x69,0xd2,0xf8,0x19] + +v_dot4_u32_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_dot4_u32_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x40,0x17,0xcc,0xfd,0xd4,0x04,0x1b] + +v_dot4_u32_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_dot4_u32_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x17,0xcc,0x7b,0xfa,0xed,0x19] + +v_dot4_u32_u8 v5, v1, v2, s3 +// GFX13: v_dot4_u32_u8 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x17,0xcc,0x01,0x05,0x0e,0x18] + +v_dot4_u32_u8 v5, v255, s2, s105 +// GFX13: v_dot4_u32_u8 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x17,0xcc,0xff,0x05,0xa4,0x19] + +v_dot4_u32_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_dot4_u32_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x40,0x17,0xcc,0x6b,0xfe,0xfd,0x1f,0x56,0x34,0x12,0xaf] + +v_dot4_u32_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_dot4_u32_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x17,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_dot8_i32_iu4 v255, 0xaf123456, vcc_hi, null neg_lo:[0,0,0] clamp +// GFX13: v_dot8_i32_iu4 v255, 0xaf123456, vcc_hi, null clamp ; encoding: 
[0xff,0xc0,0x18,0xcc,0xff,0xd6,0xf0,0x19,0x56,0x34,0x12,0xaf] + +v_dot8_i32_iu4 v5, -1, exec_hi, src_scc +// GFX13: v_dot8_i32_iu4 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x40,0x18,0xcc,0xc1,0xfe,0xf4,0x1b] + +v_dot8_i32_iu4 v5, 0.5, m0, 0.5 neg_lo:[1,0,0] +// GFX13: v_dot8_i32_iu4 v5, 0.5, m0, 0.5 neg_lo:[1,0,0] ; encoding: [0x05,0x40,0x18,0xcc,0xf0,0xfa,0xc0,0x3b] + +v_dot8_i32_iu4 v5, exec_hi, null, vcc_lo +// GFX13: v_dot8_i32_iu4 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x40,0x18,0xcc,0x7f,0xf8,0xa8,0x19] + +v_dot8_i32_iu4 v5, exec_lo, -1, vcc_hi +// GFX13: v_dot8_i32_iu4 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x40,0x18,0xcc,0x7e,0x82,0xad,0x19] + +v_dot8_i32_iu4 v5, m0, 0.5, m0 +// GFX13: v_dot8_i32_iu4 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x18,0xcc,0x7d,0xe0,0xf5,0x19] + +v_dot8_i32_iu4 v5, null, exec_lo, 0xaf123456 +// GFX13: v_dot8_i32_iu4 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x40,0x18,0xcc,0x7c,0xfc,0xfc,0x1b,0x56,0x34,0x12,0xaf] + +v_dot8_i32_iu4 v5, s1, v255, exec_hi +// GFX13: v_dot8_i32_iu4 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x18,0xcc,0x01,0xfe,0xff,0x19] + +v_dot8_i32_iu4 v5, s105, s105, exec_lo +// GFX13: v_dot8_i32_iu4 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x18,0xcc,0x69,0xd2,0xf8,0x19] + +v_dot8_i32_iu4 v5, src_scc, vcc_lo, -1 neg_lo:[0,1,0] +// GFX13: v_dot8_i32_iu4 v5, src_scc, vcc_lo, -1 neg_lo:[0,1,0] ; encoding: [0x05,0x40,0x18,0xcc,0xfd,0xd4,0x04,0x5b] + +v_dot8_i32_iu4 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_dot8_i32_iu4 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x18,0xcc,0x7b,0xfa,0xed,0x19] + +v_dot8_i32_iu4 v5, v1, v2, s3 +// GFX13: v_dot8_i32_iu4 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x18,0xcc,0x01,0x05,0x0e,0x18] + +v_dot8_i32_iu4 v5, v255, s2, s105 +// GFX13: v_dot8_i32_iu4 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x18,0xcc,0xff,0x05,0xa4,0x19] + +v_dot8_i32_iu4 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_dot8_i32_iu4 v5, vcc_hi, 0xaf123456, v255 ; encoding: 
[0x05,0x40,0x18,0xcc,0x6b,0xfe,0xfd,0x1f,0x56,0x34,0x12,0xaf] + +v_dot8_i32_iu4 v5, vcc_lo, ttmp15, v3 +// GFX13: v_dot8_i32_iu4 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x18,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_dot8_u32_u4 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_dot8_u32_u4 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0xc0,0x19,0xcc,0xff,0xd6,0xf0,0x19,0x56,0x34,0x12,0xaf] + +v_dot8_u32_u4 v5, -1, exec_hi, src_scc +// GFX13: v_dot8_u32_u4 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x40,0x19,0xcc,0xc1,0xfe,0xf4,0x1b] + +v_dot8_u32_u4 v5, 0.5, m0, 0.5 +// GFX13: v_dot8_u32_u4 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x40,0x19,0xcc,0xf0,0xfa,0xc0,0x1b] + +v_dot8_u32_u4 v5, exec_hi, null, vcc_lo +// GFX13: v_dot8_u32_u4 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x40,0x19,0xcc,0x7f,0xf8,0xa8,0x19] + +v_dot8_u32_u4 v5, exec_lo, -1, vcc_hi +// GFX13: v_dot8_u32_u4 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x40,0x19,0xcc,0x7e,0x82,0xad,0x19] + +v_dot8_u32_u4 v5, m0, 0.5, m0 +// GFX13: v_dot8_u32_u4 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x19,0xcc,0x7d,0xe0,0xf5,0x19] + +v_dot8_u32_u4 v5, null, exec_lo, 0xaf123456 +// GFX13: v_dot8_u32_u4 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x40,0x19,0xcc,0x7c,0xfc,0xfc,0x1b,0x56,0x34,0x12,0xaf] + +v_dot8_u32_u4 v5, s1, v255, exec_hi +// GFX13: v_dot8_u32_u4 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x19,0xcc,0x01,0xfe,0xff,0x19] + +v_dot8_u32_u4 v5, s105, s105, exec_lo +// GFX13: v_dot8_u32_u4 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x19,0xcc,0x69,0xd2,0xf8,0x19] + +v_dot8_u32_u4 v5, src_scc, vcc_lo, -1 +// GFX13: v_dot8_u32_u4 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x40,0x19,0xcc,0xfd,0xd4,0x04,0x1b] + +v_dot8_u32_u4 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_dot8_u32_u4 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x19,0xcc,0x7b,0xfa,0xed,0x19] + +v_dot8_u32_u4 v5, v1, v2, s3 +// GFX13: v_dot8_u32_u4 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x19,0xcc,0x01,0x05,0x0e,0x18] + +v_dot8_u32_u4 v5, 
v255, s2, s105 +// GFX13: v_dot8_u32_u4 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x19,0xcc,0xff,0x05,0xa4,0x19] + +v_dot8_u32_u4 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_dot8_u32_u4 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x40,0x19,0xcc,0x6b,0xfe,0xfd,0x1f,0x56,0x34,0x12,0xaf] + +v_dot8_u32_u4 v5, vcc_lo, ttmp15, v3 +// GFX13: v_dot8_u32_u4 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x19,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_fma_mix_f32 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX13: v_fma_mix_f32 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x20,0xcc,0xfd,0xd6,0xf0,0x61] + +v_fma_mix_f32 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX13: v_fma_mix_f32 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x20,0xcc,0xc1,0xfa,0x04,0x53] + +v_fma_mix_f32 v5, -m0, -1, |vcc_lo| +// GFX13: v_fma_mix_f32 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x20,0xcc,0x7d,0x82,0xa9,0x21] + +v_fma_mix_f32 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX13: v_fma_mix_f32 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x20,0xcc,0x7f,0xfc,0xf8,0xf9] + +v_fma_mix_f32 v5, -|exec_lo|, null, -|src_scc| +// GFX13: v_fma_mix_f32 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x20,0xcc,0x7e,0xf8,0xf4,0xa3] + +v_fma_mix_f32 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX13: v_fma_mix_f32 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x20,0xcc,0xf0,0xd4,0xfc,0xc9] + +v_fma_mix_f32 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX13: v_fma_mix_f32 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x20,0xcc,0x7c,0xfe,0xc0,0x03] + +v_fma_mix_f32 v5, s1, s2, v3 +// GFX13: v_fma_mix_f32 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x20,0xcc,0x01,0x04,0x0c,0x04] + +v_fma_mix_f32 v5, s105, s105, m0 +// GFX13: v_fma_mix_f32 
v5, s105, s105, m0 ; encoding: [0x05,0x00,0x20,0xcc,0x69,0xd2,0xf4,0x01] + +v_fma_mix_f32 v5, v1, v2, s3 +// GFX13: v_fma_mix_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x20,0xcc,0x01,0x05,0x0e,0x00] + +v_fma_mix_f32 v5, v255, v255, s105 +// GFX13: v_fma_mix_f32 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x20,0xcc,0xff,0xff,0xa7,0x01] + +v_fma_mix_f32 v5, vcc_hi, src_scc, v255 +// GFX13: v_fma_mix_f32 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x20,0xcc,0x6b,0xfa,0xfd,0x07] + +v_fma_mix_f32 v5, vcc_lo, ttmp15, ttmp15 +// GFX13: v_fma_mix_f32 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x20,0xcc,0x6a,0xf6,0xec,0x01] + +v_fma_mix_f32 v5, |ttmp15|, 0.5, -vcc_hi +// GFX13: v_fma_mix_f32 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x20,0xcc,0x7b,0xe0,0xad,0x81] + +v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX13: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61] + +v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX13: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53] + +v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| +// GFX13: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21] + +v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX13: v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9] + +v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| +// GFX13: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3] + +v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX13: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: 
[0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9] + +v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX13: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03] + +v_fma_mix_f32_bf16 v5, s1, s2, v3 +// GFX13: v_fma_mix_f32_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04] + +v_fma_mix_f32_bf16 v5, s105, s105, m0 +// GFX13: v_fma_mix_f32_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01] + +v_fma_mix_f32_bf16 v5, v1, v2, s3 +// GFX13: v_fma_mix_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00] + +v_fma_mix_f32_bf16 v5, v255, v255, s105 +// GFX13: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01] + +v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 +// GFX13: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07] + +v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 +// GFX13: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01] + +v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX13: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81] + +v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX13: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61] + +v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX13: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53] + +v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| +// GFX13: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21] + +v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX13: v_fma_mixhi_bf16 v5, 
-|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9] + +v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| +// GFX13: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3] + +v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX13: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9] + +v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX13: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03] + +v_fma_mixhi_bf16 v5, s1, s2, v3 +// GFX13: v_fma_mixhi_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04] + +v_fma_mixhi_bf16 v5, s105, s105, m0 +// GFX13: v_fma_mixhi_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01] + +v_fma_mixhi_bf16 v5, v1, v2, s3 +// GFX13: v_fma_mixhi_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00] + +v_fma_mixhi_bf16 v5, v255, v255, s105 +// GFX13: v_fma_mixhi_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01] + +v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 +// GFX13: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07] + +v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 +// GFX13: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01] + +v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX13: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81] + +v_fma_mixhi_f16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX13: v_fma_mixhi_f16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x22,0xcc,0xfd,0xd6,0xf0,0x61] + +v_fma_mixhi_f16 v5, -1, -|m0|, -1 
op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX13: v_fma_mixhi_f16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x22,0xcc,0xc1,0xfa,0x04,0x53] + +v_fma_mixhi_f16 v5, -m0, -1, |vcc_lo| +// GFX13: v_fma_mixhi_f16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x22,0xcc,0x7d,0x82,0xa9,0x21] + +v_fma_mixhi_f16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX13: v_fma_mixhi_f16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x22,0xcc,0x7f,0xfc,0xf8,0xf9] + +v_fma_mixhi_f16 v5, -|exec_lo|, null, -|src_scc| +// GFX13: v_fma_mixhi_f16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x22,0xcc,0x7e,0xf8,0xf4,0xa3] + +v_fma_mixhi_f16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX13: v_fma_mixhi_f16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x22,0xcc,0xf0,0xd4,0xfc,0xc9] + +v_fma_mixhi_f16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX13: v_fma_mixhi_f16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x22,0xcc,0x7c,0xfe,0xc0,0x03] + +v_fma_mixhi_f16 v5, s1, s2, v3 +// GFX13: v_fma_mixhi_f16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x22,0xcc,0x01,0x04,0x0c,0x04] + +v_fma_mixhi_f16 v5, s105, s105, m0 +// GFX13: v_fma_mixhi_f16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x22,0xcc,0x69,0xd2,0xf4,0x01] + +v_fma_mixhi_f16 v5, v1, v2, s3 +// GFX13: v_fma_mixhi_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x22,0xcc,0x01,0x05,0x0e,0x00] + +v_fma_mixhi_f16 v5, v255, v255, s105 +// GFX13: v_fma_mixhi_f16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x22,0xcc,0xff,0xff,0xa7,0x01] + +v_fma_mixhi_f16 v5, vcc_hi, src_scc, v255 +// GFX13: v_fma_mixhi_f16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x22,0xcc,0x6b,0xfa,0xfd,0x07] + +v_fma_mixhi_f16 v5, vcc_lo, ttmp15, ttmp15 +// GFX13: v_fma_mixhi_f16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x22,0xcc,0x6a,0xf6,0xec,0x01] + 
+v_fma_mixhi_f16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX13: v_fma_mixhi_f16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x22,0xcc,0x7b,0xe0,0xad,0x81] + +v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX13: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61] + +v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX13: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53] + +v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| +// GFX13: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21] + +v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX13: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9] + +v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| +// GFX13: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3] + +v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX13: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9] + +v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX13: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03] + +v_fma_mixlo_bf16 v5, s1, s2, v3 +// GFX13: v_fma_mixlo_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04] + +v_fma_mixlo_bf16 v5, s105, s105, m0 +// GFX13: v_fma_mixlo_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01] + +v_fma_mixlo_bf16 v5, v1, v2, s3 +// GFX13: v_fma_mixlo_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00] + +v_fma_mixlo_bf16 
v5, v255, v255, s105 +// GFX13: v_fma_mixlo_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01] + +v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 +// GFX13: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07] + +v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 +// GFX13: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01] + +v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX13: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81] + +v_fma_mixlo_f16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX13: v_fma_mixlo_f16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x21,0xcc,0xfd,0xd6,0xf0,0x61] + +v_fma_mixlo_f16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX13: v_fma_mixlo_f16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x21,0xcc,0xc1,0xfa,0x04,0x53] + +v_fma_mixlo_f16 v5, -m0, -1, |vcc_lo| +// GFX13: v_fma_mixlo_f16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x21,0xcc,0x7d,0x82,0xa9,0x21] + +v_fma_mixlo_f16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX13: v_fma_mixlo_f16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x21,0xcc,0x7f,0xfc,0xf8,0xf9] + +v_fma_mixlo_f16 v5, -|exec_lo|, null, -|src_scc| +// GFX13: v_fma_mixlo_f16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x21,0xcc,0x7e,0xf8,0xf4,0xa3] + +v_fma_mixlo_f16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX13: v_fma_mixlo_f16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x21,0xcc,0xf0,0xd4,0xfc,0xc9] + +v_fma_mixlo_f16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX13: v_fma_mixlo_f16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x21,0xcc,0x7c,0xfe,0xc0,0x03] 
+ +v_fma_mixlo_f16 v5, s1, s2, v3 +// GFX13: v_fma_mixlo_f16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x21,0xcc,0x01,0x04,0x0c,0x04] + +v_fma_mixlo_f16 v5, s105, s105, m0 +// GFX13: v_fma_mixlo_f16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x21,0xcc,0x69,0xd2,0xf4,0x01] + +v_fma_mixlo_f16 v5, v1, v2, s3 +// GFX13: v_fma_mixlo_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x21,0xcc,0x01,0x05,0x0e,0x00] + +v_fma_mixlo_f16 v5, v255, v255, s105 +// GFX13: v_fma_mixlo_f16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x21,0xcc,0xff,0xff,0xa7,0x01] + +v_fma_mixlo_f16 v5, vcc_hi, src_scc, v255 +// GFX13: v_fma_mixlo_f16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x21,0xcc,0x6b,0xfa,0xfd,0x07] + +v_fma_mixlo_f16 v5, vcc_lo, ttmp15, ttmp15 +// GFX13: v_fma_mixlo_f16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x21,0xcc,0x6a,0xf6,0xec,0x01] + +v_fma_mixlo_f16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX13: v_fma_mixlo_f16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x21,0xcc,0x7b,0xe0,0xad,0x81] + +v_pk_add_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_add_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_add_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_add_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x23,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_add_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_add_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x23,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_add_bf16 v5, exec_hi, null +// GFX13: v_pk_add_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x23,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_add_bf16 v5, exec_lo, -1 +// GFX13: v_pk_add_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x23,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_add_bf16 v5, 
m0, 0.5 +// GFX13: v_pk_add_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x23,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_add_bf16 v5, null, exec_lo +// GFX13: v_pk_add_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x23,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_add_bf16 v5, s1, s2 +// GFX13: v_pk_add_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x23,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_add_bf16 v5, s105, s105 +// GFX13: v_pk_add_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x23,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_add_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_add_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x23,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_add_bf16 v5, ttmp15, src_scc +// GFX13: v_pk_add_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x23,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_add_bf16 v5, v1, v2 +// GFX13: v_pk_add_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x23,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_add_bf16 v5, v255, v255 +// GFX13: v_pk_add_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x23,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_add_bf16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_add_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x23,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_add_bf16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_add_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x23,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_add_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_add_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x0f,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_add_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_add_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x0f,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_add_f16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_add_f16 v5, 0.5, m0 
neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x0f,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_add_f16 v5, exec_hi, null +// GFX13: v_pk_add_f16 v5, exec_hi, null ; encoding: [0x05,0x40,0x0f,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_add_f16 v5, exec_lo, -1 +// GFX13: v_pk_add_f16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0f,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_add_f16 v5, m0, 0.5 +// GFX13: v_pk_add_f16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0f,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_add_f16 v5, null, exec_lo +// GFX13: v_pk_add_f16 v5, null, exec_lo ; encoding: [0x05,0x40,0x0f,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_add_f16 v5, s1, s2 +// GFX13: v_pk_add_f16 v5, s1, s2 ; encoding: [0x05,0x40,0x0f,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_add_f16 v5, s105, s105 +// GFX13: v_pk_add_f16 v5, s105, s105 ; encoding: [0x05,0x40,0x0f,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_add_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_add_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0f,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_add_f16 v5, ttmp15, src_scc +// GFX13: v_pk_add_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0f,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_add_f16 v5, v1, v2 +// GFX13: v_pk_add_f16 v5, v1, v2 ; encoding: [0x05,0x40,0x0f,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_add_f16 v5, v255, v255 +// GFX13: v_pk_add_f16 v5, v255, v255 ; encoding: [0x05,0x40,0x0f,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_add_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_add_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x0f,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_add_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_add_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x0f,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_add_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp +// GFX13: v_pk_add_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp ; encoding: [0xff,0xd0,0x02,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_add_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: 
v_pk_add_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x02,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_add_i16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x02,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_add_i16 v5, exec_hi, null +// GFX13: v_pk_add_i16 v5, exec_hi, null ; encoding: [0x05,0x40,0x02,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_add_i16 v5, exec_lo, -1 +// GFX13: v_pk_add_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_add_i16 v5, m0, 0.5 +// GFX13: v_pk_add_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_add_i16 v5, null, exec_lo +// GFX13: v_pk_add_i16 v5, null, exec_lo ; encoding: [0x05,0x40,0x02,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_add_i16 v5, s1, s2 +// GFX13: v_pk_add_i16 v5, s1, s2 ; encoding: [0x05,0x40,0x02,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_add_i16 v5, s105, s105 +// GFX13: v_pk_add_i16 v5, s105, s105 ; encoding: [0x05,0x40,0x02,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_add_i16 v5, ttmp15, src_scc +// GFX13: v_pk_add_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_add_i16 v5, v1, v2 +// GFX13: v_pk_add_i16 v5, v1, v2 ; encoding: [0x05,0x40,0x02,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_add_i16 v5, v255, v255 +// GFX13: v_pk_add_i16 v5, v255, v255 ; encoding: [0x05,0x40,0x02,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_add_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_add_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x02,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_add_i16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_add_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x02,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_add_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp +// GFX13: v_pk_add_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] 
clamp ; encoding: [0xff,0xd0,0x0a,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_add_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_add_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0a,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_add_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_add_u16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x0a,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_add_u16 v5, exec_hi, null +// GFX13: v_pk_add_u16 v5, exec_hi, null ; encoding: [0x05,0x40,0x0a,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_add_u16 v5, exec_lo, -1 +// GFX13: v_pk_add_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_add_u16 v5, m0, 0.5 +// GFX13: v_pk_add_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_add_u16 v5, null, exec_lo +// GFX13: v_pk_add_u16 v5, null, exec_lo ; encoding: [0x05,0x40,0x0a,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_add_u16 v5, s1, s2 +// GFX13: v_pk_add_u16 v5, s1, s2 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_add_u16 v5, s105, s105 +// GFX13: v_pk_add_u16 v5, s105, s105 ; encoding: [0x05,0x40,0x0a,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_add_u16 v5, ttmp15, src_scc +// GFX13: v_pk_add_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_add_u16 v5, v1, v2 +// GFX13: v_pk_add_u16 v5, v1, v2 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_add_u16 v5, v255, v255 +// GFX13: v_pk_add_u16 v5, v255, v255 ; encoding: [0x05,0x40,0x0a,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_add_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_add_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x0a,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_add_u16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_add_u16 v5, vcc_lo, ttmp15 ; encoding: 
[0x05,0x40,0x0a,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_ashrrev_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_ashrrev_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x06,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_ashrrev_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_ashrrev_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x06,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_ashrrev_i16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x06,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_ashrrev_i16 v5, exec_hi, null +// GFX13: v_pk_ashrrev_i16 v5, exec_hi, null ; encoding: [0x05,0x40,0x06,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_ashrrev_i16 v5, exec_lo, -1 +// GFX13: v_pk_ashrrev_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_ashrrev_i16 v5, m0, 0.5 +// GFX13: v_pk_ashrrev_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_ashrrev_i16 v5, null, exec_lo +// GFX13: v_pk_ashrrev_i16 v5, null, exec_lo ; encoding: [0x05,0x40,0x06,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_ashrrev_i16 v5, s1, s2 +// GFX13: v_pk_ashrrev_i16 v5, s1, s2 ; encoding: [0x05,0x40,0x06,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_ashrrev_i16 v5, s105, s105 +// GFX13: v_pk_ashrrev_i16 v5, s105, s105 ; encoding: [0x05,0x40,0x06,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_ashrrev_i16 v5, ttmp15, src_scc +// GFX13: v_pk_ashrrev_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_ashrrev_i16 v5, v1, v2 +// GFX13: v_pk_ashrrev_i16 v5, v1, v2 ; encoding: [0x05,0x40,0x06,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_ashrrev_i16 v5, v255, v255 +// GFX13: v_pk_ashrrev_i16 v5, v255, v255 ; encoding: 
[0x05,0x40,0x06,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_ashrrev_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_ashrrev_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x06,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_ashrrev_i16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_ashrrev_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x06,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_fma_bf16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp +// GFX13: v_pk_fma_bf16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp ; encoding: [0xff,0xa7,0x11,0xcc,0xff,0xd6,0xf0,0xf9,0x0b,0xfe,0x00,0x00] + +v_pk_fma_bf16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX13: v_pk_fma_bf16 v5, -1, exec_hi, src_scc neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x05,0x42,0x11,0xcc,0xc1,0xfe,0xf4,0x5b] + +v_pk_fma_bf16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX13: v_pk_fma_bf16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x05,0x4c,0x11,0xcc,0xf0,0xfa,0xc0,0x93] + +v_pk_fma_bf16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] +// GFX13: v_pk_fma_bf16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x11,0xcc,0x7f,0xf8,0xa8,0x11] + +v_pk_fma_bf16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] +// GFX13: v_pk_fma_bf16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x11,0xcc,0x7e,0x82,0xad,0x01] + +v_pk_fma_bf16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] +// GFX13: v_pk_fma_bf16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x11,0xcc,0x7d,0xe0,0xf5,0x01] + +v_pk_fma_bf16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX13: v_pk_fma_bf16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x05,0x39,0x11,0xcc,0x7c,0xfc,0xfc,0x2b,0x0b,0xfe,0x00,0x00] + +v_pk_fma_bf16 v5, s1, v255, 
exec_hi +// GFX13: v_pk_fma_bf16 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x11,0xcc,0x01,0xfe,0xff,0x19] + +v_pk_fma_bf16 v5, s105, s105, exec_lo +// GFX13: v_pk_fma_bf16 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x11,0xcc,0x69,0xd2,0xf8,0x19] + +v_pk_fma_bf16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] neg_lo:[0,0,0] neg_hi:[0,0,0] +// GFX13: v_pk_fma_bf16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x11,0xcc,0xfd,0xd4,0x04,0x0b] + +v_pk_fma_bf16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_pk_fma_bf16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x11,0xcc,0x7b,0xfa,0xed,0x19] + +v_pk_fma_bf16 v5, v1, v2, s3 +// GFX13: v_pk_fma_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x11,0xcc,0x01,0x05,0x0e,0x18] + +v_pk_fma_bf16 v5, v255, s2, s105 +// GFX13: v_pk_fma_bf16 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x11,0xcc,0xff,0x05,0xa4,0x19] + +v_pk_fma_bf16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_pk_fma_bf16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x40,0x11,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00] + +v_pk_fma_bf16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_pk_fma_bf16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x11,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_pk_fma_f16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp +// GFX13: v_pk_fma_f16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[1,1,1] neg_hi:[1,1,1] clamp ; encoding: [0xff,0xa7,0x0e,0xcc,0xff,0xd6,0xf0,0xf9,0x0b,0xfe,0x00,0x00] + +v_pk_fma_f16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX13: v_pk_fma_f16 v5, -1, exec_hi, src_scc neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x05,0x42,0x0e,0xcc,0xc1,0xfe,0xf4,0x5b] + +v_pk_fma_f16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX13: v_pk_fma_f16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: 
[0x05,0x4c,0x0e,0xcc,0xf0,0xfa,0xc0,0x93] + +v_pk_fma_f16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] +// GFX13: v_pk_fma_f16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x0e,0xcc,0x7f,0xf8,0xa8,0x11] + +v_pk_fma_f16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] +// GFX13: v_pk_fma_f16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x0e,0xcc,0x7e,0x82,0xad,0x01] + +v_pk_fma_f16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] +// GFX13: v_pk_fma_f16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x0e,0xcc,0x7d,0xe0,0xf5,0x01] + +v_pk_fma_f16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX13: v_pk_fma_f16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x05,0x39,0x0e,0xcc,0x7c,0xfc,0xfc,0x2b,0x0b,0xfe,0x00,0x00] + +v_pk_fma_f16 v5, s1, v255, exec_hi +// GFX13: v_pk_fma_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x0e,0xcc,0x01,0xfe,0xff,0x19] + +v_pk_fma_f16 v5, s105, s105, exec_lo +// GFX13: v_pk_fma_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x0e,0xcc,0x69,0xd2,0xf8,0x19] + +v_pk_fma_f16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] neg_lo:[0,0,0] neg_hi:[0,0,0] +// GFX13: v_pk_fma_f16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x0e,0xcc,0xfd,0xd4,0x04,0x0b] + +v_pk_fma_f16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_pk_fma_f16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x0e,0xcc,0x7b,0xfa,0xed,0x19] + +v_pk_fma_f16 v5, v1, v2, s3 +// GFX13: v_pk_fma_f16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x18] + +v_pk_fma_f16 v5, v255, s2, s105 +// GFX13: v_pk_fma_f16 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x0e,0xcc,0xff,0x05,0xa4,0x19] + +v_pk_fma_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_pk_fma_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x40,0x0e,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00] + +v_pk_fma_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: 
v_pk_fma_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x0e,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_pk_lshlrev_b16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_lshlrev_b16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x04,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_lshlrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_lshlrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x04,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_lshlrev_b16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x04,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_lshlrev_b16 v5, 0x3800, m0 ; encoding: [0x05,0x40,0x04,0xcc,0xff,0xfa,0x00,0x1a,0x00,0x38,0x00,0x00] + +v_pk_lshlrev_b16 v5, exec_hi, null +// GFX13: v_pk_lshlrev_b16 v5, exec_hi, null ; encoding: [0x05,0x40,0x04,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_lshlrev_b16 v5, exec_lo, -1 +// GFX13: v_pk_lshlrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_lshlrev_b16 v5, m0, 0.5 +// GFX13: v_pk_lshlrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_lshlrev_b16 v5, null, exec_lo +// GFX13: v_pk_lshlrev_b16 v5, null, exec_lo ; encoding: [0x05,0x40,0x04,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_lshlrev_b16 v5, s1, s2 +// GFX13: v_pk_lshlrev_b16 v5, s1, s2 ; encoding: [0x05,0x40,0x04,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_lshlrev_b16 v5, s105, s105 +// GFX13: v_pk_lshlrev_b16 v5, s105, s105 ; encoding: [0x05,0x40,0x04,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_lshlrev_b16 v5, ttmp15, src_scc +// GFX13: v_pk_lshlrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_lshlrev_b16 v5, 
v1, v2 +// GFX13: v_pk_lshlrev_b16 v5, v1, v2 ; encoding: [0x05,0x40,0x04,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_lshlrev_b16 v5, v255, v255 +// GFX13: v_pk_lshlrev_b16 v5, v255, v255 ; encoding: [0x05,0x40,0x04,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_lshlrev_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_lshlrev_b16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x04,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_lshlrev_b16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_lshlrev_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x04,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_lshrrev_b16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_lshrrev_b16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x05,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_lshrrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_lshrrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x05,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_lshrrev_b16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x05,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_lshrrev_b16 v5, exec_hi, null +// GFX13: v_pk_lshrrev_b16 v5, exec_hi, null ; encoding: [0x05,0x40,0x05,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_lshrrev_b16 v5, exec_lo, -1 +// GFX13: v_pk_lshrrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_lshrrev_b16 v5, m0, 0.5 +// GFX13: v_pk_lshrrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_lshrrev_b16 v5, null, exec_lo +// GFX13: v_pk_lshrrev_b16 v5, null, exec_lo ; encoding: [0x05,0x40,0x05,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_lshrrev_b16 v5, s1, s2 +// GFX13: v_pk_lshrrev_b16 v5, s1, s2 ; encoding: [0x05,0x40,0x05,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_lshrrev_b16 v5, s105, s105 +// GFX13: v_pk_lshrrev_b16 v5, s105, s105 ; encoding: [0x05,0x40,0x05,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_lshrrev_b16 v5, 
src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_lshrrev_b16 v5, ttmp15, src_scc +// GFX13: v_pk_lshrrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_lshrrev_b16 v5, v1, v2 +// GFX13: v_pk_lshrrev_b16 v5, v1, v2 ; encoding: [0x05,0x40,0x05,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_lshrrev_b16 v5, v255, v255 +// GFX13: v_pk_lshrrev_b16 v5, v255, v255 ; encoding: [0x05,0x40,0x05,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_lshrrev_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_lshrrev_b16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x05,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_lshrrev_b16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_lshrrev_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x05,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_mad_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] clamp +// GFX13: v_pk_mad_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] clamp ; encoding: [0xff,0xa0,0x00,0xcc,0xff,0xd6,0xf0,0x19,0x0b,0xfe,0x00,0x00] + +v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] +// GFX13: v_pk_mad_i16 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x40,0x00,0xcc,0xc1,0xfe,0xf4,0x1b] + +v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] +// GFX13: v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] + +v_pk_mad_i16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] +// GFX13: v_pk_mad_i16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x00,0xcc,0x7f,0xf8,0xa8,0x11] + +v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] +// GFX13: v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x00,0xcc,0x7e,0x82,0xad,0x01] + +v_pk_mad_i16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] +// GFX13: v_pk_mad_i16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x00,0xcc,0x7d,0xe0,0xf5,0x01] + +v_pk_mad_i16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] 
op_sel_hi:[1,0,0] +// GFX13: v_pk_mad_i16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x00,0xcc,0x7c,0xfc,0xfc,0x0b,0x0b,0xfe,0x00,0x00] + +v_pk_mad_i16 v5, s1, v255, exec_hi +// GFX13: v_pk_mad_i16 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x00,0xcc,0x01,0xfe,0xff,0x19] + +v_pk_mad_i16 v5, s105, s105, exec_lo +// GFX13: v_pk_mad_i16 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x00,0xcc,0x69,0xd2,0xf8,0x19] + +v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] +// GFX13: v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] + +v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] + +v_pk_mad_i16 v5, v1, v2, s3 +// GFX13: v_pk_mad_i16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0x05,0x0e,0x18] + +v_pk_mad_i16 v5, v255, s2, s105 +// GFX13: v_pk_mad_i16 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0x05,0xa4,0x19] + +v_pk_mad_i16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_pk_mad_i16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x40,0x00,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00] + +v_pk_mad_i16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_pk_mad_i16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x00,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_pk_mad_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] clamp +// GFX13: v_pk_mad_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,1] op_sel_hi:[1,1,0] clamp ; encoding: [0xff,0xa0,0x09,0xcc,0xff,0xd6,0xf0,0x19,0x0b,0xfe,0x00,0x00] + +v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] +// GFX13: v_pk_mad_u16 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x40,0x09,0xcc,0xc1,0xfe,0xf4,0x1b] + +v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] +// GFX13: v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] + 
+v_pk_mad_u16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] +// GFX13: v_pk_mad_u16 v5, exec_hi, null, vcc_lo op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x09,0xcc,0x7f,0xf8,0xa8,0x11] + +v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] +// GFX13: v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x09,0xcc,0x7e,0x82,0xad,0x01] + +v_pk_mad_u16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] +// GFX13: v_pk_mad_u16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x09,0xcc,0x7d,0xe0,0xf5,0x01] + +v_pk_mad_u16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] +// GFX13: v_pk_mad_u16 v5, null, exec_lo, 0xfe0b op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x09,0xcc,0x7c,0xfc,0xfc,0x0b,0x0b,0xfe,0x00,0x00] + +v_pk_mad_u16 v5, s1, v255, exec_hi +// GFX13: v_pk_mad_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x40,0x09,0xcc,0x01,0xfe,0xff,0x19] + +v_pk_mad_u16 v5, s105, s105, exec_lo +// GFX13: v_pk_mad_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x40,0x09,0xcc,0x69,0xd2,0xf8,0x19] + +v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] +// GFX13: v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] + +v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] + +v_pk_mad_u16 v5, v1, v2, s3 +// GFX13: v_pk_mad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x09,0xcc,0x01,0x05,0x0e,0x18] + +v_pk_mad_u16 v5, v255, s2, s105 +// GFX13: v_pk_mad_u16 v5, v255, s2, s105 ; encoding: [0x05,0x40,0x09,0xcc,0xff,0x05,0xa4,0x19] + +v_pk_mad_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_pk_mad_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x40,0x09,0xcc,0x6b,0xfe,0xfd,0x1f,0x0b,0xfe,0x00,0x00] + +v_pk_mad_u16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_pk_mad_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x40,0x09,0xcc,0x6a,0xf6,0x0c,0x1c] + +v_pk_max_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] 
op_sel_hi:[1,0] +// GFX13: v_pk_max_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x07,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_max_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_max_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x07,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_max_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_max_i16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x07,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_max_i16 v5, exec_hi, null +// GFX13: v_pk_max_i16 v5, exec_hi, null ; encoding: [0x05,0x40,0x07,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_max_i16 v5, exec_lo, -1 +// GFX13: v_pk_max_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_max_i16 v5, m0, 0.5 +// GFX13: v_pk_max_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_max_i16 v5, null, exec_lo +// GFX13: v_pk_max_i16 v5, null, exec_lo ; encoding: [0x05,0x40,0x07,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_max_i16 v5, s1, s2 +// GFX13: v_pk_max_i16 v5, s1, s2 ; encoding: [0x05,0x40,0x07,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_max_i16 v5, s105, s105 +// GFX13: v_pk_max_i16 v5, s105, s105 ; encoding: [0x05,0x40,0x07,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_max_i16 v5, ttmp15, src_scc +// GFX13: v_pk_max_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_max_i16 v5, v1, v2 +// GFX13: v_pk_max_i16 v5, v1, v2 ; encoding: [0x05,0x40,0x07,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_max_i16 v5, v255, v255 +// GFX13: v_pk_max_i16 v5, v255, v255 ; encoding: [0x05,0x40,0x07,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_max_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_max_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x07,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_max_i16 v5, vcc_lo, ttmp15 +// 
GFX13: v_pk_max_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x07,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_max_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_max_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2c,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_max_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_max_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2c,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_max_num_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_max_num_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2c,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_max_num_bf16 v5, exec_hi, null +// GFX13: v_pk_max_num_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2c,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_max_num_bf16 v5, exec_lo, -1 +// GFX13: v_pk_max_num_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2c,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_max_num_bf16 v5, m0, 0.5 +// GFX13: v_pk_max_num_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2c,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_max_num_bf16 v5, null, exec_lo +// GFX13: v_pk_max_num_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2c,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_max_num_bf16 v5, s1, s2 +// GFX13: v_pk_max_num_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2c,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_max_num_bf16 v5, s105, s105 +// GFX13: v_pk_max_num_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2c,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_max_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_max_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2c,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_max_num_bf16 v5, ttmp15, src_scc +// GFX13: v_pk_max_num_bf16 v5, ttmp15, src_scc ; encoding: 
[0x05,0x40,0x2c,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_max_num_bf16 v5, v1, v2 +// GFX13: v_pk_max_num_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2c,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_max_num_bf16 v5, v255, v255 +// GFX13: v_pk_max_num_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2c,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_max_num_bf16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_max_num_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2c,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_max_num_bf16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_max_num_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2c,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_max_num_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_max_num_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x1c,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_max_num_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_max_num_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x1c,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_max_num_f16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_max_num_f16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x1c,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_max_num_f16 v5, exec_hi, null +// GFX13: v_pk_max_num_f16 v5, exec_hi, null ; encoding: [0x05,0x40,0x1c,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_max_num_f16 v5, exec_lo, -1 +// GFX13: v_pk_max_num_f16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x1c,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_max_num_f16 v5, m0, 0.5 +// GFX13: v_pk_max_num_f16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x1c,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_max_num_f16 v5, null, exec_lo +// GFX13: v_pk_max_num_f16 v5, null, exec_lo ; encoding: [0x05,0x40,0x1c,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_max_num_f16 v5, s1, s2 +// GFX13: v_pk_max_num_f16 v5, s1, s2 ; encoding: 
[0x05,0x40,0x1c,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_max_num_f16 v5, s105, s105 +// GFX13: v_pk_max_num_f16 v5, s105, s105 ; encoding: [0x05,0x40,0x1c,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_max_num_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_max_num_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x1c,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_max_num_f16 v5, ttmp15, src_scc +// GFX13: v_pk_max_num_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x1c,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_max_num_f16 v5, v1, v2 +// GFX13: v_pk_max_num_f16 v5, v1, v2 ; encoding: [0x05,0x40,0x1c,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_max_num_f16 v5, v255, v255 +// GFX13: v_pk_max_num_f16 v5, v255, v255 ; encoding: [0x05,0x40,0x1c,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_max_num_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_max_num_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x1c,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_max_num_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_max_num_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x1c,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_max_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_max_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x0c,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_max_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_max_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0c,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_max_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_max_u16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x0c,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_max_u16 v5, exec_hi, null +// GFX13: v_pk_max_u16 v5, exec_hi, null ; encoding: [0x05,0x40,0x0c,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_max_u16 v5, exec_lo, -1 +// GFX13: v_pk_max_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_max_u16 v5, m0, 0.5 +// GFX13: v_pk_max_u16 v5, m0, 0.5 ; encoding: 
[0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_max_u16 v5, null, exec_lo +// GFX13: v_pk_max_u16 v5, null, exec_lo ; encoding: [0x05,0x40,0x0c,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_max_u16 v5, s1, s2 +// GFX13: v_pk_max_u16 v5, s1, s2 ; encoding: [0x05,0x40,0x0c,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_max_u16 v5, s105, s105 +// GFX13: v_pk_max_u16 v5, s105, s105 ; encoding: [0x05,0x40,0x0c,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_max_u16 v5, ttmp15, src_scc +// GFX13: v_pk_max_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_max_u16 v5, v1, v2 +// GFX13: v_pk_max_u16 v5, v1, v2 ; encoding: [0x05,0x40,0x0c,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_max_u16 v5, v255, v255 +// GFX13: v_pk_max_u16 v5, v255, v255 ; encoding: [0x05,0x40,0x0c,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_max_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_max_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x0c,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_max_u16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_max_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x0c,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_maximum_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_maximum_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x1e,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_maximum_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_maximum_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x1e,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_maximum_f16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_maximum_f16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x1e,0xcc,0xf0,0xfa,0x00,0x5a] 
+ +v_pk_maximum_f16 v5, exec_hi, null +// GFX13: v_pk_maximum_f16 v5, exec_hi, null ; encoding: [0x05,0x40,0x1e,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_maximum_f16 v5, exec_lo, -1 +// GFX13: v_pk_maximum_f16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x1e,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_maximum_f16 v5, m0, 0.5 +// GFX13: v_pk_maximum_f16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x1e,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_maximum_f16 v5, null, exec_lo +// GFX13: v_pk_maximum_f16 v5, null, exec_lo ; encoding: [0x05,0x40,0x1e,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_maximum_f16 v5, s1, s2 +// GFX13: v_pk_maximum_f16 v5, s1, s2 ; encoding: [0x05,0x40,0x1e,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_maximum_f16 v5, s105, s105 +// GFX13: v_pk_maximum_f16 v5, s105, s105 ; encoding: [0x05,0x40,0x1e,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_maximum_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_maximum_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x1e,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_maximum_f16 v5, ttmp15, src_scc +// GFX13: v_pk_maximum_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x1e,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_maximum_f16 v5, v1, v2 +// GFX13: v_pk_maximum_f16 v5, v1, v2 ; encoding: [0x05,0x40,0x1e,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_maximum_f16 v5, v255, v255 +// GFX13: v_pk_maximum_f16 v5, v255, v255 ; encoding: [0x05,0x40,0x1e,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_maximum_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_maximum_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x1e,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_maximum_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_maximum_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x1e,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_min_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_min_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x08,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_min_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// 
GFX13: v_pk_min_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x08,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_min_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_min_i16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x08,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_min_i16 v5, exec_hi, null +// GFX13: v_pk_min_i16 v5, exec_hi, null ; encoding: [0x05,0x40,0x08,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_min_i16 v5, exec_lo, -1 +// GFX13: v_pk_min_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_min_i16 v5, m0, 0.5 +// GFX13: v_pk_min_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_min_i16 v5, null, exec_lo +// GFX13: v_pk_min_i16 v5, null, exec_lo ; encoding: [0x05,0x40,0x08,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_min_i16 v5, s1, s2 +// GFX13: v_pk_min_i16 v5, s1, s2 ; encoding: [0x05,0x40,0x08,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_min_i16 v5, s105, s105 +// GFX13: v_pk_min_i16 v5, s105, s105 ; encoding: [0x05,0x40,0x08,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_min_i16 v5, ttmp15, src_scc +// GFX13: v_pk_min_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_min_i16 v5, v1, v2 +// GFX13: v_pk_min_i16 v5, v1, v2 ; encoding: [0x05,0x40,0x08,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_min_i16 v5, v255, v255 +// GFX13: v_pk_min_i16 v5, v255, v255 ; encoding: [0x05,0x40,0x08,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_min_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_min_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x08,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_min_i16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_min_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x08,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_min_num_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_min_num_bf16 v255, 
0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2b,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_min_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_min_num_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2b,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_min_num_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_min_num_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2b,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_min_num_bf16 v5, exec_hi, null +// GFX13: v_pk_min_num_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2b,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_min_num_bf16 v5, exec_lo, -1 +// GFX13: v_pk_min_num_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2b,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_min_num_bf16 v5, m0, 0.5 +// GFX13: v_pk_min_num_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2b,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_min_num_bf16 v5, null, exec_lo +// GFX13: v_pk_min_num_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2b,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_min_num_bf16 v5, s1, s2 +// GFX13: v_pk_min_num_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2b,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_min_num_bf16 v5, s105, s105 +// GFX13: v_pk_min_num_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2b,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_min_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_min_num_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2b,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_min_num_bf16 v5, ttmp15, src_scc +// GFX13: v_pk_min_num_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2b,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_min_num_bf16 v5, v1, v2 +// GFX13: v_pk_min_num_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2b,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_min_num_bf16 v5, v255, v255 +// GFX13: v_pk_min_num_bf16 v5, v255, v255 ; encoding: 
[0x05,0x40,0x2b,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_min_num_bf16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_min_num_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2b,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_min_num_bf16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_min_num_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2b,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_min_num_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_min_num_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x1b,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_min_num_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_min_num_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x1b,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_min_num_f16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_min_num_f16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x1b,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_min_num_f16 v5, exec_hi, null +// GFX13: v_pk_min_num_f16 v5, exec_hi, null ; encoding: [0x05,0x40,0x1b,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_min_num_f16 v5, exec_lo, -1 +// GFX13: v_pk_min_num_f16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x1b,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_min_num_f16 v5, m0, 0.5 +// GFX13: v_pk_min_num_f16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x1b,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_min_num_f16 v5, null, exec_lo +// GFX13: v_pk_min_num_f16 v5, null, exec_lo ; encoding: [0x05,0x40,0x1b,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_min_num_f16 v5, s1, s2 +// GFX13: v_pk_min_num_f16 v5, s1, s2 ; encoding: [0x05,0x40,0x1b,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_min_num_f16 v5, s105, s105 +// GFX13: v_pk_min_num_f16 v5, s105, s105 ; encoding: [0x05,0x40,0x1b,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_min_num_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_min_num_f16 
v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x1b,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_min_num_f16 v5, ttmp15, src_scc +// GFX13: v_pk_min_num_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x1b,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_min_num_f16 v5, v1, v2 +// GFX13: v_pk_min_num_f16 v5, v1, v2 ; encoding: [0x05,0x40,0x1b,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_min_num_f16 v5, v255, v255 +// GFX13: v_pk_min_num_f16 v5, v255, v255 ; encoding: [0x05,0x40,0x1b,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_min_num_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_min_num_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x1b,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_min_num_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_min_num_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x1b,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_min_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_min_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x0d,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_min_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_min_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0d,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_min_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_min_u16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x0d,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_min_u16 v5, exec_hi, null +// GFX13: v_pk_min_u16 v5, exec_hi, null ; encoding: [0x05,0x40,0x0d,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_min_u16 v5, exec_lo, -1 +// GFX13: v_pk_min_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_min_u16 v5, m0, 0.5 +// GFX13: v_pk_min_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_min_u16 v5, null, exec_lo +// GFX13: v_pk_min_u16 v5, null, exec_lo ; encoding: [0x05,0x40,0x0d,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_min_u16 v5, s1, s2 +// GFX13: v_pk_min_u16 v5, s1, s2 ; encoding: [0x05,0x40,0x0d,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_min_u16 v5, s105, s105 
+// GFX13: v_pk_min_u16 v5, s105, s105 ; encoding: [0x05,0x40,0x0d,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_min_u16 v5, ttmp15, src_scc +// GFX13: v_pk_min_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_min_u16 v5, v1, v2 +// GFX13: v_pk_min_u16 v5, v1, v2 ; encoding: [0x05,0x40,0x0d,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_min_u16 v5, v255, v255 +// GFX13: v_pk_min_u16 v5, v255, v255 ; encoding: [0x05,0x40,0x0d,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_min_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_min_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x0d,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_min_u16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_min_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x0d,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_minimum_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_minimum_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x1d,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_minimum_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_minimum_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x1d,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_minimum_f16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_minimum_f16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x1d,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_minimum_f16 v5, exec_hi, null +// GFX13: v_pk_minimum_f16 v5, exec_hi, null ; encoding: [0x05,0x40,0x1d,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_minimum_f16 v5, exec_lo, -1 +// GFX13: v_pk_minimum_f16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x1d,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_minimum_f16 v5, m0, 0.5 +// GFX13: 
v_pk_minimum_f16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x1d,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_minimum_f16 v5, null, exec_lo +// GFX13: v_pk_minimum_f16 v5, null, exec_lo ; encoding: [0x05,0x40,0x1d,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_minimum_f16 v5, s1, s2 +// GFX13: v_pk_minimum_f16 v5, s1, s2 ; encoding: [0x05,0x40,0x1d,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_minimum_f16 v5, s105, s105 +// GFX13: v_pk_minimum_f16 v5, s105, s105 ; encoding: [0x05,0x40,0x1d,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_minimum_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_minimum_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x1d,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_minimum_f16 v5, ttmp15, src_scc +// GFX13: v_pk_minimum_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x1d,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_minimum_f16 v5, v1, v2 +// GFX13: v_pk_minimum_f16 v5, v1, v2 ; encoding: [0x05,0x40,0x1d,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_minimum_f16 v5, v255, v255 +// GFX13: v_pk_minimum_f16 v5, v255, v255 ; encoding: [0x05,0x40,0x1d,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_minimum_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_minimum_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x1d,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_minimum_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_minimum_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x1d,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_mul_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_mul_bf16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x2a,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_mul_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_mul_bf16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x2a,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_mul_bf16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] 
neg_hi:[0,1] +// GFX13: v_pk_mul_bf16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x2a,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_mul_bf16 v5, exec_hi, null +// GFX13: v_pk_mul_bf16 v5, exec_hi, null ; encoding: [0x05,0x40,0x2a,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_mul_bf16 v5, exec_lo, -1 +// GFX13: v_pk_mul_bf16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x2a,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_mul_bf16 v5, m0, 0.5 +// GFX13: v_pk_mul_bf16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x2a,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_mul_bf16 v5, null, exec_lo +// GFX13: v_pk_mul_bf16 v5, null, exec_lo ; encoding: [0x05,0x40,0x2a,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_mul_bf16 v5, s1, s2 +// GFX13: v_pk_mul_bf16 v5, s1, s2 ; encoding: [0x05,0x40,0x2a,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_mul_bf16 v5, s105, s105 +// GFX13: v_pk_mul_bf16 v5, s105, s105 ; encoding: [0x05,0x40,0x2a,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_mul_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_mul_bf16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x2a,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_mul_bf16 v5, ttmp15, src_scc +// GFX13: v_pk_mul_bf16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x2a,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_mul_bf16 v5, v1, v2 +// GFX13: v_pk_mul_bf16 v5, v1, v2 ; encoding: [0x05,0x40,0x2a,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_mul_bf16 v5, v255, v255 +// GFX13: v_pk_mul_bf16 v5, v255, v255 ; encoding: [0x05,0x40,0x2a,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_mul_bf16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_mul_bf16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x2a,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_mul_bf16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_mul_bf16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x2a,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_mul_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp +// GFX13: v_pk_mul_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: 
[0xff,0xd3,0x10,0xcc,0xff,0xd6,0x00,0x6a,0x0b,0xfe,0x00,0x00] + +v_pk_mul_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] +// GFX13: v_pk_mul_f16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] neg_lo:[1,0] neg_hi:[1,0] ; encoding: [0x05,0x59,0x10,0xcc,0xc1,0xfe,0x00,0x22] + +v_pk_mul_f16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] neg_lo:[0,1] neg_hi:[0,1] +// GFX13: v_pk_mul_f16 v5, 0.5, m0 neg_lo:[0,1] neg_hi:[0,1] ; encoding: [0x05,0x42,0x10,0xcc,0xf0,0xfa,0x00,0x5a] + +v_pk_mul_f16 v5, exec_hi, null +// GFX13: v_pk_mul_f16 v5, exec_hi, null ; encoding: [0x05,0x40,0x10,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_mul_f16 v5, exec_lo, -1 +// GFX13: v_pk_mul_f16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x10,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_mul_f16 v5, m0, 0.5 +// GFX13: v_pk_mul_f16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x10,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_mul_f16 v5, null, exec_lo +// GFX13: v_pk_mul_f16 v5, null, exec_lo ; encoding: [0x05,0x40,0x10,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_mul_f16 v5, s1, s2 +// GFX13: v_pk_mul_f16 v5, s1, s2 ; encoding: [0x05,0x40,0x10,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_mul_f16 v5, s105, s105 +// GFX13: v_pk_mul_f16 v5, s105, s105 ; encoding: [0x05,0x40,0x10,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_mul_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] neg_hi:[0,0] +// GFX13: v_pk_mul_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x10,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_mul_f16 v5, ttmp15, src_scc +// GFX13: v_pk_mul_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x10,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_mul_f16 v5, v1, v2 +// GFX13: v_pk_mul_f16 v5, v1, v2 ; encoding: [0x05,0x40,0x10,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_mul_f16 v5, v255, v255 +// GFX13: v_pk_mul_f16 v5, v255, v255 ; encoding: [0x05,0x40,0x10,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_mul_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_mul_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x10,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + 
+v_pk_mul_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_mul_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x10,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_mul_lo_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_mul_lo_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0xff,0x50,0x01,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_mul_lo_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_mul_lo_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x01,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_mul_lo_u16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x01,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_mul_lo_u16 v5, exec_hi, null +// GFX13: v_pk_mul_lo_u16 v5, exec_hi, null ; encoding: [0x05,0x40,0x01,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_mul_lo_u16 v5, exec_lo, -1 +// GFX13: v_pk_mul_lo_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_mul_lo_u16 v5, m0, 0.5 +// GFX13: v_pk_mul_lo_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_mul_lo_u16 v5, null, exec_lo +// GFX13: v_pk_mul_lo_u16 v5, null, exec_lo ; encoding: [0x05,0x40,0x01,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_mul_lo_u16 v5, s1, s2 +// GFX13: v_pk_mul_lo_u16 v5, s1, s2 ; encoding: [0x05,0x40,0x01,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_mul_lo_u16 v5, s105, s105 +// GFX13: v_pk_mul_lo_u16 v5, s105, s105 ; encoding: [0x05,0x40,0x01,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_mul_lo_u16 v5, ttmp15, src_scc +// GFX13: v_pk_mul_lo_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_mul_lo_u16 v5, v1, v2 +// GFX13: v_pk_mul_lo_u16 v5, v1, v2 ; encoding: [0x05,0x40,0x01,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_mul_lo_u16 v5, v255, v255 +// GFX13: v_pk_mul_lo_u16 v5, 
v255, v255 ; encoding: [0x05,0x40,0x01,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_mul_lo_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_mul_lo_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x01,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_mul_lo_u16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_mul_lo_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x01,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_sub_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp +// GFX13: v_pk_sub_i16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp ; encoding: [0xff,0xd0,0x03,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_sub_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_sub_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x03,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_sub_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_sub_i16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x03,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_sub_i16 v5, exec_hi, null +// GFX13: v_pk_sub_i16 v5, exec_hi, null ; encoding: [0x05,0x40,0x03,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_sub_i16 v5, exec_lo, -1 +// GFX13: v_pk_sub_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_sub_i16 v5, m0, 0.5 +// GFX13: v_pk_sub_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_sub_i16 v5, null, exec_lo +// GFX13: v_pk_sub_i16 v5, null, exec_lo ; encoding: [0x05,0x40,0x03,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_sub_i16 v5, s1, s2 +// GFX13: v_pk_sub_i16 v5, s1, s2 ; encoding: [0x05,0x40,0x03,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_sub_i16 v5, s105, s105 +// GFX13: v_pk_sub_i16 v5, s105, s105 ; encoding: [0x05,0x40,0x03,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_sub_i16 v5, ttmp15, src_scc +// GFX13: v_pk_sub_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x1a] + 
+v_pk_sub_i16 v5, v1, v2 +// GFX13: v_pk_sub_i16 v5, v1, v2 ; encoding: [0x05,0x40,0x03,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_sub_i16 v5, v255, v255 +// GFX13: v_pk_sub_i16 v5, v255, v255 ; encoding: [0x05,0x40,0x03,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_sub_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_sub_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x03,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_sub_i16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_sub_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x03,0xcc,0x6a,0xf6,0x00,0x1a] + +v_pk_sub_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp +// GFX13: v_pk_sub_u16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] clamp ; encoding: [0xff,0xd0,0x0b,0xcc,0xff,0xd6,0x00,0x0a,0x0b,0xfe,0x00,0x00] + +v_pk_sub_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] +// GFX13: v_pk_sub_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0b,0xcc,0xc1,0xfe,0x00,0x02] + +v_pk_sub_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_sub_u16 v5, 0.5, m0 ; encoding: [0x05,0x40,0x0b,0xcc,0xf0,0xfa,0x00,0x1a] + +v_pk_sub_u16 v5, exec_hi, null +// GFX13: v_pk_sub_u16 v5, exec_hi, null ; encoding: [0x05,0x40,0x0b,0xcc,0x7f,0xf8,0x00,0x1a] + +v_pk_sub_u16 v5, exec_lo, -1 +// GFX13: v_pk_sub_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x1a] + +v_pk_sub_u16 v5, m0, 0.5 +// GFX13: v_pk_sub_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x1a] + +v_pk_sub_u16 v5, null, exec_lo +// GFX13: v_pk_sub_u16 v5, null, exec_lo ; encoding: [0x05,0x40,0x0b,0xcc,0x7c,0xfc,0x00,0x1a] + +v_pk_sub_u16 v5, s1, s2 +// GFX13: v_pk_sub_u16 v5, s1, s2 ; encoding: [0x05,0x40,0x0b,0xcc,0x01,0x04,0x00,0x1a] + +v_pk_sub_u16 v5, s105, s105 +// GFX13: v_pk_sub_u16 v5, s105, s105 ; encoding: [0x05,0x40,0x0b,0xcc,0x69,0xd2,0x00,0x1a] + +v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: 
[0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x12] + +v_pk_sub_u16 v5, ttmp15, src_scc +// GFX13: v_pk_sub_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x1a] + +v_pk_sub_u16 v5, v1, v2 +// GFX13: v_pk_sub_u16 v5, v1, v2 ; encoding: [0x05,0x40,0x0b,0xcc,0x01,0x05,0x02,0x1a] + +v_pk_sub_u16 v5, v255, v255 +// GFX13: v_pk_sub_u16 v5, v255, v255 ; encoding: [0x05,0x40,0x0b,0xcc,0xff,0xff,0x03,0x1a] + +v_pk_sub_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_pk_sub_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x40,0x0b,0xcc,0x6b,0xfe,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_pk_sub_u16 v5, vcc_lo, ttmp15 +// GFX13: v_pk_sub_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x0b,0xcc,0x6a,0xf6,0x00,0x1a] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp16.s new file mode 100644 index 0000000000000..cd3929ed33a08 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp16.s @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding %s | FileCheck --check-prefixes=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding -disassemble | FileCheck --check-prefixes=GFX13 %s + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 +// GFX13: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1] + +v_fma_mix_f32_bf16 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 +// GFX13: v_fma_mix_f32_bf16_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x3d,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1] + +v_fma_mixhi_bf16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 +// GFX13: v_fma_mixhi_bf16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp 
quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x3f,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f] + +v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 +// GFX13: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f] + +v_fma_mixlo_bf16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 +// GFX13: v_fma_mixlo_bf16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x3e,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp8.s new file mode 100644 index 0000000000000..2919c72042866 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_dpp8.s @@ -0,0 +1,34 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding %s | FileCheck --check-prefixes=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding -disassemble | FileCheck --check-prefixes=GFX13 %s + +v_fma_mix_f32 v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 +// GFX13: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] + +v_fma_mix_f32 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] + +v_fma_mix_f32_bf16 v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 +// GFX13: v_fma_mix_f32_bf16_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x3d,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] + 
+v_fma_mix_f32_bf16 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mix_f32_bf16_e64_dpp v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x00,0x3d,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] + +v_fma_mixhi_bf16 v0, abs(v1), -v2, abs(v3) dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mixhi_bf16_e64_dpp v0, |v1|, -v2, |v3| dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x05,0x3f,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92] + +v_fma_mixhi_bf16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mixhi_bf16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x3f,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] + +v_fma_mixlo_bf16 v0, abs(v1), -v2, abs(v3) dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mixlo_bf16_e64_dpp v0, |v1|, -v2, |v3| dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x05,0x3e,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92] + +v_fma_mixlo_bf16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mixlo_bf16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x3e,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] + +v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92] + +// For test purpose only. 
OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to all 1 +v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] +// GFX13: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_features.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_features.s new file mode 100644 index 0000000000000..31f4332689c7e --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3p_features.s @@ -0,0 +1,125 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 %s | FileCheck --check-prefix=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 %s | %extract-encodings | llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 -disassemble | FileCheck --check-prefix=GFX13 %s + +// Test op_sel/op_sel_hi + +v_pk_add_u16 v1, v2, v3 +// GFX13: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,0] +// GFX13: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,1] +// GFX13: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1] +// GFX13: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x02] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x02] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,0] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x1a] + 
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,1] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x12] + +v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x0a] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x1a] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x0a] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x12] + +v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x12] + +v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] +// GFX13: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x0a] + +// Test src2 op_sel/op_sel_hi + +v_pk_fma_f16 v8, v0, s0, v1 +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] +// 
GFX13: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04] + +v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04] + +// Test neg_lo/neg_hi + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] +// GFX13: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c] + +// DOT + +v_dot4_i32_iu8 v3, v4, v5, v6 +// GFX13: v_dot4_i32_iu8 v3, v4, v5, v6 ; encoding: 
[0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c] + +v_dot4_i32_iu8 v3, v4, v5, 0xf neg_lo:[1,1] +// GFX13: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a] + +v_dot4_u32_u8 v3, v4, v5, v6 +// GFX13: v_dot4_u32_u8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c] + +v_dot4_i32_iu8 v3, v4, v5, 0xf +// GFX13: v_dot4_i32_iu8 v3, v4, v5, 15 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a] + +v_dot8_i32_iu4 v3, v4, v5, 0xf neg_lo:[1,0] +// GFX13: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a] + +v_dot8_i32_iu4 v3, v4, v5, v0 neg_lo:[0,0] +// GFX13: v_dot8_i32_iu4 v3, v4, v5, v0 ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c] + +v_dot8_u32_u4 v0, v1, v2, v3 +// GFX13: v_dot8_u32_u4 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c] From 6c5f5c1bad824b9dc736bb9c167a065ca71dad69 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 8 May 2026 09:53:29 -0700 Subject: [PATCH 070/538] [Clang][Modules] Fix -Wunused-variable (#196577) Mark some variables [[maybe_unused]] and inline others that do not have side effects to avoid -Wunused-variable in non-assert builds. 
--- clang/lib/Serialization/ASTReader.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 6b242f553c59d..dfd714dd53814 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -6327,7 +6327,8 @@ Module *ASTReader::getSubmodule(uint32_t GlobalID) { assert(It != GlobalSubmoduleMap.end()); ModuleFile &F = *It->second; unsigned Index = GlobalID - F.BaseSubmoduleID - NUM_PREDEF_SUBMODULE_IDS; - unsigned LocalID = Index + F.LocalBaseSubmoduleID + NUM_PREDEF_SUBMODULE_IDS; + [[maybe_unused]] unsigned LocalID = + Index + F.LocalBaseSubmoduleID + NUM_PREDEF_SUBMODULE_IDS; BitstreamCursor &Cursor = F.SubmodulesCursor; SavedStreamPosition SavedPosition(Cursor); @@ -6397,10 +6398,9 @@ Module *ASTReader::getSubmodule(uint32_t GlobalID) { StringRef Name = Blob; unsigned Idx = 0; - unsigned ReadLocalID = Record[Idx++]; + [[maybe_unused]] unsigned ReadLocalID = Record[Idx++]; assert(LocalID == ReadLocalID); - SubmoduleID ReadGlobalID = getGlobalSubmoduleID(F, ReadLocalID); - assert(GlobalID == ReadGlobalID); + assert(GlobalID == getGlobalSubmoduleID(F, ReadLocalID)); SubmoduleID Parent = getGlobalSubmoduleID(F, Record[Idx++]); Module::ModuleKind Kind = (Module::ModuleKind)Record[Idx++]; SourceLocation DefinitionLoc = ReadSourceLocation(F, Record[Idx++]); From 2b97000077e1f8fd453c4096729e9520fe9254e7 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 8 May 2026 17:55:25 +0100 Subject: [PATCH 071/538] [AArch64][GlobalISel] Legalize F64 to BF16 fptruncates (#196077) This two-step expansion of bf16 fptrunc steps needs to be careful to avoid double-rounding error. Under AArch64 we can apparently convert to a fcvtxn that performs round-to-odd, followed by a standard fp truncate to bf16 to make sure the rounding from there is done correctly. This reuses the existing lowering added for vector operations. 
--- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 19 ++- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 2 + llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll | 11 +- .../test/CodeGen/AArch64/bf16-instructions.ll | 115 ++++++++++++------ .../CodeGen/AArch64/bf16-v4-instructions.ll | 50 +++++--- .../CodeGen/AArch64/bf16-v8-instructions.ll | 80 ++++++++---- 6 files changed, 189 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index e8bc74a150b57..605cb86a7bb60 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -872,9 +872,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .customIf([](const LegalityQuery &Q) { LLT DstTy = Q.Types[0]; LLT SrcTy = Q.Types[1]; - return SrcTy.isFixedVector() && DstTy.isFixedVector() && - SrcTy.getScalarType().isFloat64() && - DstTy.getScalarType().isFloat16(); + return SrcTy.getScalarSizeInBits() == 64 && + DstTy.getScalarSizeInBits() == 16; }) .lowerFor({{bf16, f32}, {v4bf16, v4f32}}) // Clamp based on input @@ -2659,6 +2658,20 @@ bool AArch64LegalizerInfo::legalizeFptrunc(MachineInstr &MI, MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI) const { auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); + + // This function legalizes f64 -> bf16 and f64 -> f16 truncations via f64 -> + // f32 G_FPTRUNC_ODD and f32 -> [b]f16 G_FPTRUNC, which apparently avoids the + // usual double-rounding issue that could be present from using twin + // G_FPTRUNC. 
+ + if (DstTy.isBFloat16() && SrcTy.isFloat64()) { + auto Mid = + MIRBuilder.buildInstr(AArch64::G_FPTRUNC_ODD, {LLT::float32()}, {Src}); + MIRBuilder.buildInstr(AArch64::G_FPTRUNC, {Dst}, {Mid}); + MI.eraseFromParent(); + return true; + } + assert(SrcTy.isFixedVector() && isPowerOf2_32(SrcTy.getNumElements()) && "Expected a power of 2 elements"); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index f1a8f8d4c65c5..d65ffb1c36814 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -741,6 +741,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, case AArch64::G_PMULL: case AArch64::G_SLI: case AArch64::G_SRI: + case AArch64::G_FPTRUNC_ODD: return true; case TargetOpcode::G_INTRINSIC: switch (cast(MI).getIntrinsicID()) { @@ -781,6 +782,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, case TargetOpcode::G_BUILD_VECTOR_TRUNC: case AArch64::G_SLI: case AArch64::G_SRI: + case AArch64::G_FPTRUNC_ODD: return true; case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: switch (cast(MI).getIntrinsicID()) { diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index 94b494c8c08c4..b2d9237531900 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -3,7 +3,6 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -fast-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FI ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for test_vcvt_bf16_f64 define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f64_f32: @@ -172,13 +171,15 @@ define <2 x bfloat> 
@test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; CHECK-GI-NEXT: fcvtxn v0.2s, v0.2d ; CHECK-GI-NEXT: movi.4s v1, #1 ; CHECK-GI-NEXT: movi.4s v2, #127, msl #8 +; CHECK-GI-NEXT: movi.4s v5, #64, lsl #16 ; CHECK-GI-NEXT: ushr.4s v3, v0, #16 +; CHECK-GI-NEXT: fcmeq.4s v4, v0, v0 ; CHECK-GI-NEXT: add.4s v2, v0, v2 +; CHECK-GI-NEXT: orr.16b v0, v0, v5 ; CHECK-GI-NEXT: and.16b v1, v3, v1 -; CHECK-GI-NEXT: fcmeq.4s v3, v0, v0 -; CHECK-GI-NEXT: orr.4s v0, #64, lsl #16 -; CHECK-GI-NEXT: add.4s v1, v1, v2 -; CHECK-GI-NEXT: bit.16b v0, v1, v3 +; CHECK-GI-NEXT: mvn.16b v3, v4 +; CHECK-GI-NEXT: add.4s v1, v2, v1 +; CHECK-GI-NEXT: bif.16b v0, v1, v3 ; CHECK-GI-NEXT: shrn.4h v0, v0, #16 ; CHECK-GI-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index 240ac524b90c2..d9f661d2d1d34 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -38,7 +38,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sitofp_i64 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32_fadd ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sitofp_i32_fadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_double ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_powi @@ -63,7 +62,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_minnum ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-CVT-GI-NEXT: warning: 
Instruction selection used fallback path for test_copysign_f64 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_floor ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_ceil ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_trunc @@ -107,7 +105,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sitofp_i64 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32_fadd ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sitofp_i32_fadd -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_double ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_powi @@ -132,7 +129,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fneg ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_minnum ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_copysign_f64 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_floor ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_ceil ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_trunc @@ -1119,24 +1115,40 @@ define bfloat @test_fptrunc_float(float %a) #0 { } define bfloat @test_fptrunc_double(double %a) #0 { -; CHECK-CVT-LABEL: test_fptrunc_double: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtxn s0, d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, 
w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fptrunc_double: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtxn s0, d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_fptrunc_double: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: fcvtxn s0, d0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fptrunc_double: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtxn s0, d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = fptrunc double %a to bfloat ret bfloat %r } @@ -2025,28 +2037,57 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { } define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { -; CHECK-CVT-LABEL: test_copysign_f64: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: fcvt s1, d1 -; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: 
def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_copysign_f64: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: fcvt s1, d1 +; CHECK-CVT-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_copysign_f64: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: fcvt s1, d1 -; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_copysign_f64: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: fcvt s1, d1 +; CHECK-BF16-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_copysign_f64: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtxn s1, d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: fmov w9, s1 +; CHECK-CVT-GI-NEXT: fcmp s1, #0.0 +; CHECK-CVT-GI-NEXT: mvni v1.4h, #128, lsl #8 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s2, w8 +; CHECK-CVT-GI-NEXT: bif v0.8b, v2.8b, v1.8b +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; 
CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_copysign_f64: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: fcvtxn s1, d1 +; CHECK-BF16-GI-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: bfcvt h1, s1 +; CHECK-BF16-GI-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-BF16-GI-NEXT: ret %tb = fptrunc double %b to bfloat %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %tb) ret bfloat %r diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll index ff5fcc38e1fc0..c10436c93614c 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll @@ -40,7 +40,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sin @@ -109,7 +108,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; 
CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sin @@ -1034,21 +1032,21 @@ define <4 x bfloat> @test_fptrunc_float(<4 x float> %a) { } define <4 x bfloat> @test_fptrunc_double(<4 x double> %a) { -; CHECK-CVT-LABEL: test_fptrunc_double: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtxn v0.2s, v0.2d -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: fcvtxn2 v0.4s, v1.2d -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b -; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fptrunc_double: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-SD-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-CVT-SD-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_fptrunc_double: ; CHECK-BF16: // %bb.0: @@ -1056,6 +1054,24 @@ define <4 x bfloat> @test_fptrunc_double(<4 x double> %a) { ; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fptrunc_double: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-CVT-GI-NEXT: movi 
v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %1 = fptrunc <4 x double> %a to <4 x bfloat> ret <4 x bfloat> %1 } diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index 980b195d4bf01..7c0b4092a6f48 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -45,7 +45,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sin @@ -118,7 +117,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; 
CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sin @@ -1670,30 +1668,30 @@ define <8 x bfloat> @test_fptrunc_float(<8 x float> %a) { } define <8 x bfloat> @test_fptrunc_double(<8 x double> %a) { -; CHECK-CVT-LABEL: test_fptrunc_double: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtxn v2.2s, v2.2d -; CHECK-CVT-NEXT: fcvtxn v0.2s, v0.2d -; CHECK-CVT-NEXT: fcvtxn2 v2.4s, v3.2d -; CHECK-CVT-NEXT: fcvtxn2 v0.4s, v1.2d -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8 -; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s -; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s -; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b -; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fptrunc_double: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtxn v2.2s, v2.2d +; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-CVT-SD-NEXT: fcvtxn2 v2.4s, v3.2d +; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-SD-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-SD-NEXT: fcmeq v5.4s, v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-CVT-SD-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-CVT-SD-NEXT: fcmeq v6.4s, v0.4s, 
v0.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-CVT-SD-NEXT: bit v2.16b, v4.16b, v5.16b +; CHECK-CVT-SD-NEXT: bit v0.16b, v1.16b, v6.16b +; CHECK-CVT-SD-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_fptrunc_double: ; CHECK-BF16: // %bb.0: @@ -1704,6 +1702,36 @@ define <8 x bfloat> @test_fptrunc_double(<8 x double> %a) { ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fptrunc_double: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-CVT-GI-NEXT: fcvtxn v2.2s, v2.2d +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-CVT-GI-NEXT: fcvtxn2 v2.4s, v3.2d +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v2.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret %1 = fptrunc <8 x double> %a to <8 x bfloat> ret <8 x bfloat> %1 } From a7591ef9419ee6047f551b4f16b93dddc00de1b9 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 May 2026 12:59:13 -0400 
Subject: [PATCH 072/538] [SLP][NFC]Add a test with the revectorization of the struct-returning intrinsics Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/196581 --- .../SLPVectorizer/struct-return-revec.ll | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll diff --git a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll new file mode 100644 index 0000000000000..45d6e395b6886 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S --passes=slp-vectorizer -slp-revec < %s | FileCheck %s + +@phase = dso_local local_unnamed_addr global [8 x double] zeroinitializer, align 16 +@sinval = dso_local local_unnamed_addr global [8 x double] zeroinitializer, align 16 +@cosval = dso_local local_unnamed_addr global [8 x double] zeroinitializer, align 16 + +define i32 @test() { +; CHECK-LABEL: define i32 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr @phase, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 1 +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr @sinval, align 16 +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr @cosval, align 16 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { <2 x double>, <2 x double> } [[TMP5]], 1 +; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 8 +; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 1 +; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16 +; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16 +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 1 +; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 8 +; CHECK-NEXT: store <2 x double> [[TMP15]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 8 +; CHECK-NEXT: ret i32 0 +; +entry: + %0 = load <2 x double>, ptr @phase, align 16 + %1 = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %0) + %2 = extractvalue { <2 x double>, <2 x double> } %1, 0 + %3 = extractvalue { <2 x double>, <2 x double> } %1, 1 + store <2 x double> %2, ptr @sinval, align 16 + store <2 x double> %3, ptr @cosval, align 16 + %4 = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 
16), align 8 + %5 = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %4) + %6 = extractvalue { <2 x double>, <2 x double> } %5, 0 + %7 = extractvalue { <2 x double>, <2 x double> } %5, 1 + store <2 x double> %6, ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 8 + store <2 x double> %7, ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 8 + %8 = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16 + %9 = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %8) + %10 = extractvalue { <2 x double>, <2 x double> } %9, 0 + %11 = extractvalue { <2 x double>, <2 x double> } %9, 1 + store <2 x double> %10, ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16 + store <2 x double> %11, ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16 + %12 = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 8 + %13 = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %12) + %14 = extractvalue { <2 x double>, <2 x double> } %13, 0 + %15 = extractvalue { <2 x double>, <2 x double> } %13, 1 + store <2 x double> %14, ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 8 + store <2 x double> %15, ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 8 + ret i32 0 +} + +declare { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double>) From 62fd4ff3a361deaa85bb51d2ff25ae3b49accee8 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> Date: Fri, 8 May 2026 19:00:53 +0200 Subject: [PATCH 073/538] [AMDGPU] Add missing CMake link component (#196579) The issue was triggered by #196547. 
--- llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt index 2425556ebe33f..3203738cc1c69 100644 --- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt +++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt @@ -17,6 +17,7 @@ set(LLVM_LINK_COMPONENTS GlobalISel MC MIRParser + Passes Support TargetParser ) From b0c6df7b95b3c70d78c65a39598007f722794d38 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 May 2026 13:12:35 -0400 Subject: [PATCH 074/538] [SLP] Vectorize struct-returning intrinsics Allow SLP to combine across lanes calls that return a literal struct (llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single call returning a struct of vectors, by widening {T, T, ...} to {, ...} via VectorTypeUtils and emitting extractvalue + extractelement for external uses. Reviewers: hiraditya, bababuck, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/195521 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 625 ++++++--- .../SLPVectorizer/X86/arith-add-saddo.ll | 1064 +++++++-------- .../SLPVectorizer/X86/arith-add-uaddo.ll | 1064 +++++++-------- .../SLPVectorizer/X86/arith-mul-smulo.ll | 1164 ++++++++--------- .../SLPVectorizer/X86/arith-mul-umulo.ll | 1044 ++++++--------- .../SLPVectorizer/X86/arith-sub-ssubo.ll | 1064 +++++++-------- .../SLPVectorizer/X86/arith-sub-usubo.ll | 1064 +++++++-------- ...revec-non-power-2-to-power-2-large-vect.ll | 8 +- llvm/test/Transforms/SLPVectorizer/sincos.ll | 52 +- .../SLPVectorizer/struct-return-revec.ll | 28 +- 10 files changed, 3265 insertions(+), 3912 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f1a6eb2d7e8af..a508b1d1f744f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include 
"llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" @@ -71,6 +72,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/VectorTypeUtils.h" #ifdef EXPENSIVE_CHECKS #include "llvm/IR/Verifier.h" #endif @@ -300,10 +302,10 @@ static const unsigned MaxPHINumOperands = 128; /// be inevitably scalarized. static bool isValidElementType(Type *Ty) { // TODO: Support ScalableVectorType. - if (SLPReVec && isa(Ty)) - Ty = Ty->getScalarType(); - return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && - !Ty->isPPC_FP128Ty(); + if (SLPReVec && isVectorizedTy(Ty)) + Ty = toScalarizedTy(Ty); + return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() && + !Ty->isVoidTy(); } /// Returns the "element type" of the given value/instruction \p V. @@ -328,15 +330,33 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) { static unsigned getNumElements(Type *Ty) { assert(!isa(Ty) && "ScalableVectorType is not supported."); - if (auto *VecTy = dyn_cast(Ty)) - return VecTy->getNumElements(); + if (isVectorizedTy(Ty)) + return getVectorizedTypeVF(Ty).getFixedValue(); return 1; } /// \returns the vector type of ScalarTy based on vectorization factor. -static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { - return FixedVectorType::get(ScalarTy->getScalarType(), - VF * getNumElements(ScalarTy)); +static Type *getWidenedType(Type *ScalarTy, unsigned VF) { + if (VF == 1 && !isVectorizedTy(ScalarTy)) { + // Workaround for 1 x vector types: toVectorizedTy returns the type + // unchanged when EC is scalar, but BoUpSLP relies on widening to + // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the + // pipeline operating on vector types. 
+ if (auto *StructTy = dyn_cast(ScalarTy)) { + assert(isUnpackedStructLiteral(StructTy) && + "expected unpacked struct literal"); + assert(all_of(StructTy->elements(), VectorType::isValidElementType) && + "expected all element types to be valid vector element types"); + return StructType::get( + StructTy->getContext(), + map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * { + return FixedVectorType::get(ElTy, 1); + })); + } + return FixedVectorType::get(ScalarTy, 1); + } + return toVectorizedTy(toScalarizedTy(ScalarTy), + ElementCount::getFixed(VF * getNumElements(ScalarTy))); } /// Returns the number of elements of the given type \p Ty, not less than \p Sz, @@ -344,7 +364,7 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { /// legalization. static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz) { - if (!isValidElementType(Ty)) + if (!isValidElementType(Ty) || isa(Ty)) return bit_ceil(Sz); // Find the number of elements, which forms full vectors. const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); @@ -359,7 +379,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz) { - if (!isValidElementType(Ty)) + if (!isValidElementType(Ty) || isa(Ty)) return bit_floor(Sz); // Find the number of elements, which forms full vectors. unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); @@ -2039,6 +2059,8 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, return false; if (has_single_bit(Sz)) return true; + if (isa(Ty)) + return false; const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) && Sz % NumParts == 0; @@ -2048,19 +2070,20 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, /// phase. 
If the type is going to be scalarized or does not uses whole /// registers, returns 1. static unsigned -getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, - Type *ScalarTy, +getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy, const unsigned Limit = std::numeric_limits::max()) { + if (isa(VecTy)) + return 1; unsigned NumParts = TTI.getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= Limit) return 1; unsigned Sz = getNumElements(VecTy); unsigned ScalarSz = getNumElements(ScalarTy); - unsigned PWSz = - getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz); + Type *ElementTy = toScalarizedTy(VecTy); + unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz); if (NumParts >= Sz || PWSz % NumParts != 0 || (PWSz / NumParts) % ScalarSz != 0 || - !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts)) + !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts)) return 1; const unsigned NumElts = PWSz / NumParts; if (divideCeil(Sz, NumElts) != NumParts) @@ -2209,14 +2232,14 @@ class slpvectorizer::BoUpSLP { ReductionBitWidth >= DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) - return getWidenedType( - VectorizableTree.front()->Scalars.front()->getType(), - VectorizableTree.front()->getVectorFactor()); - return getWidenedType( + return cast( + getWidenedType(VectorizableTree.front()->Scalars.front()->getType(), + VectorizableTree.front()->getVectorFactor())); + return cast(getWidenedType( IntegerType::get( VectorizableTree.front()->Scalars.front()->getContext(), ReductionBitWidth), - VectorizableTree.front()->getVectorFactor()); + VectorizableTree.front()->getVectorFactor())); } /// Returns true if the tree results in one of the reduced bitcasts variants. @@ -3989,8 +4012,7 @@ class slpvectorizer::BoUpSLP { /// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself /// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs. 
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy, - VectorType *VecTy, - VectorType *FinalVecTy, + Type *VecTy, Type *FinalVecTy, TTI::TargetCostKind CostKind) const; /// This is the recursive part of buildTree. @@ -7107,12 +7129,12 @@ static InstructionCost getExtractWithExtendCost( const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { - if (auto *ScalarTy = dyn_cast(Dst)) { + if (isVectorizedTy(Dst)) { assert(SLPReVec && "Only supported by REVEC."); - auto *SubTp = - getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements()); + auto *SubTp = cast( + getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst))); return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind, - Index * ScalarTy->getNumElements(), SubTp) + + Index * getNumElements(Dst), SubTp) + TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None, CostKind); } @@ -7205,7 +7227,7 @@ static bool isMaskedLoadCompress( InterleaveFactor = 0; Type *ScalarTy = VL.front()->getType(); const size_t Sz = VL.size(); - auto *VecTy = getWidenedType(ScalarTy, Sz); + auto *VecTy = cast(getWidenedType(ScalarTy, Sz)); constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; SmallVector Mask; if (!Order.empty()) @@ -7241,7 +7263,7 @@ static bool isMaskedLoadCompress( // Check for very large distances between elements. if (*Diff / Sz >= MaxRegSize / 8) return false; - LoadVecTy = getWidenedType(ScalarTy, *Diff + 1); + LoadVecTy = cast(getWidenedType(ScalarTy, *Diff + 1)); auto *LI = cast(Order.empty() ? VL.front() : VL[Order.front()]); Align CommonAlignment = LI->getAlign(); IsMasked = !isSafeToLoadUnconditionally( @@ -7290,8 +7312,8 @@ static bool isMaskedLoadCompress( } if (IsStrided && !IsMasked && Order.empty()) { // Check for potential segmented(interleaved) loads. 
- VectorType *AlignedLoadVecTy = getWidenedType( - ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)); + VectorType *AlignedLoadVecTy = cast(getWidenedType( + ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1))); if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment, DL, cast(VL.back()), &AC, &DT, &TLI)) @@ -7482,7 +7504,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate( Type *StrideTy = DL->getIndexType(Ptr0->getType()); SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal); - SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz); + SPtrInfo.Ty = cast(getWidenedType(NewScalarTy, VecSz)); return true; } @@ -7537,7 +7559,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef PointerOps, NewScalarTy = Type::getIntNTy( SE->getContext(), DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets); - FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz); + auto *StridedLoadTy = + cast(getWidenedType(NewScalarTy, VecSz)); unsigned MinProfitableStridedOps = IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores; const unsigned BaseTyNumElts = getNumElements(BaseTy); @@ -7736,7 +7759,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); - auto *VecTy = getWidenedType(ScalarTy, Sz); + auto *VecTy = cast(getWidenedType(ScalarTy, Sz)); Align CommonAlignment = computeCommonAlignment(VL); // Cache masked gather legality - both the !IsSorted path below and the // post-branch check use the same VecTy/CommonAlignment, and the underlying @@ -7817,7 +7840,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // estimate as a buildvector, otherwise estimate as splat. 
APInt DemandedElts = APInt::getAllOnes(Sz); Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType(); - VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz); + auto *PtrVecTy = cast(getWidenedType(PtrScalarTy, Sz)); // Cache the underlying object of PointerOps.front() - it is invariant // across the per-V comparisons below and getUnderlyingObject walks // GEP/cast chains. @@ -7914,7 +7937,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( } for (const auto &[SliceStart, LS] : States) { const unsigned SliceVF = std::min(VF, VL.size() - SliceStart); - auto *SubVecTy = getWidenedType(ScalarTy, SliceVF); + auto *SubVecTy = cast(getWidenedType(ScalarTy, SliceVF)); auto *LI0 = cast(VL[SliceStart]); InstructionCost VectorGEPCost = (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) @@ -8519,7 +8542,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, const auto *It = find_if_not(TE.Scalars, isConstant); if (It == TE.Scalars.begin()) return OrdersType(); - auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz); + auto *Ty = + cast(getWidenedType(TE.Scalars.front()->getType(), Sz)); if (It != TE.Scalars.end()) { OrdersType Order(Sz, Sz); unsigned Idx = std::distance(TE.Scalars.begin(), It); @@ -8777,6 +8801,12 @@ void BoUpSLP::reorderTopToBottom() { // Maps a TreeEntry to the reorder indices of external users. DenseMap> ExternalUserReorderMap; + // TODO: Reordering of struct types is not supported. + if (any_of(VectorizableTree, [](const std::unique_ptr &TE) { + return TE->State == TreeEntry::Vectorize && + isa(getValueType(TE->Scalars.front())); + })) + return; // Compute IgnoreReorder once - it depends only on UserIgnoreList and // VectorizableTree.front(), which do not change during this loop. 
const bool IgnoreReorder = @@ -8803,7 +8833,8 @@ void BoUpSLP::reorderTopToBottom() { if (TE->hasState() && TE->isAltShuffle() && TE->State != TreeEntry::SplitVectorize) { Type *ScalarTy = TE->Scalars[0]->getType(); - VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size()); + auto *VecTy = + cast(getWidenedType(ScalarTy, TE->Scalars.size())); unsigned Opcode0 = TE->getOpcode(); unsigned Opcode1 = TE->getAltOpcode(); SmallBitVector OpcodeMask( @@ -9172,6 +9203,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } if (Users.first) { auto &Data = Users; + // TODO: Reordering of struct types is not supported. + if (Data.first->State == TreeEntry::Vectorize && + isa(getValueType(Data.first->Scalars.front()))) + continue; if (Data.first->State == TreeEntry::SplitVectorize) { assert( Data.second.size() <= 2 && @@ -9972,7 +10007,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( ArrayRef Values(reinterpret_cast(Loads.begin()), Loads.size()); Align Alignment = computeCommonAlignment(Values); - auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size()); + auto *Ty = cast( + getWidenedType(Loads.front()->getType(), Loads.size())); return TTI->isLegalMaskedGather(Ty, Alignment) && !TTI->forceScalarizeMaskedGather(Ty, Alignment); }; @@ -10270,7 +10306,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // Segmented load detected - vectorize at maximum vector factor. if (InterleaveFactor <= Slice.size() && TTI.isLegalInterleavedAccessType( - getWidenedType(Slice.front()->getType(), VF), + cast( + getWidenedType(Slice.front()->getType(), VF)), InterleaveFactor, cast(Slice.front())->getAlign(), cast(Slice.front()) @@ -10530,11 +10567,10 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, /// function (if possible) calls. Returns invalid cost for the corresponding /// calls, if they cannot be vectorized/will be scalarized. 
static std::pair -getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, - TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - ArrayRef ArgTys) { +getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI, ArrayRef ArgTys) { auto Shape = VFShape::get(CI->getFunctionType(), - ElementCount::getFixed(VecTy->getNumElements()), + ElementCount::getFixed(getNumElements(VecTy)), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); auto LibCost = InstructionCost::getInvalid(); @@ -10595,6 +10631,68 @@ ArrayRef BoUpSLP::getLoopNest(const Loop *L) { return Res; } +/// Detects an extractvalue bundle that can be widened by vectorizing the +/// underlying struct-returning calls. +/// +/// \p VL is a bundle whose state \p S is Instruction::ExtractValue. The +/// bundle is acceptable for widening into one struct-of-vectors call only +/// when: +/// - every element of \p VL is an ExtractValueInst, +/// - every ExtractValueInst extracts the same struct field (its +/// getIndices() matches the main op's indices), +/// - the aggregate operands form a uniform set of CallInsts (per +/// getSameOpcode) that is not an alt-shuffle and whose return type is +/// a literal struct, and +/// - every user of every such call is itself an ExtractValueInst, so the +/// external-use extraction code can rebuild scalars via extractvalue + +/// extractelement without needing an insertvalue chain. +/// +/// On success returns true and fills \p Indices with the common field +/// index path and \p Calls with the per-lane aggregate calls (in VL order), +/// for the caller to feed as the operand of the new tree entry. Otherwise +/// returns false and leaves the output parameters untouched. 
+static bool checkEVsForVecCalls(ArrayRef VL, + const InstructionsState &S, + const TargetLibraryInfo &TLI, + SmallVectorImpl &Indices, + SmallVectorImpl &Calls) { + assert(S && S.getOpcode() == Instruction::ExtractValue && + "Expected extractvalue instruction state."); + if (!all_of(VL, IsaPred)) + return false; + auto *VL0 = cast(S.getMainOp()); + ArrayRef VL0Indices = VL0->getIndices(); + SmallVector Aggregates; + for (Value *V : VL) { + if (V == VL0) { + Aggregates.push_back(VL0->getAggregateOperand()); + continue; + } + auto *IV = cast(V); + if (IV->getIndices() != VL0Indices) + return false; + Value *Agg = IV->getAggregateOperand(); + Aggregates.push_back(Agg); + } + const InstructionsState AggState = getSameOpcode(Aggregates, TLI); + if (AggState && AggState.getOpcode() == Instruction::Call && + !AggState.isAltShuffle() && + isa(AggState.getMainOp()->getType())) { + // The struct-returning call may have non-bundle users too. The external + // extraction code rebuilds scalars by extractvalue + extractelement, + // which only works when every user of the call is an ExtractValueInst. + // Bail out if any aggregate has a different kind of user. 
+ for (Value *Agg : Aggregates) { + if (!all_of(Agg->users(), IsaPred)) + return false; + } + Indices.assign(VL0Indices.begin(), VL0Indices.end()); + Calls.swap(Aggregates); + return true; + } + return false; +} + BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, @@ -10640,6 +10738,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( bool Reuse = canReuseExtract(VL, CurrentOrder); if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; + SmallVector Indices; + SmallVector Calls; + if (ShuffleOrOp == Instruction::ExtractValue && + checkEVsForVecCalls(VL, S, *TLI, Indices, Calls)) + return TreeEntry::Vectorize; LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); return TreeEntry::NeedToGather; } @@ -11161,6 +11264,12 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, const BoUpSLP &R, bool BuildGatherOnly = true) { + // TODO: Reordering of struct types is not supported. + if (isa(getValueType(VL.front()))) { + LLVM_DEBUG(dbgs() << "SLP: struct type in bundle.\n"); + ReuseShuffleIndices.clear(); + return true; + } // Check that every instruction appears once in this bundle. 
SmallVector UniqueValues; SmallDenseMap UniquePositions(VL.size()); @@ -11272,8 +11381,9 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, if (Val != PoisonMaskElem && UniquePositions.contains(UniqueValues[Val])) DemandedElts.setBit(Idx); Type *ScalarTy = ::getValueType(UniqueValues.front()); - auto *VecTy = getWidenedType(ScalarTy, VL.size()); - auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues); + auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); + auto *UniquesVecTy = + cast(getWidenedType(ScalarTy, NumUniqueScalarValues)); const unsigned NumParts = ::getNumberOfParts(TTI, VecTy, ScalarTy); const unsigned UniquesNumParts = ::getNumberOfParts(TTI, UniquesVecTy, ScalarTy); @@ -11429,7 +11539,7 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, Op2.push_back(V); } Type *ScalarTy = getValueType(VL.front()); - VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); + auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); unsigned Opcode0 = LocalState.getOpcode(); unsigned Opcode1 = LocalState.getAltOpcode(); SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1)); @@ -11464,8 +11574,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, if (!ReorderIndices.empty()) inversePermutation(ReorderIndices, Mask); unsigned NumParts = TTI->getNumberOfParts(VecTy); - VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size()); - VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size()); + auto *Op1VecTy = cast(getWidenedType(ScalarTy, Op1.size())); + auto *Op2VecTy = cast(getWidenedType(ScalarTy, Op2.size())); // Check non-profitable single register ops, which better to be represented // as alternate ops. 
if (NumParts >= VL.size()) @@ -11473,8 +11583,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; InstructionCost InsertCost = ::getShuffleCost( *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy); - FixedVectorType *SubVecTy = - getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size())); + auto *SubVecTy = cast( + getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()))); InstructionCost NewShuffleCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind); if (!LocalState.isCmpOp() && NumParts <= 1 && @@ -11674,7 +11784,16 @@ class InstructionsCompatibilityAnalysis { Handler.getOperands(I).end()); return; } - case Instruction::ExtractValue: + case Instruction::ExtractValue: { + SmallVector Indices; + SmallVector Calls; + if (checkEVsForVecCalls(VL, S, TLI, Indices, Calls)) { + Operands.assign(1, {}); + Operands[0].swap(Calls); + return; + } + [[fallthrough]]; + } case Instruction::ExtractElement: // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. 
@@ -12062,7 +12181,7 @@ class InstructionsCompatibilityAnalysis { } if (S && S.isAltShuffle()) { Type *ScalarTy = S.getMainOp()->getType(); - VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); + auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); unsigned Opcode0 = S.getOpcode(); unsigned Opcode1 = S.getAltOpcode(); SmallBitVector OpcodeMask( @@ -12127,8 +12246,7 @@ class InstructionsCompatibilityAnalysis { constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind); InstructionCost VectorCost; - FixedVectorType *VecTy = - getWidenedType(S.getMainOp()->getType(), VL.size()); + auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size()); switch (MainOpcode) { case Instruction::Add: case Instruction::Sub: @@ -12530,7 +12648,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, // Rough cost estimation, if the vector code (+ potential extracts) is // more profitable than the scalar + buildvector. Type *ScalarTy = VL.front()->getType(); - auto *VecTy = getWidenedType(ScalarTy, VL.size()); + auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); InstructionCost VectorizeCostEstimate = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) + ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted, @@ -12849,6 +12967,12 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, // This is a special case, as it does not gather, but at the same time // we are not extending buildTreeRec() towards the operands. 
TE->setOperands(Operands); + if (ShuffleOrOp == Instruction::ExtractValue) { + SmallVector Indices; + SmallVector Calls; + if (checkEVsForVecCalls(VL, S, *TLI, Indices, Calls)) + buildTreeRec(Operands.front(), Depth + 1, {TE, 0}); + } return; } case Instruction::InsertElement: { @@ -14152,10 +14276,11 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost = 0; auto *ScalarTy = TE.Scalars.front()->getType(); - auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size()); + auto *VecTy = cast(getWidenedType(ScalarTy, TE.Scalars.size())); for (auto [Idx, Sz] : SubVectors) { - Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind, - Idx, getWidenedType(ScalarTy, Sz)); + Cost += + ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind, + Idx, cast(getWidenedType(ScalarTy, Sz))); } Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/true, @@ -14368,8 +14493,10 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order, Stride * LhsTE->getVectorFactor()); FastMathFlags FMF; SmallPtrSet CheckedExtracts; - auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor()); - auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()); + auto *VecTy = + cast(getWidenedType(ScalarTy, TE.getVectorFactor())); + auto *SrcVecTy = + cast(getWidenedType(SrcScalarTy, LhsTE->getVectorFactor())); TTI::CastContextHint CastCtx = getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0)); InstructionCost VecCost = @@ -14500,7 +14627,7 @@ bool BoUpSLP::matchesInversedZExtSelect( if (InversedCmpsIndices.empty()) return false; - VectorType *VecTy = + Type *VecTy = getWidenedType(Cmp->getOperand(0)->getType(), CmpTE->getVectorFactor()); Type *CmpTy = CmpInst::makeCmpResultType(VecTy); @@ -14561,14 +14688,16 @@ bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const { // Check if bitcast is cheaper than select. 
auto *DstTy = IntegerType::getIntNTy(ScalarTy->getContext(), SelectTE.getVectorFactor()); - VectorType *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor()); + Type *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor()); Type *CmpTy = CmpInst::makeCmpResultType(OpTy); - VectorType *VecTy = getWidenedType(ScalarTy, SelectTE.getVectorFactor()); + auto *VecTy = + cast(getWidenedType(ScalarTy, SelectTE.getVectorFactor())); auto It = MinBWs.find(&SelectTE); if (It != MinBWs.end()) { auto *EffectiveScalarTy = IntegerType::get(F->getContext(), It->second.first); - VecTy = getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor()); + VecTy = cast( + getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor())); } TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost BitcastCost = TTI->getCastInstrCost( @@ -14882,7 +15011,8 @@ void BoUpSLP::transformNodes() { if (E.State != TreeEntry::Vectorize) break; Type *ScalarTy = E.getMainOp()->getType(); - auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size()); + auto *VecTy = + cast(getWidenedType(ScalarTy, E.Scalars.size())); Align CommonAlignment = computeCommonAlignment(E.Scalars); // Check if profitable to represent consecutive load + reverse as strided // load with stride -1. @@ -14920,7 +15050,8 @@ void BoUpSLP::transformNodes() { case Instruction::Store: { Type *ScalarTy = cast(E.getMainOp())->getValueOperand()->getType(); - auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size()); + auto *VecTy = + cast(getWidenedType(ScalarTy, E.Scalars.size())); Align CommonAlignment = computeCommonAlignment(E.Scalars); // Check if profitable to represent consecutive load + reverse as strided // load with stride -1. 
@@ -15247,7 +15378,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost getBuildVectorCost(ArrayRef VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, IsaPred)) return TTI::TCC_Free; - auto *VecTy = getWidenedType(ScalarTy, VL.size()); + auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); InstructionCost GatherCost = 0; SmallVector Gathers(VL); if (!Root && isSplat(VL)) { @@ -15387,32 +15518,34 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask( MaskSlice, std::max(NumElts, MaskSlice.size()))) - Cost += - ::getShuffleCost(TTI, *ShuffleKinds[Part], - getWidenedType(ScalarTy, NumElts), MaskSlice); + Cost += ::getShuffleCost( + TTI, *ShuffleKinds[Part], + cast(getWidenedType(ScalarTy, NumElts)), MaskSlice); continue; } if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { - Cost += - ::getShuffleCost(TTI, *RegShuffleKind, - getWidenedType(ScalarTy, EltsPerVector), SubMask); + Cost += ::getShuffleCost( + TTI, *RegShuffleKind, + cast(getWidenedType(ScalarTy, EltsPerVector)), SubMask); } const unsigned BaseVF = getFullVectorNumberOfElements( *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector)); for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) { assert((Idx + SubVecSize) <= BaseVF && "SK_ExtractSubvector index out of range"); - Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, - getWidenedType(ScalarTy, BaseVF), {}, CostKind, - Idx, getWidenedType(ScalarTy, SubVecSize)); + Cost += ::getShuffleCost( + TTI, TTI::SK_ExtractSubvector, + cast(getWidenedType(ScalarTy, BaseVF)), {}, CostKind, + Idx, cast(getWidenedType(ScalarTy, SubVecSize))); } // Second attempt to check, if just a permute is better estimated than // subvector extract. 
SubMask.assign(NumElts, PoisonMaskElem); copy(MaskSlice, SubMask.begin()); InstructionCost OriginalCost = ::getShuffleCost( - TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask); + TTI, *ShuffleKinds[Part], + cast(getWidenedType(ScalarTy, NumElts)), SubMask); if (OriginalCost < Cost) Cost = OriginalCost; } @@ -16087,9 +16220,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { I1 = I2 + CommonMask.size(); } } - Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, - getWidenedType(ScalarTy, CommonMask.size()), - SVMask, CostKind); + Cost += ::getShuffleCost( + TTI, TTI::SK_PermuteTwoSrc, + cast(getWidenedType(ScalarTy, CommonMask.size())), + SVMask, CostKind); } for (auto [E, Idx] : SubVectors) { Type *EScalarTy = E->Scalars.front()->getType(); @@ -16112,8 +16246,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } Cost += ::getShuffleCost( TTI, TTI::SK_InsertSubvector, - getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx, - getWidenedType(ScalarTy, E->getVectorFactor())); + cast(getWidenedType(ScalarTy, CommonMask.size())), {}, + CostKind, Idx, + cast(getWidenedType(ScalarTy, E->getVectorFactor()))); if (!CommonMask.empty()) { std::iota(std::next(CommonMask.begin(), Idx), std::next(CommonMask.begin(), Idx + E->getVectorFactor()), @@ -16312,7 +16447,7 @@ uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) { InstructionCost BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy, - VectorType *VecTy, VectorType *FinalVecTy, + Type *VecTy, Type *FinalVecTy, TTI::TargetCostKind CostKind) const { InstructionCost SpillsReloads = 0; @@ -16338,8 +16473,7 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy, PressureByClass[RegClass] += Parts; }; - auto GetEntryVecTy = - [&](const TreeEntry *TE) -> std::pair { + auto GetEntryVecTy = [&](const TreeEntry *TE) -> std::pair { Type *ScalarTy = getValueType(TE->Scalars.front()); auto BWIt = MinBWs.find(TE); if 
(BWIt != MinBWs.end()) { @@ -16491,21 +16625,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VectorCost = 0; if (E->ReorderIndices.empty()) { VectorCost = ::getShuffleCost( - *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind, - E->CombinedEntriesWithIndices.back().second, - getWidenedType( + *TTI, TTI::SK_InsertSubvector, cast(FinalVecTy), {}, + CostKind, E->CombinedEntriesWithIndices.back().second, + cast(getWidenedType( ScalarTy, VectorizableTree[E->CombinedEntriesWithIndices.back().first] - ->getVectorFactor())); + ->getVectorFactor()))); } else { unsigned CommonVF = std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first] ->getVectorFactor(), VectorizableTree[E->CombinedEntriesWithIndices.back().first] ->getVectorFactor()); - VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, - getWidenedType(ScalarTy, CommonVF), - E->getSplitMask(), CostKind); + VectorCost = + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + cast(getWidenedType(ScalarTy, CommonVF)), + E->getSplitMask(), CostKind); } VectorCost += SpillsReloads; LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree")); @@ -16529,8 +16664,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (!E->ReuseShuffleIndices.empty()) ::addMask(Mask, E->ReuseShuffleIndices); if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) - CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, - Mask, CostKind, /*Index=*/0, VecTy); + CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + cast(FinalVecTy), Mask, CostKind, + /*Index=*/0, cast(VecTy)); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || E->State == TreeEntry::StridedVectorize || @@ -16638,8 +16774,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, "MaskedLoadCompressVectorize here."); InstructionCost ScalarCost = 0; InstructionCost VecCost = 0; - 
std::tie(ScalarCost, VecCost) = getGEPCosts( - *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy); + std::tie(ScalarCost, VecCost) = + getGEPCosts(*TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, + cast(VecTy)); LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, "Calculated GEPs cost for Tree")); @@ -16724,7 +16861,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, NumElts = ATy->getNumElements(); else NumElts = AggregateTy->getStructNumElements(); - SrcVecTy = getWidenedType(OrigScalarTy, NumElts); + SrcVecTy = cast(getWidenedType(OrigScalarTy, NumElts)); } } if (I->hasOneUse()) { @@ -16829,7 +16966,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // need to shift the vector. // Do not calculate the cost if the actual size is the register size and // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz); + auto *InsertVecTy = cast(getWidenedType(ScalarTy, InsertVecSz)); if (!IsIdentity) Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc, InsertVecTy, Mask); @@ -16845,7 +16982,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { if (InsertVecSz != VecSz) { - auto *ActualVecTy = getWidenedType(ScalarTy, VecSz); + auto *ActualVecTy = cast(getWidenedType(ScalarTy, VecSz)); Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {}, CostKind, OffsetBeg - Offset, InsertVecTy); } else { @@ -17028,10 +17165,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // (e.g. condition is while result is ). For // compares, the result type IS the mask (i1/vNi1). Construct the // right type so getCmpSelInstrCost sees the actual mask/result width. - auto *MaskTy = getWidenedType(ShuffleOrOp == Instruction::Select - ? 
VL0->getOperand(0)->getType() - : VL0->getType(), - VL.size()); + auto *MaskTy = cast(getWidenedType( + ShuffleOrOp == Instruction::Select ? VL0->getOperand(0)->getType() + : VL0->getType(), + VL.size())); InstructionCost VecCost = InstructionCost::getInvalid(); if (ShuffleOrOp == Instruction::Select) { @@ -17483,7 +17620,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto *CI = cast(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); SmallVector ArgTys = buildIntrinsicArgTypes( - CI, ID, VecTy->getNumElements(), + CI, ID, getNumElements(VecTy), It != MinBWs.end() ? It->second.first : 0, TTI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; @@ -17596,7 +17733,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, }, Mask); VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask, CostKind); + cast(FinalVecTy), Mask, CostKind); // Patterns like [fadd,fsub] can be combined into a single instruction // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we // need to take into account their order when looking for the most used @@ -17607,9 +17744,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1)); // If this pattern is supported by the target then we consider the // order. - if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { + if (TTIRef.isLegalAltInstr(cast(VecTy), Opcode0, Opcode1, + OpcodeMask)) { InstructionCost AltVecCost = TTIRef.getAltInstrCost( - VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + cast(VecTy), Opcode0, Opcode1, OpcodeMask, CostKind); return AltVecCost < VecCost ? AltVecCost : VecCost; } // TODO: Check the reverse order too. 
@@ -17645,7 +17783,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return true; })) return ::getShuffleCost( - *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy, + *TTI, TargetTransformInfo::SK_PermuteSingleSrc, + cast(VecTy), calculateShufflevectorMask(E->Scalars)); } return TTI::TCC_Free; @@ -17724,6 +17863,18 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return true; } + // FIXME: support buildvector of the gather nodes with struct types. + if (any_of(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->isGather() && TE->hasState() && + TE->getOpcode() == Instruction::Call && + isa(TE->getMainOp()->getType()); + })) { + LLVM_DEBUG( + dbgs() << "SLP: rejecting tree with buildvector struct values of size " + << VectorizableTree.size() << ".\n"); + return true; + } + // Cache values from the root node and the cost-threshold options to avoid // re-querying them inside hot predicates below. const unsigned TreeSize = VectorizableTree.size(); @@ -18030,7 +18181,8 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { if (BackVF > 2 && allSameBlock(Back.Scalars) && !Back.Scalars.front()->getType()->isVectorTy() && TTI->getScalarizationOverhead( - getWidenedType(Back.Scalars.front()->getType(), BackVF), + cast( + getWidenedType(Back.Scalars.front()->getType(), BackVF)), APInt::getAllOnes(BackVF), /*Insert=*/true, /*Extract=*/false, TTI::TCK_RecipThroughput) > -SLPCostThreshold) @@ -18721,7 +18873,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( GatheredLoadsNodes.insert(&TE); if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize && !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement || - TE.getOpcode() == Instruction::Store))) { + TE.getOpcode() == Instruction::Store)) && + !isa(getValueType(TE.Scalars.front()))) { // Calculate costs of external uses. 
APInt DemandedElts = APInt::getZero(TE.getVectorFactor()); for (Value *V : TE.Scalars) { @@ -18734,9 +18887,10 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( if (It != MinBWs.end()) ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor()); - InstructionCost ExtCost = ::getScalarizationOverhead( - *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/false, - /*Extract=*/true, CostKind); + InstructionCost ExtCost = + ::getScalarizationOverhead(*TTI, ScalarTy, cast(VecTy), + DemandedElts, /*Insert=*/false, + /*Extract=*/true, CostKind); if (ExtCost.isValid() && ExtCost != 0) { if (!Scale) Scale = getScaleToLoopIterations(TE); @@ -18834,6 +18988,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) { TreeEntry *TE = Worklist.top().first; if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) || + isa(getValueType(TE->Scalars.front())) || // Exit early if the parent node is split node and any of scalars is // used in other split nodes. 
(TE->UserTreeIndex && @@ -18895,7 +19050,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, EntryVF); InstructionCost GatherCost = ::getScalarizationOverhead( - *TTI, ScalarTy, VecTy, DemandedElts, + *TTI, ScalarTy, cast(VecTy), DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind); SmallVector Mask; if (!TE->ReorderIndices.empty() && @@ -18915,8 +19070,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( if (!TE->ReuseShuffleIndices.empty()) ::addMask(Mask, TE->ReuseShuffleIndices); if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF)) - GatherCost += - ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask); + GatherCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + cast(VecTy), Mask); // If all scalars are reused in gather node(s) or other vector nodes, there // might be extra cost for inserting them. if ((!TE->hasState() || !TE->isAltShuffle()) && @@ -19012,7 +19167,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor()); InstructionCost ExtractsCost = ::getScalarizationOverhead( - *TTI, ScalarTy, VecTy, DemandedElts, + *TTI, ScalarTy, cast(VecTy), DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); InstructionCost BVCost = 0; for (const auto &[BVE, Values] : ValuesToInsert) { @@ -19026,7 +19181,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( } auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor()); BVCost += ::getScalarizationOverhead( - *TTI, ScalarTy, BVVecTy, BVDemandedElts, + *TTI, ScalarTy, cast(BVVecTy), BVDemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind, BVDemandedElts.isAllOnes(), BVValues); } @@ -19356,14 +19511,20 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, 
? Instruction::ZExt : Instruction::SExt; VecTy = getWidenedType(MinTy, BundleWidth); - ExtraCost = - getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane); + ExtraCost = getExtractWithExtendCost(*TTI, Extend, ScalarTy, + cast(VecTy), EU.Lane); LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: " << ExtraCost << "\n"); } else { - ExtraCost = - getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy, - CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx); + Type *ExtractTy = VecTy; + if (auto *ST = dyn_cast(VecTy)) { + assert(EU.User && "Expected user for struct extract"); + const auto *EV = cast(EU.User); + ExtractTy = ExtractValueInst::getIndexedType(ST, EV->getIndices()); + } + ExtraCost = getVectorInstrCost( + *TTI, ScalarTy, Instruction::ExtractElement, ExtractTy, CostKind, + EU.Lane, EU.Scalar, ScalarUserAndIdx); LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from " << *VecTy << ": " << ExtraCost << "\n"); } @@ -19467,6 +19628,11 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, if (KeepScalar) { ExternalUsesAsOriginalScalar.insert(EU.Scalar); for (Value *V : Inst->operands()) { + // Struct operands cannot be rebuilt by the !User extraction + // path (it has no insertvalue chain), so leave their existing + // ExtractValueInst user in place. + if (isa(V->getType())) + continue; auto It = ValueToExtUses->find(V); if (It != ValueToExtUses->end()) { // Replace all uses to avoid compiler crash. @@ -19482,6 +19648,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, // compiler crash. if (auto *IOp = dyn_cast(Inst->getOperand(0))) { for (Value *V : IOp->operands()) { + if (isa(V->getType())) + continue; auto It = ValueToExtUses->find(V); if (It != ValueToExtUses->end()) { // Replace all uses to avoid compiler crash. 
@@ -19604,9 +19772,10 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, SmallVector OrigMask(VecVF, PoisonMaskElem); std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), OrigMask.begin()); - C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - getWidenedType(TE->getMainOp()->getType(), VecVF), - OrigMask); + C = ::getShuffleCost( + *TTI, TTI::SK_PermuteSingleSrc, + cast(getWidenedType(TE->getMainOp()->getType(), VecVF)), + OrigMask); LLVM_DEBUG( dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users.\n"; @@ -19622,9 +19791,10 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, ResizeMask[Mask[I]] = Mask[I]; } if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF)) - C = ::getShuffleCost( - *TTI, TTI::SK_PermuteSingleSrc, - getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask); + C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + cast(getWidenedType( + TE->getMainOp()->getType(), VecVF)), + ResizeMask); LLVM_DEBUG( dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users.\n"; @@ -19654,8 +19824,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, (Data.index() < VF && static_cast(Data.index()) == Data.value()); })) { - InstructionCost C = - ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask); + InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + cast(FTy), Mask); C = ScaleCost(C, *TEs.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement " @@ -19673,8 +19843,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, VF = Mask.size(); } auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); - InstructionCost C = - ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); + InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + cast(FTy), Mask); C = ScaleCost(C, *TEs.back()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << 
C << " for final shuffle of vector node and external " @@ -20566,8 +20736,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *VecTy = getWidenedType(VL.front()->getType(), NewVF); - auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); + auto *VecTy = + cast(getWidenedType(VL.front()->getType(), NewVF)); + auto *MaskVecTy = + cast(getWidenedType(VL.front()->getType(), SubMask.size())); auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef Mask, ArrayRef Entries, @@ -20775,16 +20947,16 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, any_of(VL, [](Value *V) { return !isa(V) && isConstant(V); }); // 1. Shuffle input source vector and constant vector. if (!ForPoisonSrc && IsAnyNonUndefConst) { - Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy, - ConstantShuffleMask); + Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, + cast(VecTy), ConstantShuffleMask); } // 2. Insert unique non-constants. 
if (!DemandedElements.isZero()) - Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements, - /*Insert=*/true, - /*Extract=*/false, CostKind, - ForPoisonSrc && !IsAnyNonUndefConst, VL); + Cost += getScalarizationOverhead( + *TTI, ScalarTy, cast(VecTy), DemandedElements, + /*Insert=*/true, + /*Extract=*/false, CostKind, ForPoisonSrc && !IsAnyNonUndefConst, VL); return Cost; } @@ -22307,8 +22479,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, for (auto [Idx, I] : enumerate(BVMask)) if (I != PoisonMaskElem) NewMask[Idx] = Mask.size(); - SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, - NewMask, CostKind); + SplatCost += + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + cast(VecTy), NewMask, CostKind); InstructionCost BVCost = TTI->getVectorInstrCost( Instruction::InsertElement, VecTy, CostKind, *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V); @@ -22320,7 +22493,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, if (I != PoisonMaskElem) NewMask[Idx] = I; BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - VecTy, NewMask, CostKind); + cast(VecTy), NewMask, + CostKind); } return SplatCost <= BVCost; }; @@ -22517,6 +22691,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool IsReverseOrder = !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); auto FinalShuffle = [&](Value *V, const TreeEntry *E) { + if (isa(ScalarTy)) { + // TODO: Reordering of struct types is not supported. 
+ assert(E->ReorderIndices.empty() && + "Expected no reordering for struct types."); + assert(E->ReuseShuffleIndices.empty() && + "Expected no reuse shuffle indices for struct types."); + return V; + } ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); if (E->getOpcode() == Instruction::Store && E->State == TreeEntry::Vectorize) { @@ -22648,6 +22830,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { + SmallVector Indices; + SmallVector Calls; + if (checkEVsForVecCalls(E->Scalars, E->getOperations(), *TLI, Indices, + Calls)) { + setInsertPointAfterBundle(E); + Value *V = vectorizeOperand(E, 0); + V = Builder.CreateExtractValue(V, Indices); + if (auto *I = dyn_cast(V)) + V = ::propagateMetadata(I, E->Scalars); + V = FinalShuffle(V, E); + E->VectorizedValue = V; + return V; + } auto *LI = cast(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); Value *Ptr = LI->getPointerOperand(); @@ -23181,7 +23376,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Builder.getInt64(I % ScalarTyNumElements); }); VecPtr = Builder.CreateGEP( - VecTy->getElementType(), + toScalarizedTy(VecTy), Builder.CreateShuffleVector( VecPtr, createReplicatedMask(ScalarTyNumElements, VF)), ConstantVector::get(Indices)); @@ -23240,7 +23435,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::experimental_vp_strided_store, {VecTy, Ptr->getType(), StrideTy}, {VecValue, Ptr, StrideVal, - Builder.getAllOnesMask(VecTy->getElementCount()), + Builder.getAllOnesMask( + ElementCount::getFixed(getNumElements(VecTy))), Builder.getInt32(E->Scalars.size())}); Inst->addParamAttr( /*ArgNo=*/1, @@ -23290,7 +23486,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); SmallVector ArgTys = buildIntrinsicArgTypes( - CI, ID, VecTy->getNumElements(), + CI, ID, getNumElements(VecTy), It != MinBWs.end() ? 
It->second.first : 0, TTI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); bool UseIntrinsic = ID != Intrinsic::not_intrinsic && @@ -23300,8 +23496,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector OpVecs; SmallVector TysForDecl; // Add return type if intrinsic is overloaded on it. - if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) - TysForDecl.push_back(VecTy); + if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) { + ArrayRef ContainedTys = getContainedTypes(VecTy); + for (auto [Idx, Ty] : enumerate(ContainedTys)) { + if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, Idx, TTI)) + TysForDecl.push_back(Ty); + } + } auto *CEI = cast(VL0); for (unsigned I : seq(0, CI->arg_size())) { // Some intrinsics have scalar arguments. This argument should not be @@ -23325,7 +23526,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ScalarArg->getType()->getScalarType() && It == MinBWs.end()) { auto *CastTy = - getWidenedType(ScalarArg->getType(), VecTy->getNumElements()); + getWidenedType(ScalarArg->getType(), getNumElements(VecTy)); OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I)); } else if (It != MinBWs.end()) { OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I)); @@ -23340,7 +23541,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!UseIntrinsic) { VFShape Shape = VFShape::get(CI->getFunctionType(), - ElementCount::getFixed(VecTy->getNumElements()), + ElementCount::getFixed(getNumElements(VecTy)), false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { @@ -23807,10 +24008,21 @@ Value *BoUpSLP::vectorizeTree( SmallVector> ShuffledInserts; // Maps vector instruction to original insertelement instruction DenseMap VectorToInsertElement; - // Maps extract Scalar to the corresponding extractelement instruction in the - // basic block. Only one extractelement per block should be emitted. 
- DenseMap>> + // Maps extract Scalar (plus the struct-field index path, when extracting + // from a struct of vectors) to the corresponding extractelement instruction + // in the basic block. Only one extractelement per block should be emitted. + // The index path is stored in an owning SmallVector so the key remains + // valid after the per-lane ExtractValueInst (whose Indices buffer it was + // copied from) is erased later in this loop. + SmallDenseMap>, + DenseMap>> ScalarToEEs; + // Maps (struct-of-vectors Vec, field-index path) to the corresponding + // per-block extractvalue, so different external lanes that need the same + // struct field of the same vectorized call share a single extractvalue. + SmallDenseMap>, + DenseMap> + StructFieldExtracts; SmallDenseSet UsedInserts; DenseMap, Value *> VectorCasts; SmallDenseSet ScalarsWithNullptrUser; @@ -23842,7 +24054,18 @@ Value *BoUpSLP::vectorizeTree( Value *ExV = nullptr; auto *Inst = dyn_cast(Scalar); bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst); - auto It = ScalarToEEs.find(Scalar); + // For struct-typed scalars, the User must be an ExtractValueInst that + // describes which struct field is being extracted. Copy its indices + // into an owning SmallVector so the cache key survives erasure of the + // ExtractValueInst. + SmallVector Indices; + if (isa(Scalar->getType())) { + assert(User && "User expected for StructType extract."); + const auto *EV = cast(User); + Indices.assign(EV->getIndices()); + } + auto Key = std::make_pair(Scalar, Indices); + auto It = ScalarToEEs.find(Key); if (It != ScalarToEEs.end()) { // No need to emit many extracts, just move the only one in the // current block. 
@@ -23900,19 +24123,53 @@ Value *BoUpSLP::vectorizeTree( Ex = createExtractVector(Builder, Vec, VecTyNumElements, ExternalUse.Lane * VecTyNumElements); } else { - Ex = Builder.CreateExtractElement(Vec, Lane); + if (isa(Vec->getType())) { + assert(isa(Scalar->getType()) && + "Vec is struct of vectors only when Scalar is struct."); + auto FieldKey = std::make_pair(Vec, Indices); + BasicBlock *EVBB = Builder.GetInsertBlock(); + Value *FieldVec = nullptr; + auto FieldIt = StructFieldExtracts.find(FieldKey); + if (FieldIt != StructFieldExtracts.end()) { + auto BBIt = FieldIt->second.find(EVBB); + if (BBIt != FieldIt->second.end()) + FieldVec = BBIt->second; + } + if (!FieldVec) { + FieldVec = Builder.CreateExtractValue(Vec, Indices); + StructFieldExtracts[FieldKey][EVBB] = FieldVec; + } else if (auto *FieldI = dyn_cast(FieldVec); + FieldI && Builder.GetInsertPoint() != EVBB->end() && + Builder.GetInsertPoint()->comesBefore(FieldI)) { + // Cached extractvalue is below the current insertion point; + // move it up so the extractelement we are about to emit can + // use it. + FieldI->moveBefore(*EVBB, Builder.GetInsertPoint()); + } + Vec = FieldVec; + } + if (SLPReVec && isVectorizedTy(Scalar->getType())) { + unsigned VecTyNumElements = getNumElements(Scalar->getType()); + // When REVEC is enabled, we need to extract a vector. + // Note: The element size of Scalar may be different from the + // element size of Vec. + Ex = createExtractVector(Builder, Vec, VecTyNumElements, + ExternalUse.Lane * VecTyNumElements); + } else { + Ex = Builder.CreateExtractElement(Vec, Lane); + } } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. ExV = Ex; - if (Scalar->getType() != Ex->getType()) + if (!isa(Scalar->getType()) && + Scalar->getType() != Ex->getType()) ExV = Builder.CreateIntCast( Ex, Scalar->getType(), !isKnownNonNegative(Scalar, SimplifyQuery(*DL))); auto *I = dyn_cast(Ex); - ScalarToEEs[Scalar].try_emplace(I ? 
I->getParent() - : &F->getEntryBlock(), - std::make_pair(Ex, ExV)); + ScalarToEEs[Key].try_emplace(I ? I->getParent() : &F->getEntryBlock(), + std::make_pair(Ex, ExV)); } // The then branch of the previous if may produce constants, since 0 // operand might be a constant. @@ -24079,7 +24336,13 @@ Value *BoUpSLP::vectorizeTree( } else { Builder.SetInsertPoint(cast(User)); Value *NewInst = ExtractAndExtendIfNeeded(Vec); - User->replaceUsesOfWith(Scalar, NewInst); + if (isa(Scalar->getType()) && + isa_and_nonnull(User)) { + User->replaceAllUsesWith(NewInst); + eraseInstruction(cast(User)); + } else { + User->replaceUsesOfWith(Scalar, NewInst); + } } } else { Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); @@ -27645,7 +27908,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). auto *VecTy = getWidenedType(ScalarTy, VF); - if (TTI->getNumberOfParts(VecTy) == VF) + if (getNumberOfParts(*TTI, VecTy, ScalarTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { unsigned ActualVF = std::min(MaxInst - I, VF); @@ -28723,14 +28986,15 @@ class HorizontalReduction { Type *ScalarTy = Candidates.front()->getType(); ReduxWidth = getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth); - VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); + VectorType *Tp = cast(getWidenedType(ScalarTy, ReduxWidth)); NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy); NumRegs = TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); while (NumParts > NumRegs) { assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0."); ReduxWidth = bit_floor(ReduxWidth - 1); - VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); + VectorType *Tp = + cast(getWidenedType(ScalarTy, ReduxWidth)); NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy); NumRegs = TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); @@ -29338,14 +29602,16 @@ 
class HorizontalReduction { } else { VectorCost = TTI->getExtendedReductionCost( RdxOpcode, !IsSigned, RedTy, - getWidenedType(RType, ReduxWidth), FMF, CostKind); + cast(getWidenedType(RType, ReduxWidth)), FMF, + CostKind); } } } else { Type *RedTy = VectorTy->getElementType(); auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); - VectorType *RVecTy = getWidenedType(RType, ReduxWidth); + VectorType *RVecTy = + cast(getWidenedType(RType, ReduxWidth)); InstructionCost FMACost = InstructionCost::getInvalid(); if (RdxKind == RecurKind::FAdd) { // Check if the reduction operands can be converted to FMA. @@ -29417,7 +29683,8 @@ class HorizontalReduction { Type *RedTy = VectorTy->getElementType(); auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); - VectorType *RVecTy = getWidenedType(RType, ReduxWidth); + VectorType *RVecTy = + cast(getWidenedType(RType, ReduxWidth)); IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF); VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind); if (RType != RedTy) { @@ -30181,7 +30448,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!HorRdx.matchReductionForOperands()) return false; // Check the cost of operations. 
- VectorType *VecTy = getWidenedType(Ty, Ops.size()); + auto *VecTy = cast(getWidenedType(Ty, Ops.size())); constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost ScalarCost = TTI.getScalarizationOverhead( @@ -30581,6 +30848,17 @@ static bool isNonVectorizableInst(const Instruction *I, } if (isa(I)) return true; + if (const auto *EV = dyn_cast(I)) { + const auto *Arg = EV->getAggregateOperand(); + if (const auto *CI = dyn_cast(Arg)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (isTriviallyVectorizable(ID)) + return true; + if (!VFDatabase::getMappings(*CI).empty()) + return true; + return false; + } + } if (const auto *RI = dyn_cast(I)) return RI->getNumOperands() > 0 && (SLPReVec || !I->getOperand(0)->getType()->isVectorTy()) && @@ -30609,6 +30887,10 @@ static void forEachOperandChainCandidate(Instruction *I, Func F, F(AI->getNewValOperand(), 1); return; } + if (auto *EV = dyn_cast(I)) { + F(EV->getAggregateOperand(), 0); + return; + } if (ForReduction && !NonVectReductions) return; if (auto *SI = dyn_cast(I)) { @@ -30765,7 +31047,7 @@ bool SLPVectorizerPass::vectorizeNonVectorizableInsts( auto *OpI = dyn_cast(Op); if (!OpI || OpI->getParent() != BB || R.isDeleted(OpI) || isa(OpI) || - !isValidElementType(OpI->getType())) + (!isValidElementType(OpI->getType()) && !isa(OpI))) return; if (!Seen.insert(OpI).second) return; @@ -31232,7 +31514,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (auto *ValTy = dyn_cast( PostProcessStores.front()->getValueOperand()->getType())) ScalarTy = ::getWidenedType(ScalarTy, getNumElements(ValTy)); - auto *VecTy = ::getWidenedType(ScalarTy, PostProcessStores.size()); + auto *VecTy = cast( + ::getWidenedType(ScalarTy, PostProcessStores.size())); InstructionCost ExtractsCost = ::getScalarizationOverhead( *TTI, ScalarTy, VecTy, APInt::getAllOnes(PostProcessStores.size()), /*Insert=*/false, /*Extract=*/true, TTI::TCK_RecipThroughput, diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll index 8d7dd9b9621c8..5d3dd1661fb8f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 @a64 = common global [8 x i64] 
zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,48 +26,146 @@ declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.sadd.with.overflow.i8 (i8 , i8 ) define void @add_v8i64() { -; CHECK-LABEL: @add_v8i64( -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } 
@llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 -; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr 
@c64, i32 0, i64 7), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: 
[[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v8i64( +; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr 
inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SLM-NEXT: 
[[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; 
AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v8i64( +; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: 
[[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v8i64( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -113,88 +211,106 @@ define void @add_v8i64() { } define void @add_v16i32() { -; CHECK-LABEL: @add_v16i32( -; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x 
i32], ptr @a32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; 
CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A8]], i32 
[[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 -; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; 
CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr 
getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr 
getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } 
[[TMP3]], 0 +; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v16i32( +; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } 
[[TMP3]], 0 +; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v16i32( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -280,168 +396,106 @@ define void @add_v16i32() { } define void @add_v32i16() { -; CHECK-LABEL: @add_v32i16( -; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; 
CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 
18), align 2 -; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; CHECK-NEXT: 
[[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 
22), align 2 -; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A0]], i16 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A1]], i16 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A2]], i16 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A3]], i16 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A4]], i16 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A5]], i16 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A6]], i16 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A7]], i16 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A8]], i16 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A9]], 
i16 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A10]], i16 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A11]], i16 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A12]], i16 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A13]], i16 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A14]], i16 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A15]], i16 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A16]], i16 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A17]], i16 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A18]], i16 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A19]], i16 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A20]], i16 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A21]], i16 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A22]], i16 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A23]], i16 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A24]], i16 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A25]], i16 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A26]], i16 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A27]], i16 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A28]], i16 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } 
@llvm.sadd.with.overflow.i16(i16 [[A29]], i16 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A30]], i16 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A31]], i16 [[B31]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 -; 
CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 -; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 -; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 -; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 -; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 -; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 -; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 -; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 -; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 -; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 -; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 -; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 -; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 -; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 -; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 -; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 -; CHECK-NEXT: store i16 
[[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 -; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 -; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 -; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 -; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 -; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 -; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 -; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 -; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 -; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 -; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 -; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 -; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 -; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> 
[[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; 
SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v32i16( +; AVX-NEXT: 
[[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX2-NEXT: ret void +; +; 
KNL-LABEL: @add_v32i16( +; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v32i16( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; 
AVX512_256-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -607,328 +661,106 @@ define void @add_v32i16() { } define void @add_v64i8() { -; CHECK-LABEL: @add_v64i8( -; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 -; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), 
align 1 -; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 -; 
CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 -; CHECK-NEXT: 
[[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 -; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), 
align 1 -; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 -; CHECK-NEXT: 
[[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[B43:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[B61:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A0]], i8 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A1]], i8 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A2]], i8 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A3]], i8 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A4]], i8 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A5]], i8 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A6]], i8 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A7]], i8 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A8]], i8 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A9]], i8 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A10]], i8 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A11]], i8 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A12]], i8 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A13]], i8 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A14]], i8 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A15]], i8 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A16]], i8 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A17]], i8 
[[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A18]], i8 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A19]], i8 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A20]], i8 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A21]], i8 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A22]], i8 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A23]], i8 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A24]], i8 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A25]], i8 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A26]], i8 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A27]], i8 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A28]], i8 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A29]], i8 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A30]], i8 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A31]], i8 [[B31]]) -; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A32]], i8 [[B32]]) -; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A33]], i8 [[B33]]) -; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A34]], i8 [[B34]]) -; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A35]], i8 [[B35]]) -; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A36]], i8 [[B36]]) -; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A37]], i8 [[B37]]) -; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } 
@llvm.sadd.with.overflow.i8(i8 [[A38]], i8 [[B38]]) -; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A39]], i8 [[B39]]) -; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A40]], i8 [[B40]]) -; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A41]], i8 [[B41]]) -; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A42]], i8 [[B42]]) -; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A43]], i8 [[B43]]) -; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A44]], i8 [[B44]]) -; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A45]], i8 [[B45]]) -; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A46]], i8 [[B46]]) -; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A47]], i8 [[B47]]) -; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A48]], i8 [[B48]]) -; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A49]], i8 [[B49]]) -; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A50]], i8 [[B50]]) -; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A51]], i8 [[B51]]) -; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A52]], i8 [[B52]]) -; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A53]], i8 [[B53]]) -; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A54]], i8 [[B54]]) -; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A55]], i8 [[B55]]) -; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A56]], i8 [[B56]]) -; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A57]], i8 [[B57]]) -; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A58]], i8 [[B58]]) -; 
CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A59]], i8 [[B59]]) -; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A60]], i8 [[B60]]) -; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A61]], i8 [[B61]]) -; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A62]], i8 [[B62]]) -; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A63]], i8 [[B63]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 
-; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 -; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 -; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 -; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 -; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 -; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 -; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 -; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 -; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 -; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 -; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 -; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 -; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 -; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 -; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 -; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 -; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 -; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 -; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 -; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 -; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 -; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 -; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 -; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 -; CHECK-NEXT: 
[[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 -; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 -; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 -; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 -; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 -; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 -; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 -; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 -; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 -; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 -; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 -; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 -; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 -; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 -; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 -; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 -; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 -; CHECK-NEXT: store i8 [[R14]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 -; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 -; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 -; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 -; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 -; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 -; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 -; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 -; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 -; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 -; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 -; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 -; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 -; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 -; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 -; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 -; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 -; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; CHECK-NEXT: store i8 [[R33]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 -; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 -; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 -; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 -; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 -; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 -; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 -; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 -; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 -; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 -; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 -; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 -; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 -; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 -; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 -; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 -; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 -; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 -; CHECK-NEXT: store i8 [[R52]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 -; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 -; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 -; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 -; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 -; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 -; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 -; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 -; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 -; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 -; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 -; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP5]], 
<16 x i8> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v64i8( +; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; 
SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: 
[[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v64i8( +; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: 
[[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v64i8( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1252,3 +1084,5 @@ define void @add_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll index fc67cec60f177..f5d2212cbe584 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s 
--check-prefixes=CHECK,AVX512_256 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,48 +26,146 @@ declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.uadd.with.overflow.i8 (i8 , i8 ) define void @add_v8i64() { -; CHECK-LABEL: @add_v8i64( -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; 
CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 -; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; CHECK-NEXT: store i64 [[R7]], 
ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } 
@llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v8i64( +; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), 
align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } 
@llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr 
getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v8i64( +; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v8i64( +; 
AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v8i64( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -113,88 +211,106 @@ define void @add_v8i64() { } define void @add_v16i32() { -; CHECK-LABEL: @add_v16i32( -; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), 
align 4 -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[B5:%.*]] = load 
i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; CHECK-NEXT: 
[[C8:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 -; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[R2]], 
ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 
0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <4 x i32> [[TMP4]], ptr 
@c32, align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; 
AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v16i32( +; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; 
KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v16i32( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -280,168 +396,106 @@ define void @add_v16i32() { } define void @add_v32i16() { -; CHECK-LABEL: @add_v32i16( -; 
CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[A18:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x 
i16], ptr @b16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[B22:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A0]], i16 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A1]], i16 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A2]], i16 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A3]], i16 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A4]], i16 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A5]], i16 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A6]], i16 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A7]], i16 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A8]], i16 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = 
call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A9]], i16 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A10]], i16 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A11]], i16 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A12]], i16 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A13]], i16 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A14]], i16 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A15]], i16 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A16]], i16 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A17]], i16 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A18]], i16 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A19]], i16 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A20]], i16 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A21]], i16 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A22]], i16 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A23]], i16 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A24]], i16 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A25]], i16 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A26]], i16 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A27]], i16 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A28]], i16 [[B28]]) -; 
CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A29]], i16 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A30]], i16 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A31]], i16 [[B31]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = 
extractvalue { i16, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 -; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 -; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 -; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 -; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 -; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 -; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 -; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 -; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 -; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 -; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 -; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 -; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 -; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 -; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 -; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, 
i64 15), align 2 -; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 -; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 -; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 -; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 -; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 -; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 -; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 -; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 -; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 -; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 -; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 -; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 -; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 -; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } 
@llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; 
SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; 
SLM-NEXT: ret void +; +; AVX-LABEL: @add_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], 
ptr @c16, i32 0, i64 16), align 2 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v32i16( +; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v32i16( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds 
([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -607,328 +661,106 @@ define void @add_v32i16() { } define void @add_v64i8() { -; CHECK-LABEL: @add_v64i8( -; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 -; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @a8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 
0, i64 51), align 1 -; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 -; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds 
([64 x i8], ptr @b8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 
0, i64 24), align 1 -; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 
-; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 -; CHECK-NEXT: 
[[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A0]], i8 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A1]], i8 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A2]], i8 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A3]], i8 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A4]], i8 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A5]], i8 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A6]], i8 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A7]], i8 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A8]], i8 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A9]], i8 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A10]], i8 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A11]], i8 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A12]], i8 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A13]], i8 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A14]], i8 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A15]], i8 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A16]], i8 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } 
@llvm.uadd.with.overflow.i8(i8 [[A17]], i8 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A18]], i8 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A19]], i8 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A20]], i8 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A21]], i8 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A22]], i8 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A23]], i8 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A24]], i8 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A25]], i8 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A26]], i8 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A27]], i8 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A28]], i8 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A29]], i8 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A30]], i8 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A31]], i8 [[B31]]) -; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A32]], i8 [[B32]]) -; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A33]], i8 [[B33]]) -; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A34]], i8 [[B34]]) -; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A35]], i8 [[B35]]) -; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A36]], i8 [[B36]]) -; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A37]], i8 [[B37]]) -; 
CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A38]], i8 [[B38]]) -; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A39]], i8 [[B39]]) -; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A40]], i8 [[B40]]) -; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A41]], i8 [[B41]]) -; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A42]], i8 [[B42]]) -; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A43]], i8 [[B43]]) -; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A44]], i8 [[B44]]) -; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A45]], i8 [[B45]]) -; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A46]], i8 [[B46]]) -; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A47]], i8 [[B47]]) -; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A48]], i8 [[B48]]) -; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A49]], i8 [[B49]]) -; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A50]], i8 [[B50]]) -; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A51]], i8 [[B51]]) -; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A52]], i8 [[B52]]) -; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A53]], i8 [[B53]]) -; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A54]], i8 [[B54]]) -; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A55]], i8 [[B55]]) -; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A56]], i8 [[B56]]) -; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A57]], i8 [[B57]]) -; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } 
@llvm.uadd.with.overflow.i8(i8 [[A58]], i8 [[B58]]) -; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A59]], i8 [[B59]]) -; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A60]], i8 [[B60]]) -; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A61]], i8 [[B61]]) -; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A62]], i8 [[B62]]) -; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A63]], i8 [[B63]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 -; 
CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 -; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 -; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 -; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 -; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 -; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 -; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 -; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 -; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 -; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 -; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 -; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 -; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 -; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 -; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 -; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 -; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 -; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 -; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 -; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 -; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 -; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 -; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 -; CHECK-NEXT: 
[[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 -; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 -; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 -; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 -; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 -; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 -; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 -; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 -; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 -; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 -; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 -; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 -; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 -; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 -; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 -; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 -; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 -; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, 
i32 0, i64 13), align 1 -; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 -; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 -; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 -; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 -; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 -; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 -; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 -; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 -; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 -; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 -; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 -; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 -; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 -; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 -; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 -; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 -; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 -; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 
0, i64 32), align 1 -; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 -; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 -; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 -; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 -; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 -; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 -; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 -; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 -; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 -; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 -; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 -; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 -; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 -; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 -; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 -; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 -; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 -; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, 
i64 51), align 1 -; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 -; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 -; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 -; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 -; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 -; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 -; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 -; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 -; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 -; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 -; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 -; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @add_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> 
} @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; SLM-LABEL: @add_v64i8( +; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds 
([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SLM-NEXT: ret void +; +; AVX-LABEL: @add_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr 
@a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @add_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @add_v64i8( +; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @add_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x 
i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @add_v64i8( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1252,3 +1084,5 @@ define void @add_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll index 72a3ddd0bb747..c7470f28d1c7b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s 
--check-prefixes=CHECK,AVX512_256 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,48 +26,126 @@ declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.smul.with.overflow.i8 (i8 , i8 ) define void @mul_v8i64() { -; CHECK-LABEL: @mul_v8i64( -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; 
CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 -; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; CHECK-NEXT: store i64 [[R7]], 
ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } 
@llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v8i64( +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8 +; SLM-NEXT: [[TMP3:%.*]] = call 
{ <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <2 x i64> [[TMP4]], ptr @c64, align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[TMP7:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SLM-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[TMP11:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SLM-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[TMP15:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v8i64( +; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; KNL-NEXT: 
[[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <8 x i64> [[TMP4]], ptr @c64, align 8 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v8i64( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -113,88 
+191,226 @@ define void @mul_v8i64() { } define void @mul_v16i32() { -; CHECK-LABEL: @mul_v16i32( -; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 
-; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i32, 
i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; CHECK-NEXT: 
[[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 -; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v16i32( +; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr 
inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; SSE-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; SSE-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; SSE-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; SSE-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; SSE-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; SSE-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; SSE-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; SSE-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; SSE-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; SSE-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; SSE-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; SSE-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; SSE-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; SSE-NEXT: [[B4:%.*]] = 
load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; SSE-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; SSE-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; SSE-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; SSE-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; SSE-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; SSE-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; SSE-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; SSE-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; SSE-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; SSE-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; SSE-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { 
i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; SSE-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; SSE-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; SSE-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; SSE-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; SSE-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; SSE-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; SSE-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; SSE-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; SSE-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; SSE-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; SSE-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; SSE-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; SSE-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; SSE-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; SSE-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; SSE-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; SSE-NEXT: store i32 [[R0]], ptr @c32, align 4 +; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; 
SSE-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; SSE-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 +; SSE-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; SSE-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; SSE-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; SSE-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; SSE-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; SSE-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; SSE-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v16i32( +; SLM-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; SLM-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; SLM-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; SLM-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; SLM-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), 
align 4 +; SLM-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; SLM-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; SLM-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; SLM-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; SLM-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; SLM-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; SLM-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; SLM-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; SLM-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; SLM-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; SLM-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; SLM-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; SLM-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; SLM-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; SLM-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; SLM-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x 
i32], ptr @b32, i32 0, i64 7), align 4 +; SLM-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; SLM-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; SLM-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; SLM-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; SLM-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; SLM-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; SLM-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; SLM-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; SLM-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; SLM-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; SLM-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; SLM-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; SLM-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; SLM-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; SLM-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; SLM-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; SLM-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; SLM-NEXT: 
[[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; SLM-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; SLM-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; SLM-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; SLM-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; SLM-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; SLM-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; SLM-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; SLM-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; SLM-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; SLM-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; SLM-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; SLM-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; SLM-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; SLM-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; SLM-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; SLM-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; SLM-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; SLM-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; SLM-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; SLM-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; SLM-NEXT: store i32 [[R0]], ptr @c32, align 4 +; SLM-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; SLM-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; SLM-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; SLM-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SLM-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x 
i32], ptr @c32, i32 0, i64 5), align 4 +; SLM-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; SLM-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; SLM-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SLM-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; SLM-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; SLM-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; SLM-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SLM-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; SLM-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; SLM-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <8 x i32> 
[[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v16i32( +; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 
4 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v16i32( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -280,168 +496,106 @@ define void @mul_v16i32() { } define void @mul_v32i16() { -; CHECK-LABEL: @mul_v32i16( -; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), 
align 2 -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], 
ptr @a16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; 
CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], 
ptr @b16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A0]], i16 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A1]], i16 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A2]], i16 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A3]], i16 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A4]], i16 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A5]], i16 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A6]], i16 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A7]], i16 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A8]], i16 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A9]], i16 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A10]], i16 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A11]], i16 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A12]], i16 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A13]], i16 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A14]], i16 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = 
call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A15]], i16 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A16]], i16 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A17]], i16 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A18]], i16 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A19]], i16 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A20]], i16 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A21]], i16 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A22]], i16 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A23]], i16 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A24]], i16 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A25]], i16 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A26]], i16 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A27]], i16 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A28]], i16 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A29]], i16 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A30]], i16 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A31]], i16 [[B31]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } 
[[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 -; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 -; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 -; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 
0, i64 2), align 2 -; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 -; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 -; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 -; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 -; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 -; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 -; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 -; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 -; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 -; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 -; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 -; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 -; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 -; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 -; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 -; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 -; CHECK-NEXT: store i16 [[R21]], ptr 
getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 -; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 -; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 -; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 -; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 -; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 -; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 -; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 -; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 -; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x 
i16>, <8 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } 
@llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr 
getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v32i16( +; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; 
AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v32i16( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -607,328 +761,106 @@ define void @mul_v32i16() { } define void @mul_v64i8() { -; CHECK-LABEL: @mul_v64i8( -; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 -; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr 
@a8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 -; 
CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 -; CHECK-NEXT: 
[[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[A57:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 -; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 -; CHECK-NEXT: 
[[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[B30:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[B48:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A0]], i8 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A1]], i8 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A2]], i8 [[B2]]) 
-; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A3]], i8 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A4]], i8 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A5]], i8 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A6]], i8 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A7]], i8 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A8]], i8 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A9]], i8 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A10]], i8 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A11]], i8 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A12]], i8 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A13]], i8 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A14]], i8 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A15]], i8 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A16]], i8 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A17]], i8 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A18]], i8 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A19]], i8 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A20]], i8 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A21]], i8 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A22]], i8 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 
[[A23]], i8 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A24]], i8 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A25]], i8 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A26]], i8 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A27]], i8 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A28]], i8 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A29]], i8 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A30]], i8 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A31]], i8 [[B31]]) -; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A32]], i8 [[B32]]) -; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A33]], i8 [[B33]]) -; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A34]], i8 [[B34]]) -; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A35]], i8 [[B35]]) -; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A36]], i8 [[B36]]) -; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A37]], i8 [[B37]]) -; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A38]], i8 [[B38]]) -; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A39]], i8 [[B39]]) -; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A40]], i8 [[B40]]) -; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A41]], i8 [[B41]]) -; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A42]], i8 [[B42]]) -; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A43]], i8 [[B43]]) -; CHECK-NEXT: [[C44:%.*]] = call { 
i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A44]], i8 [[B44]]) -; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A45]], i8 [[B45]]) -; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A46]], i8 [[B46]]) -; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A47]], i8 [[B47]]) -; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A48]], i8 [[B48]]) -; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A49]], i8 [[B49]]) -; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A50]], i8 [[B50]]) -; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A51]], i8 [[B51]]) -; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A52]], i8 [[B52]]) -; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A53]], i8 [[B53]]) -; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A54]], i8 [[B54]]) -; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A55]], i8 [[B55]]) -; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A56]], i8 [[B56]]) -; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A57]], i8 [[B57]]) -; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A58]], i8 [[B58]]) -; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A59]], i8 [[B59]]) -; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A60]], i8 [[B60]]) -; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A61]], i8 [[B61]]) -; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A62]], i8 [[B62]]) -; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A63]], i8 [[B63]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = 
extractvalue { i8, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 -; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 
0 -; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 -; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 -; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 -; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 -; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 -; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 -; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 -; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 -; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 -; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 -; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 -; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 -; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 -; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 -; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 -; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 -; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 -; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 -; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 -; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 -; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 -; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 -; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 -; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 -; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 -; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 -; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 -; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 -; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 -; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 -; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 -; 
CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 -; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 -; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 -; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 -; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 -; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 -; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 -; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 -; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 -; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 -; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 -; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 -; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, 
i32 0, i64 19), align 1 -; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 -; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 -; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 -; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 -; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 -; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 -; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 -; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 -; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 -; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 -; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 -; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 -; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 -; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 -; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 -; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 -; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 -; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 
0, i64 38), align 1 -; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 -; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 -; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 -; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 -; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 -; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 -; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 -; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 -; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 -; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 -; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 -; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 -; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 -; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 -; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 -; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 -; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 -; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, 
i64 57), align 1 -; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 -; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 -; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 -; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 -; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 -; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; 
SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v64i8( +; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP11:%.*]] = call 
{ <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr 
@b8, align 1 +; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v64i8( +; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v64i8( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } 
@llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1252,3 +1184,5 @@ define void @mul_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll index 4126f06e8ca81..4c1d070a569e7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s 
--check-prefixes=CHECK,AVX512_256 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,48 +26,126 @@ declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.umul.with.overflow.i8 (i8 , i8 ) define void @mul_v8i64() { -; CHECK-LABEL: @mul_v8i64( -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; 
CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 -; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; CHECK-NEXT: store i64 [[R7]], 
ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } 
@llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v8i64( +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8 +; SLM-NEXT: [[TMP3:%.*]] = call 
{ <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <2 x i64> [[TMP4]], ptr @c64, align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[TMP7:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SLM-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[TMP11:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SLM-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[TMP15:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v8i64( +; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; KNL-NEXT: 
[[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v8i64( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -113,88 
+191,106 @@ define void @mul_v8i64() { } define void @mul_v16i32() { -; CHECK-LABEL: @mul_v16i32( -; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 
-; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i32, 
i1 } @llvm.umul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; CHECK-NEXT: 
[[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 -; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, 
align 4 +; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v16i32( +; 
SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr 
inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds 
([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v16i32( +; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v16i32( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x 
i32], ptr @c32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -280,168 +376,106 @@ define void @mul_v16i32() { } define void @mul_v32i16() { -; CHECK-LABEL: @mul_v32i16( -; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; CHECK-NEXT: 
[[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; CHECK-NEXT: [[B1:%.*]] = load i16, 
ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; CHECK-NEXT: 
[[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A0]], i16 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A1]], i16 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A2]], i16 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A3]], i16 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A4]], i16 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } 
@llvm.umul.with.overflow.i16(i16 [[A5]], i16 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A6]], i16 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A7]], i16 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A8]], i16 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A9]], i16 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A10]], i16 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A11]], i16 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A12]], i16 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A13]], i16 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A14]], i16 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A15]], i16 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A16]], i16 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A17]], i16 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A18]], i16 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A19]], i16 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A20]], i16 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A21]], i16 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A22]], i16 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A23]], i16 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A24]], i16 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { 
i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A25]], i16 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A26]], i16 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A27]], i16 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A28]], i16 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A29]], i16 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A30]], i16 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A31]], i16 [[B31]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, 
i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 -; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 -; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 -; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 -; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 -; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 -; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 -; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 -; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 -; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 -; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 -; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 -; CHECK-NEXT: store i16 [[R12]], ptr 
getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 -; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 -; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 -; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 -; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 -; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 -; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 -; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 -; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 -; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 -; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 -; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 -; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 -; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 -; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 -; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 -; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, 
i64 30), align 2 -; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } 
@llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr 
inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load 
<16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v32i16( +; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v32i16( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x 
i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -607,328 +641,106 @@ define void @mul_v32i16() { } define void @mul_v64i8() { -; CHECK-LABEL: @mul_v64i8( -; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 -; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 -; 
CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 -; CHECK-NEXT: 
[[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[A48:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 -; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 -; 
CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[B21:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[B39:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A0]], i8 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A1]], i8 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A2]], i8 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A3]], i8 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A4]], i8 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A5]], i8 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A6]], i8 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A7]], i8 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A8]], i8 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A9]], i8 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A10]], i8 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A11]], i8 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A12]], i8 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } 
@llvm.umul.with.overflow.i8(i8 [[A13]], i8 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A14]], i8 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A15]], i8 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A16]], i8 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A17]], i8 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A18]], i8 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A19]], i8 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A20]], i8 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A21]], i8 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A22]], i8 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A23]], i8 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A24]], i8 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A25]], i8 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A26]], i8 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A27]], i8 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A28]], i8 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A29]], i8 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A30]], i8 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A31]], i8 [[B31]]) -; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A32]], i8 [[B32]]) -; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A33]], i8 [[B33]]) -; 
CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A34]], i8 [[B34]]) -; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A35]], i8 [[B35]]) -; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A36]], i8 [[B36]]) -; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A37]], i8 [[B37]]) -; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A38]], i8 [[B38]]) -; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A39]], i8 [[B39]]) -; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A40]], i8 [[B40]]) -; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A41]], i8 [[B41]]) -; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A42]], i8 [[B42]]) -; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A43]], i8 [[B43]]) -; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A44]], i8 [[B44]]) -; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A45]], i8 [[B45]]) -; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A46]], i8 [[B46]]) -; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A47]], i8 [[B47]]) -; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A48]], i8 [[B48]]) -; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A49]], i8 [[B49]]) -; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A50]], i8 [[B50]]) -; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A51]], i8 [[B51]]) -; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A52]], i8 [[B52]]) -; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A53]], i8 [[B53]]) -; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } 
@llvm.umul.with.overflow.i8(i8 [[A54]], i8 [[B54]]) -; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A55]], i8 [[B55]]) -; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A56]], i8 [[B56]]) -; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A57]], i8 [[B57]]) -; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A58]], i8 [[B58]]) -; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A59]], i8 [[B59]]) -; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A60]], i8 [[B60]]) -; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A61]], i8 [[B61]]) -; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A62]], i8 [[B62]]) -; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A63]], i8 [[B63]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 -; 
CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 -; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 -; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 -; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 -; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 -; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 -; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 -; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 -; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 -; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 -; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 -; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 -; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 -; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 -; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 -; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 -; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 -; CHECK-NEXT: 
[[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 -; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 -; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 -; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 -; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 -; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 -; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 -; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 -; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 -; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 -; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 -; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 -; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 -; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 -; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 -; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 -; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 -; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 -; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 -; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 -; CHECK-NEXT: 
store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 -; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 -; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 -; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 -; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 -; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 -; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 -; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 -; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 -; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 -; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 -; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 -; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 -; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 -; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 -; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 -; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 -; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 -; CHECK-NEXT: 
store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 -; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 -; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 -; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 -; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 -; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 -; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 -; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 -; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 -; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 -; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 -; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 -; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 -; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 -; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 -; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 -; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 -; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 -; CHECK-NEXT: 
store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 -; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 -; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 -; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 -; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 -; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 -; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 -; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 -; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 -; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 -; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 -; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 -; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 -; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 -; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @mul_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = 
extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; SLM-LABEL: @mul_v64i8( +; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> 
} @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SLM-NEXT: ret void +; +; AVX-LABEL: @mul_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: 
[[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @mul_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @mul_v64i8( +; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; KNL-NEXT: [[TMP2:%.*]] = 
load <64 x i8>, ptr @b8, align 1 +; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @mul_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @mul_v64i8( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, 
i32 0, i64 1 ), align 1 @@ -1252,3 +1064,5 @@ define void @mul_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll index d628dddd16cb1..fa1ed4dd49c8d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL +; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,48 +26,146 @@ declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.ssub.with.overflow.i8 (i8 , i8 ) define void @sub_v8i64() { -; CHECK-LABEL: @sub_v8i64( -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, 
i64 5), align 8 -; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 -; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; 
CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: 
[[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, 
i64 7), align 8 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v8i64( +; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SLM-NEXT: [[C3:%.*]] 
= call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 
x i64> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v8i64( +; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) 
+; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @sub_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v8i64( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -113,88 +211,106 @@ define void @sub_v8i64() { } define void @sub_v16i32() { -; CHECK-LABEL: @sub_v16i32( -; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; CHECK-NEXT: 
[[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr 
inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; 
CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, 
i1 } [[C15]], 0 -; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 -; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = 
extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, 
<4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v16i32( +; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; 
KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @sub_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v16i32( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = 
load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -280,168 +396,106 @@ define void @sub_v16i32() { } define void @sub_v32i16() { -; CHECK-LABEL: @sub_v32i16( -; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 
x i16], ptr @a16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, 
i64 2), align 2 -; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x 
i16], ptr @b16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A0]], i16 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A1]], i16 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A2]], i16 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A3]], i16 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A4]], i16 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A5]], i16 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A6]], i16 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = 
call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A7]], i16 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A8]], i16 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A9]], i16 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A10]], i16 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A11]], i16 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A12]], i16 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A13]], i16 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A14]], i16 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A15]], i16 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A16]], i16 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A17]], i16 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A18]], i16 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A19]], i16 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A20]], i16 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A21]], i16 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A22]], i16 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A23]], i16 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A24]], i16 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A25]], i16 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A26]], i16 [[B26]]) -; CHECK-NEXT: 
[[C27:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A27]], i16 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A28]], i16 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A29]], i16 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A30]], i16 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A31]], i16 [[B31]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = 
extractvalue { i16, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 -; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 -; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 -; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 -; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 -; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 -; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 -; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 -; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 -; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 -; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 -; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 -; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 -; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 -; 
CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 -; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 -; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 -; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 -; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 -; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 -; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 -; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 -; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 -; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 -; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 -; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 -; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 -; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 -; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 -; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v32i16( +; 
SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr 
inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SLM-NEXT: 
[[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], 
<16 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v32i16( +; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @sub_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v32i16( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], 
<16 x i16> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -607,328 +661,106 @@ define void @sub_v32i16() { } define void @sub_v64i8() { -; CHECK-LABEL: @sub_v64i8( -; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 -; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, 
i64 13), align 1 -; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 -; 
CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 -; CHECK-NEXT: 
[[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 -; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 
4), align 1 -; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 -; CHECK-NEXT: 
[[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[B41:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[B59:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A0]], i8 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A1]], i8 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A2]], i8 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A3]], i8 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A4]], i8 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A5]], i8 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A6]], i8 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A7]], i8 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A8]], i8 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A9]], i8 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A10]], i8 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A11]], i8 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A12]], i8 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A13]], i8 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A14]], i8 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } 
@llvm.ssub.with.overflow.i8(i8 [[A15]], i8 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A16]], i8 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A17]], i8 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A18]], i8 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A19]], i8 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A20]], i8 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A21]], i8 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A22]], i8 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A23]], i8 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A24]], i8 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A25]], i8 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A26]], i8 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A27]], i8 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A28]], i8 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A29]], i8 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A30]], i8 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A31]], i8 [[B31]]) -; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A32]], i8 [[B32]]) -; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A33]], i8 [[B33]]) -; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A34]], i8 [[B34]]) -; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A35]], i8 [[B35]]) -; 
CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A36]], i8 [[B36]]) -; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A37]], i8 [[B37]]) -; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A38]], i8 [[B38]]) -; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A39]], i8 [[B39]]) -; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A40]], i8 [[B40]]) -; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A41]], i8 [[B41]]) -; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A42]], i8 [[B42]]) -; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A43]], i8 [[B43]]) -; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A44]], i8 [[B44]]) -; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A45]], i8 [[B45]]) -; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A46]], i8 [[B46]]) -; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A47]], i8 [[B47]]) -; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A48]], i8 [[B48]]) -; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A49]], i8 [[B49]]) -; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A50]], i8 [[B50]]) -; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A51]], i8 [[B51]]) -; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A52]], i8 [[B52]]) -; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A53]], i8 [[B53]]) -; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A54]], i8 [[B54]]) -; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A55]], i8 [[B55]]) -; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } 
@llvm.ssub.with.overflow.i8(i8 [[A56]], i8 [[B56]]) -; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A57]], i8 [[B57]]) -; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A58]], i8 [[B58]]) -; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A59]], i8 [[B59]]) -; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A60]], i8 [[B60]]) -; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A61]], i8 [[B61]]) -; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A62]], i8 [[B62]]) -; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A63]], i8 [[B63]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 -; 
CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 -; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 -; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 -; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 -; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 -; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 -; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 -; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 -; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 -; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 -; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 -; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 -; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 -; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 -; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 -; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 -; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 -; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 -; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 -; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 -; CHECK-NEXT: 
[[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 -; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 -; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 -; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 -; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 -; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 -; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 -; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 -; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 -; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 -; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 -; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 -; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 -; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 -; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 -; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 -; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 -; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 -; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), 
align 1 -; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 -; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 -; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 -; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 -; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 -; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 -; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 -; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 -; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 -; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 -; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 -; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 -; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 -; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 -; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 -; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 -; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 -; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), 
align 1 -; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 -; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 -; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 -; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 -; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 -; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 -; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 -; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 -; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 -; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 -; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 -; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 -; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 -; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 -; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 -; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 -; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), 
align 1 -; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 -; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 -; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 -; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 -; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 -; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 -; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 -; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 -; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 -; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 -; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 -; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 -; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 -; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr 
@a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v64i8( +; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <16 x i8> [[TMP4]], ptr 
@c8, align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: 
[[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v64i8( +; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; KNL-NEXT: [[TMP130:%.*]] 
= extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @sub_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v64i8( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1252,3 +1084,5 @@ define void @sub_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } +;; NOTE: 
These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll index 11a68a5dfbcca..9c683eacc7062 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx 
-mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,48 +26,146 @@ declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.usub.with.overflow.i8 (i8 , i8 ) define void @sub_v8i64() { -; CHECK-LABEL: @sub_v8i64( -; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr 
inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 -; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, 
i32 0, i64 6), align 8 -; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; 
SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v8i64( +; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds 
([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; SLM-NEXT: 
[[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX-NEXT: [[TMP5:%.*]] = 
load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v8i64( +; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; KNL-NEXT: ret void +; +; AVX512-LABEL: 
@sub_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v8i64( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX512_256-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -113,88 +211,106 @@ define void @sub_v8i64() { } define void @sub_v16i32() { -; CHECK-LABEL: @sub_v16i32( -; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, 
i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; CHECK-NEXT: 
[[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A7]], i32 
[[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 -; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; 
CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr 
inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 +; 
SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> 
[[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v16i32( +; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> 
[[TMP1]], <16 x i32> [[TMP2]]) +; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @sub_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v16i32( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX512_256-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -280,168 +396,106 @@ define void @sub_v16i32() { } define void @sub_v32i16() { 
-; CHECK-LABEL: @sub_v32i16( -; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; 
CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; CHECK-NEXT: [[B4:%.*]] = 
load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; 
CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A0]], i16 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A1]], i16 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A2]], i16 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A3]], i16 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A4]], i16 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A5]], i16 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A6]], i16 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A7]], i16 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A8]], 
i16 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A9]], i16 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A10]], i16 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A11]], i16 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A12]], i16 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A13]], i16 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A14]], i16 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A15]], i16 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A16]], i16 [[B16]]) -; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A17]], i16 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A18]], i16 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A19]], i16 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A20]], i16 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A21]], i16 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A22]], i16 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A23]], i16 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A24]], i16 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A25]], i16 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A26]], i16 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A27]], i16 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } 
@llvm.usub.with.overflow.i16(i16 [[A28]], i16 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A29]], i16 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A30]], i16 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A31]], i16 [[B31]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = 
extractvalue { i16, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 -; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 -; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 -; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 -; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 -; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 -; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 -; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 -; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 -; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 -; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 -; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 -; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 -; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 -; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 -; CHECK-NEXT: store i16 
[[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 -; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 -; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 -; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 -; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 -; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 -; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 -; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 -; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 -; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 -; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 -; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 -; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 -; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 -; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: 
[[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v32i16( +; SLM-NEXT: [[TMP1:%.*]] 
= load <8 x i16>, ptr @a16, align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], 
ptr @c16, i32 0, i64 24), align 2 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr 
getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v32i16( +; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; KNL-NEXT: ret void +; +; AVX512-LABEL: @sub_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v32i16( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <16 x i16> 
[[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX512_256-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -607,328 +661,106 @@ define void @sub_v32i16() { } define void @sub_v64i8() { -; CHECK-LABEL: @sub_v64i8( -; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 -; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 -; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 -; CHECK-NEXT: 
[[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[A33:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[A51:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 -; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 -; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 -; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 -; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 -; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 -; CHECK-NEXT: 
[[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 -; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 -; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 -; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 -; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 -; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 -; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 -; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 -; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 -; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 -; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 -; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 -; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 -; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 -; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 -; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 -; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 -; CHECK-NEXT: [[B24:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 -; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 -; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 -; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 -; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 -; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 -; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 -; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 -; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 -; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 -; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 -; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 -; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 -; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 -; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 -; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 -; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 -; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 -; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 -; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 -; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 -; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 -; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 -; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 -; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 -; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 -; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 -; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 -; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 -; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 -; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 -; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 -; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 -; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 -; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @b8, i32 0, i64 60), align 1 -; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 -; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 -; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 -; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A0]], i8 [[B0]]) -; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A1]], i8 [[B1]]) -; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A2]], i8 [[B2]]) -; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A3]], i8 [[B3]]) -; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A4]], i8 [[B4]]) -; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A5]], i8 [[B5]]) -; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A6]], i8 [[B6]]) -; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A7]], i8 [[B7]]) -; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A8]], i8 [[B8]]) -; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A9]], i8 [[B9]]) -; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A10]], i8 [[B10]]) -; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A11]], i8 [[B11]]) -; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A12]], i8 [[B12]]) -; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A13]], i8 [[B13]]) -; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A14]], i8 [[B14]]) -; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A15]], i8 [[B15]]) -; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A16]], i8 [[B16]]) -; 
CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A17]], i8 [[B17]]) -; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A18]], i8 [[B18]]) -; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A19]], i8 [[B19]]) -; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A20]], i8 [[B20]]) -; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A21]], i8 [[B21]]) -; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A22]], i8 [[B22]]) -; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A23]], i8 [[B23]]) -; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A24]], i8 [[B24]]) -; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A25]], i8 [[B25]]) -; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A26]], i8 [[B26]]) -; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A27]], i8 [[B27]]) -; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A28]], i8 [[B28]]) -; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A29]], i8 [[B29]]) -; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A30]], i8 [[B30]]) -; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A31]], i8 [[B31]]) -; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A32]], i8 [[B32]]) -; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A33]], i8 [[B33]]) -; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A34]], i8 [[B34]]) -; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A35]], i8 [[B35]]) -; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A36]], i8 [[B36]]) -; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } 
@llvm.usub.with.overflow.i8(i8 [[A37]], i8 [[B37]]) -; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A38]], i8 [[B38]]) -; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A39]], i8 [[B39]]) -; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A40]], i8 [[B40]]) -; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A41]], i8 [[B41]]) -; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A42]], i8 [[B42]]) -; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A43]], i8 [[B43]]) -; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A44]], i8 [[B44]]) -; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A45]], i8 [[B45]]) -; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A46]], i8 [[B46]]) -; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A47]], i8 [[B47]]) -; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A48]], i8 [[B48]]) -; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A49]], i8 [[B49]]) -; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A50]], i8 [[B50]]) -; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A51]], i8 [[B51]]) -; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A52]], i8 [[B52]]) -; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A53]], i8 [[B53]]) -; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A54]], i8 [[B54]]) -; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A55]], i8 [[B55]]) -; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A56]], i8 [[B56]]) -; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A57]], i8 [[B57]]) -; 
CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A58]], i8 [[B58]]) -; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A59]], i8 [[B59]]) -; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A60]], i8 [[B60]]) -; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A61]], i8 [[B61]]) -; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A62]], i8 [[B62]]) -; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A63]], i8 [[B63]]) -; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 -; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 -; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 -; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 -; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 -; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 -; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 -; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 -; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 -; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 -; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 -; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 -; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 -; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 -; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 -; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 -; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 -; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 -; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 -; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 -; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 -; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 -; CHECK-NEXT: [[R22:%.*]] = 
extractvalue { i8, i1 } [[C22]], 0 -; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 -; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 -; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 -; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 -; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 -; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 -; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 -; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 -; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 -; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 -; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 -; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 -; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 -; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 -; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 -; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 -; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 -; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 -; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 -; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 -; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 -; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 -; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 -; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 -; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 -; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 -; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 -; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 -; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 -; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 -; CHECK-NEXT: [[R53:%.*]] = extractvalue { 
i8, i1 } [[C53]], 0 -; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 -; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 -; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 -; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 -; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 -; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 -; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 -; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 -; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 -; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 -; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 -; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 -; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 -; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 -; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 -; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 -; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 -; CHECK-NEXT: store i8 [[R13]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 -; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 -; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 -; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 -; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 -; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 -; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 -; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 -; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 -; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 -; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 -; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 -; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 -; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 -; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 -; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 -; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 -; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 -; CHECK-NEXT: store i8 [[R32]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 -; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 -; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 -; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 -; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 -; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 -; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 -; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 -; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 -; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 -; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 -; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 -; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 -; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 -; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 -; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 -; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 -; CHECK-NEXT: store i8 [[R51]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 -; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 -; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 -; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 -; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 -; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 -; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 -; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 -; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 -; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 -; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 -; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 -; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @sub_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; 
SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; SLM-LABEL: @sub_v64i8( +; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 +; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SLM-NEXT: 
[[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) +; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 +; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 +; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) +; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 +; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SLM-NEXT: ret void +; +; AVX-LABEL: @sub_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX-NEXT: [[TMP5:%.*]] = 
load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX2-LABEL: @sub_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX2-NEXT: ret void +; +; KNL-LABEL: @sub_v64i8( +; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; KNL-NEXT: ret void +; +; 
AVX512-LABEL: @sub_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 +; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; +; AVX512_256-LABEL: @sub_v64i8( +; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 +; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) +; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 +; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX512_256-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1252,3 +1084,5 @@ define void @sub_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll index 7740aaa14b805..9683f71bd40e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll @@ -13,10 +13,12 @@ define float @test(ptr %0, double %1, double %2, double %3) { ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x double> [[TMP9]], <3 x double> poison, <3 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = fmul <3 x double> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1356 -; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP1]], 0.000000e+00 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP28]], ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> , double [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <3 x double> , double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <3 x double> [[TMP18]], double [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> poison, <3 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> , <3 x double> [[TMP18]], <3 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = fadd <3 x double> [[TMP17]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <3 x double> , double [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP22:%.*]] = fadd <3 x double> [[TMP21]], [[TMP20]] diff --git a/llvm/test/Transforms/SLPVectorizer/sincos.ll b/llvm/test/Transforms/SLPVectorizer/sincos.ll index 76545dedac5f5..504467d0049d7 100644 --- a/llvm/test/Transforms/SLPVectorizer/sincos.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/sincos.ll @@ -8,52 +8,40 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @phase, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, double } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { double, double } [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr @phase, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast { <8 x double>, <8 x double> } @llvm.sincos.v8f64(<8 x double> [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP4]], i32 0 ; CHECK-NEXT: store double [[TMP2]], ptr @sinval, align 16 ; CHECK-NEXT: store double [[TMP3]], ptr @cosval, align 16 -; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 8), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { double, double } [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP4]], i32 1 ; CHECK-NEXT: store double [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 8), align 8 ; CHECK-NEXT: store double [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 8), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 16), align 16 -; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP8]]) -; 
CHECK-NEXT: [[TMP10:%.*]] = extractvalue { double, double } [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { double, double } [[TMP9]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x double> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x double> [[TMP4]], i32 2 ; CHECK-NEXT: store double [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 16 ; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 24), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { double, double } [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { double, double } [[TMP13]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x double> [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x double> [[TMP4]], i32 3 ; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 24), align 8 ; CHECK-NEXT: store double [[TMP15]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 24), align 8 -; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16 -; CHECK-NEXT: [[TMP17:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { double, double } [[TMP17]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { double, double } [[TMP17]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x double> [[TMP5]], i32 4 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x double> [[TMP4]], i32 4 ; CHECK-NEXT: store double [[TMP18]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16 ; CHECK-NEXT: store double [[TMP19]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16 -; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr 
getelementptr inbounds nuw (i8, ptr @phase, i64 40), align 8 -; CHECK-NEXT: [[TMP21:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP20]]) -; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, double } [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { double, double } [[TMP21]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x double> [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x double> [[TMP4]], i32 5 ; CHECK-NEXT: store double [[TMP22]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 40), align 8 ; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 40), align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 16 -; CHECK-NEXT: [[TMP25:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { double, double } [[TMP25]], 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { double, double } [[TMP25]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x double> [[TMP5]], i32 6 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x double> [[TMP4]], i32 6 ; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 16 ; CHECK-NEXT: store double [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 16 -; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 56), align 8 -; CHECK-NEXT: [[TMP29:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP28]]) -; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { double, double } [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { double, double } [[TMP29]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x double> [[TMP5]], i32 7 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x double> [[TMP4]], i32 7 ; CHECK-NEXT: store double [[TMP30]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 56), 
align 8 ; CHECK-NEXT: store double [[TMP31]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 56), align 8 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll index 45d6e395b6886..10bee3262f738 100644 --- a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll @@ -8,28 +8,24 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr @phase, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr @phase, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast { <8 x double>, <8 x double> } @llvm.sincos.v8f64(<8 x double> [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr @sinval, align 16 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr @cosval, align 16 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 16), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 1 +; CHECK-NEXT: 
[[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> ; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 8 ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16 -; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> ; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16 ; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> ; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 8 ; CHECK-NEXT: store <2 x double> [[TMP15]], ptr 
getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 8 ; CHECK-NEXT: ret i32 0 From 17a0494ad9287db8e7c4df516b968955f3461a9b Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Fri, 8 May 2026 13:33:02 -0400 Subject: [PATCH 075/538] [PowerPC][NFC]Refactor EmitInstrWithCustomInserter (#196114) Currently PPCTargetLowering::EmitInstrWithCustomInserter() uses a large if/else-if structure. Update to use switch and move ATOMIC_CMP_SWAP and SELECT code to helper functions for better readability and maintenance. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 996 +++++++++++--------- 1 file changed, 558 insertions(+), 438 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 407093fd2b849..e959100d713dd 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -13854,8 +13854,13 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, return TailMBB; } -static bool IsSelectCC(MachineInstr &MI) { - switch (MI.getOpcode()) { +/// Check if the opcode is a SELECT or SELECT_CC variant. 
+/// @param Opcode The opcode to check +/// @param CheckOnlyCC If true, only return true for SELECT_CC variants; +/// if false, return true for both SELECT and SELECT_CC +static bool IsSelect(unsigned Opcode, bool CheckOnlyCC = false) { + switch (Opcode) { + // SELECT_CC variants - always return true case PPC::SELECT_CC_I4: case PPC::SELECT_CC_I8: case PPC::SELECT_CC_F4: @@ -13868,13 +13873,7 @@ static bool IsSelectCC(MachineInstr &MI) { case PPC::SELECT_CC_SPE4: case PPC::SELECT_CC_SPE: return true; - default: - return false; - } -} - -static bool IsSelect(MachineInstr &MI) { - switch (MI.getOpcode()) { + // SELECT variants - only return true if CheckOnlyCC is false case PPC::SELECT_I4: case PPC::SELECT_I8: case PPC::SELECT_F4: @@ -13886,49 +13885,25 @@ static bool IsSelect(MachineInstr &MI) { case PPC::SELECT_VSFRC: case PPC::SELECT_VSSRC: case PPC::SELECT_VSRC: - return true; + return !CheckOnlyCC; // true if checking all SELECTs, false if only CC default: return false; } } +static bool IsSelectCC(unsigned Opcode) { return IsSelect(Opcode, true); } -MachineBasicBlock * -PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, - MachineBasicBlock *BB) const { - if (MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT) { - if (Subtarget.is64BitELFABI() && - MI.getOpcode() == TargetOpcode::PATCHPOINT && - !Subtarget.isUsingPCRelativeCalls()) { - // Call lowering should have added an r2 operand to indicate a dependence - // on the TOC base pointer value. It can't however, because there is no - // way to mark the dependence as implicit there, and so the stackmap code - // will confuse it with a regular operand. Instead, add the dependence - // here. 
- MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); - } - - return emitPatchPoint(MI, BB); - } - - if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || - MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { - return emitEHSjLjSetJmp(MI, BB); - } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || - MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { - return emitEHSjLjLongJmp(MI, BB); - } - - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // To "insert" these instructions we actually have to insert their - // control-flow patterns. - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = ++BB->getIterator(); - - MachineFunction *F = BB->getParent(); - MachineRegisterInfo &MRI = F->getRegInfo(); - +/// Emit SELECT instruction, using ISEL if available, otherwise use +/// branch-based control flow. +/// +/// For targets with ISEL support (SELECT_CC_I4/I8, SELECT_I4/I8), this +/// generates a single ISEL instruction. Otherwise, it creates a +/// branch-based control flow pattern with PHI nodes. +static MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII, + const PPCSubtarget &Subtarget) { + assert(IsSelect(MI.getOpcode()) && "Instruction must be a SELECT variant"); + + // Check if we can use ISEL for this SELECT if (Subtarget.hasISEL() && (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || @@ -13944,74 +13919,424 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, DebugLoc dl = MI.getDebugLoc(); TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); - } else if (IsSelectCC(MI) || IsSelect(MI)) { - // The incoming instruction knows the destination vreg to set, the - // condition code register to branch on, the true/false values to - // select between, and a branch opcode to use. - - // thisMBB: - // ... - // TrueVal = ... 
- // cmpTY ccX, r1, r2 - // bCC sinkMBB - // fallthrough --> copy0MBB - MachineBasicBlock *thisMBB = BB; - MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - DebugLoc dl = MI.getDebugLoc(); - F->insert(It, copy0MBB); - F->insert(It, sinkMBB); + MI.eraseFromParent(); + return BB; + } - if (isPhysRegUsedAfter(PPC::CARRY, MI.getIterator())) { - copy0MBB->addLiveIn(PPC::CARRY); - sinkMBB->addLiveIn(PPC::CARRY); - } + // Fall back to branch-based SELECT implementation + MachineFunction *F = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + DebugLoc dl = MI.getDebugLoc(); - // Set the call frame size on entry to the new basic blocks. - // See https://reviews.llvm.org/D156113. - unsigned CallFrameSize = TII->getCallFrameSizeAt(MI); - copy0MBB->setCallFrameSize(CallFrameSize); - sinkMBB->setCallFrameSize(CallFrameSize); + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + if (isPhysRegUsedAfter(PPC::CARRY, MI.getIterator())) { + copy0MBB->addLiveIn(PPC::CARRY); + sinkMBB->addLiveIn(PPC::CARRY); + } - // Next, add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); - BB->addSuccessor(sinkMBB); + // Set the call frame size on entry to the new basic blocks. 
+ unsigned CallFrameSize = TII->getCallFrameSizeAt(MI); + copy0MBB->setCallFrameSize(CallFrameSize); + sinkMBB->setCallFrameSize(CallFrameSize); - if (IsSelect(MI)) { - BuildMI(BB, dl, TII->get(PPC::BC)) - .addReg(MI.getOperand(1).getReg()) - .addMBB(sinkMBB); - } else { - unsigned SelectPred = MI.getOperand(4).getImm(); - BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(SelectPred) - .addReg(MI.getOperand(1).getReg()) - .addMBB(sinkMBB); - } + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); - // copy0MBB: - // %FalseValue = ... - // # fallthrough to sinkMBB - BB = copy0MBB; + // Add successors + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); - // Update machine-CFG edges - BB->addSuccessor(sinkMBB); + // Build branch instruction + if (IsSelectCC(MI.getOpcode())) + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(MI.getOperand(4).getImm()) + .addReg(MI.getOperand(1).getReg()) + .addMBB(sinkMBB); + else + BuildMI(BB, dl, TII->get(PPC::BC)) + .addReg(MI.getOperand(1).getReg()) + .addMBB(sinkMBB); + + // copy0MBB: fallthrough to sinkMBB + BB = copy0MBB; + BB->addSuccessor(sinkMBB); + + // sinkMBB: PHI instruction + BB = sinkMBB; + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) + .addReg(MI.getOperand(3).getReg()) + .addMBB(copy0MBB) + .addReg(MI.getOperand(2).getReg()) + .addMBB(thisMBB); + MI.eraseFromParent(); + return BB; +} + +/// Helper function to create basic blocks for atomic compare-and-swap. +/// Creates three basic blocks (loop1MBB, loop2MBB, exitMBB) and sets up +/// the control flow structure common to both hardware and software +/// implementations of atomic compare-and-swap operations. 
+static void createAtomicLoopBlocks(MachineFunction *F, MachineBasicBlock *BB, + MachineBasicBlock *&loop1MBB, + MachineBasicBlock *&loop2MBB, + MachineBasicBlock *&exitMBB, + MachineInstr &MI, + MachineFunction::iterator It) { + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); + loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); + exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loop1MBB); + F->insert(It, loop2MBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(loop1MBB); +} + +/// Emit hardware-supported atomic compare-and-swap for I32/I64 and I8/I16 +/// with partword atomic support. +/// +/// This uses native PowerPC atomic instructions (LBARX/LHARX/LWARX/LDARX for +/// load-and-reserve, STBCX/STHCX/STWCX/STDCX for store-conditional) to +/// implement atomic compare-and-swap at byte, halfword, word, or doubleword +/// granularity. 
+/// +/// Control flow: +/// thisMBB -> loop1MBB -> loop2MBB -> exitMBB +/// | | +/// +------------+ +/// +/// loop1MBB: +/// - Load-and-reserve from memory +/// - Compare loaded value with expected old value +/// - Branch to exitMBB if not equal (CAS failed) +/// loop2MBB: +/// - Store-conditional new value to memory +/// - Branch back to loop1MBB if store failed (retry) +/// - Fall through to exitMBB on success +static MachineBasicBlock * +emitAtomicCmpSwapHardware(MachineInstr &MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII, + const PPCSubtarget &Subtarget) { + MachineFunction *F = BB->getParent(); + MachineFunction::iterator It = ++BB->getIterator(); + + bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; + + unsigned LoadMnemonic = PPC::LDARX; + unsigned StoreMnemonic = PPC::STDCX; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Compare and swap of unknown size"); + case PPC::ATOMIC_CMP_SWAP_I8: + LoadMnemonic = PPC::LBARX; + StoreMnemonic = PPC::STBCX; + assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); + break; + case PPC::ATOMIC_CMP_SWAP_I16: + LoadMnemonic = PPC::LHARX; + StoreMnemonic = PPC::STHCX; + assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); + break; + case PPC::ATOMIC_CMP_SWAP_I32: + LoadMnemonic = PPC::LWARX; + StoreMnemonic = PPC::STWCX; + break; + case PPC::ATOMIC_CMP_SWAP_I64: + LoadMnemonic = PPC::LDARX; + StoreMnemonic = PPC::STDCX; + break; + } + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register oldval = MI.getOperand(3).getReg(); + Register newval = MI.getOperand(4).getReg(); + DebugLoc dl = MI.getDebugLoc(); + + MachineBasicBlock *loop1MBB, *loop2MBB, *exitMBB; + createAtomicLoopBlocks(F, BB, loop1MBB, loop2MBB, exitMBB, MI, It); + + Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + + // loop1MBB: + 
// l[bhwd]arx dest, ptr + // cmp[wd] dest, oldval + // bne- exitBB + BB = loop1MBB; + BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), CrReg) + .addReg(dest) + .addReg(oldval); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE_MINUS) + .addReg(CrReg) + .addMBB(exitMBB); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(exitMBB); + + // loop2MBB: + // st[bhwd]cx. newval, ptr + // bne- loopMBB + // b exitBB + BB = loop2MBB; + BuildMI(BB, dl, TII->get(StoreMnemonic)) + .addReg(newval) + .addReg(ptrA) + .addReg(ptrB); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR0) + .addMBB(loop1MBB); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + return exitMBB; +} + +/// Emit software-emulated atomic compare-and-swap for I8/I16 without +/// hardware partword atomic support. +/// +/// This emulates byte/halfword atomic operations using word (32-bit) atomic +/// instructions. Since PowerPC atomic instructions work at word granularity, +/// we must: +/// 1. Align the pointer to a word boundary +/// 2. Calculate the bit shift for the target byte/halfword within the word +/// 3. Create masks to isolate the target byte/halfword +/// 4. Shift old/new values into the correct bit position +/// 5. Use LWARX/STWCX on the full word +/// 6. Mask and merge to preserve other bytes in the word +/// 7. 
Extract and shift the result back +/// +/// Control flow: +/// thisMBB -> loop1MBB -> loop2MBB -> exitMBB +/// | | +/// +------------+ +/// +/// loop1MBB: +/// - LWARX: Load-and-reserve full word +/// - Mask to extract target byte/halfword +/// - Compare with expected old value +/// - Branch to exitMBB if not equal (CAS failed) +/// loop2MBB: +/// - Merge new value with other bytes in the word +/// - STWCX: Store-conditional full word +/// - Branch back to loop1MBB if store failed (retry) +/// - Fall through to exitMBB on success +/// exitMBB: +/// - Extract and return the loaded value +static MachineBasicBlock * +emitAtomicCmpSwapSoftware(MachineInstr &MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII, + const PPCSubtarget &Subtarget) { + MachineFunction *F = BB->getParent(); + MachineFunction::iterator It = ++BB->getIterator(); + + bool is64bit = Subtarget.isPPC64(); + bool isLittleEndian = Subtarget.isLittleEndian(); + bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; - // sinkMBB: - // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] - // ... - BB = sinkMBB; - BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) - .addReg(MI.getOperand(3).getReg()) - .addMBB(copy0MBB) - .addReg(MI.getOperand(2).getReg()) - .addMBB(thisMBB); - } else if (MI.getOpcode() == PPC::ReadTB) { + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register oldval = MI.getOperand(3).getReg(); + Register newval = MI.getOperand(4).getReg(); + DebugLoc dl = MI.getDebugLoc(); + + MachineBasicBlock *loop1MBB, *loop2MBB, *exitMBB; + createAtomicLoopBlocks(F, BB, loop1MBB, loop2MBB, exitMBB, MI, It); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + const TargetRegisterClass *RC = + is64bit ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + // Lambda to create virtual registers + auto createVReg = [&](const TargetRegisterClass *RC) { + return RegInfo.createVirtualRegister(RC); + }; + + Register PtrReg = createVReg(RC); + Register Shift1Reg = createVReg(GPRC); + Register ShiftReg = isLittleEndian ? Shift1Reg : createVReg(GPRC); + Register NewVal2Reg = createVReg(GPRC); + Register NewVal3Reg = createVReg(GPRC); + Register OldVal2Reg = createVReg(GPRC); + Register OldVal3Reg = createVReg(GPRC); + Register MaskReg = createVReg(GPRC); + Register Mask2Reg = createVReg(GPRC); + Register Mask3Reg = createVReg(GPRC); + Register Tmp2Reg = createVReg(GPRC); + Register Tmp4Reg = createVReg(GPRC); + Register TmpDestReg = createVReg(GPRC); + Register TmpReg = createVReg(GPRC); + Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + Register CrReg = createVReg(&PPC::CRRCRegClass); + + // Compute aligned pointer and shift amount + Register Ptr1Reg; + if (ptrA != ZeroReg) { + Ptr1Reg = createVReg(RC); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) + .addReg(ptrA) + .addReg(ptrB); + } else { + Ptr1Reg = ptrB; + } + + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg) + .addReg(Ptr1Reg, {}, is64bit ? PPC::sub_32 : 0) + .addImm(3) + .addImm(27) + .addImm(is8bit ? 28 : 27); + if (!isLittleEndian) + BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg) + .addReg(Shift1Reg) + .addImm(is8bit ? 
24 : 16); + if (is64bit) + BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) + .addReg(Ptr1Reg) + .addImm(0) + .addImm(61); + else + BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) + .addReg(Ptr1Reg) + .addImm(0) + .addImm(0) + .addImm(29); + + // Prepare masked values + BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) + .addReg(newval) + .addReg(ShiftReg); + BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) + .addReg(oldval) + .addReg(ShiftReg); + if (is8bit) + BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); + else { + BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); + BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) + .addReg(Mask3Reg) + .addImm(65535); + } + BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) + .addReg(Mask2Reg) + .addReg(ShiftReg); + BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) + .addReg(NewVal2Reg) + .addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) + .addReg(OldVal2Reg) + .addReg(MaskReg); + + // loop1MBB: + // lwarx tmpDest, ptr + // and tmp, tmpDest, mask + // cmpw tmp, oldval3 + // bne- exitBB + BB = loop1MBB; + BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) + .addReg(ZeroReg) + .addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::AND), TmpReg) + .addReg(TmpDestReg) + .addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::CMPW), CrReg).addReg(TmpReg).addReg(OldVal3Reg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(CrReg) + .addMBB(exitMBB); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(exitMBB); + + // loop2MBB: + // andc tmp2, tmpDest, mask + // or tmp4, tmp2, newval3 + // stwcx. 
tmp4, ptr + // bne- loop1MBB + // b exitBB + BB = loop2MBB; + BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) + .addReg(TmpDestReg) + .addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg) + .addReg(Tmp2Reg) + .addReg(NewVal3Reg); + BuildMI(BB, dl, TII->get(PPC::STWCX)) + .addReg(Tmp4Reg) + .addReg(ZeroReg) + .addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(PPC::CR0) + .addMBB(loop1MBB); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // srw dest, tmpDest, shift + BB = exitMBB; + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest) + .addReg(TmpReg) + .addReg(ShiftReg); + + return BB; +} + +MachineBasicBlock * +PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + + // To "insert" these instructions we actually have to insert their + // control-flow patterns. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + MachineFunction *F = BB->getParent(); + MachineRegisterInfo &MRI = F->getRegInfo(); + + // Handle SELECT with ISEL support first (before generic SELECT handling) + if (IsSelect(MI.getOpcode())) + return emitSelect(MI, BB, TII, Subtarget); + + switch (MI.getOpcode()) { + case TargetOpcode::STACKMAP: + return emitPatchPoint(MI, BB); + case TargetOpcode::PATCHPOINT: + // Call lowering should have added an r2 operand to indicate a dependence + // on the TOC base pointer value. It can't however, because there is no + // way to mark the dependence as implicit there, and so the stackmap code + // will confuse it with a regular operand. Instead, add the dependence + // here. 
+ if (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) + MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); + return emitPatchPoint(MI, BB); + + case PPC::EH_SjLj_SetJmp32: + case PPC::EH_SjLj_SetJmp64: + return emitEHSjLjSetJmp(MI, BB); + + case PPC::EH_SjLj_LongJmp32: + case PPC::EH_SjLj_LongJmp64: + return emitEHSjLjLongJmp(MI, BB); + + case PPC::ReadTB: { // To read the 64-bit time-base register on a 32-bit target, we read the // two halves. Should the counter have wrapped while it was being read, we // need to try again. @@ -14059,351 +14384,123 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB->addSuccessor(readMBB); BB->addSuccessor(sinkMBB); - } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_NOWP) + break; + } + case PPC::ATOMIC_LOAD_ADD_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, PPC::ADD4); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD) + break; + case PPC::ATOMIC_LOAD_ADD: BB = EmitAtomicBinary(MI, BB, PPC::ADD4); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) + break; + case PPC::ATOMIC_LOAD_ADD_I64: BB = EmitAtomicBinary(MI, BB, PPC::ADD8); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_NOWP) + break; + case PPC::ATOMIC_LOAD_AND_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, PPC::AND); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND) + break; + case PPC::ATOMIC_LOAD_AND: BB = EmitAtomicBinary(MI, BB, PPC::AND); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) + break; + case PPC::ATOMIC_LOAD_AND_I64: BB = EmitAtomicBinary(MI, BB, PPC::AND8); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_NOWP) + break; + case PPC::ATOMIC_LOAD_OR_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, PPC::OR); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR) + break; + case PPC::ATOMIC_LOAD_OR: BB = EmitAtomicBinary(MI, BB, PPC::OR); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) + break; + case PPC::ATOMIC_LOAD_OR_I64: BB = EmitAtomicBinary(MI, BB, PPC::OR8); - - else if (MI.getOpcode() == 
PPC::ATOMIC_LOAD_XOR_NOWP) + break; + case PPC::ATOMIC_LOAD_XOR_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, PPC::XOR); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR) + break; + case PPC::ATOMIC_LOAD_XOR: BB = EmitAtomicBinary(MI, BB, PPC::XOR); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) + break; + case PPC::ATOMIC_LOAD_XOR_I64: BB = EmitAtomicBinary(MI, BB, PPC::XOR8); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_NOWP) + break; + case PPC::ATOMIC_LOAD_NAND_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, PPC::NAND); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND) + break; + case PPC::ATOMIC_LOAD_NAND: BB = EmitAtomicBinary(MI, BB, PPC::NAND); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) + break; + case PPC::ATOMIC_LOAD_NAND_I64: BB = EmitAtomicBinary(MI, BB, PPC::NAND8); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_NOWP) + break; + case PPC::ATOMIC_LOAD_SUB_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, PPC::SUBF); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB) + break; + case PPC::ATOMIC_LOAD_SUB: BB = EmitAtomicBinary(MI, BB, PPC::SUBF); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) + break; + case PPC::ATOMIC_LOAD_SUB_I64: BB = EmitAtomicBinary(MI, BB, PPC::SUBF8); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_NOWP) + break; + case PPC::ATOMIC_LOAD_MIN_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, 0, PPC::CMPW, PPC::PRED_LT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN) + break; + case PPC::ATOMIC_LOAD_MIN: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPW, PPC::PRED_LT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) + break; + case PPC::ATOMIC_LOAD_MIN_I64: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPD, PPC::PRED_LT); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_NOWP) + break; + case PPC::ATOMIC_LOAD_MAX_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, 0, PPC::CMPW, PPC::PRED_GT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX) + break; + case PPC::ATOMIC_LOAD_MAX: BB = EmitAtomicBinary(MI, BB, 0, 
PPC::CMPW, PPC::PRED_GT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) + break; + case PPC::ATOMIC_LOAD_MAX_I64: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPD, PPC::PRED_GT); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_NOWP) + break; + case PPC::ATOMIC_LOAD_UMIN_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, 0, PPC::CMPLW, PPC::PRED_LT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN) + break; + case PPC::ATOMIC_LOAD_UMIN: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPLW, PPC::PRED_LT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) + break; + case PPC::ATOMIC_LOAD_UMIN_I64: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPLD, PPC::PRED_LT); - - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_NOWP) + break; + case PPC::ATOMIC_LOAD_UMAX_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, 0, PPC::CMPLW, PPC::PRED_GT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX) + break; + case PPC::ATOMIC_LOAD_UMAX: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPLW, PPC::PRED_GT); - else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) + break; + case PPC::ATOMIC_LOAD_UMAX_I64: BB = EmitAtomicBinary(MI, BB, 0, PPC::CMPLD, PPC::PRED_GT); - - else if (MI.getOpcode() == PPC::ATOMIC_SWAP_NOWP) + break; + case PPC::ATOMIC_SWAP_NOWP: BB = EmitPartwordAtomicBinary(MI, BB, 0); - else if (MI.getOpcode() == PPC::ATOMIC_SWAP || - MI.getOpcode() == PPC::ATOMIC_SWAP_I64) + break; + case PPC::ATOMIC_SWAP: + case PPC::ATOMIC_SWAP_I64: BB = EmitAtomicBinary(MI, BB, 0); - else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || - MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || - (Subtarget.hasPartwordAtomics() && - MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || - (Subtarget.hasPartwordAtomics() && - MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { - bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; - - auto LoadMnemonic = PPC::LDARX; - auto StoreMnemonic = PPC::STDCX; - switch (MI.getOpcode()) { - default: - llvm_unreachable("Compare and swap of unknown size"); - case 
PPC::ATOMIC_CMP_SWAP_I8: - LoadMnemonic = PPC::LBARX; - StoreMnemonic = PPC::STBCX; - assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); - break; - case PPC::ATOMIC_CMP_SWAP_I16: - LoadMnemonic = PPC::LHARX; - StoreMnemonic = PPC::STHCX; - assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); - break; - case PPC::ATOMIC_CMP_SWAP_I32: - LoadMnemonic = PPC::LWARX; - StoreMnemonic = PPC::STWCX; - break; - case PPC::ATOMIC_CMP_SWAP_I64: - LoadMnemonic = PPC::LDARX; - StoreMnemonic = PPC::STDCX; - break; - } - MachineRegisterInfo &RegInfo = F->getRegInfo(); - Register dest = MI.getOperand(0).getReg(); - Register ptrA = MI.getOperand(1).getReg(); - Register ptrB = MI.getOperand(2).getReg(); - Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); - Register oldval = MI.getOperand(3).getReg(); - Register newval = MI.getOperand(4).getReg(); - DebugLoc dl = MI.getDebugLoc(); - - MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, loop1MBB); - F->insert(It, loop2MBB); - F->insert(It, exitMBB); - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - // thisMBB: - // ... - // fallthrough --> loopMBB - BB->addSuccessor(loop1MBB); - - // loop1MBB: - // l[bhwd]arx dest, ptr - // cmp[wd] dest, oldval - // bne- exitBB - // loop2MBB: - // st[bhwd]cx. newval, ptr - // bne- loopMBB - // b exitBB - // exitBB: - BB = loop1MBB; - BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB); - BuildMI(BB, dl, TII->get(is64bit ? 
PPC::CMPD : PPC::CMPW), CrReg) - .addReg(dest) - .addReg(oldval); - BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE_MINUS) - .addReg(CrReg) - .addMBB(exitMBB); - BB->addSuccessor(loop2MBB); - BB->addSuccessor(exitMBB); - - BB = loop2MBB; - BuildMI(BB, dl, TII->get(StoreMnemonic)) - .addReg(newval) - .addReg(ptrA) - .addReg(ptrB); - BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE_MINUS) - .addReg(PPC::CR0) - .addMBB(loop1MBB); - BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); - BB->addSuccessor(loop1MBB); - BB->addSuccessor(exitMBB); - - // exitMBB: - // ... - BB = exitMBB; - } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || - MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { - // We must use 64-bit registers for addresses when targeting 64-bit, - // since we're actually doing arithmetic on them. Other registers - // can be 32-bit. - bool is64bit = Subtarget.isPPC64(); - bool isLittleEndian = Subtarget.isLittleEndian(); - bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; - - Register dest = MI.getOperand(0).getReg(); - Register ptrA = MI.getOperand(1).getReg(); - Register ptrB = MI.getOperand(2).getReg(); - Register oldval = MI.getOperand(3).getReg(); - Register newval = MI.getOperand(4).getReg(); - DebugLoc dl = MI.getDebugLoc(); - - MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, loop1MBB); - F->insert(It, loop2MBB); - F->insert(It, exitMBB); - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - MachineRegisterInfo &RegInfo = F->getRegInfo(); - const TargetRegisterClass *RC = - is64bit ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - - Register PtrReg = RegInfo.createVirtualRegister(RC); - Register Shift1Reg = RegInfo.createVirtualRegister(GPRC); - Register ShiftReg = - isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); - Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC); - Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC); - Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC); - Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC); - Register MaskReg = RegInfo.createVirtualRegister(GPRC); - Register Mask2Reg = RegInfo.createVirtualRegister(GPRC); - Register Mask3Reg = RegInfo.createVirtualRegister(GPRC); - Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC); - Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC); - Register TmpDestReg = RegInfo.createVirtualRegister(GPRC); - Register Ptr1Reg; - Register TmpReg = RegInfo.createVirtualRegister(GPRC); - Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; - Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); - // thisMBB: - // ... - // fallthrough --> loopMBB - BB->addSuccessor(loop1MBB); - - // The 4-byte load must be aligned, while a char or short may be - // anywhere in the word. Hence all this nasty bookkeeping code. - // add ptr1, ptrA, ptrB [copy if ptrA==0] - // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] - // xori shift, shift1, 24 [16] - // rlwinm ptr, ptr1, 0, 0, 29 - // slw newval2, newval, shift - // slw oldval2, oldval,shift - // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] - // slw mask, mask2, shift - // and newval3, newval2, mask - // and oldval3, oldval2, mask - // loop1MBB: - // lwarx tmpDest, ptr - // and tmp, tmpDest, mask - // cmpw tmp, oldval3 - // bne- exitBB - // loop2MBB: - // andc tmp2, tmpDest, mask - // or tmp4, tmp2, newval3 - // stwcx. 
tmp4, ptr - // bne- loop1MBB - // b exitBB - // exitBB: - // srw dest, tmpDest, shift - if (ptrA != ZeroReg) { - Ptr1Reg = RegInfo.createVirtualRegister(RC); - BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) - .addReg(ptrA) - .addReg(ptrB); - } else { - Ptr1Reg = ptrB; - } - - // We need use 32-bit subregister to avoid mismatch register class in 64-bit - // mode. - BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg) - .addReg(Ptr1Reg, {}, is64bit ? PPC::sub_32 : 0) - .addImm(3) - .addImm(27) - .addImm(is8bit ? 28 : 27); - if (!isLittleEndian) - BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg) - .addReg(Shift1Reg) - .addImm(is8bit ? 24 : 16); - if (is64bit) - BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) - .addReg(Ptr1Reg) - .addImm(0) - .addImm(61); + break; + case PPC::ATOMIC_CMP_SWAP_I32: + case PPC::ATOMIC_CMP_SWAP_I64: + case PPC::ATOMIC_CMP_SWAP_I8: + case PPC::ATOMIC_CMP_SWAP_I16: { + // Use hardware-supported atomic operations if available + bool useHardware = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || + MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || + (Subtarget.hasPartwordAtomics() && + (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || + MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)); + + if (useHardware) + BB = emitAtomicCmpSwapHardware(MI, BB, TII, Subtarget); else - BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) - .addReg(Ptr1Reg) - .addImm(0) - .addImm(0) - .addImm(29); - BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) - .addReg(newval) - .addReg(ShiftReg); - BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) - .addReg(oldval) - .addReg(ShiftReg); - if (is8bit) - BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); - else { - BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); - BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) - .addReg(Mask3Reg) - .addImm(65535); - } - BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) - .addReg(Mask2Reg) - .addReg(ShiftReg); - BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) - .addReg(NewVal2Reg) - 
.addReg(MaskReg); - BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) - .addReg(OldVal2Reg) - .addReg(MaskReg); - - BB = loop1MBB; - BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(ZeroReg) - .addReg(PtrReg); - BuildMI(BB, dl, TII->get(PPC::AND), TmpReg) - .addReg(TmpDestReg) - .addReg(MaskReg); - BuildMI(BB, dl, TII->get(PPC::CMPW), CrReg) - .addReg(TmpReg) - .addReg(OldVal3Reg); - BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) - .addReg(CrReg) - .addMBB(exitMBB); - BB->addSuccessor(loop2MBB); - BB->addSuccessor(exitMBB); - - BB = loop2MBB; - BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) - .addReg(TmpDestReg) - .addReg(MaskReg); - BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg) - .addReg(Tmp2Reg) - .addReg(NewVal3Reg); - BuildMI(BB, dl, TII->get(PPC::STWCX)) - .addReg(Tmp4Reg) - .addReg(ZeroReg) - .addReg(PtrReg); - BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) - .addReg(PPC::CR0) - .addMBB(loop1MBB); - BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); - BB->addSuccessor(loop1MBB); - BB->addSuccessor(exitMBB); - - // exitMBB: - // ... - BB = exitMBB; - BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest) - .addReg(TmpReg) - .addReg(ShiftReg); - } else if (MI.getOpcode() == PPC::FADDrtz) { + BB = emitAtomicCmpSwapSoftware(MI, BB, TII, Subtarget); + break; + } + case PPC::FADDrtz: { // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR // is not modeled at the SelectionDAG level. @@ -14436,10 +14533,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Restore FPSCR value. 
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); - } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT || - MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT || - MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 || - MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) { + break; + } + case PPC::ANDI_rec_1_EQ_BIT: + case PPC::ANDI_rec_1_GT_BIT: + case PPC::ANDI_rec_1_EQ_BIT8: + case PPC::ANDI_rec_1_GT_BIT8: { unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) ? PPC::ANDI8_rec @@ -14458,7 +14557,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT); - } else if (MI.getOpcode() == PPC::TCHECK_RET) { + break; + } + case PPC::TCHECK_RET: { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); @@ -14466,14 +14567,18 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(CRReg); - } else if (MI.getOpcode() == PPC::TBEGIN_RET) { + break; + } + case PPC::TBEGIN_RET: { DebugLoc Dl = MI.getDebugLoc(); unsigned Imm = MI.getOperand(1).getImm(); BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm); BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(PPC::CR0EQ); - } else if (MI.getOpcode() == PPC::SETRNDi) { + break; + } + case PPC::SETRNDi: { DebugLoc dl = MI.getDebugLoc(); Register OldFPSCRReg = MI.getOperand(0).getReg(); @@ -14500,7 +14605,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? 
PPC::MTFSB1 : PPC::MTFSB0)) .addImm(30) .addReg(PPC::RM, RegState::ImplicitDefine); - } else if (MI.getOpcode() == PPC::SETRND) { + break; + } + case PPC::SETRND: { DebugLoc dl = MI.getDebugLoc(); // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg @@ -14609,7 +14716,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewFPSCRReg) .addImm(0) .addImm(0); - } else if (MI.getOpcode() == PPC::SETFLM) { + break; + } + case PPC::SETFLM: { DebugLoc Dl = MI.getDebugLoc(); // Result of setflm is previous FPSCR content, so we need to save it first. @@ -14626,10 +14735,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewFPSCRReg) .addImm(0) .addImm(0); - } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 || - MI.getOpcode() == PPC::PROBED_ALLOCA_64) { + break; + } + case PPC::PROBED_ALLOCA_32: + case PPC::PROBED_ALLOCA_64: return emitProbedAlloca(MI, BB); - } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) { + + case PPC::SPLIT_QUADWORD: { DebugLoc DL = MI.getDebugLoc(); Register Src = MI.getOperand(2).getReg(); Register Lo = MI.getOperand(0).getReg(); @@ -14640,8 +14752,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY)) .addDef(Hi) .addUse(Src, {}, PPC::sub_gp8_x0); - } else if (MI.getOpcode() == PPC::LQX_PSEUDO || - MI.getOpcode() == PPC::STQX_PSEUDO) { + break; + } + case PPC::LQX_PSEUDO: + case PPC::STQX_PSEUDO: { DebugLoc DL = MI.getDebugLoc(); // Ptr is used as the ptr_rc_no_r0 part // of LQ/STQ's memory operand and adding result of RA and RB, @@ -14658,8 +14772,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(Val, getDefRegState(MI.getOpcode() == PPC::LQX_PSEUDO)) .addImm(0) .addReg(Ptr); - } else if (MI.getOpcode() == PPC::LWAT_PSEUDO || - MI.getOpcode() == PPC::LDAT_PSEUDO) { + break; + } + case PPC::LWAT_PSEUDO: + case PPC::LDAT_PSEUDO: { DebugLoc DL = MI.getDebugLoc(); Register DstReg = 
MI.getOperand(0).getReg(); Register PtrReg = MI.getOperand(1).getReg(); @@ -14697,8 +14813,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, else BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) .addReg(Result64); - } else if (MI.getOpcode() == PPC::LWAT_COND_PSEUDO || - MI.getOpcode() == PPC::LDAT_COND_PSEUDO) { + break; + } + case PPC::LWAT_COND_PSEUDO: + case PPC::LDAT_COND_PSEUDO: { DebugLoc DL = MI.getDebugLoc(); Register DstReg = MI.getOperand(0).getReg(); Register PtrReg = MI.getOperand(1).getReg(); @@ -14723,7 +14841,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, else BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) .addReg(Result64); - } else { + break; + } + default: llvm_unreachable("Unexpected instr type to insert"); } From e6efa1a4c9f6c4aba5dd224025a18a9c839b763c Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> Date: Fri, 8 May 2026 19:35:02 +0200 Subject: [PATCH 076/538] [AMDGPU] Pre-commit unit test for RP tracking `reset`/`advance` inconsistencies fix (#196098) This adds a new AMDGPU unit test file for testing the behavior of `GCNRPTracker` and its related classes. The two test showcase confusing return value and behavioral semantics for variants of the advance and reset functions, which will be clarified in a follow up commit. 
--- llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 + .../Target/AMDGPU/GCNRegPressureTest.cpp | 156 ++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt index 3203738cc1c69..bbd426f56342c 100644 --- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt +++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_target_unittest(AMDGPUTests CSETest.cpp DwarfRegMappings.cpp ExecMayBeModifiedBeforeAnyUse.cpp + GCNRegPressureTest.cpp LiveRegUnits.cpp PALMetadata.cpp UniformityAnalysisTest.cpp diff --git a/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp b/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp new file mode 100644 index 0000000000000..ad84f4df65288 --- /dev/null +++ b/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp @@ -0,0 +1,156 @@ +//===- GCNRegPressureTest.cpp -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "GCNRegPressure.h" +#include "AMDGPUUnitTests.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MIRParser/MIRParser.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Passes/PassBuilder.h" +#include "gtest/gtest.h" + +using namespace llvm; + +class GCNRegPressureTest : public llvm::CodeGenTestBase { +public: + void SetUp() override { setUpImpl("amdgcn--", "gfx908", ""); } +}; + +TEST_F(GCNRegPressureTest, DownwardTrackerEndOnDbgVal) { + StringRef MIR = R"( +name: DownwardTrackerEndOnDbgVal +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + + bb.1: + DBG_VALUE %0 + DBG_VALUE %1 + %2:vgpr_32 = IMPLICIT_DEF + + bb.3: + S_NOP 0, implicit %0, implicit %1, implicit %2 + S_ENDPGM 0 +... +)"; + EXPECT_TRUE(parseMIR(MIR)); + MachineFunction &MF = getMF("DownwardTrackerEndOnDbgVal"); + const LiveIntervals &LIS = MFAM.getResult(MF); + + // MBB1 live-in pressure is equivalent to MBB0 live-out pressure. + MachineBasicBlock &MBB0 = *MF.getBlockNumbered(0); + MachineBasicBlock &MBB1 = *MF.getBlockNumbered(1); + GCNRPTracker::LiveRegSet MBB1LiveIns = + getLiveRegs(LIS.getInstructionIndex(*MBB0.rbegin()).getDeadSlot(), LIS, + MF.getRegInfo()); + + // Track pressure across MBB1. + { + GCNDownwardRPTracker RPTracker(LIS), RPTrackerNoLiveIns(LIS); + + // There is a non-debug instruction in bb.1 (%2's def), so advance should + // return true. 
+ EXPECT_TRUE(RPTracker.advance(MBB1.begin(), MBB1.end(), &MBB1LiveIns)); + EXPECT_TRUE(RPTrackerNoLiveIns.advance(MBB1.begin(), MBB1.end(), nullptr)); + + // When advance returns true, maximum pressure should be the pressured + // induced by the block's live-ins plus %2's def i.e., 3 VGPRs. + EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 3U); + EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 3U); + } + + // Track pressure just across the first debug value of bb.1. + { + MachineBasicBlock::iterator Dbg1 = std::next(MBB1.begin()); + GCNDownwardRPTracker RPTracker(LIS), RPTrackerNoLiveIns(LIS); + + // The following unpacks a call to + // advance(*MBB1.begin(), Dbg1, [MBB1LiveIns|nullptr]) + // which would return false in this case. + // + // There aren't any non-debug instruction between the beginning of bb1 and + // Dbg1 (exclusive). However, the call to reset takes the end of the MBB as + // the limit, so it pushes the beginning of the block up to %2's def and + // considers the reset successful. + EXPECT_TRUE(RPTracker.reset(*MBB1.begin(), &MBB1LiveIns)); + EXPECT_TRUE(RPTrackerNoLiveIns.reset(*MBB1.begin(), nullptr)); + // advance then unnecessarily processes instructions in order until the end + // of the block, even though it is already past Dbg1. It still returns false + // because it is stopped by the end of block delimiter, not the end + // iterator. + EXPECT_FALSE(RPTracker.advance(Dbg1)); + EXPECT_FALSE(RPTrackerNoLiveIns.advance(Dbg1)); + + // In that case, the maximum pressure is also the pressure induced by the + // block's live-ins plus %2's def i.e., 3 VGPRs. This is confusing because + // %2's def is outside the [Begin,End) range we passed to advance, and there + // is no indication that a false return value should make the tracked + // pressure invalid. 
+ EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 3U); + EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 3U); + } +} + +TEST_F(GCNRegPressureTest, DownwardTrackerAllDbgVal) { + StringRef MIR = R"( +name: DownwardTrackerAllDbgVal +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + %0:vgpr_32 = IMPLICIT_DEF + + bb.1: + DBG_VALUE %0 + + bb.2: + S_NOP 0, implicit %0 + S_ENDPGM 0 +... +)"; + EXPECT_TRUE(parseMIR(MIR)); + MachineFunction &MF = getMF("DownwardTrackerAllDbgVal"); + const LiveIntervals &LIS = MFAM.getResult(MF); + + // MBB1 live-in pressure is equivalent to MBB0 live-out pressure. + MachineBasicBlock &MBB0 = *MF.getBlockNumbered(0); + GCNRPTracker::LiveRegSet MBB1LiveIns = + getLiveRegs(LIS.getInstructionIndex(*MBB0.rbegin()).getDeadSlot(), LIS, + MF.getRegInfo()); + + MachineBasicBlock &MBB1 = *MF.getBlockNumbered(1); + GCNDownwardRPTracker RPTracker(LIS), RPTrackerNoLiveIns(LIS); + + // The following unpacks a call to + // advance(MBB1.begin(), MBB1.end(), [MBB1LiveIns|nullptr]) + // which would return true in this case. + // + // There aren't any non-debug instruction in bb.2, the reset is therefore + // unsuccessful. However the advance caller discards that return value and + // proceeds to calling its override. + EXPECT_FALSE(RPTracker.reset(*MBB1.begin(), &MBB1LiveIns)); + EXPECT_FALSE(RPTrackerNoLiveIns.reset(*MBB1.begin(), nullptr)); + // advance then produces true even though no advancement actually happened. + EXPECT_TRUE(RPTracker.advance(MBB1.end())); + EXPECT_TRUE(RPTrackerNoLiveIns.advance(MBB1.end())); + + // In that case, the maximum pressure is unchanged from the beginning since + // reset was unsuccessful. This is confusing because the top-level advance + // call produced true, yet the block's live-in pressure was not considered. 
+ EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 0U); + EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 0U); +} From edd7810ba1372a81eae2c2af290d98fb194c5b5f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 May 2026 13:41:53 -0400 Subject: [PATCH 077/538] Revert "[SLP] Vectorize struct-returning intrinsics" This reverts commit b0c6df7b95b3c70d78c65a39598007f722794d38 to fix buildbots https://lab.llvm.org/buildbot/#/builders/52/builds/17118 Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/196591 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 625 +++------ .../SLPVectorizer/X86/arith-add-saddo.ll | 1064 ++++++++------- .../SLPVectorizer/X86/arith-add-uaddo.ll | 1064 ++++++++------- .../SLPVectorizer/X86/arith-mul-smulo.ll | 1164 +++++++++-------- .../SLPVectorizer/X86/arith-mul-umulo.ll | 1044 +++++++++------ .../SLPVectorizer/X86/arith-sub-ssubo.ll | 1064 ++++++++------- .../SLPVectorizer/X86/arith-sub-usubo.ll | 1064 ++++++++------- ...revec-non-power-2-to-power-2-large-vect.ll | 8 +- llvm/test/Transforms/SLPVectorizer/sincos.ll | 52 +- .../SLPVectorizer/struct-return-revec.ll | 28 +- 10 files changed, 3912 insertions(+), 3265 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a508b1d1f744f..f1a6eb2d7e8af 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -28,7 +28,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" @@ -72,7 +71,6 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/IR/VectorTypeUtils.h" #ifdef EXPENSIVE_CHECKS #include "llvm/IR/Verifier.h" #endif @@ -302,10 +300,10 @@ static const unsigned MaxPHINumOperands 
= 128; /// be inevitably scalarized. static bool isValidElementType(Type *Ty) { // TODO: Support ScalableVectorType. - if (SLPReVec && isVectorizedTy(Ty)) - Ty = toScalarizedTy(Ty); - return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() && - !Ty->isVoidTy(); + if (SLPReVec && isa(Ty)) + Ty = Ty->getScalarType(); + return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && + !Ty->isPPC_FP128Ty(); } /// Returns the "element type" of the given value/instruction \p V. @@ -330,33 +328,15 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) { static unsigned getNumElements(Type *Ty) { assert(!isa(Ty) && "ScalableVectorType is not supported."); - if (isVectorizedTy(Ty)) - return getVectorizedTypeVF(Ty).getFixedValue(); + if (auto *VecTy = dyn_cast(Ty)) + return VecTy->getNumElements(); return 1; } /// \returns the vector type of ScalarTy based on vectorization factor. -static Type *getWidenedType(Type *ScalarTy, unsigned VF) { - if (VF == 1 && !isVectorizedTy(ScalarTy)) { - // Workaround for 1 x vector types: toVectorizedTy returns the type - // unchanged when EC is scalar, but BoUpSLP relies on widening to - // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the - // pipeline operating on vector types. 
- if (auto *StructTy = dyn_cast(ScalarTy)) { - assert(isUnpackedStructLiteral(StructTy) && - "expected unpacked struct literal"); - assert(all_of(StructTy->elements(), VectorType::isValidElementType) && - "expected all element types to be valid vector element types"); - return StructType::get( - StructTy->getContext(), - map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * { - return FixedVectorType::get(ElTy, 1); - })); - } - return FixedVectorType::get(ScalarTy, 1); - } - return toVectorizedTy(toScalarizedTy(ScalarTy), - ElementCount::getFixed(VF * getNumElements(ScalarTy))); +static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { + return FixedVectorType::get(ScalarTy->getScalarType(), + VF * getNumElements(ScalarTy)); } /// Returns the number of elements of the given type \p Ty, not less than \p Sz, @@ -364,7 +344,7 @@ static Type *getWidenedType(Type *ScalarTy, unsigned VF) { /// legalization. static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz) { - if (!isValidElementType(Ty) || isa(Ty)) + if (!isValidElementType(Ty)) return bit_ceil(Sz); // Find the number of elements, which forms full vectors. const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); @@ -379,7 +359,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz) { - if (!isValidElementType(Ty) || isa(Ty)) + if (!isValidElementType(Ty)) return bit_floor(Sz); // Find the number of elements, which forms full vectors. 
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); @@ -2059,8 +2039,6 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, return false; if (has_single_bit(Sz)) return true; - if (isa(Ty)) - return false; const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) && Sz % NumParts == 0; @@ -2070,20 +2048,19 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, /// phase. If the type is going to be scalarized or does not uses whole /// registers, returns 1. static unsigned -getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy, +getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, + Type *ScalarTy, const unsigned Limit = std::numeric_limits::max()) { - if (isa(VecTy)) - return 1; unsigned NumParts = TTI.getNumberOfParts(VecTy); if (NumParts == 0 || NumParts >= Limit) return 1; unsigned Sz = getNumElements(VecTy); unsigned ScalarSz = getNumElements(ScalarTy); - Type *ElementTy = toScalarizedTy(VecTy); - unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz); + unsigned PWSz = + getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz); if (NumParts >= Sz || PWSz % NumParts != 0 || (PWSz / NumParts) % ScalarSz != 0 || - !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts)) + !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts)) return 1; const unsigned NumElts = PWSz / NumParts; if (divideCeil(Sz, NumElts) != NumParts) @@ -2232,14 +2209,14 @@ class slpvectorizer::BoUpSLP { ReductionBitWidth >= DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) - return cast( - getWidenedType(VectorizableTree.front()->Scalars.front()->getType(), - VectorizableTree.front()->getVectorFactor())); - return cast(getWidenedType( + return getWidenedType( + VectorizableTree.front()->Scalars.front()->getType(), + 
VectorizableTree.front()->getVectorFactor()); + return getWidenedType( IntegerType::get( VectorizableTree.front()->Scalars.front()->getContext(), ReductionBitWidth), - VectorizableTree.front()->getVectorFactor())); + VectorizableTree.front()->getVectorFactor()); } /// Returns true if the tree results in one of the reduced bitcasts variants. @@ -4012,7 +3989,8 @@ class slpvectorizer::BoUpSLP { /// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself /// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs. InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy, - Type *VecTy, Type *FinalVecTy, + VectorType *VecTy, + VectorType *FinalVecTy, TTI::TargetCostKind CostKind) const; /// This is the recursive part of buildTree. @@ -7129,12 +7107,12 @@ static InstructionCost getExtractWithExtendCost( const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { - if (isVectorizedTy(Dst)) { + if (auto *ScalarTy = dyn_cast(Dst)) { assert(SLPReVec && "Only supported by REVEC."); - auto *SubTp = cast( - getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst))); + auto *SubTp = + getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements()); return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind, - Index * getNumElements(Dst), SubTp) + + Index * ScalarTy->getNumElements(), SubTp) + TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None, CostKind); } @@ -7227,7 +7205,7 @@ static bool isMaskedLoadCompress( InterleaveFactor = 0; Type *ScalarTy = VL.front()->getType(); const size_t Sz = VL.size(); - auto *VecTy = cast(getWidenedType(ScalarTy, Sz)); + auto *VecTy = getWidenedType(ScalarTy, Sz); constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; SmallVector Mask; if (!Order.empty()) @@ -7263,7 +7241,7 @@ static bool isMaskedLoadCompress( // Check for very large distances between 
elements. if (*Diff / Sz >= MaxRegSize / 8) return false; - LoadVecTy = cast(getWidenedType(ScalarTy, *Diff + 1)); + LoadVecTy = getWidenedType(ScalarTy, *Diff + 1); auto *LI = cast(Order.empty() ? VL.front() : VL[Order.front()]); Align CommonAlignment = LI->getAlign(); IsMasked = !isSafeToLoadUnconditionally( @@ -7312,8 +7290,8 @@ static bool isMaskedLoadCompress( } if (IsStrided && !IsMasked && Order.empty()) { // Check for potential segmented(interleaved) loads. - VectorType *AlignedLoadVecTy = cast(getWidenedType( - ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1))); + VectorType *AlignedLoadVecTy = getWidenedType( + ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)); if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment, DL, cast(VL.back()), &AC, &DT, &TLI)) @@ -7504,7 +7482,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate( Type *StrideTy = DL->getIndexType(Ptr0->getType()); SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal); - SPtrInfo.Ty = cast(getWidenedType(NewScalarTy, VecSz)); + SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz); return true; } @@ -7559,8 +7537,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef PointerOps, NewScalarTy = Type::getIntNTy( SE->getContext(), DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets); - auto *StridedLoadTy = - cast(getWidenedType(NewScalarTy, VecSz)); + FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz); unsigned MinProfitableStridedOps = IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores; const unsigned BaseTyNumElts = getNumElements(BaseTy); @@ -7759,7 +7736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // Check the order of pointer operands or that all pointers are the same. 
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); - auto *VecTy = cast(getWidenedType(ScalarTy, Sz)); + auto *VecTy = getWidenedType(ScalarTy, Sz); Align CommonAlignment = computeCommonAlignment(VL); // Cache masked gather legality - both the !IsSorted path below and the // post-branch check use the same VecTy/CommonAlignment, and the underlying @@ -7840,7 +7817,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // estimate as a buildvector, otherwise estimate as splat. APInt DemandedElts = APInt::getAllOnes(Sz); Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType(); - auto *PtrVecTy = cast(getWidenedType(PtrScalarTy, Sz)); + VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz); // Cache the underlying object of PointerOps.front() - it is invariant // across the per-V comparisons below and getUnderlyingObject walks // GEP/cast chains. @@ -7937,7 +7914,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( } for (const auto &[SliceStart, LS] : States) { const unsigned SliceVF = std::min(VF, VL.size() - SliceStart); - auto *SubVecTy = cast(getWidenedType(ScalarTy, SliceVF)); + auto *SubVecTy = getWidenedType(ScalarTy, SliceVF); auto *LI0 = cast(VL[SliceStart]); InstructionCost VectorGEPCost = (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) @@ -8542,8 +8519,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, const auto *It = find_if_not(TE.Scalars, isConstant); if (It == TE.Scalars.begin()) return OrdersType(); - auto *Ty = - cast(getWidenedType(TE.Scalars.front()->getType(), Sz)); + auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz); if (It != TE.Scalars.end()) { OrdersType Order(Sz, Sz); unsigned Idx = std::distance(TE.Scalars.begin(), It); @@ -8801,12 +8777,6 @@ void BoUpSLP::reorderTopToBottom() { // Maps a TreeEntry to the reorder indices of external users. DenseMap> ExternalUserReorderMap; - // TODO: Reordering of struct types is not supported. 
- if (any_of(VectorizableTree, [](const std::unique_ptr &TE) { - return TE->State == TreeEntry::Vectorize && - isa(getValueType(TE->Scalars.front())); - })) - return; // Compute IgnoreReorder once - it depends only on UserIgnoreList and // VectorizableTree.front(), which do not change during this loop. const bool IgnoreReorder = @@ -8833,8 +8803,7 @@ void BoUpSLP::reorderTopToBottom() { if (TE->hasState() && TE->isAltShuffle() && TE->State != TreeEntry::SplitVectorize) { Type *ScalarTy = TE->Scalars[0]->getType(); - auto *VecTy = - cast(getWidenedType(ScalarTy, TE->Scalars.size())); + VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); unsigned Opcode1 = TE->getAltOpcode(); SmallBitVector OpcodeMask( @@ -9203,10 +9172,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } if (Users.first) { auto &Data = Users; - // TODO: Reordering of struct types is not supported. - if (Data.first->State == TreeEntry::Vectorize && - isa(getValueType(Data.first->Scalars.front()))) - continue; if (Data.first->State == TreeEntry::SplitVectorize) { assert( Data.second.size() <= 2 && @@ -10007,8 +9972,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( ArrayRef Values(reinterpret_cast(Loads.begin()), Loads.size()); Align Alignment = computeCommonAlignment(Values); - auto *Ty = cast( - getWidenedType(Loads.front()->getType(), Loads.size())); + auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size()); return TTI->isLegalMaskedGather(Ty, Alignment) && !TTI->forceScalarizeMaskedGather(Ty, Alignment); }; @@ -10306,8 +10270,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // Segmented load detected - vectorize at maximum vector factor. 
if (InterleaveFactor <= Slice.size() && TTI.isLegalInterleavedAccessType( - cast( - getWidenedType(Slice.front()->getType(), VF)), + getWidenedType(Slice.front()->getType(), VF), InterleaveFactor, cast(Slice.front())->getAlign(), cast(Slice.front()) @@ -10567,10 +10530,11 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, /// function (if possible) calls. Returns invalid cost for the corresponding /// calls, if they cannot be vectorized/will be scalarized. static std::pair -getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI, - TargetLibraryInfo *TLI, ArrayRef ArgTys) { +getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, + TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + ArrayRef ArgTys) { auto Shape = VFShape::get(CI->getFunctionType(), - ElementCount::getFixed(getNumElements(VecTy)), + ElementCount::getFixed(VecTy->getNumElements()), false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); auto LibCost = InstructionCost::getInvalid(); @@ -10631,68 +10595,6 @@ ArrayRef BoUpSLP::getLoopNest(const Loop *L) { return Res; } -/// Detects an extractvalue bundle that can be widened by vectorizing the -/// underlying struct-returning calls. -/// -/// \p VL is a bundle whose state \p S is Instruction::ExtractValue. The -/// bundle is acceptable for widening into one struct-of-vectors call only -/// when: -/// - every element of \p VL is an ExtractValueInst, -/// - every ExtractValueInst extracts the same struct field (its -/// getIndices() matches the main op's indices), -/// - the aggregate operands form a uniform set of CallInsts (per -/// getSameOpcode) that is not an alt-shuffle and whose return type is -/// a literal struct, and -/// - every user of every such call is itself an ExtractValueInst, so the -/// external-use extraction code can rebuild scalars via extractvalue + -/// extractelement without needing an insertvalue chain. 
-/// -/// On success returns true and fills \p Indices with the common field -/// index path and \p Calls with the per-lane aggregate calls (in VL order), -/// for the caller to feed as the operand of the new tree entry. Otherwise -/// returns false and leaves the output parameters untouched. -static bool checkEVsForVecCalls(ArrayRef VL, - const InstructionsState &S, - const TargetLibraryInfo &TLI, - SmallVectorImpl &Indices, - SmallVectorImpl &Calls) { - assert(S && S.getOpcode() == Instruction::ExtractValue && - "Expected extractvalue instruction state."); - if (!all_of(VL, IsaPred)) - return false; - auto *VL0 = cast(S.getMainOp()); - ArrayRef VL0Indices = VL0->getIndices(); - SmallVector Aggregates; - for (Value *V : VL) { - if (V == VL0) { - Aggregates.push_back(VL0->getAggregateOperand()); - continue; - } - auto *IV = cast(V); - if (IV->getIndices() != VL0Indices) - return false; - Value *Agg = IV->getAggregateOperand(); - Aggregates.push_back(Agg); - } - const InstructionsState AggState = getSameOpcode(Aggregates, TLI); - if (AggState && AggState.getOpcode() == Instruction::Call && - !AggState.isAltShuffle() && - isa(AggState.getMainOp()->getType())) { - // The struct-returning call may have non-bundle users too. The external - // extraction code rebuilds scalars by extractvalue + extractelement, - // which only works when every user of the call is an ExtractValueInst. - // Bail out if any aggregate has a different kind of user. 
- for (Value *Agg : Aggregates) { - if (!all_of(Agg->users(), IsaPred)) - return false; - } - Indices.assign(VL0Indices.begin(), VL0Indices.end()); - Calls.swap(Aggregates); - return true; - } - return false; -} - BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, @@ -10738,11 +10640,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( bool Reuse = canReuseExtract(VL, CurrentOrder); if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; - SmallVector Indices; - SmallVector Calls; - if (ShuffleOrOp == Instruction::ExtractValue && - checkEVsForVecCalls(VL, S, *TLI, Indices, Calls)) - return TreeEntry::Vectorize; LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); return TreeEntry::NeedToGather; } @@ -11264,12 +11161,6 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, const BoUpSLP &R, bool BuildGatherOnly = true) { - // TODO: Reordering of struct types is not supported. - if (isa(getValueType(VL.front()))) { - LLVM_DEBUG(dbgs() << "SLP: struct type in bundle.\n"); - ReuseShuffleIndices.clear(); - return true; - } // Check that every instruction appears once in this bundle. 
SmallVector UniqueValues; SmallDenseMap UniquePositions(VL.size()); @@ -11381,9 +11272,8 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, if (Val != PoisonMaskElem && UniquePositions.contains(UniqueValues[Val])) DemandedElts.setBit(Idx); Type *ScalarTy = ::getValueType(UniqueValues.front()); - auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); - auto *UniquesVecTy = - cast(getWidenedType(ScalarTy, NumUniqueScalarValues)); + auto *VecTy = getWidenedType(ScalarTy, VL.size()); + auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues); const unsigned NumParts = ::getNumberOfParts(TTI, VecTy, ScalarTy); const unsigned UniquesNumParts = ::getNumberOfParts(TTI, UniquesVecTy, ScalarTy); @@ -11539,7 +11429,7 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, Op2.push_back(V); } Type *ScalarTy = getValueType(VL.front()); - auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); + VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); unsigned Opcode0 = LocalState.getOpcode(); unsigned Opcode1 = LocalState.getAltOpcode(); SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1)); @@ -11574,8 +11464,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, if (!ReorderIndices.empty()) inversePermutation(ReorderIndices, Mask); unsigned NumParts = TTI->getNumberOfParts(VecTy); - auto *Op1VecTy = cast(getWidenedType(ScalarTy, Op1.size())); - auto *Op2VecTy = cast(getWidenedType(ScalarTy, Op2.size())); + VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size()); + VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size()); // Check non-profitable single register ops, which better to be represented // as alternate ops. 
if (NumParts >= VL.size()) @@ -11583,8 +11473,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; InstructionCost InsertCost = ::getShuffleCost( *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy); - auto *SubVecTy = cast( - getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()))); + FixedVectorType *SubVecTy = + getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size())); InstructionCost NewShuffleCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind); if (!LocalState.isCmpOp() && NumParts <= 1 && @@ -11784,16 +11674,7 @@ class InstructionsCompatibilityAnalysis { Handler.getOperands(I).end()); return; } - case Instruction::ExtractValue: { - SmallVector Indices; - SmallVector Calls; - if (checkEVsForVecCalls(VL, S, TLI, Indices, Calls)) { - Operands.assign(1, {}); - Operands[0].swap(Calls); - return; - } - [[fallthrough]]; - } + case Instruction::ExtractValue: case Instruction::ExtractElement: // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. 
@@ -12181,7 +12062,7 @@ class InstructionsCompatibilityAnalysis { } if (S && S.isAltShuffle()) { Type *ScalarTy = S.getMainOp()->getType(); - auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); + VectorType *VecTy = getWidenedType(ScalarTy, VL.size()); unsigned Opcode0 = S.getOpcode(); unsigned Opcode1 = S.getAltOpcode(); SmallBitVector OpcodeMask( @@ -12246,7 +12127,8 @@ class InstructionsCompatibilityAnalysis { constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind); InstructionCost VectorCost; - auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size()); + FixedVectorType *VecTy = + getWidenedType(S.getMainOp()->getType(), VL.size()); switch (MainOpcode) { case Instruction::Add: case Instruction::Sub: @@ -12648,7 +12530,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, // Rough cost estimation, if the vector code (+ potential extracts) is // more profitable than the scalar + buildvector. Type *ScalarTy = VL.front()->getType(); - auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); + auto *VecTy = getWidenedType(ScalarTy, VL.size()); InstructionCost VectorizeCostEstimate = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) + ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted, @@ -12967,12 +12849,6 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, // This is a special case, as it does not gather, but at the same time // we are not extending buildTreeRec() towards the operands. 
TE->setOperands(Operands); - if (ShuffleOrOp == Instruction::ExtractValue) { - SmallVector Indices; - SmallVector Calls; - if (checkEVsForVecCalls(VL, S, *TLI, Indices, Calls)) - buildTreeRec(Operands.front(), Depth + 1, {TE, 0}); - } return; } case Instruction::InsertElement: { @@ -14276,11 +14152,10 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost = 0; auto *ScalarTy = TE.Scalars.front()->getType(); - auto *VecTy = cast(getWidenedType(ScalarTy, TE.Scalars.size())); + auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size()); for (auto [Idx, Sz] : SubVectors) { - Cost += - ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind, - Idx, cast(getWidenedType(ScalarTy, Sz))); + Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind, + Idx, getWidenedType(ScalarTy, Sz)); } Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/true, @@ -14493,10 +14368,8 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order, Stride * LhsTE->getVectorFactor()); FastMathFlags FMF; SmallPtrSet CheckedExtracts; - auto *VecTy = - cast(getWidenedType(ScalarTy, TE.getVectorFactor())); - auto *SrcVecTy = - cast(getWidenedType(SrcScalarTy, LhsTE->getVectorFactor())); + auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor()); + auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()); TTI::CastContextHint CastCtx = getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0)); InstructionCost VecCost = @@ -14627,7 +14500,7 @@ bool BoUpSLP::matchesInversedZExtSelect( if (InversedCmpsIndices.empty()) return false; - Type *VecTy = + VectorType *VecTy = getWidenedType(Cmp->getOperand(0)->getType(), CmpTE->getVectorFactor()); Type *CmpTy = CmpInst::makeCmpResultType(VecTy); @@ -14688,16 +14561,14 @@ bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const { // Check if bitcast is cheaper than select. 
auto *DstTy = IntegerType::getIntNTy(ScalarTy->getContext(), SelectTE.getVectorFactor()); - Type *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor()); + VectorType *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor()); Type *CmpTy = CmpInst::makeCmpResultType(OpTy); - auto *VecTy = - cast(getWidenedType(ScalarTy, SelectTE.getVectorFactor())); + VectorType *VecTy = getWidenedType(ScalarTy, SelectTE.getVectorFactor()); auto It = MinBWs.find(&SelectTE); if (It != MinBWs.end()) { auto *EffectiveScalarTy = IntegerType::get(F->getContext(), It->second.first); - VecTy = cast( - getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor())); + VecTy = getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor()); } TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost BitcastCost = TTI->getCastInstrCost( @@ -15011,8 +14882,7 @@ void BoUpSLP::transformNodes() { if (E.State != TreeEntry::Vectorize) break; Type *ScalarTy = E.getMainOp()->getType(); - auto *VecTy = - cast(getWidenedType(ScalarTy, E.Scalars.size())); + auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size()); Align CommonAlignment = computeCommonAlignment(E.Scalars); // Check if profitable to represent consecutive load + reverse as strided // load with stride -1. @@ -15050,8 +14920,7 @@ void BoUpSLP::transformNodes() { case Instruction::Store: { Type *ScalarTy = cast(E.getMainOp())->getValueOperand()->getType(); - auto *VecTy = - cast(getWidenedType(ScalarTy, E.Scalars.size())); + auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size()); Align CommonAlignment = computeCommonAlignment(E.Scalars); // Check if profitable to represent consecutive load + reverse as strided // load with stride -1. 
@@ -15378,7 +15247,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost getBuildVectorCost(ArrayRef VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, IsaPred)) return TTI::TCC_Free; - auto *VecTy = cast(getWidenedType(ScalarTy, VL.size())); + auto *VecTy = getWidenedType(ScalarTy, VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL); if (!Root && isSplat(VL)) { @@ -15518,34 +15387,32 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask( MaskSlice, std::max(NumElts, MaskSlice.size()))) - Cost += ::getShuffleCost( - TTI, *ShuffleKinds[Part], - cast(getWidenedType(ScalarTy, NumElts)), MaskSlice); + Cost += + ::getShuffleCost(TTI, *ShuffleKinds[Part], + getWidenedType(ScalarTy, NumElts), MaskSlice); continue; } if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { - Cost += ::getShuffleCost( - TTI, *RegShuffleKind, - cast(getWidenedType(ScalarTy, EltsPerVector)), SubMask); + Cost += + ::getShuffleCost(TTI, *RegShuffleKind, + getWidenedType(ScalarTy, EltsPerVector), SubMask); } const unsigned BaseVF = getFullVectorNumberOfElements( *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector)); for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) { assert((Idx + SubVecSize) <= BaseVF && "SK_ExtractSubvector index out of range"); - Cost += ::getShuffleCost( - TTI, TTI::SK_ExtractSubvector, - cast(getWidenedType(ScalarTy, BaseVF)), {}, CostKind, - Idx, cast(getWidenedType(ScalarTy, SubVecSize))); + Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, + getWidenedType(ScalarTy, BaseVF), {}, CostKind, + Idx, getWidenedType(ScalarTy, SubVecSize)); } // Second attempt to check, if just a permute is better estimated than // subvector extract. 
SubMask.assign(NumElts, PoisonMaskElem); copy(MaskSlice, SubMask.begin()); InstructionCost OriginalCost = ::getShuffleCost( - TTI, *ShuffleKinds[Part], - cast(getWidenedType(ScalarTy, NumElts)), SubMask); + TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask); if (OriginalCost < Cost) Cost = OriginalCost; } @@ -16220,10 +16087,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { I1 = I2 + CommonMask.size(); } } - Cost += ::getShuffleCost( - TTI, TTI::SK_PermuteTwoSrc, - cast(getWidenedType(ScalarTy, CommonMask.size())), - SVMask, CostKind); + Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, + getWidenedType(ScalarTy, CommonMask.size()), + SVMask, CostKind); } for (auto [E, Idx] : SubVectors) { Type *EScalarTy = E->Scalars.front()->getType(); @@ -16246,9 +16112,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } Cost += ::getShuffleCost( TTI, TTI::SK_InsertSubvector, - cast(getWidenedType(ScalarTy, CommonMask.size())), {}, - CostKind, Idx, - cast(getWidenedType(ScalarTy, E->getVectorFactor()))); + getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx, + getWidenedType(ScalarTy, E->getVectorFactor())); if (!CommonMask.empty()) { std::iota(std::next(CommonMask.begin(), Idx), std::next(CommonMask.begin(), Idx + E->getVectorFactor()), @@ -16447,7 +16312,7 @@ uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) { InstructionCost BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy, - Type *VecTy, Type *FinalVecTy, + VectorType *VecTy, VectorType *FinalVecTy, TTI::TargetCostKind CostKind) const { InstructionCost SpillsReloads = 0; @@ -16473,7 +16338,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy, PressureByClass[RegClass] += Parts; }; - auto GetEntryVecTy = [&](const TreeEntry *TE) -> std::pair { + auto GetEntryVecTy = + [&](const TreeEntry *TE) -> std::pair { Type *ScalarTy = getValueType(TE->Scalars.front()); auto BWIt = MinBWs.find(TE); if 
(BWIt != MinBWs.end()) { @@ -16625,22 +16491,21 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VectorCost = 0; if (E->ReorderIndices.empty()) { VectorCost = ::getShuffleCost( - *TTI, TTI::SK_InsertSubvector, cast(FinalVecTy), {}, - CostKind, E->CombinedEntriesWithIndices.back().second, - cast(getWidenedType( + *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind, + E->CombinedEntriesWithIndices.back().second, + getWidenedType( ScalarTy, VectorizableTree[E->CombinedEntriesWithIndices.back().first] - ->getVectorFactor()))); + ->getVectorFactor())); } else { unsigned CommonVF = std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first] ->getVectorFactor(), VectorizableTree[E->CombinedEntriesWithIndices.back().first] ->getVectorFactor()); - VectorCost = - ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, - cast(getWidenedType(ScalarTy, CommonVF)), - E->getSplitMask(), CostKind); + VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, + getWidenedType(ScalarTy, CommonVF), + E->getSplitMask(), CostKind); } VectorCost += SpillsReloads; LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree")); @@ -16664,9 +16529,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (!E->ReuseShuffleIndices.empty()) ::addMask(Mask, E->ReuseShuffleIndices); if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) - CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - cast(FinalVecTy), Mask, CostKind, - /*Index=*/0, cast(VecTy)); + CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, + Mask, CostKind, /*Index=*/0, VecTy); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || E->State == TreeEntry::StridedVectorize || @@ -16774,9 +16638,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, "MaskedLoadCompressVectorize here."); InstructionCost ScalarCost = 0; InstructionCost VecCost = 0; - 
std::tie(ScalarCost, VecCost) = - getGEPCosts(*TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, - cast(VecTy)); + std::tie(ScalarCost, VecCost) = getGEPCosts( + *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy); LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, "Calculated GEPs cost for Tree")); @@ -16861,7 +16724,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, NumElts = ATy->getNumElements(); else NumElts = AggregateTy->getStructNumElements(); - SrcVecTy = cast(getWidenedType(OrigScalarTy, NumElts)); + SrcVecTy = getWidenedType(OrigScalarTy, NumElts); } } if (I->hasOneUse()) { @@ -16966,7 +16829,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // need to shift the vector. // Do not calculate the cost if the actual size is the register size and // we can merge this shuffle with the following SK_Select. - auto *InsertVecTy = cast(getWidenedType(ScalarTy, InsertVecSz)); + auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz); if (!IsIdentity) Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc, InsertVecTy, Mask); @@ -16982,7 +16845,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { if (InsertVecSz != VecSz) { - auto *ActualVecTy = cast(getWidenedType(ScalarTy, VecSz)); + auto *ActualVecTy = getWidenedType(ScalarTy, VecSz); Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {}, CostKind, OffsetBeg - Offset, InsertVecTy); } else { @@ -17165,10 +17028,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // (e.g. condition is while result is ). For // compares, the result type IS the mask (i1/vNi1). Construct the // right type so getCmpSelInstrCost sees the actual mask/result width. - auto *MaskTy = cast(getWidenedType( - ShuffleOrOp == Instruction::Select ? 
VL0->getOperand(0)->getType() - : VL0->getType(), - VL.size())); + auto *MaskTy = getWidenedType(ShuffleOrOp == Instruction::Select + ? VL0->getOperand(0)->getType() + : VL0->getType(), + VL.size()); InstructionCost VecCost = InstructionCost::getInvalid(); if (ShuffleOrOp == Instruction::Select) { @@ -17620,7 +17483,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto *CI = cast(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); SmallVector ArgTys = buildIntrinsicArgTypes( - CI, ID, getNumElements(VecTy), + CI, ID, VecTy->getNumElements(), It != MinBWs.end() ? It->second.first : 0, TTI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; @@ -17733,7 +17596,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, }, Mask); VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, - cast(FinalVecTy), Mask, CostKind); + FinalVecTy, Mask, CostKind); // Patterns like [fadd,fsub] can be combined into a single instruction // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we // need to take into account their order when looking for the most used @@ -17744,10 +17607,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1)); // If this pattern is supported by the target then we consider the // order. - if (TTIRef.isLegalAltInstr(cast(VecTy), Opcode0, Opcode1, - OpcodeMask)) { + if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { InstructionCost AltVecCost = TTIRef.getAltInstrCost( - cast(VecTy), Opcode0, Opcode1, OpcodeMask, CostKind); + VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); return AltVecCost < VecCost ? AltVecCost : VecCost; } // TODO: Check the reverse order too. 
@@ -17783,8 +17645,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return true; })) return ::getShuffleCost( - *TTI, TargetTransformInfo::SK_PermuteSingleSrc, - cast(VecTy), + *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy, calculateShufflevectorMask(E->Scalars)); } return TTI::TCC_Free; @@ -17863,18 +17724,6 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return true; } - // FIXME: support buildvector of the gather nodes with struct types. - if (any_of(VectorizableTree, [&](const std::unique_ptr &TE) { - return TE->isGather() && TE->hasState() && - TE->getOpcode() == Instruction::Call && - isa(TE->getMainOp()->getType()); - })) { - LLVM_DEBUG( - dbgs() << "SLP: rejecting tree with buildvector struct values of size " - << VectorizableTree.size() << ".\n"); - return true; - } - // Cache values from the root node and the cost-threshold options to avoid // re-querying them inside hot predicates below. const unsigned TreeSize = VectorizableTree.size(); @@ -18181,8 +18030,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { if (BackVF > 2 && allSameBlock(Back.Scalars) && !Back.Scalars.front()->getType()->isVectorTy() && TTI->getScalarizationOverhead( - cast( - getWidenedType(Back.Scalars.front()->getType(), BackVF)), + getWidenedType(Back.Scalars.front()->getType(), BackVF), APInt::getAllOnes(BackVF), /*Insert=*/true, /*Extract=*/false, TTI::TCK_RecipThroughput) > -SLPCostThreshold) @@ -18873,8 +18721,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( GatheredLoadsNodes.insert(&TE); if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize && !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement || - TE.getOpcode() == Instruction::Store)) && - !isa(getValueType(TE.Scalars.front()))) { + TE.getOpcode() == Instruction::Store))) { // Calculate costs of external uses. 
APInt DemandedElts = APInt::getZero(TE.getVectorFactor()); for (Value *V : TE.Scalars) { @@ -18887,10 +18734,9 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( if (It != MinBWs.end()) ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor()); - InstructionCost ExtCost = - ::getScalarizationOverhead(*TTI, ScalarTy, cast(VecTy), - DemandedElts, /*Insert=*/false, - /*Extract=*/true, CostKind); + InstructionCost ExtCost = ::getScalarizationOverhead( + *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/false, + /*Extract=*/true, CostKind); if (ExtCost.isValid() && ExtCost != 0) { if (!Scale) Scale = getScaleToLoopIterations(TE); @@ -18988,7 +18834,6 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) { TreeEntry *TE = Worklist.top().first; if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) || - isa(getValueType(TE->Scalars.front())) || // Exit early if the parent node is split node and any of scalars is // used in other split nodes. 
(TE->UserTreeIndex && @@ -19050,7 +18895,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, EntryVF); InstructionCost GatherCost = ::getScalarizationOverhead( - *TTI, ScalarTy, cast(VecTy), DemandedElts, + *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind); SmallVector Mask; if (!TE->ReorderIndices.empty() && @@ -19070,8 +18915,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( if (!TE->ReuseShuffleIndices.empty()) ::addMask(Mask, TE->ReuseShuffleIndices); if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF)) - GatherCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - cast(VecTy), Mask); + GatherCost += + ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask); // If all scalars are reused in gather node(s) or other vector nodes, there // might be extra cost for inserting them. if ((!TE->hasState() || !TE->isAltShuffle()) && @@ -19167,7 +19012,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor()); InstructionCost ExtractsCost = ::getScalarizationOverhead( - *TTI, ScalarTy, cast(VecTy), DemandedElts, + *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); InstructionCost BVCost = 0; for (const auto &[BVE, Values] : ValuesToInsert) { @@ -19181,7 +19026,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( } auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor()); BVCost += ::getScalarizationOverhead( - *TTI, ScalarTy, cast(BVVecTy), BVDemandedElts, + *TTI, ScalarTy, BVVecTy, BVDemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind, BVDemandedElts.isAllOnes(), BVValues); } @@ -19511,20 +19356,14 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, 
? Instruction::ZExt : Instruction::SExt; VecTy = getWidenedType(MinTy, BundleWidth); - ExtraCost = getExtractWithExtendCost(*TTI, Extend, ScalarTy, - cast(VecTy), EU.Lane); + ExtraCost = + getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane); LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: " << ExtraCost << "\n"); } else { - Type *ExtractTy = VecTy; - if (auto *ST = dyn_cast(VecTy)) { - assert(EU.User && "Expected user for struct extract"); - const auto *EV = cast(EU.User); - ExtractTy = ExtractValueInst::getIndexedType(ST, EV->getIndices()); - } - ExtraCost = getVectorInstrCost( - *TTI, ScalarTy, Instruction::ExtractElement, ExtractTy, CostKind, - EU.Lane, EU.Scalar, ScalarUserAndIdx); + ExtraCost = + getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy, + CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx); LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from " << *VecTy << ": " << ExtraCost << "\n"); } @@ -19628,11 +19467,6 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, if (KeepScalar) { ExternalUsesAsOriginalScalar.insert(EU.Scalar); for (Value *V : Inst->operands()) { - // Struct operands cannot be rebuilt by the !User extraction - // path (it has no insertvalue chain), so leave their existing - // ExtractValueInst user in place. - if (isa(V->getType())) - continue; auto It = ValueToExtUses->find(V); if (It != ValueToExtUses->end()) { // Replace all uses to avoid compiler crash. @@ -19648,8 +19482,6 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, // compiler crash. if (auto *IOp = dyn_cast(Inst->getOperand(0))) { for (Value *V : IOp->operands()) { - if (isa(V->getType())) - continue; auto It = ValueToExtUses->find(V); if (It != ValueToExtUses->end()) { // Replace all uses to avoid compiler crash. 
@@ -19772,10 +19604,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, SmallVector OrigMask(VecVF, PoisonMaskElem); std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), OrigMask.begin()); - C = ::getShuffleCost( - *TTI, TTI::SK_PermuteSingleSrc, - cast(getWidenedType(TE->getMainOp()->getType(), VecVF)), - OrigMask); + C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), + OrigMask); LLVM_DEBUG( dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users.\n"; @@ -19791,10 +19622,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, ResizeMask[Mask[I]] = Mask[I]; } if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF)) - C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - cast(getWidenedType( - TE->getMainOp()->getType(), VecVF)), - ResizeMask); + C = ::getShuffleCost( + *TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask); LLVM_DEBUG( dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users.\n"; @@ -19824,8 +19654,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, (Data.index() < VF && static_cast(Data.index()) == Data.value()); })) { - InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - cast(FTy), Mask); + InstructionCost C = + ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask); C = ScaleCost(C, *TEs.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement " @@ -19843,8 +19673,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, VF = Mask.size(); } auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); - InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, - cast(FTy), Mask); + InstructionCost C = + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); C = ScaleCost(C, *TEs.back()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << 
C << " for final shuffle of vector node and external " @@ -20736,10 +20566,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto *VecTy = - cast(getWidenedType(VL.front()->getType(), NewVF)); - auto *MaskVecTy = - cast(getWidenedType(VL.front()->getType(), SubMask.size())); + auto *VecTy = getWidenedType(VL.front()->getType(), NewVF); + auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size()); auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef Mask, ArrayRef Entries, @@ -20947,16 +20775,16 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, any_of(VL, [](Value *V) { return !isa(V) && isConstant(V); }); // 1. Shuffle input source vector and constant vector. if (!ForPoisonSrc && IsAnyNonUndefConst) { - Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, - cast(VecTy), ConstantShuffleMask); + Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy, + ConstantShuffleMask); } // 2. Insert unique non-constants. 
if (!DemandedElements.isZero()) - Cost += getScalarizationOverhead( - *TTI, ScalarTy, cast(VecTy), DemandedElements, - /*Insert=*/true, - /*Extract=*/false, CostKind, ForPoisonSrc && !IsAnyNonUndefConst, VL); + Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements, + /*Insert=*/true, + /*Extract=*/false, CostKind, + ForPoisonSrc && !IsAnyNonUndefConst, VL); return Cost; } @@ -22479,9 +22307,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, for (auto [Idx, I] : enumerate(BVMask)) if (I != PoisonMaskElem) NewMask[Idx] = Mask.size(); - SplatCost += - ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, - cast(VecTy), NewMask, CostKind); + SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, + NewMask, CostKind); InstructionCost BVCost = TTI->getVectorInstrCost( Instruction::InsertElement, VecTy, CostKind, *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V); @@ -22493,8 +22320,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, if (I != PoisonMaskElem) NewMask[Idx] = I; BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - cast(VecTy), NewMask, - CostKind); + VecTy, NewMask, CostKind); } return SplatCost <= BVCost; }; @@ -22691,14 +22517,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool IsReverseOrder = !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices); auto FinalShuffle = [&](Value *V, const TreeEntry *E) { - if (isa(ScalarTy)) { - // TODO: Reordering of struct types is not supported. 
- assert(E->ReorderIndices.empty() && - "Expected no reordering for struct types."); - assert(E->ReuseShuffleIndices.empty() && - "Expected no reuse shuffle indices for struct types."); - return V; - } ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); if (E->getOpcode() == Instruction::Store && E->State == TreeEntry::Vectorize) { @@ -22830,19 +22648,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - SmallVector Indices; - SmallVector Calls; - if (checkEVsForVecCalls(E->Scalars, E->getOperations(), *TLI, Indices, - Calls)) { - setInsertPointAfterBundle(E); - Value *V = vectorizeOperand(E, 0); - V = Builder.CreateExtractValue(V, Indices); - if (auto *I = dyn_cast(V)) - V = ::propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, E); - E->VectorizedValue = V; - return V; - } auto *LI = cast(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); Value *Ptr = LI->getPointerOperand(); @@ -23376,7 +23181,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Builder.getInt64(I % ScalarTyNumElements); }); VecPtr = Builder.CreateGEP( - toScalarizedTy(VecTy), + VecTy->getElementType(), Builder.CreateShuffleVector( VecPtr, createReplicatedMask(ScalarTyNumElements, VF)), ConstantVector::get(Indices)); @@ -23435,8 +23240,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::experimental_vp_strided_store, {VecTy, Ptr->getType(), StrideTy}, {VecValue, Ptr, StrideVal, - Builder.getAllOnesMask( - ElementCount::getFixed(getNumElements(VecTy))), + Builder.getAllOnesMask(VecTy->getElementCount()), Builder.getInt32(E->Scalars.size())}); Inst->addParamAttr( /*ArgNo=*/1, @@ -23486,7 +23290,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); SmallVector ArgTys = buildIntrinsicArgTypes( - CI, ID, getNumElements(VecTy), + CI, ID, VecTy->getNumElements(), It != MinBWs.end() ? 
It->second.first : 0, TTI); auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); bool UseIntrinsic = ID != Intrinsic::not_intrinsic && @@ -23496,13 +23300,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector OpVecs; SmallVector TysForDecl; // Add return type if intrinsic is overloaded on it. - if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) { - ArrayRef ContainedTys = getContainedTypes(VecTy); - for (auto [Idx, Ty] : enumerate(ContainedTys)) { - if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, Idx, TTI)) - TysForDecl.push_back(Ty); - } - } + if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) + TysForDecl.push_back(VecTy); auto *CEI = cast(VL0); for (unsigned I : seq(0, CI->arg_size())) { // Some intrinsics have scalar arguments. This argument should not be @@ -23526,7 +23325,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ScalarArg->getType()->getScalarType() && It == MinBWs.end()) { auto *CastTy = - getWidenedType(ScalarArg->getType(), getNumElements(VecTy)); + getWidenedType(ScalarArg->getType(), VecTy->getNumElements()); OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I)); } else if (It != MinBWs.end()) { OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I)); @@ -23541,7 +23340,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!UseIntrinsic) { VFShape Shape = VFShape::get(CI->getFunctionType(), - ElementCount::getFixed(getNumElements(VecTy)), + ElementCount::getFixed(VecTy->getNumElements()), false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { @@ -24008,21 +23807,10 @@ Value *BoUpSLP::vectorizeTree( SmallVector> ShuffledInserts; // Maps vector instruction to original insertelement instruction DenseMap VectorToInsertElement; - // Maps extract Scalar (plus the struct-field index path, when extracting - // from a struct of vectors) to the corresponding extractelement instruction - // in the basic block. 
Only one extractelement per block should be emitted. - // The index path is stored in an owning SmallVector so the key remains - // valid after the per-lane ExtractValueInst (whose Indices buffer it was - // copied from) is erased later in this loop. - SmallDenseMap>, - DenseMap>> + // Maps extract Scalar to the corresponding extractelement instruction in the + // basic block. Only one extractelement per block should be emitted. + DenseMap>> ScalarToEEs; - // Maps (struct-of-vectors Vec, field-index path) to the corresponding - // per-block extractvalue, so different external lanes that need the same - // struct field of the same vectorized call share a single extractvalue. - SmallDenseMap>, - DenseMap> - StructFieldExtracts; SmallDenseSet UsedInserts; DenseMap, Value *> VectorCasts; SmallDenseSet ScalarsWithNullptrUser; @@ -24054,18 +23842,7 @@ Value *BoUpSLP::vectorizeTree( Value *ExV = nullptr; auto *Inst = dyn_cast(Scalar); bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst); - // For struct-typed scalars, the User must be an ExtractValueInst that - // describes which struct field is being extracted. Copy its indices - // into an owning SmallVector so the cache key survives erasure of the - // ExtractValueInst. - SmallVector Indices; - if (isa(Scalar->getType())) { - assert(User && "User expected for StructType extract."); - const auto *EV = cast(User); - Indices.assign(EV->getIndices()); - } - auto Key = std::make_pair(Scalar, Indices); - auto It = ScalarToEEs.find(Key); + auto It = ScalarToEEs.find(Scalar); if (It != ScalarToEEs.end()) { // No need to emit many extracts, just move the only one in the // current block. 
@@ -24123,53 +23900,19 @@ Value *BoUpSLP::vectorizeTree( Ex = createExtractVector(Builder, Vec, VecTyNumElements, ExternalUse.Lane * VecTyNumElements); } else { - if (isa(Vec->getType())) { - assert(isa(Scalar->getType()) && - "Vec is struct of vectors only when Scalar is struct."); - auto FieldKey = std::make_pair(Vec, Indices); - BasicBlock *EVBB = Builder.GetInsertBlock(); - Value *FieldVec = nullptr; - auto FieldIt = StructFieldExtracts.find(FieldKey); - if (FieldIt != StructFieldExtracts.end()) { - auto BBIt = FieldIt->second.find(EVBB); - if (BBIt != FieldIt->second.end()) - FieldVec = BBIt->second; - } - if (!FieldVec) { - FieldVec = Builder.CreateExtractValue(Vec, Indices); - StructFieldExtracts[FieldKey][EVBB] = FieldVec; - } else if (auto *FieldI = dyn_cast(FieldVec); - FieldI && Builder.GetInsertPoint() != EVBB->end() && - Builder.GetInsertPoint()->comesBefore(FieldI)) { - // Cached extractvalue is below the current insertion point; - // move it up so the extractelement we are about to emit can - // use it. - FieldI->moveBefore(*EVBB, Builder.GetInsertPoint()); - } - Vec = FieldVec; - } - if (SLPReVec && isVectorizedTy(Scalar->getType())) { - unsigned VecTyNumElements = getNumElements(Scalar->getType()); - // When REVEC is enabled, we need to extract a vector. - // Note: The element size of Scalar may be different from the - // element size of Vec. - Ex = createExtractVector(Builder, Vec, VecTyNumElements, - ExternalUse.Lane * VecTyNumElements); - } else { - Ex = Builder.CreateExtractElement(Vec, Lane); - } + Ex = Builder.CreateExtractElement(Vec, Lane); } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. ExV = Ex; - if (!isa(Scalar->getType()) && - Scalar->getType() != Ex->getType()) + if (Scalar->getType() != Ex->getType()) ExV = Builder.CreateIntCast( Ex, Scalar->getType(), !isKnownNonNegative(Scalar, SimplifyQuery(*DL))); auto *I = dyn_cast(Ex); - ScalarToEEs[Key].try_emplace(I ? 
I->getParent() : &F->getEntryBlock(), - std::make_pair(Ex, ExV)); + ScalarToEEs[Scalar].try_emplace(I ? I->getParent() + : &F->getEntryBlock(), + std::make_pair(Ex, ExV)); } // The then branch of the previous if may produce constants, since 0 // operand might be a constant. @@ -24336,13 +24079,7 @@ Value *BoUpSLP::vectorizeTree( } else { Builder.SetInsertPoint(cast(User)); Value *NewInst = ExtractAndExtendIfNeeded(Vec); - if (isa(Scalar->getType()) && - isa_and_nonnull(User)) { - User->replaceAllUsesWith(NewInst); - eraseInstruction(cast(User)); - } else { - User->replaceUsesOfWith(Scalar, NewInst); - } + User->replaceUsesOfWith(Scalar, NewInst); } } else { Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); @@ -27908,7 +27645,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). auto *VecTy = getWidenedType(ScalarTy, VF); - if (getNumberOfParts(*TTI, VecTy, ScalarTy) == VF) + if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { unsigned ActualVF = std::min(MaxInst - I, VF); @@ -28986,15 +28723,14 @@ class HorizontalReduction { Type *ScalarTy = Candidates.front()->getType(); ReduxWidth = getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth); - VectorType *Tp = cast(getWidenedType(ScalarTy, ReduxWidth)); + VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy); NumRegs = TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); while (NumParts > NumRegs) { assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0."); ReduxWidth = bit_floor(ReduxWidth - 1); - VectorType *Tp = - cast(getWidenedType(ScalarTy, ReduxWidth)); + VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy); NumRegs = TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); @@ -29602,16 +29338,14 @@ 
class HorizontalReduction { } else { VectorCost = TTI->getExtendedReductionCost( RdxOpcode, !IsSigned, RedTy, - cast(getWidenedType(RType, ReduxWidth)), FMF, - CostKind); + getWidenedType(RType, ReduxWidth), FMF, CostKind); } } } else { Type *RedTy = VectorTy->getElementType(); auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); - VectorType *RVecTy = - cast(getWidenedType(RType, ReduxWidth)); + VectorType *RVecTy = getWidenedType(RType, ReduxWidth); InstructionCost FMACost = InstructionCost::getInvalid(); if (RdxKind == RecurKind::FAdd) { // Check if the reduction operands can be converted to FMA. @@ -29683,8 +29417,7 @@ class HorizontalReduction { Type *RedTy = VectorTy->getElementType(); auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); - VectorType *RVecTy = - cast(getWidenedType(RType, ReduxWidth)); + VectorType *RVecTy = getWidenedType(RType, ReduxWidth); IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF); VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind); if (RType != RedTy) { @@ -30448,7 +30181,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!HorRdx.matchReductionForOperands()) return false; // Check the cost of operations. 
- auto *VecTy = cast(getWidenedType(Ty, Ops.size())); + VectorType *VecTy = getWidenedType(Ty, Ops.size()); constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost ScalarCost = TTI.getScalarizationOverhead( @@ -30848,17 +30581,6 @@ static bool isNonVectorizableInst(const Instruction *I, } if (isa(I)) return true; - if (const auto *EV = dyn_cast(I)) { - const auto *Arg = EV->getAggregateOperand(); - if (const auto *CI = dyn_cast(Arg)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (isTriviallyVectorizable(ID)) - return true; - if (!VFDatabase::getMappings(*CI).empty()) - return true; - return false; - } - } if (const auto *RI = dyn_cast(I)) return RI->getNumOperands() > 0 && (SLPReVec || !I->getOperand(0)->getType()->isVectorTy()) && @@ -30887,10 +30609,6 @@ static void forEachOperandChainCandidate(Instruction *I, Func F, F(AI->getNewValOperand(), 1); return; } - if (auto *EV = dyn_cast(I)) { - F(EV->getAggregateOperand(), 0); - return; - } if (ForReduction && !NonVectReductions) return; if (auto *SI = dyn_cast(I)) { @@ -31047,7 +30765,7 @@ bool SLPVectorizerPass::vectorizeNonVectorizableInsts( auto *OpI = dyn_cast(Op); if (!OpI || OpI->getParent() != BB || R.isDeleted(OpI) || isa(OpI) || - (!isValidElementType(OpI->getType()) && !isa(OpI))) + !isValidElementType(OpI->getType())) return; if (!Seen.insert(OpI).second) return; @@ -31514,8 +31232,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (auto *ValTy = dyn_cast( PostProcessStores.front()->getValueOperand()->getType())) ScalarTy = ::getWidenedType(ScalarTy, getNumElements(ValTy)); - auto *VecTy = cast( - ::getWidenedType(ScalarTy, PostProcessStores.size())); + auto *VecTy = ::getWidenedType(ScalarTy, PostProcessStores.size()); InstructionCost ExtractsCost = ::getScalarizationOverhead( *TTI, ScalarTy, VecTy, APInt::getAllOnes(PostProcessStores.size()), /*Insert=*/false, /*Extract=*/true, TTI::TCK_RecipThroughput, diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll index 5d3dd1661fb8f..8d7dd9b9621c8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s @a64 = common global [8 x i64] 
zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,146 +26,48 @@ declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.sadd.with.overflow.i8 (i8 , i8 ) define void @add_v8i64() { -; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: 
[[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load 
i64, ptr @a64, align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[C4:%.*]] = 
call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v8i64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; 
AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v8i64( -; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; KNL-NEXT: store 
<8 x i64> [[TMP18]], ptr @c64, align 8 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v8i64( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v8i64( +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), 
align 8 +; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i64, 
i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 +; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; CHECK-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -211,106 +113,88 @@ define void @add_v8i64() { } define void @add_v16i32() { -; SSE-LABEL: @add_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; 
SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = 
extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, 
<8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v16i32( -; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x 
i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v16i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v16i32( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v16i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: 
[[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr 
inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] 
= call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr 
inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 +; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; CHECK-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -396,106 +280,168 @@ define void @add_v16i32() { } define void @add_v32i16() { -; SSE-LABEL: @add_v32i16( -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; 
SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v32i16( -; SLM-NEXT: 
[[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds 
([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v32i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <16 x i16> 
[[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v32i16( -; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v32i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v32i16( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store 
<16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v32i16( +; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[B3:%.*]] 
= load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; 
CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A0]], i16 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A1]], i16 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A2]], i16 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A3]], i16 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A4]], i16 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A5]], i16 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A6]], i16 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } 
@llvm.sadd.with.overflow.i16(i16 [[A7]], i16 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A8]], i16 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A9]], i16 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A10]], i16 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A11]], i16 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A12]], i16 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A13]], i16 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A14]], i16 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A15]], i16 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A16]], i16 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A17]], i16 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A18]], i16 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A19]], i16 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A20]], i16 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A21]], i16 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A22]], i16 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A23]], i16 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A24]], i16 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A25]], i16 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A26]], i16 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call 
{ i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A27]], i16 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A28]], i16 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A29]], i16 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A30]], i16 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A31]], i16 [[B31]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } 
[[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 +; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 +; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 +; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 +; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 +; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 +; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 +; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 +; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 +; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 +; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 +; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 +; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 +; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 +; CHECK-NEXT: store i16 
[[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 +; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 +; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 +; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 +; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 +; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 +; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 +; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 +; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 +; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 +; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 +; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 +; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 +; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 +; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 +; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 +; CHECK-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x 
i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -661,106 +607,328 @@ define void @add_v32i16() { } define void @add_v64i8() { -; SSE-LABEL: @add_v64i8( -; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call { 
<16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v64i8( -; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v64i8( -; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v64i8( -; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v64i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v64i8( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 
1 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v64i8( +; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 +; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 +; CHECK-NEXT: 
[[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[A33:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[A51:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 +; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 +; CHECK-NEXT: 
[[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[B24:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @b8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A0]], i8 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A1]], i8 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A2]], i8 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A3]], i8 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A4]], i8 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A5]], i8 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A6]], i8 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A7]], i8 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A8]], i8 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A9]], i8 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A10]], i8 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A11]], i8 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A12]], i8 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A13]], i8 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A14]], i8 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A15]], i8 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A16]], i8 [[B16]]) +; 
CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A17]], i8 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A18]], i8 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A19]], i8 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A20]], i8 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A21]], i8 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A22]], i8 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A23]], i8 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A24]], i8 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A25]], i8 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A26]], i8 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A27]], i8 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A28]], i8 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A29]], i8 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A30]], i8 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A31]], i8 [[B31]]) +; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A32]], i8 [[B32]]) +; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A33]], i8 [[B33]]) +; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A34]], i8 [[B34]]) +; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A35]], i8 [[B35]]) +; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A36]], i8 [[B36]]) +; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } 
@llvm.sadd.with.overflow.i8(i8 [[A37]], i8 [[B37]]) +; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A38]], i8 [[B38]]) +; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A39]], i8 [[B39]]) +; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A40]], i8 [[B40]]) +; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A41]], i8 [[B41]]) +; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A42]], i8 [[B42]]) +; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A43]], i8 [[B43]]) +; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A44]], i8 [[B44]]) +; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A45]], i8 [[B45]]) +; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A46]], i8 [[B46]]) +; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A47]], i8 [[B47]]) +; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A48]], i8 [[B48]]) +; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A49]], i8 [[B49]]) +; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A50]], i8 [[B50]]) +; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A51]], i8 [[B51]]) +; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A52]], i8 [[B52]]) +; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A53]], i8 [[B53]]) +; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A54]], i8 [[B54]]) +; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A55]], i8 [[B55]]) +; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A56]], i8 [[B56]]) +; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A57]], i8 [[B57]]) +; 
CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A58]], i8 [[B58]]) +; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A59]], i8 [[B59]]) +; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A60]], i8 [[B60]]) +; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A61]], i8 [[B61]]) +; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A62]], i8 [[B62]]) +; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A63]], i8 [[B63]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = 
extractvalue { i8, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 +; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 +; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 +; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 +; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 +; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 +; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 +; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 +; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 +; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 +; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 +; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 +; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 +; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 +; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 +; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 +; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 +; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 +; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 +; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 +; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 +; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 +; CHECK-NEXT: [[R53:%.*]] = extractvalue { 
i8, i1 } [[C53]], 0 +; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 +; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 +; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 +; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 +; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 +; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 +; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 +; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 +; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 +; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 +; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 +; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 +; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 +; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 +; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 +; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 +; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 +; CHECK-NEXT: store i8 [[R13]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 +; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 +; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 +; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 +; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 +; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 +; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 +; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 +; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 +; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 +; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 +; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 +; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 +; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 +; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 +; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 +; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 +; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 +; CHECK-NEXT: store i8 [[R32]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 +; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 +; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 +; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 +; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 +; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 +; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 +; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 +; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 +; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 +; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 +; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 +; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 +; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 +; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 +; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 +; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 +; CHECK-NEXT: store i8 [[R51]], ptr 
getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 +; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 +; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 +; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 +; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 +; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 +; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 +; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 +; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 +; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 +; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 +; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 +; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 +; CHECK-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1084,5 +1252,3 @@ define void @add_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll index f5d2212cbe584..fc67cec60f177 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit 
-passes=slp-vectorizer -S | FileCheck %s @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,146 +26,48 @@ declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.uadd.with.overflow.i8 (i8 , i8 ) define void @add_v8i64() { -; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[C0:%.*]] = call { 
i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; 
SSE-NEXT: ret void -; -; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } 
@llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v8i64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) 
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v8i64( -; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; KNL-NEXT: 
[[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v8i64( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v8i64( +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; CHECK-NEXT: 
[[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } 
@llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 +; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; CHECK-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -211,106 +113,88 @@ define void @add_v8i64() { } define void @add_v16i32() { -; SSE-LABEL: @add_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x 
i32>, <4 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } 
@llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; 
AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v16i32( -; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; 
KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v16i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v16i32( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v16i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: 
[[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr 
inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; 
CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, 
i1 } [[C15]], 0 +; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 +; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; CHECK-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -396,106 +280,168 @@ define void @add_v16i32() { } define void @add_v32i16() { -; SSE-LABEL: @add_v32i16( -; SSE-NEXT: 
[[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds 
([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v32i16( -; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = 
extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v32i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> 
[[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v32i16( -; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v32i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v32i16( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> 
[[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v32i16( +; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), 
align 2 +; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; CHECK-NEXT: 
[[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), 
align 2 +; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A0]], i16 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A1]], i16 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A2]], i16 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A3]], i16 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A4]], i16 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A5]], i16 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } 
@llvm.uadd.with.overflow.i16(i16 [[A6]], i16 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A7]], i16 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A8]], i16 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A9]], i16 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A10]], i16 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A11]], i16 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A12]], i16 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A13]], i16 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A14]], i16 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A15]], i16 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A16]], i16 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A17]], i16 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A18]], i16 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A19]], i16 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A20]], i16 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A21]], i16 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A22]], i16 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A23]], i16 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A24]], i16 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A25]], i16 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { 
i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A26]], i16 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A27]], i16 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A28]], i16 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A29]], i16 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A30]], i16 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A31]], i16 [[B31]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 +; CHECK-NEXT: 
[[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 +; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 +; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 +; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 +; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 +; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 +; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 +; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 +; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 +; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 +; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 +; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 +; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 +; CHECK-NEXT: store i16 [[R13]], 
ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 +; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 +; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 +; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 +; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 +; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 +; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 +; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 +; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 +; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 +; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 +; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 +; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 +; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 +; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 +; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 +; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 
0, i64 31), align 2 +; CHECK-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -661,106 +607,328 @@ define void @add_v32i16() { } define void @add_v64i8() { -; SSE-LABEL: @add_v64i8( -; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: ret void -; -; SLM-LABEL: @add_v64i8( -; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr 
inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SLM-NEXT: ret void -; -; AVX-LABEL: @add_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @add_v64i8( -; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 
x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @add_v64i8( -; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @add_v64i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @add_v64i8( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; 
AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @add_v64i8( +; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 +; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 +; CHECK-NEXT: 
[[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[A32:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[A50:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 +; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 +; CHECK-NEXT: 
[[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[B23:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @b8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A0]], i8 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A1]], i8 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A2]], i8 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A3]], i8 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A4]], i8 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A5]], i8 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A6]], i8 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A7]], i8 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A8]], i8 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A9]], i8 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A10]], i8 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A11]], i8 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A12]], i8 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A13]], i8 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A14]], i8 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A15]], i8 [[B15]]) 
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A16]], i8 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A17]], i8 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A18]], i8 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A19]], i8 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A20]], i8 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A21]], i8 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A22]], i8 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A23]], i8 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A24]], i8 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A25]], i8 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A26]], i8 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A27]], i8 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A28]], i8 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A29]], i8 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A30]], i8 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A31]], i8 [[B31]]) +; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A32]], i8 [[B32]]) +; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A33]], i8 [[B33]]) +; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A34]], i8 [[B34]]) +; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A35]], i8 [[B35]]) +; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } 
@llvm.uadd.with.overflow.i8(i8 [[A36]], i8 [[B36]]) +; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A37]], i8 [[B37]]) +; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A38]], i8 [[B38]]) +; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A39]], i8 [[B39]]) +; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A40]], i8 [[B40]]) +; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A41]], i8 [[B41]]) +; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A42]], i8 [[B42]]) +; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A43]], i8 [[B43]]) +; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A44]], i8 [[B44]]) +; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A45]], i8 [[B45]]) +; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A46]], i8 [[B46]]) +; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A47]], i8 [[B47]]) +; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A48]], i8 [[B48]]) +; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A49]], i8 [[B49]]) +; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A50]], i8 [[B50]]) +; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A51]], i8 [[B51]]) +; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A52]], i8 [[B52]]) +; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A53]], i8 [[B53]]) +; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A54]], i8 [[B54]]) +; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A55]], i8 [[B55]]) +; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A56]], i8 [[B56]]) +; 
CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A57]], i8 [[B57]]) +; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A58]], i8 [[B58]]) +; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A59]], i8 [[B59]]) +; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A60]], i8 [[B60]]) +; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A61]], i8 [[B61]]) +; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A62]], i8 [[B62]]) +; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A63]], i8 [[B63]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 +; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 +; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 +; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 +; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 +; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 +; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 +; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 +; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 +; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 +; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 +; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 +; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 +; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 +; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 +; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 +; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 +; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 +; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 +; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 +; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 +; CHECK-NEXT: 
[[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 +; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 +; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 +; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 +; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 +; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 +; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 +; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 +; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 +; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 +; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 +; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 +; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 +; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 +; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 +; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 +; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 +; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 +; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds 
([64 x i8], ptr @c8, i32 0, i64 12), align 1 +; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 +; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 +; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 +; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 +; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 +; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 +; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 +; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 +; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 +; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 +; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 +; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 +; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 +; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 +; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 +; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 +; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 +; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 
x i8], ptr @c8, i32 0, i64 31), align 1 +; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 +; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 +; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 +; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 +; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 +; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 +; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 +; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 +; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 +; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 +; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 +; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 +; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 +; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 +; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 +; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 +; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x 
i8], ptr @c8, i32 0, i64 50), align 1 +; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 +; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 +; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 +; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 +; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 +; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 +; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 +; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 +; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 +; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 +; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 +; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 +; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 +; CHECK-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1084,5 +1252,3 @@ define void @add_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll index c7470f28d1c7b..72a3ddd0bb747 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit 
-passes=slp-vectorizer -S | FileCheck %s @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,126 +26,48 @@ declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.smul.with.overflow.i8 (i8 , i8 ) define void @mul_v8i64() { -; SSE-LABEL: @mul_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[C0:%.*]] = call { 
i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; 
SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8 -; SLM-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <2 x i64> [[TMP4]], ptr @c64, align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[TMP7:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SLM-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[TMP11:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SLM-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[TMP15:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP15]], 0 -; SLM-NEXT: store 
<2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v8i64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <4 x i64> 
[[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v8i64( -; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <8 x i64> [[TMP4]], ptr @c64, align 8 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v8i64( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 
x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v8i64( +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } 
@llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 +; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; CHECK-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -191,226 
+113,88 @@ define void @mul_v8i64() { } define void @mul_v16i32() { -; SSE-LABEL: @mul_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, ptr 
getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; SSE-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; SSE-NEXT: 
[[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; SSE-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; SSE-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; SSE-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; SSE-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; SSE-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; SSE-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; SSE-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; SSE-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; SSE-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; SSE-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; SSE-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; SSE-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; SSE-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; SSE-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; SSE-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; SSE-NEXT: 
[[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; SSE-NEXT: store i32 [[R0]], ptr @c32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v16i32( -; SLM-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SLM-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SLM-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 -; 
SLM-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 -; SLM-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 -; SLM-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 -; SLM-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 -; SLM-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 -; SLM-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 -; SLM-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 -; SLM-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 -; SLM-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 -; SLM-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 -; SLM-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SLM-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SLM-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 -; SLM-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 -; SLM-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SLM-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr 
@b32, i32 0, i64 5), align 4 -; SLM-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 -; SLM-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 -; SLM-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SLM-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 -; SLM-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 -; SLM-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 -; SLM-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SLM-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 -; SLM-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 -; SLM-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 -; SLM-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) -; SLM-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) -; SLM-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) -; SLM-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) -; SLM-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) -; SLM-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) -; SLM-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) -; SLM-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) -; SLM-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) -; 
SLM-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) -; SLM-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) -; SLM-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) -; SLM-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) -; SLM-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) -; SLM-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) -; SLM-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) -; SLM-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 -; SLM-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 -; SLM-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 -; SLM-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 -; SLM-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 -; SLM-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 -; SLM-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 -; SLM-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 -; SLM-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 -; SLM-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 -; SLM-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 -; SLM-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 -; SLM-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 -; SLM-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 -; SLM-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 -; SLM-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 -; SLM-NEXT: store i32 [[R0]], ptr @c32, align 4 -; SLM-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 -; SLM-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 -; SLM-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x 
i32], ptr @c32, i32 0, i64 3), align 4 -; SLM-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SLM-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 -; SLM-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 -; SLM-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 -; SLM-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SLM-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 -; SLM-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 -; SLM-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 -; SLM-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SLM-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 -; SLM-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 -; SLM-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP7:%.*]] = 
call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v16i32( -; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v16i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } 
@llvm.smul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v16i32( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v16i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load i32, ptr 
getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr 
@b32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) 
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), 
align 4 +; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; CHECK-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -496,106 +280,168 @@ define void @mul_v16i32() { } define void @mul_v32i16() { -; SSE-LABEL: @mul_v32i16( -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, 
i64 8), align 2 -; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v32i16( -; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 
8), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <16 x i16> [[TMP4]], 
ptr @c16, align 2 -; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v32i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v32i16( -; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; 
KNL-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v32i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v32i16( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v32i16( +; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[A3:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; CHECK-NEXT: 
[[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[B7:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; 
CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A0]], i16 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A1]], i16 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A2]], i16 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A3]], i16 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A4]], i16 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A5]], i16 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A6]], i16 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A7]], i16 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A8]], i16 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A9]], i16 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A10]], i16 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A11]], i16 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = 
call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A12]], i16 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A13]], i16 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A14]], i16 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A15]], i16 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A16]], i16 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A17]], i16 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A18]], i16 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A19]], i16 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A20]], i16 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A21]], i16 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A22]], i16 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A23]], i16 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A24]], i16 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A25]], i16 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A26]], i16 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A27]], i16 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A28]], i16 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A29]], i16 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A30]], i16 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A31]], i16 [[B31]]) +; 
CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 +; 
CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 +; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 +; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 +; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 +; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 +; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 +; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 +; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 +; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 +; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 +; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 +; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 +; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 +; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 +; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 +; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 +; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 +; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x 
i16], ptr @c16, i32 0, i64 18), align 2 +; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 +; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 +; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 +; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 +; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 +; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 +; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 +; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 +; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 +; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 +; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 +; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 +; CHECK-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -761,106 +607,328 @@ define void @mul_v32i16() { } define void @mul_v64i8() { -; SSE-LABEL: @mul_v64i8( -; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> 
[[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v64i8( -; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SLM-NEXT: 
[[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load 
<32 x i8>, ptr @a8, align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v64i8( -; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v64i8( -; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr 
@a8, align 1 -; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v64i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v64i8( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v64i8( +; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 +; CHECK-NEXT: [[A1:%.*]] = 
load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds 
([64 x i8], ptr @a8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr 
@a8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 
55), align 1 +; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 +; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr 
@b8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 
28), align 1 +; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 +; 
CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A0]], i8 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i8, 
i1 } @llvm.smul.with.overflow.i8(i8 [[A1]], i8 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A2]], i8 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A3]], i8 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A4]], i8 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A5]], i8 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A6]], i8 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A7]], i8 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A8]], i8 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A9]], i8 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A10]], i8 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A11]], i8 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A12]], i8 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A13]], i8 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A14]], i8 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A15]], i8 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A16]], i8 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A17]], i8 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A18]], i8 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A19]], i8 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A20]], i8 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A21]], i8 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = 
call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A22]], i8 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A23]], i8 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A24]], i8 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A25]], i8 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A26]], i8 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A27]], i8 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A28]], i8 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A29]], i8 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A30]], i8 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A31]], i8 [[B31]]) +; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A32]], i8 [[B32]]) +; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A33]], i8 [[B33]]) +; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A34]], i8 [[B34]]) +; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A35]], i8 [[B35]]) +; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A36]], i8 [[B36]]) +; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A37]], i8 [[B37]]) +; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A38]], i8 [[B38]]) +; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A39]], i8 [[B39]]) +; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A40]], i8 [[B40]]) +; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A41]], i8 [[B41]]) +; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A42]], i8 
[[B42]]) +; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A43]], i8 [[B43]]) +; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A44]], i8 [[B44]]) +; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A45]], i8 [[B45]]) +; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A46]], i8 [[B46]]) +; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A47]], i8 [[B47]]) +; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A48]], i8 [[B48]]) +; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A49]], i8 [[B49]]) +; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A50]], i8 [[B50]]) +; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A51]], i8 [[B51]]) +; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A52]], i8 [[B52]]) +; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A53]], i8 [[B53]]) +; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A54]], i8 [[B54]]) +; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A55]], i8 [[B55]]) +; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A56]], i8 [[B56]]) +; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A57]], i8 [[B57]]) +; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A58]], i8 [[B58]]) +; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A59]], i8 [[B59]]) +; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A60]], i8 [[B60]]) +; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A61]], i8 [[B61]]) +; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A62]], i8 [[B62]]) +; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } 
@llvm.smul.with.overflow.i8(i8 [[A63]], i8 [[B63]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, 
i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 +; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 +; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 +; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 +; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 +; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 +; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 +; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 +; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 +; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 +; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 +; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 +; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 +; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 +; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 +; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 +; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 +; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 +; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 +; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 +; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 +; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 +; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 +; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 +; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 +; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 +; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 +; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 +; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 +; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 +; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 +; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 +; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 +; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 +; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 +; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 +; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 +; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 +; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 +; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 +; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 +; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 +; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 +; CHECK-NEXT: store i8 [[R18]], ptr getelementptr 
inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 +; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 +; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 +; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 +; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 +; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 +; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 +; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 +; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 +; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 +; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 +; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 +; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 +; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 +; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 +; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 +; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 +; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 +; CHECK-NEXT: store i8 [[R37]], ptr getelementptr 
inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 +; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 +; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 +; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 +; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 +; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 +; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 +; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 +; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 +; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 +; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 +; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 +; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 +; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 +; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 +; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 +; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 +; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 +; CHECK-NEXT: store i8 [[R56]], ptr getelementptr 
inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 +; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 +; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 +; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 +; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 +; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 +; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 +; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 +; CHECK-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1184,5 +1252,3 @@ define void @mul_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll index 4c1d070a569e7..4126f06e8ca81 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit 
-passes=slp-vectorizer -S | FileCheck %s @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,126 +26,48 @@ declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.umul.with.overflow.i8 (i8 , i8 ) define void @mul_v8i64() { -; SSE-LABEL: @mul_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[C0:%.*]] = call { 
i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; 
SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8 -; SLM-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <2 x i64> [[TMP4]], ptr @c64, align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[TMP7:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SLM-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[TMP11:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SLM-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[TMP15:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP15]], 0 -; SLM-NEXT: store 
<2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v8i64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <4 x i64> 
[[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v8i64( -; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v8i64( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds 
([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v8i64( +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } 
@llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 +; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; CHECK-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -191,106 
+113,88 @@ define void @mul_v8i64() { } define void @mul_v16i32() { -; SSE-LABEL: @mul_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue 
{ <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } 
@llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } 
@llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v16i32( -; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v16i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v16i32( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } 
@llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v16i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[A15:%.*]] = load i32, ptr 
getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A1]], i32 [[B1]]) 
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { 
i32, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 +; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; CHECK-NEXT: store i32 [[R15]], ptr 
getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; CHECK-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -376,106 +280,168 @@ define void @mul_v16i32() { } define void @mul_v32i16() { -; SSE-LABEL: @mul_v32i16( -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 
2 -; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v32i16( -; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; 
SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v32i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <16 x i16> 
[[TMP4]], ptr @c16, align 2 -; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v32i16( -; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v32i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v32i16( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, 
align 2 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v32i16( +; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[A12:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; 
CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[B16:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A0]], i16 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A1]], i16 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { 
i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A2]], i16 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A3]], i16 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A4]], i16 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A5]], i16 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A6]], i16 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A7]], i16 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A8]], i16 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A9]], i16 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A10]], i16 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A11]], i16 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A12]], i16 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A13]], i16 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A14]], i16 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A15]], i16 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A16]], i16 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A17]], i16 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A18]], i16 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A19]], i16 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A20]], i16 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A21]], i16 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { 
i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A22]], i16 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A23]], i16 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A24]], i16 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A25]], i16 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A26]], i16 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A27]], i16 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A28]], i16 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A29]], i16 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A30]], i16 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A31]], i16 [[B31]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 +; 
CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 +; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 +; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 +; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 +; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 +; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 +; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 +; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 +; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 +; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds 
([32 x i16], ptr @c16, i32 0, i64 9), align 2 +; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 +; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 +; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 +; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 +; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 +; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 +; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 +; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 +; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 +; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 +; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 +; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 +; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 +; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 +; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 +; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 +; 
CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 +; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 +; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 +; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 +; CHECK-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -641,106 +607,328 @@ define void @mul_v32i16() { } define void @mul_v64i8() { -; SSE-LABEL: @mul_v64i8( -; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } 
@llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: ret void -; -; SLM-LABEL: @mul_v64i8( -; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds 
([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SLM-NEXT: ret void -; -; AVX-LABEL: @mul_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @mul_v64i8( -; AVX2-NEXT: [[TMP1:%.*]] = load 
<32 x i8>, ptr @a8, align 1 -; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @mul_v64i8( -; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @mul_v64i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @mul_v64i8( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @mul_v64i8( +; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 +; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[A10:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @a8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 +; CHECK-NEXT: [[B1:%.*]] = load i8, 
ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @b8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 
0, i64 37), align 1 +; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A0]], i8 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A1]], i8 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A2]], i8 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A3]], i8 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A4]], i8 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A5]], i8 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A6]], i8 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A7]], i8 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A8]], i8 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A9]], i8 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A10]], i8 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 
[[A11]], i8 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A12]], i8 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A13]], i8 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A14]], i8 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A15]], i8 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A16]], i8 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A17]], i8 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A18]], i8 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A19]], i8 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A20]], i8 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A21]], i8 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A22]], i8 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A23]], i8 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A24]], i8 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A25]], i8 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A26]], i8 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A27]], i8 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A28]], i8 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A29]], i8 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A30]], i8 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A31]], i8 [[B31]]) +; CHECK-NEXT: [[C32:%.*]] = call { 
i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A32]], i8 [[B32]]) +; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A33]], i8 [[B33]]) +; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A34]], i8 [[B34]]) +; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A35]], i8 [[B35]]) +; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A36]], i8 [[B36]]) +; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A37]], i8 [[B37]]) +; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A38]], i8 [[B38]]) +; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A39]], i8 [[B39]]) +; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A40]], i8 [[B40]]) +; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A41]], i8 [[B41]]) +; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A42]], i8 [[B42]]) +; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A43]], i8 [[B43]]) +; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A44]], i8 [[B44]]) +; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A45]], i8 [[B45]]) +; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A46]], i8 [[B46]]) +; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A47]], i8 [[B47]]) +; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A48]], i8 [[B48]]) +; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A49]], i8 [[B49]]) +; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A50]], i8 [[B50]]) +; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A51]], i8 [[B51]]) +; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A52]], i8 
[[B52]]) +; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A53]], i8 [[B53]]) +; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A54]], i8 [[B54]]) +; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A55]], i8 [[B55]]) +; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A56]], i8 [[B56]]) +; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A57]], i8 [[B57]]) +; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A58]], i8 [[B58]]) +; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A59]], i8 [[B59]]) +; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A60]], i8 [[B60]]) +; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A61]], i8 [[B61]]) +; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A62]], i8 [[B62]]) +; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A63]], i8 [[B63]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, 
i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 +; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 +; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 +; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 +; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 +; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 +; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 +; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 +; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 +; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 +; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 +; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 +; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 +; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 +; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 +; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 +; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 +; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 +; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 +; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 +; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 +; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 +; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 +; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 +; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 +; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 +; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 +; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 +; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 +; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 +; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 +; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 +; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 +; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 +; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr 
@c8, i32 0, i64 8), align 1 +; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 +; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 +; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 +; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 +; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 +; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 +; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 +; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 +; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 +; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 +; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 +; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 +; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 +; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 +; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 +; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 +; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 +; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, 
i32 0, i64 27), align 1 +; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 +; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 +; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 +; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 +; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 +; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 +; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 +; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 +; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 +; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 +; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 +; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 +; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 +; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 +; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 +; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 +; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 +; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 
0, i64 46), align 1 +; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 +; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 +; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 +; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 +; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 +; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 +; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 +; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 +; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 +; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 +; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 +; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 +; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 +; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 +; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 +; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 +; CHECK-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), 
align 1 @@ -1064,5 +1252,3 @@ define void @mul_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll index fa1ed4dd49c8d..d628dddd16cb1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl 
-passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,146 +26,48 @@ declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.ssub.with.overflow.i8 (i8 , i8 ) define void @sub_v8i64() { -; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SSE-NEXT: 
[[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, 
i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; 
SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v8i64( -; AVX-NEXT: 
[[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v8i64( -; KNL-NEXT: [[TMP1:%.*]] = 
load <8 x i64>, ptr @a64, align 8 -; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v8i64( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v8i64( +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; 
CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; CHECK-NEXT: 
[[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 +; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; CHECK-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -211,106 +113,88 @@ define void @sub_v8i64() { } define void @sub_v16i32() { -; SSE-LABEL: @sub_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SSE-NEXT: 
[[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SSE-NEXT: ret 
void -; -; SLM-LABEL: @sub_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <4 
x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <8 x i32> 
[[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v16i32( -; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v16i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v16i32( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP8]], 
ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v16i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], 
ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; 
CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, 
i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 +; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; CHECK-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr 
getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -396,106 +280,168 @@ define void @sub_v16i32() { } define void @sub_v32i16() { -; SSE-LABEL: @sub_v32i16( -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } 
@llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v32i16( -; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr 
inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v32i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP6:%.*]] = load 
<16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v32i16( -; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v32i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v32i16( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x 
i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v32i16( +; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; 
CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], 
ptr @a16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; 
CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A0]], i16 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A1]], i16 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A2]], i16 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A3]], i16 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = 
call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A4]], i16 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A5]], i16 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A6]], i16 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A7]], i16 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A8]], i16 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A9]], i16 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A10]], i16 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A11]], i16 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A12]], i16 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A13]], i16 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A14]], i16 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A15]], i16 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A16]], i16 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A17]], i16 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A18]], i16 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A19]], i16 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A20]], i16 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A21]], i16 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A22]], i16 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A23]], i16 [[B23]]) +; CHECK-NEXT: 
[[C24:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A24]], i16 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A25]], i16 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A26]], i16 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A27]], i16 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A28]], i16 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A29]], i16 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A30]], i16 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A31]], i16 [[B31]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } 
[[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 +; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 +; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 +; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 +; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 +; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 +; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 +; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 +; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 +; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 +; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 +; CHECK-NEXT: store i16 
[[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 +; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 +; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 +; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 +; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 +; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 +; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 +; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 +; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 +; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 +; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 +; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 +; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 +; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 +; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 +; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 +; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr 
@c16, i32 0, i64 29), align 2 +; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 +; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 +; CHECK-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -661,106 +607,328 @@ define void @sub_v32i16() { } define void @sub_v64i8() { -; SSE-LABEL: @sub_v64i8( -; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr 
inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v64i8( -; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = 
extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v64i8( -; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x 
i8> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v64i8( -; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v64i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v64i8( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, 
<32 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v64i8( +; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 +; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 +; CHECK-NEXT: 
[[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[A30:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[A48:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 +; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 +; 
CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[B21:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[B39:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A0]], i8 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A1]], i8 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A2]], i8 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A3]], i8 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A4]], i8 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A5]], i8 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A6]], i8 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A7]], i8 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A8]], i8 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A9]], i8 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A10]], i8 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A11]], i8 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A12]], i8 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } 
@llvm.ssub.with.overflow.i8(i8 [[A13]], i8 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A14]], i8 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A15]], i8 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A16]], i8 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A17]], i8 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A18]], i8 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A19]], i8 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A20]], i8 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A21]], i8 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A22]], i8 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A23]], i8 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A24]], i8 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A25]], i8 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A26]], i8 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A27]], i8 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A28]], i8 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A29]], i8 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A30]], i8 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A31]], i8 [[B31]]) +; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A32]], i8 [[B32]]) +; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A33]], i8 [[B33]]) +; 
CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A34]], i8 [[B34]]) +; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A35]], i8 [[B35]]) +; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A36]], i8 [[B36]]) +; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A37]], i8 [[B37]]) +; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A38]], i8 [[B38]]) +; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A39]], i8 [[B39]]) +; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A40]], i8 [[B40]]) +; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A41]], i8 [[B41]]) +; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A42]], i8 [[B42]]) +; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A43]], i8 [[B43]]) +; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A44]], i8 [[B44]]) +; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A45]], i8 [[B45]]) +; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A46]], i8 [[B46]]) +; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A47]], i8 [[B47]]) +; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A48]], i8 [[B48]]) +; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A49]], i8 [[B49]]) +; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A50]], i8 [[B50]]) +; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A51]], i8 [[B51]]) +; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A52]], i8 [[B52]]) +; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A53]], i8 [[B53]]) +; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } 
@llvm.ssub.with.overflow.i8(i8 [[A54]], i8 [[B54]]) +; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A55]], i8 [[B55]]) +; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A56]], i8 [[B56]]) +; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A57]], i8 [[B57]]) +; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A58]], i8 [[B58]]) +; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A59]], i8 [[B59]]) +; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A60]], i8 [[B60]]) +; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A61]], i8 [[B61]]) +; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A62]], i8 [[B62]]) +; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A63]], i8 [[B63]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 +; 
CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 +; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 +; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 +; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 +; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 +; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 +; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 +; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 +; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 +; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 +; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 +; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 +; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 +; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 +; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 +; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 +; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 +; CHECK-NEXT: 
[[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 +; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 +; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0 +; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 +; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 +; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 +; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 +; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 +; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 +; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 +; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 +; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 +; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 +; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 +; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 +; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 +; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 +; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 +; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 +; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 +; CHECK-NEXT: 
store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 +; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 +; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 +; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 +; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 +; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 +; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 +; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 +; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 +; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 +; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 +; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 +; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 +; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 +; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 +; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 +; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 +; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 +; CHECK-NEXT: 
store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 +; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 +; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 +; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 +; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 +; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 +; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 +; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 +; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 +; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 +; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 +; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 +; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 +; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 +; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 +; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 +; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 +; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 +; CHECK-NEXT: 
store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 +; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 +; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 +; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 +; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 +; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 +; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 +; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 +; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 +; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 +; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 +; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 +; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 +; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 +; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 +; CHECK-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1084,5 +1252,3 @@ define void @sub_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 
1 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll index 9c683eacc7062..11a68a5dfbcca 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s +; 
RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -26,146 +26,48 @@ declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) declare {i8 , i1} @llvm.usub.with.overflow.i8 (i8 , i8 ) define void @sub_v8i64() { -; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x 
i64], ptr @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], ptr 
getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } 
@llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 -; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 -; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 -; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 -; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 -; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 -; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 -; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 -; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8 -; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v8i64( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX-NEXT: [[TMP3:%.*]] = call 
{ <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v8i64( -; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x 
i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v8i64( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v8i64( +; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load i64, ptr 
getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } 
@llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0 +; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8 +; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; CHECK-NEXT: ret void ; %a0 = load i64, ptr @a64, align 8 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 @@ -211,106 +113,88 @@ define void @sub_v8i64() { } define void @sub_v16i32() { -; SSE-LABEL: @sub_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } 
@llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 -; 
SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 -; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 -; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 -; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 -; 
SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX2-NEXT: 
ret void -; -; KNL-LABEL: @sub_v16i32( -; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v16i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) -; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v16i32( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 -; AVX512_256-NEXT: ret 
void +; CHECK-LABEL: @sub_v16i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds 
([16 x i32], ptr @b32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4 +; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 +; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 +; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 +; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 +; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 +; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A2]], i32 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A3]], i32 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A4]], i32 [[B4]]) +; 
CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A5]], i32 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A6]], i32 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A7]], i32 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A8]], i32 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A9]], i32 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A10]], i32 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A11]], i32 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A12]], i32 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A13]], i32 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A14]], i32 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A15]], i32 [[B15]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0 +; CHECK-NEXT: 
[[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0 +; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4 +; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4 +; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4 +; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4 +; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4 +; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4 +; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4 +; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 +; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 +; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 +; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 +; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 +; CHECK-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 @@ -396,106 +280,168 @@ 
define void @sub_v16i32() { } define void @sub_v32i16() { -; SSE-LABEL: @sub_v32i16( -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x 
i16>, <8 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v32i16( -; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 -; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } 
@llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v32i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX2-NEXT: [[TMP7:%.*]] 
= call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v32i16( -; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v32i16( -; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 -; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 -; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) -; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v32i16( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2 -; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: [[TMP7:%.*]] 
= call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v32i16( +; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; 
CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; CHECK-NEXT: [[B1:%.*]] 
= load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; 
CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A0]], i16 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A1]], i16 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A2]], i16 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A3]], i16 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A4]], i16 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 
} @llvm.usub.with.overflow.i16(i16 [[A5]], i16 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A6]], i16 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A7]], i16 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A8]], i16 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A9]], i16 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A10]], i16 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A11]], i16 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A12]], i16 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A13]], i16 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A14]], i16 [[B14]]) +; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A15]], i16 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A16]], i16 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A17]], i16 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A18]], i16 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A19]], i16 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A20]], i16 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A21]], i16 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A22]], i16 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A23]], i16 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A24]], i16 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { 
i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A25]], i16 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A26]], i16 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A27]], i16 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A28]], i16 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A29]], i16 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A30]], i16 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A31]], i16 [[B31]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, 
i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0 +; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2 +; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2 +; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2 +; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2 +; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2 +; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2 +; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2 +; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2 +; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2 +; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 +; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 +; CHECK-NEXT: store i16 [[R12]], ptr 
getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 +; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 +; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 +; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 +; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 +; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 +; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 +; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 +; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 +; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 +; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 +; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 +; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 +; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 +; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 +; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 +; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, 
i64 30), align 2 +; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 +; CHECK-NEXT: ret void ; %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 @@ -661,106 +607,328 @@ define void @sub_v32i16() { } define void @sub_v64i8() { -; SSE-LABEL: @sub_v64i8( -; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr 
inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: ret void -; -; SLM-LABEL: @sub_v64i8( -; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 -; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0 -; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0 -; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 -; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) -; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0 -; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x 
i8], ptr @c8, i32 0, i64 32), align 1 -; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]]) -; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0 -; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SLM-NEXT: ret void -; -; AVX-LABEL: @sub_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX-NEXT: ret void -; -; AVX2-LABEL: @sub_v64i8( -; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX2-NEXT: store <32 x i8> 
[[TMP4]], ptr @c8, align 1 -; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX2-NEXT: ret void -; -; KNL-LABEL: @sub_v64i8( -; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; KNL-NEXT: ret void -; -; AVX512-LABEL: @sub_v64i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 -; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 -; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) -; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0 -; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1 -; AVX512-NEXT: ret void -; -; AVX512_256-LABEL: @sub_v64i8( -; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 -; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 -; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) -; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1 -; AVX512_256-NEXT: [[TMP5:%.*]] = 
load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]]) -; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0 -; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; AVX512_256-NEXT: ret void +; CHECK-LABEL: @sub_v64i8( +; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1 +; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[A13:%.*]] = load 
i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[A31:%.*]] = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 +; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1 +; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1 +; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1 +; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1 +; CHECK-NEXT: [[B4:%.*]] = 
load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1 +; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1 +; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1 +; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1 +; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1 +; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 +; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 +; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 +; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 +; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 +; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 +; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 +; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 +; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 +; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 +; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 +; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr 
inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 +; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 +; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 +; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 +; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 +; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 +; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 +; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 +; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 +; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 +; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 +; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 +; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 +; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 +; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 +; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 +; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 +; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x 
i8], ptr @b8, i32 0, i64 40), align 1 +; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 +; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 +; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 +; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 +; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 +; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 +; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 +; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 +; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 +; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 +; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 +; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 +; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 +; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 +; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 +; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 +; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 
0, i64 58), align 1 +; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 +; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 +; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 +; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 +; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 +; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A0]], i8 [[B0]]) +; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A1]], i8 [[B1]]) +; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A2]], i8 [[B2]]) +; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A3]], i8 [[B3]]) +; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A4]], i8 [[B4]]) +; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A5]], i8 [[B5]]) +; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A6]], i8 [[B6]]) +; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A7]], i8 [[B7]]) +; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A8]], i8 [[B8]]) +; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A9]], i8 [[B9]]) +; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A10]], i8 [[B10]]) +; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A11]], i8 [[B11]]) +; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A12]], i8 [[B12]]) +; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A13]], i8 [[B13]]) +; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A14]], i8 [[B14]]) +; 
CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A15]], i8 [[B15]]) +; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A16]], i8 [[B16]]) +; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A17]], i8 [[B17]]) +; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A18]], i8 [[B18]]) +; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A19]], i8 [[B19]]) +; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A20]], i8 [[B20]]) +; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A21]], i8 [[B21]]) +; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A22]], i8 [[B22]]) +; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A23]], i8 [[B23]]) +; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A24]], i8 [[B24]]) +; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A25]], i8 [[B25]]) +; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A26]], i8 [[B26]]) +; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A27]], i8 [[B27]]) +; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A28]], i8 [[B28]]) +; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A29]], i8 [[B29]]) +; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A30]], i8 [[B30]]) +; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A31]], i8 [[B31]]) +; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A32]], i8 [[B32]]) +; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A33]], i8 [[B33]]) +; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A34]], i8 [[B34]]) +; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } 
@llvm.usub.with.overflow.i8(i8 [[A35]], i8 [[B35]]) +; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A36]], i8 [[B36]]) +; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A37]], i8 [[B37]]) +; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A38]], i8 [[B38]]) +; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A39]], i8 [[B39]]) +; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A40]], i8 [[B40]]) +; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A41]], i8 [[B41]]) +; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A42]], i8 [[B42]]) +; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A43]], i8 [[B43]]) +; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A44]], i8 [[B44]]) +; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A45]], i8 [[B45]]) +; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A46]], i8 [[B46]]) +; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A47]], i8 [[B47]]) +; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A48]], i8 [[B48]]) +; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A49]], i8 [[B49]]) +; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A50]], i8 [[B50]]) +; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A51]], i8 [[B51]]) +; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A52]], i8 [[B52]]) +; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A53]], i8 [[B53]]) +; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A54]], i8 [[B54]]) +; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A55]], i8 [[B55]]) +; 
CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A56]], i8 [[B56]]) +; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A57]], i8 [[B57]]) +; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A58]], i8 [[B58]]) +; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A59]], i8 [[B59]]) +; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A60]], i8 [[B60]]) +; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A61]], i8 [[B61]]) +; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A62]], i8 [[B62]]) +; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A63]], i8 [[B63]]) +; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0 +; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0 +; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0 +; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0 +; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0 +; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0 +; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0 +; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0 +; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0 +; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0 +; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0 +; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0 +; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0 +; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0 +; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0 +; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0 +; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0 +; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0 +; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0 +; CHECK-NEXT: [[R19:%.*]] = 
extractvalue { i8, i1 } [[C19]], 0 +; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0 +; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0 +; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0 +; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0 +; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0 +; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0 +; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0 +; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0 +; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0 +; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0 +; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0 +; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0 +; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0 +; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0 +; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0 +; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0 +; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0 +; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0 +; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0 +; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0 +; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0 +; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0 +; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0 +; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0 +; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0 +; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0 +; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0 +; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0 +; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0 +; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0 +; CHECK-NEXT: [[R50:%.*]] = extractvalue { 
i8, i1 } [[C50]], 0 +; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0 +; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0 +; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0 +; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0 +; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0 +; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0 +; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0 +; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0 +; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0 +; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0 +; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0 +; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0 +; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0 +; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1 +; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1 +; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1 +; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1 +; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 +; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x 
i8], ptr @c8, i32 0, i64 11), align 1 +; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 +; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 +; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 +; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 +; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 +; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 +; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 +; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 +; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 +; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 +; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 +; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 +; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 +; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 +; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 +; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 +; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 +; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], 
ptr @c8, i32 0, i64 30), align 1 +; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 +; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 +; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 +; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 +; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 +; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 +; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 +; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 +; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 +; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 +; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 +; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 +; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 +; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 +; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 +; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 +; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr 
@c8, i32 0, i64 49), align 1 +; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 +; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 +; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 +; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 +; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 +; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 +; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 +; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 +; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 +; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 +; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 +; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 +; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 +; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 +; CHECK-NEXT: ret void ; %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 @@ -1084,5 +1252,3 @@ define void @sub_v64i8() { store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll index 9683f71bd40e0..7740aaa14b805 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll @@ -13,12 +13,10 @@ define float @test(ptr %0, double %1, double %2, double %3) { ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x double> [[TMP9]], <3 x double> poison, <3 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = fmul <3 x double> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1356 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP28]], +; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP1]], 0.000000e+00 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> , double [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> poison, <3 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> , <3 x double> [[TMP18]], <3 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <3 x double> , double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <3 x double> [[TMP18]], double [[TMP16]], i32 0 ; CHECK-NEXT: [[TMP20:%.*]] = fadd <3 x double> [[TMP17]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <3 x double> , double [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP22:%.*]] = fadd <3 x double> [[TMP21]], [[TMP20]] diff --git a/llvm/test/Transforms/SLPVectorizer/sincos.ll b/llvm/test/Transforms/SLPVectorizer/sincos.ll index 504467d0049d7..76545dedac5f5 100644 --- a/llvm/test/Transforms/SLPVectorizer/sincos.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/sincos.ll @@ -8,40 +8,52 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr @phase, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call fast { <8 x double>, <8 x double> } @llvm.sincos.v8f64(<8 x double> [[TMP0]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @phase, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, double } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { double, double } [[TMP1]], 1 ; CHECK-NEXT: store double [[TMP2]], ptr @sinval, align 16 ; CHECK-NEXT: store double [[TMP3]], ptr @cosval, align 16 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { double, double } [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 1 ; CHECK-NEXT: store double [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 8), align 8 ; CHECK-NEXT: store double [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 8), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x double> [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x double> [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr getelementptr 
inbounds nuw (i8, ptr @phase, i64 16), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { double, double } [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { double, double } [[TMP9]], 1 ; CHECK-NEXT: store double [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 16 ; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 16 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x double> [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x double> [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 24), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { double, double } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { double, double } [[TMP13]], 1 ; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 24), align 8 ; CHECK-NEXT: store double [[TMP15]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 24), align 8 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x double> [[TMP5]], i32 4 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x double> [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16 +; CHECK-NEXT: [[TMP17:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { double, double } [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { double, double } [[TMP17]], 1 ; CHECK-NEXT: store double [[TMP18]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16 ; CHECK-NEXT: store double [[TMP19]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x 
double> [[TMP5]], i32 5 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x double> [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 40), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, double } [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { double, double } [[TMP21]], 1 ; CHECK-NEXT: store double [[TMP22]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 40), align 8 ; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 40), align 8 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x double> [[TMP5]], i32 6 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x double> [[TMP4]], i32 6 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP24]]) +; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { double, double } [[TMP25]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { double, double } [[TMP25]], 1 ; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 16 ; CHECK-NEXT: store double [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 16 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x double> [[TMP5]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x double> [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 56), align 8 +; CHECK-NEXT: [[TMP29:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { double, double } [[TMP29]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { double, double } [[TMP29]], 1 ; CHECK-NEXT: store double [[TMP30]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 56), 
align 8 ; CHECK-NEXT: store double [[TMP31]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 56), align 8 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll index 10bee3262f738..45d6e395b6886 100644 --- a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll @@ -8,24 +8,28 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr @phase, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call fast { <8 x double>, <8 x double> } @llvm.sincos.v8f64(<8 x double> [[TMP0]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr @phase, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 1 ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr @sinval, align 16 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr @cosval, align 16 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> 
[[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 1 ; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 8 ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 1 ; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16 ; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 1 ; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 8 ; CHECK-NEXT: store <2 x double> [[TMP15]], ptr 
getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 8 ; CHECK-NEXT: ret i32 0 From a714b73f530f5d6dac88eb5c10509efb8d6c8232 Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Fri, 8 May 2026 19:42:47 +0200 Subject: [PATCH 078/538] [OFFLOAD][L0] Fix incorrect values in the Level Zero cached header (#196587) The current ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT and ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC values are incorrect as seen here: * https://github.com/oneapi-src/level-zero/blob/0f246f6edf90d56604f00f83b41d783dc6a9394e/include/ze_api.h#L318 * https://github.com/oneapi-src/level-zero/blob/0f246f6edf90d56604f00f83b41d783dc6a9394e/include/ze_api.h#L324 --- .../plugins-nextgen/level_zero/dynamic_l0/level_zero/ze_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/level_zero/dynamic_l0/level_zero/ze_api.h b/offload/plugins-nextgen/level_zero/dynamic_l0/level_zero/ze_api.h index 81a4c00269160..17e35a37f116e 100644 --- a/offload/plugins-nextgen/level_zero/dynamic_l0/level_zero/ze_api.h +++ b/offload/plugins-nextgen/level_zero/dynamic_l0/level_zero/ze_api.h @@ -166,8 +166,8 @@ typedef enum _ze_structure_type_t { ZE_STRUCTURE_TYPE_KERNEL_DESC = 0x1d, ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES = 0x1e, ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES = 0x21, - ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT = 0x00020001, - ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC = 0x00030001, + ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT = 0x1000f, + ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC = 0x00020001, ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff } ze_structure_type_t; From 980a4619fc78bc07ecd3577c31673861ce408f9d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 8 May 2026 18:50:35 +0100 Subject: [PATCH 079/538] clang: Consolidate -aux-triple handling (#196551) All of the offload languages were essentially doing the same thing, with overcomplicated conditions conditional on the language. 
--- clang/include/clang/Driver/Action.h | 3 + clang/lib/Driver/ToolChains/Clang.cpp | 92 +++++++++----------- clang/test/Driver/sycl-offload-jit-xarch.cpp | 2 +- 3 files changed, 45 insertions(+), 52 deletions(-) diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index dbf1187da4db9..67937b00f6bcf 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -96,6 +96,9 @@ class Action { OFK_OpenMP = 0x04, OFK_HIP = 0x08, OFK_SYCL = 0x10, + + OFK_DeviceFirst = OFK_Cuda, + OFK_DeviceLast = OFK_SYCL }; static const char *getClassName(ActionClass AC); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index bdffa4fdd7e6b..92b3045dceff2 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4997,18 +4997,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } - const llvm::Triple *AuxTriple = - (IsCuda || IsHIP || IsSYCL) ? TC.getAuxTriple() : nullptr; - bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment(); bool IsUEFI = RawTriple.isUEFI(); bool IsIAMCU = RawTriple.isOSIAMCU(); - // Adjust IsWindowsXYZ for CUDA/HIP/SYCL compilations. Even when compiling in - // device mode (i.e., getToolchain().getTriple() is NVPTX/AMDGCN, not - // Windows), we need to pass Windows-specific flags to cc1. - if (IsCuda || IsHIP || IsSYCL) - IsWindowsMSVC |= AuxTriple && AuxTriple->isWindowsMSVCEnvironment(); - // C++ is not supported for IAMCU. 
if (IsIAMCU && types::isCXX(Input.getType())) D.Diag(diag::err_drv_clang_unsupported) << "C++ for IAMCU"; @@ -5022,6 +5013,34 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-triple"); CmdArgs.push_back(Args.MakeArgStringRef(TripleStr)); + bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment(); + + const llvm::Triple *AuxTriple = TC.getAuxTriple(); + if (AuxTriple) { + CmdArgs.push_back("-aux-triple"); + CmdArgs.push_back(Args.MakeArgStringRef(AuxTriple->str())); + + // Adjust IsWindowsXYZ for CUDA/HIP/SYCL compilations. Even when compiling + // in device mode (i.e., getToolchain().getTriple() is NVPTX/AMDGCN, not + // Windows), we need to pass Windows-specific flags to cc1. + IsWindowsMSVC |= AuxTriple->isWindowsMSVCEnvironment(); + } else if (JA.getOffloadingHostActiveKinds() != Action::OFK_None) { + // Figure out the device side triple for the host-side compilation. + for (unsigned I = Action::OFK_DeviceFirst; I <= Action::OFK_DeviceLast; + ++I) { + Compilation::const_offload_toolchains_range OffloadToolChains = + C.getOffloadToolChains(static_cast(I)); + if (OffloadToolChains.first == OffloadToolChains.second) + continue; + + const llvm::Triple &DeviceAuxTriple = + OffloadToolChains.first->second->getTriple(); + CmdArgs.push_back("-aux-triple"); + CmdArgs.push_back(Args.MakeArgStringRef(DeviceAuxTriple.str())); + break; + } + } + if (const Arg *MJ = Args.getLastArg(options::OPT_MJ)) { DumpCompilationDatabase(C, MJ->getValue(), TripleStr, Output, Input, Args); Args.ClaimAllArgs(options::OPT_MJ); @@ -5032,38 +5051,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.ClaimAllArgs(options::OPT_gen_cdb_fragment_path); } - if (IsCuda || IsHIP) { - CmdArgs.push_back("-aux-triple"); - - // We have to pass the triple of the host if compiling for a CUDA/HIP device - // and vice-versa. 
- if (IsCudaDevice || IsHIPDevice) { - StringRef AuxTripleStr = - C.getSingleOffloadToolChain()->getTriple().str(); - CmdArgs.push_back(Args.MakeArgStringRef(AuxTripleStr)); - } else { - // Host-side compilation. - StringRef AuxTripleStr = - (IsCuda ? C.getOffloadToolChains(Action::OFK_Cuda).first->second - : C.getOffloadToolChains(Action::OFK_HIP).first->second) - ->getTriple() - .str(); - CmdArgs.push_back(Args.MakeArgStringRef(AuxTripleStr)); - } - - if (JA.isDeviceOffloading(Action::OFK_HIP) && - (getToolChain().getTriple().isAMDGPU() || - (getToolChain().getTriple().isSPIRV() && - getToolChain().getTriple().getVendor() == llvm::Triple::AMD))) { - // Device side compilation printf - if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) { - CmdArgs.push_back(Args.MakeArgString( - "-mprintf-kind=" + - Args.getLastArgValue(options::OPT_mprintf_kind_EQ))); - // Force compiler error on invalid conversion specifiers - CmdArgs.push_back( - Args.MakeArgStringRef("-Werror=format-invalid-specifier")); - } + if ((getToolChain().getTriple().isAMDGPU() || + (getToolChain().getTriple().isSPIRV() && + getToolChain().getTriple().getVendor() == llvm::Triple::AMD))) { + // Device side compilation printf + if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) { + CmdArgs.push_back(Args.MakeArgString( + "-mprintf-kind=" + + Args.getLastArgValue(options::OPT_mprintf_kind_EQ))); + // Force compiler error on invalid conversion specifiers + CmdArgs.push_back( + Args.MakeArgStringRef("-Werror=format-invalid-specifier")); } } @@ -5095,12 +5093,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (IsSYCL) { if (IsSYCLDevice) { - // Host triple is needed when doing SYCL device compilations. - llvm::Triple AuxT = C.getDefaultToolChain().getTriple(); - std::string NormalizedTriple = AuxT.normalize(); - CmdArgs.push_back("-aux-triple"); - CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); - // We want to compile sycl kernels. 
CmdArgs.push_back("-fsycl-is-device"); @@ -6150,12 +6142,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // Prepare `-aux-target-cpu` and `-aux-target-feature` unless // `--gpu-use-aux-triple-only` is specified. - if (!Args.getLastArg(options::OPT_gpu_use_aux_triple_only) && - (IsCudaDevice || IsHIPDevice || IsSYCLDevice)) { + if (AuxTriple && !Args.getLastArg(options::OPT_gpu_use_aux_triple_only)) { const ArgList &HostArgs = C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_None); - std::string HostCPU = - getCPUName(D, HostArgs, *TC.getAuxTriple(), /*FromAs*/ false); + std::string HostCPU = getCPUName(D, HostArgs, *AuxTriple, /*FromAs*/ false); if (!HostCPU.empty()) { CmdArgs.push_back("-aux-target-cpu"); CmdArgs.push_back(Args.MakeArgString(HostCPU)); diff --git a/clang/test/Driver/sycl-offload-jit-xarch.cpp b/clang/test/Driver/sycl-offload-jit-xarch.cpp index e8a685f50fe84..53af2b16a33f1 100644 --- a/clang/test/Driver/sycl-offload-jit-xarch.cpp +++ b/clang/test/Driver/sycl-offload-jit-xarch.cpp @@ -4,7 +4,7 @@ // and clang-linker-wrapper call. // RUN: %clang -fsycl --offload-targets=spirv64-unknown-unknown -Xarch_spirv64 -O3 -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=SYCL-DEVICE-O3 %s -// SYCL-DEVICE-O3: "-triple" "spirv64-unknown-unknown" "-O3"{{.*}} "-fsycl-is-device" +// SYCL-DEVICE-O3: "-triple" "spirv64-unknown-unknown" "-aux-triple" "{{.*}}" "-O3"{{.*}} "-fsycl-is-device" // SYCL-DEVICE-O3: {{"[^"]*clang-linker-wrapper[^"]*".* "--device-compiler=spirv64-unknown-unknown=-O3"}} // Verify that `-Xarch_spirv64` forwards libraries to the device linker. 
From 23be2035d437ed23abb22f4dff1752eb82adb659 Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Fri, 8 May 2026 13:58:51 -0400 Subject: [PATCH 080/538] [flang][OpenACC] support collapse on unstructured acc.loop (#196174) PR #164992 added unstructured-loop support to OpenACC lowering (no bounds on acc.loop, IVs privatized, body emitted as explicit cf), but it didn't cover the `collapse(N)` case. Compiling ``` !$acc parallel loop collapse(2) do j = 1, n do i = 1, n if (i == jdiag) then a(i,j) = 0.0d0 cycle end if a(i,j) = real(i + j, 8) end do end do ``` asserted in MLIR's runRegionDCE: "Assertion `mightHaveTerminator()' failed". Root cause: visitLoopControl unconditionally marked every inner DO of a collapsed nest via markDoConstructAsCollapsed. genFIR(DoConstruct) then read that marking and skipped the inner DO's loop machinery on the assumption that the parent acc.loop iterates and supplies the IV via a block argument. That assumption holds for the structured case, but not for the unstructured case added in #164992. Skipping it left the PFT-pre-allocated scaffold blocks (pre-header, header, exit) without terminators. Fix: add a markInnerCollapsed parameter (default true) to visitLoopControl, and pass false from privatizeInductionVariables (the unstructured case of buildACCLoopOp). Assisted-by: AI --- flang/lib/Lower/OpenACC.cpp | 41 ++++++--- flang/test/Lower/OpenACC/acc-unstructured.f90 | 88 +++++++++++++++++++ 2 files changed, 118 insertions(+), 11 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 161023aa51db8..2e0ae0bdf7a27 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1504,13 +1504,22 @@ static void determineDefaultLoopParMode( } // Helper to visit Bounds of DO LOOP nest. 
+// +// When `markInnerCollapsed` is true (the default), inner DOs that are absorbed +// into the parent acc.loop's collapse semantics are tagged via +// markDoConstructAsCollapsed so that genFIR(DoConstruct) skips re-emitting +// their loop machinery (the parent acc.loop iterates and supplies the IV via +// a block argument). Pass `false` when the parent acc.loop is unstructured: +// it has no IV block arguments and the inner DO's iteration mechanics must +// be emitted as ordinary control flow inside the acc.loop body. static void visitLoopControl( Fortran::lower::AbstractConverter &converter, const Fortran::parser::DoConstruct &outerDoConstruct, uint64_t loopsToProcess, Fortran::lower::pft::Evaluation &eval, std::function - callback) { + callback, + bool markInnerCollapsed = true) { Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); for (uint64_t i = 0; i < loopsToProcess; ++i) { const Fortran::parser::LoopControl *loopControl; @@ -1537,7 +1546,8 @@ static void visitLoopControl( if (!innerDo) break; // No deeper loop; stop collecting collapsed bounds. 
- Fortran::lower::markDoConstructAsCollapsed(*innerDo); + if (markInnerCollapsed) + Fortran::lower::markDoConstructAsCollapsed(*innerDo); mlir::Location loc = converter.genLocation(Fortran::parser::FindSourceLocation(*innerDo)); if (innerDo->IsDoConcurrent()) @@ -1920,15 +1930,24 @@ static void privatizeInductionVariables( llvm::SmallVector ivLocs; assert(!outerDoConstruct.IsDoConcurrent() && "do concurrent loops are not expected to contained early exits"); - visitLoopControl(converter, outerDoConstruct, loopsToProcess, eval, - [&](const Fortran::parser::LoopControl::Bounds &bounds, - mlir::Location loc) { - locs.push_back(loc); - Fortran::semantics::Symbol &ivSym = - bounds.Name().thing.symbol->GetUltimate(); - privatizeIv(converter, ivSym, currentLocation, ivTypes, - ivLocs, privateOperands, ivPrivate); - }); + // Do not mark inner DOs as "collapsed": the parent acc.loop is unstructured + // and has no IV block arguments, so each inner DO must emit its own + // iteration mechanics as cf inside the acc.loop body. Marking them collapsed + // would cause genFIR(DoConstruct) to lower only the body and leave the + // pre-allocated PFT scaffold blocks (preheader, header, exit) without + // terminators -- which then trips runRegionDCE's mightHaveTerminator + // assertion. 
+ visitLoopControl( + converter, outerDoConstruct, loopsToProcess, eval, + [&](const Fortran::parser::LoopControl::Bounds &bounds, + mlir::Location loc) { + locs.push_back(loc); + Fortran::semantics::Symbol &ivSym = + bounds.Name().thing.symbol->GetUltimate(); + privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate); + }, + /*markInnerCollapsed=*/false); } static mlir::acc::LoopOp diff --git a/flang/test/Lower/OpenACC/acc-unstructured.f90 b/flang/test/Lower/OpenACC/acc-unstructured.f90 index 57b678c1200d9..ce58ae90bdc35 100644 --- a/flang/test/Lower/OpenACC/acc-unstructured.f90 +++ b/flang/test/Lower/OpenACC/acc-unstructured.f90 @@ -222,3 +222,91 @@ subroutine test_unstructured8(a, n) ! CHECK: fir.load %{{.*}} : !fir.ref ! CHECK: arith.cmpi eq ! CHECK: cf.cond_br + +! Test that `acc parallel loop collapse(N)` whose body has an early-exit +! (here, `if (cond) then ... cycle ... end if`) lowers cleanly. The +! corresponding acc.loop must privatize all N induction variables, carry +! both `collapse = [N]` and `unstructured` attributes, and emit the +! iteration mechanics for all N levels as explicit cf inside the body. +! Reproducer derived from lorado issue #2856. +subroutine test_unstructured_collapse_cycle(a) + integer :: i, j, jdiag + real(8) :: a(:,:) + jdiag = 4 + !$acc parallel loop collapse(2) copy(a) + do j = 1, 8 + do i = 1, 8 + if (i == jdiag) then + a(i, j) = 0.0d0 + cycle + end if + a(i, j) = real(i + j, 8) + end do + end do + !$acc end parallel loop +end subroutine + +! CHECK-LABEL: func.func @_QPtest_unstructured_collapse_cycle +! CHECK: acc.parallel combined(loop) +! Both induction variables (j and i) are privatized: +! CHECK: %[[PRIVJ:.*]] = acc.private varPtr(%{{.*}} : !fir.ref) recipe(@privatization_ref_i32) -> !fir.ref {implicit = true, name = "j"} +! CHECK: %[[PRIVI:.*]] = acc.private varPtr(%{{.*}} : !fir.ref) recipe(@privatization_ref_i32) -> !fir.ref {implicit = true, name = "i"} +! No control(...) 
on acc.loop — bounds are not on the op: +! CHECK: acc.loop combined(parallel) private(%[[PRIVJ]], %[[PRIVI]] : !fir.ref, !fir.ref) { +! Outer loop trip-count test (j) emitted as cf: +! CHECK: arith.cmpi sgt +! CHECK: cf.cond_br +! Inner loop trip-count test (i) emitted as cf: +! CHECK: arith.cmpi sgt +! CHECK: cf.cond_br +! The if/cycle is a structured cf branch in the body: +! CHECK: arith.cmpi eq +! CHECK: cf.cond_br +! CHECK: acc.yield +! CHECK: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type], unstructured} + +! Test that `acc parallel loop collapse(N)` lowers cleanly when the early-exit +! is a STOP (the form already covered for collapse=1 by test_unstructured2). +subroutine test_unstructured_collapse_stop(a) + integer :: i, j, k + real :: a(:,:,:) + !$acc parallel loop collapse(3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + if (a(1,2,3) > 10) stop 'just to be unstructured' + end do + end do + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_unstructured_collapse_stop +! All three IVs privatized: +! CHECK: acc.private varPtr(%{{.*}} : !fir.ref) recipe(@privatization_ref_i32) -> !fir.ref {implicit = true, name = "i"} +! CHECK: acc.private varPtr(%{{.*}} : !fir.ref) recipe(@privatization_ref_i32) -> !fir.ref {implicit = true, name = "j"} +! CHECK: acc.private varPtr(%{{.*}} : !fir.ref) recipe(@privatization_ref_i32) -> !fir.ref {implicit = true, name = "k"} +! CHECK: acc.loop combined(parallel) private(%{{.*}}, %{{.*}}, %{{.*}} : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: fir.call @_FortranAStopStatementText +! CHECK: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type], unstructured} + +! 
Test orphaned `acc loop collapse(N)` +subroutine test_unstructured_collapse_loop_only(a) + integer :: i, j, jdiag + real(8) :: a(:,:) + jdiag = 4 + !$acc loop collapse(2) + do j = 1, 8 + do i = 1, 8 + if (i == jdiag) then + a(i, j) = 0.0d0 + cycle + end if + a(i, j) = real(i + j, 8) + end do + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_unstructured_collapse_loop_only +! Standalone acc.loop (no `combined(...)`): +! CHECK: acc.loop private(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) { +! CHECK: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type], unstructured} From 0f3d9ce6ce2c1482dbc2c41af3d28ed17d4e8fe0 Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 8 May 2026 13:17:23 -0500 Subject: [PATCH 081/538] [flang][OpenMP] Fix component-level initializer in declare reduction (#195751) When a declare reduction initializer uses a component assignment such as `initializer(omp_priv%member = 0)`, the lowering would store the scalar RHS value (i32) directly to the whole derived-type reference, causing a FIR verification error: `'fir.store' op store value type must match memory reference type`. The root cause is that `MakeEvaluateExpr` extracts only the RHS expression from the `AssignmentStmt`, discarding the LHS component information. The lowering callback then returns this scalar value which gets stored to the wrong type. Fix this by mirroring the approach already used for combiner expressions: pass the parser-level `OmpStylizedInstance` to `processInitializer` so the callback can access the typed assignment and lower the full assignment (both LHS and RHS), correctly handling component designators, function calls on the RHS, and user-defined assignment. Fixes #184927 (with-initializer part; the without-initializer case remains unsupported). Assisted-by: Claude Opus 4.6. Co-authored-by: Matt P. 
Dziubinski --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 55 +++++++++++++++-- flang/lib/Lower/OpenMP/ClauseProcessor.h | 3 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 20 ++++++- .../OpenMP/declare-reduction-finalizer.f90 | 3 +- ...eclare-reduction-initializer-component.f90 | 41 +++++++++++++ ...e-reduction-initializer-defined-assign.f90 | 60 +++++++++++++++++++ ...declare-reduction-initializer-rhs-call.f90 | 52 ++++++++++++++++ 7 files changed, 225 insertions(+), 9 deletions(-) create mode 100644 flang/test/Lower/OpenMP/declare-reduction-initializer-component.f90 create mode 100644 flang/test/Lower/OpenMP/declare-reduction-initializer-defined-assign.f90 create mode 100644 flang/test/Lower/OpenMP/declare-reduction-initializer-rhs-call.f90 diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 1c39e90a922cf..5f5b4fe77f701 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -523,12 +523,26 @@ bool ClauseProcessor::processInclusive( } bool ClauseProcessor::processInitializer( - lower::SymMap &symMap, - ReductionProcessor::GenInitValueCBTy &genInitValueCB) const { + lower::SymMap &symMap, ReductionProcessor::GenInitValueCBTy &genInitValueCB, + const parser::OmpStylizedInstance *parserInitInstance) const { if (auto *clause = findUniqueClause()) { - genInitValueCB = [&, clause](fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Type type, mlir::Value moldArg, - mlir::Value privArg) { + // Extract the typed assignment from the parser-level instance, if + // the initializer is an assignment statement (as opposed to a call). 
+ const evaluate::Assignment *assign = nullptr; + if (parserInitInstance) { + const auto &instance = std::get( + parserInitInstance->t); + if (const auto *assignStmt = + std::get_if(&instance.u)) { + if (auto *wrapper = assignStmt->typedAssignment.get()) + if (wrapper->v) + assign = &*wrapper->v; + } + } + genInitValueCB = [&, clause, assign](fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Type type, + mlir::Value moldArg, + mlir::Value privArg) { lower::SymMapScope scope(symMap); mlir::Value ompPrivVar; const StylizedInstance &inst = clause->v.front(); @@ -592,6 +606,37 @@ bool ClauseProcessor::processInitializer( return privVal; }, [&](const auto &expr) -> mlir::Value { + // For by-ref reductions with a typed assignment, lower + // the full assignment (both LHS and RHS) directly. + // This handles both whole-variable (omp_priv = val) and + // component-level (omp_priv%member = val) initializers. + // Mirror the combiner pattern: dispatch on assign->u to + // handle both intrinsic and user-defined assignment. 
+ if (privArg && assign) { + lower::StatementContext assignCtx; + hlfir::Entity rhs = lower::convertExprToHLFIR( + loc, converter, assign->rhs, symMap, assignCtx); + rhs = hlfir::loadTrivialScalar(loc, builder, rhs); + hlfir::Entity lhs = lower::convertExprToHLFIR( + loc, converter, assign->lhs, symMap, assignCtx); + common::visit( + common::visitors{ + [&](const evaluate::Assignment::Intrinsic &) { + hlfir::AssignOp::create(builder, loc, rhs, lhs); + }, + [&](const evaluate::ProcedureRef &procRef) { + lower::convertUserDefinedAssignmentToHLFIR( + loc, converter, procRef, lhs, rhs, symMap); + }, + [&](const auto &) { + llvm_unreachable("Unexpected assignment type in " + "reduction initializer"); + }, + }, + assign->u); + assignCtx.finalizeAndPop(); + return mlir::Value{}; + } mlir::Value exprResult = fir::getBase(convertExprToValue( loc, converter, initExpr, symMap, stmtCtx)); if (auto refType = llvm::dyn_cast( diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index acf1068efb987..e138b4df30b71 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -97,7 +97,8 @@ class ClauseProcessor { mlir::omp::InclusiveClauseOps &result) const; bool processInitializer( lower::SymMap &symMap, - ReductionProcessor::GenInitValueCBTy &genInitValueCB) const; + ReductionProcessor::GenInitValueCBTy &genInitValueCB, + const parser::OmpStylizedInstance *parserInitInstance = nullptr) const; bool processMergeable(mlir::omp::MergeableClauseOps &result) const; bool processNogroup(mlir::omp::NogroupClauseOps &result) const; bool processNotinbranch(mlir::omp::NotinbranchClauseOps &result) const; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 88d28cf94b045..0ffc7bdae85b9 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -4215,6 +4215,18 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, 
assert(combinerExpr && "Expecting combiner expression"); auto parserInstIt = combinerExpr->v.begin(); + // Get the parser-level initializer expression (if present) so we can + // pass each parser::OmpStylizedInstance to processInitializer. + const parser::OmpInitializerExpression *initExpr = nullptr; + for (const auto &clause : construct.v.Clauses().v) { + initExpr = parser::omp::GetInitializerExpr(clause); + if (initExpr) + break; + } + auto parserInitInstIt = + initExpr ? initExpr->v.begin() + : std::list::const_iterator{}; + for (const auto &typeSpec : typeNameList.v) { (void)typeSpec; // Currently unused @@ -4257,7 +4269,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, parserInst); ReductionProcessor::GenInitValueCBTy genInitValueCB; ClauseProcessor cp(converter, semaCtx, clauses); - cp.processInitializer(symTable, genInitValueCB); + const parser::OmpStylizedInstance *parserInitInst = nullptr; + if (initExpr) { + assert(parserInitInstIt != initExpr->v.end() && + "Mismatched initializer instance count"); + parserInitInst = &*parserInitInstIt++; + } + cp.processInitializer(symTable, genInitValueCB, parserInitInst); mlir::Type redType = isByRef ? static_cast(fir::ReferenceType::get(reductionType)) diff --git a/flang/test/Lower/OpenMP/declare-reduction-finalizer.f90 b/flang/test/Lower/OpenMP/declare-reduction-finalizer.f90 index 2ec34c446e793..22a653179ce2d 100644 --- a/flang/test/Lower/OpenMP/declare-reduction-finalizer.f90 +++ b/flang/test/Lower/OpenMP/declare-reduction-finalizer.f90 @@ -39,8 +39,7 @@ end module m1 ! CHECK: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[INIT_ARG1]] {uniq_name = "omp_priv"} ! CHECK: %[[INIT_ADDR:.*]] = fir.address_of(@_QQro._QMm1Tt.0) ! CHECK: %[[INIT_DECL:.*]]:2 = hlfir.declare %[[INIT_ADDR]] -! CHECK: %[[INIT_VAL:.*]] = fir.load %[[INIT_DECL]]#0 -! CHECK: fir.store %[[INIT_VAL]] to %[[INIT_ARG1]] +! CHECK: hlfir.assign %[[INIT_DECL]]#0 to %[[PRIV_DECL]]#0 ! CHECK: omp.yield(%[[INIT_ARG1]] : ! ! 
-- combiner region diff --git a/flang/test/Lower/OpenMP/declare-reduction-initializer-component.f90 b/flang/test/Lower/OpenMP/declare-reduction-initializer-component.f90 new file mode 100644 index 0000000000000..b42fa610d17e0 --- /dev/null +++ b/flang/test/Lower/OpenMP/declare-reduction-initializer-component.f90 @@ -0,0 +1,41 @@ +! Test component-level initializer in declare reduction for derived types. +! Verifies that `initializer(omp_priv%member = 0)` correctly lowers to +! a component designate + assign (hlfir.designate + hlfir.assign), rather +! than storing the scalar directly to the whole derived-type reference. +! +! This is a regression test for https://github.com/llvm/llvm-project/issues/184927 + +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +subroutine test_component_init() + implicit none + type :: t + integer :: member + end type t + integer :: i + !$omp declare reduction(add_member : t : & + !$omp& omp_out%member = omp_out%member + omp_in%member) & + !$omp& initializer(omp_priv%member = 0) + type(t) :: x + x%member = 0 + !$omp parallel do reduction(add_member : x) num_threads(2) + do i = 1, 10 + x%member = x%member + 1 + end do + !$omp end parallel do +end subroutine + +!CHECK: omp.declare_reduction @add_member : !fir.ref> +!CHECK-SAME: alloc { +!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFtest_component_initTt{member:i32}> +!CHECK: omp.yield(%[[ALLOCA]] : !fir.ref>) +!CHECK: } init { +!CHECK: ^bb0(%[[INIT_ARG0:.*]]: !fir.ref>, +!CHECK-SAME: %[[INIT_ARG1:.*]]: !fir.ref>): +!CHECK: %[[OMP_ORIG:.*]]:2 = hlfir.declare %[[INIT_ARG0]] {uniq_name = "omp_orig"} +!CHECK: %[[OMP_PRIV:.*]]:2 = hlfir.declare %[[INIT_ARG1]] {uniq_name = "omp_priv"} +!CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 +!CHECK: %[[MEMBER:.*]] = hlfir.designate %[[OMP_PRIV]]#0{"member"} : (!fir.ref>) -> !fir.ref +!CHECK: hlfir.assign %[[ZERO]] to %[[MEMBER]] : i32, !fir.ref +!CHECK: omp.yield(%[[INIT_ARG1]] : !fir.ref>) +!CHECK: } combiner { diff --git 
a/flang/test/Lower/OpenMP/declare-reduction-initializer-defined-assign.f90 b/flang/test/Lower/OpenMP/declare-reduction-initializer-defined-assign.f90 new file mode 100644 index 0000000000000..bdf48626fd2b3 --- /dev/null +++ b/flang/test/Lower/OpenMP/declare-reduction-initializer-defined-assign.f90 @@ -0,0 +1,60 @@ +! Test user-defined assignment in declare reduction initializer. +! Verifies that `initializer(omp_priv = t(1))` correctly dispatches to the +! user-defined `assignment(=)` subroutine, not intrinsic assignment. +! +! This is a regression test for https://github.com/llvm/llvm-project/issues/184927 + +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +module m_defined_assign + implicit none + type :: t + integer :: val = -999 + end type t + + interface assignment(=) + module procedure custom_assign + end interface + +contains + subroutine custom_assign(lhs, rhs) + type(t), intent(out) :: lhs + type(t), intent(in) :: rhs + lhs%val = rhs%val * 10 + end subroutine +end module + +subroutine test_defined_assign_init() + use m_defined_assign + implicit none + integer :: i + type(t) :: x + + !$omp declare reduction(add_t : t : omp_out%val = omp_out%val + omp_in%val) & + !$omp& initializer(omp_priv = t(1)) + + x = t(0) + !$omp parallel do reduction(add_t : x) num_threads(2) + do i = 1, 2 + x%val = x%val + 1 + end do + !$omp end parallel do +end subroutine + +!CHECK: omp.declare_reduction @add_t : +!CHECK-SAME: alloc { +!CHECK: %[[ALLOCA:.*]] = fir.alloca +!CHECK: omp.yield(%[[ALLOCA]] : +!CHECK: } init { +!CHECK: ^bb0(%[[INIT_ARG0:.*]]: !fir.ref>, +!CHECK-SAME: %[[INIT_ARG1:.*]]: !fir.ref>): +!CHECK: %[[OMP_ORIG:.*]]:2 = hlfir.declare %[[INIT_ARG0]] {uniq_name = "omp_orig"} +!CHECK: %[[OMP_PRIV:.*]]:2 = hlfir.declare %[[INIT_ARG1]] {uniq_name = "omp_priv"} +!CHECK: %[[INIT_ADDR:.*]] = fir.address_of(@_QQro._QMm_defined_assignTt.0) +!CHECK: %[[INIT_DECL:.*]]:2 = hlfir.declare %[[INIT_ADDR]] +!CHECK: %[[AS_EXPR:.*]] = hlfir.as_expr %[[INIT_DECL]]#0 
+!CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[AS_EXPR]] {adapt.valuebyref} +!CHECK: fir.call @_QMm_defined_assignPcustom_assign(%[[OMP_PRIV]]#0, %[[ASSOC]]#0) +!CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 +!CHECK: omp.yield(%[[INIT_ARG1]] : +!CHECK: } combiner { diff --git a/flang/test/Lower/OpenMP/declare-reduction-initializer-rhs-call.f90 b/flang/test/Lower/OpenMP/declare-reduction-initializer-rhs-call.f90 new file mode 100644 index 0000000000000..7d409b27464e2 --- /dev/null +++ b/flang/test/Lower/OpenMP/declare-reduction-initializer-rhs-call.f90 @@ -0,0 +1,52 @@ +! Test that a function call on the RHS of a component-level initializer in +! declare reduction is correctly lowered through the assignment path (not +! the ProcedureRef/subroutine path). Verifies that the init region contains +! a call to the function, followed by a component designate and assign. +! +! This is a regression test for https://github.com/llvm/llvm-project/issues/184927 + +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +module m + implicit none + type :: t + integer :: member + end type t +contains + function init_val() result(res) + integer :: res + res = 42 + end function +end module + +subroutine test_rhs_call() + use m + implicit none + integer :: i + type(t) :: x + + !$omp declare reduction(add_t : t : omp_out%member = omp_out%member + omp_in%member) & + !$omp& initializer(omp_priv%member = init_val()) + + x%member = 0 + !$omp parallel do reduction(add_t : x) num_threads(2) + do i = 1, 2 + x%member = x%member + 1 + end do + !$omp end parallel do +end subroutine + +!CHECK: omp.declare_reduction @add_t : +!CHECK-SAME: alloc { +!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QMmTt{member:i32}> +!CHECK: omp.yield(%[[ALLOCA]] : +!CHECK: } init { +!CHECK: ^bb0(%[[INIT_ARG0:.*]]: !fir.ref>, +!CHECK-SAME: %[[INIT_ARG1:.*]]: !fir.ref>): +!CHECK: %[[OMP_ORIG:.*]]:2 = hlfir.declare %[[INIT_ARG0]] {uniq_name = "omp_orig"} +!CHECK: %[[OMP_PRIV:.*]]:2 = hlfir.declare 
%[[INIT_ARG1]] {uniq_name = "omp_priv"} +!CHECK: %[[CALL:.*]] = fir.call @_QMmPinit_val() {{.*}} : () -> i32 +!CHECK: %[[MEMBER:.*]] = hlfir.designate %[[OMP_PRIV]]#0{"member"} : (!fir.ref>) -> !fir.ref +!CHECK: hlfir.assign %[[CALL]] to %[[MEMBER]] : i32, !fir.ref +!CHECK: omp.yield(%[[INIT_ARG1]] : !fir.ref>) +!CHECK: } combiner { From 0e4477b677f04d4970a413be954f6b6792b560df Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Fri, 8 May 2026 14:30:56 -0400 Subject: [PATCH 082/538] [compiler-rt][profile][NFC] Introduce INSTR_PROF_INSTRUMENT_GPU_FUNC macro (#196538) Add a macro INSTR_PROF_INSTRUMENT_GPU_FUNC for the name of the GPU profiling function __llvm_profile_instrument_gpu (added in #187136), following the same pattern as INSTR_PROF_VALUE_PROF_MEMOP_FUNC. Use the macro in both the declaration in InstrProfiling.h and the definition in InstrProfilingPlatformGPU.c. This prepares the upcoming HIP/AMDGPU offload PGO patch (#177665) to use the same macro when calling this function. 
--- compiler-rt/include/profile/InstrProfData.inc | 3 +++ compiler-rt/lib/profile/InstrProfiling.h | 4 ++-- compiler-rt/lib/profile/InstrProfilingPlatformGPU.c | 6 +++--- llvm/include/llvm/ProfileData/InstrProfData.inc | 3 +++ 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 7525feab8f133..3117cb2fddf36 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -893,6 +893,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_VALUE_PROF_MEMOP_FUNC __llvm_profile_instrument_memop #define INSTR_PROF_VALUE_PROF_MEMOP_FUNC_STR \ INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_MEMOP_FUNC) +#define INSTR_PROF_INSTRUMENT_GPU_FUNC __llvm_profile_instrument_gpu +#define INSTR_PROF_INSTRUMENT_GPU_FUNC_STR \ + INSTR_PROF_QUOTE(INSTR_PROF_INSTRUMENT_GPU_FUNC) /* InstrProfile per-function control data alignment. */ #define INSTR_PROF_DATA_ALIGNMENT 8 diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 1d22934bd6ef1..6fc26b59d3cdc 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -178,8 +178,8 @@ void __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data, * perform the counter update. \c Uniform is an optional counter tracking the * number of uniform. */ -void __llvm_profile_instrument_gpu(uint64_t *Counter, uint64_t *Uniform, - uint64_t Step); +void INSTR_PROF_INSTRUMENT_GPU_FUNC(uint64_t *Counter, uint64_t *Uniform, + uint64_t Step); /*! * \brief Write instrumentation data to the current file. 
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c index ab7031343c855..3c67e2c7089d4 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c @@ -29,9 +29,9 @@ static int is_uniform(uint64_t mask) { // Wave-cooperative counter increment. The instrumentation pass emits calls to // this in place of the default non-atomic load/add/store or atomicrmw sequence. // The optional uniform counter allows calculating wave uniformity if present. -COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter, - uint64_t *uniform, - uint64_t step) { +COMPILER_RT_VISIBILITY void INSTR_PROF_INSTRUMENT_GPU_FUNC(uint64_t *counter, + uint64_t *uniform, + uint64_t step) { uint64_t mask = __gpu_lane_mask(); if (__gpu_is_first_in_lane(mask)) { __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask), diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 7525feab8f133..3117cb2fddf36 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -893,6 +893,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_VALUE_PROF_MEMOP_FUNC __llvm_profile_instrument_memop #define INSTR_PROF_VALUE_PROF_MEMOP_FUNC_STR \ INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_MEMOP_FUNC) +#define INSTR_PROF_INSTRUMENT_GPU_FUNC __llvm_profile_instrument_gpu +#define INSTR_PROF_INSTRUMENT_GPU_FUNC_STR \ + INSTR_PROF_QUOTE(INSTR_PROF_INSTRUMENT_GPU_FUNC) /* InstrProfile per-function control data alignment. 
*/ #define INSTR_PROF_DATA_ALIGNMENT 8 From f10f4e853f319dc5a3b835aa7631d3bc155b260a Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 8 May 2026 20:55:58 +0200 Subject: [PATCH 083/538] [AMDGPU] Add subtarget features for MAD NC and 64-bit MIN/MAX instructions (#196326) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++++++++++ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25fc64d178858..f12c404c035ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1317,10 +1317,18 @@ defm MadU32Inst : AMDGPUSubtargetFeature<"mad-u32-inst", "Has v_mad_u32 instruction" >; +defm MadNC64_32Insts : AMDGPUSubtargetFeature<"mad-nc-64-32-insts", + "Has v_mad_nc_{u64_u32|i64_i32} instructions" +>; + defm AddMinMaxInsts : AMDGPUSubtargetFeature<"add-min-max-insts", "Has v_add_{min|max}_{i|u}32 instructions" >; +defm MinMaxI64Insts : AMDGPUSubtargetFeature<"min-max-i64-insts", + "Has v_{min|max}_{i|u}64 instructions" +>; + defm PkAddMinMaxInsts : AMDGPUSubtargetFeature<"pk-add-min-max-insts", "Has v_pk_add_{min|max}_{i|u}16 instructions" >; @@ -2097,7 +2105,9 @@ def FeatureISAVersion12_50_Common : FeatureSet< FeatureLshlAddU64Inst, FeatureAddSubU64Insts, FeatureMadU32Inst, + FeatureMadNC64_32Insts, FeatureAddMinMaxInsts, + FeatureMinMaxI64Insts, FeaturePkAddMinMaxInsts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 41e76063386ec..fc772ffeb1141 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -191,7 +191,7 @@ let SubtargetPredicate = HasLerpInst in let SchedRW = [WriteIntMul] in { let SubtargetPredicate = HasMadU32Inst in defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; - let SubtargetPredicate = isGFX1250Plus in { + let 
SubtargetPredicate = HasMadNC64_32Insts in { defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; } @@ -232,12 +232,12 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile, f } // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 -let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { +let SubtargetPredicate = HasMinMaxI64Insts, SchedRW = [WriteDoubleAdd] in { defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>; defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>; defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>; defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>; -} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] +} // End SubtargetPredicate = HasMinMaxI64Insts, SchedRW = [WriteDoubleAdd] } // End isReMaterializable = 1 From f80aa050ba4dbe81eeadac5ec0ddd3653b2c2b90 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 8 May 2026 21:03:22 +0200 Subject: [PATCH 084/538] [InstCombine][NFC] Replace buildAssumeFromKnowledge with CreateAlignmentAssumption (#196254) --- llvm/include/llvm/IR/IRBuilder.h | 2 +- .../Transforms/Utils/AssumeBundleBuilder.h | 7 ---- llvm/lib/IR/IRBuilder.cpp | 2 +- .../InstCombine/InstCombineCalls.cpp | 24 ++--------- .../Transforms/Utils/AssumeBundleBuilder.cpp | 10 ----- llvm/test/Transforms/InstCombine/assume.ll | 40 +++++++++++++++++++ 6 files changed, 45 insertions(+), 40 deletions(-) diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index e28a9027740a3..ca085bb4aaa11 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2807,7 +2807,7 @@ class IRBuilderBase { /// specified alignment. 
LLVM_ABI CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, - unsigned Alignment, + uint64_t Alignment, Value *OffsetValue = nullptr); /// Create an assume intrinsic call that represents an alignment diff --git a/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h b/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h index 2f45a1c5f5b67..1c0c318c8b0f2 100644 --- a/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h +++ b/llvm/include/llvm/Transforms/Utils/AssumeBundleBuilder.h @@ -47,13 +47,6 @@ LLVM_ABI AssumeInst *buildAssumeFromInst(Instruction *I); LLVM_ABI bool salvageKnowledge(Instruction *I, AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr); -/// Build and return a new assume created from the provided knowledge -/// if the knowledge in the assume is fully redundant this will return nullptr -LLVM_ABI AssumeInst * -buildAssumeFromKnowledge(ArrayRef Knowledge, - Instruction *CtxI, AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); - /// This pass attempts to minimize the number of assume without loosing any /// information. 
struct AssumeSimplifyPass : public OptionalPassInfoMixin { diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 09945044c79a1..706a977a5b6d5 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1363,7 +1363,7 @@ CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL, CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, - unsigned Alignment, + uint64_t Alignment, Value *OffsetValue) { assert(isa(PtrValue->getType()) && "trying to create an alignment assumption on a non-pointer?"); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index b4e2ebea4d196..a622e0248fce8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3607,17 +3607,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { SmallVector OpBundles; II->getOperandBundlesAsDefs(OpBundles); - /// This will remove the boolean Condition from the assume given as - /// argument and remove the assume if it becomes useless. - /// always returns nullptr for use as a return values. - auto RemoveConditionFromAssume = [&](Instruction *Assume) -> Instruction * { - assert(isa(Assume)); - if (isAssumeWithEmptyBundle(*cast(II))) - return eraseInstFromFunction(CI); - replaceUse(II->getOperandUse(0), ConstantInt::getTrue(II->getContext())); - return nullptr; - }; - // Canonicalize assume(a && b) -> assume(a); assume(b); // Note: New assumption intrinsics created here are registered by // the InstCombineIRInserter object. @@ -3706,7 +3695,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } - Instruction *Next = II->getNextNode(); // Convert nonnull assume like: // %A = icmp ne i32* %PTR, null // call void @llvm.assume(i1 %A) @@ -3740,15 +3728,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { /// offset and alignment. 
/// TODO: we can generate a GEP instead of merging the alignment with /// the offset. - RetainedKnowledge RK{Attribute::Alignment, - MinAlign(Offset, AlignMask + 1), A}; - if (auto *Replacement = - buildAssumeFromKnowledge(RK, Next, &AC, &DT)) { - - Replacement->insertAfter(II->getIterator()); - AC.registerAssumption(Replacement); - } - return RemoveConditionFromAssume(II); + Builder.CreateAlignmentAssumption(getDataLayout(), A, + MinAlign(Offset, AlignMask + 1)); + return eraseInstFromFunction(*II); } } } diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 8df3be85c9535..26bca70e1056f 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -311,16 +311,6 @@ bool llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC, return Changed; } -AssumeInst * -llvm::buildAssumeFromKnowledge(ArrayRef Knowledge, - Instruction *CtxI, AssumptionCache *AC, - DominatorTree *DT) { - AssumeBuilderState Builder(CtxI->getModule(), CtxI, AC, DT); - for (const RetainedKnowledge &RK : Knowledge) - Builder.addKnowledge(RK); - return Builder.build(); -} - RetainedKnowledge llvm::simplifyRetainedKnowledge(AssumeInst *Assume, RetainedKnowledge RK, AssumptionCache *AC, diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 40fe2f5f46f1e..c39f606382650 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -8,6 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" declare void @llvm.assume(i1) #1 declare ptr @get_ptr() +declare void @use_i64(i64) ; Check that the assume has not been removed: @@ -81,6 +82,45 @@ entry: ret void } +define void @align_with_offset_less_than_align(ptr %ptr) { +; CHECK-LABEL: @align_with_offset_less_than_align( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INT:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add i64 
[[INT]], 3 +; CHECK-NEXT: [[AND:%.*]] = and i64 [[ADD]], 7 +; CHECK-NEXT: call void @use_i64(i64 [[AND]]) +; CHECK-NEXT: ret void +; +entry: + %int = ptrtoint ptr %ptr to i64 + %add = add i64 %int, 3 + %and = and i64 %add, 7 + %cmp = icmp eq i64 0, %and + call void @llvm.assume(i1 %cmp) + call void @use_i64(i64 %and) + ret void +} + +define void @align_with_offset_greater_than_align(ptr %ptr) { +; CHECK-LABEL: @align_with_offset_greater_than_align( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INT:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[INT]], 6 +; CHECK-NEXT: [[AND:%.*]] = and i64 [[ADD]], 6 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[PTR]], i64 2) ] +; CHECK-NEXT: call void @use_i64(i64 [[AND]]) +; CHECK-NEXT: ret void +; +entry: + %int = ptrtoint ptr %ptr to i64 + %add = add i64 %int, 14 + %and = and i64 %add, 7 + %cmp = icmp eq i64 0, %and + call void @llvm.assume(i1 %cmp) + call void @use_i64(i64 %and) + ret void +} + define void @redundant_align() { ; CHECK-LABEL: @redundant_align( ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_ptr() From 4d90a0f84b4ed64ecce7d2c71055607f236ce247 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 12:03:34 -0700 Subject: [PATCH 085/538] [DWARFLinker] Emit .debug_names entries for DW_TAG_template_alias (#196440) The tag was missing from the accelerator-records saver's switch, so template alias DIEs were skipped and --verify-dwarf=output rejected the result. 
--- llvm/lib/DWARFLinker/Parallel/AcceleratorRecordsSaver.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/DWARFLinker/Parallel/AcceleratorRecordsSaver.cpp b/llvm/lib/DWARFLinker/Parallel/AcceleratorRecordsSaver.cpp index 745ed13f86c7c..d17dde285b773 100644 --- a/llvm/lib/DWARFLinker/Parallel/AcceleratorRecordsSaver.cpp +++ b/llvm/lib/DWARFLinker/Parallel/AcceleratorRecordsSaver.cpp @@ -82,6 +82,7 @@ void AcceleratorRecordsSaver::save(const DWARFDebugInfoEntry *InputDieEntry, case dwarf::DW_TAG_string_type: case dwarf::DW_TAG_structure_type: case dwarf::DW_TAG_subroutine_type: + case dwarf::DW_TAG_template_alias: case dwarf::DW_TAG_typedef: case dwarf::DW_TAG_union_type: case dwarf::DW_TAG_ptr_to_member_type: From f6c8904ee142dda4e57da590b1557eb1ccc0d28a Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Fri, 8 May 2026 12:06:56 -0700 Subject: [PATCH 086/538] [lldb] Fix TestPtrauthBRKc47xX16Invalid.py (#196408) LLDB correctly detects the pointer authentication failure. --- .../ptrauth_diagnostics/brkC47x_x16_invalid/brkC47x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/ptrauth_diagnostics/brkC47x_x16_invalid/brkC47x.c b/lldb/test/API/functionalities/ptrauth_diagnostics/brkC47x_x16_invalid/brkC47x.c index 7119277351585..044466ead300b 100644 --- a/lldb/test/API/functionalities/ptrauth_diagnostics/brkC47x_x16_invalid/brkC47x.c +++ b/lldb/test/API/functionalities/ptrauth_diagnostics/brkC47x_x16_invalid/brkC47x.c @@ -1,7 +1,7 @@ int main() { //% self.filecheck("c", "brkC47x.c") // CHECK: stop reason = EXC_BAD_ACCESS - // CHECK-NOT: Note: Possible pointer authentication failure detected. + // CHECK-NEXT: Note: Possible pointer authentication failure detected. 
asm volatile ( "mov x16, #0xbad \n" "brk 0xc470 \n" From 328ddd90ae4ee017dc920629d574fec479dadd71 Mon Sep 17 00:00:00 2001 From: Nerixyz Date: Fri, 8 May 2026 21:14:37 +0200 Subject: [PATCH 087/538] [lldb] Remove `__iter/len__` from `SBTypeEnumMember` (#196610) `SBTypeEnumMember` doesn't have a `GetSize` and `GetTypeEnumMemberAtIndex`, so having `__iter__` and `__len__` doesn't make sense. These are on `SBTypeEnumMemberList`. From the docstrings, it looks like the extensions were copied from said type. --- lldb/bindings/interface/SBTypeEnumMemberExtensions.i | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lldb/bindings/interface/SBTypeEnumMemberExtensions.i b/lldb/bindings/interface/SBTypeEnumMemberExtensions.i index 9f0e16afcfd8a..3b0154f40727f 100644 --- a/lldb/bindings/interface/SBTypeEnumMemberExtensions.i +++ b/lldb/bindings/interface/SBTypeEnumMemberExtensions.i @@ -2,14 +2,6 @@ STRING_EXTENSION_LEVEL_OUTSIDE(SBTypeEnumMember, lldb::eDescriptionLevelBrief) %extend lldb::SBTypeEnumMember { #ifdef SWIGPYTHON %pythoncode %{ - def __iter__(self): - '''Iterate over all members in a lldb.SBTypeEnumMemberList object.''' - return lldb_iter(self, 'GetSize', 'GetTypeEnumMemberAtIndex') - - def __len__(self): - '''Return the number of members in a lldb.SBTypeEnumMemberList object.''' - return self.GetSize() - name = property(GetName, None, doc='''A read only property that returns the name for this enum member as a string.''') type = property(GetType, None, doc='''A read only property that returns an lldb object that represents the type (lldb.SBType) for this enum member.''') signed = property(GetValueAsSigned, None, doc='''A read only property that returns the value of this enum member as a signed integer.''') From eefbaf0431b2aca6444aca80837816c96003ac3a Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Fri, 8 May 2026 21:19:34 +0200 Subject: [PATCH 088/538] [CIR] Implement CoawaitExpr for ComplexType (#194027) Implement CoawaitExpr support for ComplexType Issue 
https://github.com/llvm/llvm-project/issues/192331 --- clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp | 20 +++++-- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 3 +- clang/test/CIR/CodeGen/coro-task.cpp | 59 +++++++++++++++++++++ 3 files changed, 76 insertions(+), 6 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp index 12408b7c59458..870015c844a6d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp @@ -558,12 +558,22 @@ emitSuspendExpression(CIRGenFunction &cgf, CGCoroData &coro, if (!awaitRes.rv.isIgnored()) { // Create the alloca in the block before the scope wrapping // cir.await. + mlir::Value value; + RValue rv = awaitRes.rv; + if (rv.isScalar()) { + value = rv.getValue(); + } else if (rv.isComplex()) { + value = rv.getComplexValue(); + } else { + cgf.cgm.errorNYI("emitSuspendExpression: Aggregate value"); + return; + } + tmpResumeRValAddr = cgf.emitAlloca( - "__coawait_resume_rval", awaitRes.rv.getValue().getType(), loc, - CharUnits::One(), + "__coawait_resume_rval", value.getType(), loc, CharUnits::One(), builder.getBestAllocaInsertPoint(scopeParentBlock)); // Store the rvalue so we can reload it before the promise call. - builder.CIRBaseBuilderTy::createStore(loc, awaitRes.rv.getValue(), + builder.CIRBaseBuilderTy::createStore(loc, value, tmpResumeRValAddr); } } @@ -614,7 +624,9 @@ static RValue emitSuspendExpr(CIRGenFunction &cgf, // once we have a testcase and prove all pieces work. 
cgf.cgm.errorNYI("emitSuspendExpr Aggregate"); } else { // complex - cgf.cgm.errorNYI("emitSuspendExpr Complex"); + rval = RValue::getComplex(cir::LoadOp::create( + cgf.getBuilder(), scopeLoc, rval.getComplexValue().getType(), + tmpResumeRValAddr)); } return rval; } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index b359c2bd719e4..6a26b2c987f3e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -86,8 +86,7 @@ class ComplexExprEmitter : public StmtVisitor { return Visit(pe->getReplacement()); } mlir::Value VisitCoawaitExpr(CoawaitExpr *s) { - cgf.cgm.errorNYI(s->getExprLoc(), "ComplexExprEmitter VisitCoawaitExpr"); - return {}; + return cgf.emitCoawaitExpr(*s).getComplexValue(); } mlir::Value VisitCoyieldExpr(CoyieldExpr *s) { cgf.cgm.errorNYI(s->getExprLoc(), "ComplexExprEmitter VisitCoyieldExpr"); diff --git a/clang/test/CIR/CodeGen/coro-task.cpp b/clang/test/CIR/CodeGen/coro-task.cpp index 2874009157f88..46d7cde18b047 100644 --- a/clang/test/CIR/CodeGen/coro-task.cpp +++ b/clang/test/CIR/CodeGen/coro-task.cpp @@ -839,3 +839,62 @@ folly::coro::Task co_return_with_dtor(int flag) { // OGCG: cleanup4: // OGCG: call void @_ZN7HasDtorD1Ev({{.*}} %[[LOCAL]]) // OGCG: br label %coro.final + +folly::coro::Task fetchData() noexcept { + int __complex__ a; + co_return a; +} + +folly::coro::Task complex_co_await() noexcept { + co_await fetchData(); +} + +// CIR: cir.func coroutine {{.*}} @_Z16complex_co_awaitv + +// CIR: %[[COMPLEX_ADDR:.*]] = cir.alloca !rec_folly3A3Acoro3A3ATask3C_Complex_int3E, !cir.ptr, ["ref.tmp1"] +// CIR: %[[RESUME_VAL_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["__coawait_resume_rval"] + +// CIR: cir.cleanup.scope { +// CIR: cir.await(init, ready : { +// CIR: }, suspend : { +// CIR: }, resume : { +// CIR: },) + +// CIR: cir.coro.body { +// CIR: %[[CALL:.*]] = cir.call @_Z9fetchDatav() nothrow : () -> 
!rec_folly3A3Acoro3A3ATask3C_Complex_int3E +// CIR: cir.store {{.*}} %[[CALL]], %[[COMPLEX_ADDR]] : !rec_folly3A3Acoro3A3ATask3C_Complex_int3E, !cir.ptr + +// CIR: cir.await(user, ready : { +// CIR: }, suspend : { +// CIR: }, resume : { +// CIR: %[[RESUME_VAL:.*]] = cir.call @_ZN5folly4coro4TaskICiE12await_resumeEv(%[[COMPLEX_ADDR]]) : (!cir.ptr {llvm.align = 1 : i64, llvm.dereferenceable = 1 : i64, llvm.nonnull, llvm.noundef}) -> (!cir.complex {llvm.noundef}) +// CIR: cir.store %[[RESUME_VAL]], %[[RESUME_VAL_ADDR]] : !cir.complex, !cir.ptr> +// CIR: },) +// CIR: %[[V:.*]] = cir.load %[[RESUME_VAL_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.yield +// CIR: } + +// CIR: } cleanup normal { +// CIR: } + +// OGCG: define dso_local void @_Z16complex_co_awaitv() + +// OGCG: %[[RESUME_VAL_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[RESUME_REAL_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[RESUME_IMAG_ADDR:.*]] = alloca i32, align 4 + +// OGCG: coro.init: +// OGCG: await.ready: +// OGCG: %[[RESUME_VAL:.*]] = call noundef i64 @_ZN5folly4coro4TaskICiE12await_resumeEv(ptr noundef nonnull align 1 dereferenceable(1) %{{.*}}) +// OGCG: store i64 %[[RESUME_VAL]], ptr %[[RESUME_VAL_ADDR]], align 4 +// OGCG: %[[RESUME_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESUME_VAL_ADDR]], i32 0, i32 0 +// OGCG: %[[RESUME_REAL:.*]] = load i32, ptr %[[RESUME_REAL_PTR]], align 4 +// OGCG: store i32 %[[RESUME_REAL]], ptr %[[RESUME_REAL_ADDR]], align 4 +// OGCG: %[[RESUME_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESUME_VAL_ADDR]], i32 0, i32 1 +// OGCG: %[[RESUME_IMAG:.*]] = load i32, ptr %[[RESUME_IMAG_PTR]], align 4 +// OGCG: store i32 %[[RESUME_IMAG]], ptr %[[RESUME_IMAG_ADDR]], align 4 +// OGCG: br label %[[CLEANUP_FROM_AWAIT_READY:.*]] + +// OGCG: [[CLEANUP_CONT:.*]]: +// OGCG: %[[RESUME_REAL:.*]] = load i32, ptr %[[RESUME_REAL_ADDR]], align 4 +// OGCG: %[[RESUME_IMAG:.*]] = load i32, ptr %[[RESUME_IMAG_ADDR]], align 4 From 
c8408b35f9b6d6af3095a3134a76347f2981f154 Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Fri, 8 May 2026 12:31:35 -0700 Subject: [PATCH 089/538] [cir] fix IR dump comments from #195198 (#196605) --- clang/test/CIR/CodeGen/delete-array-throwing-dtor.cpp | 4 ++-- clang/test/CIR/Transforms/idiom-recognizer.cpp | 2 +- clang/test/CIR/mlprint.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/test/CIR/CodeGen/delete-array-throwing-dtor.cpp b/clang/test/CIR/CodeGen/delete-array-throwing-dtor.cpp index c39227f3432a7..bcb572f7d932e 100644 --- a/clang/test/CIR/CodeGen/delete-array-throwing-dtor.cpp +++ b/clang/test/CIR/CodeGen/delete-array-throwing-dtor.cpp @@ -16,7 +16,7 @@ void test_delete_array_throwing_dtor(ThrowingDtor *ptr) { delete[] ptr; } -// CIR-BEFORE-CXXABI: IR Dump Before CXXABILowering (cir-cxxabi-lowering) +// CIR-BEFORE-CXXABI: IR Dump Before CXXABILowering: cir-cxxabi-lowering // CIR-BEFORE-CXXABI: cir.func {{.*}} @_Z31test_delete_array_throwing_dtorP12ThrowingDtor // CIR-BEFORE-CXXABI: %[[PTR:.*]] = cir.load @@ -26,7 +26,7 @@ void test_delete_array_throwing_dtor(ThrowingDtor *ptr) { // CIR-BEFORE-CXXABI: cir.delete_array %[[PTR]] : !cir.ptr dtor_may_throw {delete_fn = @_ZdaPvm, delete_params = #cir.usual_delete_params, element_dtor = @_ZN12ThrowingDtorD1Ev} // CIR-BEFORE-CXXABI: } -// CIR-AFTER-CXXABI: IR Dump After CXXABILowering (cir-cxxabi-lowering) +// CIR-AFTER-CXXABI: IR Dump After CXXABILowering: cir-cxxabi-lowering // CIR-AFTER-CXXABI: cir.func {{.*}} @_Z31test_delete_array_throwing_dtorP12ThrowingDtor // CIR-AFTER-CXXABI: %[[PTR:.*]] = cir.load diff --git a/clang/test/CIR/Transforms/idiom-recognizer.cpp b/clang/test/CIR/Transforms/idiom-recognizer.cpp index 5ca243edef66e..28a2b502eb18a 100644 --- a/clang/test/CIR/Transforms/idiom-recognizer.cpp +++ b/clang/test/CIR/Transforms/idiom-recognizer.cpp @@ -1,2 +1,2 @@ // RUN: %clang_cc1 -fclangir -emit-cir -mmlir --mlir-print-ir-after-all -clangir-enable-idiom-recognizer %s -o 
%t.cir 2>&1 | FileCheck %s -check-prefix=CIR -// CIR: IR Dump After IdiomRecognizer (cir-idiom-recognizer) +// CIR: IR Dump After IdiomRecognizer: cir-idiom-recognizer diff --git a/clang/test/CIR/mlprint.c b/clang/test/CIR/mlprint.c index 1630bc1e3ce9b..6ca5b5b06ddda 100644 --- a/clang/test/CIR/mlprint.c +++ b/clang/test/CIR/mlprint.c @@ -6,9 +6,9 @@ int foo(void) { return i; } -// CIR: IR Dump After CIRCanonicalize (cir-canonicalize) +// CIR: IR Dump After CIRCanonicalize: cir-canonicalize // CIR: cir.func{{.*}} @foo() -> !s32i -// LLVM: IR Dump After cir::direct::ConvertCIRToLLVMPass (cir-flat-to-llvm) +// LLVM: IR Dump After cir::direct::ConvertCIRToLLVMPass: cir-flat-to-llvm // LLVM: llvm.func @foo() -> i32 // LLVM: IR Dump After // LLVM: define{{.*}} i32 @foo() From b84f58ee844ca929db2fff2e41e2195e255548b8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 8 May 2026 21:36:42 +0200 Subject: [PATCH 090/538] [VPlan] Unify inner and outer loop paths (NFCI). (#192868) Move combine the logic of tryToBuildVPlanWithVPRecipes and tryToBuildVPlan, as well as planInVPlanNativePath and plan. This unifies the code paths to construct plans for both inner and outer loop vectorization, and removes some duplication. It also ensures we run almost the same VPlan-transformations in both modes. Currently a few code paths need to be guarded with a check if we are dealing with an inner and outer loop. 
PR: https://github.com/llvm/llvm-project/pull/192868 --- .../llvm/Transforms/Vectorize/LoopVectorize.h | 9 +- .../Vectorize/LoopVectorizationPlanner.cpp | 48 +++ .../Vectorize/LoopVectorizationPlanner.h | 42 +-- .../Transforms/Vectorize/LoopVectorize.cpp | 356 ++++++------------ llvm/lib/Transforms/Vectorize/VPlan.cpp | 29 +- llvm/lib/Transforms/Vectorize/VPlan.h | 4 + .../Transforms/Vectorize/VPlanPredicator.cpp | 3 + .../VPlan/vplan-stress-test-no-explict-vf.ll | 2 +- .../VPlan/vplan_hcfg_stress_test.ll | 2 +- .../LoopVectorize/explicit_outer_detection.ll | 2 +- 10 files changed, 196 insertions(+), 301 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h index 18906aa7eeae3..0d45c159d315c 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -169,9 +169,12 @@ struct LoopVectorizePass : public OptionalPassInfoMixin { /// purposes along with the corresponding optimization remark \p RemarkName. /// If \p I is passed, it is an instruction that prevents vectorization. /// Otherwise, the loop \p TheLoop is used for the location of the remark. 
-LLVM_ABI void reportVectorizationFailure( - const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); +LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, + const StringRef ORETag, + OptimizationRemarkEmitter *ORE, + const Loop *TheLoop, + Instruction *I = nullptr); /// Same as above, but the debug message and optimization remark are identical inline void reportVectorizationFailure(const StringRef DebugMsg, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp index 91476cf232fe0..f29834d2f804e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp @@ -27,6 +27,8 @@ using namespace llvm; #define DEBUG_TYPE "loop-vectorize" +extern cl::opt VPlanBuildOuterloopStressTest; + static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -618,3 +620,49 @@ void VFSelectionContext::collectInLoopReductions() { << " reduction for phi: " << *Phi << "\n"); } } + +// TODO: we could return a pair of values that specify the max VF and +// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of +// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment +// doesn't have a cost model that can choose which plan to execute if +// more than one is generated. 
+FixedScalableVFPair +VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) { + if (UserVF.isScalable() && !supportsScalableVectors()) { + reportVectorizationFailure( + "Scalable vectorization requested but not supported by the target", + "the scalable user-specified vectorization width for outer-loop " + "vectorization cannot be used because the target does not support " + "scalable vectors.", + "ScalableVFUnfeasible", ORE, TheLoop); + return FixedScalableVFPair::getNone(); + } + + ElementCount VF = UserVF; + if (VF.isZero()) { + auto [_, WidestType] = getSmallestAndWidestTypes(); + + auto RegKind = TTI.enableScalableVectorization() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; + + TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); + unsigned N = RegSize.getKnownMinValue() / WidestType; + VF = ElementCount::get(N, RegSize.isScalable()); + LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); + + // Make sure we have a VF > 1 for stress testing. + if (VPlanBuildOuterloopStressTest && VF.isScalar()) { + LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " + << "overriding computed VF.\n"); + VF = ElementCount::getFixed(4); + } + } + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); + if (VF.isScalar()) + return FixedScalableVFPair::getNone(); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); + return FixedScalableVFPair(VF); +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a6789974e0bd6..00b689326d770 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -703,6 +703,10 @@ class VFSelectionContext { /// for size, returning true here aborts vectorization. 
bool runtimeChecksRequired(); + /// Returns a scalable VF to use for outer-loop vectorization if the target + /// supports it and a fixed VF otherwise. + FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF); + /// Compute smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -789,10 +793,6 @@ class LoopVectorizationPlanner { /// interleaving should be avoided up-front, no plans are generated. void plan(ElementCount UserVF, unsigned UserIC); - /// Use the VPlan-native path to plan how to best vectorize, return the best - /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); - /// Return the VPlan for \p VF. At the moment, there is always a single VPlan /// for each VF. VPlan &getPlanFor(ElementCount VF) const; @@ -881,33 +881,21 @@ class LoopVectorizationPlanner { unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll); -protected: - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); - private: - /// Build a VPlan according to the information gathered by Legal. \return a - /// VPlan for vectorization factors \p Range.Start and up to \p Range.End - /// exclusive, possibly decreasing \p Range.End. If no VPlan can be built for - /// the input range, set the largest included VF to the maximum VF for which - /// no plan could be built. - VPlanPtr tryToBuildVPlan(VFRange &Range); - - /// Build a VPlan using VPRecipes according to the information gather by - /// Legal. This method is only used for the legacy inner loop vectorizer. - /// \p Range's largest included VF is restricted to the maximum VF the - /// returned VPlan is valid for. 
If no VPlan can be built for the input range, - /// set the largest included VF to the maximum VF for which no plan could be - /// built. Each VPlan is built starting from a copy of \p InitialPlan, which - /// is a plain CFG VPlan wrapping the original scalar loop. - VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range); + /// Build a VPlan using VPRecipes according to the information gathered by + /// Legal and VPlan-based analysis. For outer loops, performs basic recipe + /// conversion only. For inner loops, \p Range's largest included VF is + /// restricted to the maximum VF the returned VPlan is valid for. If no VPlan + /// can be built for the input range, set the largest included VF to the + /// maximum VF for which no plan could be built. Each VPlan is built starting + /// from a copy of \p InitialPlan, which is a plain CFG VPlan wrapping the + /// original scalar loop. + VPlanPtr tryToBuildVPlan(VPlanPtr InitialPlan, VFRange &Range); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + /// legal to vectorize the loop. + void buildVPlans(ElementCount MinVF, ElementCount MaxVF); /// Add ComputeReductionResult recipes to the middle block to compute the /// final reduction results. Add Select recipes to the latch block when diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ae1d6d83cccd4..1ace2275e2b6d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -350,8 +350,8 @@ cl::opt llvm::VPlanPrintVectorRegionScope( // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. 
-vplan-verify-hcfg can also be used to enable the // verification of the H-CFGs built. -static cl::opt VPlanBuildStressTest( - "vplan-build-stress-test", cl::init(false), cl::Hidden, +cl::opt VPlanBuildOuterloopStressTest( + "vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden, cl::desc( "Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " @@ -745,8 +745,8 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I) { + OptimizationRemarkEmitter *ORE, + const Loop *TheLoop, Instruction *I) { LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE); ORE->emit( @@ -1877,7 +1877,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. - if (L.isInnermost() || VPlanBuildStressTest || + if (L.isInnermost() || VPlanBuildOuterloopStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); @@ -2868,6 +2868,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { + // For outer loops, use simple type-based heuristic VF. No cost model or + // memory dependence analysis is available. + if (!TheLoop->isInnermost()) { + return Config.computeVPlanOuterloopVF(UserVF); + } + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may be useful to do since it's still likely to be dynamically // uniform if the target can skip. 
@@ -5666,83 +5672,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } -// This function will select a scalable VF if the target supports scalable -// vectors and a fixed one otherwise. -// TODO: we could return a pair of values that specify the max VF and -// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of -// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment -// doesn't have a cost model that can choose which plan to execute if -// more than one is generated. -static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, - VFSelectionContext &Config) { - unsigned WidestType = Config.getSmallestAndWidestTypes().second; - - TargetTransformInfo::RegisterKind RegKind = - TTI.enableScalableVectorization() - ? TargetTransformInfo::RGK_ScalableVector - : TargetTransformInfo::RGK_FixedWidthVector; - - TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); - unsigned N = RegSize.getKnownMinValue() / WidestType; - return ElementCount::get(N, RegSize.isScalable()); -} - -VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - ElementCount VF = UserVF; - // Outer loop handling: They may require CFG and instruction level - // transformations before even evaluating whether vectorization is profitable. - // Since we cannot modify the incoming IR, we need to build VPlan upfront in - // the vectorization pipeline. - if (!OrigLoop->isInnermost()) { - // If the user doesn't provide a vectorization factor, determine a - // reasonable one. - if (UserVF.isZero()) { - VF = determineVPlanVF(TTI, Config); - LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); - - // Make sure we have a VF > 1 for stress testing. 
- if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { - LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " - << "overriding computed VF.\n"); - VF = ElementCount::getFixed(4); - } - } else if (UserVF.isScalable() && !Config.supportsScalableVectors()) { - LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " - << "not supported by the target.\n"); - reportVectorizationFailure( - "Scalable vectorization requested but not supported by the target", - "the scalable user-specified vectorization width for outer-loop " - "vectorization cannot be used because the target does not support " - "scalable vectors.", - "ScalableVFUnfeasible", ORE, OrigLoop); - return VectorizationFactor::Disabled(); - } - assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF.getKnownMinValue()) && - "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") - << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); - - if (VPlans.empty()) - return VectorizationFactor::Disabled(); - - // For VPlan build stress testing, we bail out after VPlan construction. - if (VPlanBuildStressTest) - return VectorizationFactor::Disabled(); - - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; - } - - LLVM_DEBUG( - dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " - "VPlan-native path.\n"); - return VectorizationFactor::Disabled(); -} - void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(OrigLoop->isInnermost() && "Inner loop expected."); CM.collectValuesToIgnore(); Config.collectElementTypesForWidening(&CM.ValuesToIgnore); @@ -5750,6 +5680,16 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return; + if (!OrigLoop->isInnermost()) { + // For outer loops, computeMaxVF returns a single non-scalar VF; build a + // plan for only that VF. 
+ ElementCount VF = + MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF; + buildVPlans(VF, VF); + LLVM_DEBUG(printPlans(dbgs())); + return; + } + // Compute the minimal bitwidths required for integer operations in the loop // for later use by the cost model. Config.computeMinimalBitwidths(); @@ -5790,9 +5730,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (EpilogueUserVF.isVector() && ElementCount::isKnownLT(EpilogueUserVF, UserVF)) { CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF); - buildVPlansWithVPRecipes(EpilogueUserVF, EpilogueUserVF); + buildVPlans(EpilogueUserVF, EpilogueUserVF); } - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlans(UserVF, UserVF); if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) { // For scalar VF, skip VPlan cost check as VPlan cost is designed for // vector VFs only. @@ -5824,8 +5764,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectNonVectorizedAndSetWideningDecisions(VF); } - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlans(ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlans(ElementCount::getScalable(1), MaxFactors.ScalableVF); LLVM_DEBUG(printPlans(dbgs())); } @@ -6043,22 +5983,25 @@ LoopVectorizationPlanner::computeBestVF() { return {VectorizationFactor::Disabled(), nullptr}; // If there is a single VPlan with a single VF, return it directly. 
VPlan &FirstPlan = *VPlans[0]; + ElementCount UserVF = Hints.getWidth(); - if (hasPlanWithVF(UserVF)) { - if (VPlans.size() == 1) { - assert(FirstPlan.getSingleVF() == UserVF && - "UserVF must match single VF"); - return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan}; - } - if (EpilogueVectorizationForceVF > 1) { - assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built"); - assert(VPlans[0]->getSingleVF() == - ElementCount::getFixed(EpilogueVectorizationForceVF) && - "expected first plan to be for the forced epilogue VF"); - assert(VPlans[1]->getSingleVF() == UserVF && - "expected second plan to be for the forced UserVF"); - return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()}; - } + if (VPlans.size() == 1) { + // For outer loops, the plan has a single vector VF determined by the + // heuristic. + assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) || + FirstPlan.isOuterLoop()) && + "must have a single scalar VF, UserVF or an outer loop"); + return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan}; + } + + if (hasPlanWithVF(UserVF) && EpilogueVectorizationForceVF > 1) { + assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built"); + assert(VPlans[0]->getSingleVF() == + ElementCount::getFixed(EpilogueVectorizationForceVF) && + "expected first plan to be for the forced epilogue VF"); + assert(VPlans[1]->getSingleVF() == UserVF && + "expected second plan to be for the forced UserVF"); + return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()}; } LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: " @@ -6804,30 +6747,38 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, // optimizations. 
static void printOptimizedVPlan(VPlan &) {} -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { if (ElementCount::isKnownGT(MinVF, MaxVF)) return; - assert(OrigLoop->isInnermost() && "Inner loop expected."); - - const LoopAccessInfo *LAI = Legal->getLAI(); - LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(), - OrigLoop, LI, DT, PSE.getSE()); - if (!LAI->getRuntimePointerChecking()->getChecks().empty() && - !LAI->getRuntimePointerChecking()->getDiffChecks()) { - // Only use noalias metadata when using memory checks guaranteeing no - // overlap across all iterations. - LVer.prepareNoAliasMetadata(); + bool IsInnerLoop = OrigLoop->isInnermost(); + + // Set up loop versioning for inner loops with memory runtime checks. + // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not + // called for them. + std::optional LVer; + if (IsInnerLoop) { + const LoopAccessInfo *LAI = Legal->getLAI(); + LVer.emplace(*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, + LI, DT, PSE.getSE()); + if (!LAI->getRuntimePointerChecking()->getChecks().empty() && + !LAI->getRuntimePointerChecking()->getDiffChecks()) { + // Only use noalias metadata when using memory checks guaranteeing no + // overlap across all iterations. + LVer->prepareNoAliasMetadata(); + } } // Create initial base VPlan0, to serve as common starting point for all // candidates built later for specific VF ranges. auto VPlan0 = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, + LVer ? &*LVer : nullptr); - // Create recipes for header phis. + // Create recipes for header phis. 
For outer loops, reductions, recurrences + // and in-loop reductions are empty since legality doesn't detect them. if (!RUN_VPLAN_PASS(VPlanTransforms::createHeaderPhiRecipes, *VPlan0, PSE, *OrigLoop, Legal->getInductionVars(), Legal->getReductionVars(), @@ -6862,8 +6813,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - auto Plan = tryToBuildVPlanWithVPRecipes( - std::unique_ptr(VPlan0->duplicate()), SubRange); + auto Plan = + tryToBuildVPlan(std::unique_ptr(VPlan0->duplicate()), SubRange); VF = SubRange.End; if (!Plan) @@ -6891,9 +6842,21 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } -VPlanPtr -LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VPlanPtr Plan, - VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan, + VFRange &Range) { + + // For outer loops, the plan only needs basic recipe conversion and induction + // live-out optimization; the full inner-loop recipe building below does not + // apply (no widening decisions, interleave groups, reductions, etc.). + if (Plan->isOuterLoop()) { + for (ElementCount VF : Range) + Plan->addVF(VF); + if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI)) + return nullptr; + VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE, + /*FoldTail=*/false); + return Plan; + } using namespace llvm::VPlanPatternMatch; SmallPtrSet *, 1> InterleaveGroups; @@ -7116,47 +7079,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VPlanPtr Plan, return Plan; } -VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { - // Outer loop handling: They may require CFG and instruction level - // transformations before even evaluating whether vectorization is profitable. 
- // Since we cannot modify the incoming IR, we need to build VPlan upfront in - // the vectorization pipeline. - assert(!OrigLoop->isInnermost()); - assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - - auto Plan = VPlanTransforms::buildVPlan0( - OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); - - if (!VPlanTransforms::createHeaderPhiRecipes( - *Plan, PSE, *OrigLoop, Legal->getInductionVars(), - MapVector(), - SmallPtrSet(), SmallPtrSet(), - /*AllowReordering=*/false)) - return nullptr; - [[maybe_unused]] bool CanHandleExits = VPlanTransforms::handleEarlyExits( - *Plan, UncountableExitStyle::NoUncountableExit, OrigLoop, PSE, *DT, - Legal->getAssumptionCache()); - assert(CanHandleExits && - "early-exits are not supported in VPlan-native path"); - VPlanTransforms::addMiddleCheck(*Plan, /*TailFolded*/ false); - - VPlanTransforms::createLoopRegions(*Plan); - - for (ElementCount VF : Range) - Plan->addVF(VF); - - if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI)) - return nullptr; - - // Optimize induction live-out users to use precomputed end values. - VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE, - /*FoldTail=*/false); - - assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); - return Plan; -} - void LoopVectorizationPlanner::addReductionResultComputation( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { using namespace VPlanPatternMatch; @@ -7357,7 +7279,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { // VPlan-native path does not do any analysis for runtime checks // currently. 
- assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && + assert((!EnableVPlanNativePath || !Plan.isOuterLoop()) && "Runtime checks are not supported for outer loops yet"); if (Config.OptForSize) { @@ -7438,75 +7360,6 @@ getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, return CM_EpilogueAllowed; } -// Process the loop in the VPlan-native vectorization path. This path builds -// VPlan upfront in the vectorization pipeline, which allows to apply -// VPlan-to-VPlan transformations from the very beginning without modifying the -// input LLVM IR. -static bool processLoopInVPlanNativePath( - Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, - LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, - TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - std::function GetBFI, bool OptForSize, - LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) { - - if (isa(PSE.getBackedgeTakenCount())) { - LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); - return false; - } - assert(EnableVPlanNativePath && "VPlan-native path is disabled."); - Function *F = L->getHeader()->getParent(); - InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - - EpilogueLowering SEL = - getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI); - - VFSelectionContext Config(*TTI, LVL, L, *F, PSE, DB, ORE, &Hints, OptForSize); - LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, AC, ORE, - GetBFI, F, &Hints, IAI, Config); - // Use the planner for outer loop vectorization. - // TODO: CM is not used at this point inside the planner. Turn CM into an - // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, Config, IAI, PSE, - Hints, ORE); - - // Get user vectorization factor. 
- ElementCount UserVF = Hints.getWidth(); - - Config.collectElementTypesForWidening(); - - // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); - - // If we are stress testing VPlan builds, do not attempt to generate vector - // code. Masked vector code generation support will follow soon. - // Also, do not attempt to vectorize if no vector code will be produced. - if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) - return false; - - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - - { - GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind); - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, - Checks, BestPlan); - LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName() - << "\"\n"); - LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1, - VF.MinProfitableTripCount); - bool HasBranchWeights = - hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights); - - reportVectorization(ORE, L, VF, 1); - - LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT); - } - - assert(!verifyFunction(*F, &dbgs())); - return true; -} - // Emit a remark if there are stores to floats that required a floating point // extension. If the vectorized loop was generated with floating point there // will be a performance penalty from the conversion overhead and the change in @@ -8176,6 +8029,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + bool IsInnerLoop = L->isInnermost(); + + // Outer loops require a computable trip count. 
+ if (!IsInnerLoop && isa(PSE.getBackedgeTakenCount())) { + LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); + return false; + } + if (LVL.hasUncountableEarlyExit()) { if (!EnableEarlyExitVectorization) { reportVectorizationFailure("Auto-vectorization of loops with uncountable " @@ -8185,24 +8046,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { } } - // Entrance to the VPlan-native vectorization path. Outer loops are processed - // here. They may require CFG and instruction level transformations before - // even evaluating whether vectorization is profitable. Since we cannot modify - // the incoming IR, we need to build VPlan upfront in the vectorization - // pipeline. - if (!L->isInnermost()) - return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, GetBFI, OptForSize, Hints, - Requirements); - - assert(L->isInnermost() && "Inner loop expected."); - InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); - bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + bool UseInterleaved = + IsInnerLoop && TTI->enableInterleavedAccessVectorization(); // If an override option has been passed in for interleaved accesses, use it. if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) - UseInterleaved = EnableInterleavedMemAccesses; + UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses; // Analyze interleaved memory accesses. if (UseInterleaved) @@ -8305,7 +8155,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth()) + // Outer loops don't have LoopAccessInfo, so skip the safety check and reset + // UserIC (interleaving is not supported for outer loops). + if (!IsInnerLoop) + UserIC = 0; + else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth()) UserIC = 1; // Plan how to best vectorize. 
@@ -8313,11 +8167,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { auto [VF, BestPlanPtr] = LVP.computeBestVF(); unsigned IC = 1; - if (ORE->allowExtraAnalysis(LV_NAME)) + // For VPlan build stress testing of outer loops, bail after plan + // construction. + if (!IsInnerLoop && VPlanBuildOuterloopStressTest) + return false; + + if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME)) LVP.emitInvalidCostRemarks(ORE); GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind); - if (LVP.hasPlanWithVF(VF.Width)) { + if (IsInnerLoop && LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = LVP.selectInterleaveCount(*BestPlanPtr, VF.Width, VF.Cost); @@ -8560,6 +8419,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { VF.MinProfitableTripCount); LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights); + if (!IsInnerLoop) + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName() + << "\"\n"); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); ++LoopsVectorized; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 77cc6484e9c6c..3c8f3362ae93a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1078,6 +1078,14 @@ const VPRegionBlock *VPlan::getVectorLoopRegion() const { return nullptr; } +bool VPlan::isOuterLoop() const { + const VPRegionBlock *LoopRegion = getVectorLoopRegion(); + assert(LoopRegion && "expected a vector loop region"); + return any_of(VPBlockUtils::blocksOnly( + vp_depth_first_shallow(LoopRegion->getEntry())), + [](const VPRegionBlock *R) { return !R->isReplicator(); }); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); @@ -1667,27 +1675,6 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( return PredicateAtRangeStart; } -/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, -/// 4 * \p MinVF, ..., \p MaxVF} by 
repeatedly building a VPlan for a sub-range -/// of VF's starting at a given VF and extending it as much as possible. Each -/// vectorization decision can potentially shorten this sub-range during -/// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { - auto MaxVFTimes2 = MaxVF * 2; - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { - VFRange SubRange = {VF, MaxVFTimes2}; - if (auto Plan = tryToBuildVPlan(SubRange)) { - VPlanTransforms::optimize(*Plan); - // Update the name of the latch of the top-level vector loop region region - // after optimizations which includes block folding. - Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch"); - VPlans.push_back(std::move(Plan)); - } - VF = SubRange.End; - } -} - VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const { assert(count_if(VPlans, [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6a1ea6b3439bf..51193964bdd83 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4616,6 +4616,10 @@ class VPlan { LLVM_ABI_FOR_TEST VPRegionBlock *getVectorLoopRegion(); LLVM_ABI_FOR_TEST const VPRegionBlock *getVectorLoopRegion() const; + /// Returns true if this VPlan is for an outer loop, i.e., its vector + /// loop region contains a nested loop region. + LLVM_ABI_FOR_TEST bool isOuterLoop() const; + /// Returns the 'middle' block of the plan, that is the block that selects /// whether to execute the scalar tail loop or the exit block from the loop /// latch. 
If there is an early exit from the vector loop, the middle block diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 9710767f905fe..2717b80e2eeaa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -260,6 +260,9 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { } void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) { + // Nested loop regions (outer-loop vectorization) are not supported yet. + if (Plan.isOuterLoop()) + return; VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll index e03110fc3807a..7f442f7d72e78 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s ; This test checks that, when stress testing VPlan, if the computed VF ; is 1, we override it to VF = 4. 
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll index f6b215f43d68e..f60a620deecf9 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s ; Verify that the stress testing flag for the VPlan H-CFG builder works as ; expected with and without enabling the VPlan H-CFG Verifier. diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll index 71bcd90304e43..a610f0669f483 100644 --- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll @@ -73,7 +73,7 @@ for.end15: ; CHECK-LABEL: case2 ; CHECK: LV: Loop hints: force=enabled width=0 interleave=0 ; CHECK: LV: We can vectorize this outer loop! -; CHECK: LV: Using VF 1 to build VPlans. +; CHECK: LV: VPlan computed VF 1. 
define void @case2(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) { entry: From 1ab839d4e02e74eef584b4b25428e854382feb98 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 15:40:06 -0400 Subject: [PATCH 091/538] [gn] port 2e2d90b98661 (#196618) --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index e9e1b5e155ed0..b474fba4239aa 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -420,6 +420,7 @@ if (current_toolchain == default_toolchain) { "__configuration/language.h", "__configuration/namespace.h", "__configuration/platform.h", + "__configuration/utility.h", "__coroutine/coroutine_handle.h", "__coroutine/coroutine_traits.h", "__coroutine/noop_coroutine_handle.h", From 62efd9d242f0ec0b358c47654acbed6d49baac9a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 15:40:54 -0400 Subject: [PATCH 092/538] [gn build] Port 3fe311f215d0 (#196619) --- llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn index ac63bbc6ee3b3..d331a619d2f74 100644 --- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn @@ -53,6 +53,7 @@ static_library("Target") { "OperatingSystem.cpp", "PathMappingList.cpp", "Platform.cpp", + "Policy.cpp", "Process.cpp", "ProcessTrace.cpp", "Queue.cpp", From c26716f5860dcca61326c59a10cff1bb7aab65e8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 15:41:13 -0400 Subject: [PATCH 093/538] [gn build] Port c507e2024c9a (#196620) --- llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git 
a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn index 9e033976b3088..a2dc483ddd252 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn @@ -50,6 +50,7 @@ static_library("IPO") { "Inliner.cpp", "Instrumentor.cpp", "InstrumentorConfigFile.cpp", + "InstrumentorStubPrinter.cpp", "Internalize.cpp", "LoopExtractor.cpp", "LowerTypeTests.cpp", From e9a914ae0141846d5cdf6a1dba7d121c8c614c5e Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 15:41:30 -0400 Subject: [PATCH 094/538] [gn build] Port e6efa1a4c9f6 (#196621) --- llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn index efe3d733f74b0..1e2d0bef08e4d 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn @@ -20,6 +20,7 @@ unittest("AMDGPUTests") { "CSETest.cpp", "DwarfRegMappings.cpp", "ExecMayBeModifiedBeforeAnyUse.cpp", + "GCNRegPressureTest.cpp", "LiveRegUnits.cpp", "PALMetadata.cpp", "UniformityAnalysisTest.cpp", From 792610e06148a98656d02c35e15b8c8fc9d612f1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 15:41:48 -0400 Subject: [PATCH 095/538] [gn build] Port ebb9a79cd370 (#196622) --- .../gn/secondary/clang/unittests/DependencyScanning/BUILD.gn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn index 95c748a992ed6..4a40696ed802e 100644 --- a/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn @@ -8,5 +8,8 @@ 
unittest("ClangDependencyScanningTests") { "//llvm/lib/Option", "//llvm/lib/Support", ] - sources = [ "DependencyScanningFilesystemTest.cpp" ] + sources = [ + "DependencyScanningFilesystemTest.cpp", + "InProcessModuleCacheTest.cpp", + ] } From cd1ed45c4f266488ce1f03725a382eb359cc0c15 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 8 May 2026 15:44:13 -0400 Subject: [PATCH 096/538] [gn] port 7e74c78ea342 (#196624) --- llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn index faed86ef98839..3c74c4feb1292 100644 --- a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn @@ -18,6 +18,7 @@ write_cmake_config("Config") { "LLDB_ENABLE_LZMA=", "LLDB_ENABLE_CURSES=", "CURSES_HAVE_NCURSES_CURSES_H=", + "LLDB_ENABLE_DYNAMIC_SCRIPTINTERPRETERS=", "LLDB_ENABLE_LUA=", "LLDB_ENABLE_MTE=", "LLDB_ENABLE_PYTHON=", From 1ae2255db9d0098a1ed30a184fcdfcc787247cce Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Fri, 8 May 2026 12:54:52 -0700 Subject: [PATCH 097/538] [clang][deps] Use `ModuleDepCollector` for Make output (#182063) The dependency scanner works significantly differently depending on what kind of output it's asked to produce. The Make output format has been using the regular Clang dependency collection mechanism since it was first implemented. This means the implementation works very differently to the rest of the scanner and isn't able to turn implicit module command lines into Makefiles using explicit modules. This PR unifies the two implementations, using `ModuleDepCollector` even for Make output. Emitting explicit module builds into Makefiles will come in a later PR. 
--- .../DependencyScannerImpl.h | 7 +-- .../DependencyScanning/ModuleDepCollector.h | 3 + .../clang/Tooling/DependencyScanningTool.h | 1 + .../DependencyScannerImpl.cpp | 60 ++++--------------- .../DependencyScanning/ModuleDepCollector.cpp | 7 +++ clang/lib/Tooling/DependencyScanningTool.cpp | 30 ++++++---- clang/test/ClangScanDeps/modules-cc1.cpp | 2 +- .../modules-has-include-umbrella-header.c | 3 +- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 4 +- .../Tooling/DependencyScannerTest.cpp | 10 ++-- 10 files changed, 56 insertions(+), 71 deletions(-) diff --git a/clang/include/clang/DependencyScanning/DependencyScannerImpl.h b/clang/include/clang/DependencyScanning/DependencyScannerImpl.h index 55dcbd6fe0e9f..31dcffb2f01dd 100644 --- a/clang/include/clang/DependencyScanning/DependencyScannerImpl.h +++ b/clang/include/clang/DependencyScanning/DependencyScannerImpl.h @@ -121,11 +121,10 @@ computePrebuiltModulesASTMap(CompilerInstance &ScanInstance, std::shared_ptr initializeScanInstanceDependencyCollector( CompilerInstance &ScanInstance, std::unique_ptr DepOutputOpts, - StringRef WorkingDirectory, DependencyConsumer &Consumer, - DependencyScanningService &Service, CompilerInvocation &Inv, - DependencyActionController &Controller, + DependencyConsumer &Consumer, DependencyScanningService &Service, + CompilerInvocation &Inv, DependencyActionController &Controller, PrebuiltModulesAttrsMap PrebuiltModulesASTMap, - llvm::SmallVector &StableDirs); + SmallVector &StableDirs); } // namespace dependencies } // namespace clang diff --git a/clang/include/clang/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/DependencyScanning/ModuleDepCollector.h index e7dd907a00381..108127fbbe523 100644 --- a/clang/include/clang/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/DependencyScanning/ModuleDepCollector.h @@ -228,6 +228,9 @@ class ModuleDepCollectorPP final : public PPCallbacks { void LexedFileChanged(FileID FID, LexedFileChangeReason Reason, 
SrcMgr::CharacteristicKind FileType, FileID PrevFID, SourceLocation Loc) override; + void HasInclude(SourceLocation Loc, StringRef FileName, bool IsAngled, + OptionalFileEntryRef File, + SrcMgr::CharacteristicKind FileType) override; void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, diff --git a/clang/include/clang/Tooling/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanningTool.h index c845e212ce153..c368e93fa6286 100644 --- a/clang/include/clang/Tooling/DependencyScanningTool.h +++ b/clang/include/clang/Tooling/DependencyScanningTool.h @@ -47,6 +47,7 @@ class DependencyScanningTool { /// dependency file contents otherwise. std::optional getDependencyFile(ArrayRef CommandLine, StringRef CWD, + dependencies::LookupModuleOutputCallback LookupModuleOutput, DiagnosticConsumer &DiagConsumer); /// Collect the module dependency in P1689 format for C++20 named modules. diff --git a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp index 40a7d1b908a6c..224413bb99cbc 100644 --- a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp @@ -27,32 +27,6 @@ using namespace clang; using namespace dependencies; -namespace { -/// Forwards the gatherered dependencies to the consumer. 
-class DependencyConsumerForwarder : public DependencyFileGenerator { -public: - DependencyConsumerForwarder(std::unique_ptr Opts, - StringRef WorkingDirectory, DependencyConsumer &C) - : DependencyFileGenerator(*Opts), WorkingDirectory(WorkingDirectory), - Opts(std::move(Opts)), C(C) {} - - void finishedMainFile(DiagnosticsEngine &Diags) override { - C.handleDependencyOutputOpts(*Opts); - llvm::SmallString<256> CanonPath; - for (const auto &File : getDependencies()) { - CanonPath = File; - llvm::sys::path::remove_dots(CanonPath, /*remove_dot_dot=*/true); - llvm::sys::path::make_absolute(WorkingDirectory, CanonPath); - C.handleFileDependency(CanonPath); - } - } - -private: - StringRef WorkingDirectory; - std::unique_ptr Opts; - DependencyConsumer &C; -}; - static bool checkHeaderSearchPaths(const HeaderSearchOptions &HSOpts, const HeaderSearchOptions &ExistingHSOpts, DiagnosticsEngine *Diags, @@ -77,6 +51,8 @@ static bool checkHeaderSearchPaths(const HeaderSearchOptions &HSOpts, return false; } +namespace { + using PrebuiltModuleFilesT = decltype(HeaderSearchOptions::PrebuiltModuleFiles); /// A listener that collects the imported modules and the input @@ -531,27 +507,14 @@ std::shared_ptr dependencies::initializeScanInstanceDependencyCollector( CompilerInstance &ScanInstance, std::unique_ptr DepOutputOpts, - StringRef WorkingDirectory, DependencyConsumer &Consumer, - DependencyScanningService &Service, CompilerInvocation &Inv, - DependencyActionController &Controller, + DependencyConsumer &Consumer, DependencyScanningService &Service, + CompilerInvocation &Inv, DependencyActionController &Controller, PrebuiltModulesAttrsMap PrebuiltModulesASTMap, - llvm::SmallVector &StableDirs) { - std::shared_ptr MDC; - switch (Service.getOpts().Format) { - case ScanningOutputFormat::Make: - ScanInstance.addDependencyCollector( - std::make_shared( - std::move(DepOutputOpts), WorkingDirectory, Consumer)); - break; - case ScanningOutputFormat::P1689: - case 
ScanningOutputFormat::Full: - MDC = std::make_shared( - Service, std::move(DepOutputOpts), ScanInstance, Consumer, Controller, - Inv, std::move(PrebuiltModulesASTMap), StableDirs); - ScanInstance.addDependencyCollector(MDC); - break; - } - + SmallVector &StableDirs) { + auto MDC = std::make_shared( + Service, std::move(DepOutputOpts), ScanInstance, Consumer, Controller, + Inv, std::move(PrebuiltModulesASTMap), StableDirs); + ScanInstance.addDependencyCollector(MDC); return MDC; } @@ -804,9 +767,8 @@ bool DependencyScanningAction::runInvocation( auto DepOutputOpts = createDependencyOutputOptions(*OriginalInvocation); MDC = initializeScanInstanceDependencyCollector( - ScanInstance, std::move(DepOutputOpts), WorkingDirectory, Consumer, - Service, *OriginalInvocation, Controller, *MaybePrebuiltModulesASTMap, - StableDirs); + ScanInstance, std::move(DepOutputOpts), Consumer, Service, + *OriginalInvocation, Controller, *MaybePrebuiltModulesASTMap, StableDirs); if (ScanInstance.getDiagnostics().hasErrorOccurred()) return false; diff --git a/clang/lib/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/DependencyScanning/ModuleDepCollector.cpp index f9bc4cc3098ef..127b26bf2e0f7 100644 --- a/clang/lib/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/DependencyScanning/ModuleDepCollector.cpp @@ -551,6 +551,13 @@ void ModuleDepCollectorPP::LexedFileChanged(FileID FID, MDC.addFileDep(llvm::sys::path::remove_leading_dotslash(*Filename)); } +void ModuleDepCollectorPP::HasInclude(SourceLocation Loc, StringRef FileName, + bool IsAngled, OptionalFileEntryRef File, + SrcMgr::CharacteristicKind FileType) { + if (File) + MDC.addFileDep(File->getName()); +} + void ModuleDepCollectorPP::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File, diff --git a/clang/lib/Tooling/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanningTool.cpp index 
1d4ab579f5827..d9d368c6ffde3 100644 --- a/clang/lib/Tooling/DependencyScanningTool.cpp +++ b/clang/lib/Tooling/DependencyScanningTool.cpp @@ -38,14 +38,20 @@ class MakeDependencyPrinterConsumer : public DependencyConsumer { } void handleFileDependency(StringRef File) override { - Dependencies.push_back(std::string(File)); + SmallString<128> NormalizedFile = File; + llvm::sys::path::remove_dots(NormalizedFile, /*remove_dot_dot=*/true); + Dependencies.emplace_back(NormalizedFile.str()); } // These are ignored for the make format as it can't support the full // set of deps, and handleFileDependency handles enough for implicitly // built modules to work. void handlePrebuiltModuleDependency(PrebuiltModuleDep PMD) override {} - void handleModuleDependency(ModuleDeps MD) override {} + void handleModuleDependency(ModuleDeps MD) override { + MD.forEachFileDep([this](StringRef File) { + DependenciesFromModules.push_back(std::string(File)); + }); + } void handleDirectModuleDependency(ModuleID ID) override {} void handleVisibleModule(std::string ModuleName) override {} void handleContextHash(std::string Hash) override {} @@ -56,10 +62,13 @@ class MakeDependencyPrinterConsumer : public DependencyConsumer { class DependencyPrinter : public DependencyFileGenerator { public: DependencyPrinter(DependencyOutputOptions &Opts, - ArrayRef Dependencies) + ArrayRef Dependencies, + ArrayRef ModuleDependencies) : DependencyFileGenerator(Opts) { for (const auto &Dep : Dependencies) addDependency(Dep); + for (const auto &Dep : ModuleDependencies) + addDependency(Dep); } void printDependencies(std::string &S) { @@ -68,13 +77,14 @@ class MakeDependencyPrinterConsumer : public DependencyConsumer { } }; - DependencyPrinter Generator(*Opts, Dependencies); + DependencyPrinter Generator(*Opts, Dependencies, DependenciesFromModules); Generator.printDependencies(S); } protected: std::unique_ptr Opts; std::vector Dependencies; + std::vector DependenciesFromModules; }; } // anonymous namespace @@ 
-187,12 +197,12 @@ bool tooling::computeDependencies( Controller, DiagConsumer, OverlayFS); } -std::optional -DependencyScanningTool::getDependencyFile(ArrayRef CommandLine, - StringRef CWD, - DiagnosticConsumer &DiagConsumer) { +std::optional DependencyScanningTool::getDependencyFile( + ArrayRef CommandLine, StringRef CWD, + LookupModuleOutputCallback LookupModuleOutput, + DiagnosticConsumer &DiagConsumer) { MakeDependencyPrinterConsumer DepConsumer; - CallbackActionController Controller(nullptr); + CallbackActionController Controller(LookupModuleOutput); if (!computeDependencies(Worker, CWD, CommandLine, DepConsumer, Controller, DiagConsumer)) return std::nullopt; @@ -540,7 +550,7 @@ bool CompilerInstanceWithContext::computeDependencies( }); auto MDC = initializeScanInstanceDependencyCollector( - CI, std::make_unique(*OutputOpts), CWD, Consumer, + CI, std::make_unique(*OutputOpts), Consumer, Worker.Service, /* The MDC's constructor makes a copy of the OriginalInvocation, so we can pass it in without worrying that it might be changed across diff --git a/clang/test/ClangScanDeps/modules-cc1.cpp b/clang/test/ClangScanDeps/modules-cc1.cpp index 04a365249f379..28fc020847d56 100644 --- a/clang/test/ClangScanDeps/modules-cc1.cpp +++ b/clang/test/ClangScanDeps/modules-cc1.cpp @@ -16,7 +16,7 @@ module header1 { header "header.h" } [{ "file": "DIR/modules_cc1.cpp", "directory": "DIR", - "command": "clang -cc1 DIR/modules_cc1.cpp -fimplicit-module-maps -o modules_cc1.o" + "command": "clang -cc1 DIR/modules_cc1.cpp -fmodules -fmodules-cache-path=DIR/cache -fimplicit-module-maps -o modules_cc1.o" }] // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json diff --git a/clang/test/ClangScanDeps/modules-has-include-umbrella-header.c b/clang/test/ClangScanDeps/modules-has-include-umbrella-header.c index 022c59ca65db2..f7f804794ab41 100644 --- a/clang/test/ClangScanDeps/modules-has-include-umbrella-header.c +++ b/clang/test/ClangScanDeps/modules-has-include-umbrella-header.c 
@@ -65,7 +65,8 @@ module Dependency { header "dependency.h" } // CHECK-NEXT: "command-line": [ // CHECK: ], // CHECK: "file-deps": [ -// CHECK-NEXT: "[[PREFIX]]/tu.c" +// CHECK-NEXT: "[[PREFIX]]/tu.c", +// CHECK-NEXT: "[[PREFIX]]/frameworks/FW.framework/PrivateHeaders/B.h" // CHECK-NEXT: ], // CHECK-NEXT: "input-file": "[[PREFIX]]/tu.c" // CHECK-NEXT: } diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 1d80ac519bb20..8944c5fc48e30 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -1010,8 +1010,8 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { // Run the tool on it. if (Format == ScanningOutputFormat::Make) { - auto MaybeFile = - WorkerTool.getDependencyFile(Input->CommandLine, CWD, DiagConsumer); + auto MaybeFile = WorkerTool.getDependencyFile( + Input->CommandLine, CWD, LookupOutput, DiagConsumer); handleDiagnostics(Filename, S, Errs); if (MaybeFile) DependencyOS.applyLocked([&](raw_ostream &OS) { OS << *MaybeFile; }); diff --git a/clang/unittests/Tooling/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScannerTest.cpp index 79fd5a312d2b9..86d4e0ee1b8c5 100644 --- a/clang/unittests/Tooling/DependencyScannerTest.cpp +++ b/clang/unittests/Tooling/DependencyScannerTest.cpp @@ -235,8 +235,9 @@ TEST(DependencyScanner, ScanDepsWithFS) { DependencyScanningTool ScanTool(Service); TextDiagnosticBuffer DiagConsumer; - std::optional DepFile = - ScanTool.getDependencyFile(CommandLine, CWD, DiagConsumer); + std::optional DepFile = ScanTool.getDependencyFile( + CommandLine, CWD, CallbackActionController::lookupUnreachableModuleOutput, + DiagConsumer); ASSERT_TRUE(DepFile.has_value()); EXPECT_EQ(llvm::sys::path::convert_to_slash(*DepFile), "test.cpp.o: /root/test.cpp /root/header.h\n"); @@ -297,8 +298,9 @@ TEST(DependencyScanner, ScanDepsWithModuleLookup) { // matter, the point of the test is to check that 
files are not read // unnecessarily. TextDiagnosticBuffer DiagConsumer; - std::optional DepFile = - ScanTool.getDependencyFile(CommandLine, CWD, DiagConsumer); + std::optional DepFile = ScanTool.getDependencyFile( + CommandLine, CWD, CallbackActionController::lookupUnreachableModuleOutput, + DiagConsumer); ASSERT_FALSE(DepFile.has_value()); EXPECT_TRUE(!llvm::is_contained(InterceptFS->StatPaths, OtherPath)); From 0a2f943c74ba69e1a68d0ab7bce2e25596c914da Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 8 May 2026 22:00:21 +0200 Subject: [PATCH 098/538] [libc++] Remove _LIBCPP_HIDE_FROM_ABI from <__utility/pair.h> (#196508) This is a follow-up to #193045. This only drops `_LIBCPP_HIDE_FROM_ABI` in a small part of the code base to make sure everything works as expected. Once this has been in trunk for a while and there aren't any problems, there will be larger follow-up patches to remove `_LIBCPP_HIDE_FROM_ABI` throughout the code base. --- libcxx/include/__utility/pair.h | 174 ++++++++++++++------------------ 1 file changed, 77 insertions(+), 97 deletions(-) diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index a8232bc9da9fa..7cc8a4e691084 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -55,22 +55,22 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __check_pair_construction { template - static _LIBCPP_HIDE_FROM_ABI constexpr bool __enable_implicit_default() { + static constexpr bool __enable_implicit_default() { return __is_implicitly_default_constructible<_T1>::value && __is_implicitly_default_constructible<_T2>::value; } template - static _LIBCPP_HIDE_FROM_ABI constexpr bool __enable_default() { + static constexpr bool __enable_default() { return is_default_constructible<_T1>::value && is_default_constructible<_T2>::value; } template - static _LIBCPP_HIDE_FROM_ABI constexpr bool __is_pair_constructible() { + static constexpr bool __is_pair_constructible() { return is_constructible<_T1, 
_U1>::value && is_constructible<_T2, _U2>::value; } template - static _LIBCPP_HIDE_FROM_ABI constexpr bool __is_implicit() { + static constexpr bool __is_implicit() { return is_convertible<_U1, _T1>::value && is_convertible<_U2, _T2>::value; } }; @@ -79,9 +79,8 @@ struct __check_pair_construction { template struct __non_trivially_copyable_base { - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI __non_trivially_copyable_base() _NOEXCEPT {} - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI - __non_trivially_copyable_base(__non_trivially_copyable_base const&) _NOEXCEPT {} + _LIBCPP_CONSTEXPR __non_trivially_copyable_base() _NOEXCEPT {} + _LIBCPP_CONSTEXPR_SINCE_CXX14 __non_trivially_copyable_base(__non_trivially_copyable_base const&) _NOEXCEPT {} }; template @@ -101,18 +100,18 @@ struct pair pair, void>; - _LIBCPP_HIDE_FROM_ABI pair(pair const&) = default; - _LIBCPP_HIDE_FROM_ABI pair(pair&&) = default; + pair(pair const&) = default; + pair(pair&&) = default; #ifdef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI pair() : first(), second() {} + pair() : first(), second() {} - _LIBCPP_HIDE_FROM_ABI pair(_T1 const& __t1, _T2 const& __t2) : first(__t1), second(__t2) {} + pair(_T1 const& __t1, _T2 const& __t2) : first(__t1), second(__t2) {} template - _LIBCPP_HIDE_FROM_ABI pair(const pair<_U1, _U2>& __p) : first(__p.first), second(__p.second) {} + pair(const pair<_U1, _U2>& __p) : first(__p.first), second(__p.second) {} - _LIBCPP_HIDE_FROM_ABI pair& operator=(pair const& __p) { + pair& operator=(pair const& __p) { first = __p.first; second = __p.second; return *this; @@ -126,7 +125,7 @@ struct pair class _U2, __enable_if_t::value && is_assignable::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI pair& operator=(pair<_U1, _U2> const& __p) { + pair& operator=(pair<_U1, _U2> const& __p) { first = __p.first; second = __p.second; return *this; @@ -134,13 +133,12 @@ struct pair #else template , __enable_if_t<_CheckArgsDep::__enable_default(), int> = 0> - 
explicit(!_CheckArgsDep::__enable_implicit_default()) _LIBCPP_HIDE_FROM_ABI constexpr pair() noexcept( + explicit(!_CheckArgsDep::__enable_implicit_default()) constexpr pair() noexcept( is_nothrow_default_constructible::value && is_nothrow_default_constructible::value) : first(), second() {} template , __enable_if_t<_CheckArgsDep::template __is_pair_constructible<_T1 const&, _T2 const&>(), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!_CheckArgsDep::template __is_implicit<_T1 const&, _T2 const&>()) pair(_T1 const& __t1, _T2 const& __t2) noexcept(is_nothrow_copy_constructible::value && is_nothrow_copy_constructible::value) @@ -155,7 +153,6 @@ struct pair class _U2, # endif __enable_if_t<__check_pair_construction<_T1, _T2>::template __is_pair_constructible<_U1, _U2>(), int> = 0 > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!__check_pair_construction<_T1, _T2>::template __is_implicit<_U1, _U2>()) pair(_U1&& __u1, _U2&& __u2) noexcept(is_nothrow_constructible::value && is_nothrow_constructible::value) @@ -166,7 +163,7 @@ struct pair template ::template __is_pair_constructible<_U1&, _U2&>(), int> = 0> - _LIBCPP_HIDE_FROM_ABI constexpr explicit(!__check_pair_construction<_T1, _T2>::template __is_implicit<_U1&, _U2&>()) + constexpr explicit(!__check_pair_construction<_T1, _T2>::template __is_implicit<_U1&, _U2&>()) pair(pair<_U1, _U2>& __p) noexcept((is_nothrow_constructible::value && is_nothrow_constructible::value)) : first(__p.first), second(__p.second) {} @@ -177,7 +174,7 @@ struct pair class _U2, __enable_if_t<__check_pair_construction<_T1, _T2>::template __is_pair_constructible<_U1 const&, _U2 const&>(), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit( + _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit( !__check_pair_construction<_T1, _T2>::template __is_implicit<_U1 const&, _U2 const&>()) pair(pair<_U1, _U2> const& __p) noexcept(is_nothrow_constructible::value && is_nothrow_constructible::value) @@ 
-186,7 +183,6 @@ struct pair template ::template __is_pair_constructible<_U1, _U2>(), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit(!__check_pair_construction<_T1, _T2>::template __is_implicit<_U1, _U2>()) pair(pair<_U1, _U2>&& __p) noexcept(is_nothrow_constructible::value && is_nothrow_constructible::value) @@ -198,8 +194,7 @@ struct pair class _U2, __enable_if_t<__check_pair_construction<_T1, _T2>::template __is_pair_constructible(), int> = 0> - _LIBCPP_HIDE_FROM_ABI constexpr explicit( - !__check_pair_construction<_T1, _T2>::template __is_implicit()) + constexpr explicit(!__check_pair_construction<_T1, _T2>::template __is_implicit()) pair(const pair<_U1, _U2>&& __p) noexcept(is_nothrow_constructible::value && is_nothrow_constructible::value) : first(std::move(__p.first)), second(std::move(__p.second)) {} @@ -209,19 +204,19 @@ struct pair template <__pair_like_no_subrange _PairLike> requires(is_constructible_v(std::declval<_PairLike &&>()))> && is_constructible_v(std::declval<_PairLike &&>()))>) - _LIBCPP_HIDE_FROM_ABI constexpr explicit( - !is_convertible_v(std::declval<_PairLike&&>())), first_type> || - !is_convertible_v(std::declval<_PairLike&&>())), second_type>) pair(_PairLike&& __p) + constexpr explicit(!is_convertible_v(std::declval<_PairLike&&>())), first_type> || + !is_convertible_v(std::declval<_PairLike&&>())), second_type>) + pair(_PairLike&& __p) : first(std::get<0>(std::forward<_PairLike>(__p))), second(std::get<1>(std::forward<_PairLike>(__p))) {} # endif template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_CONSTEXPR_SINCE_CXX20 pair(piecewise_construct_t __pc, tuple<_Args1...> __first_args, tuple<_Args2...> __second_args) noexcept( is_nothrow_constructible::value && is_nothrow_constructible::value) : pair(__pc, __first_args, __second_args, __index_sequence_for<_Args1...>(), __index_sequence_for<_Args2...>()) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& + _LIBCPP_CONSTEXPR_SINCE_CXX20 
pair& operator=(__conditional_t::value && is_copy_assignable::value, pair, __nat> const& __p) noexcept(is_nothrow_copy_assignable::value && @@ -231,7 +226,7 @@ struct pair return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=( + _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=( __conditional_t::value && is_move_assignable::value, pair, __nat>&& __p) noexcept(is_nothrow_move_assignable::value && is_nothrow_move_assignable::value) { @@ -245,7 +240,7 @@ struct pair class _U2, __enable_if_t::value && is_assignable::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=(pair<_U1, _U2> const& __p) { + _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=(pair<_U1, _U2> const& __p) { first = __p.first; second = __p.second; return *this; @@ -254,7 +249,7 @@ struct pair template ::value && is_assignable::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=(pair<_U1, _U2>&& __p) { + _LIBCPP_CONSTEXPR_SINCE_CXX20 pair& operator=(pair<_U1, _U2>&& __p) { first = std::forward<_U1>(__p.first); second = std::forward<_U2>(__p.second); return *this; @@ -262,7 +257,7 @@ struct pair # if _LIBCPP_STD_VER >= 23 template - _LIBCPP_HIDE_FROM_ABI constexpr const pair& operator=(pair const& __p) const + constexpr const pair& operator=(pair const& __p) const noexcept(is_nothrow_copy_assignable_v && is_nothrow_copy_assignable_v) requires(is_copy_assignable_v && is_copy_assignable_v) { @@ -272,7 +267,7 @@ struct pair } template - _LIBCPP_HIDE_FROM_ABI constexpr const pair& operator=(pair&& __p) const + constexpr const pair& operator=(pair&& __p) const noexcept(is_nothrow_assignable_v && is_nothrow_assignable_v) requires(is_assignable_v && is_assignable_v) @@ -283,7 +278,7 @@ struct pair } template - _LIBCPP_HIDE_FROM_ABI constexpr const pair& operator=(const pair<_U1, _U2>& __p) const + constexpr const pair& operator=(const pair<_U1, _U2>& __p) const requires(is_assignable_v && is_assignable_v) { first = 
__p.first; @@ -292,7 +287,7 @@ struct pair } template - _LIBCPP_HIDE_FROM_ABI constexpr const pair& operator=(pair<_U1, _U2>&& __p) const + constexpr const pair& operator=(pair<_U1, _U2>&& __p) const requires(is_assignable_v && is_assignable_v) { first = std::forward<_U1>(__p.first); @@ -304,7 +299,7 @@ struct pair requires(__different_from<_PairLike, pair> && is_assignable_v(std::declval<_PairLike>()))> && is_assignable_v(std::declval<_PairLike>()))>) - _LIBCPP_HIDE_FROM_ABI constexpr pair& operator=(_PairLike&& __p) { + constexpr pair& operator=(_PairLike&& __p) { first = std::get<0>(std::forward<_PairLike>(__p)); second = std::get<1>(std::forward<_PairLike>(__p)); return *this; @@ -314,7 +309,7 @@ struct pair requires(__different_from<_PairLike, pair> && is_assignable_v(std::declval<_PairLike>()))> && is_assignable_v(std::declval<_PairLike>()))>) - _LIBCPP_HIDE_FROM_ABI constexpr pair const& operator=(_PairLike&& __p) const { + constexpr pair const& operator=(_PairLike&& __p) const { first = std::get<0>(std::forward<_PairLike>(__p)); second = std::get<1>(std::forward<_PairLike>(__p)); return *this; @@ -328,34 +323,33 @@ struct pair template ::value && is_convertible<_U2 const&, _T2>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(tuple<_U1, _U2> const& __p) - : first(std::get<0>(__p)), second(std::get<1>(__p)) {} + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(tuple<_U1, _U2> const& __p) : first(std::get<0>(__p)), second(std::get<1>(__p)) {} template < class _U1, class _U2, __enable_if_t::value && is_constructible<_T2, _U2 const&>::value && !(is_convertible<_U1 const&, _T1>::value && is_convertible<_U2 const&, _T2>::value), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(tuple<_U1, _U2> const& __p) + _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(tuple<_U1, _U2> const& __p) : first(std::get<0>(__p)), second(std::get<1>(__p)) {} template ::value && is_convertible<_U2, _T2>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX14 pair(tuple<_U1, _U2>&& __p) + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(tuple<_U1, _U2>&& __p) : first(std::get<0>(std::move(__p))), second(std::get<1>(std::move(__p))) {} template ::value && is_constructible<_T2, _U2>::value && !(is_convertible<_U1, _T1>::value && is_convertible<_U2, _T2>::value) > = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(tuple<_U1, _U2>&& __p) + _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(tuple<_U1, _U2>&& __p) : first(std::get<0>(std::move(__p))), second(std::get<1>(std::move(__p))) {} template ::value && is_assignable<_T2&, _U2 const&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(tuple<_U1, _U2> const& __p) { + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(tuple<_U1, _U2> const& __p) { first = std::get<0>(__p); second = std::get<1>(__p); return *this; @@ -364,7 +358,7 @@ struct pair template ::value && is_assignable<_T2&, _U2&&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(tuple<_U1, _U2>&& __p) { + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(tuple<_U1, _U2>&& __p) { first = std::get<0>(std::move(__p)); second = std::get<1>(std::move(__p)); return *this; @@ -373,36 +367,34 @@ struct pair // from std::array template ::value && is_convertible<_Up const&, _T2>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(array<_Up, 2> const& __p) : first(__p[0]), second(__p[1]) {} + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(array<_Up, 2> const& __p) : first(__p[0]), second(__p[1]) {} template ::value && is_constructible<_T2, _Up const&>::value && !(is_convertible<_Up const&, _T1>::value && is_convertible<_Up const&, _T2>::value), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(array<_Up, 2> const& __p) - : first(__p[0]), second(__p[1]) {} + _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(array<_Up, 2> const& __p) : first(__p[0]), second(__p[1]) {} template ::value && 
is_convertible<_Up, _T2>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(array<_Up, 2>&& __p) - : first(std::move(__p)[0]), second(std::move(__p)[1]) {} + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair(array<_Up, 2>&& __p) : first(std::move(__p)[0]), second(std::move(__p)[1]) {} template ::value && is_constructible<_T2, _Up>::value && !(is_convertible<_Up, _T1>::value && is_convertible<_Up, _T2>::value), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(array<_Up, 2>&& __p) + _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit pair(array<_Up, 2>&& __p) : first(std::move(__p)[0]), second(std::move(__p)[1]) {} template ::value && is_assignable<_T2&, _Up const&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(array<_Up, 2> const& __p) { + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(array<_Up, 2> const& __p) { first = std::get<0>(__p); second = std::get<1>(__p); return *this; } template ::value && is_assignable<_T2&, _Up>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(array<_Up, 2>&& __p) { + _LIBCPP_CONSTEXPR_SINCE_CXX14 pair& operator=(array<_Up, 2>&& __p) { first = std::get<0>(std::move(__p)); second = std::get<1>(std::move(__p)); return *this; @@ -410,7 +402,7 @@ struct pair # endif // _LIBCPP_STD_VER < 23 #endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair& __p) + _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair& __p) _NOEXCEPT_(__is_nothrow_swappable_v&& __is_nothrow_swappable_v) { using std::swap; swap(first, __p.first); @@ -418,7 +410,7 @@ struct pair } #if _LIBCPP_STD_VER >= 23 - _LIBCPP_HIDE_FROM_ABI constexpr void swap(const pair& __p) const + constexpr void swap(const pair& __p) const noexcept(__is_nothrow_swappable_v && __is_nothrow_swappable_v) { using std::swap; swap(first, __p.first); @@ -429,7 +421,7 @@ struct pair private: #ifndef _LIBCPP_CXX03_LANG template - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_CONSTEXPR_SINCE_CXX20 pair(piecewise_construct_t, tuple<_Args1...>& __first_args, tuple<_Args2...>& __second_args, @@ -448,8 +440,7 @@ pair(_T1, _T2) -> pair<_T1, _T2>; // [pairs.spec], specialized algorithms template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator==(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) +inline _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator==(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) #if _LIBCPP_STD_VER >= 26 requires requires { { __x.first == __y.first } -> __boolean_testable; @@ -463,8 +454,7 @@ operator==(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) #if _LIBCPP_STD_VER >= 20 template -_LIBCPP_HIDE_FROM_ABI constexpr common_comparison_category_t< __synth_three_way_result<_T1, _U1>, - __synth_three_way_result<_T2, _U2> > +constexpr common_comparison_category_t< __synth_three_way_result<_T1, _U1>, __synth_three_way_result<_T2, _U2> > operator<=>(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { if (auto __c = std::__synth_three_way(__x.first, __y.first); __c != 0) { return __c; @@ -475,32 +465,27 @@ operator<=>(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { #else // _LIBCPP_STD_VER >= 20 template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator!=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator!=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator<(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator<(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return __x.first < __y.first || (!(__y.first < __x.first) && __x.second < __y.second); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator>(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { 
+inline _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator>(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return __y < __x; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator>=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator>=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return !(__x < __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator<=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator<=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return !(__y < __x); } @@ -524,7 +509,7 @@ struct common_type, pair<_U1, _U2>> { #endif // _LIBCPP_STD_VER >= 23 template && __is_swappable_v<_T2>, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair<_T1, _T2>& __x, pair<_T1, _T2>& __y) +inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair<_T1, _T2>& __x, pair<_T1, _T2>& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_T1>&& __is_nothrow_swappable_v<_T2>) { __x.swap(__y); } @@ -532,15 +517,14 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair<_T1, _ #if _LIBCPP_STD_VER >= 23 template requires(__is_swappable_v && __is_swappable_v) -_LIBCPP_HIDE_FROM_ABI constexpr void -swap(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) noexcept(noexcept(__x.swap(__y))) { +constexpr void swap(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) noexcept(noexcept(__x.swap(__y))) { __x.swap(__y); } #endif template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 -pair<__unwrap_ref_decay_t<_T1>, __unwrap_ref_decay_t<_T2> > make_pair(_T1&& __t1, _T2&& __t2) { +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<__unwrap_ref_decay_t<_T1>, __unwrap_ref_decay_t<_T2> > +make_pair(_T1&& __t1, _T2&& __t2) { return pair<__unwrap_ref_decay_t<_T1>, __unwrap_ref_decay_t<_T2> >(std::forward<_T1>(__t1), 
std::forward<_T2>(__t2)); } @@ -568,22 +552,22 @@ struct __get_pair; template <> struct __get_pair<0> { template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T1& get(pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 _T1& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __p.first; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T1& get(const pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T1& get(const pair<_T1, _T2>& __p) _NOEXCEPT { return __p.first; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T1&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 _T1&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward<_T1>(__p.first); } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T1&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T1&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward(__p.first); } }; @@ -591,92 +575,88 @@ struct __get_pair<0> { template <> struct __get_pair<1> { template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T2& get(pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 _T2& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __p.second; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T2& get(const pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T2& get(const pair<_T1, _T2>& __p) _NOEXCEPT { return __p.second; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T2&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 _T2&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward<_T2>(__p.second); } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T2&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T2&& 
get(const pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward(__p.second); } }; template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 -typename tuple_element<_Ip, pair<_T1, _T2> >::type& +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __get_pair<_Ip>::get(__p); } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type& +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(const pair<_T1, _T2>& __p) _NOEXCEPT { return __get_pair<_Ip>::get(__p); } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 -typename tuple_element<_Ip, pair<_T1, _T2> >::type&& +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return __get_pair<_Ip>::get(std::move(__p)); } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&& +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { return __get_pair<_Ip>::get(std::move(__p)); } #if _LIBCPP_STD_VER >= 14 template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T1& get(pair<_T1, _T2>& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T1& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __p.first; } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T1 const& get(pair<_T1, _T2> const& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T1 const& get(pair<_T1, _T2> const& __p) _NOEXCEPT { return __p.first; } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T1&& get(pair<_T1, _T2>&& __p) 
_NOEXCEPT { +[[__nodiscard__]] inline constexpr _T1&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward<_T1&&>(__p.first); } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T1 const&& get(pair<_T1, _T2> const&& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T1 const&& get(pair<_T1, _T2> const&& __p) _NOEXCEPT { return std::forward<_T1 const&&>(__p.first); } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T2& get(pair<_T1, _T2>& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T2& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __p.second; } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T2 const& get(pair<_T1, _T2> const& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T2 const& get(pair<_T1, _T2> const& __p) _NOEXCEPT { return __p.second; } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T2&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T2&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward<_T2&&>(__p.second); } template -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr _T2 const&& get(pair<_T1, _T2> const&& __p) _NOEXCEPT { +[[__nodiscard__]] inline constexpr _T2 const&& get(pair<_T1, _T2> const&& __p) _NOEXCEPT { return std::forward<_T2 const&&>(__p.second); } From 0236ad5908975f7bfcb5e098c3e7718e4dac68ec Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 8 May 2026 16:18:52 -0400 Subject: [PATCH 099/538] [mlir][core] Restore dropped printIR behavior. 
(#196628) Restore checking for module scope which is dropped in #195198 --- mlir/lib/Pass/IRPrinting.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mlir/lib/Pass/IRPrinting.cpp b/mlir/lib/Pass/IRPrinting.cpp index 032d4f7e2d67d..006ce3cbe794a 100644 --- a/mlir/lib/Pass/IRPrinting.cpp +++ b/mlir/lib/Pass/IRPrinting.cpp @@ -48,6 +48,11 @@ class IRPrinterInstrumentation : public PassInstrumentation { static void printIR(Operation *op, bool printModuleScope, raw_ostream &out, OpPrintingFlags flags) { + // Check to see if we are not printing at module scope. + if (!printModuleScope) + return op->print(out, op->getBlock() ? flags.useLocalScope() : flags); + + // Otherwise, we are printing at module scope. // Find the top-level operation. auto *topLevelOp = op; while (auto *parentOp = topLevelOp->getParentOp()) From 588fa29a3bf528dac761b7ce4b774f468439503e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 8 May 2026 22:28:01 +0200 Subject: [PATCH 100/538] [VPlan] Fix cyclic phi type inference in early outer loop plans. (#196634) For phis check if any of the operands are VPIRValues or we already have cached types. If so, return them. This fixes a verification stack overflow in the VPlan outer loop path after https://github.com/llvm/llvm-project/pull/192868. 
--- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 103dff1889a6a..a42b631cd3304 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -72,10 +72,17 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return SetResultTyFromOp(); switch (Opcode) { + case Instruction::PHI: + for (VPValue *Op : R->operands()) { + if (auto *VIR = dyn_cast(Op)) + return VIR->getType(); + if (auto *Ty = CachedTypes.lookup(Op)) + return Ty; + } + LLVM_FALLTHROUGH; case Instruction::ExtractElement: case Instruction::InsertElement: case Instruction::Freeze: - case Instruction::PHI: case VPInstruction::Broadcast: case VPInstruction::ComputeReductionResult: case VPInstruction::ExitingIVValue: From 0c0126dfb550e54c43aadb5b2fa1ad95cffa595c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 13:40:04 -0700 Subject: [PATCH 101/538] [DWARFLinker] Deduplicate .debug_frame CIEs across LinkContexts (#195393) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each LinkContext held its own EmittedCIEs map, so linking the same object twice (or two objects with identical CIEs) produced one CIE per LinkContext instead of one shared CIE. Hoist the registry to linker scope and split emission into three phases so contexts can emit their frames concurrently while still sharing one deduplicated CIE pool: 1. Scan (parallel, during link). scanFrameData() records the unique CIEs referenced by retained FDEs, in first-reference order, into FrameScanResult::CIEs. scanAndUnloadInput() chains the scan in front of the existing input-unload so the DWARFContext can be released before the post-link emit pass. 2. Merge (serial, after link completes). 
registerCIEs() walks each context's scanned CIEs in ObjectContexts order and try_emplaces them into the linker-wide CIERegistry. The first LinkContext to reference a CIE becomes its owner and reserves a local offset in its own .debug_frame section; later contexts only learn the owner's section and offset. 3. Emit (parallel). emitDebugFrame() writes each context's owned CIEs followed by its FDEs into its own SectionDescriptor. FDE CIE_pointers are recorded as DebugOffsetPatches against the owner's section; the existing patch resolver rebinds them to OwnerStartOffset + LocalOffset when global offsets are assigned. Each task writes only to its own section, so no locking is needed. Output is fully deterministic: ownership assignment, per-context CIE order, FDE order within a section, and section concatenation order all depend only on the input, not on thread scheduling. A context's CIEs may now appear after FDEs (from other contexts) that reference them — DWARF allows this, and cross-context FDE -> CIE pointers resolve correctly via the patch mechanism. --- .../DWARFLinker/Parallel/DWARFLinkerImpl.cpp | 205 ++++++++++++------ .../DWARFLinker/Parallel/DWARFLinkerImpl.h | 71 +++++- .../X86/DWARFLinkerParallel/frame-1.test | 36 --- .../X86/DWARFLinkerParallel/frame-2.test | 46 ---- llvm/test/tools/dsymutil/X86/frame-1.test | 3 +- llvm/test/tools/dsymutil/X86/frame-2.test | 5 +- 6 files changed, 215 insertions(+), 151 deletions(-) delete mode 100644 llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-1.test delete mode 100644 llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-2.test diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp index 5595cf9d9cbc5..3d036fc6ea865 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp @@ -186,8 +186,8 @@ Error DWARFLinkerImpl::link() { // Link object file. 
if (Error Err = Context->link(ArtificialTypeUnit.get())) GlobalData.error(std::move(Err), Context->InputDWARFFile.FileName); - - Context->InputDWARFFile.unload(); + if (Error Err = Context->unloadInput()) + GlobalData.error(std::move(Err), Context->InputDWARFFile.FileName); } } else { DefaultThreadPool Pool(llvm::parallel::strategy); @@ -196,8 +196,8 @@ Error DWARFLinkerImpl::link() { // Link object file. if (Error Err = Context->link(ArtificialTypeUnit.get())) GlobalData.error(std::move(Err), Context->InputDWARFFile.FileName); - - Context->InputDWARFFile.unload(); + if (Error Err = Context->unloadInput()) + GlobalData.error(std::move(Err), Context->InputDWARFFile.FileName); }); Pool.wait(); @@ -217,6 +217,25 @@ Error DWARFLinkerImpl::link() { } } + // Build the linker-wide CIE registry, then emit each context's + // .debug_frame in parallel. See CIERegistry for the ownership rules. + if (!GlobalData.getOptions().UpdateIndexTablesOnly) { + LinkContext::CIERegistry CIEs; + for (std::unique_ptr &Context : ObjectContexts) + if (Context->FrameScan) + Context->registerCIEs(CIEs); + + llvm::parallel::TaskGroup TGroup; + for (std::unique_ptr &Context : ObjectContexts) { + if (!Context->FrameScan) + continue; + TGroup.spawn([&]() { + if (Error Err = Context->emitDebugFrame(CIEs)) + GlobalData.error(std::move(Err), Context->InputDWARFFile.FileName); + }); + } + } + if (ArtificialTypeUnit != nullptr && !ArtificialTypeUnit->getTypePool() .getRoot() ->getValue() @@ -595,18 +614,6 @@ Error DWARFLinkerImpl::LinkContext::link(TypeUnit *ArtificialTypeUnit) { if (Error Err = emitInvariantSections()) return Err; - } else if (!CompileUnits.empty()) { - // Emit .debug_frame section. - - Error ResultErr = Error::success(); - llvm::parallel::TaskGroup TGroup; - // We use task group here as PerThreadBumpPtrAllocator should be called from - // the threads created by ThreadPoolExecutor. 
- TGroup.spawn([&]() { - if (Error Err = cloneAndEmitDebugFrame()) - ResultErr = std::move(Err); - }); - return ResultErr; } return Error::success(); @@ -752,12 +759,16 @@ Error DWARFLinkerImpl::LinkContext::emitInvariantSections() { return Error::success(); } -Error DWARFLinkerImpl::LinkContext::cloneAndEmitDebugFrame() { +Error DWARFLinkerImpl::LinkContext::scanFrameData() { + if (GlobalData.getOptions().UpdateIndexTablesOnly) + return Error::success(); if (!GlobalData.getTargetTriple().has_value()) return Error::success(); if (InputDWARFFile.Dwarf == nullptr) return Error::success(); + if (CompileUnits.empty()) + return Error::success(); const DWARFObject &InputDWARFObj = InputDWARFFile.Dwarf->getDWARFObj(); @@ -765,41 +776,50 @@ Error DWARFLinkerImpl::LinkContext::cloneAndEmitDebugFrame() { if (OrigFrameData.empty()) return Error::success(); + auto Scan = std::make_unique(); + Scan->FrameData = OrigFrameData; + Scan->AddressSize = InputDWARFObj.getAddressSize(); + RangesTy AllUnitsRanges; for (std::unique_ptr &Unit : CompileUnits) { for (auto CurRange : Unit->getFunctionRanges()) AllUnitsRanges.insert(CurRange.Range, CurRange.Value); } - unsigned SrcAddrSize = InputDWARFObj.getAddressSize(); - - SectionDescriptor &OutSection = - getOrCreateSectionDescriptor(DebugSectionKind::DebugFrame); - - DataExtractor Data(OrigFrameData, InputDWARFObj.isLittleEndian(), 0); + StringRef FrameBytes = Scan->FrameData; + DataExtractor Data(FrameBytes, InputDWARFObj.isLittleEndian(), 0); uint64_t InputOffset = 0; + const unsigned SrcAddrSize = Scan->AddressSize; + // Width of the CIE_pointer field at the start of every FDE (and of the + // CIE_id sentinel at the start of every CIE) in DWARF32 .debug_frame. + constexpr unsigned CIEPointerSize = 4; - // Store the data of the CIEs defined in this object, keyed by their - // offsets. - DenseMap LocalCIES; - - /// The CIEs that have been emitted in the output section. The actual CIE - /// data serves a the key to this StringMap. 
- StringMap EmittedCIEs; + // CIEs defined in this input, keyed by their input offsets. + DenseMap LocalCIEs; + DenseSet AddedCIEs; while (Data.isValidOffset(InputOffset)) { uint64_t EntryOffset = InputOffset; uint32_t InitialLength = Data.getU32(&InputOffset); if (InitialLength == 0xFFFFFFFF) - return createFileError(InputDWARFObj.getFileName(), + return createFileError(InputDWARFFile.FileName, createStringError(std::errc::invalid_argument, - "Dwarf64 bits no supported")); + "Dwarf64 bits not supported")); + + // Reject lengths that don't fit in the input section. substr() saturates + // silently, which would otherwise let a malformed length poison the + // CIE bytes used as the registry key. + if (InitialLength > FrameBytes.size() - InputOffset) + return createFileError( + InputDWARFFile.FileName, + createStringError(std::errc::invalid_argument, + "Truncated .debug_frame entry.")); uint32_t CIEId = Data.getU32(&InputOffset); if (CIEId == 0xFFFFFFFF) { // This is a CIE, store it. - StringRef CIEData = OrigFrameData.substr(EntryOffset, InitialLength + 4); - LocalCIES[EntryOffset] = CIEData; + StringRef CIEData = FrameBytes.substr(EntryOffset, InitialLength + 4); + LocalCIEs[EntryOffset] = CIEData; // The -4 is to account for the CIEId we just read. InputOffset += InitialLength - 4; continue; @@ -820,46 +840,105 @@ Error DWARFLinkerImpl::LinkContext::cloneAndEmitDebugFrame() { } // This is an FDE, and we have a mapping. - // Have we already emitted a corresponding CIE? - StringRef CIEData = LocalCIES[CIEId]; + StringRef CIEData = LocalCIEs.lookup(CIEId); if (CIEData.empty()) return createFileError( - InputDWARFObj.getFileName(), + InputDWARFFile.FileName, createStringError(std::errc::invalid_argument, "Inconsistent debug_frame content. Dropping.")); - uint64_t OffsetToCIERecord = OutSection.OS.tell(); - - // Look if we already emitted a CIE that corresponds to the - // referenced one (the CIE data is the key of that lookup). 
- auto IteratorInserted = - EmittedCIEs.insert(std::make_pair(CIEData, OffsetToCIERecord)); - OffsetToCIERecord = IteratorInserted.first->getValue(); - - // Emit CIE for this ID if it is not emitted yet. - if (IteratorInserted.second) - OutSection.OS << CIEData; - - // Remember offset to the FDE record, so that we might update - // field referencing CIE record(containing OffsetToCIERecord), - // when final offsets are known. OffsetToCIERecord(which is written later) - // is local to the current .debug_frame section, it should be updated - // with final offset of the .debug_frame section. - OutSection.notePatch( - DebugOffsetPatch{OutSection.OS.tell() + 4, &OutSection, true}); - - // Emit the FDE with updated address and CIE pointer. - // (4 + AddrSize) is the size of the CIEId + initial_location - // fields that will get reconstructed by emitFDE(). - unsigned FDERemainingBytes = InitialLength - (4 + SrcAddrSize); - emitFDE(OffsetToCIERecord, SrcAddrSize, Loc + Range->Value, - OrigFrameData.substr(InputOffset, FDERemainingBytes), OutSection); + // Reject FDEs whose length doesn't even cover the CIE_pointer and + // initial_location fields; otherwise the unsigned subtraction below + // would wrap and substr() would saturate to a giant garbage blob. + if (InitialLength < CIEPointerSize + SrcAddrSize) + return createFileError(InputDWARFFile.FileName, + createStringError(std::errc::invalid_argument, + "Truncated .debug_frame FDE.")); + + // Promote each CIE on first reference; CIEs no FDE references are + // dropped from the output. 
+ if (AddedCIEs.insert(CIEId).second) + Scan->CIEs.push_back(CIEData); + + unsigned FDERemainingBytes = InitialLength - (CIEPointerSize + SrcAddrSize); + Scan->FDEs.push_back({CIEData, Loc + Range->Value, + FrameBytes.substr(InputOffset, FDERemainingBytes)}); InputOffset += FDERemainingBytes; } + FrameScan = std::move(Scan); return Error::success(); } +void DWARFLinkerImpl::LinkContext::registerCIEs(CIERegistry &CIEs) { + assert(FrameScan && "registerCIEs called without FrameScan"); + SectionDescriptor &OutSection = + getOrCreateSectionDescriptor(DebugSectionKind::DebugFrame); + + uint32_t NextLocalOffset = 0; + for (StringRef CIEBytes : FrameScan->CIEs) { + auto [It, Inserted] = + CIEs.try_emplace(CIEBytes, CIELocation{&OutSection, NextLocalOffset}); + if (Inserted) { + FrameScan->OwnedCIEs.push_back(CIEBytes); + NextLocalOffset += static_cast(CIEBytes.size()); + } + } +} + +Error DWARFLinkerImpl::LinkContext::emitDebugFrame(const CIERegistry &CIEs) { + assert(FrameScan && "emitDebugFrame called without FrameScan"); + SectionDescriptor &OutSection = + getSectionDescriptor(DebugSectionKind::DebugFrame); + + // Emit owned CIEs at the offsets registerCIEs reserved for them. + for (StringRef CIEBytes : FrameScan->OwnedCIEs) + OutSection.OS << CIEBytes; + + const dwarf::FormParams FP = OutSection.getFormParams(); + const unsigned SrcAddrSize = FrameScan->AddressSize; + + for (const FrameScanResult::FDE &FDE : FrameScan->FDEs) { + auto It = CIEs.find(FDE.CIEBytes); + assert(It != CIEs.end() && "CIE missing from registry"); + SectionDescriptor *CIEOwnerSection = It->second.OwnerSection; + const uint32_t CIELocalOffset = It->second.LocalOffset; + + const uint64_t FDEPos = OutSection.OS.tell(); + // Note: this guards against a single context's section exceeding the + // DWARF32 limit. 
It does NOT catch the post-glue overflow that would + // happen if the concatenated .debug_frame across all contexts pushes + // past 4 GB; that case slips through silently because StartOffset is + // not yet assigned. A post-glue check would belong in the patch + // resolver in OutputSections.cpp. + if (FDEPos > FP.getDwarfMaxOffset()) + return createFileError( + InputDWARFFile.FileName, + createStringError(".debug_frame section offset " + "0x" + + Twine::utohexstr(FDEPos) + " exceeds the " + + dwarf::FormatString(FP.Format) + " limit")); + + // CIE_pointer field follows the 4-byte initial_length. + OutSection.notePatch(DebugOffsetPatch{FDEPos + 4, CIEOwnerSection, true}); + + emitFDE(CIELocalOffset, SrcAddrSize, FDE.Address, FDE.Instructions, + OutSection); + } + + FrameScan.reset(); + return Error::success(); +} + +Error DWARFLinkerImpl::LinkContext::unloadInput() { + // Scan the input's .debug_frame now, while the DWARFContext is still + // loaded, so the later (post-pool) emission pass can run against the + // scan result alone. + Error ScanErr = scanFrameData(); + InputDWARFFile.unload(); + return ScanErr; +} + /// Emit a FDE into the debug_frame section. \p FDEBytes /// contains the FDE data without the length, CIE offset and address /// which will be replaced with the parameter values. 
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h index 2dba5df559375..f808570e9ba06 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h @@ -14,6 +14,7 @@ #include "DWARFLinkerTypeUnit.h" #include "StringEntryToDwarfStringPoolEntryMap.h" #include "llvm/ADT/AddressRanges.h" +#include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/AccelTable.h" #include "llvm/DWARFLinker/Parallel/DWARFLinker.h" #include "llvm/DWARFLinker/StringPool.h" @@ -244,6 +245,59 @@ class DWARFLinkerImpl : public DWARFLinker { return Size; } + /// Section + local offset of a .debug_frame CIE that has been (or will + /// be) emitted by some LinkContext. Stored in CIERegistry so that any + /// FDE referencing the same CIE bytes can resolve its CIE_pointer to + /// OwnerSection->StartOffset + LocalOffset at output time, even when + /// the FDE lives in a different LinkContext's section. + struct CIELocation { + SectionDescriptor *OwnerSection; + uint32_t LocalOffset; + }; + + /// Linker-wide registry for .debug_frame CIEs. The key is the raw CIE + /// bytes. Populated by a serial pass over ObjectContexts (so ownership + /// is deterministic — first LinkContext wins) and then consumed + /// read-only by a parallel emission pass that writes each context's + /// .debug_frame section. SectionDescriptor pointers remain valid until + /// linking completes because they live in std::map-held shared_ptrs. + using CIERegistry = StringMap; + + /// Result of scanning one LinkContext's input .debug_frame. Produced + /// by scanFrameData() during the parallel link phase and consumed by + /// the serial CIE-registry merge and parallel emission passes. Owns a + /// copy of the raw frame bytes so the StringRef views below remain + /// valid after the input DWARFContext is unloaded. + struct FrameScanResult { + /// Owning copy of the input .debug_frame bytes. 
+ SmallString<0> FrameData; + + /// Address size of the input object, used by emitFDE to size the + /// FDE's initial_location field. + unsigned AddressSize = 0; + + /// Unique CIEs referenced by at least one retained FDE in this + /// context, in first-reference order. Each element is a view into + /// FrameData and is a key into the linker-wide CIERegistry. + SmallVector CIEs; + + /// FDEs retained for emission. CIEBytes is the registry key; + /// Instructions is the FDE body after the initial_length / + /// CIE_pointer / initial_location fields. + struct FDE { + StringRef CIEBytes; + uint64_t Address = 0; + StringRef Instructions; + }; + SmallVector FDEs; + + /// CIEs this context owns, set during the serial CIE-registry + /// merge. Emission writes these at local offsets 0, + /// OwnedCIEs[0].size(), ... in order. + SmallVector OwnedCIEs; + }; + std::unique_ptr FrameScan; + /// Link compile units for this context. Error link(TypeUnit *ArtificialTypeUnit); @@ -255,8 +309,21 @@ class DWARFLinkerImpl : public DWARFLinker { /// Emit invariant sections. Error emitInvariantSections(); - /// Clone and emit .debug_frame. - Error cloneAndEmitDebugFrame(); + /// Unload the input DWARFContext after scanning the input .debug_frame into + /// FrameScan. + Error unloadInput(); + + /// Parse this context's input .debug_frame into FrameScan. Deferred + /// CIE/FDE emission happens later against the scan result alone. + Error scanFrameData(); + + /// Register this context's CIEs with the linker-wide registry. + void registerCIEs(CIERegistry &CIEs); + + /// Emit this context's .debug_frame section. Safe to call in parallel + /// across contexts because each call writes only to its own + /// SectionDescriptor. + Error emitDebugFrame(const CIERegistry &CIEs); /// Emit FDE record. 
void emitFDE(uint32_t CIEOffset, uint32_t AddrSize, uint64_t Address, diff --git a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-1.test b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-1.test deleted file mode 100644 index 6b02f32f8541c..0000000000000 --- a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-1.test +++ /dev/null @@ -1,36 +0,0 @@ -# RUN: rm -rf %t -# RUN: mkdir -p %t -# RUN: llc -filetype=obj %p/../../Inputs/frame-dw2.ll -o %t/frame-dw2.o -# RUN: dsymutil --linker parallel -f -oso-prepend-path=%t -y %s -o - | \ -# RUN: llvm-dwarfdump -debug-frame - | FileCheck %s - -# This test is meant to verify that identical CIEs will get reused -# in the same file but not inbetween files. For this to happen, we -# link twice the same file using this made-up debug map: - ---- -triple: 'i386-apple-darwin' -objects: - - filename: frame-dw2.o - symbols: - - { sym: _bar, objAddr: 0x0, binAddr: 0x1000, size: 0x12 } - - { sym: _baz, objAddr: 0x0, binAddr: 0x2000, size: 0x12 } - - filename: frame-dw2.o - symbols: - - { sym: _bar, objAddr: 0x0, binAddr: 0x3000, size: 0x12 } - - { sym: _baz, objAddr: 0x0, binAddr: 0x4000, size: 0x12 } -... 
- -# CHECK: .debug_frame contents: -# CHECK: 00000000 {{[0-9a-f]*}} ffffffff CIE -# CHECK-NOT: FDE -# CHECK: FDE cie=00000000 pc=00001000...00001 -# CHECK-NOT: FDE -# CHECK: FDE cie=00000000 pc=00002000...00002 -# CHECK: [[CIECU2:[0-9a-f]*]] {{[0-9a-f]*}} ffffffff CIE -# CHECK-NOT: FDE -# CHECK: FDE cie=[[CIECU2]] pc=00003000...00003 -# CHECK-NOT: FDE -# CHECK: FDE cie=[[CIECU2]] pc=00004000...00004 -# CHECK-NOT: FDE -# CHECK: .eh_frame contents: diff --git a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-2.test b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-2.test deleted file mode 100644 index 871c5d0fa40b8..0000000000000 --- a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/frame-2.test +++ /dev/null @@ -1,46 +0,0 @@ -# RUN: rm -rf %t -# RUN: mkdir -p %t -# RUN: llc -filetype=obj %p/../../Inputs/frame-dw2.ll -o %t/frame-dw2.o -# RUN: llc -filetype=obj %p/../../Inputs/frame-dw4.ll -o %t/frame-dw4.o -# RUN: dsymutil --linker parallel -f -oso-prepend-path=%t -y %s -o - | \ -# RUN: llvm-dwarfdump -debug-frame - | FileCheck %s - -# Check the handling of multiple different CIEs. To have CIEs that -# appear to be different, use a dwarf2 version of the file along with -# a dwarf 4 version. The CIE header version (and layout) will be different. ---- -triple: 'i386-apple-darwin' -objects: - - filename: frame-dw2.o - symbols: - - { sym: _bar, objAddr: 0x0, binAddr: 0x1000, size: 0x12 } - - { sym: _baz, objAddr: 0x0, binAddr: 0x2000, size: 0x12 } - - filename: frame-dw4.o - symbols: - - { sym: _baz, objAddr: 0x0, binAddr: 0x3000, size: 0x12 } - - filename: frame-dw2.o - symbols: - - { sym: _bar, objAddr: 0x0, binAddr: 0x4000, size: 0x12 } -... 
- -# CHECK: .debug_frame contents: -# CHECK: 00000000 {{[0-9a-f]*}} ffffffff CIE -# CHECK-NEXT: Format: DWARF32 -# CHECK-NEXT: Version:{{.*}}1 -# CHECK-NOT: FDE -# CHECK: FDE cie=00000000 pc=00001000...00001 -# CHECK-NOT: FDE -# CHECK: FDE cie=00000000 pc=00002000...00002 -# CHECK-NOT: FDE -# CHECK: [[CIEDW4:[0-9a-f]*]] 00000010 ffffffff CIE -# CHECK-NEXT: Format: DWARF32 -# CHECK-NEXT: Version:{{.*}}4 -# CHECK-NOT: FDE -# CHECK: FDE cie=[[CIEDW4]] pc=00003000...00003 -# CHECK: [[CIEDW1_2:[0-9a-f]*]] 00000010 ffffffff CIE -# CHECK-NEXT: Format: DWARF32 -# CHECK-NEXT: Version:{{.*}}1 -# CHECK-NOT: FDE -# CHECK: FDE cie=[[CIEDW1_2]] pc=00004000...00004 -# CHECK-NOT: FDE -# CHECK: .eh_frame contents: diff --git a/llvm/test/tools/dsymutil/X86/frame-1.test b/llvm/test/tools/dsymutil/X86/frame-1.test index e2d72928b728f..36223d56be4c1 100644 --- a/llvm/test/tools/dsymutil/X86/frame-1.test +++ b/llvm/test/tools/dsymutil/X86/frame-1.test @@ -2,6 +2,7 @@ # RUN: mkdir -p %t # RUN: llc -filetype=obj %p/../Inputs/frame-dw2.ll -o %t/frame-dw2.o # RUN: dsymutil --linker classic -f -oso-prepend-path=%t -y %s -o - | llvm-dwarfdump -debug-frame - | FileCheck %s +# RUN: dsymutil --linker parallel -f -oso-prepend-path=%t -y %s -o - | llvm-dwarfdump -debug-frame - | FileCheck %s # This test is meant to verify that identical CIEs will get reused # in the same file but also inbetween files. 
For this to happen, we @@ -29,5 +30,3 @@ objects: # CHECK: FDE cie=00000000 pc=00003000...00003 # CHECK-NOT: FDE # CHECK: .eh_frame contents: - -## FIXME: Support --linker parallel diff --git a/llvm/test/tools/dsymutil/X86/frame-2.test b/llvm/test/tools/dsymutil/X86/frame-2.test index 4db89ea815f71..27a1db46f35c8 100644 --- a/llvm/test/tools/dsymutil/X86/frame-2.test +++ b/llvm/test/tools/dsymutil/X86/frame-2.test @@ -3,6 +3,7 @@ # RUN: llc -filetype=obj %p/../Inputs/frame-dw2.ll -o %t/frame-dw2.o # RUN: llc -filetype=obj %p/../Inputs/frame-dw4.ll -o %t/frame-dw4.o # RUN: dsymutil --linker classic -f -oso-prepend-path=%t -y %s -o - | llvm-dwarfdump -debug-frame - | FileCheck %s +# RUN: dsymutil --linker parallel -f -oso-prepend-path=%t -y %s -o - | llvm-dwarfdump -debug-frame - | FileCheck %s # Check the handling of multiple different CIEs. To have CIEs that # appear to be different, use a dwarf2 version of the file along with @@ -17,6 +18,8 @@ objects: - filename: frame-dw4.o symbols: - { sym: _baz, objAddr: 0x0, binAddr: 0x3000, size: 0x12 } + # Third object references the dw2 CIE owned by the first context. + # Exercises the cross-context CIE pointer (FDE here, CIE there). 
- filename: frame-dw2.o symbols: - { sym: _bar, objAddr: 0x0, binAddr: 0x4000, size: 0x12 } @@ -39,5 +42,3 @@ objects: # CHECK: FDE cie=00000000 pc=00004000...00004 # CHECK-NOT: FDE # CHECK: .eh_frame contents: - -## FIXME: Support --linker parallel From 7b5bb30a36c918c961d63eadd1329c30e0b5c77c Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Fri, 8 May 2026 13:40:16 -0700 Subject: [PATCH 102/538] AMDGPU/GlobalISel: RegBankLegalize rules for cluster_load_b32/b64/b128 (#196186) --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 19 +++++++ .../AMDGPU/llvm.amdgcn.cluster.load.ll | 55 ++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 3a5d3e6ff1345..18d4ab39f0a9d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1875,6 +1875,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, amdgcn_global_store_async_from_lds_b128}) .Any({{}, {{}, {IntrId, VgprP1, VgprP3}}}); + addRulesForIOpcs({amdgcn_cluster_load_b32}) + .Any({{UniB32}, {{UniInVgprB32}, {IntrId, SgprP1, Imm, SgprB32_M0}}}) + .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1, Imm, SgprB32_M0}}}) + .Any( + {{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1, Imm, SgprB32_M0}}}); + + addRulesForIOpcs({amdgcn_cluster_load_b64}) + .Any({{UniB64}, {{UniInVgprB64}, {IntrId, SgprP1, Imm, SgprB32_M0}}}) + .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1, Imm, SgprB32_M0}}}) + .Any( + {{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1, Imm, SgprB32_M0}}}); + + addRulesForIOpcs({amdgcn_cluster_load_b128}) + .Any({{UniB128}, {{UniInVgprB128}, {IntrId, SgprP1, Imm, SgprB32_M0}}}) + .Any({{DivB128, _, UniP1}, + {{VgprB128}, {IntrId, SgprP1, Imm, SgprB32_M0}}}) + .Any({{DivB128, _, DivP1}, + {{VgprB128}, {IntrId, VgprP1, Imm, SgprB32_M0}}}); + 
addRulesForIOpcs({amdgcn_cluster_load_async_to_lds_b8, amdgcn_cluster_load_async_to_lds_b32, amdgcn_cluster_load_async_to_lds_b64, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll index 5a9f954b6d9f0..b2ccd23117fcc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s declare i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask) declare <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask) @@ -189,6 +189,59 @@ entry: store <4 x i32> %val, ptr addrspace(1) %use ret void } +define amdgpu_ps void @cluster_load_b32_saddr_vmask(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %mask) { +; GFX1250-LABEL: cluster_load_b32_saddr_vmask: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void 
@cluster_load_b64_saddr_vmask(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %mask) { +; GFX1250-LABEL: cluster_load_b64_saddr_vmask: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @cluster_load_b128_saddr_vmask(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %mask) { +; GFX1250-LABEL: cluster_load_b128_saddr_vmask: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: cluster_load_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask) + store <4 x i32> %val, ptr addrspace(1) %use + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX1250-GISEL: {{.*}} ; GFX1250-SDAG: {{.*}} From aa7cb8edabfc01e7ea366616f88334d31b1c8e21 Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Fri, 8 May 2026 13:41:42 -0700 Subject: [PATCH 103/538] AMDGPU/GlobalISel: RegBankLegalize rules for cvt fp8 e5m3 intrinsics (#196369) --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 8 ++- .../AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll | 57 ++++++++++++++++++- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 18d4ab39f0a9d..5cc0d073df17b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1749,13 +1749,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}}); addRulesForIOpcs({amdgcn_cvt_sr_bf8_f32, amdgcn_cvt_sr_fp8_f32, - amdgcn_cvt_pk_bf8_f32, amdgcn_cvt_pk_fp8_f32}, + amdgcn_cvt_sr_fp8_f32_e5m3, amdgcn_cvt_pk_bf8_f32, + amdgcn_cvt_pk_fp8_f32, amdgcn_cvt_pk_fp8_f32_e5m3}, Standard) .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}) .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}); - addRulesForIOpcs( - {amdgcn_cvt_off_f32_i4, amdgcn_cvt_f32_bf8, amdgcn_cvt_f32_fp8}, Standard) + addRulesForIOpcs({amdgcn_cvt_off_f32_i4, amdgcn_cvt_f32_bf8, + amdgcn_cvt_f32_fp8, amdgcn_cvt_f32_fp8_e5m3}, + Standard) .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}) .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}}); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll index 4b59f9f7cf9c7..c125c26dd59b7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn 
-mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s declare i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float, float, i32, i1) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float, i32, i32, i32) @@ -232,3 +232,58 @@ define float @test_cvt_f32_fp8_e5m3_byte3(i32 %a) { %ret = tail call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 %a, i32 3) ret float %ret } + +define amdgpu_ps i32 @test_cvt_pk_fp8_f32_e5m3_word0_sss(float inreg %x, float inreg %y, i32 inreg %old) { +; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_e5m3_word0_sss: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_fp8_f32 v0.l, s0, s1 clamp +; GFX1250-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_e5m3_word0_sss: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v0, s0, s1 clamp +; GFX1250-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog +; +; 
GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_e5m3_word0_sss: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_pk_fp8_f32 v0, s0, s1 clamp +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float %x, float %y, i32 %old, i1 false) + ret i32 %ret +} + +define amdgpu_ps i32 @test_cvt_sr_fp8_f32_e5m3_byte0_sss(float inreg %x, i32 inreg %r, i32 inreg %old) { +; GFX1250-LABEL: test_cvt_sr_fp8_f32_e5m3_byte0_sss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_sr_fp8_f32 v0, s0, s1 clamp +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float %x, i32 %r, i32 %old, i32 0) + ret i32 %ret +} + +define amdgpu_ps float @test_cvt_f32_fp8_e5m3_byte0_s(i32 inreg %a) { +; GFX1250-LABEL: test_cvt_f32_fp8_e5m3_byte0_s: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_cvt_f32_fp8_e64 v0, s0 clamp +; GFX1250-NEXT: ; return to shader part epilog + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 %a, i32 0) + ret float %ret +} From 7748bf56c178a11dc2c705d94be8a9951da62e19 Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Fri, 8 May 2026 20:43:31 +0000 Subject: [PATCH 104/538] [libc] Skip targets with unavailable __ONLY flags (#196637) When SKIP_FLAG_EXPANSION strips a flag that has the __ONLY modifier, 
remove_duplicated_flags drops the flag from the list. This leaves expand_flags_for_target with an empty flag list, causing it to create a plain (non-flag) target. The __ONLY semantics, "only build this target with the flag active", are silently violated. On x86-64 CI runners without FMA, this results in cosf_float_test and sinf_float_test being built and linked without FMA. The sincosf algorithm was tuned assuming fused multiply-add precision, so the unfused x*y+z fallback exceeds the 3.5 ULP tolerance (57 ULP for cosf, 12 ULP for sinf). Added an early return in add_target_with_flags: if any flag with the __ONLY modifier would be skipped, the target is not generated. Assisted-by: Automated tooling, human reviewed. --- libc/cmake/modules/LLVMLibCFlagRules.cmake | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake index 4bbd21ab569dc..d721756f9b4c5 100644 --- a/libc/cmake/modules/LLVMLibCFlagRules.cmake +++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake @@ -241,6 +241,22 @@ function(add_target_with_flags target_name) list(APPEND ADD_TO_EXPAND_FLAGS ${ADD_TO_EXPAND_ADD_FLAGS}) endif() list(APPEND ADD_TO_EXPAND_FLAGS ${deps_flag_list}) + + # If any flag with the __ONLY modifier is unavailable (i.e. its + # SKIP_FLAG_EXPANSION is set), skip this target entirely. The __ONLY + # modifier means the target must only be built with that flag active; + # building without it would produce incorrect results. 
+ foreach(flag_with_modifier IN LISTS ADD_TO_EXPAND_FLAGS) + extract_flag_modifier(${flag_with_modifier} flag modifier) + if("${modifier}" STREQUAL "ONLY" AND SKIP_FLAG_EXPANSION_${flag}) + if(SHOW_INTERMEDIATE_OBJECTS) + message(STATUS "Not generating ${fq_target_name} since " + "${flag} is not available on the host.") + endif() + return() + endif() + endforeach() + remove_duplicated_flags("${ADD_TO_EXPAND_FLAGS}" flags) list(SORT flags) From a47d3636f953870d96fb6cc68817365fdad2f9fe Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 13:56:04 -0700 Subject: [PATCH 105/538] [DWARFLinker] Don't duplicate classes with in-class static decls (#196442) An in-class static declaration was forced to PlainDwarf placement and cascaded that up to its enclosing class. If the class was already in the type table via the out-of-line definition's specification, it ended up with Both placement and cloneDIE emitted two copies. Keep in-class static declarations in the type table so they stay with their enclosing type. 
--- .../Parallel/DependencyTracker.cpp | 26 +++++ .../Inputs/odr-static-member-decl/a.o | Bin 0 -> 2096 bytes .../Inputs/odr-static-member-decl/b.o | Bin 0 -> 2312 bytes .../odr-static-member-decl.test | 104 ++++++++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 llvm/test/tools/dsymutil/Inputs/odr-static-member-decl/a.o create mode 100644 llvm/test/tools/dsymutil/Inputs/odr-static-member-decl/b.o create mode 100644 llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-static-member-decl.test diff --git a/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp b/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp index 68b01dd7ddad3..46f2a20f474c5 100644 --- a/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp @@ -397,6 +397,32 @@ getFinalPlacementForEntry(const UnitEntryPairTy &Entry, return CompileUnit::PlainDwarf; if (Entry.DieEntry->getTag() == dwarf::DW_TAG_variable) { + // In-class static member declarations (e.g. "static constexpr int x = 1;") + // are DW_TAG_variable children of a DW_TAG_class_type / + // DW_TAG_structure_type / DW_TAG_union_type with DW_AT_declaration set. + // They are part of the class type and belong in the TypeTable together with + // the class. Forcing them into PlainDwarf would also drag the parent class + // into PlainDwarf (via markParentsAsKeepingChildren), producing a duplicate + // empty class declaration DIE alongside the full class definition emitted + // in another CU. 
+ bool IsDeclaration = dwarf::toUnsigned( + Entry.CU->find(Entry.DieEntry, dwarf::DW_AT_declaration), 0); + bool ParentIsType = false; + if (IsDeclaration) { + if (std::optional ParentIdx = Entry.DieEntry->getParentIdx()) { + dwarf::Tag ParentTag = + Entry.CU->getDebugInfoEntry(*ParentIdx)->getTag(); + ParentIsType = ParentTag == dwarf::DW_TAG_class_type || + ParentTag == dwarf::DW_TAG_structure_type || + ParentTag == dwarf::DW_TAG_union_type; + } + } + if (IsDeclaration && ParentIsType) { + // Pure declarations have no runtime address; they belong with the class + // type. Always place in TypeTable regardless of how they were reached. + return CompileUnit::TypeTable; + } + // Do not put variable into the "TypeTable" and "PlainDwarf" at the same // time. if (EntryInfo.getPlacement() == CompileUnit::PlainDwarf || diff --git a/llvm/test/tools/dsymutil/Inputs/odr-static-member-decl/a.o b/llvm/test/tools/dsymutil/Inputs/odr-static-member-decl/a.o new file mode 100644 index 0000000000000000000000000000000000000000..3af19a5c265d8b28a47c2dcf7b32dd3546b49f87 GIT binary patch literal 2096 zcma)7U1%It6h3!mXLplnnoYKljR|$dj6a3W{upA=(k5n8le)#w#55J;GP^rzmhR3h zGn=?A=?@jf5|HXcse)R;2kDDXijM`KeDci)9|b`~@v-sWbMD+pH)8`iaOV8nbG|uq z&$)NL{o~hv5&%pEd?3e=C+U$zMjA%`iWc{v_vP9OXERjGLP2HB;v(`QcZgoHTnNY8~`V8W^F1fvq- z)tyzB{87A5WN=-|-}i~|EzuKH2DUfOJ=$v{9wlgb0}(E1QhL`3mkka%}x{s;TS zvuZT~7{&WV;-w^BDIO@>G%SJo(E1#!Y5TYy#rsPpeJb&m5N{Vi#N@o{Hd|J;&DWjR z>rRdHa!IBNaYrU+rE?qc#3&~Bs?T2Is~%PtByhfa{_aVn#ep~k;RMzFz}tklO~B>+gU_M207UFM*!l; zNEAf=l&m9*l6p+%bOE`WC|9l495K-tSikXbk)%H=k({X7Ihp{V?IsBiQ!|zqkVWA^zFDNF+zI+ z=JAln$IyFR*U^(O8KF&O978oIb3T0nudk#AN+xsw^z@57pss1a@bl>-9|NYSLPs+O zbbFAcXvr93s&<@$lq0dk3@O+Ojg)v?izP%XTkh!vg?vfn-Hr}Q6;{G&fv{=Za zWfqFhn1y`dX|T*{s|C4oz2aHk#azkWup4g6_Hr|BvuQcCGS1eL>o)w{V(BG6ciO7H zytq=FFegmEb^#)0uH9_q+?sc^vR-f0U=cb{ciM0f)_r>#_^ab27Yd8u!vU;T6Z?e< z(j8>_;_lg@>ynuMC$zKW^785Yv-v!#|BK6fsX+eP+xnqfQyt&@c$0R9b}8H)%AxCs z-aClQ(rP+}z{*%4RWp#lL<2-$Kz$J-<&!i?@;}jCs#rNB*V&~rz&a_GI*>@-#Hca! 
zk0NLP{#TI9$95e|y`{|b$!iiWVGQjFohioHp9lL$&})!Uc~qR%ezaB8`?bT$0O~ON z`A=Vbbop#@{`()wL(AuW#*+c;w({iTBX6GEynl{=vvq%=f&19AbJ)|`aPM_(G=54u qY-)r0F-1Z6Nb^iN8f)eny@ZH*fR%`HNm|lI9s21J67Om?`u+oI0rk@W literal 0 HcmV?d00001 diff --git a/llvm/test/tools/dsymutil/Inputs/odr-static-member-decl/b.o b/llvm/test/tools/dsymutil/Inputs/odr-static-member-decl/b.o new file mode 100644 index 0000000000000000000000000000000000000000..c4060a18e3346c6c6e59d6f6b0970f6b6f94e0c9 GIT binary patch literal 2312 zcma)7U1%It6h3!mXLplLnoTy4#H8ugX0--0yBk9+8rsGtZKB&6y0Nx`4Er-_7B{oY z&L(baw56i71WNTm+9xTfv`2JitFxrqXhAt1>914PJgH>7rba1x_S&NGavS*tcl3}boz zL(}dY7+Yt(16(m+uofuOjs+O&U$r7Ou-@S1U$oWySXL?cu|N z%+wWq_y75v8D2g^@_j1#0)A;U)qUg3GDv0yoj-uzVls zE*eig|19~2>ojy(@=2Ff+kPj}!Dm;`#Y04X0K$W~|2F2c{W~=K*dHwct-{~iNLxSt zhyB9BrJoiqy?$?DYyRFAz4^=d%U{tr=7?Sqi$ydwc40I2+qqa}AUrS}>mBa%+XGl^ zijG8-K`Vx{jEL!w8ViM<3O$Y3d{d;x79;E&7@8Rx=_b;;8(sS_!v6>@;xP)0p?7~M zgr1--nD`+0)ELiPj2yu8*rAa!=yniw_Biv%LU0-hC#`~ZlTcJQ#XiBW9D4{5&2{Qf zwV+Q3pLPVkP{y@DiVd8RpN)WD^97wr;zVbNRPO6g-l=twT7*fRI=A}(_tx`qv0nzx z0at)`f%kyRz*XQQ;5xuh(QLU~G^4p<#yTC{Fe`SUWJM>FdO}Y`_d2~3lgR`;JvsS= zo=hYkqpY4QmnohqWGk7!oCxY~mMQ zLl)ha9ja3pa`T)7w8L&R-1e{7j+s zhi7Yp&{h5YO->#slatCx(cx+Z}F#b-WN zX{$?Y`{o{z7kCWdKep@-z z&2h7w1vsm`z7XQi9o?*W834vE=K}Y@S7G;Pqe?Gqy16TOtZU-y_-m;T`#;$E@ikmN z5?7T+_YA*vc=PTly + S::x; } +# b.cpp: +# #include "h.h" +# const int S::x; +# int useB(S *s) { return s->y - S::x; } +# +# clang++ -target x86_64-apple-darwin -g -O0 -c a.cpp -o a.o +# clang++ -target x86_64-apple-darwin -g -O0 -c b.cpp -o b.o + +--- +triple: 'x86_64-apple-darwin' +objects: + - filename: 'odr-static-member-decl/a.o' + timestamp: 0 + symbols: + - { sym: __Z4useAP1S, objAddr: 0x0, binAddr: 0x100000000, size: 0x20 } + - filename: 'odr-static-member-decl/b.o' + timestamp: 0 + symbols: + - { sym: __Z4useBP1S, objAddr: 0x0, binAddr: 0x100000100, size: 0x20 } + - { sym: __ZN1S1xE, objAddr: 0x14, binAddr: 0x100000200, size: 0x4 } +... 
+ +# Parallel linker: struct S — with the in-class static x and the member y — +# must appear exactly once, in the artificial type unit. The two source CUs +# must not carry their own copy. The --implicit-check-not above enforces +# "exactly one structure_type in the entire dump". + +# PARALLEL: DW_TAG_compile_unit +# PARALLEL: DW_AT_name{{.*}}"__artificial_type_unit" +# PARALLEL: DW_TAG_structure_type +# PARALLEL: DW_AT_name{{.*}}"S" +# PARALLEL: DW_TAG_member +# PARALLEL: DW_AT_name{{.*}}"y" +# PARALLEL: DW_TAG_variable +# PARALLEL: DW_AT_name{{.*}}"x" +# PARALLEL: DW_AT_declaration + +# PARALLEL: DW_TAG_compile_unit +# PARALLEL: DW_AT_name{{.*}}"a.cpp" +# PARALLEL: DW_TAG_subprogram +# PARALLEL: DW_AT_name{{.*}}"useA" + +# PARALLEL: DW_TAG_compile_unit +# PARALLEL: DW_AT_name{{.*}}"b.cpp" +# PARALLEL: DW_TAG_subprogram +# PARALLEL: DW_AT_name{{.*}}"useB" + +# Classic linker: there is no artificial type unit; each source CU keeps its +# own struct S. Verify both copies are well-formed (member y, in-class static +# decl x) and that each CU has its subprogram. 
+ +# CLASSIC: DW_TAG_compile_unit +# CLASSIC: DW_AT_name{{.*}}"a.cpp" +# CLASSIC: DW_TAG_structure_type +# CLASSIC: DW_AT_name{{.*}}"S" +# CLASSIC: DW_TAG_variable +# CLASSIC: DW_AT_name{{.*}}"x" +# CLASSIC: DW_AT_declaration +# CLASSIC: DW_TAG_member +# CLASSIC: DW_AT_name{{.*}}"y" +# CLASSIC: DW_TAG_subprogram +# CLASSIC: DW_AT_name{{.*}}"useA" + +# CLASSIC: DW_TAG_compile_unit +# CLASSIC: DW_AT_name{{.*}}"b.cpp" +# CLASSIC: DW_TAG_structure_type +# CLASSIC: DW_AT_name{{.*}}"S" +# CLASSIC: DW_TAG_variable +# CLASSIC: DW_AT_name{{.*}}"x" +# CLASSIC: DW_AT_declaration +# CLASSIC: DW_TAG_member +# CLASSIC: DW_AT_name{{.*}}"y" +# CLASSIC: DW_TAG_subprogram +# CLASSIC: DW_AT_name{{.*}}"useB" From b1b3b430c6a39ad7452a7907a4c781c0672e2bd7 Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Fri, 8 May 2026 21:20:00 +0000 Subject: [PATCH 106/538] [libc] Disable -march=native in CI to fix sccache poisoning (#196560) -march=native is incompatible with shared build caches because sccache treats it as a literal string. Object files compiled on one CPU model get silently served to runners with a different CPU, causing SIGILL crashes in the opt_host memory tests. Made LIBC_COMPILE_OPTIONS_NATIVE a CMake cache variable so CI can override it. Both overlay and fullbuild workflows now pass -DLIBC_COMPILE_OPTIONS_NATIVE="" to disable -march=native. Local developer builds are unaffected and still default to -march=native. Reverted the per-CPU cache key approach from #196477 in favour of this fix, which addresses the root cause. Bumped sccache key versions (v2) in both workflows to invalidate the poisoned caches. Assisted-by: Automated tooling, human reviewed. 
--- .github/workflows/libc-fullbuild-tests.yml | 5 ++-- .github/workflows/libc-overlay-tests.yml | 28 ++----------------- .../modules/LLVMLibCCheckCpuFeatures.cmake | 19 +++++++++++-- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index 3a9d1436d5d48..4a7cfa119b74c 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -120,7 +120,7 @@ jobs: uses: hendrikmuhs/ccache-action@33522472633dbd32578e909b315f5ee43ba878ce # v1.2.22 with: max-size: 1G - key: libc_fullbuild_${{ matrix.c_compiler }} + key: libc_fullbuild_v2_${{ matrix.c_compiler }} variant: sccache - name: Set reusable strings @@ -145,7 +145,8 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache - -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }}" + -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }} + -DLIBC_COMPILE_OPTIONS_NATIVE=''" if [[ "${{ matrix.include_scudo }}" == "ON" || "${{ matrix.build_fuzzing_tests }}" == "ON" ]]; then export RUNTIMES="$RUNTIMES;compiler-rt" diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml index a020f0bfd5cd3..f63150983aa03 100644 --- a/.github/workflows/libc-overlay-tests.yml +++ b/.github/workflows/libc-overlay-tests.yml @@ -44,31 +44,6 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - # The libc build uses -march=native for opt_host tests, which means - # the generated object files are specific to the runner's CPU model. - # sccache treats -march=native as a literal string in its cache key, - # so without per-CPU cache keys, object files compiled on one CPU - # model are silently served to runners with a different CPU, causing - # illegal instruction crashes at runtime. 
- - name: Detect CPU model - id: cpu-info - shell: bash - run: | - if [ "$RUNNER_OS" = "Linux" ]; then - # x86 has 'model name', ARM has 'CPU implementer' + 'CPU part'. - cpu_model=$(grep -m1 'model name' /proc/cpuinfo | cut -d: -f2 | xargs | tr ' ' '-' || true) - if [ -z "$cpu_model" ]; then - impl=$(grep -m1 'CPU implementer' /proc/cpuinfo | cut -d: -f2 | xargs) - part=$(grep -m1 'CPU part' /proc/cpuinfo | cut -d: -f2 | xargs) - cpu_model="arm-${impl:-unknown}-${part:-unknown}" - fi - elif [ "$RUNNER_OS" = "macOS" ]; then - cpu_model=$(sysctl -n machdep.cpu.brand_string | tr ' ' '-') - else - cpu_model="generic" - fi - echo "cpu-model=${cpu_model:-unknown}" >> "$GITHUB_OUTPUT" # Libc's build is relatively small comparing with other components of LLVM. # A fresh linux overlay takes about 180MiB of uncompressed disk space, which can @@ -81,7 +56,7 @@ jobs: uses: hendrikmuhs/ccache-action@33522472633dbd32578e909b315f5ee43ba878ce # v1.2.22 with: max-size: 1G - key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }}_${{ steps.cpu-info.outputs.cpu-model }} + key: libc_overlay_build_v2_${{ matrix.os }}_${{ matrix.compiler.c_compiler }} variant: sccache # MPFR is required by some of the mathlib tests. 
@@ -122,6 +97,7 @@ jobs: -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_POLICY_DEFAULT_CMP0141=NEW -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT=Embedded + -DLIBC_COMPILE_OPTIONS_NATIVE="" -DLLVM_ENABLE_RUNTIMES=libc -G Ninja -S ${{ github.workspace }}/runtimes diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake index d76f3b16b30ec..4b8f1c3399ff5 100644 --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -7,16 +7,29 @@ set(ALL_CPU_FEATURES "") if(LIBC_TARGET_ARCHITECTURE_IS_X86_64) set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX AVX2 AVX512F AVX512BW FMA) - set(LIBC_COMPILE_OPTIONS_NATIVE -march=native) + set(_libc_native_default -march=native) elseif(LIBC_TARGET_ARCHITECTURE_IS_AARCH64) set(ALL_CPU_FEATURES FullFP16 MOPS SVE SVE2) - set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native) + set(_libc_native_default -mcpu=native) +else() + set(_libc_native_default "") endif() if(LIBC_CROSSBUILD) - set(LIBC_COMPILE_OPTIONS_NATIVE ${LIBC_COMPILE_OPTIONS_DEFAULT}) + set(_libc_native_default ${LIBC_COMPILE_OPTIONS_DEFAULT}) endif() +# LIBC_COMPILE_OPTIONS_NATIVE controls the -march/-mcpu flag used for +# host-optimised builds and CPU feature detection. It defaults to +# -march=native (x86) or -mcpu=native (AArch64) for local developer builds. +# +# CI environments with shared build caches (e.g. sccache) should set this +# to an empty string (-DLIBC_COMPILE_OPTIONS_NATIVE="") because the cache +# treats -march=native as a literal string and will silently serve object +# files compiled for a different CPU model. +set(LIBC_COMPILE_OPTIONS_NATIVE "${_libc_native_default}" CACHE STRING + "Compile options for host-native builds. Set to empty to disable -march=native.") + # Making sure ALL_CPU_FEATURES is sorted. 
list(SORT ALL_CPU_FEATURES) From 14ac9dfdf3acbbde77e6c5461ab798bd646efe8f Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Fri, 8 May 2026 14:41:36 -0700 Subject: [PATCH 107/538] [lldb] Add lldb.summary and lldb.synthetic decorators (#195351) Adds two new decorators, `@lldb.summary` and `@lldb.synthetic`, analogous to the existing `@lldb.command` decorator. ```python @lldb.summary("MyType") def MyType_summary(valobj, _): return "summary string" @lldb.synthetic("MyContainer") class MyContainerSynthetic: def __init__(self, valobj, _): ... ``` These decorators result in `type summary add` and `type synthetic add` commands being run. An additional motivation: these decorators will make it straightforward to invoke the Python-to-LLDB formatter bytecode compiler (`formatter_bytecode.Compiler`), which currently requires command-line flags to know how to register formatters. With these decorators, the registration metadata is associated directly with the implementing function or class. See the docstrings and formatters.py test fixture for usage examples. 
Assisted-by: claude --- lldb/bindings/python/python-extensions.swig | 54 ++++++++++ lldb/docs/use/variable.rst | 102 ++++++++++++++++++ .../decorator-formatters/Makefile | 2 + .../TestDecoratorFormatters.py | 37 +++++++ .../decorator-formatters/broken_formatter.py | 6 ++ .../decorator-formatters/formatters.py | 50 +++++++++ .../decorator-formatters/main.cpp | 15 +++ 7 files changed, 266 insertions(+) create mode 100644 lldb/test/API/functionalities/data-formatter/decorator-formatters/Makefile create mode 100644 lldb/test/API/functionalities/data-formatter/decorator-formatters/TestDecoratorFormatters.py create mode 100644 lldb/test/API/functionalities/data-formatter/decorator-formatters/broken_formatter.py create mode 100644 lldb/test/API/functionalities/data-formatter/decorator-formatters/formatters.py create mode 100644 lldb/test/API/functionalities/data-formatter/decorator-formatters/main.cpp diff --git a/lldb/bindings/python/python-extensions.swig b/lldb/bindings/python/python-extensions.swig index 40fa76872ee96..8d6d825b65b3c 100644 --- a/lldb/bindings/python/python-extensions.swig +++ b/lldb/bindings/python/python-extensions.swig @@ -282,6 +282,60 @@ def command(command_name=None, doc=None): return callable + +def _add_formatter(cmd: str, type_name: str, options: dict): + import shlex + + type_name = shlex.quote(type_name) + + # Convert `option=True` to "--option", all others to "--option value". 
+ flag_list = [] + for key, value in options.items(): + flag = key.replace("_", "-") + if value is True: + flag_list.append(f"--{flag}") + else: + flag_list.extend((f"--{flag}", value)) + + def decorator(obj): + qualified = f"{obj.__module__}.{obj.__qualname__}" + if isinstance(obj, type): + flag_list.extend(("--python-class", qualified)) + elif callable(obj): + flag_list.extend(("--python-function", qualified)) + + result = SBCommandReturnObject() + flags = " ".join(flag_list) + interp = debugger.GetCommandInterpreter() + interp.HandleCommand(f"{cmd} {flags} {type_name}", result) + if not result.Succeeded(): + raise RuntimeError(result.GetError()) + return obj + + return decorator + + +def summary(type_name, **kwargs): + """A decorator that registers a function as an LLDB type summary provider. + + @lldb.summary("MyType") + def MyTypeSummary(valobj, _): + return "summary string" + """ + return _add_formatter("type summary add", type_name, kwargs) + + +def synthetic(type_name, **kwargs): + """A decorator that registers a class as an LLDB synthetic child provider. + + @lldb.synthetic("MyType") + class MyTypeSynthetic: + def __init__(self, valobj, _): + ... + """ + return _add_formatter("type synthetic add", type_name, kwargs) + + class declaration(object): '''A class that represents a source declaration location with file, line and column.''' def __init__(self, file, line, col): diff --git a/lldb/docs/use/variable.rst b/lldb/docs/use/variable.rst index 82bb3c7ba1e11..3261edbc49797 100644 --- a/lldb/docs/use/variable.rst +++ b/lldb/docs/use/variable.rst @@ -882,6 +882,10 @@ you to input a Python script as a summary: LLDB will emit a warning if it is unable to find the function you passed, but will still register the binding. +- using the ``@lldb.summary`` decorator on a function definition. This combines + the definition and registration into a single step. See + :ref:`decorator-formatters` for details. 
+ Regular Expression Typenames ---------------------------- @@ -1146,6 +1150,10 @@ children provider in LLDB: y = "Hello world" } +As an alternative to the two-step import-then-register pattern above, you can +use the ``@lldb.synthetic`` decorator to combine both steps. See +:ref:`decorator-formatters` for details. + LLDB has synthetic children providers for a core subset of STL classes, both in the version provided by libstdcpp and by libcxx, as well as for several Foundation classes. @@ -1212,6 +1220,100 @@ or other special provisions and the expression command chooses to ignore synthetic children in the interest of equivalency with code you asked to have compiled from source. +.. _decorator-formatters: + +Decorator-Based Formatter Registration +--------------------------------------- + +Beginning in version 23, LLDB provides Python decorators that combine the +definition and registration of formatters into a single step. When a module +using these decorators is loaded via ``command script import``, the formatters +are automatically registered. + +``@lldb.summary`` ++++++++++++++++++ + +The ``@lldb.summary`` decorator registers a Python function as a type summary +provider: + +.. code-block:: python + + import lldb + + @lldb.summary("Person") + def PersonSummary(valobj: lldb.SBValue, _) -> str: + name = valobj.GetChildMemberWithName("name").GetSummary() + age = valobj.GetChildMemberWithName("age").GetValueAsUnsigned() + return f"name={name}, age={age}" + +This is equivalent to defining the function and then running: + +:: + + (lldb) type summary add --python-function module.PersonSummary Person + +``@lldb.synthetic`` ++++++++++++++++++++ + +The ``@lldb.synthetic`` decorator registers a Python class as a synthetic child +provider: + +.. 
code-block:: python + + import lldb + + @lldb.synthetic("Person") + class PersonChildren: + def __init__(self, valobj: lldb.SBValue, _) -> None: + self.valobj = valobj + + def update(self) -> bool: + self.name = self.valobj.GetChildMemberWithName("name") + self.age = self.valobj.GetChildMemberWithName("age") + return True + + def num_children(self) -> int: + return 2 + + def get_child_at_index(self, index) -> lldb.SBValue: + if index == 0: + return self.name + if index == 1: + return self.age + return lldb.SBValue() + +This is equivalent to defining the class and then running: + +:: + + (lldb) type synthetic add --python-class module.PersonChildren Person + +Passing Options ++++++++++++++++ + +Both decorators accept keyword arguments that map to command-line flags of the +corresponding ``type summary add`` or ``type synthetic add`` commands. +Underscores in keyword names are converted to hyphens. A value of ``True`` +produces a bare flag; any other value is passed as the flag's argument. + +For example, to register a summary for a regex type pattern with the expand +flag: + +.. code-block:: python + + @lldb.summary("^std::vector<.+>$", regex=True, expand=True) + def VectorSummary(valobj, internal_dict): + ... + +This is equivalent to: + +:: + + (lldb) type summary add --python-function module.VectorSummary --regex --expand "^std::vector<.+>$" + +Other commonly used options include ``category`` to place the formatter in a +specific category. 
+ Filters ------- diff --git a/lldb/test/API/functionalities/data-formatter/decorator-formatters/Makefile b/lldb/test/API/functionalities/data-formatter/decorator-formatters/Makefile new file mode 100644 index 0000000000000..3d0b98f13f3d7 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/decorator-formatters/Makefile @@ -0,0 +1,2 @@ +CXX_SOURCES := main.cpp +include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/decorator-formatters/TestDecoratorFormatters.py b/lldb/test/API/functionalities/data-formatter/decorator-formatters/TestDecoratorFormatters.py new file mode 100644 index 0000000000000..5972442ec4c23 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/decorator-formatters/TestDecoratorFormatters.py @@ -0,0 +1,37 @@ +""" +Test @lldb.summary and @lldb.synthetic decorators lead to automatic formatter +registration, when using `command script import`. +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_summary(self): + self.build() + self.runCmd("command script import formatters.py") + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "break here", lldb.SBFileSpec("main.cpp") + ) + frame = thread.selected_frame + ic = frame.var("ic") + self.assertEqual(ic.summary, "size=2") + fc = frame.var("fc") + self.assertEqual(fc.summary, "size=2") + + def test_synthetic(self): + self.build() + self.runCmd("command script import formatters.py") + lldbutil.run_to_source_breakpoint( + self, "break here", lldb.SBFileSpec("main.cpp") + ) + self.expect("v ic", substrs=["[0] = 10", "[1] = 20"]) + self.expect("v fc", substrs=["[0] = 10.5", "[1] = 20.25"]) + + def test_failure(self): + self.expect("command script import broken_formatter.py", error=True) diff --git a/lldb/test/API/functionalities/data-formatter/decorator-formatters/broken_formatter.py 
b/lldb/test/API/functionalities/data-formatter/decorator-formatters/broken_formatter.py new file mode 100644 index 0000000000000..64f60973fde54 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/decorator-formatters/broken_formatter.py @@ -0,0 +1,6 @@ +import lldb + + +@lldb.summary("Ignored", invalid=True) +def IgnoredSummary(valobj: lldb.SBValue, _) -> str: + return "nope" diff --git a/lldb/test/API/functionalities/data-formatter/decorator-formatters/formatters.py b/lldb/test/API/functionalities/data-formatter/decorator-formatters/formatters.py new file mode 100644 index 0000000000000..ee9c8443f5a03 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/decorator-formatters/formatters.py @@ -0,0 +1,50 @@ +import lldb + + +def _summary(valobj: lldb.SBValue) -> str: + size = ( + valobj.GetNonSyntheticValue() + .GetChildMemberWithName("size") + .GetValueAsUnsigned() + ) + return f"size={size}" + + +@lldb.summary("IntContainer", expand=True) +def IntContainerSummary(valobj: lldb.SBValue, _): + return _summary(valobj) + + +@lldb.summary("^Container<.+>$", regex=True, expand=True) +def ContainerSummary(valobj, _): + return _summary(valobj) + + +class _ContainerSyntheticBase: + valobj: lldb.SBValue + count: int + items: lldb.SBValue + + def __init__(self, valobj: lldb.SBValue, _) -> None: + self.valobj = valobj + + def update(self) -> bool: + self.count = self.valobj.GetChildMemberWithName("size").GetValueAsSigned() + self.items = self.valobj.GetChildMemberWithName("items") + return True + + def num_children(self) -> int: + return self.count + + def get_child_at_index(self, index: int) -> lldb.SBValue: + return self.items.GetChildAtIndex(index) + + +@lldb.synthetic("IntContainer") +class IntContainerSynthetic(_ContainerSyntheticBase): + pass + + +@lldb.synthetic("^Container<.+>$", regex=True) +class ContainerSynthetic(_ContainerSyntheticBase): + pass diff --git a/lldb/test/API/functionalities/data-formatter/decorator-formatters/main.cpp 
b/lldb/test/API/functionalities/data-formatter/decorator-formatters/main.cpp new file mode 100644 index 0000000000000..35bf7253d929b --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/decorator-formatters/main.cpp @@ -0,0 +1,15 @@ +struct IntContainer { + int items[3]; + int size; +}; + +template struct Container { + T items[3]; + int size; +}; + +int main() { + IntContainer ic = {{10, 20, 0}, 2}; + Container fc = {{10.5, 20.25, 0}, 2}; + return 0; // break here +} From 59152f43f351aac8798646ee28df061798512370 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 8 May 2026 22:58:59 +0100 Subject: [PATCH 108/538] [X86] combine-add.ll - regenerate to show missing add asm comments (#196647) --- llvm/test/CodeGen/X86/combine-add.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index 5efc8cd111d1f..c9455d5c4db4b 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -209,14 +209,14 @@ define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm2, %xmm1 ; SSE-NEXT: psubd %xmm1, %xmm0 -; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_add_sub_sub: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3] ; AVX-NEXT: retq %1 = sub <4 x i32> %a, %b %2 = sub <4 x i32> , %d From d063eeb4dd340de9f2167529aaccf5e26f22de7e Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Fri, 8 May 2026 15:34:12 -0700 Subject: [PATCH 109/538] [lld][WebAssembly] Remove the experimental warning for PIC/dynamic linking (#196566) The current dynamic linking support has been used for several years 
now both in emscripten and in wasi-sdk and is documented https://github.com/WebAssembly/tool-conventions/blob/main/DynamicLinking.md. We did/do have plans to develop another version of the dynamic linking ABI that doesn't use a global symbol namespace, and that can still happen, but the current API is clearly production worthy regardless of future plans. This change removes the linker warning and the corresponding `--experimental-pic` flag. If we do want to still make breaking changes to the dylink format we can rename the `dylink.1` section (which already contains a version number). This change leads the way for enabling shared libraries by default in emscripten. --- clang/lib/Interpreter/Wasm.cpp | 1 - lld/docs/WebAssembly.rst | 4 +-- lld/test/wasm/bad-data-relocs.s | 2 +- lld/test/wasm/bsymbolic.s | 4 +-- lld/test/wasm/compact-imports.s | 2 +- lld/test/wasm/data-segments.ll | 6 ++-- lld/test/wasm/dylink-non-pie.s | 2 +- lld/test/wasm/dylink.s | 12 ++++---- lld/test/wasm/global-base.test | 4 +-- lld/test/wasm/libsearch.s | 28 +++++++++---------- lld/test/wasm/lto/pic-empty.s | 2 +- lld/test/wasm/lto/relocation-model.ll | 2 +- lld/test/wasm/no-shlib-sigcheck.s | 10 +++---- lld/test/wasm/pie.s | 6 ++-- lld/test/wasm/rpath.s | 2 +- lld/test/wasm/runtime-relocations-himem.s | 2 +- lld/test/wasm/shared-export-dynamic.s | 4 +-- lld/test/wasm/shared-lazy.s | 12 ++++---- lld/test/wasm/shared-memory-bss.s | 2 +- lld/test/wasm/shared-needed.s | 6 ++-- lld/test/wasm/shared-weak-symbols.s | 2 +- lld/test/wasm/shared-weak-undefined.s | 4 +-- lld/test/wasm/shared.s | 2 +- lld/test/wasm/shared64.s | 2 +- lld/test/wasm/static-error.s | 4 +-- lld/test/wasm/tag-section.ll | 2 +- lld/test/wasm/tls-export.s | 2 +- lld/test/wasm/tls-non-shared-memory-basic.s | 2 +- lld/test/wasm/tls-non-shared-memory.s | 6 ++-- lld/test/wasm/tls-relocations.s | 2 +- lld/test/wasm/undef-shared.s | 2 +- lld/test/wasm/undefined-data.s | 2 +- lld/test/wasm/unresolved-symbols-dynamic.s | 3 +-
lld/test/wasm/unresolved-symbols.s | 2 +- lld/test/wasm/unsupported-pic-relocations.s | 12 ++++---- lld/test/wasm/unsupported-pic-relocations64.s | 12 ++++---- lld/test/wasm/weak-undefined-pic.s | 2 +- lld/wasm/Config.h | 6 ++-- lld/wasm/Driver.cpp | 23 --------------- lld/wasm/Options.td | 4 +-- 40 files changed, 91 insertions(+), 118 deletions(-) diff --git a/clang/lib/Interpreter/Wasm.cpp b/clang/lib/Interpreter/Wasm.cpp index 007227c73dc5f..96600cf9fa6d0 100644 --- a/clang/lib/Interpreter/Wasm.cpp +++ b/clang/lib/Interpreter/Wasm.cpp @@ -120,7 +120,6 @@ llvm::Error WasmIncrementalExecutor::addModule(PartialTranslationUnit &PTU) { std::vector LinkerArgs = {"wasm-ld", "-shared", "--import-memory", - "--experimental-pic", "--stack-first", "--allow-undefined", ObjectFileName.c_str(), diff --git a/lld/docs/WebAssembly.rst b/lld/docs/WebAssembly.rst index a7e1bc4cbe97b..389fc0ac25553 100644 --- a/lld/docs/WebAssembly.rst +++ b/lld/docs/WebAssembly.rst @@ -108,9 +108,7 @@ WebAssembly-specific options: this means inputs should be compiled with `-fPIC` (i.e. `pic` or `dynamic-no-pic` relocation models). This options is useful for linking binaries that are themselves static (non-relocatable) but whose undefined - symbols are resolved by a dynamic linker. Since the dynamic linking API is - experimental, this option currently requires `--experimental-pic` to also - be specified. + symbols are resolved by a dynamic linker. .. option:: --import-memory diff --git a/lld/test/wasm/bad-data-relocs.s b/lld/test/wasm/bad-data-relocs.s index 7e2ef3e1dc3b7..6e95c5bf58ac1 100644 --- a/lld/test/wasm/bad-data-relocs.s +++ b/lld/test/wasm/bad-data-relocs.s @@ -2,7 +2,7 @@ ## generated in `-shared/`-pie` binaries. 
# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: not wasm-ld -pie --experimental-pic %t.o -o %t.wasm 2>&1 | FileCheck %s +# RUN: not wasm-ld -pie %t.o -o %t.wasm 2>&1 | FileCheck %s # CHECK: wasm-ld: error: invalid runtime relocation type in data section: R_WASM_FUNCTION_INDEX_I32 diff --git a/lld/test/wasm/bsymbolic.s b/lld/test/wasm/bsymbolic.s index 872fb1b53486b..f22a916b24b32 100644 --- a/lld/test/wasm/bsymbolic.s +++ b/lld/test/wasm/bsymbolic.s @@ -2,10 +2,10 @@ // RUN: wasm-ld --no-entry -Bsymbolic %t.o -o %t.wasm 2>&1 | FileCheck -check-prefix=WARNING %s // WARNING: warning: -Bsymbolic is only meaningful when combined with -shared -// RUN: wasm-ld --experimental-pic -shared %t.o -o %t0.so +// RUN: wasm-ld -shared %t.o -o %t0.so // RUN: obj2yaml %t0.so | FileCheck -check-prefix=NOOPTION %s -// RUN: wasm-ld --experimental-pic -shared -Bsymbolic %t.o -o %t1.so +// RUN: wasm-ld -shared -Bsymbolic %t.o -o %t1.so // RUN: obj2yaml %t1.so | FileCheck -check-prefix=SYMBOLIC %s // NOOPTION: - Type: IMPORT diff --git a/lld/test/wasm/compact-imports.s b/lld/test/wasm/compact-imports.s index 4c6ddc2456962..7c7ed198f57d4 100644 --- a/lld/test/wasm/compact-imports.s +++ b/lld/test/wasm/compact-imports.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld --experimental-pic --unresolved-symbols=import-dynamic %t.o -o %t.wasm +# RUN: wasm-ld --unresolved-symbols=import-dynamic %t.o -o %t.wasm .functype foo () -> () .functype bar () -> () diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll index 237f4285e3763..7a18fd5efb655 100644 --- a/lld/test/wasm/data-segments.ll +++ b/lld/test/wasm/data-segments.ll @@ -18,7 +18,7 @@ ; RUN: obj2yaml %t.bulk-mem64.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE64 ;; In -pie mode segments are combined into one active segment. 
-; RUN: wasm-ld --experimental-pic --import-memory -pie -no-gc-sections --no-entry %t.atomics.bulk-mem.pic.o -o %t.pic.wasm +; RUN: wasm-ld --import-memory -pie -no-gc-sections --no-entry %t.atomics.bulk-mem.pic.o -o %t.pic.wasm ; RUN: obj2yaml %t.pic.wasm | FileCheck %s --check-prefixes ACTIVE-PIC ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes PIC-NON-SHARED-DIS @@ -33,12 +33,12 @@ ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i64 ;; Also test in combination with PIC/pie -; RUN: wasm-ld --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic.o -o %t.shared.pic.wasm +; RUN: wasm-ld -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic.o -o %t.shared.pic.wasm ; RUN: obj2yaml %t.shared.pic.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE32-PIC ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.shared.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 ;; Also test in combination with PIC/pie + wasm64 -; RUN: wasm-ld -mwasm64 --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic-mem64.o -o %t.pic-mem64.wasm +; RUN: wasm-ld -mwasm64 -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic-mem64.o -o %t.pic-mem64.wasm ; RUN: obj2yaml %t.pic-mem64.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE64-PIC ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 diff --git a/lld/test/wasm/dylink-non-pie.s 
b/lld/test/wasm/dylink-non-pie.s index fddfddb4df658..1dceaeb9c9fce 100755 --- a/lld/test/wasm/dylink-non-pie.s +++ b/lld/test/wasm/dylink-non-pie.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.lib.o %p/Inputs/ret32.s -# RUN: wasm-ld -m wasm32 --experimental-pic -shared --no-entry %t.lib.o -o %t.lib.so +# RUN: wasm-ld -m wasm32 -shared --no-entry %t.lib.o -o %t.lib.so # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s # RUN: wasm-ld -m wasm32 -Bdynamic %t.o %t.lib.so -o %t.wasm # RUN: obj2yaml %t.wasm | FileCheck %s diff --git a/lld/test/wasm/dylink.s b/lld/test/wasm/dylink.s index d40778c3b2d6f..d129f9032a2d2 100644 --- a/lld/test/wasm/dylink.s +++ b/lld/test/wasm/dylink.s @@ -1,9 +1,9 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten -mattr=+exception-handling -o %t.o %s # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten %p/Inputs/ret32.s -o %t.ret32.o # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten %p/Inputs/libsearch-dyn.s -o %t.dyn.o -# RUN: wasm-ld --experimental-pic -shared %t.ret32.o %t.dyn.o -o %t.lib.so -# RUN: not wasm-ld --experimental-pic -pie -o %t.wasm %t.o 2>&1 | FileCheck --check-prefix=ERROR %s -# RUN: wasm-ld --experimental-pic -pie -o %t.wasm %t.o %t.lib.so +# RUN: wasm-ld -shared %t.ret32.o %t.dyn.o -o %t.lib.so +# RUN: not wasm-ld -pie -o %t.wasm %t.o 2>&1 | FileCheck --check-prefix=ERROR %s +# RUN: wasm-ld -pie -o %t.wasm %t.o %t.lib.so # RUN: obj2yaml %t.wasm | FileCheck %s # Same again for wasm64 @@ -11,9 +11,9 @@ # RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-emscripten -mattr=+exception-handling -o %t.o %s # RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-emscripten %p/Inputs/ret32.s -o %t.ret32.o # RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-emscripten %p/Inputs/libsearch-dyn.s -o %t.dyn.o -# RUN: wasm-ld --experimental-pic -mwasm64 -shared %t.ret32.o %t.dyn.o -o %t.lib.so -# RUN: not wasm-ld --experimental-pic -mwasm64 -pie 
-o %t.wasm %t.o 2>&1 | FileCheck --check-prefix=ERROR %s -# RUN: wasm-ld --experimental-pic -mwasm64 -pie -o %t.wasm %t.o %t.lib.so +# RUN: wasm-ld -mwasm64 -shared %t.ret32.o %t.dyn.o -o %t.lib.so +# RUN: not wasm-ld -mwasm64 -pie -o %t.wasm %t.o 2>&1 | FileCheck --check-prefix=ERROR %s +# RUN: wasm-ld -mwasm64 -pie -o %t.wasm %t.o %t.lib.so # RUN: obj2yaml %t.wasm | FileCheck %s # ERROR: error: {{.*}}: undefined symbol: ret32 diff --git a/lld/test/wasm/global-base.test b/lld/test/wasm/global-base.test index e84b8ec3ef9ce..d94446a7638bf 100644 --- a/lld/test/wasm/global-base.test +++ b/lld/test/wasm/global-base.test @@ -1,8 +1,8 @@ RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o # Check for error on `--global-base` with `-shared` and `-pie` -RUN: not wasm-ld --global-base=2048 --experimental-pic -shared -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=SHARED-ERROR -RUN: not wasm-ld --global-base=2048 --experimental-pic -pie -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=SHARED-ERROR +RUN: not wasm-ld --global-base=2048 -shared -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=SHARED-ERROR +RUN: not wasm-ld --global-base=2048 -pie -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=SHARED-ERROR SHARED-ERROR: error: --global-base may not be used with -shared/-pie # Check for error on `--global-base` which is lower than that end of the stack diff --git a/lld/test/wasm/libsearch.s b/lld/test/wasm/libsearch.s index 20f1e9b2bfa3f..f22d450242136 100644 --- a/lld/test/wasm/libsearch.s +++ b/lld/test/wasm/libsearch.s @@ -8,7 +8,7 @@ // RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown \ // RUN: %p/Inputs/use-bar.s -o %tbar.o // RUN: mkdir -p %t.dir -// RUN: wasm-ld -shared --experimental-pic %tdyn.o -o %t.dir/libls.so +// RUN: wasm-ld -shared %tdyn.o -o %t.dir/libls.so // RUN: cp -f %t.dir/libls.so %t.dir/libls2.so // RUN: rm -f %t.dir/libls.a // RUN: llvm-ar rcs %t.dir/libls.a %tst.o @@ -38,7 +38,7 @@ // STATIC: Name: _static 
// Should use explicitly specified dynamic library -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -l:libls.so +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -l:libls.so // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=DYNAMIC %s // DYNAMIC: Symbols [ // DYNAMIC-NOT: Name: _static @@ -48,13 +48,13 @@ // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s // Should prefer dynamic when linking PIE. -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=DYNAMIC %s // Check for library search order // RUN: mkdir -p %t.dir2 // RUN: cp %t.dir/libls.a %t.dir2 -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir2 -L%t.dir -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir2 -L%t.dir -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s // -L can be placed after -l @@ -65,32 +65,32 @@ // RUN: wasm-ld --emit-relocs --no-gc-sections -o %t3 %t.o --library-path %t.dir --library ls // Should not search for dynamic libraries if -Bstatic is specified -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s -// RUN: not wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o /dev/null %t.o -L%t.dir -Bstatic -lls2 2>&1 \ +// RUN: not wasm-ld -pie --emit-relocs --no-gc-sections -o /dev/null %t.o -L%t.dir -Bstatic -lls2 2>&1 \ // RUN: | FileCheck --check-prefix=NOLIB2 %s // NOLIB2: unable to find library -lls2 // -Bdynamic should restore default behaviour -// RUN: wasm-ld -pie --experimental-pic 
--emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -Bdynamic -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -Bdynamic -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=DYNAMIC %s // -Bstatic and -Bdynamic should affect only libraries which follow them -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -lls -Bstatic -Bdynamic +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -lls -Bstatic -Bdynamic // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=DYNAMIC %s -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -lls -Bdynamic +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -lls -Bdynamic // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s // Check aliases as well -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -dn -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -dn -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -non_shared -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -non_shared -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -static -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -static -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=STATIC %s -// RUN: wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -dy -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -dy -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=DYNAMIC %s -// RUN: 
wasm-ld -pie --experimental-pic --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -call_shared -lls +// RUN: wasm-ld -pie --emit-relocs --no-gc-sections -o %t3 %t.o -L%t.dir -Bstatic -call_shared -lls // RUN: llvm-readobj --symbols %t3 | FileCheck --check-prefix=DYNAMIC %s /// -r implies -Bstatic and has precedence over -Bdynamic. diff --git a/lld/test/wasm/lto/pic-empty.s b/lld/test/wasm/lto/pic-empty.s index 4fe4dffda1dcb..9b09e539583f2 100644 --- a/lld/test/wasm/lto/pic-empty.s +++ b/lld/test/wasm/lto/pic-empty.s @@ -6,7 +6,7 @@ ; See https://github.com/llvm/llvm-project/issues/51681. ; RUN: llvm-as %s -o %t.o -; RUN: wasm-ld --lto-O2 --experimental-pic -shared --no-gc-sections --export=tls_int %t.o -o %t.so +; RUN: wasm-ld --lto-O2 -shared --no-gc-sections --export=tls_int %t.o -o %t.so ; RUN: obj2yaml %t.so | FileCheck %s target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-f128:64-n32:64-S128-ni:1:10:20" diff --git a/lld/test/wasm/lto/relocation-model.ll b/lld/test/wasm/lto/relocation-model.ll index a042615b8fe1c..d783973d107ba 100644 --- a/lld/test/wasm/lto/relocation-model.ll +++ b/lld/test/wasm/lto/relocation-model.ll @@ -10,7 +10,7 @@ ;; Linking with --unresolved-symbols=import-dynamic should also generate PIC ;; code for external references. 
-; RUN: wasm-ld %t.o -o %t_import.wasm -save-temps --experimental-pic --unresolved-symbols=import-dynamic +; RUN: wasm-ld %t.o -o %t_import.wasm -save-temps --unresolved-symbols=import-dynamic ; RUN: llvm-readobj -r %t_import.wasm.lto.o | FileCheck %s --check-prefix=PIC ; PIC: R_WASM_GLOBAL_INDEX_LEB foo diff --git a/lld/test/wasm/no-shlib-sigcheck.s b/lld/test/wasm/no-shlib-sigcheck.s index 13f2a2132ac7c..951a6df8471fe 100644 --- a/lld/test/wasm/no-shlib-sigcheck.s +++ b/lld/test/wasm/no-shlib-sigcheck.s @@ -1,17 +1,17 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten -o %t.o %s # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten %p/Inputs/ret32.s -o %t.ret32.o -# RUN: wasm-ld --experimental-pic -shared %t.ret32.o -o %t.lib.so +# RUN: wasm-ld -shared %t.ret32.o -o %t.lib.so ## Fails with signature mismatch by default -# RUN: not wasm-ld --experimental-pic -pie -o %t.wasm %t.o %t.lib.so 2>&1 | FileCheck --check-prefix=ERROR %s +# RUN: not wasm-ld -pie -o %t.wasm %t.o %t.lib.so 2>&1 | FileCheck --check-prefix=ERROR %s ## Same again with shared library first. -# RUN: not wasm-ld --experimental-pic -pie -o %t.wasm %t.lib.so %t.o 2>&1 | FileCheck --check-prefix=ERROR %s +# RUN: not wasm-ld -pie -o %t.wasm %t.lib.so %t.o 2>&1 | FileCheck --check-prefix=ERROR %s ## Succeeds with --no-shlib-sigcheck added -# RUN: wasm-ld --experimental-pic -pie -o %t.wasm %t.o %t.lib.so --no-shlib-sigcheck +# RUN: wasm-ld -pie -o %t.wasm %t.o %t.lib.so --no-shlib-sigcheck # RUN: obj2yaml %t.wasm | FileCheck %s ## Same again with shared library first. 
-# RUN: wasm-ld --experimental-pic -pie -o %t.wasm %t.lib.so %t.o --no-shlib-sigcheck +# RUN: wasm-ld -pie -o %t.wasm %t.lib.so %t.o --no-shlib-sigcheck # RUN: obj2yaml %t.wasm | FileCheck %s .functype ret32 (f32) -> (i64) diff --git a/lld/test/wasm/pie.s b/lld/test/wasm/pie.s index 21eac79207318..a86f50f04ce16 100644 --- a/lld/test/wasm/pie.s +++ b/lld/test/wasm/pie.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten -o %t.o %s # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten %S/Inputs/internal_func.s -o %t.internal_func.o -# RUN: wasm-ld --no-gc-sections --experimental-pic -pie --unresolved-symbols=import-dynamic -o %t.wasm %t.o %t.internal_func.o +# RUN: wasm-ld --no-gc-sections -pie --unresolved-symbols=import-dynamic -o %t.wasm %t.o %t.internal_func.o # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DISASSEM @@ -150,7 +150,7 @@ _start: # instruction in the InitExpr. We also, therefore, do not need these globals # to be mutable. -# RUN: wasm-ld --no-gc-sections --experimental-pic -pie --unresolved-symbols=import-dynamic --extra-features=extended-const -o %t.extended.wasm %t.o %t.internal_func.o +# RUN: wasm-ld --no-gc-sections -pie --unresolved-symbols=import-dynamic --extra-features=extended-const -o %t.extended.wasm %t.o %t.internal_func.o # RUN: obj2yaml %t.extended.wasm | FileCheck %s --check-prefix=EXTENDED-CONST # EXTENDED-CONST-NOT: __wasm_apply_global_relocs @@ -207,7 +207,7 @@ _start: # to be generated along with __wasm_start as the start # function. 
-# RUN: wasm-ld --no-gc-sections --shared-memory --experimental-pic -pie --unresolved-symbols=import-dynamic -o %t.shmem.wasm %t.o %t.internal_func.o +# RUN: wasm-ld --no-gc-sections --shared-memory -pie --unresolved-symbols=import-dynamic -o %t.shmem.wasm %t.o %t.internal_func.o # RUN: obj2yaml %t.shmem.wasm | FileCheck %s --check-prefix=SHMEM # RUN: llvm-objdump --disassemble-symbols=__wasm_start --no-show-raw-insn --no-leading-addr %t.shmem.wasm | FileCheck %s --check-prefix DISASSEM-SHMEM diff --git a/lld/test/wasm/rpath.s b/lld/test/wasm/rpath.s index 53372f490e9ad..1b430b40966a9 100644 --- a/lld/test/wasm/rpath.s +++ b/lld/test/wasm/rpath.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld -shared -o %t1.wasm %t.o -rpath /a/b/c -rpath /x/y/z --experimental-pic +# RUN: wasm-ld -shared -o %t1.wasm %t.o -rpath /a/b/c -rpath /x/y/z # RUN: obj2yaml %t1.wasm | FileCheck %s # CHECK: - Type: CUSTOM diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s index a12a93a6cb933..2d39a204c7904 100644 --- a/lld/test/wasm/runtime-relocations-himem.s +++ b/lld/test/wasm/runtime-relocations-himem.s @@ -3,7 +3,7 @@ ## instruction leading to invalid binaries. 
# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o +# RUN: wasm-ld --global-base=2147483648 --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o # XUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s -- diff --git a/lld/test/wasm/shared-export-dynamic.s b/lld/test/wasm/shared-export-dynamic.s index 015d73388f2b7..e651bd0e4d0bd 100644 --- a/lld/test/wasm/shared-export-dynamic.s +++ b/lld/test/wasm/shared-export-dynamic.s @@ -1,12 +1,12 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s # By default all `default` symbols should be exported -# RUN: wasm-ld -shared --experimental-pic -o %t.wasm %t.o +# RUN: wasm-ld -shared -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s -check-prefix=DEFAULT # DEFAULT: foo # Verify that `--no-export-dynamic` works with `-shared` -# RUN: wasm-ld -shared --experimental-pic --no-export-dynamic -o %t2.wasm %t.o +# RUN: wasm-ld -shared --no-export-dynamic -o %t2.wasm %t.o # RUN: obj2yaml %t2.wasm | FileCheck %s -check-prefix=NO-EXPORT # NO-EXPORT-NOT: foo diff --git a/lld/test/wasm/shared-lazy.s b/lld/test/wasm/shared-lazy.s index f1044547203a2..be8d1d57b7e1f 100644 --- a/lld/test/wasm/shared-lazy.s +++ b/lld/test/wasm/shared-lazy.s @@ -2,14 +2,14 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten a.s -o a.o -# RUN: wasm-ld a.o --experimental-pic -shared -o a.so +# RUN: wasm-ld a.o -shared -o a.so # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten b.s -o b.o -# RUN: wasm-ld b.o --experimental-pic -shared -o b.so +# RUN: wasm-ld b.o -shared -o b.so # RUN: llvm-ar rc a.a a.o # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-emscripten ref.s -o ref.o -# RUN: 
wasm-ld a.a b.so ref.o --experimental-pic -shared -o 1.so +# RUN: wasm-ld a.a b.so ref.o -shared -o 1.so # RUN: obj2yaml 1.so | FileCheck %s -# RUN: wasm-ld a.so a.a ref.o --experimental-pic -shared -o 1.so +# RUN: wasm-ld a.so a.a ref.o -shared -o 1.so # RUN: obj2yaml 1.so | FileCheck %s ## The definitions from a.so are used and we don't extract a member from the @@ -28,9 +28,9 @@ # CHECK-NEXT: GlobalMutable: true ## The extracted x1 is defined as STB_GLOBAL. -# RUN: wasm-ld ref.o a.a b.so -o 2.so --experimental-pic -shared +# RUN: wasm-ld ref.o a.a b.so -o 2.so -shared # RUN: obj2yaml 2.so | FileCheck %s --check-prefix=CHECK2 -# RUN: wasm-ld a.a ref.o b.so -o 2.so --experimental-pic -shared +# RUN: wasm-ld a.a ref.o b.so -o 2.so -shared # RUN: obj2yaml 2.so | FileCheck %s --check-prefix=CHECK2 # CHECK2: - Type: EXPORT diff --git a/lld/test/wasm/shared-memory-bss.s b/lld/test/wasm/shared-memory-bss.s index a8a05b8014238..58a55d2b1922d 100644 --- a/lld/test/wasm/shared-memory-bss.s +++ b/lld/test/wasm/shared-memory-bss.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld --experimental-pic -shared --shared-memory -o %t.so %t.o +# RUN: wasm-ld -shared --shared-memory -o %t.so %t.o # RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.so | FileCheck %s # RUN: obj2yaml %t.so | FileCheck %s --check-prefix=YAML diff --git a/lld/test/wasm/shared-needed.s b/lld/test/wasm/shared-needed.s index a9df361f2e8d9..bdc7dc3ce7d65 100644 --- a/lld/test/wasm/shared-needed.s +++ b/lld/test/wasm/shared-needed.s @@ -1,16 +1,16 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o -# RUN: wasm-ld -shared --experimental-pic -o %t.ret32.so %t.ret32.o +# RUN: wasm-ld -shared -o %t.ret32.so %t.ret32.o # RUN: obj2yaml %t.ret32.so | FileCheck %s -check-prefix=SO1 # Without linking against the ret32.so shared object we 
expect an undefined # symbol error -# RUN: not wasm-ld -shared --experimental-pic -o %t.so %t.o 2>&1 | FileCheck %s --check-prefix=ERROR +# RUN: not wasm-ld -shared -o %t.so %t.o 2>&1 | FileCheck %s --check-prefix=ERROR # ERROR: undefined symbol: ret32 -# RUN: wasm-ld -shared --experimental-pic -o %t.so %t.o %t.ret32.so +# RUN: wasm-ld -shared -o %t.so %t.o %t.ret32.so # RUN: obj2yaml %t.so | FileCheck %s -check-prefix=SO2 diff --git a/lld/test/wasm/shared-weak-symbols.s b/lld/test/wasm/shared-weak-symbols.s index df049ce4600fe..a287361c7d1c5 100644 --- a/lld/test/wasm/shared-weak-symbols.s +++ b/lld/test/wasm/shared-weak-symbols.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld --experimental-pic -shared -o %t.wasm %t.o +# RUN: wasm-ld -shared -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump -d %t.wasm | FileCheck %s -check-prefix=ASM diff --git a/lld/test/wasm/shared-weak-undefined.s b/lld/test/wasm/shared-weak-undefined.s index 58c2e3cd46b5a..7be38a1cea30b 100644 --- a/lld/test/wasm/shared-weak-undefined.s +++ b/lld/test/wasm/shared-weak-undefined.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld --experimental-pic -shared -o %t.wasm %t.o +# RUN: wasm-ld -shared -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump -d %t.wasm | FileCheck %s -check-prefix=ASM @@ -11,7 +11,7 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.ret32.o %p/Inputs/ret32.s # RUN: rm -f %t.dir/libret32.a # RUN: llvm-ar cru %t.dir/libret32.a %t.ret32.o -# RUN: wasm-ld --experimental-pic -shared -o %t.ret32.wasm %t.o %t.dir/libret32.a +# RUN: wasm-ld -shared -o %t.ret32.wasm %t.o %t.dir/libret32.a # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump -d %t.wasm | FileCheck %s -check-prefix=ASM diff --git a/lld/test/wasm/shared.s b/lld/test/wasm/shared.s index 5b40d4ebee7ab..1ea95d90ca0af 100644 --- 
a/lld/test/wasm/shared.s +++ b/lld/test/wasm/shared.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: wasm-ld --experimental-pic --unresolved-symbols=import-dynamic -shared -o %t.wasm %t.o +# RUN: wasm-ld --unresolved-symbols=import-dynamic -shared -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS diff --git a/lld/test/wasm/shared64.s b/lld/test/wasm/shared64.s index 831116d4d7fe7..da0577b03ec73 100644 --- a/lld/test/wasm/shared64.s +++ b/lld/test/wasm/shared64.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-unknown -o %t.o %s -# RUN: wasm-ld -mwasm64 --experimental-pic --unresolved-symbols=import-dynamic -shared -o %t.wasm %t.o +# RUN: wasm-ld -mwasm64 --unresolved-symbols=import-dynamic -shared -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS diff --git a/lld/test/wasm/static-error.s b/lld/test/wasm/static-error.s index 3557506a5f07a..b36e6e8634136 100644 --- a/lld/test/wasm/static-error.s +++ b/lld/test/wasm/static-error.s @@ -1,7 +1,7 @@ // RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o -// RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o +// RUN: wasm-ld -shared -o %t.so %t.o -// RUN: wasm-ld --experimental-pic -pie -o /dev/null %t.o %t.so +// RUN: wasm-ld -pie -o /dev/null %t.o %t.so // RUN: not wasm-ld -o /dev/null -static %t.o %t.so 2>&1 | FileCheck %s // CHECK: attempted static link of dynamic object diff --git a/lld/test/wasm/tag-section.ll b/lld/test/wasm/tag-section.ll index 45a578fb12f4f..86c891d3bb78d 100644 --- a/lld/test/wasm/tag-section.ll +++ b/lld/test/wasm/tag-section.ll @@ -13,7 +13,7 @@ ; RUN: llc -filetype=obj 
-wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -relocation-model=pic %p/Inputs/tag-section1.ll -o %t1.o ; RUN: llc -filetype=obj -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -relocation-model=pic %p/Inputs/tag-section2.ll -o %t2.o ; RUN: llc -filetype=obj -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -relocation-model=pic %s -o %t.o -; RUN: wasm-ld --import-undefined --experimental-pic --unresolved-symbols=import-dynamic -pie -o %t_pic.wasm %t.o %t1.o %t2.o +; RUN: wasm-ld --import-undefined --unresolved-symbols=import-dynamic -pie -o %t_pic.wasm %t.o %t1.o %t2.o ; RUN: obj2yaml %t_pic.wasm | FileCheck %s --check-prefix=PIC target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" diff --git a/lld/test/wasm/tls-export.s b/lld/test/wasm/tls-export.s index 00c535c4689ca..8660a71cf5448 100644 --- a/lld/test/wasm/tls-export.s +++ b/lld/test/wasm/tls-export.s @@ -1,6 +1,6 @@ # RUN: split-file %s %t # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t/define-tls.o %t/define-tls.s -# RUN: wasm-ld -shared --experimental-pic -o %t/define-tls.so %t/define-tls.o +# RUN: wasm-ld -shared -o %t/define-tls.so %t/define-tls.o # RUN: obj2yaml %t/define-tls.so | FileCheck %s #--- define-tls.s diff --git a/lld/test/wasm/tls-non-shared-memory-basic.s b/lld/test/wasm/tls-non-shared-memory-basic.s index 66ccf8f4f945c..36d56060bd606 100644 --- a/lld/test/wasm/tls-non-shared-memory-basic.s +++ b/lld/test/wasm/tls-non-shared-memory-basic.s @@ -22,7 +22,7 @@ tls1: # RUN: wasm-ld --no-gc-sections --no-entry -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s -# RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o +# RUN: wasm-ld -shared -o %t.so %t.o # RUN: obj2yaml %t.so | FileCheck %s --check-prefix=PIC # CHECK: - Type: DATA diff --git a/lld/test/wasm/tls-non-shared-memory.s b/lld/test/wasm/tls-non-shared-memory.s index 0d73acb429b18..0a87ade7efb2e 100644 --- a/lld/test/wasm/tls-non-shared-memory.s +++ 
b/lld/test/wasm/tls-non-shared-memory.s @@ -46,13 +46,13 @@ tls1: # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump --disassemble-symbols=get_tls1 --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS -# RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o +# RUN: wasm-ld -shared -o %t.so %t.o # RUN: obj2yaml %t.so | FileCheck %s --check-prefixes=SHARED,PIC -# RUN: wasm-ld --experimental-pic --no-gc-sections --no-entry -pie -o %t-pie.wasm %t.o +# RUN: wasm-ld --no-gc-sections --no-entry -pie -o %t-pie.wasm %t.o # RUN: obj2yaml %t-pie.wasm | FileCheck %s --check-prefixes=PIE,PIC -# RUN: wasm-ld --experimental-pic --features=atomics,bulk-memory,extended-const --no-gc-sections --no-entry -pie -o %t-extended-const.wasm %t.o +# RUN: wasm-ld --features=atomics,bulk-memory,extended-const --no-gc-sections --no-entry -pie -o %t-extended-const.wasm %t.o # RUN: obj2yaml %t-extended-const.wasm | FileCheck %s --check-prefixes=EXT-CONST # CHECK: - Type: GLOBAL diff --git a/lld/test/wasm/tls-relocations.s b/lld/test/wasm/tls-relocations.s index 7260d72535a00..9679074d6a0db 100644 --- a/lld/test/wasm/tls-relocations.s +++ b/lld/test/wasm/tls-relocations.s @@ -26,7 +26,7 @@ tls_sym: .int8 11 .ascii "bulk-memory" -# RUN: wasm-ld --experimental-pic -pie -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o +# RUN: wasm-ld -pie -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s # RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck --check-prefix=ASM %s -- diff --git a/lld/test/wasm/undef-shared.s b/lld/test/wasm/undef-shared.s index 4c270880ef531..44b5c1191944e 100644 --- a/lld/test/wasm/undef-shared.s +++ b/lld/test/wasm/undef-shared.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o -# RUN: not wasm-ld --experimental-pic %t.o -o /dev/null -shared 2>&1 | FileCheck %s +# RUN: not wasm-ld %t.o -o /dev/null -shared 2>&1 | FileCheck %s # 
CHECK: error: {{.*}}: undefined symbol: hidden .global hidden diff --git a/lld/test/wasm/undefined-data.s b/lld/test/wasm/undefined-data.s index 5e2a41606612a..6c3692025c424 100644 --- a/lld/test/wasm/undefined-data.s +++ b/lld/test/wasm/undefined-data.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s # RUN: not wasm-ld -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=UNDEF # RUN: wasm-ld --allow-undefined -o %t.wasm %t.o -# RUN: not wasm-ld --experimental-pic -shared --unresolved-symbols=import-dynamic -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=SHARED +# RUN: not wasm-ld -shared --unresolved-symbols=import-dynamic -o %t.wasm %t.o 2>&1 | FileCheck %s -check-prefix=SHARED .globl _start _start: diff --git a/lld/test/wasm/unresolved-symbols-dynamic.s b/lld/test/wasm/unresolved-symbols-dynamic.s index c3d4a753acf60..501c30defc7ff 100644 --- a/lld/test/wasm/unresolved-symbols-dynamic.s +++ b/lld/test/wasm/unresolved-symbols-dynamic.s @@ -1,8 +1,7 @@ # Unresolve data symbols are allowing under import-dynamic when GOT # relocations are used # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t-dynamic.o -# RUN: wasm-ld %t-dynamic.o -o %t.wasm --unresolved-symbols=import-dynamic 2>&1 | FileCheck -check-prefix=WARN %s -# WARN: wasm-ld: warning: dynamic imports are not yet stable (--unresolved-symbols=import-dynamic) +# RUN: wasm-ld %t-dynamic.o -o %t.wasm --unresolved-symbols=import-dynamic # RUN: obj2yaml %t.wasm | FileCheck %s .functype undef () -> () diff --git a/lld/test/wasm/unresolved-symbols.s b/lld/test/wasm/unresolved-symbols.s index d83a63ab3c576..7367e6fddf76a 100644 --- a/lld/test/wasm/unresolved-symbols.s +++ b/lld/test/wasm/unresolved-symbols.s @@ -85,7 +85,7 @@ # RUN: llvm-readobj %t4.wasm > /dev/null 2>&1 ## import-dynamic should fail due to incompatible relocations. 
-# RUN: not wasm-ld %t/main.o -o %t5.wasm --experimental-pic --unresolved-symbols=import-dynamic 2>&1 | FileCheck -check-prefix=ERRNOPIC %s +# RUN: not wasm-ld %t/main.o -o %t5.wasm --unresolved-symbols=import-dynamic 2>&1 | FileCheck -check-prefix=ERRNOPIC %s # ERRNOPIC: relocation R_WASM_MEMORY_ADDR_SLEB cannot be used against symbol `undef_data`; recompile with -fPIC # ERRNOPIC: relocation R_WASM_TABLE_INDEX_SLEB cannot be used against symbol `undef_func`; recompile with -fPIC diff --git a/lld/test/wasm/unsupported-pic-relocations.s b/lld/test/wasm/unsupported-pic-relocations.s index 2f85afa02c88b..da83ec8dc9a32 100644 --- a/lld/test/wasm/unsupported-pic-relocations.s +++ b/lld/test/wasm/unsupported-pic-relocations.s @@ -1,23 +1,23 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null 2>&1 | \ +# RUN: not wasm-ld -shared %t.o -o /dev/null 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=report-all 2>&1 | \ +# RUN: not wasm-ld -shared %t.o -o /dev/null --unresolved-symbols=report-all 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --warn-unresolved-symbols 2>&1 | \ +# RUN: not wasm-ld -shared %t.o -o /dev/null --warn-unresolved-symbols 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=ignore-all 2>&1 | \ +# RUN: not wasm-ld -shared %t.o -o /dev/null --unresolved-symbols=ignore-all 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ +# RUN: not wasm-ld -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ # RUN: FileCheck %s ## These errors should not be reported under -r/--relocation (i.e. 
when ## generating an object file) -# RUN: wasm-ld --experimental-pic -r %t.o -o /dev/null +# RUN: wasm-ld -r %t.o -o /dev/null .functype external_func () -> () diff --git a/lld/test/wasm/unsupported-pic-relocations64.s b/lld/test/wasm/unsupported-pic-relocations64.s index df885b8d75fbe..330738bb020cb 100644 --- a/lld/test/wasm/unsupported-pic-relocations64.s +++ b/lld/test/wasm/unsupported-pic-relocations64.s @@ -1,23 +1,23 @@ # RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-unknown -o %t.o %s -# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null 2>&1 | \ +# RUN: not wasm-ld -mwasm64 -shared %t.o -o /dev/null 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=report-all 2>&1 | \ +# RUN: not wasm-ld -mwasm64 -shared %t.o -o /dev/null --unresolved-symbols=report-all 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --warn-unresolved-symbols 2>&1 | \ +# RUN: not wasm-ld -mwasm64 -shared %t.o -o /dev/null --warn-unresolved-symbols 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=ignore-all 2>&1 | \ +# RUN: not wasm-ld -mwasm64 -shared %t.o -o /dev/null --unresolved-symbols=ignore-all 2>&1 | \ # RUN: FileCheck %s -# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ +# RUN: not wasm-ld -mwasm64 -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ # RUN: FileCheck %s ## These errors should not be reported under -r/--relocation (i.e. 
when ## generating an object file) -# RUN: wasm-ld -mwasm64 --experimental-pic -r %t.o -o /dev/null +# RUN: wasm-ld -mwasm64 -r %t.o -o /dev/null .functype external_func () -> () diff --git a/lld/test/wasm/weak-undefined-pic.s b/lld/test/wasm/weak-undefined-pic.s index 1a3a1715b4bb9..66232878be8af 100644 --- a/lld/test/wasm/weak-undefined-pic.s +++ b/lld/test/wasm/weak-undefined-pic.s @@ -74,7 +74,7 @@ _start: # With `-pie` or `-shared` the resolution should be deferred to the dynamic # linker and the function address should be imported as GOT.func.foo. # -# RUN: wasm-ld --experimental-pic -pie %t.o -o %t3.wasm +# RUN: wasm-ld -pie %t.o -o %t3.wasm # RUN: obj2yaml %t3.wasm | FileCheck %s --check-prefix=IMPORT # IMPORT: - Type: IMPORT diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 31e08e4e248a4..491bf9233b0cf 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -56,7 +56,6 @@ struct Config { bool compressRelocations; bool demangle; bool disableVerify; - bool experimentalPic; bool emitRelocs; bool exportAll; bool exportDynamic; @@ -82,8 +81,9 @@ struct Config { bool stripAll; bool stripDebug; bool stackFirst; - // Because dyamanic linking under Wasm is still experimental we default to - // static linking + // Static linking is currently the default under WebAssembly. This may + // change at some point in the future if dynamic linking becomes more widely + // used. 
bool isStatic = true; bool thinLTOEmitImportsFiles; bool thinLTOEmitIndexFiles; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 508c6b9df90bd..2cba6ae540526 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -535,7 +535,6 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); ctx.arg.disableVerify = args.hasArg(OPT_disable_verify); ctx.arg.emitRelocs = args.hasArg(OPT_emit_relocs); - ctx.arg.experimentalPic = args.hasArg(OPT_experimental_pic); ctx.arg.entry = getEntry(args); ctx.arg.exportAll = args.hasArg(OPT_export_all); ctx.arg.exportTable = args.hasArg(OPT_export_table); @@ -795,28 +794,6 @@ static void checkOptions(opt::InputArgList &args) { error("-r and --global-base may not by used together"); } - // To begin to prepare for Module Linking-style shared libraries, start - // warning about uses of `-shared` and related flags outside of Experimental - // mode, to give anyone using them a heads-up that they will be changing. - // - // Also, warn about flags which request explicit exports. - if (!ctx.arg.experimentalPic) { - // -shared will change meaning when Module Linking is implemented. - if (ctx.arg.shared) { - warn("creating shared libraries, with -shared, is not yet stable"); - } - - // -pie will change meaning when Module Linking is implemented. 
- if (ctx.arg.pie) { - warn("creating PIEs, with -pie, is not yet stable"); - } - - if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { - warn("dynamic imports are not yet stable " - "(--unresolved-symbols=import-dynamic)"); - } - } - if (ctx.arg.bsymbolic && !ctx.arg.shared) { warn("-Bsymbolic is only meaningful when combined with -shared"); } diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 33ecf03176d36..a009cac7f57ad 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -320,6 +320,6 @@ def thinlto_prefix_replace_eq: JJ<"thinlto-prefix-replace=">; def lto_debug_pass_manager: FF<"lto-debug-pass-manager">, HelpText<"Debug new pass manager">; -// Experimental PIC mode. +// Legacy experimental PIC flag. Remove this at some point in the future. def experimental_pic: FF<"experimental-pic">, - HelpText<"Enable Experimental PIC">; + HelpText<"Legacy enable Experimental PIC flag; ignored">; From 4b248f20559b5f208d7ca8d5fb96ddbd84015f76 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Fri, 8 May 2026 15:50:24 -0700 Subject: [PATCH 110/538] [flang][cuda] Widen stream argument to i64 in stream intrinsic lowering (#196650) `genCUDASetDefaultStream` and `genCUDAStreamDestroy` build their runtime call with an `i64` stream parameter but pass the actual argument straight through, so a smaller-kind actual (e.g. the literal `0` in `cudaforSetDefaultStream(0)`) produces an ill-typed `fir.call`: ``` error: 'llvm.call' op operand type mismatch for operand 0: 'i32' != 'i64' ``` Insert a `fir.convert` to `i64` before the call, matching what `genCUDASetDefaultStreamArray` already does. 
--- .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 3 ++ flang/test/Lower/CUDA/cuda-default-stream.cuf | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index b53294b68ac92..bc95d7d2893a7 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -1135,6 +1135,8 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStream( assert(args.size() == 1); mlir::Value stream = fir::getBase(args[0]); mlir::Type i64Ty = builder.getI64Type(); + // Widen to i64 to accept smaller integer-kind actuals (e.g. literal 0). + stream = builder.createConvert(loc, i64Ty, stream); auto ctx = builder.getContext(); mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy}); auto funcOp = @@ -1172,6 +1174,7 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamDestroy( assert(args.size() == 1); mlir::Value stream = fir::getBase(args[0]); mlir::Type i64Ty = builder.getI64Type(); + stream = builder.createConvert(loc, i64Ty, stream); auto ctx = builder.getContext(); mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy}); auto funcOp = diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf index 5fc7de68b47d4..af09604865431 100644 --- a/flang/test/Lower/CUDA/cuda-default-stream.cuf +++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf @@ -49,3 +49,37 @@ end subroutine ! CHECK-LABEL: func.func @_QPstream_destroy() ! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%{{.*}}) fastmath : (i64) -> i32 + +! A default-kind (i32) actual argument must be widened to i64 before +! reaching the runtime stream call. +subroutine default_stream_i32_literal + use cuda_runtime_api + integer :: istat + istat = cudaforSetDefaultStream(0) + istat = cudaStreamDestroy(0) +end subroutine + +! 
CHECK-LABEL: func.func @_QPdefault_stream_i32_literal() +! CHECK: %[[ZERO1:.*]] = arith.constant 0 : i32 +! CHECK: %[[STRM1:.*]] = fir.convert %[[ZERO1]] : (i32) -> i64 +! CHECK: %{{.*}} = fir.call @_FortranACUFSetDefaultStream(%[[STRM1]]) fastmath : (i64) -> i32 +! CHECK: %[[ZERO2:.*]] = arith.constant 0 : i32 +! CHECK: %[[STRM2:.*]] = fir.convert %[[ZERO2]] : (i32) -> i64 +! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%[[STRM2]]) fastmath : (i64) -> i32 + +subroutine default_stream_i32_var + use cuda_runtime_api + integer :: istat + integer(4) :: s + s = 0 + istat = cudaforSetDefaultStream(s) + istat = cudaStreamDestroy(s) +end subroutine + +! CHECK-LABEL: func.func @_QPdefault_stream_i32_var() +! CHECK: %[[L1:.*]] = fir.load %{{.*}} : !fir.ref +! CHECK: %[[V1:.*]] = fir.convert %[[L1]] : (i32) -> i64 +! CHECK: %{{.*}} = fir.call @_FortranACUFSetDefaultStream(%[[V1]]) fastmath : (i64) -> i32 +! CHECK: %[[L2:.*]] = fir.load %{{.*}} : !fir.ref +! CHECK: %[[V2:.*]] = fir.convert %[[L2]] : (i32) -> i64 +! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%[[V2]]) fastmath : (i64) -> i32 From dc93944caaa93648a89e72021602556ce39003ee Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Fri, 8 May 2026 17:50:55 -0500 Subject: [PATCH 111/538] [mlir][AMDGPU] Add, unify verification of memref index counts (#196657) This PR verifies that, on operations that have `%memref[%idx0, %idx1, ...]` arguments, the number of indices matches the rank of the memref being passed in. While we're here, fixes capitalization for certain verification error messages. 
Assisted-by: Codex 5.5 (handled much of the implementation) --- .../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 3 - mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 68 ++++++-- mlir/test/Dialect/AMDGPU/invalid.mlir | 160 +++++++++++++++++- 3 files changed, 215 insertions(+), 16 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td index cea6c7c76fdc4..0ec788e21f0bf 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td @@ -1637,9 +1637,6 @@ class AMDGPU_DmaBaseOp : Variadic:$lds_indices)>, Results<(outs outType: $base)> { - // TODO: - // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. - let assemblyFormat = [{ $global `[` $global_indices `]` `,` $lds `[` $lds_indices `]` attr-dict `:` type($global) `,` type($lds) `->` type(results) }]; diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp index 03be67f33a1df..fd9a153ada2b8 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp @@ -38,6 +38,19 @@ using namespace mlir; using namespace mlir::amdgpu; +/// Verifies that the number of indices matches the rank of the indexed memref, +/// emitting an op error mentioning `indexName` on mismatch. 
+template +static LogicalResult verifyIndexCount(OpTy op, StringRef indexName, + MemRefType memrefType, + int64_t numIndices) { + int64_t rank = memrefType.getRank(); + if (rank != numIndices) + return op.emitOpError("expected ") + << rank << " " << indexName << " indices, got " << numIndices; + return success(); +} + //===----------------------------------------------------------------------===// // 8-bit float ops //===----------------------------------------------------------------------===// @@ -177,14 +190,11 @@ static LogicalResult verifyRawBufferOp(T &op) { if (!isGlobal) return op.emitOpError( - "Buffer ops must operate on a memref in global memory"); + "buffer ops must operate on a memref in global memory"); if (!bufferType.hasRank()) return op.emitOpError( - "Cannot meaningfully buffer_store to an unranked memref"); - if (static_cast(op.getIndices().size()) != bufferType.getRank()) - return op.emitOpError("Expected " + Twine(bufferType.getRank()) + - " indices to memref"); - return success(); + "cannot meaningfully buffer_store to an unranked memref"); + return verifyIndexCount(op, "buffer", bufferType, op.getIndices().size()); } LogicalResult RawBufferLoadOp::verify() { return verifyRawBufferOp(*this); } @@ -948,6 +958,12 @@ LogicalResult GatherToLDSOp::verify() { MemRefType srcType = cast(getSrc().getType()); MemRefType dstType = cast(getDst().getType()); + if (failed( + verifyIndexCount(*this, "source", srcType, getSrcIndices().size())) || + failed(verifyIndexCount(*this, "destination", dstType, + getDstIndices().size()))) + return failure(); + if (dstType.getRank() > 0 && !dstType.areTrailingDimsContiguous(1)) return emitOpError("destination type inner most dim must be contiguous"); @@ -1020,6 +1036,12 @@ LogicalResult GlobalLoadAsyncToLDSOp::verify() { MemRefType srcType = cast(getSrc().getType()); MemRefType dstType = cast(getDst().getType()); + if (failed( + verifyIndexCount(*this, "source", srcType, getSrcIndices().size())) || + 
failed(verifyIndexCount(*this, "destination", dstType, + getDstIndices().size()))) + return failure(); + if (srcType.getElementType() != dstType.getElementType()) return emitOpError("source and destination element types must match"); @@ -1050,6 +1072,10 @@ LogicalResult GlobalLoadAsyncToLDSOp::verify() { LogicalResult TransposeLoadOp::verify() { MemRefType srcType = cast(getSrc().getType()); + if (failed( + verifyIndexCount(*this, "source", srcType, getSrcIndices().size()))) + return failure(); + if (!hasWorkgroupMemorySpace(srcType.getMemorySpace())) return emitOpError("source memory address space must be Workgroup"); @@ -1086,6 +1112,10 @@ LogicalResult TransposeLoadOp::verify() { LogicalResult GlobalTransposeLoadOp::verify() { MemRefType srcType = cast(getSrc().getType()); + if (failed( + verifyIndexCount(*this, "source", srcType, getSrcIndices().size()))) + return failure(); + if (!hasGlobalMemorySpace(srcType.getMemorySpace())) return emitOpError("source memory address space must be Global"); @@ -1124,6 +1154,11 @@ template static LogicalResult verifyBase(BaseOp op) { auto ldsType = cast(op.getLds().getType()); auto globalType = cast(op.getGlobal().getType()); + if (failed(verifyIndexCount(op, "global", globalType, + op.getGlobalIndices().size())) || + failed(verifyIndexCount(op, "lds", ldsType, op.getLdsIndices().size()))) + return failure(); + if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace())) return op.emitOpError( "lds memref must have workgroup address space attribute."); @@ -1196,9 +1231,17 @@ static LogicalResult verifyDescriptorOp(DescriptorOp op) { "element type width must be 1, 2, 4 or 8 bytes, but was ") << elementTypeWidth << " bits long"; + if (!op.getAtomicBarrierAddress() && !op.getAtomicBarrierIndices().empty()) + return op.emitOpError( + "atomic barrier indices require an atomic barrier address"); + if (Value atomicBarrierAddress = op.getAtomicBarrierAddress()) { auto atomicBarrierAddressType = cast(atomicBarrierAddress.getType()); + if 
(failed(verifyIndexCount(op, "atomic barrier", atomicBarrierAddressType, + op.getAtomicBarrierIndices().size()))) + return failure(); + bool barrierInLDS = hasWorkgroupMemorySpace(atomicBarrierAddressType.getMemorySpace()); if (!barrierInLDS) @@ -1417,6 +1460,10 @@ void ScaledMFMAOp::getCanonicalizationPatterns(RewritePatternSet &results, template static LogicalResult verifyDsBarrierOpCommon(T &op) { MemRefType memrefType = llvm::cast(op.getBase().getType()); + if (failed( + verifyIndexCount(op, "barrier", memrefType, op.getIndices().size()))) + return failure(); + if (!hasWorkgroupMemorySpace(memrefType.getMemorySpace())) return op.emitOpError("barrier must be in workgroup (LDS) memory"); @@ -1446,18 +1493,15 @@ LogicalResult DsBarrierArriveOp::verify() { LogicalResult GlobalPrefetchOp::verify() { auto src = cast(getSrc().getType()); + if (failed(verifyIndexCount(*this, "source", src, getIndices().size()))) + return failure(); + Attribute memSpace = src.getMemorySpace(); if (!memSpace) return this->emitOpError("the source must have address space attribute"); if (!hasGlobalMemorySpace(memSpace)) return this->emitOpError("the source must reside in global address space"); - ArrayRef srcShape = src.getShape(); - const size_t numIndices = getIndices().size(); - if (srcShape.size() != numIndices) - return this->emitOpError( - "the number of indices must match the source shape size"); - const LoadTemporalHint temporalHint = getTemporalHint(); const Scope scope = getCacheScope(); const bool isSpeculative = getSpeculative(); diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 4fe3185a27bd1..4e4cfe53298c7 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -245,6 +245,14 @@ func.func @fat_raw_buffer_cast_stripping_offset_affine_map(%m: memref<8xi32, aff // ----- +func.func @raw_buffer_load_wrong_num_indices(%src: memref<4x4xf32>, %idx: i32) -> f32 { + // expected-error@+1 
{{'amdgpu.raw_buffer_load' op expected 2 buffer indices, got 1}} + %0 = amdgpu.raw_buffer_load %src[%idx] : memref<4x4xf32>, i32 -> f32 + func.return %0 : f32 +} + +// ----- + func.func @swizzle_invalid_type(%arg0 : si32) -> si32 { // expected-error@+1 {{'amdgpu.swizzle_bitmode' op operand #0 must be Integer or Float or fixed-length vector of Integer or Float values of ranks 1}} %0 = amdgpu.swizzle_bitmode %arg0 1 2 4 : si32 @@ -317,6 +325,14 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me // ----- +func.func @transpose_load_wrong_num_indices(%idx : index, %mem : memref<128x32xf16, #gpu.address_space>) -> vector<4xf16> { + // expected-error@+1 {{'amdgpu.transpose_load' op expected 2 source indices, got 1}} + %0 = amdgpu.transpose_load %mem[%idx] : memref<128x32xf16, #gpu.address_space> -> vector<4xf16> + func.return %0 : vector<4xf16> +} + +// ----- + func.func @global_transpose_load_wrong_addrspace(%i : index, %j : index, %src : memref<128x256xf16, 3>) -> vector<8xf16> { // expected-error@+1 {{'amdgpu.global_transpose_load' op source memory address space must be Global}} @@ -327,6 +343,14 @@ func.func @global_transpose_load_wrong_addrspace(%i : index, %j : index, // ----- +func.func @global_transpose_load_wrong_num_indices(%idx : index, %mem : memref<128x32xf16, #gpu.address_space>) -> vector<8xf16> { + // expected-error@+1 {{'amdgpu.global_transpose_load' op expected 2 source indices, got 1}} + %0 = amdgpu.global_transpose_load %mem[%idx] : memref<128x32xf16, #gpu.address_space> -> vector<8xf16> + func.return %0 : vector<8xf16> +} + +// ----- + func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) { // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}} amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16> @@ -343,6 +367,16 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, 
%mem2 : // ----- +func.func @gather_to_lds_wrong_num_indices(%idx : index, + %src : memref<32x32xf16, #gpu.address_space>, + %dst : memref<32x32xf16, #gpu.address_space>) { + // expected-error@+1 {{'amdgpu.gather_to_lds' op expected 2 source indices, got 1}} + amdgpu.gather_to_lds %src[%idx], %dst[%idx, %idx] : vector<2xf16>, memref<32x32xf16, #gpu.address_space>, memref<32x32xf16, #gpu.address_space> + func.return +} + +// ----- + func.func @global_load_async_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf32, #gpu.address_space>, %mem2 : memref<32xf32>) { @@ -378,6 +412,18 @@ func.func @global_load_async_to_lds_src_not_global(%idx1 : index, // ----- +func.func @global_load_async_to_lds_wrong_num_indices(%idx : index, + %src : memref<32x32xf32, #gpu.address_space>, + %dst : memref<32x32xf32, #gpu.address_space>) { + // expected-error@+1 {{'amdgpu.global_load_async_to_lds' op expected 2 destination indices, got 1}} + amdgpu.global_load_async_to_lds %src[%idx, %idx], %dst[%idx] + : f32, memref<32x32xf32, #gpu.address_space>, + memref<32x32xf32, #gpu.address_space> + func.return +} + +// ----- + func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> @@ -441,6 +487,26 @@ func.func @make_gather_dma_base_invalid_addressspace(%idx: index, %smem : memref // ----- +func.func @make_dma_base_wrong_num_indices(%idx: index, + %global: memref<8x8xi32, #gpu.address_space>, + %lds: memref<8x8xi32, #gpu.address_space>) { + // expected-error@+1 {{'amdgpu.make_dma_base' op expected 2 global indices, got 1}} + amdgpu.make_dma_base %global[%idx], %lds[%idx, %idx] : 
memref<8x8xi32, #gpu.address_space>, memref<8x8xi32, #gpu.address_space> -> !amdgpu.tdm_base + return +} + +// ----- + +func.func @make_gather_dma_base_wrong_num_indices(%idx: index, + %global: memref<8x8xi32, #gpu.address_space>, + %lds: memref<8x8xi32, #gpu.address_space>) { + // expected-error@+1 {{'amdgpu.make_gather_dma_base' op expected 2 lds indices, got 1}} + amdgpu.make_gather_dma_base %global[%idx, %idx], %lds[%idx] : memref<8x8xi32, #gpu.address_space>, memref<8x8xi32, #gpu.address_space> -> !amdgpu.tdm_gather_base + return +} + +// ----- + func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base, %barrier: memref<8x!amdgpu.ds_barrier_state>, %idx: index) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op atomic barrier address must be in LDS.}} amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8x!amdgpu.ds_barrier_state>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor @@ -449,6 +515,33 @@ func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base, %barrier: // ----- +func.func @make_dma_descriptor_barrier_wrong_num_indices(%base: !amdgpu.tdm_base, + %barrier: memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>, + %idx: index) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op expected 2 atomic barrier indices, got 1}} + amdgpu.make_dma_descriptor %base + globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] + atomicBarrier(%barrier[%idx] : memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + return +} + +// ----- + +func.func @make_dma_descriptor_barrier_indices_without_address( + %base: !amdgpu.tdm_base, %idx: index) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op atomic barrier indices require an atomic barrier address}} + %0 = "amdgpu.make_dma_descriptor"(%base, %idx) <{ + global_static_sizes = array, + global_static_strides = array, + operandSegmentSizes = array, + 
shared_static_sizes = array + }> : (!amdgpu.tdm_base, index) -> !amdgpu.tdm_descriptor + return +} + +// ----- + // CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { @@ -500,6 +593,35 @@ func.func @make_gather_dma_descriptor_invalid_index_types(%base: !amdgpu.tdm_gat // ----- +func.func @make_gather_dma_descriptor_barrier_wrong_num_indices(%base: !amdgpu.tdm_gather_base, + %indices: vector<8xi16>, + %barrier: memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>, + %idx: index) { + // expected-error@+1 {{'amdgpu.make_gather_dma_descriptor' op expected 2 atomic barrier indices, got 1}} + amdgpu.make_gather_dma_descriptor %base[%indices] + globalSize [4, 4] globalStride [4, 1] sharedSize [1, 2] + atomicBarrier(%barrier[%idx] : memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>) + : !amdgpu.tdm_gather_base, vector<8xi16> -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +func.func @make_gather_dma_descriptor_barrier_indices_without_address( + %base: !amdgpu.tdm_gather_base, %indices: vector<8xi16>, + %idx: index) { + // expected-error@+1 {{'amdgpu.make_gather_dma_descriptor' op atomic barrier indices require an atomic barrier address}} + %0 = "amdgpu.make_gather_dma_descriptor"(%base, %indices, %idx) <{ + global_static_sizes = array, + global_static_strides = array, + operandSegmentSizes = array, + shared_static_sizes = array + }> : (!amdgpu.tdm_gather_base, vector<8xi16>, index) -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + func.func @sparse_mfma_dense_not_double_sparse(%a: vector<4xf16>, %b: vector<4xf16>, %c: vector<4xf32>, %idx: vector<4xi8>) -> vector<4xf32> { // expected-error@+1 {{'amdgpu.sparse_mfma' op operand #1 must be vector of 16-bit float values of length 8/16 or vector of bfloat16 type values of length 8/16 or vector of 8-bit signless integer values of length 16/32 or vector of f8E4M3FN 
type or f8E5M2 type values of length 16/32 or vector of f8E4M3FNUZ type or f8E5M2FNUZ type values of length 16/32, but got 'vector<4xf16>'}} %d = amdgpu.sparse_mfma 16x16x32 %a * %b + %c sparse(%idx : vector<4xi8>) : vector<4xf16>, vector<4xf16>, vector<4xf32> @@ -620,6 +742,15 @@ func.func @ds_barrier_init_non_workgroup(%barrier: memref>, + %idx: index, %participants: i32) { + // expected-error@+1 {{'amdgpu.ds_barrier_init' op expected 2 barrier indices, got 1}} + amdgpu.ds_barrier_init %barrier[%idx], %participants : memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>, i32 + func.return +} + +// ----- + func.func @ds_barrier_poll_state_non_workgroup(%barrier: memref>) -> !amdgpu.ds_barrier_state { // expected-error@+1 {{'amdgpu.ds_barrier_poll_state' op barrier must be in workgroup (LDS) memory}} %state = amdgpu.ds_barrier_poll_state %barrier[] : memref> -> !amdgpu.ds_barrier_state @@ -628,6 +759,24 @@ func.func @ds_barrier_poll_state_non_workgroup(%barrier: memref>, + %idx: index) -> !amdgpu.ds_barrier_state { + // expected-error@+1 {{'amdgpu.ds_barrier_poll_state' op expected 2 barrier indices, got 1}} + %state = amdgpu.ds_barrier_poll_state %barrier[%idx] : memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space> -> !amdgpu.ds_barrier_state + func.return %state : !amdgpu.ds_barrier_state +} + +// ----- + +func.func @ds_async_barrier_arrive_wrong_num_indices(%barrier: memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>, + %idx: index) { + // expected-error@+1 {{'amdgpu.ds_async_barrier_arrive' op expected 2 barrier indices, got 1}} + amdgpu.ds_async_barrier_arrive %barrier[%idx] : memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space> + func.return +} + +// ----- + func.func @ds_barrier_arrive_non_workgroup(%barrier: memref>, %count: i64) -> !amdgpu.ds_barrier_state { // expected-error@+1 {{'amdgpu.ds_barrier_arrive' op barrier must be in workgroup (LDS) memory}} %old_state = amdgpu.ds_barrier_arrive %barrier[], %count : memref>, i64 -> 
!amdgpu.ds_barrier_state @@ -636,6 +785,15 @@ func.func @ds_barrier_arrive_non_workgroup(%barrier: memref>, + %idx: index, %count: i64) -> !amdgpu.ds_barrier_state { + // expected-error@+1 {{'amdgpu.ds_barrier_arrive' op expected 2 barrier indices, got 1}} + %old_state = amdgpu.ds_barrier_arrive %barrier[%idx], %count : memref<8x8x!amdgpu.ds_barrier_state, #gpu.address_space>, i64 -> !amdgpu.ds_barrier_state + func.return %old_state : !amdgpu.ds_barrier_state +} + +// ----- + func.func @sparse_wmma_invalid_m(%a: vector<8xf16>, %b: vector<16xf16>, %c: vector<8xf32>, %idx: vector<4xi8>) -> vector<8xf32> { // expected-error@+1 {{'amdgpu.sparse_wmma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}} %d = amdgpu.sparse_wmma 32x16x32 %a * %b + %c sparse(%idx : vector<4xi8>) : vector<8xf16>, vector<16xf16>, vector<8xf32> @@ -778,7 +936,7 @@ func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.addres // GlobalPrefetchOp: number of indices must match source shape rank func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.address_space>, %i: i64) { - // expected-error@+1 {{'amdgpu.global_prefetch' op the number of indices must match the source shape size}} + // expected-error@+1 {{'amdgpu.global_prefetch' op expected 2 source indices, got 1}} amdgpu.global_prefetch %src[%i] RT DEV : memref<64x64xf16, #gpu.address_space> func.return } From 38d79280ac57159b92a23d90eb34a2f646b57f71 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 17:56:16 -0500 Subject: [PATCH 112/538] [lldb] Handle SIGINT via the MainLoop signal thread (on POSIX) (#195959) The driver's async SIGINT handler called SBDebugger::DispatchInputInterrupt directly. That is not async-signal-safe and can lead to a crash. Register SIGINT with the existing signal-thread MainLoop instead so DispatchInputInterrupt runs in normal thread context. 
The Windows path is unchanged and keeps the legacy async handler. While DispatchInputInterrupt runs, the callback temporarily installs SIG_DFL so a second Ctrl-C still hard-terminates the process, preserving the escape hatch users rely on when the debugger is unresponsive. Moving SIGINT off the main thread means a Ctrl-C no longer interrupts blocking syscalls there (e.g. a Python REPL waiting on input or sleeping), so Python never observes the queued interrupt and KeyboardInterrupt is not raised. To restore that behavior, after dispatching the interrupt the callback re-raises SIGINT on the main thread via pthread_kill; the resulting EINTR lets Python pick up the pending interrupt. A skip flag suppresses the re-entry that this self-send produces. Because the callback only ever runs on the signal thread, the flag and the captured main-thread id live in the lambda's captures and need no synchronization. rdar://158218595 --- lldb/tools/driver/Driver.cpp | 60 +++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index d47d3daf1c3fc..e58286f9ff41e 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -43,6 +43,9 @@ #include #include #include +#ifndef _WIN32 +#include +#endif #include #include #include @@ -651,11 +654,10 @@ void Driver::UpdateWindowSize() { } } -void sigint_handler(int signo) { #ifdef _WIN32 +void sigint_handler(int signo) { // Restore handler as it is not persistent on Windows. 
signal(SIGINT, sigint_handler); -#endif static std::atomic_flag g_interrupt_sent = ATOMIC_FLAG_INIT; if (g_driver != nullptr) { @@ -668,6 +670,7 @@ void sigint_handler(int signo) { _exit(signo); } +#endif static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) { std::string usage_str = tool_name.str() + " [options]"; @@ -781,15 +784,64 @@ int main(int argc, char const *argv[]) { // Setup LLDB signal handlers once the debugger has been initialized. SBDebugger::PrintDiagnosticsOnError(); - // FIXME: Migrate the SIGINT handler to be handled by the signal loop below. +#ifdef _WIN32 signal(SIGINT, sigint_handler); -#if !defined(_WIN32) +#else signal(SIGPIPE, SIG_IGN); + // Capture the main thread's id so the signal thread can target it. + pthread_t main_thread = pthread_self(); + + // Set when the signal thread sends itself a SIGINT to wake the main thread. + // The next callback invocation observes this flag and skips the work. A + // plain bool is sufficient because the callback only ever runs on the + // signal thread; it lives outside the lambda because MainLoopPosix copies + // the callback on every dispatch, which would discard in-lambda state. + bool skip_next_sigint = false; + // Handle signals in a MainLoop running on a separate thread. MainLoop signal_loop; Status signal_status; + auto sigint_handler = signal_loop.RegisterSignal( + SIGINT, + [&, main_thread](MainLoopBase &) { + // Skip the self-sent wakeup SIGINT queued at the end of the previous + // invocation. + if (std::exchange(skip_next_sigint, false)) + return; + + // Temporarily restore the default disposition so that a second SIGINT + // delivered while DispatchInputInterrupt is running hard-terminates + // the process. This preserves the "double Ctrl-C to force exit" + // escape hatch users rely on when the debugger is unresponsive. 
+ struct sigaction old_action; + struct sigaction new_action = {}; + new_action.sa_handler = SIG_DFL; + sigemptyset(&new_action.sa_mask); + + int ret = sigaction(SIGINT, &new_action, &old_action); + UNUSED_IF_ASSERT_DISABLED(ret); + assert(ret == 0 && "sigaction failed"); + + if (g_driver) + g_driver->GetDebugger().DispatchInputInterrupt(); + + ret = sigaction(SIGINT, &old_action, nullptr); + UNUSED_IF_ASSERT_DISABLED(ret); + assert(ret == 0 && "sigaction failed"); + + // Wake the main thread so any blocking syscall (e.g. the Python REPL + // waiting on input or sleeping) returns with EINTR. This lets Python + // observe the pending interrupt queued by DispatchInputInterrupt and + // raise KeyboardInterrupt. Flag the resulting callback invocation so + // it's skipped rather than re-running DispatchInputInterrupt. + skip_next_sigint = true; + pthread_kill(main_thread, SIGINT); + }, + signal_status); + assert(sigint_handler && signal_status.Success()); + auto sigwinch_handler = signal_loop.RegisterSignal( SIGWINCH, [&](MainLoopBase &) { From 25295bcb2ca68ab3ac9a2cadb622e8dcfc49579b Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Fri, 8 May 2026 16:02:42 -0700 Subject: [PATCH 113/538] [BOLT][NFCI] Consolidate DataReader::setEntryCounts (#196411) FuncBranchData/BinaryFunction exec/external entry counts are set in multiple places in `DataReader`: - FBD: in `parse` and `appendFrom`, - BF: in `preprocessProfile` and `matchProfileData`. Consolidate to `setEntryCounts` called from `readProfile`. Drop explicit counters, compute them from `FBD::EntryData`. 
Test Plan: NFCI --- bolt/include/bolt/Profile/DataReader.h | 9 ++---- bolt/lib/Profile/DataReader.cpp | 44 +++++++++++++++----------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h index 31b23ff4cdd8e..d5fc1ef38d6a4 100644 --- a/bolt/include/bolt/Profile/DataReader.h +++ b/bolt/include/bolt/Profile/DataReader.h @@ -92,12 +92,6 @@ struct FuncBranchData { ContainerTy Data; ContainerTy EntryData; - /// Total execution count for the function. - int64_t ExecutionCount{0}; - - /// Total entry count from external code for the function. - uint64_t ExternEntryCount{0}; - /// Indicate if the data was used. bool Used{false}; @@ -115,6 +109,9 @@ struct FuncBranchData { /// by counting the number of executed branches for each BranchInfo uint64_t getNumExecutedBranches() const; + /// Set entry counts derived from EntryData to \p BF. + void setEntryCounts(BinaryFunction &BF) const; + /// Aggregation helpers DenseMap> IntraIndex; DenseMap> InterIndex; diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index 38e32d12028d3..5a551f0a895ad 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -84,8 +84,6 @@ void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { } } llvm::stable_sort(Data); - ExecutionCount += FBD.ExecutionCount; - ExternEntryCount += FBD.ExternEntryCount; for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) { assert(I->To.Name == FBD.Name); auto NewElmt = EntryData.insert(EntryData.end(), *I); @@ -104,6 +102,23 @@ uint64_t FuncBranchData::getNumExecutedBranches() const { return ExecutedBranches; } +void FuncBranchData::setEntryCounts(BinaryFunction &BF) const { + uint64_t ExecCount = 0; + uint64_t ExternEntryCount = 0; + // If destination is the function start - update execution count. 
+ // NB: the data is skewed since we cannot tell tail recursion from + // branches to the function start. + for (const BranchInfo &BI : EntryData) { + if (BI.To.Offset != 0) + continue; + ExecCount += BI.Branches; + if (!BI.From.IsSymbol) + ExternEntryCount += BI.Branches; + } + BF.setExecutionCount(ExecCount); + BF.setExternEntryCount(ExternEntryCount); +} + void BasicSampleInfo::mergeWith(const BasicSampleInfo &SI) { Hits += SI.Hits; } void BasicSampleInfo::print(raw_ostream &OS) const { @@ -240,8 +255,7 @@ Error DataReader::preprocessProfile(BinaryContext &BC) { } if (FuncBranchData *FuncData = getBranchDataForNames(Function.getNames())) { setBranchData(Function, FuncData); - Function.ExecutionCount = FuncData->ExecutionCount; - Function.ExternEntryCount = FuncData->ExternEntryCount; + FuncData->setEntryCounts(Function); FuncData->Used = true; } } @@ -333,6 +347,10 @@ std::error_code DataReader::parseInput() { } void DataReader::readProfile(BinaryFunction &BF) { + // Set entry counts for the common case. + if (FuncBranchData *FBD = getBranchData(BF)) + FBD->setEntryCounts(BF); + if (BF.empty()) return; @@ -351,6 +369,10 @@ void DataReader::readProfile(BinaryFunction &BF) { if (!FBD) return; + // Re-set entry counts in case FBD was swapped (LTO) or merged + // (fetchProfileForOtherEntryPoints). + FBD->setEntryCounts(BF); + // Assign basic block counts to function entry points. These only include // counts for outside entries. // @@ -397,8 +419,6 @@ void DataReader::matchProfileData(BinaryFunction &BF) { if (BF.ProfileMatchRatio == 1.0f) { if (fetchProfileForOtherEntryPoints(BF)) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); - BF.ExecutionCount = FBD->ExecutionCount; - BF.ExternEntryCount = FBD->ExternEntryCount; BF.RawSampleCount = FBD->getNumExecutedBranches(); } return; @@ -428,8 +448,6 @@ void DataReader::matchProfileData(BinaryFunction &BF) { // Update function profile data with the new set. 
setBranchData(BF, NewBranchData); NewBranchData->Used = true; - BF.ExecutionCount = NewBranchData->ExecutionCount; - BF.ExternEntryCount = NewBranchData->ExternEntryCount; BF.ProfileMatchRatio = 1.0f; break; } @@ -1168,16 +1186,6 @@ std::error_code DataReader::parse() { I = GetOrCreateFuncEntry(BI.To.Name); I->second.EntryData.emplace_back(std::move(BI)); } - - // If destination is the function start - update execution count. - // NB: the data is skewed since we cannot tell tail recursion from - // branches to the function start. - if (BI.To.IsSymbol && BI.To.Offset == 0) { - I = GetOrCreateFuncEntry(BI.To.Name); - I->second.ExecutionCount += BI.Branches; - if (!BI.From.IsSymbol) - I->second.ExternEntryCount += BI.Branches; - } } while (hasMemData()) { From dbbe548ed5dd3e3f47da7592179538b618fff926 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 8 May 2026 16:06:10 -0700 Subject: [PATCH 114/538] [DirectX] Do not print invalid root signature definitions. (#196444) This patch adds a check to the root signature printing pass that makes sure we have a valid root signature before printing starts. This is required after https://github.com/llvm/llvm-project/pull/194858 changed reportError to not stop after emitting the first error.
Fix: https://github.com/llvm/llvm-project/issues/196430 --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 2 ++ llvm/lib/Target/DirectX/DXILRootSignature.h | 2 ++ .../RootSignature-Error-is-not-function.ll | 13 +++------- .../RootSignature-Error-is-not-value.ll | 15 +++-------- .../RootSignature-Invalid-Version.ll | 2 +- .../RootSignature-NullFunction-Error.ll | 1 - .../RootSignature-PrintOnlyValid.ll | 26 +++++++++++++++++++ 7 files changed, 38 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-PrintOnlyValid.ll diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index ac3c7dde6b892..bd36334a16eac 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -169,6 +169,8 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, ModuleAnalysisManager &AM) { RootSignatureBindingInfo &RSDMap = AM.getResult(M); + if (RSDMap.empty()) + return PreservedAnalyses::all(); OS << "Root Signature Definitions" << "\n"; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 1bc9e7ccbddf3..947cf6e79f0ef 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -42,6 +42,8 @@ class RootSignatureBindingInfo { iterator end() { return FuncToRsMap.end(); } + bool empty() const { return FuncToRsMap.empty(); } + mcdxbc::RootSignatureDesc *getDescForFunction(const Function *F) { const auto FuncRs = find(F); if (FuncRs == end()) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll index fbda7561cecad..bca00c0bc007d 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll @@ -3,24 
+3,17 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: First element of root signature is not a Function -; CHECK-NOT: Root Signature Definitions +; CHECK-NOT: Definition for 'main': -define void @main() #0 { -entry: - ret void -} -define void @anotherMain() #0 { +define void @main() #0 { entry: ret void } attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } -!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs -!2 = !{ ptr @main, !3, i32 2 } ; function, root signature -!3 = !{ !4 } ; list of root signature elements -!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout +!dx.rootsignatures = !{!5} ; list of function/root signature pairs !5 = !{ i32 -1, !6, i32 2 } ; function, root signature !6 = !{ !7 } ; list of root signature elements !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll index 94ab52e1f29c0..e46a293ee340a 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll @@ -3,24 +3,17 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: First element of root signature is not a Value -; CHECK-NOT: Root Signature Definitions +; CHECK-NOT: Definition for 'main' -define void @main() #0 { -entry: - ret void -} -define void @anotherMain() #0 { +define void @main() #0 { entry: ret void } attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } -!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs -!2 = !{ ptr @main, !3, i32 2 } ; function, root signature -!3 = !{ !4 } ; list of root signature elements -!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout -!5 = !{ !3, !6, i32 2 } ; function, root signature +!dx.rootsignatures = 
!{!5} ; list of function/root signature pairs +!5 = !{ !6, !6, i32 2 } ; function, root signature !6 = !{ !7 } ; list of root signature elements !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll index 26867e6d7ec25..452d0d386f731 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll @@ -4,7 +4,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Invalid value for Version: 4 -; CHECK-NOT: Root Signature Definitions + define void @main() #0 { entry: ret void diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll index c6b57ee31c87a..583eef780ecb6 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll @@ -1,7 +1,6 @@ ; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s ; CHECK: error: Function associated with Root Signature definition is null -; CHECK-NOT: Root Signature Definitions target triple = "dxil-unknown-shadermodel6.0-compute" diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-PrintOnlyValid.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-PrintOnlyValid.ll new file mode 100644 index 0000000000000..212c72615f6e0 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-PrintOnlyValid.ll @@ -0,0 +1,26 @@ +; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s + +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK: Root Signature Definitions +; CHECK: Definition for 'main': +; CHECK-NOT: Definition for 'anotherMain': +define void @main() #0 { +entry: + ret 
void +} + +define void @anotherMain() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs +!2 = !{ ptr @main, !3, i32 2 } ; function, root signature +!3 = !{ !4 } ; list of root signature elements +!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout +!5 = !{ i32 -1, !6, i32 2 } ; function, root signature +!6 = !{ !7 } ; list of root signature elements +!7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout From f03da66169781297993e568cc070517084dbb28a Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Fri, 8 May 2026 16:25:42 -0700 Subject: [PATCH 115/538] [clang][deps] Move `ScanningOutputFormat` out of the library (#196631) Basing behavior of the dependency scanner on the final output format is a leaky abstraction. Instead, we should aim to introduce proper feature flags. --- clang-tools-extra/clangd/ProjectModules.cpp | 3 ++- .../DependencyScanningService.h | 20 ++---------------- .../DependencyScannerImpl.cpp | 2 +- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 21 ++++++++++++++++++- .../Tooling/DependencyScannerTest.cpp | 2 -- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/clang-tools-extra/clangd/ProjectModules.cpp b/clang-tools-extra/clangd/ProjectModules.cpp index 93347780e4a20..d3727171bff12 100644 --- a/clang-tools-extra/clangd/ProjectModules.cpp +++ b/clang-tools-extra/clangd/ProjectModules.cpp @@ -155,7 +155,8 @@ class ModuleDependencyScanner { dependencies::DependencyScanningServiceOptions Opts; Opts.MakeVFS = [&] { return TFS.view(std::nullopt); }; Opts.Mode = dependencies::ScanningMode::CanonicalPreprocessing; - Opts.Format = dependencies::ScanningOutputFormat::P1689; + Opts.EmitWarnings = false; + Opts.ReportAbsolutePaths = false; return Opts; }()) {} diff --git a/clang/include/clang/DependencyScanning/DependencyScanningService.h 
b/clang/include/clang/DependencyScanning/DependencyScanningService.h index f379381faea59..b9f6c9adcef48 100644 --- a/clang/include/clang/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/DependencyScanning/DependencyScanningService.h @@ -29,22 +29,6 @@ enum class ScanningMode { DependencyDirectivesScan, }; -/// The format that is output by the dependency scanner. -enum class ScanningOutputFormat { - /// This is the Makefile compatible dep format. This will include all of the - /// deps necessary for an implicit modules build, but won't include any - /// intermodule dependency information. - Make, - - /// This outputs the full clang module dependency graph suitable for use for - /// explicitly building modules. - Full, - - /// This outputs the dependency graph for standard c++ modules in P1689R5 - /// format. - P1689, -}; - #define DSS_LAST_BITMASK_ENUM(Id) \ LLVM_MARK_AS_BITMASK_ENUM(Id), All = llvm::NextPowerOf2(Id) - 1 @@ -87,10 +71,10 @@ struct DependencyScanningServiceOptions { MakeVFS; // = [] { return llvm::vfs::createPhysicalFileSystem(); } /// Whether to use optimized dependency directive scan or full preprocessing. ScanningMode Mode = ScanningMode::DependencyDirectivesScan; - /// What output format are we expected to produce. - ScanningOutputFormat Format = ScanningOutputFormat::Full; /// How to optimize resulting explicit module command lines. ScanningOptimizations OptimizeArgs = ScanningOptimizations::Default; + /// Whether the scanner should emit warnings. + bool EmitWarnings = true; /// Whether to make reported file paths absolute. bool ReportAbsolutePaths = true; /// Whether to report modules visible from modules that are imported directly. 
diff --git a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp index 224413bb99cbc..3dfcc7e49ed88 100644 --- a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp @@ -396,7 +396,7 @@ void dependencies::initializeScanCompilerInstance( ScanInstance.setBuildingModule(false); ScanInstance.createVirtualFileSystem(FS, DiagConsumer); ScanInstance.createDiagnostics(DiagConsumer, /*ShouldOwnClient=*/false); - if (Service.getOpts().Format == ScanningOutputFormat::P1689) + if (!Service.getOpts().EmitWarnings) ScanInstance.getDiagnostics().setIgnoreAllWarnings(true); ScanInstance.createFileManager(); ScanInstance.createSourceManager(); diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 8944c5fc48e30..3417dc2a07d39 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -81,6 +81,22 @@ enum ResourceDirRecipeKind { RDRK_InvokeCompiler, }; +/// The format that is output by the dependency scanner. +enum class ScanningOutputFormat { + /// This is the Makefile compatible dep format. This will include all of the + /// deps necessary for an implicit modules build, but won't include any + /// intermodule dependency information. + Make, + + /// This outputs the full clang module dependency graph suitable for use for + /// explicitly building modules. + Full, + + /// This outputs the dependency graph for standard c++ modules in P1689R5 + /// format. 
+ P1689, +}; + static std::string OutputFileName = "-"; static ScanningMode ScanMode = ScanningMode::DependencyDirectivesScan; static ScanningOutputFormat Format = ScanningOutputFormat::Make; @@ -1155,8 +1171,11 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { DependencyScanningServiceOptions Opts; Opts.Mode = ScanMode; - Opts.Format = Format; Opts.OptimizeArgs = OptimizeArgs; + // The scanner currently ignores `#pragma clang diagnostic ...` and emits + // unexpected diagnostics. Work around this for now by disabling warnings + // entirely, at least for P1689 where people hit this most often. + Opts.EmitWarnings = Format != ScanningOutputFormat::P1689; // Within P1689 format, we don't want all the paths to be absolute path // since it may violate the traditional make style dependencies info. Opts.ReportAbsolutePaths = Format != ScanningOutputFormat::P1689; diff --git a/clang/unittests/Tooling/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScannerTest.cpp index 86d4e0ee1b8c5..6b119000000cf 100644 --- a/clang/unittests/Tooling/DependencyScannerTest.cpp +++ b/clang/unittests/Tooling/DependencyScannerTest.cpp @@ -230,7 +230,6 @@ TEST(DependencyScanner, ScanDepsWithFS) { DependencyScanningServiceOptions Opts; Opts.MakeVFS = [&] { return VFS; }; - Opts.Format = ScanningOutputFormat::Make; DependencyScanningService Service(std::move(Opts)); DependencyScanningTool ScanTool(Service); @@ -290,7 +289,6 @@ TEST(DependencyScanner, ScanDepsWithModuleLookup) { DependencyScanningServiceOptions Opts; Opts.MakeVFS = [&] { return InterceptFS; }; - Opts.Format = ScanningOutputFormat::Make; DependencyScanningService Service(std::move(Opts)); DependencyScanningTool ScanTool(Service); From 54ea7c5bf16b70bd774e67d35d2874e0017ccdbe Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 8 May 2026 16:28:39 -0700 Subject: [PATCH 116/538] [RISCV] Use the nhs.lea.h/w/d instead of nhs.lea.h/w/d.ze with Sh1AddPat. 
(#196660) The srliw already took care of zeroing the upper bits. Using the non-.ze form is consistent with the Zba version of this pattern. --- llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 6 +++--- llvm/test/CodeGen/RISCV/rv64zba.ll | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 3a0f16e97c3d5..18261fe403d42 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -838,9 +838,9 @@ let Predicates = [HasVendorXAndesPerf, IsRV64] in { defm : Sh2Add_UWPat; defm : Sh3Add_UWPat; - def : Sh1AddPat; - def : Sh2AddPat; - def : Sh3AddPat; + def : Sh1AddPat; + def : Sh2AddPat; + def : Sh3AddPat; } // Predicates = [HasVendorXAndesPerf, IsRV64] let Predicates = [HasVendorXAndesPerf] in { diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index fb26b8b16a290..376c240f73483 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2937,7 +2937,7 @@ define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) { ; RV64XANDESPERF-LABEL: srliw_1_sh1add: ; RV64XANDESPERF: # %bb.0: ; RV64XANDESPERF-NEXT: srliw a1, a1, 1 -; RV64XANDESPERF-NEXT: nds.lea.h.ze a0, a0, a1 +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a1 ; RV64XANDESPERF-NEXT: lh a0, 0(a0) ; RV64XANDESPERF-NEXT: ret %3 = lshr i32 %1, 1 @@ -3004,7 +3004,7 @@ define signext i32 @srliw_2_sh2add(ptr %0, i32 signext %1) { ; RV64XANDESPERF-LABEL: srliw_2_sh2add: ; RV64XANDESPERF: # %bb.0: ; RV64XANDESPERF-NEXT: srliw a1, a1, 2 -; RV64XANDESPERF-NEXT: nds.lea.w.ze a0, a0, a1 +; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a1 ; RV64XANDESPERF-NEXT: lw a0, 0(a0) ; RV64XANDESPERF-NEXT: ret %3 = lshr i32 %1, 2 @@ -3033,7 +3033,7 @@ define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) { ; RV64XANDESPERF-LABEL: srliw_3_sh3add: ; RV64XANDESPERF: # %bb.0: ; RV64XANDESPERF-NEXT: srliw a1, a1, 3 -; 
RV64XANDESPERF-NEXT: nds.lea.d.ze a0, a0, a1 +; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a1 ; RV64XANDESPERF-NEXT: ld a0, 0(a0) ; RV64XANDESPERF-NEXT: ret %3 = lshr i32 %1, 3 From bc654b438ffe99cece54ccd70f0e1d48ccbe1c8a Mon Sep 17 00:00:00 2001 From: YongKang Zhu Date: Fri, 8 May 2026 17:51:48 -0700 Subject: [PATCH 117/538] Revert "[BOLT] Fix EH data encoding checks in relocateEHFrameSection (#195691)" (#196672) This reverts commit 7ab26d7c3a160e1dc166f2673644baa396703ee5. There is test failure in bolt-tests::exceptions-split-strip.test. --- bolt/lib/Rewrite/RewriteInstance.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 9f6eebf7f7834..43d4421e06928 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2162,10 +2162,13 @@ void RewriteInstance::relocateEHFrameSection() { return; // Only fix references that are relative to other locations. - if ((DwarfType & 0xf0) != dwarf::DW_EH_PE_pcrel && - (DwarfType & 0xf0) != dwarf::DW_EH_PE_textrel && - (DwarfType & 0xf0) != dwarf::DW_EH_PE_funcrel && - (DwarfType & 0xf0) != dwarf::DW_EH_PE_datarel) + if (!(DwarfType & dwarf::DW_EH_PE_pcrel) && + !(DwarfType & dwarf::DW_EH_PE_textrel) && + !(DwarfType & dwarf::DW_EH_PE_funcrel) && + !(DwarfType & dwarf::DW_EH_PE_datarel)) + return; + + if (!(DwarfType & dwarf::DW_EH_PE_sdata4)) return; uint32_t RelType; From c7120bbf16a8b5d82fc045ab30a577638ab0e2a7 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Sat, 9 May 2026 09:15:09 +0800 Subject: [PATCH 118/538] [mlir][tensor] Enhance pattern to fold extract_slice(insert_slice) (#195045) Extend the DropRedundantRankExpansionOnExtractSliceOfInsertSlice pattern to support cases where the expanded dimensions are a subset of the dropped dimensions, rather than requiring them to be exactly equal. 
For example: ``` %inserted_slice = tensor.insert_slice %src into %dest[0, 0, 0, 0] [1, 1, 128, 480] [1, 1, 1, 1] : tensor<128x480xf32> into tensor<1x1x128x480xf32> %extracted_slice = tensor.extract_slice %inserted_slice[0, 0, 0, 0] [1, 1, 123, 1] [1, 1, 1, 1] : tensor<1x1x128x480xf32> to tensor<123xf32> ``` can be folded into: ``` %extracted_slice = tensor.extract_slice %src[0, 0] [123, 1] [1, 1] : tensor<128x480xf32> to tensor<123xf32> ``` --- .../DropRedundantRankExpansionPatterns.cpp | 27 ++++++++++++------- ...redundant-insert-slice-rank-expansion.mlir | 12 +++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/DropRedundantRankExpansionPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/DropRedundantRankExpansionPatterns.cpp index 4253548d11f49..55ad6256b0f6c 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/DropRedundantRankExpansionPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/DropRedundantRankExpansionPatterns.cpp @@ -20,9 +20,14 @@ using namespace mlir::tensor; namespace { /// Drop redundant rank expansion of insert_slice that are directly followed /// by extract_slice. E.g.: -/// %0 = tensor.insert_slice ... : tensor<5x10xf32> into tensor<1x1x5x10xf32> +/// %0 = tensor.insert_slice %in... : tensor<5x10xf32> into tensor<1x1x5x10xf32> /// %1 = tensor.extract_slice %0[0, 0, 2, 3] [1, 1, 2, 2] [1, 1, 1, 1] /// : tensor<1x1x5x10xf32> to tensor<2x2xf32> +/// +/// can be folded into: +/// +/// %1 = tensor.extract_slice %in[2, 3] [2, 2] [1, 1] +/// : tensor<5x10xf32> to tensor<2x2xf32> struct DropRedundantRankExpansionOnExtractSliceOfInsertSlice : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -41,9 +46,8 @@ struct DropRedundantRankExpansionOnExtractSliceOfInsertSlice return failure(); llvm::SmallBitVector expandedDims = insertSliceOp.getDroppedDims(); - // TODO: This could be extended to support cases where the dropped dims are - // a subset of the expanded dims. 
- if (expandedDims != droppedDims) + // Support cases where the expanded dims are a subset of the droped dims. + if (!expandedDims.subsetOf(droppedDims)) return failure(); // The tensor.insert_slice may not be redundant if it has multiple users. @@ -58,18 +62,21 @@ struct DropRedundantRankExpansionOnExtractSliceOfInsertSlice // Extract directly from the source. OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPoint(extractSliceOp); + SmallVector mixedOffsets = extractSliceOp.getMixedOffsets(); + SmallVector mixedSizes = extractSliceOp.getMixedSizes(); + SmallVector mixedStrides = extractSliceOp.getMixedStrides(); SmallVector newOffsets, newSizes, newStrides; for (int64_t i = 0, e = extractSliceOp.getSourceType().getRank(); i < e; ++i) { - if (droppedDims.test(i)) + if (expandedDims.test(i)) continue; - newOffsets.push_back(extractSliceOp.getMixedOffsets()[i]); - newSizes.push_back(extractSliceOp.getMixedSizes()[i]); - newStrides.push_back(extractSliceOp.getMixedStrides()[i]); + newOffsets.push_back(mixedOffsets[i]); + newSizes.push_back(mixedSizes[i]); + newStrides.push_back(mixedStrides[i]); } rewriter.replaceOpWithNewOp( - extractSliceOp, /*source=*/insertSliceOp.getSource(), newOffsets, - newSizes, newStrides); + extractSliceOp, extractSliceOp.getResultType(), + /*source=*/insertSliceOp.getSource(), newOffsets, newSizes, newStrides); rewriter.eraseOp(insertSliceOp); return success(); } diff --git a/mlir/test/Dialect/Tensor/drop-redundant-insert-slice-rank-expansion.mlir b/mlir/test/Dialect/Tensor/drop-redundant-insert-slice-rank-expansion.mlir index 0496b93257a9e..e21d85411b2a7 100644 --- a/mlir/test/Dialect/Tensor/drop-redundant-insert-slice-rank-expansion.mlir +++ b/mlir/test/Dialect/Tensor/drop-redundant-insert-slice-rank-expansion.mlir @@ -12,6 +12,18 @@ func.func @test_drop_rank_expansion(%src: tensor<128x480xf32>, %dest: tensor<1x1 // ----- +// CHECK-LABEL: func @test_drop_rank_expansion( +// CHECK-SAME: %[[src:.*]]: tensor<128x480xf32>, +// 
CHECK: %[[extract:.*]] = tensor.extract_slice %[[src]][0, 0] [123, 1] [1, 1] : tensor<128x480xf32> to tensor<123xf32> +// CHECK: return %[[extract]] +func.func @test_drop_rank_expansion(%src: tensor<128x480xf32>, %dest: tensor<1x1x128x480xf32>) -> tensor<123xf32> { + %inserted_slice = tensor.insert_slice %src into %dest[0, 0, 0, 0] [1, 1, 128, 480] [1, 1, 1, 1] : tensor<128x480xf32> into tensor<1x1x128x480xf32> + %extracted_slice = tensor.extract_slice %inserted_slice[0, 0, 0, 0] [1, 1, 123, 1] [1, 1, 1, 1] : tensor<1x1x128x480xf32> to tensor<123xf32> + return %extracted_slice : tensor<123xf32> +} + +// ----- + func.func @fold_casting_insert_slice_of_extract_slice(%in : tensor, %dest : tensor<8x1x8xf32>) -> tensor<8x1x8xf32> { %extracted_slice = tensor.extract_slice %in[0, 0, 0, 0] [1, 8, 1, 8] [1, 1, 1, 1] : tensor to tensor<8x8xf32> %inserted_slice = tensor.insert_slice %extracted_slice into %dest[0, 0, 0] [8, 1, 8] [1, 1, 1] : tensor<8x8xf32> into tensor<8x1x8xf32> From d2db5bba03413f6d4bcae1a4ae3d284bf2a7de8e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 8 May 2026 18:18:41 -0700 Subject: [PATCH 119/538] [CodeGen] Use unique_ptr for FunctionInfo to prevent memory leaks (#196603) Raw pointer return from `FunctionInfo::create` caused leaks in callers like `computeABIInfoUsingLib`, breaking BPF tests on ASan bots. Using `std::unique_ptr` enforces automatic cleanup. Fixes leak from #194460. 
Buildbot: https://lab.llvm.org/buildbot/#/builders/52/builds/17090 Assisted-by: Gemini --- clang/lib/CodeGen/CGCall.cpp | 2 +- llvm/include/llvm/ABI/FunctionInfo.h | 2 +- llvm/lib/ABI/FunctionInfo.cpp | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 1cafe364c4c42..a2b9c945788ee 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -843,7 +843,7 @@ void CodeGenModule::computeABIInfoUsingLib(CGFunctionInfo &FI) { if (Required.allowsOptionalArgs()) NumRequired = Required.getNumRequiredArgs(); - llvm::abi::FunctionInfo *AbiFI = llvm::abi::FunctionInfo::create( + auto AbiFI = llvm::abi::FunctionInfo::create( FI.getCallingConvention(), AbiMapper->convertType(FI.getReturnType()), MappedArgTypes, NumRequired); diff --git a/llvm/include/llvm/ABI/FunctionInfo.h b/llvm/include/llvm/ABI/FunctionInfo.h index 7f7b6a44ba6ad..0ebd0700836e2 100644 --- a/llvm/include/llvm/ABI/FunctionInfo.h +++ b/llvm/include/llvm/ABI/FunctionInfo.h @@ -234,7 +234,7 @@ class FunctionInfo final : private TrailingObjects { unsigned arg_size() const { return NumArgs; } - static FunctionInfo * + static std::unique_ptr create(CallingConv::ID CC, const Type *ReturnType, ArrayRef ArgTypes, std::optional NumRequired = std::nullopt); diff --git a/llvm/lib/ABI/FunctionInfo.cpp b/llvm/lib/ABI/FunctionInfo.cpp index f89d90c74ea03..4e3b4c3f22aff 100644 --- a/llvm/lib/ABI/FunctionInfo.cpp +++ b/llvm/lib/ABI/FunctionInfo.cpp @@ -12,16 +12,19 @@ using namespace llvm; using namespace llvm::abi; -FunctionInfo *FunctionInfo::create(CallingConv::ID CC, const Type *ReturnType, - ArrayRef ArgTypes, - std::optional NumRequired) { +std::unique_ptr +FunctionInfo::create(CallingConv::ID CC, const Type *ReturnType, + ArrayRef ArgTypes, + std::optional NumRequired) { assert(!NumRequired || *NumRequired <= ArgTypes.size()); void *Buffer = operator new(totalSizeToAlloc(ArgTypes.size())); - FunctionInfo *FI 
= - new (Buffer) FunctionInfo(CC, ReturnType, ArgTypes.size(), NumRequired); + // FunctionInfo overloads operator delete, so we can use std::unique_ptr + // without worrying about sized deallocation of trailing objects. + std::unique_ptr FI( + new (Buffer) FunctionInfo(CC, ReturnType, ArgTypes.size(), NumRequired)); ArgEntry *Args = FI->getTrailingObjects(); for (unsigned I = 0; I < ArgTypes.size(); ++I) From 74e07e1dca5f2535c8667cec33363220357c0fa6 Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Sat, 9 May 2026 10:34:16 +0800 Subject: [PATCH 120/538] [CIR][RISCV] Support zksh builtin codegen (#196463) --- clang/lib/CIR/CodeGen/CIRGenBuiltinRISCV.cpp | 10 +++++-- .../CIR/CodeGenBuiltins/RISCV/riscv-zksh.c | 26 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 clang/test/CIR/CodeGenBuiltins/RISCV/riscv-zksh.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinRISCV.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinRISCV.cpp index bd9202705d147..6793e3b11cef2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinRISCV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinRISCV.cpp @@ -121,8 +121,14 @@ CIRGenFunction::emitRISCVBuiltinExpr(unsigned builtinID, const CallExpr *e) { break; } // Zksh - case RISCV::BI__builtin_riscv_sm3p0: - case RISCV::BI__builtin_riscv_sm3p1: + case RISCV::BI__builtin_riscv_sm3p0: { + intrinsicName = "riscv.sm3p0"; + break; + } + case RISCV::BI__builtin_riscv_sm3p1: { + intrinsicName = "riscv.sm3p1"; + break; + } // Zbb case RISCV::BI__builtin_riscv_clz_32: case RISCV::BI__builtin_riscv_clz_64: diff --git a/clang/test/CIR/CodeGenBuiltins/RISCV/riscv-zksh.c b/clang/test/CIR/CodeGenBuiltins/RISCV/riscv-zksh.c new file mode 100644 index 0000000000000..97417ae038831 --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/RISCV/riscv-zksh.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -triple riscv32 -target-feature +zksh -fclangir -emit-cir %s -o - | FileCheck %s --check-prefixes=CIR +// RUN: %clang_cc1 -triple riscv64 -target-feature +zksh 
-fclangir -emit-cir %s -o - | FileCheck %s --check-prefixes=CIR +// RUN: %clang_cc1 -triple riscv32 -target-feature +zksh -fclangir -emit-llvm %s -o - | FileCheck %s --check-prefixes=LLVM +// RUN: %clang_cc1 -triple riscv64 -target-feature +zksh -fclangir -emit-llvm %s -o - | FileCheck %s --check-prefixes=LLVM +// RUN: %clang_cc1 -triple riscv32 -target-feature +zksh -emit-llvm %s -o - | FileCheck %s --check-prefixes=LLVM +// RUN: %clang_cc1 -triple riscv64 -target-feature +zksh -emit-llvm %s -o - | FileCheck %s --check-prefixes=LLVM + +// CIR-LABEL: cir.func{{.*}} @test_builtin_sm3p0( +// CIR: {{%.*}} = cir.call_llvm_intrinsic "riscv.sm3p0" {{%.*}} : (!u32i) -> !u32i +// CIR: cir.return +// LLVM-LABEL: @test_builtin_sm3p0( +// LLVM: call i32 @llvm.riscv.sm3p0(i32 {{%.*}}) +// LLVM: ret i32 +unsigned int test_builtin_sm3p0(unsigned int a) { + return __builtin_riscv_sm3p0(a); +} + +// CIR-LABEL: cir.func{{.*}} @test_builtin_sm3p1( +// CIR: {{%.*}} = cir.call_llvm_intrinsic "riscv.sm3p1" {{%.*}} : (!u32i) -> !u32i +// CIR: cir.return +// LLVM-LABEL: @test_builtin_sm3p1( +// LLVM: call i32 @llvm.riscv.sm3p1(i32 {{%.*}}) +// LLVM: ret i32 +unsigned int test_builtin_sm3p1(unsigned int a) { + return __builtin_riscv_sm3p1(a); +} From 5f1858428127267ff63c78c49ca6241674c4d72d Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 21:53:49 -0500 Subject: [PATCH 121/538] [lldb] Fix CommandObjects that don't set a return status (#196588) Several CommandObject subclasses had DoExecute paths that returned without ever calling SetStatus on the CommandReturnObject. The status was silently left at its initial eReturnStatusStarted value, which made Succeeded() report false for what were really successful commands and left CommandReturnObject in an undefined state. 
--- lldb/source/API/SBCommandInterpreter.cpp | 8 +++++++- lldb/source/Commands/CommandObjectBreakpoint.cpp | 1 + lldb/source/Commands/CommandObjectCommands.cpp | 3 +++ lldb/source/Commands/CommandObjectExpression.cpp | 4 ++++ lldb/source/Commands/CommandObjectFrame.cpp | 2 ++ lldb/source/Commands/CommandObjectHelp.cpp | 1 + lldb/source/Commands/CommandObjectLog.cpp | 3 ++- lldb/source/Commands/CommandObjectMemory.cpp | 3 +++ lldb/source/Commands/CommandObjectPlatform.cpp | 1 + lldb/source/Commands/CommandObjectRegister.cpp | 2 ++ lldb/source/Commands/CommandObjectScripting.cpp | 2 ++ lldb/source/Commands/CommandObjectSession.cpp | 1 + lldb/source/Commands/CommandObjectSettings.cpp | 3 +++ lldb/source/Commands/CommandObjectSource.cpp | 2 ++ lldb/source/Commands/CommandObjectTarget.cpp | 7 +++++++ lldb/source/Commands/CommandObjectThread.cpp | 1 + lldb/source/Commands/CommandObjectType.cpp | 6 ++++-- 17 files changed, 46 insertions(+), 4 deletions(-) diff --git a/lldb/source/API/SBCommandInterpreter.cpp b/lldb/source/API/SBCommandInterpreter.cpp index 4c1cddc21b972..a1593d67ffd6e 100644 --- a/lldb/source/API/SBCommandInterpreter.cpp +++ b/lldb/source/API/SBCommandInterpreter.cpp @@ -75,7 +75,13 @@ class CommandPluginInterfaceImplementation : public CommandObjectParsed { SBCommandReturnObject sb_return(result); SBCommandInterpreter sb_interpreter(&m_interpreter); SBDebugger debugger_sb(m_interpreter.GetDebugger().shared_from_this()); - m_backend->DoExecute(debugger_sb, command.GetArgumentVector(), sb_return); + bool success = m_backend->DoExecute(debugger_sb, + command.GetArgumentVector(), sb_return); + // If the plugin command did not set its own status, infer it from the + // boolean return value so that callers always see a defined status. + if (result.GetStatus() == eReturnStatusInvalid) + result.SetStatus(success ? 
eReturnStatusSuccessFinishResult + : eReturnStatusFailed); } lldb::SBCommandPluginInterface *m_backend; std::optional m_auto_repeat_command; diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index bb71055ebb3cb..c462a4875b127 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -2959,6 +2959,7 @@ class CommandObjectBreakpointNameConfigure : public CommandObjectParsed { m_bp_opts.GetBreakpointOptions(), m_access_options.GetPermissions()); } + result.SetStatus(eReturnStatusSuccessFinishNoResult); } private: diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index 1f4783c26be8e..84e661ec01f53 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -868,6 +868,7 @@ a number follows 'f':" if (error.Success()) { AddRegexCommandToInterpreter(); + result.SetStatus(eReturnStatusSuccessFinishNoResult); } } if (error.Fail()) { @@ -2498,6 +2499,8 @@ class CommandObjectCommandsScriptAdd : public CommandObjectParsed, if (m_options.m_class_name.empty() && m_options.m_funct_name.empty()) { m_interpreter.GetPythonCommandsFromIOHandler(" ", // Prompt *this); // IOHandlerDelegate + // Still gathering input; the IOHandler will set the final status. + result.SetStatus(eReturnStatusStarted); return; } diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index 0b86c329572ee..76cb60c7e4804 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ b/lldb/source/Commands/CommandObjectExpression.cpp @@ -593,6 +593,8 @@ void CommandObjectExpression::DoExecute(llvm::StringRef command, if (command.empty()) { GetMultilineExpression(); + // Still gathering input; the IOHandler will set the final status. 
+ result.SetStatus(eReturnStatusStarted); return; } @@ -660,6 +662,8 @@ void CommandObjectExpression::DoExecute(llvm::StringRef command, // No expression following options else if (expr.empty()) { GetMultilineExpression(); + // Still gathering input; the IOHandler will set the final status. + result.SetStatus(eReturnStatusStarted); return; } } diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index b1cc6c42a04fc..f039eb6be6eaa 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -176,6 +176,8 @@ class CommandObjectFrameDiagnose : public CommandObjectParsed { ValueObjectPrinter printer(*valobj_sp, &result.GetOutputStream(), options); if (llvm::Error error = printer.PrintValueObject()) result.AppendError(toString(std::move(error))); + else + result.SetStatus(eReturnStatusSuccessFinishResult); } CommandOptions m_options; diff --git a/lldb/source/Commands/CommandObjectHelp.cpp b/lldb/source/Commands/CommandObjectHelp.cpp index f1dbd03fe97cb..a29ded846b100 100644 --- a/lldb/source/Commands/CommandObjectHelp.cpp +++ b/lldb/source/Commands/CommandObjectHelp.cpp @@ -168,6 +168,7 @@ void CommandObjectHelp::DoExecute(Args &command, CommandReturnObject &result) { for (size_t i = 0; i < match_count; i++) { output_strm.Printf("\t%s\n", matches.GetStringAtIndex(i)); } + result.SetStatus(eReturnStatusSuccessFinishNoResult); } else { // Maybe the user is asking for help about a command argument rather than // a command. 
diff --git a/lldb/source/Commands/CommandObjectLog.cpp b/lldb/source/Commands/CommandObjectLog.cpp index b61627f4d4d55..e51a85e9f0308 100644 --- a/lldb/source/Commands/CommandObjectLog.cpp +++ b/lldb/source/Commands/CommandObjectLog.cpp @@ -273,7 +273,8 @@ class CommandObjectLogDisable : public CommandObjectParsed { if (Log::DisableLogChannel(channel, args.GetArgumentArrayRef(), error_stream)) result.SetStatus(eReturnStatusSuccessFinishNoResult); - result.GetErrorStream() << error; + else + result.AppendError(error); } } }; diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp index ac91b63e378a8..9164e209b6bc2 100644 --- a/lldb/source/Commands/CommandObjectMemory.cpp +++ b/lldb/source/Commands/CommandObjectMemory.cpp @@ -769,6 +769,7 @@ class CommandObjectMemoryRead : public CommandObjectParsed { result.GetOutputStream().Printf( "%zi bytes %s to '%s'\n", bytes_written, append ? "appended" : "written", path.c_str()); + result.SetStatus(eReturnStatusSuccessFinishResult); return; } else { result.AppendErrorWithFormat("Failed to write %" PRIu64 @@ -820,6 +821,7 @@ class CommandObjectMemoryRead : public CommandObjectParsed { return; } } + result.SetStatus(eReturnStatusSuccessFinishResult); return; } @@ -1510,6 +1512,7 @@ class CommandObjectMemoryWrite : public CommandObjectParsed { return; } } + result.SetStatus(eReturnStatusSuccessFinishNoResult); } OptionGroupOptions m_option_group; diff --git a/lldb/source/Commands/CommandObjectPlatform.cpp b/lldb/source/Commands/CommandObjectPlatform.cpp index 37ae2899f79b3..52b571acd13ee 100644 --- a/lldb/source/Commands/CommandObjectPlatform.cpp +++ b/lldb/source/Commands/CommandObjectPlatform.cpp @@ -1300,6 +1300,7 @@ class CommandObjectPlatformProcessList : public CommandObjectParsed { ostrm, platform_sp->GetUserIDResolver(), m_options.show_args, m_options.verbose); } + result.SetStatus(eReturnStatusSuccessFinishResult); } } } else { diff --git 
a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp index 29d1cd6dc13e4..c86fd11d4d9e3 100644 --- a/lldb/source/Commands/CommandObjectRegister.cpp +++ b/lldb/source/Commands/CommandObjectRegister.cpp @@ -220,6 +220,8 @@ class CommandObjectRegisterRead : public CommandObjectParsed { } } } + if (result.GetStatus() != eReturnStatusFailed) + result.SetStatus(eReturnStatusSuccessFinishResult); } class CommandOptions : public OptionGroup { diff --git a/lldb/source/Commands/CommandObjectScripting.cpp b/lldb/source/Commands/CommandObjectScripting.cpp index 1f8ee0a9554ec..21400a62d697f 100644 --- a/lldb/source/Commands/CommandObjectScripting.cpp +++ b/lldb/source/Commands/CommandObjectScripting.cpp @@ -229,6 +229,8 @@ class CommandObjectScriptingExtensionList : public CommandObjectParsed { if (!num_listed_interface) s << " None\n"; + + result.SetStatus(eReturnStatusSuccessFinishResult); } private: diff --git a/lldb/source/Commands/CommandObjectSession.cpp b/lldb/source/Commands/CommandObjectSession.cpp index ac7eec5e04f0a..586e0f405920f 100644 --- a/lldb/source/Commands/CommandObjectSession.cpp +++ b/lldb/source/Commands/CommandObjectSession.cpp @@ -179,6 +179,7 @@ class CommandObjectSessionHistory : public CommandObjectParsed { } history.Dump(result.GetOutputStream(), start_idx.second, stop_idx.second); + result.SetStatus(lldb::eReturnStatusSuccessFinishResult); } } } diff --git a/lldb/source/Commands/CommandObjectSettings.cpp b/lldb/source/Commands/CommandObjectSettings.cpp index 34a59d506da7f..b09c65982a277 100644 --- a/lldb/source/Commands/CommandObjectSettings.cpp +++ b/lldb/source/Commands/CommandObjectSettings.cpp @@ -409,6 +409,7 @@ class CommandObjectSettingsWrite : public CommandObjectParsed { if (args.empty()) { GetDebugger().DumpAllPropertyValues(&clean_ctx, out_file, OptionValue::eDumpGroupExport); + result.SetStatus(eReturnStatusSuccessFinishNoResult); return; } @@ -419,6 +420,8 @@ class CommandObjectSettingsWrite 
: public CommandObjectParsed { result.AppendError(error.AsCString()); } } + if (result.GetStatus() != eReturnStatusFailed) + result.SetStatus(eReturnStatusSuccessFinishNoResult); } private: diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index b2896e02264f7..0d131bcd626aa 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -1199,6 +1199,8 @@ class CommandObjectSourceList : public CommandObjectParsed { } } } + if (result.GetStatus() != eReturnStatusFailed) + result.SetStatus(eReturnStatusSuccessFinishResult); } const SymbolContextList *GetBreakpointLocations() { diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index eb71e77447281..40da1418e8d7b 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -305,6 +305,8 @@ class CommandObjectTargetCreate : public CommandObjectParsed { if (!label.empty()) { if (auto E = target_sp->SetLabel(label)) result.SetError(std::move(E)); + else + result.SetStatus(eReturnStatusSuccessFinishNoResult); return; } @@ -989,6 +991,8 @@ class CommandObjectTargetVariable : public CommandObjectParsed { m_interpreter.PrintWarningsIfNecessary(result.GetOutputStream(), m_cmd_name); + if (result.GetStatus() != eReturnStatusFailed) + result.SetStatus(eReturnStatusSuccessFinishResult); } OptionGroupOptions m_option_group; @@ -3104,6 +3108,8 @@ class CommandObjectTargetModulesLoad result.AppendError("either the \"--file \" or the \"--uuid " "\" option must be specified.\n"); } + if (result.GetStatus() != eReturnStatusFailed) + result.SetStatus(eReturnStatusSuccessFinishResult); } OptionGroupOptions m_option_group; @@ -3772,6 +3778,7 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); } + result.SetStatus(eReturnStatusSuccessFinishResult); } CommandOptions 
m_options; diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index c51cc837dc47b..89b2e14e09c5b 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -782,6 +782,7 @@ class CommandObjectThreadStepWithTypeAndScope : public CommandObjectParsed { if (!error.Success()) { result.AppendMessage(error.AsCString()); + result.SetStatus(eReturnStatusFailed); return; } diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp index a1a593ddfac63..9b3d4428272cd 100644 --- a/lldb/source/Commands/CommandObjectType.cpp +++ b/lldb/source/Commands/CommandObjectType.cpp @@ -1559,10 +1559,12 @@ void CommandObjectTypeSummaryAdd::DoExecute(Args &command, #else result.AppendError("python is disabled"); #endif - return; + } else { + Execute_StringSummary(command, result); } - Execute_StringSummary(command, result); + if (result.GetStatus() != eReturnStatusFailed) + result.SetStatus(eReturnStatusSuccessFinishResult); } static bool FixArrayTypeNameWithRegex(ConstString &type_name) { From e4d58800d9990d93d57e32c2c6d9895a466c8c82 Mon Sep 17 00:00:00 2001 From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com> Date: Sat, 9 May 2026 11:18:13 +0800 Subject: [PATCH 122/538] [AMDGPU] Support atomic load and store for vector float types (v2f16, v2i16, v4i16, v4f16, v2f32) (#192904) Add support for atomic load and store on <2 x half>, <4 x half>, and <2 x float> vector types in the AMDGPU backend. 
These types are promoted to equivalently sized integer types before instruction selection: <2 x half> -> i32 <4 x half> -> i64 <2 x i16> -> i32 <4 x i16> -> i64 <2 x float> -> i64 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 + llvm/lib/Target/AMDGPU/DSInstructions.td | 6 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 8 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 20 + .../inst-select-load-atomic-flat.mir | 60 +- .../inst-select-load-atomic-global.mir | 48 +- .../inst-select-load-atomic-local.mir | 72 +- .../inst-select-store-atomic-flat.mir | 24 +- .../inst-select-store-atomic-local.mir | 74 +- llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll | 561 ++++++++ .../test/CodeGen/AMDGPU/load-atomic-global.ll | 1246 +++++++++++++++++ llvm/test/CodeGen/AMDGPU/load-atomic-local.ll | 1204 ++++++++++++++++ llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll | 198 +++ .../CodeGen/AMDGPU/store-atomic-global.ll | 438 ++++++ .../test/CodeGen/AMDGPU/store-atomic-local.ll | 429 ++++++ 15 files changed, 4274 insertions(+), 120 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll create mode 100644 llvm/test/CodeGen/AMDGPU/load-atomic-global.ll create mode 100644 llvm/test/CodeGen/AMDGPU/load-atomic-local.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-atomic-global.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-atomic-local.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ee41e5bf73ae2..058ca9c29625f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -168,6 +168,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote); AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16); + setOperationAction(ISD::ATOMIC_LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f32, 
MVT::i64); + setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote); AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32); @@ -180,6 +183,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote); AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16); + setOperationAction(ISD::ATOMIC_STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f32, MVT::i64); + // There are no 64-bit extloads. These should be done as a 32-bit extload and // an extension to 64-bit. for (MVT VT : MVT::integer_valuetypes()) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 48a8d2325af33..204cd89d4aefb 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1004,7 +1004,10 @@ defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; +defm : DSReadPat_mc ; defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Atomic loads @@ -1067,7 +1070,10 @@ defm : DSWritePat_mc ; defm : DSWritePat_t16 ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; +defm : DSWritePat_mc ; defm : DSWritePat_mc ; +defm : DSWritePat_mc ; +defm : DSWritePat_mc ; let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat ; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c0fb73df9c764..a0963fcd5ce55 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -2101,7 +2101,9 @@ let OtherPredicates = [HasFlatAddressSpace], True16Predicate = UseRealTrue16Inst } defm : FlatLoadPats ; +defm : FlatLoadPats ; defm : FlatLoadPats ; +defm : FlatLoadPats ; defm : FlatLoadPats ; defm : FlatLoadPats ; @@ -2126,8 +2128,10 @@ defm : FlatStorePats ; } defm : FlatStorePats ; +defm : FlatStorePats ; defm : FlatStorePats ; defm : 
FlatStorePats ; +defm : FlatStorePats ; defm : FlatStorePats ; defm : FlatStorePats ; defm : FlatStorePats ; @@ -2281,8 +2285,10 @@ defm : GlobalFLATStorePats ; // the memory legalizer will set the cache bits and insert the // appropriate waits. defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATStorePats ; @@ -2328,7 +2334,9 @@ defm : GlobalFLATLoadPats_D16 defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 20599228beea8..95b214f0da4c8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -705,6 +705,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f16, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); + setOperationAction(ISD::ATOMIC_LOAD, MVT::v2i16, Promote); + AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2i16, MVT::i32); + setOperationAction(ISD::ATOMIC_LOAD, MVT::v2f16, Promote); + AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::ATOMIC_STORE, MVT::v2i16, Promote); + AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2i16, MVT::i32); + setOperationAction(ISD::ATOMIC_STORE, MVT::v2f16, Promote); + AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f16, MVT::i32); + setOperationAction(ISD::AND, MVT::v2i16, Promote); AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); setOperationAction(ISD::OR, MVT::v2i16, Promote); @@ -719,6 +729,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v4bf16, Promote); AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32); + 
setOperationAction(ISD::ATOMIC_LOAD, MVT::v4i16, Promote); + AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4i16, MVT::i64); + setOperationAction(ISD::ATOMIC_LOAD, MVT::v4f16, Promote); + AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4f16, MVT::i64); + + setOperationAction(ISD::ATOMIC_STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4i16, MVT::i64); + setOperationAction(ISD::ATOMIC_STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4f16, MVT::i64); + setOperationAction(ISD::STORE, MVT::v4i16, Promote); AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v4f16, Promote); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir index 5bfb2b2e4d578..eebf5fac2d9d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -70,37 +70,37 @@ body: | ; GFX7-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) - ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>)) + ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; ; GFX9-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) - ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: 
[[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>)) + ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; ; GFX10-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) - ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>)) + ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; ; GFX11-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) - ; GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>)) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; ; GFX12-LABEL: name: load_atomic_flat_v2s16_seq_cst ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>)) - ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>)) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 
:: (load seq_cst (<2 x s16>), align 4, addrspace 0) $vgpr0 = COPY %1 @@ -274,37 +274,37 @@ body: | ; GFX7-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>)) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX9-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>)) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX10-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>)) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX11-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} - 
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>)) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX12-LABEL: name: load_atomic_flat_v4s16_seq_cst ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>)) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst (<4 x s16>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir index 474f1308d8e24..3a79370b750ba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -89,30 +89,30 @@ body: | ; GFX7-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s16>), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; ; GFX9-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s16>), addrspace 1) + ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; ; GFX10-LABEL: name: load_atomic_global_v2s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s16>), 
addrspace 1) + ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load seq_cst (<2 x s16>), align 4, addrspace 1) $vgpr0 = COPY %1 @@ -303,30 +303,30 @@ body: | ; GFX7-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>), addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<4 x s16>), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX9-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 
[[COPY]], 0, 0, implicit $exec :: (load seq_cst (<4 x s16>), addrspace 1) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; ; GFX10-LABEL: name: load_atomic_global_v4s16_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<4 x s16>), addrspace 1) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst (<4 x s16>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir index 5c2df3904b817..ffaa84d3ca700 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir @@ -24,6 +24,7 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (s32), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] + ; ; GFX7-LABEL: name: load_atomic_local_s32_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -31,6 +32,7 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (s32), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] + ; ; GFX9-LABEL: name: load_atomic_local_s32_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -57,23 +59,25 @@ body: | ; GFX6-LABEL: name: 
load_atomic_local_v2s16_seq_cst ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<2 x s16>), addrspace 3) - ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (<2 x s16>), addrspace 3) + ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] + ; ; GFX7-LABEL: name: load_atomic_local_v2s16_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<2 x s16>), addrspace 3) - ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (<2 x s16>), addrspace 3) + ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] + ; ; GFX9-LABEL: name: load_atomic_local_v2s16_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<2 x s16>), addrspace 3) - ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s16>), addrspace 3) + ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load seq_cst (<2 x s16>), align 4, addrspace 3) $vgpr0 = COPY %1 @@ -98,6 +102,7 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p3) 
:: (load seq_cst (p3), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX7-LABEL: name: load_atomic_local_p3_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -105,6 +110,7 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p3) :: (load seq_cst (p3), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; ; GFX9-LABEL: name: load_atomic_local_p3_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -135,6 +141,7 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (s64), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] + ; ; GFX7-LABEL: name: load_atomic_local_s64_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -142,6 +149,7 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (s64), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] + ; ; GFX9-LABEL: name: load_atomic_local_s64_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -168,23 +176,25 @@ body: | ; GFX6-LABEL: name: load_atomic_local_v2s32_seq_cst ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<2 x s32>), addrspace 3) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 3) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] + ; ; GFX7-LABEL: name: load_atomic_local_v2s32_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<2 x s32>), addrspace 3) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 3) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] + ; ; GFX9-LABEL: name: load_atomic_local_v2s32_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<2 x s32>), addrspace 3) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -205,23 +215,25 @@ body: | ; GFX6-LABEL: name: load_atomic_local_v4s16_seq_cst ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<4 x s16>), addrspace 3) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (<4 x s16>), addrspace 3) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] + ; ; GFX7-LABEL: name: load_atomic_local_v4s16_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = 
COPY $vgpr0 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<4 x s16>), addrspace 3) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (<4 x s16>), addrspace 3) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] + ; ; GFX9-LABEL: name: load_atomic_local_v4s16_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load seq_cst (<4 x s16>), addrspace 3) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<4 x s16>), addrspace 3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst (<4 x s16>), align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -246,6 +258,7 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load seq_cst (p1), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX7-LABEL: name: load_atomic_local_p1_seq_cst ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -253,6 +266,7 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load seq_cst (p1), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; ; GFX9-LABEL: name: load_atomic_local_p1_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -283,6 +297,7 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p3) :: (load seq_cst (p0), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX7-LABEL: name: load_atomic_local_p0_seq_cst ; 
GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -290,6 +305,7 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p3) :: (load seq_cst (p0), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; ; GFX9-LABEL: name: load_atomic_local_p0_seq_cst ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -322,6 +338,7 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load seq_cst (s32), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] + ; ; GFX7-LABEL: name: load_atomic_local_s32_seq_cst_gep_65535 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -329,6 +346,7 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load seq_cst (s32), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] + ; ; GFX9-LABEL: name: load_atomic_local_s32_seq_cst_gep_65535 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir index ae010a872a41d..3feb2698fed4a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir @@ -49,16 +49,16 @@ body: | ; GFX7-LABEL: name: atomic_store_flat_v2s16_seq_cst ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 - ; GFX7-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p0) :: (store seq_cst (<2 x s16>)) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 
(<2 x s16>)) ; ; GFX9-LABEL: name: atomic_store_flat_v2s16_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 - ; GFX9-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p0) :: (store seq_cst (<2 x s16>)) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (<2 x s16>)) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p0) = COPY $vgpr1_vgpr2 G_STORE %0, %1 :: (store seq_cst (<2 x s16>), align 4, addrspace 0) @@ -229,16 +229,16 @@ body: | ; GFX7-LABEL: name: atomic_store_flat_v4s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p0) :: (store seq_cst (<4 x s16>)) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (<4 x s16>)) ; ; GFX9-LABEL: name: atomic_store_flat_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p0) :: (store seq_cst (<4 x s16>)) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (<4 x s16>)) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p0) = COPY $vgpr2_vgpr3 G_STORE %0, %1 :: 
(store seq_cst (<4 x s16>), align 8, addrspace 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir index d290f1b2403e4..2e39861060716 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir @@ -24,6 +24,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (s32), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_s32_seq_cst ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7-NEXT: {{ $}} @@ -31,6 +32,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (s32), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_s32_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -57,23 +59,25 @@ body: | ; GFX6-LABEL: name: atomic_store_local_v2s16_seq_cst ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p3) :: (store seq_cst (<2 x s16>), addrspace 3) + ; GFX6-NEXT: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (<2 x s16>), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_v2s16_seq_cst ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p3) :: (store seq_cst (<2 x s16>), addrspace 3) + ; GFX7-NEXT: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (<2 x s16>), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_v2s16_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 - ; GFX9-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p3) :: (store seq_cst (<2 x s16>), addrspace 3) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store seq_cst (<2 x s16>), addrspace 3) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p3) = COPY $vgpr1 G_STORE %0, %1 :: (store seq_cst (<2 x s16>), align 4, addrspace 3) @@ -98,6 +102,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: G_STORE [[COPY]](p3), [[COPY1]](p3) :: (store seq_cst (p3), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_p3_seq_cst ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7-NEXT: {{ $}} @@ -105,6 +110,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: G_STORE [[COPY]](p3), [[COPY1]](p3) :: (store seq_cst (p3), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_p3_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -135,6 +141,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p3) :: (store seq_cst (p5), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_p5_seq_cst ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7-NEXT: {{ $}} @@ -142,6 +149,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 ; GFX7-NEXT: 
$m0 = S_MOV_B32 -1 ; GFX7-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p3) :: (store seq_cst (p5), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_p5_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -172,6 +180,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: G_STORE [[COPY]](p6), [[COPY1]](p3) :: (store seq_cst (p6), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_p6_seq_cst ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7-NEXT: {{ $}} @@ -179,6 +188,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: G_STORE [[COPY]](p6), [[COPY1]](p3) :: (store seq_cst (p6), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_p6_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -209,6 +219,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (s64), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_s64_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -216,6 +227,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (s64), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -242,23 +254,25 @@ body: | ; GFX6-LABEL: name: atomic_store_local_v2s32_seq_cst ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: 
(store seq_cst (<2 x s32>), addrspace 3) + ; GFX6-NEXT: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (<2 x s32>), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store seq_cst (<2 x s32>), addrspace 3) + ; GFX7-NEXT: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (<2 x s32>), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX9-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store seq_cst (<2 x s32>), addrspace 3) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store seq_cst (<2 x s32>), addrspace 3) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store seq_cst (<2 x s32>), align 8, addrspace 3) @@ -279,23 +293,25 @@ body: | ; GFX6-LABEL: name: atomic_store_local_v4s16_seq_cst ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store seq_cst (<4 x 
s16>), addrspace 3) + ; GFX6-NEXT: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (<4 x s16>), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_v4s16_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store seq_cst (<4 x s16>), addrspace 3) + ; GFX7-NEXT: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store seq_cst (<4 x s16>), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX9-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store seq_cst (<4 x s16>), addrspace 3) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store seq_cst (<4 x s16>), addrspace 3) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store seq_cst (<4 x s16>), align 8, addrspace 3) @@ -320,6 +336,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p3) :: (store seq_cst (p0), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_p0_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -327,6 +344,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p3) :: (store seq_cst (p0), addrspace 3) + ; ; 
GFX9-LABEL: name: atomic_store_local_p0_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -356,6 +374,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store seq_cst (p1), addrspace 3) + ; ; GFX7-LABEL: name: atomic_store_local_p1_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -363,6 +382,7 @@ body: | ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store seq_cst (p1), addrspace 3) + ; ; GFX9-LABEL: name: atomic_store_local_p1_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll b/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll new file mode 100644 index 0000000000000..6a9062939d778 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll @@ -0,0 +1,561 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define 
amdgpu_cs void @atomic_load_f32x2_monotonic_agent(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(0) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_monotonic_agent(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dword v0, v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: 
atomic_load_f16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_load_dword v0, v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(0) %p syncscope("agent") monotonic, align 4 + %num1 = extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> 
%a0, i32 1 + %res = fadd half %num1, %num2 + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dword v0, v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_load_dword v0, v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: +; 
GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(0) %p syncscope("agent") monotonic, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_monotonic_agent(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 
v0.h, v1.l, v1.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: global_store_b16 
v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(0) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_monotonic_agent(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v4 +; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] glc +; 
GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(0) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + %add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_min(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: 
atomic_load_f32x2_monotonic_agent_offset_min: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_min: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_min: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_min: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-4096 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(0) %p, i64 -4096 + %a0 = load atomic <2 x float>, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_max(ptr addrspace(0) %p, 
ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:4095 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] offset:4095 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:4095 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(0) %p, i64 4095 + %a0 = load atomic <2 x float>, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_min(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_min: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 
+; GFX9-NEXT: flat_load_dword v0, v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_min: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_min: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_min: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_min: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:-4096 scope:SCOPE_DEV +; 
GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_min: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:-4096 scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(0) %p, i64 -4096 + %a = load atomic <2 x i16>, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspace(0) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_load_dword v0, v[0:1] offset:4095 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: 
flat_load_b32 v0, v[0:1] offset:4095 glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:4095 glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:4095 scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:4095 scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(0) %p, i64 4095 + %a = load atomic <2 x i16>, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX10-GISEL: {{.*}} +; GFX10-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll b/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll new file mode 100644 index 0000000000000..7e4b3e010c45e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll @@ -0,0 +1,1246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: 
global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; 
GFX12-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 
1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: 
+; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4 + %num1 = extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> %a0, i32 1 + %res = fadd half %num1, %num2 + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl1_inv +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; 
GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl1_inv +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 4 + %num1 = extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> %a0, i32 1 + %res = fadd half %num1, %num2 + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: 
atomic_load_f16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 4 + %num1 = extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> %a0, i32 1 + %res = fadd half 
%num1, %num2 + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: 
global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl1_inv +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-GISEL-NEXT: buffer_gl1_inv +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_monotonic_agent(ptr addrspace(1) %p, ptr 
addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(1) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl1_inv +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl1_inv +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: 
v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_monotonic_wavefront(ptr addrspace(1) %p, ptr 
addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v4 +; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 
v0.l, v1.l, v1.h, v0.l +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(1) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + %add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; 
GFX10-NEXT: v_add_nc_u16 v0, v0, v4 +; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl1_inv +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl1_inv +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + %add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v4 +; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + 
%add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_1(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:1 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:1 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:1 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 1 + %a0 = load atomic <2 x float>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_max(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:4095 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:2047 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: global_store_dword v[2:3], v0, off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: global_store_dword v[2:3], v0, off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4095 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4095 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4095 + %a0 = load atomic <2 x float>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + 
%res = fadd float %num1, %num2 + store float %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_1(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:1 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:1 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:1 glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:1 glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:1 scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: 
s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:1 scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 1 + %a = load atomic <2 x i16>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspace(1) %p, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4095 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: global_load_dword v0, v[0:1], off offset:2047 glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-SDAG-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-SDAG-NEXT: global_store_short v[2:3], v0, off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; 
GFX10-GISEL-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:4095 glc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:4095 glc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:4095 scope:SCOPE_DEV +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:4095 scope:SCOPE_DEV +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-GISEL-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4095 + %a = load atomic <2 x i16>, ptr addrspace(1) %gep syncscope("agent") 
monotonic, align 8 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(1) %out, align 4 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/load-atomic-local.ll b/llvm/test/CodeGen/AMDGPU/load-atomic-local.ll new file mode 100644 index 0000000000000..3bd6d043bab45 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-atomic-local.ll @@ -0,0 +1,1204 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: 
atomic_load_f32x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b64 v[2:3], v0 +; 
GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f16x2_monotonic_agent: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("agent") monotonic, align 4 + %num1 
= extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> %a0, i32 1 + %res = fadd half %num1, %num2 + store half %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; 
GFX12-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 4 + %num1 = extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> %a0, i32 1 + %res = fadd half %num1, %num2 + store half %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 4 + %num1 = extractelement <2 x half> %a0, i32 0 + %num2 = extractelement <2 x half> %a0, i32 1 + %res = fadd half %num1, %num2 + store half %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; 
GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("agent") monotonic, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 
16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: 
atomic_load_i16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a = load atomic <2 x i16>, ptr addrspace(3) %p 
syncscope("wavefront") monotonic, align 4 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 
+; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h +; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f16x4_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h +; GFX12-SDAG-NEXT: 
v_mul_f16_e32 v0.h, v3.l, v3.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l +; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8 + %num1 = extractelement <4 x half> %a0, i32 0 + %num2 = extractelement <4 x half> %a0, i32 1 + %num3 = extractelement <4 x half> %a0, i32 2 + %num4 = extractelement <4 x half> %a0, i32 3 + %add = fadd half %num1, %num2 + %mul = fmul half %num3, %num4 + %res = fadd half %add, %mul + store half %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_add_nc_u16 v0, v2, v0 +; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; 
GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + %add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_add_nc_u16 v0, v2, v0 +; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: buffer_gl0_inv +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: buffer_gl0_inv +; GFX11-GISEL-NEXT: 
v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + %add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x4_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_add_nc_u16 v0, v2, v0 +; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l +; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; 
GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l +; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8 + %num1 = extractelement <4 x i16> %a0, i32 0 + %num2 = extractelement <4 x i16> %a0, i32 1 + %num3 = extractelement <4 x i16> %a0, i32 2 + %num4 = extractelement <4 x i16> %a0, i32 3 + %add = add i16 %num1, %num2 + %mul = mul i16 %num3, %num4 + %res = add i16 %add, %mul + store i16 %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_1(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 offset:1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 offset:1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_load_b64 v[2:3], v0 offset:1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b64 v[2:3], v0 offset:1 +; 
GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 1 + %a0 = load atomic <2 x float>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_max(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b64 v[2:3], v0 offset:4095 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b64 v[2:3], v0 offset:4095 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_load_b64 v[2:3], v0 offset:4095 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b64 v[2:3], v0 offset:4095 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 4095 + %a0 = load atomic <2 x float>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 + %num1 = extractelement <2 x float> %a0, i32 0 + %num2 = extractelement <2 x float> %a0, i32 1 + %res = fadd float %num1, %num2 + store float %res, ptr addrspace(3) %out, align 4 + 
ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_1(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 offset:1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 offset:1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 offset:1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 offset:1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 offset:1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 offset:1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 1 + %a = load atomic <2 x i16>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspace(3) %p, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_read_b32 v0, v0 offset:4095 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_read_b32 v0, v0 offset:4095 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 offset:4095 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 offset:4095 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: 
atomic_load_i16x2_monotonic_agent_offset_max: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 offset:4095 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 offset:4095 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 4095 + %a = load atomic <2 x i16>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 + %e0 = extractelement <2 x i16> %a, i32 0 + %e1 = extractelement <2 x i16> %a, i32 1 + %sum = add i16 %e0, %e1 + store i16 %sum, ptr addrspace(3) %out, align 4 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX10-GISEL: {{.*}} +; GFX10-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll b/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll new file mode 100644 index 0000000000000..c2f3128162ab8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_cs void @atomic_store_f32x2_monotonic_agent(<2 x float> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(0) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f16x2_monotonic_agent(<2 x half> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_store_dword v[1:2], v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_store_dword v[1:2], v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_store_b32 v[1:2], v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b32 v[1:2], v0 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(0) %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_i16x2_monotonic_agent(<2 x i16> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_store_dword v[1:2], v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_store_dword v[1:2], v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_store_b32 v[1:2], v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b32 v[1:2], v0 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(0) %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x4_monotonic_agent(<4 x half> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; 
GFX10-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(0) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent(<4 x i16> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <4 x i16> %in, ptr addrspace(0) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_min(<4 x i16> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff000, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; 
GFX11-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:-4096 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(0) %out, i64 -4096 + store atomic <4 x i16> %in, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_max(<4 x i16> %in, ptr addrspace(0) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] offset:4088 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0xff8, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] offset:4088 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:4088 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(0) %out, i64 4088 + store atomic <4 x i16> %in, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX10-GISEL: {{.*}} +; GFX10-SDAG: {{.*}} +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll b/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll new file mode 100644 index 0000000000000..60f8194ea698a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll @@ -0,0 +1,438 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_cs void @atomic_store_f32x2_monotonic_agent(<2 x float> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f32x2_seq_cst_agent(<2 x float> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f32x2_seq_cst_wavefront(<2 x float> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 8 + ret void +} + +define amdgpu_cs void 
@atomic_store_f16x2_monotonic_agent(<2 x half> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x2_seq_cst_agent(<2 x half> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x2_monotonic_wavefront(<2 x half> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: 
atomic_store_f16x2_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_i16x2_monotonic_agent(<2 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_i16x2_seq_cst_agent(<2 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 + ret void +} + +define 
amdgpu_cs void @atomic_store_i16x2_monotonic_wavefront(<2 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x4_monotonic_agent(<4 x half> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f16x4_seq_cst_agent(<4 x half> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; 
GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f16x4_monotonic_wavefront(<4 x half> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent(<4 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store 
atomic <4 x i16> %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_seq_cst_agent(<4 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + store atomic <4 x i16> %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_wavefront(<4 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm + store atomic <4 x i16> %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_min(<4 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:-4096 +; 
GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off offset:-4096 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent_offset_min: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off offset:-4096 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(1) %out, i64 -4096 + store atomic <4 x i16> %in, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_max(<4 x i16> %in, ptr addrspace(1) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:4088 +; GFX9-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v2 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off offset:2040 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xff8, v2 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off offset:4088 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX12: 
; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off offset:4088 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4088 + store atomic <4 x i16> %in, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/store-atomic-local.ll b/llvm/test/CodeGen/AMDGPU/store-atomic-local.ll new file mode 100644 index 0000000000000..0d2f48d886f2d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-atomic-local.ll @@ -0,0 +1,429 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_cs void @atomic_store_f32x2_monotonic_agent(<2 x float> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, 
v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f32x2_seq_cst_agent(<2 x float> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f32x2_seq_cst_wavefront(<2 x float> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f32x2_seq_cst_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <2 x float> %in, ptr addrspace(3) %out 
syncscope("wavefront") seq_cst, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f16x2_monotonic_agent(<2 x half> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x2_seq_cst_agent(<2 x half> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x2_monotonic_wavefront(<2 x half> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b32 v1, v0 +; 
GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x2_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store atomic <2 x half> %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_i16x2_monotonic_agent(<2 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_i16x2_seq_cst_agent(<2 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_i16x2_monotonic_wavefront(<2 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: 
atomic_store_i16x2_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b32 v1, v0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x2_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store atomic <2 x i16> %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_cs void @atomic_store_f16x4_monotonic_agent(<4 x half> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_f16x4_seq_cst_agent(<4 x half> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 8 + ret void +} + +define 
amdgpu_cs void @atomic_store_f16x4_monotonic_wavefront(<4 x half> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f16x4_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <4 x half> %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent(<4 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <4 x i16> %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_seq_cst_agent(<4 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: 
s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_seq_cst_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <4 x i16> %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_wavefront(<4 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_wavefront: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store atomic <4 x i16> %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_1(<4 x i16> %in, ptr addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent_offset_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent_offset_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] offset:1 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] offset:1 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent_offset_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] offset:1 +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(3) %out, i64 1 + store atomic <4 x i16> %in, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 + ret void +} + +define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_max(<4 x i16> %in, ptr 
addrspace(3) %out) { +; GFX9-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX9: ; %bb.0: +; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:4088 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX10: ; %bb.0: +; GFX10-NEXT: ds_write_b64 v2, v[0:1] offset:4088 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: ds_store_b64 v2, v[0:1] offset:4088 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i16x4_monotonic_agent_offset_max: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] offset:4088 +; GFX12-NEXT: s_endpgm + %gep = getelementptr inbounds i8, ptr addrspace(3) %out, i64 4088 + store atomic <4 x i16> %in, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX10-GISEL: {{.*}} +; GFX10-SDAG: {{.*}} +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} From 0ad1bc96429863fe9fa65706df9a86cec649bf60 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 8 May 2026 23:02:17 -0500 Subject: [PATCH 123/538] Revert "[lldb] Handle SIGINT via the MainLoop signal thread (on POSIX)" (#196684) Reverts llvm/llvm-project#195959 because it caused `TestIOHandlerCompletion.py` to fail in CI (GreenDragon). 
--- lldb/tools/driver/Driver.cpp | 60 +++--------------------------------- 1 file changed, 4 insertions(+), 56 deletions(-) diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index e58286f9ff41e..d47d3daf1c3fc 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -43,9 +43,6 @@ #include #include #include -#ifndef _WIN32 -#include -#endif #include #include #include @@ -654,10 +651,11 @@ void Driver::UpdateWindowSize() { } } -#ifdef _WIN32 void sigint_handler(int signo) { +#ifdef _WIN32 // Restore handler as it is not persistent on Windows. signal(SIGINT, sigint_handler); +#endif static std::atomic_flag g_interrupt_sent = ATOMIC_FLAG_INIT; if (g_driver != nullptr) { @@ -670,7 +668,6 @@ void sigint_handler(int signo) { _exit(signo); } -#endif static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) { std::string usage_str = tool_name.str() + " [options]"; @@ -784,64 +781,15 @@ int main(int argc, char const *argv[]) { // Setup LLDB signal handlers once the debugger has been initialized. SBDebugger::PrintDiagnosticsOnError(); -#ifdef _WIN32 + // FIXME: Migrate the SIGINT handler to be handled by the signal loop below. signal(SIGINT, sigint_handler); -#else +#if !defined(_WIN32) signal(SIGPIPE, SIG_IGN); - // Capture the main thread's id so the signal thread can target it. - pthread_t main_thread = pthread_self(); - - // Set when the signal thread sends itself a SIGINT to wake the main thread. - // The next callback invocation observes this flag and skips the work. A - // plain bool is sufficient because the callback only ever runs on the - // signal thread; it lives outside the lambda because MainLoopPosix copies - // the callback on every dispatch, which would discard in-lambda state. - bool skip_next_sigint = false; - // Handle signals in a MainLoop running on a separate thread. 
MainLoop signal_loop; Status signal_status; - auto sigint_handler = signal_loop.RegisterSignal( - SIGINT, - [&, main_thread](MainLoopBase &) { - // Skip the self-sent wakeup SIGINT queued at the end of the previous - // invocation. - if (std::exchange(skip_next_sigint, false)) - return; - - // Temporarily restore the default disposition so that a second SIGINT - // delivered while DispatchInputInterrupt is running hard-terminates - // the process. This preserves the "double Ctrl-C to force exit" - // escape hatch users rely on when the debugger is unresponsive. - struct sigaction old_action; - struct sigaction new_action = {}; - new_action.sa_handler = SIG_DFL; - sigemptyset(&new_action.sa_mask); - - int ret = sigaction(SIGINT, &new_action, &old_action); - UNUSED_IF_ASSERT_DISABLED(ret); - assert(ret == 0 && "sigaction failed"); - - if (g_driver) - g_driver->GetDebugger().DispatchInputInterrupt(); - - ret = sigaction(SIGINT, &old_action, nullptr); - UNUSED_IF_ASSERT_DISABLED(ret); - assert(ret == 0 && "sigaction failed"); - - // Wake the main thread so any blocking syscall (e.g. the Python REPL - // waiting on input or sleeping) returns with EINTR. This lets Python - // observe the pending interrupt queued by DispatchInputInterrupt and - // raise KeyboardInterrupt. Flag the resulting callback invocation so - // it's skipped rather than re-running DispatchInputInterrupt. - skip_next_sigint = true; - pthread_kill(main_thread, SIGINT); - }, - signal_status); - assert(sigint_handler && signal_status.Success()); - auto sigwinch_handler = signal_loop.RegisterSignal( SIGWINCH, [&](MainLoopBase &) { From 6f2df1cb20cc4e6625a86670903bca6852474cb6 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Sat, 9 May 2026 13:50:27 +0800 Subject: [PATCH 124/538] [Clang] Do not eat SFINAE diagnostics for explicit template arguments (#139066) Instead of merely suggesting the template arguments are invalid, we now provide an explanation of why the explicit template argument is invalid. 
--- clang/docs/ReleaseNotes.rst | 2 + .../clang/Basic/DiagnosticSemaKinds.td | 12 ++--- clang/lib/Sema/SemaOverload.cpp | 44 +++++++++++-------- clang/test/AST/ByteCode/cxx20.cpp | 2 +- .../basic.namespace/namespace.udecl/p12.cpp | 8 ++-- clang/test/CXX/drs/cwg2xx.cpp | 8 ++-- clang/test/CXX/drs/cwg3xx.cpp | 4 +- clang/test/CXX/expr/expr.const/p3-0x.cpp | 2 +- clang/test/CXX/temp/temp.param/p8-cxx20.cpp | 2 +- .../test/CXX/temp/temp.res/temp.local/p1.cpp | 4 +- clang/test/Modules/cxx-templates.cpp | 2 +- clang/test/SemaCXX/builtin-align-cxx.cpp | 2 +- clang/test/SemaCXX/calling-conv-compat.cpp | 2 +- .../constexpr-function-recovery-crash.cpp | 2 +- clang/test/SemaCXX/cxx2a-template-lambdas.cpp | 7 +-- clang/test/SemaCXX/typo-correction.cpp | 2 +- clang/test/SemaSYCL/sycl-kernel-launch.cpp | 2 +- .../test/SemaTemplate/concepts-using-decl.cpp | 2 +- .../test/SemaTemplate/overload-candidates.cpp | 4 +- .../SemaTemplate/temp_arg_nontype_cxx11.cpp | 2 +- 20 files changed, 63 insertions(+), 52 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2a7c315192f2d..fa19d4b576575 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -476,6 +476,8 @@ Improvements to Clang's diagnostics code can automatically be made portable to other host platforms that don't support backslashes. +- Clang now explains why template deduction fails for explicit template arguments. 
+ - No longer emitting a ``-Wpre-c2y-compat`` or extension diagnostic about use of octal literals with a ``0o`` prefix, and no longer emitting a ``-Wdeprecated-octal-literals`` diagnostic for use of octal literals without diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 9605c02b819d4..c638c23f24bb5 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -5231,14 +5231,14 @@ def note_ovl_candidate_inconsistent_deduction_types : Note< "candidate template ignored: deduced values %diff{" "of conflicting types for parameter %0 (%1 of type $ vs. %3 of type $)|" "%1 and %3 of conflicting types for parameter %0}2,4">; -def note_ovl_candidate_explicit_arg_mismatch_named : Note< - "candidate template ignored: invalid explicitly-specified argument " - "for template parameter %0">; def note_ovl_candidate_unsatisfied_constraints : Note< "candidate template ignored: constraints not satisfied%0">; -def note_ovl_candidate_explicit_arg_mismatch_unnamed : Note< - "candidate template ignored: invalid explicitly-specified argument " - "for %ordinal0 template parameter">; +def note_ovl_candidate_explicit_arg_mismatch : Note< + "candidate template ignored: %enum_select{" + "%Vague{invalid explicitly-specified argument}|" + "%Detailed{%3}}2 for %enum_select{" + "%Named{template parameter %1}|" + "%Unnamed{%ordinal1 template parameter}}0">; def note_ovl_candidate_instantiation_depth : Note< "candidate template ignored: substitution exceeded maximum template " "instantiation depth">; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 96c4ce489fe04..693aae757eb5a 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -33,6 +33,7 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" +#include "clang/Sema/SemaInternal.h" #include 
"clang/Sema/SemaObjC.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateDeduction.h" @@ -764,8 +765,16 @@ clang::MakeDeductionFailureInfo(ASTContext &Context, break; case TemplateDeductionResult::Incomplete: + Result.Data = Info.Param.getOpaqueValue(); + break; case TemplateDeductionResult::InvalidExplicitArguments: Result.Data = Info.Param.getOpaqueValue(); + if (Info.hasSFINAEDiagnostic()) { + PartialDiagnosticAt *Diag = new (Result.Diagnostic) PartialDiagnosticAt( + SourceLocation(), PartialDiagnostic::NullDiagnostic()); + Info.takeSFINAEDiagnostic(*Diag); + Result.HasDiagnostic = true; + } break; case TemplateDeductionResult::DeducedMismatch: @@ -837,7 +846,6 @@ void DeductionFailureInfo::Destroy() { case TemplateDeductionResult::Incomplete: case TemplateDeductionResult::TooManyArguments: case TemplateDeductionResult::TooFewArguments: - case TemplateDeductionResult::InvalidExplicitArguments: case TemplateDeductionResult::CUDATargetMismatch: case TemplateDeductionResult::NonDependentConversionFailure: break; @@ -852,6 +860,7 @@ void DeductionFailureInfo::Destroy() { Data = nullptr; break; + case TemplateDeductionResult::InvalidExplicitArguments: case TemplateDeductionResult::SubstitutionFailure: // FIXME: Destroy the template argument list? 
Data = nullptr; @@ -12479,28 +12488,27 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated, return; } - case TemplateDeductionResult::InvalidExplicitArguments: + case TemplateDeductionResult::InvalidExplicitArguments: { assert(ParamD && "no parameter found for invalid explicit arguments"); + + auto Diag = S.Diag(Templated->getLocation(), + diag::note_ovl_candidate_explicit_arg_mismatch); if (ParamD->getDeclName()) - S.Diag(Templated->getLocation(), - diag::note_ovl_candidate_explicit_arg_mismatch_named) - << ParamD->getDeclName(); - else { - int index = 0; - if (TemplateTypeParmDecl *TTP = dyn_cast(ParamD)) - index = TTP->getIndex(); - else if (NonTypeTemplateParmDecl *NTTP - = dyn_cast(ParamD)) - index = NTTP->getIndex(); - else - index = cast(ParamD)->getIndex(); - S.Diag(Templated->getLocation(), - diag::note_ovl_candidate_explicit_arg_mismatch_unnamed) - << (index + 1); + Diag << diag::ExplicitArgMismatchNameKind::Named << ParamD->getDeclName(); + else + Diag << diag::ExplicitArgMismatchNameKind::Unnamed + << (getDepthAndIndex(ParamD).second + 1); + if (PartialDiagnosticAt *PDiag = DeductionFailure.getSFINAEDiagnostic()) { + SmallString<128> DiagContent; + PDiag->second.EmitToString(S.getDiagnostics(), DiagContent); + Diag << diag::ExplicitArgMismatchReasonKind::Detailed << DiagContent; + } else { + Diag << diag::ExplicitArgMismatchReasonKind::Vague; } + MaybeEmitInheritedConstructorNote(S, Found); return; - + } case TemplateDeductionResult::ConstraintsNotSatisfied: { // Format the template argument list into the argument string. 
SmallString<128> TemplateArgString; diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index b051c3a4984d9..d67357459653a 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -775,7 +775,7 @@ namespace FailingDestructor { } }; template - void f() {} // both-note {{invalid explicitly-specified argument}} + void f() {} // both-note {{non-type template argument is not a constant expression}} void g() { f(); // both-error {{no matching function}} diff --git a/clang/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp b/clang/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp index f12e0083fb0c9..8805e8f10dbdd 100644 --- a/clang/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp +++ b/clang/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp @@ -113,21 +113,21 @@ namespace test3 { struct Derived1 : Base { using Base::foo; - template Opaque<2> foo() { return Opaque<2>(); } // expected-note {{invalid explicitly-specified argument for template parameter 'n'}} + template Opaque<2> foo() { return Opaque<2>(); } // expected-note {{template argument for non-type template parameter must be an expression for template parameter 'n'}} }; struct Derived2 : Base { - template Opaque<2> foo() { return Opaque<2>(); } // expected-note {{invalid explicitly-specified argument for template parameter 'n'}} + template Opaque<2> foo() { return Opaque<2>(); } // expected-note {{template argument for non-type template parameter must be an expression for template parameter 'n'}} using Base::foo; }; struct Derived3 : Base { using Base::foo; - template Opaque<3> foo() { return Opaque<3>(); } // expected-note {{invalid explicitly-specified argument for template parameter 'T'}} + template Opaque<3> foo() { return Opaque<3>(); } // expected-note {{template argument for template type parameter must be a type for template parameter 'T'}} }; struct Derived4 : Base { - template Opaque<3> foo() { return Opaque<3>(); 
} // expected-note {{invalid explicitly-specified argument for template parameter 'T'}} + template Opaque<3> foo() { return Opaque<3>(); } // expected-note {{template argument for template type parameter must be a type for template parameter 'T'}} using Base::foo; }; diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index 427320082a450..f81ab02e7d748 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -681,15 +681,15 @@ namespace cwg241 { // cwg241: 9 A::g<3>(b); C::f<3>(b); // expected-error@-1 {{no matching function for call to 'f'}} - // expected-note@#cwg241-C-f {{candidate template ignored: invalid explicitly-specified argument for template parameter 'T'}} + // expected-note@#cwg241-C-f {{candidate template ignored: template argument for template type parameter must be a type for template parameter 'T'}} C::g<3>(b); // expected-error@-1 {{no matching function for call to 'g'}} - // expected-note@#cwg241-C-g {{candidate template ignored: invalid explicitly-specified argument for template parameter 'T'}} + // expected-note@#cwg241-C-g {{candidate template ignored: template argument for template type parameter must be a type for template parameter 'T'}} using C::f; using C::g; f<3>(b); // expected-error@-1 {{no matching function for call to 'f'}} - // expected-note@#cwg241-C-f {{candidate template ignored: invalid explicitly-specified argument for template parameter 'T'}} + // expected-note@#cwg241-C-f {{candidate template ignored: template argument for template type parameter must be a type for template parameter 'T'}} // expected-note@#cwg241-A-f {{candidate function template not viable: requires 0 arguments, but 1 was provided}} g<3>(b); } @@ -983,7 +983,7 @@ namespace cwg258 { // cwg258: 2.8 int &w = b.f(0); int &x = b.g(0); // expected-error@-1 {{no matching member function for call to 'g'}} - // expected-note@#cwg258-B-g {{candidate template ignored: invalid explicitly-specified argument for 1st 
template parameter}} + // expected-note@#cwg258-B-g {{candidate template ignored: template argument for non-type template parameter must be an expression for 1st template parameter}} int &y = b.h(); float &z = const_cast(b).h(); diff --git a/clang/test/CXX/drs/cwg3xx.cpp b/clang/test/CXX/drs/cwg3xx.cpp index 1b7b273f76b66..10bf57e422f33 100644 --- a/clang/test/CXX/drs/cwg3xx.cpp +++ b/clang/test/CXX/drs/cwg3xx.cpp @@ -975,8 +975,8 @@ namespace cwg354 { // cwg354: 3.1 c++11 int b0 = both<0>(); int b1 = both<(int*)0>(); // cxx98-error@-1 {{no matching function for call to 'both'}} - // cxx98-note@#cwg354-both-int-ptr {{candidate template ignored: invalid explicitly-specified argument for 1st template parameter}} - // cxx98-note@#cwg354-both-int {{candidate template ignored: invalid explicitly-specified argument for 1st template parameter}} + // cxx98-note@#cwg354-both-int-ptr {{candidate template ignored: non-type template argument does not refer to any declaration for 1st template parameter}} + // cxx98-note@#cwg354-both-int {{candidate template ignored: non-type template argument of type 'int *' must have an integral or enumeration type for 1st template parameter}} template struct ptr_mem {}; // #cwg354-ptr_mem ptr_mem<0> m0; // #cwg354-m0 diff --git a/clang/test/CXX/expr/expr.const/p3-0x.cpp b/clang/test/CXX/expr/expr.const/p3-0x.cpp index 3eedef3cf7712..eb7d59f679e6b 100644 --- a/clang/test/CXX/expr/expr.const/p3-0x.cpp +++ b/clang/test/CXX/expr/expr.const/p3-0x.cpp @@ -107,7 +107,7 @@ void c() { break; } } -template int f() { return B; } // expected-note {{candidate template ignored: invalid explicitly-specified argument for template parameter 'B'}} +template int f() { return B; } // expected-note {{candidate template ignored: conversion from 'int (S::*)() const' to 'bool' is not allowed in a converted constant expression for template parameter 'B'}} template int f<&S::operator int>(); // expected-error {{does not refer to a function template}} template int 
f<(bool)&S::operator int>(); diff --git a/clang/test/CXX/temp/temp.param/p8-cxx20.cpp b/clang/test/CXX/temp/temp.param/p8-cxx20.cpp index a3478c0669661..9b1d697cd9805 100644 --- a/clang/test/CXX/temp/temp.param/p8-cxx20.cpp +++ b/clang/test/CXX/temp/temp.param/p8-cxx20.cpp @@ -40,7 +40,7 @@ namespace ConstDestruction { }; template - void f() {} // expected-note 2{{invalid explicitly-specified argument}} + void f() {} // expected-note 2{{non-type template argument is not a constant expression}} void g() { f(); diff --git a/clang/test/CXX/temp/temp.res/temp.local/p1.cpp b/clang/test/CXX/temp/temp.res/temp.local/p1.cpp index faa85cb5fce30..c6d7f194182dd 100644 --- a/clang/test/CXX/temp/temp.res/temp.local/p1.cpp +++ b/clang/test/CXX/temp/temp.res/temp.local/p1.cpp @@ -10,9 +10,9 @@ template char id; template struct TempType {}; template class> struct TempTemp {}; -template void use(int&); // expected-note {{invalid explicitly-specified argument}} expected-note {{no known conversion}} +template void use(int&); // expected-note {{use of class template 'B::template C' requires template arguments}} expected-note {{no known conversion}} template class> void use(float&); // expected-note 2{{no known conversion}} -template void use(char&); // expected-note 2{{invalid explicitly-specified argument}} +template void use(char&); // expected-note 2{{template argument for non-type template parameter must be an expression}} template struct A { template struct C {}; diff --git a/clang/test/Modules/cxx-templates.cpp b/clang/test/Modules/cxx-templates.cpp index 1537682f37438..15f6091774424 100644 --- a/clang/test/Modules/cxx-templates.cpp +++ b/clang/test/Modules/cxx-templates.cpp @@ -42,7 +42,7 @@ void g() { template_param_kinds_1(); // ok, from cxx-templates-b.h template_param_kinds_2(); // expected-error {{no matching function for call}} - // expected-note@Inputs/cxx-templates-a.h:11 {{candidate}} + // expected-note@Inputs/cxx-templates-a.h:11 {{non-type parameter of template 
template parameter cannot be narrowed from type 'int' to 'char'}} // expected-note@Inputs/cxx-templates-b.h:11 {{candidate}} template_param_kinds_2(); // expected-error {{ambiguous}} diff --git a/clang/test/SemaCXX/builtin-align-cxx.cpp b/clang/test/SemaCXX/builtin-align-cxx.cpp index 213a285e23eb2..51e610ccc0cd1 100644 --- a/clang/test/SemaCXX/builtin-align-cxx.cpp +++ b/clang/test/SemaCXX/builtin-align-cxx.cpp @@ -4,7 +4,7 @@ // Check that we don't crash when using dependent types in __builtin_align: template -void *c(void *d) { // expected-note{{candidate template ignored}} +void *c(void *d) { // expected-note{{a non-type template parameter cannot have type 'struct x' before C++20}} return __builtin_align_down(d, b); } diff --git a/clang/test/SemaCXX/calling-conv-compat.cpp b/clang/test/SemaCXX/calling-conv-compat.cpp index 9bb448ffef225..abd595df7663a 100644 --- a/clang/test/SemaCXX/calling-conv-compat.cpp +++ b/clang/test/SemaCXX/calling-conv-compat.cpp @@ -425,6 +425,6 @@ namespace D50526 { void h() { g(); } #if !_M_X64 // expected-error@-2 {{no matching function for call to}} - // expected-note@-4 {{invalid explicitly-specified argument}} + // expected-note@-4 {{non-type template argument of type 'void ()' cannot be converted to a value of type 'void (*)() __attribute__((stdcall))'}} #endif } diff --git a/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp b/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp index 90ee7892b2fc2..de8fe057893d5 100644 --- a/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp +++ b/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp @@ -60,7 +60,7 @@ constexpr void test8() { throw "bad"; } -template constexpr int f(int y) { // expected-note {{candidate template ignored}} +template constexpr int f(int y) { // expected-note {{non-type template argument is not a constant expression}} return x * y; } constexpr int test9(int x) { diff --git a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp 
b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp index 00ba291fbd198..45d265e2cdc2b 100644 --- a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp +++ b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -std=c++03 -verify -Dstatic_assert=_Static_assert -Wno-c++11-extensions -Wno-c++14-extensions -Wno-c++17-extensions -Wno-c++20-extensions %s // RUN: %clang_cc1 -std=c++11 -verify=expected,cxx11,cxx11-cxx14 -Wno-c++20-extensions -Wno-c++17-extensions -Wno-c++14-extensions %s // RUN: %clang_cc1 -std=c++14 -verify=expected,cxx11-cxx14,cxx14 -Wno-c++20-extensions -Wno-c++17-extensions %s -// RUN: %clang_cc1 -std=c++17 -verify -Wno-c++20-extensions %s -// RUN: %clang_cc1 -std=c++20 -verify %s +// RUN: %clang_cc1 -std=c++17 -verify=expected,cxx17,cxx17-cxx20 -Wno-c++20-extensions %s +// RUN: %clang_cc1 -std=c++20 -verify=expected,cxx20,cxx17-cxx20 %s template inline const bool is_same = false; @@ -45,8 +45,9 @@ template constexpr T outer() { // FIXME: The C++11 error seems wrong return []() { return x; }.template operator()<123>(); // expected-error {{no matching member function}} \ - expected-note {{candidate template ignored}} \ cxx11-note {{non-literal type '' cannot be used in a constant expression}} \ + cxx11-cxx14-note {{non-type template argument does not refer to any declaration}} \ + cxx17-cxx20-note {{value of type 'int' is not implicitly convertible to 'int *'}} \ cxx14-note {{non-literal type}} } static_assert(outer() == 123); // cxx11-cxx14-error {{not an integral constant expression}} cxx11-cxx14-note {{in call}} diff --git a/clang/test/SemaCXX/typo-correction.cpp b/clang/test/SemaCXX/typo-correction.cpp index e4dadf83e0a08..6aac3981bb1a1 100644 --- a/clang/test/SemaCXX/typo-correction.cpp +++ b/clang/test/SemaCXX/typo-correction.cpp @@ -612,7 +612,7 @@ int bar() { namespace testIncludeTypeInTemplateArgument { template -void foo(T t = {}, U = {}); // expected-note {{candidate template ignored}} +void foo(T t = {}, U = {}); // 
expected-note {{template argument for template type parameter must be a type}} class AddObservation {}; // expected-note {{declared here}} int bar1() { diff --git a/clang/test/SemaSYCL/sycl-kernel-launch.cpp b/clang/test/SemaSYCL/sycl-kernel-launch.cpp index 20d9becb81929..b673025f03b40 100644 --- a/clang/test/SemaSYCL/sycl-kernel-launch.cpp +++ b/clang/test/SemaSYCL/sycl-kernel-launch.cpp @@ -324,7 +324,7 @@ namespace bad6 { // No matching function for call to sycl_kernel_launch; mismatched template parameter kind. namespace bad7 { - // expected-note@+2 {{candidate template ignored: invalid explicitly-specified argument for 1st template parameter}} + // expected-note@+2 {{candidate template ignored: template argument for non-type template parameter must be an expression}} template void sycl_kernel_launch(const char *, Ts...); // expected-error@+4 {{no matching function for call to 'sycl_kernel_launch'}} diff --git a/clang/test/SemaTemplate/concepts-using-decl.cpp b/clang/test/SemaTemplate/concepts-using-decl.cpp index 41f7b6d2f8faa..26bd0b60b691c 100644 --- a/clang/test/SemaTemplate/concepts-using-decl.cpp +++ b/clang/test/SemaTemplate/concepts-using-decl.cpp @@ -165,7 +165,7 @@ struct base { struct bar : public base { using base::foo; - template + template int foo() { return 2; }; // expected-note {{candidate template ignored: substitution failure: too many template arguments for function template 'foo'}} }; diff --git a/clang/test/SemaTemplate/overload-candidates.cpp b/clang/test/SemaTemplate/overload-candidates.cpp index a9c86b2118fbb..3a5bedca938bd 100644 --- a/clang/test/SemaTemplate/overload-candidates.cpp +++ b/clang/test/SemaTemplate/overload-candidates.cpp @@ -17,9 +17,9 @@ void test_dyn_cast(int* ptr) { } template - void get(const T&); // expected-note{{candidate template ignored: invalid explicitly-specified argument for template parameter 'I'}} + void get(const T&); // expected-note{{candidate template ignored: template argument for non-type template 
parameter must be an expression for template parameter 'I'}} template class, typename T> - void get(const T&); // expected-note{{candidate template ignored: invalid explicitly-specified argument for 1st template parameter}} + void get(const T&); // expected-note{{candidate template ignored: template argument for template template parameter must be a class template}} void test_get(void *ptr) { get(ptr); // expected-error{{no matching function for call to 'get'}} diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp index 0b785700ee238..ea86227d8f569 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp @@ -43,7 +43,7 @@ void TempFunc() {} void Useage() { //expected-error@+2 {{no matching function}} - //expected-note@-4 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'b'}} + //expected-note@-4 {{candidate template ignored: non-type template argument evaluates to -1, which cannot be narrowed to type 'unsigned int' for template parameter 'b'}} TempFunc<1, -1, 1>(); } } From c71e9918df120e272966ce4e9caa0e3abdb5a390 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Sat, 9 May 2026 14:27:37 +0800 Subject: [PATCH 125/538] [Utils] Fix duplicate DomTree updates in SplitIndirectBrCriticalEdges (#196475) SplitIndirectBrCriticalEdges generates DomTree Insert/Delete pairs for each predecessor in OtherPreds. However, OtherPreds can contain duplicate entries when a conditional branch has both targets pointing to the same block (e.g., `br i1 %c, label %X, label %X`). This produces duplicate DomTree updates for the same edge, triggering the assertion `std::abs(NumInsertions) <= 1 && "Unbalanced operations!"` in LegalizeUpdates. Fix by tracking which source blocks have already had DomTree updates emitted, and skipping duplicates. 
--- .../Transforms/Utils/BreakCriticalEdges.cpp | 6 +++- .../X86/split-indirectbr-duplicate-pred.ll | 31 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/split-indirectbr-duplicate-pred.ll diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index fd6b2562bee54..65bff2b878bda 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -416,6 +416,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, BlockFrequency BlockFreqForDirectSucc; SmallVector DTUpdates; + SmallPtrSet SeenSrcs; if (DTU) DTUpdates.reserve(OtherPreds.size() * 2 + 1); for (BasicBlock *Pred : OtherPreds) { @@ -426,7 +427,10 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, if (ShouldUpdateAnalysis) BlockFreqForDirectSucc += BFI->getBlockFreq(Src) * BPI->getEdgeProbability(Src, DirectSucc); - if (DTU) { + // A predecessor may appear multiple times in OtherPreds (e.g., a CondBr + // with both targets pointing to the same block). Only emit one pair of + // DomTree updates per unique source. 
+ if (DTU && SeenSrcs.insert(Src).second) { DTUpdates.push_back({DominatorTree::Insert, Src, DirectSucc}); DTUpdates.push_back({DominatorTree::Delete, Src, Target}); } diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/split-indirectbr-duplicate-pred.ll b/llvm/test/Transforms/CodeGenPrepare/X86/split-indirectbr-duplicate-pred.ll new file mode 100644 index 0000000000000..d2c9941198c1e --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/split-indirectbr-duplicate-pred.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes='require,function(codegenprepare)' -S -mtriple=x86_64 < %s | FileCheck %s + +; Test that SplitIndirectBrCriticalEdges does not crash when a predecessor +; block has a conditional branch with both targets pointing to the same +; destination (producing duplicate entries in the predecessor list). + + +define i32 @duplicate_pred_condbr(ptr %addr, i1 %cond) { +; CHECK-LABEL: define i32 @duplicate_pred_condbr( +; CHECK-SAME: ptr [[ADDR:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: indirectbr ptr [[ADDR]], [label %[[TARGET:.*]], label %[[COND_BB:.*]]] +; CHECK: [[COND_BB]]: +; CHECK-NEXT: br [[DOTSPLIT:label %.*]] +; CHECK: [[TARGET]]: +; CHECK-NEXT: br [[DOTSPLIT]] +; CHECK: [[_SPLIT:.*:]] +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, %[[TARGET]] ], [ 1, %[[COND_BB]] ] +; CHECK-NEXT: ret i32 [[MERGE]] +; +entry: + indirectbr ptr %addr, [label %target, label %cond.bb] + +cond.bb: + br i1 %cond, label %target, label %target + +target: + %result = phi i32 [ 0, %entry ], [ 1, %cond.bb ], [ 1, %cond.bb ] + ret i32 %result +} From 66d4162d99dd0592c33e23293deb187df5ad13d4 Mon Sep 17 00:00:00 2001 From: David Rivera Date: Sat, 9 May 2026 02:41:43 -0400 Subject: [PATCH 126/538] [CIR][CUDA][NVPTX] Set ptx_kernel calling convention on CUDA kernels (#195382) Related: https://github.com/llvm/llvm-project/issues/179278, 
https://github.com/llvm/llvm-project/issues/175871 More target attributes like: NoInline on kernels, CUDALaunchBoundsAttr, CUDAGridConstantAttr param attrs, nvvm.annotations for surface/texture VarDecls to be deferred for later patches. --- clang/include/clang/CIR/MissingFeatures.h | 2 + clang/lib/CIR/CodeGen/TargetInfo.cpp | 41 +++++++++++++++++++ clang/test/CIR/CodeGenCUDA/address-spaces.cu | 2 +- clang/test/CIR/CodeGenCUDA/ptx-kernels.cu | 42 ++++++++++++++++++++ 4 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 clang/test/CIR/CodeGenCUDA/ptx-kernels.cu diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index b285d93ac007d..ba5c2bf786a99 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -38,6 +38,7 @@ struct MissingFeatures { static bool opGlobalPragmaClangSection() { return false; } static bool opGlobalAnnotations() { return false; } static bool opGlobalCtorPriority() { return false; } + static bool emitNVVMMetadata() { return false; } static bool setDSOLocal() { return false; } static bool supportIFuncAttr() { return false; } @@ -88,6 +89,7 @@ struct MissingFeatures { static bool opFuncUnwindTablesAttr() { return false; } static bool opFuncWillReturn() { return false; } static bool opFuncNoReturn() { return false; } + static bool handleCUDALaunchBoundsAttr() { return false; } static bool setLLVMFunctionFEnvAttributes() { return false; } // CallOp handling diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp b/clang/lib/CIR/CodeGen/TargetInfo.cpp index fc939cd9605ab..71ccb6e24a8aa 100644 --- a/clang/lib/CIR/CodeGen/TargetInfo.cpp +++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp @@ -6,6 +6,7 @@ #include "clang/Basic/AddressSpaces.h" #include "clang/CIR/Dialect/IR/CIRAttrs.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/MissingFeatures.h" using namespace clang; using namespace clang::CIRGen; @@ -132,6 +133,46 @@ class 
NVPTXTargetCIRGenInfo : public TargetCIRGenInfo { public: NVPTXTargetCIRGenInfo(CIRGenTypes &cgt) : TargetCIRGenInfo(std::make_unique(cgt)) {} + + void setTargetAttributes(const clang::Decl *decl, mlir::Operation *global, + CIRGenModule &cgm) const override { + auto globalValue = mlir::dyn_cast(global); + if (globalValue && globalValue.isDeclaration()) + return; + + const auto *vd = dyn_cast_or_null(decl); + if (vd) { + if (cgm.getLangOpts().CUDA) { + if (vd->getType()->isCUDADeviceBuiltinSurfaceType() || + vd->getType()->isCUDADeviceBuiltinTextureType()) + assert(!cir::MissingFeatures::emitNVVMMetadata()); + return; + } + } + + const auto *fd = dyn_cast_or_null(decl); + if (!fd) + return; + + auto func = mlir::cast(global); + + // Perform special handling in OpenCL/CUDA mode. + if (cgm.getLangOpts().OpenCL || cgm.getLangOpts().CUDA) { + // Use function attributes to check for kernel functions. By default, all + // functions are device functions. + if (fd->hasAttr() || fd->hasAttr()) { + // OpenCL/CUDA kernel functions get kernel metadata. Kernel functions + // are also not subject to inlining. 
+ func.setInlineKind(cir::InlineKind::NoInline); + if (fd->hasAttr()) { + func.setCallingConv(cir::CallingConv::PTXKernel); + assert(!cir::MissingFeatures::opFuncParameterAttributes()); + } + if (fd->hasAttr()) + assert(!cir::MissingFeatures::handleCUDALaunchBoundsAttr()); + } + } + } }; } // namespace diff --git a/clang/test/CIR/CodeGenCUDA/address-spaces.cu b/clang/test/CIR/CodeGenCUDA/address-spaces.cu index cc1791a8f2244..2f235c8702899 100644 --- a/clang/test/CIR/CodeGenCUDA/address-spaces.cu +++ b/clang/test/CIR/CodeGenCUDA/address-spaces.cu @@ -86,7 +86,7 @@ __global__ void fn() { // CIR-DEVICE: cir.store {{.*}}%[[VAL]], %[[J]] : !s32i, !cir.ptr // CIR-DEVICE: cir.return -// LLVM-DEVICE: define dso_local void @_Z2fnv() +// LLVM-DEVICE: define dso_local ptx_kernel void @_Z2fnv() // LLVM-DEVICE: %[[ALLOCA:.*]] = alloca i32, i64 1, align 4 // LLVM-DEVICE: store i32 0, ptr %[[ALLOCA]], align 4 // LLVM-DEVICE: %[[VAL:.*]] = load i32, ptr %[[ALLOCA]], align 4 diff --git a/clang/test/CIR/CodeGenCUDA/ptx-kernels.cu b/clang/test/CIR/CodeGenCUDA/ptx-kernels.cu new file mode 100644 index 0000000000000..155e59638eac7 --- /dev/null +++ b/clang/test/CIR/CodeGenCUDA/ptx-kernels.cu @@ -0,0 +1,42 @@ +// REQUIRES: nvptx-registered-target + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -x cuda -fclangir \ +// RUN: -fcuda-is-device -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR %s --input-file=%t.cir + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -x cuda -fclangir \ +// RUN: -fcuda-is-device -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM %s --input-file=%t.ll + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -x cuda \ +// RUN: -fcuda-is-device -emit-llvm %s -o %t.ogcg.ll +// RUN: FileCheck --check-prefix=OGCG %s --input-file=%t.ogcg.ll + +#include "Inputs/cuda.h" + +// CIR: cir.func {{.*}} @device_function() +// LLVM: define{{.*}} void @device_function +// OGCG: define{{.*}} void @device_function +extern "C" +__device__ void device_function() {} 
+ +// CIR: cir.func {{.*}} @global_function() cc(ptx_kernel) +// LLVM: define{{.*}} ptx_kernel void @global_function +// OGCG: define{{.*}} ptx_kernel void @global_function +extern "C" +__global__ void global_function() { + device_function(); +} + +template __global__ void templated_kernel(T param) {} +template __global__ void templated_kernel(int); +// CIR-DAG: cir.func {{.*}} @_Z16templated_kernelIiEvT_({{.*}}) cc(ptx_kernel) +// LLVM-DAG: define{{.*}} ptx_kernel void @_Z16templated_kernelIiEvT_( +// OGCG-DAG: define{{.*}} ptx_kernel void @_Z16templated_kernelIiEvT_( + +namespace { +__global__ void anonymous_ns_kernel() {} +// CIR-DAG: cir.func {{.*}} @_ZN12_GLOBAL__N_119anonymous_ns_kernelEv() cc(ptx_kernel) +// LLVM-DAG: define{{.*}} ptx_kernel void @_ZN12_GLOBAL__N_119anonymous_ns_kernelEv( +// OGCG-DAG: define{{.*}} ptx_kernel void @_ZN12_GLOBAL__N_119anonymous_ns_kernelEv( +} From 6da957d8cbaffca03f00c747e360dbbdbada556e Mon Sep 17 00:00:00 2001 From: Tony Varghese Date: Sat, 9 May 2026 12:15:03 +0530 Subject: [PATCH 127/538] [DAGTypeLegalizer] Add missing BR_CC handler for soft-promoted half operands (#196214) `SoftPromoteHalfOperand` had no case for `ISD::BR_CC`, causing a crash when a half-typed `fcmp` result fed directly into a conditional branch. All other comparison-related nodes (`SETCC, SELECT_CC`) were already handled. Add `SoftPromoteHalfOp_BR_CC` following the same pattern as `SoftPromoteHalfOp_SELECT_CC`. 
Fixes #195562 --------- Co-authored-by: Tony Varghese --- .../SelectionDAG/LegalizeFloatTypes.cpp | 27 +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../PowerPC/soft-promote-half-br-cc.ll | 162 ++++++++++++++++++ 3 files changed, 190 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/soft-promote-half-br-cc.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 25f4f75eaedea..016b304494fbd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3197,6 +3197,9 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: Res = SoftPromoteHalfOp_FP_EXTEND(N); break; case ISD::SELECT_CC: Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break; + case ISD::BR_CC: + Res = SoftPromoteHalfOp_BR_CC(N); + break; case ISD::SETCC: Res = SoftPromoteHalfOp_SETCC(N); break; case ISD::STORE: Res = SoftPromoteHalfOp_STORE(N, OpNo); break; case ISD::ATOMIC_STORE: @@ -3309,6 +3312,30 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT_SAT(SDNode *N) { N->getOperand(1)); } +SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BR_CC(SDNode *N) { + // ISD::BR_CC node: chain(0), condcode(1), LHS(2), RHS(3), dest(4) + // The comparison operands (LHS, RHS) are soft-promoted halfs. + SDValue Op0 = N->getOperand(2); + SDValue Op1 = N->getOperand(3); + SDLoc dl(N); + + EVT SVT = Op0.getValueType(); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), SVT); + + // Get the soft-promoted i16 values + Op0 = GetSoftPromotedHalf(Op0); + Op1 = GetSoftPromotedHalf(Op1); + + // Promote both comparison operands to the larger FP type. 
+ unsigned PromotionOpcode = GetPromotionOpcode(SVT, NVT); + Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0); + Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1); + + // Create a new BR_CC node with promoted operands + return DAG.getNode(ISD::BR_CC, dl, MVT::Other, N->getOperand(0), + N->getOperand(1), Op0, Op1, N->getOperand(4)); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Can only soften the comparison values"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 89698a2c77123..84d5b454ba28e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -806,6 +806,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftPromoteHalfOp_FP_TO_XINT_SAT(SDNode *N); SDValue SoftPromoteHalfOp_SETCC(SDNode *N); SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo); + SDValue SoftPromoteHalfOp_BR_CC(SDNode *N); SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_ATOMIC_STORE(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo); diff --git a/llvm/test/CodeGen/PowerPC/soft-promote-half-br-cc.ll b/llvm/test/CodeGen/PowerPC/soft-promote-half-br-cc.ll new file mode 100644 index 0000000000000..2726e287f5df7 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/soft-promote-half-br-cc.ll @@ -0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 < %s | FileCheck %s --check-prefix=CHECK-P9 +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 < %s | FileCheck %s --check-prefix=CHECK-P8 + +; Basic comparison with branch +define i32 @test_br_cc_olt(half %a, half %b) nounwind { +; CHECK-P9-LABEL: test_br_cc_olt: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: clrlwi 3, 3, 16 +; CHECK-P9-NEXT: clrlwi 4, 4, 16 +; 
CHECK-P9-NEXT: mtfprwz 0, 4 +; CHECK-P9-NEXT: mtfprwz 1, 3 +; CHECK-P9-NEXT: xscvhpdp 0, 0 +; CHECK-P9-NEXT: xscvhpdp 1, 1 +; CHECK-P9-NEXT: fcmpu 0, 1, 0 +; CHECK-P9-NEXT: bge 0, .LBB0_2 +; CHECK-P9-NEXT: # %bb.1: # %if.then +; CHECK-P9-NEXT: li 3, 1 +; CHECK-P9-NEXT: blr +; CHECK-P9-NEXT: .LBB0_2: # %if.else +; CHECK-P9-NEXT: li 3, 0 +; CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: test_br_cc_olt: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr 0 +; CHECK-P8-NEXT: std 30, -24(1) # 8-byte Folded Spill +; CHECK-P8-NEXT: stfd 31, -8(1) # 8-byte Folded Spill +; CHECK-P8-NEXT: stdu 1, -64(1) +; CHECK-P8-NEXT: mr 30, 3 +; CHECK-P8-NEXT: clrldi 3, 4, 48 +; CHECK-P8-NEXT: std 0, 80(1) +; CHECK-P8-NEXT: bl __extendhfsf2 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: clrldi 3, 30, 48 +; CHECK-P8-NEXT: fmr 31, 1 +; CHECK-P8-NEXT: bl __extendhfsf2 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: fcmpu 0, 1, 31 +; CHECK-P8-NEXT: bge 0, .LBB0_2 +; CHECK-P8-NEXT: # %bb.1: # %if.then +; CHECK-P8-NEXT: li 3, 1 +; CHECK-P8-NEXT: b .LBB0_3 +; CHECK-P8-NEXT: .LBB0_2: # %if.else +; CHECK-P8-NEXT: li 3, 0 +; CHECK-P8-NEXT: .LBB0_3: # %if.then +; CHECK-P8-NEXT: addi 1, 1, 64 +; CHECK-P8-NEXT: ld 0, 16(1) +; CHECK-P8-NEXT: lfd 31, -8(1) # 8-byte Folded Reload +; CHECK-P8-NEXT: ld 30, -24(1) # 8-byte Folded Reload +; CHECK-P8-NEXT: mtlr 0 +; CHECK-P8-NEXT: blr + %cmp = fcmp olt half %a, %b + br i1 %cmp, label %if.then, label %if.else +if.then: + ret i32 1 +if.else: + ret i32 0 +} + +; Test with constant +define i32 @test_br_cc_constant(half %a) nounwind { +; CHECK-P9-LABEL: test_br_cc_constant: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: clrlwi 3, 3, 16 +; CHECK-P9-NEXT: xxlxor 1, 1, 1 +; CHECK-P9-NEXT: mtfprwz 0, 3 +; CHECK-P9-NEXT: xscvhpdp 0, 0 +; CHECK-P9-NEXT: fcmpu 0, 0, 1 +; CHECK-P9-NEXT: ble 0, .LBB1_2 +; CHECK-P9-NEXT: # %bb.1: # %if.then +; CHECK-P9-NEXT: li 3, 1 +; CHECK-P9-NEXT: blr +; CHECK-P9-NEXT: .LBB1_2: # %if.else +; CHECK-P9-NEXT: li 3, 0 +; CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: 
test_br_cc_constant: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr 0 +; CHECK-P8-NEXT: stdu 1, -32(1) +; CHECK-P8-NEXT: clrldi 3, 3, 48 +; CHECK-P8-NEXT: std 0, 48(1) +; CHECK-P8-NEXT: bl __extendhfsf2 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: xxlxor 0, 0, 0 +; CHECK-P8-NEXT: fcmpu 0, 1, 0 +; CHECK-P8-NEXT: ble 0, .LBB1_2 +; CHECK-P8-NEXT: # %bb.1: # %if.then +; CHECK-P8-NEXT: li 3, 1 +; CHECK-P8-NEXT: b .LBB1_3 +; CHECK-P8-NEXT: .LBB1_2: # %if.else +; CHECK-P8-NEXT: li 3, 0 +; CHECK-P8-NEXT: .LBB1_3: # %if.then +; CHECK-P8-NEXT: addi 1, 1, 32 +; CHECK-P8-NEXT: ld 0, 16(1) +; CHECK-P8-NEXT: mtlr 0 +; CHECK-P8-NEXT: blr + %cmp = fcmp ogt half %a, 0xH0000 + br i1 %cmp, label %if.then, label %if.else +if.then: + ret i32 1 +if.else: + ret i32 0 +} + +; vector reduction + branch +define fastcc i16 @test_vector_reduce_br(half %arg) nounwind { +; CHECK-P9-LABEL: test_vector_reduce_br: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: clrlwi 3, 3, 16 +; CHECK-P9-NEXT: xxlxor 1, 1, 1 +; CHECK-P9-NEXT: mtfprwz 0, 3 +; CHECK-P9-NEXT: xscvhpdp 0, 0 +; CHECK-P9-NEXT: fcmpu 0, 0, 1 +; CHECK-P9-NEXT: bc 12, 0, .LBB2_3 +; CHECK-P9-NEXT: # %bb.1: +; CHECK-P9-NEXT: fcmpu 0, 0, 0 +; CHECK-P9-NEXT: bc 12, 3, .LBB2_3 +; CHECK-P9-NEXT: # %bb.2: # %taken +; CHECK-P9-NEXT: li 3, 0 +; CHECK-P9-NEXT: blr +; CHECK-P9-NEXT: .LBB2_3: # %not_taken +; CHECK-P9-NEXT: li 3, 1 +; CHECK-P9-NEXT: blr +; +; CHECK-P8-LABEL: test_vector_reduce_br: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr 0 +; CHECK-P8-NEXT: stdu 1, -32(1) +; CHECK-P8-NEXT: clrldi 3, 3, 48 +; CHECK-P8-NEXT: std 0, 48(1) +; CHECK-P8-NEXT: bl __extendhfsf2 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: xxlxor 0, 0, 0 +; CHECK-P8-NEXT: fcmpu 0, 1, 0 +; CHECK-P8-NEXT: bc 12, 0, .LBB2_3 +; CHECK-P8-NEXT: # %bb.1: +; CHECK-P8-NEXT: fcmpu 0, 1, 1 +; CHECK-P8-NEXT: bc 12, 3, .LBB2_3 +; CHECK-P8-NEXT: # %bb.2: # %taken +; CHECK-P8-NEXT: li 3, 0 +; CHECK-P8-NEXT: b .LBB2_4 +; CHECK-P8-NEXT: .LBB2_3: # %not_taken +; CHECK-P8-NEXT: li 3, 1 +; CHECK-P8-NEXT: 
.LBB2_4: # %taken +; CHECK-P8-NEXT: addi 1, 1, 32 +; CHECK-P8-NEXT: ld 0, 16(1) +; CHECK-P8-NEXT: mtlr 0 +; CHECK-P8-NEXT: blr + %reduce = tail call half @llvm.vector.reduce.fmin.v4f16(<4 x half> zeroinitializer) + %cmp = fcmp ole half %reduce, %arg + br i1 %cmp, label %taken, label %not_taken +taken: + ret i16 0 +not_taken: + ret i16 1 +} + +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) From 8e36604e98a2cd211cee12844e1309b60d54c44f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 8 May 2026 23:59:50 -0700 Subject: [PATCH 128/538] [RISCV][GISel] Add test coverage for the srliw+shXadd patterns. NFC (#196676) GISel isn't canonicalizing the shift pair to an AND the same way SelectionDAG does so the patterns weren't firing. Add more directed tests that use an And explicitly. --- llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll index 9c9c014e3c172..e2e7c01fdc096 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll @@ -613,6 +613,24 @@ define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) { ret i16 %6 } +define i64 @srliw_1_sh1add_2(i64 %x, i64 %y) { +; RV64I-LABEL: srliw_1_sh1add_2: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a0, a0, 1 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: srliw_1_sh1add_2: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: srliw a0, a0, 1 +; RV64ZBA-NEXT: sh1add a0, a0, a1 +; RV64ZBA-NEXT: ret + %a = and i64 %x, u0xfffffffe + %b = add i64 %a, %y + ret i64 %b +} + define i128 @slliuw_ptrdiff(i64 %diff, ptr %baseptr) { ; RV64I-LABEL: slliuw_ptrdiff: ; RV64I: # %bb.0: @@ -661,6 +679,24 @@ define signext i32 @srliw_2_sh2add(ptr %0, i32 signext %1) { ret i32 %6 } +define i64 @srliw_2_sh2add_2(i64 %x, i64 %y) { +; RV64I-LABEL: srliw_2_sh2add_2: +; RV64I: # %bb.0: +; RV64I-NEXT: 
srliw a0, a0, 2 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: srliw_2_sh2add_2: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: srliw a0, a0, 2 +; RV64ZBA-NEXT: sh2add a0, a0, a1 +; RV64ZBA-NEXT: ret + %a = and i64 %x, u0xfffffffc + %b = add i64 %a, %y + ret i64 %b +} + define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) { ; RV64I-LABEL: srliw_3_sh3add: ; RV64I: # %bb.0: @@ -683,6 +719,24 @@ define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) { ret i64 %6 } +define i64 @srliw_3_sh3add_2(i64 %x, i64 %y) { +; RV64I-LABEL: srliw_3_sh3add_2: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a0, a0, 3 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: srliw_3_sh3add_2: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: srliw a0, a0, 3 +; RV64ZBA-NEXT: sh3add a0, a0, a1 +; RV64ZBA-NEXT: ret + %a = and i64 %x, u0xfffffff8 + %b = add i64 %a, %y + ret i64 %b +} + define signext i32 @srliw_1_sh2add(ptr %0, i32 signext %1) { ; RV64I-LABEL: srliw_1_sh2add: ; RV64I: # %bb.0: From d1f0d1b37eb77c64c9a54479ad339e22f9b80c1e Mon Sep 17 00:00:00 2001 From: Edward Nathan Varghese Date: Sat, 9 May 2026 12:30:52 +0530 Subject: [PATCH 129/538] [clang][AMDGPU] Reject malformed target IDs with empty components (#196140) Fixes #196078 An extra colon in `-mcpu` (e.g. `gfx900::xnack+`) produced an empty feature component and triggered an assertion in `StringRef::back()`. Return `std::nullopt` for malformed target IDs instead. 
--- clang/lib/Basic/TargetID.cpp | 2 ++ clang/test/Driver/amdgpu-invalid-target-id.s | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/clang/lib/Basic/TargetID.cpp b/clang/lib/Basic/TargetID.cpp index 0aca490e17903..446577930b017 100644 --- a/clang/lib/Basic/TargetID.cpp +++ b/clang/lib/Basic/TargetID.cpp @@ -89,6 +89,8 @@ parseTargetIDWithFormatCheckingOnly(llvm::StringRef TargetID, while (!Features.empty()) { auto Splits = Features.split(':'); + if (Splits.first.empty()) + return std::nullopt; auto Sign = Splits.first.back(); auto Feature = Splits.first.drop_back(); if (Sign != '+' && Sign != '-') diff --git a/clang/test/Driver/amdgpu-invalid-target-id.s b/clang/test/Driver/amdgpu-invalid-target-id.s index 7d1d8e4772338..4fe5493a3e6bb 100644 --- a/clang/test/Driver/amdgpu-invalid-target-id.s +++ b/clang/test/Driver/amdgpu-invalid-target-id.s @@ -39,3 +39,9 @@ // RUN: %s 2>&1 | FileCheck -check-prefix=NOCOLON %s // NOCOLON: error: invalid target ID 'gfx900+xnack' + +// RUN: not %clang --target=amdgcn-amd-amdhsa \ +// RUN: -mcpu=gfx900::xnack+ -nostdlib \ +// RUN: %s 2>&1 | FileCheck -check-prefix=EXTRACOL %s + +// EXTRACOL: error: invalid target ID 'gfx900::xnack+' From 89f9ebd1e329a0969a470608ab9fd481defcfd7a Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 9 May 2026 08:30:09 +0100 Subject: [PATCH 130/538] [AArch64][GlobalISel] Enable BF16 legalization for fadd and friends. 
(#196081) This enabled bf16 promotion for the following operations in GISel, promoting them to f32 and truncating the result back: G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM, G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 7 +- .../AArch64/Atomics/aarch64-atomicrmw-lsfe.ll | 285 +- .../Atomics/aarch64-atomicrmw-v8a_fp.ll | 660 ++-- .../GlobalISel/legalizer-info-validation.mir | 68 +- .../test/CodeGen/AArch64/bf16-instructions.ll | 1302 ++++--- .../CodeGen/AArch64/bf16-v4-instructions.ll | 1558 ++++++--- .../CodeGen/AArch64/bf16-v8-instructions.ll | 3108 +++++++++++------ 7 files changed, 4564 insertions(+), 2424 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 605cb86a7bb60..88d2455aae425 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -450,7 +450,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor(HasFP16, {f16, v4f16, v8f16}) .libcallFor({f128}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) - .minScalarOrElt(0, MinFPScalar) + .widenScalarIf( + [=](const LegalityQuery &Q) { + return (!HasFP16 && Q.Types[0].getScalarType().isFloat16()) || + Q.Types[0].getScalarType().isBFloat16(); + }, + changeElementTo(0, f32)) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll index 290c8a60d700d..6a0b689b16888 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll @@ -239,8 +239,11 @@ define dso_local half @atomicrmw_fadd_half_unaligned_seq_cst(ptr 
%ptr, half %val define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: @@ -253,8 +256,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: @@ -267,8 +273,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_release: @@ -281,8 +290,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: @@ 
-295,8 +307,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: @@ -449,12 +464,15 @@ define dso_local half @atomicrmw_fsub_half_aligned_seq_cst(ptr %ptr, half %value define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -467,12 +485,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -485,12 +506,15 @@ define 
dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -503,12 +527,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -521,12 +548,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; 
-O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -714,8 +744,11 @@ define dso_local half @atomicrmw_fsub_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: @@ -728,8 +761,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: @@ -742,8 +778,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_release: @@ -756,8 +795,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr 
w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: @@ -770,8 +812,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: @@ -1085,8 +1130,11 @@ define dso_local half @atomicrmw_fmax_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: @@ -1099,8 +1147,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: @@ -1113,8 +1164,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: 
atomicrmw_fmax_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_release: @@ -1127,8 +1181,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: @@ -1141,8 +1198,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: @@ -1456,8 +1516,11 @@ define dso_local half @atomicrmw_fmin_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: @@ -1470,8 +1533,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local 
bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: @@ -1484,8 +1550,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_release: @@ -1498,8 +1567,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: @@ -1512,8 +1584,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: @@ -1827,8 +1902,11 @@ define dso_local 
half @atomicrmw_fmaximum_half_unaligned_seq_cst(ptr %ptr, half define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_monotonic: @@ -1841,8 +1919,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_monotonic(ptr %ptr, define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acquire: @@ -1855,8 +1936,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acquire(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_release: @@ -1869,8 +1953,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_release(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; 
-O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acq_rel: @@ -1883,8 +1970,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acq_rel(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_seq_cst: @@ -2198,8 +2288,11 @@ define dso_local half @atomicrmw_fminimum_half_unaligned_seq_cst(ptr %ptr, half define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_monotonic: @@ -2212,8 +2305,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_monotonic(ptr %ptr, define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_acquire: @@ -2226,8 +2322,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acquire(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 
+; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_release: @@ -2240,8 +2339,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_release(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_acq_rel: @@ -2254,8 +2356,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acq_rel(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_seq_cst: diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll index ccb4fd61b002c..d71231059c3b3 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll @@ -75,12 +75,15 @@ define dso_local half @atomicrmw_fadd_half_aligned_seq_cst(ptr %ptr, half %value define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, 
w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -93,12 +96,15 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_monotonic(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -111,12 +117,15 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acquire(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -129,12 +138,15 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_release(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: 
atomicrmw_fadd_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -147,12 +159,15 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acq_rel(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -340,8 +355,11 @@ define dso_local half @atomicrmw_fadd_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: @@ -354,8 +372,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: 
atomicrmw_fadd_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: @@ -368,8 +389,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_release: @@ -382,8 +406,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: @@ -396,8 +423,11 @@ define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: @@ -550,12 +580,15 @@ define dso_local half @atomicrmw_fsub_half_aligned_seq_cst(ptr %ptr, half %value define dso_local bfloat 
@atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -568,12 +601,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -586,12 +622,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: 
atomicrmw_fsub_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -604,12 +643,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -622,12 +664,15 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -815,8 +860,11 @@ define dso_local half @atomicrmw_fsub_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: 
atomicrmw_fsub_bfloat_unaligned_monotonic: @@ -829,8 +877,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: @@ -843,8 +894,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_release: @@ -857,8 +911,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: @@ -871,8 +928,11 @@ define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; 
-O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: @@ -1025,12 +1085,15 @@ define dso_local half @atomicrmw_fmax_half_aligned_seq_cst(ptr %ptr, half %value define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -1043,12 +1106,15 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_monotonic(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -1061,12 +1127,15 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acquire(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, 
#0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -1079,12 +1148,15 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_release(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -1097,12 +1169,15 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acq_rel(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -1290,8 +1365,11 @@ define dso_local half @atomicrmw_fmax_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: 
-; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: @@ -1304,8 +1382,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: @@ -1318,8 +1399,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_release: @@ -1332,8 +1416,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: @@ -1346,8 +1433,11 @@ define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat 
@atomicrmw_fmax_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: @@ -1500,12 +1590,15 @@ define dso_local half @atomicrmw_fmin_half_aligned_seq_cst(ptr %ptr, half %value define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -1518,12 +1611,15 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_monotonic(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -1536,12 +1632,15 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acquire(ptr %ptr, bfloat define dso_local bfloat 
@atomicrmw_fmin_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -1554,12 +1653,15 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_release(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -1572,12 +1674,15 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acq_rel(ptr %ptr, bfloat define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: 
atomicrmw_fmin_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -1765,8 +1870,11 @@ define dso_local half @atomicrmw_fmin_half_unaligned_seq_cst(ptr %ptr, half %val define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: @@ -1779,8 +1887,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: @@ -1793,8 +1904,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_release: @@ -1807,8 +1921,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, 
w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: @@ -1821,8 +1938,11 @@ define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloa define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: @@ -1975,12 +2095,15 @@ define dso_local half @atomicrmw_fmaximum_half_aligned_seq_cst(ptr %ptr, half %v define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -1993,12 +2116,15 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_monotonic(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, 
uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -2011,12 +2137,15 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_acquire(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -2029,12 +2158,15 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_release(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ -2047,12 +2179,15 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_acq_rel(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fmaximum_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] 
-; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -2240,8 +2375,11 @@ define dso_local half @atomicrmw_fmaximum_half_unaligned_seq_cst(ptr %ptr, half define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_monotonic: @@ -2254,8 +2392,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_monotonic(ptr %ptr, define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acquire: @@ -2268,8 +2409,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acquire(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_release: @@ -2282,8 +2426,11 @@ define dso_local 
bfloat @atomicrmw_fmaximum_bfloat_unaligned_release(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_acq_rel: @@ -2296,8 +2443,11 @@ define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_acq_rel(ptr %ptr, b define dso_local bfloat @atomicrmw_fmaximum_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fmaximum_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fmaximum_bfloat_unaligned_seq_cst: @@ -2450,12 +2600,15 @@ define dso_local half @atomicrmw_fminimum_half_aligned_seq_cst(ptr %ptr, half %v define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_aligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_aligned_monotonic: ; -O1: ldxrh w9, [x0] @@ -2468,12 +2621,15 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_monotonic(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: 
atomicrmw_fminimum_bfloat_aligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_aligned_acquire: ; -O1: ldaxrh w9, [x0] @@ -2486,12 +2642,15 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_acquire(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_release(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_aligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_aligned_release: ; -O1: ldxrh w9, [x0] @@ -2504,12 +2663,15 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_release(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_aligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_aligned_acq_rel: ; -O1: ldaxrh w9, [x0] @@ 
-2522,12 +2684,15 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_acq_rel(ptr %ptr, bfl define dso_local bfloat @atomicrmw_fminimum_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_aligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 -; -O0: ldaxrh w8, [x11] -; -O0: cmp w8, w9, uxth -; -O0: stlxrh w10, w12, [x11] -; -O0: subs w9, w8, w9, uxth +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w11, w9, w11 +; -O0: add w9, w9, w11 +; -O0: csel w8, w8, w9, vs +; -O0: ldaxrh w9, [x10] +; -O0: cmp w9, w11, uxth +; -O0: stlxrh w8, w12, [x10] +; -O0: subs w8, w8, w9, uxth ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_aligned_seq_cst: ; -O1: ldaxrh w9, [x0] @@ -2715,8 +2880,11 @@ define dso_local half @atomicrmw_fminimum_half_unaligned_seq_cst(ptr %ptr, half define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_monotonic: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_monotonic: @@ -2729,8 +2897,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_monotonic(ptr %ptr, define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_acquire: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_acquire: @@ -2743,8 +2914,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acquire(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_release(ptr %ptr, bfloat %value) 
{ ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_release: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_release: @@ -2757,8 +2931,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_release(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_acq_rel: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_acq_rel: @@ -2771,8 +2948,11 @@ define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_acq_rel(ptr %ptr, b define dso_local bfloat @atomicrmw_fminimum_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { ; -O0-LABEL: atomicrmw_fminimum_bfloat_unaligned_seq_cst: -; -O0: add w8, w8, w9 -; -O0: add w8, w8, w9 +; -O0: orr w8, w8, #0x400000 +; -O0: and w9, w9, #0x1 +; -O0: add w10, w9, w10 +; -O0: add w9, w9, w10 +; -O0: csel w8, w8, w9, vs ; -O0: bl __atomic_compare_exchange ; ; -O1-LABEL: atomicrmw_fminimum_bfloat_unaligned_seq_cst: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index ff4fbc0e1afeb..17bb373070568 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -179,12 +179,12 @@ # # DEBUG-NEXT: G_INTRINSIC_TRUNC (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INTRINSIC_ROUND (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected @@ -195,8 +195,8 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined @@ -516,27 +516,27 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMAD (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -627,12 +627,12 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FMINNUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMAXNUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMINNUM_IEEE (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined @@ -641,12 +641,12 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FMINIMUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMAXIMUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. 
the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FMINIMUMNUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined @@ -772,8 +772,8 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -818,20 +818,20 @@ # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FRINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FNEARBYINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 1, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_ADDRSPACE_CAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index d9f661d2d1d34..05bf7a3edbd45 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -4,12 +4,7 @@ ; RUN: llc < %s -mtriple aarch64 -mattr=-bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI ; RUN: llc < %s -mtriple aarch64 -mattr=+bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-BF16-GI -; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_fadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fsub -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmul -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_frem ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f32_f16 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une @@ -39,7 +34,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32_fadd ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sitofp_i32_fadd ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_double -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-CVT-GI-NEXT: warning: 
Instruction selection used fallback path for test_sin ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_cos @@ -57,26 +51,10 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log10 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log2 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fma ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fabs ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_minnum -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_floor -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_ceil -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_trunc -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_rint -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_round -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd -; -; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_fadd -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fsub -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmul -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv -; 
CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; +; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_frem ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f32_f16 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une @@ -106,7 +84,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32_fadd ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sitofp_i32_fadd ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_double -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sin ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_cos @@ -124,184 +101,341 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log10 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log2 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fma ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fabs ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fneg -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_minnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_floor -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_ceil -; 
CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_trunc -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_rint -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_round -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fadd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd s0, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fadd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fadd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; 
CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fadd s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fadd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fadd s0, s0, s1 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fadd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fadd s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fadd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fadd s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret %r = fadd bfloat %a, %b ret bfloat %r } define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fsub: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // 
kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fsub s0, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fsub: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fsub s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fsub: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fsub s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fsub: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fsub s0, s0, s1 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fsub: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; 
CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fsub s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fsub: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fsub s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret %r = fsub bfloat %a, %b ret bfloat %r } define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fmul: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmul s0, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmul: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: 
shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmul s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fmul: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmul s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fmul: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmul s0, s0, s1 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fmul: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fmul s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fmul: +; CHECK-BF16-GI: // %bb.0: 
+; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmul s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret %r = fmul bfloat %a, %b ret bfloat %r } define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { -; CHECK-CVT-LABEL: test_fmadd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmul s0, s0, s1 -; CHECK-CVT-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 -; CHECK-CVT-NEXT: add w8, w8, w10 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd s0, s0, s1 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 -; CHECK-CVT-NEXT: add w8, w8, w10 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmadd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w10, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmul s0, s0, s1 +; CHECK-CVT-SD-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w8, w10 +; 
CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w8, w10 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fmadd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret - %mul = fmul fast bfloat %a, %b - %r = fadd fast bfloat %mul, %c - ret bfloat %r -} - -define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fdiv: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fdiv s0, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fmadd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 
+; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s0, s0, s1, s2 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fdiv: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fdiv s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-CVT-GI-LABEL: test_fmadd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w10, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fmul s0, s0, s1 +; CHECK-CVT-GI-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-GI-NEXT: add w11, w8, w10 +; CHECK-CVT-GI-NEXT: orr w8, w8, #0x400000 +; CHECK-CVT-GI-NEXT: add w9, w11, w9 +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fadd s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-GI-NEXT: add w10, w8, w10 +; CHECK-CVT-GI-NEXT: orr w8, w8, #0x400000 +; CHECK-CVT-GI-NEXT: add w9, w10, w9 +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fmadd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 
+; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmul s0, s0, s1 +; CHECK-BF16-GI-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: fadd s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret + %mul = fmul fast bfloat %a, %b + %r = fadd fast bfloat %mul, %c + ret bfloat %r +} + +define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { +; CHECK-CVT-SD-LABEL: test_fdiv: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fdiv s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fdiv: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fdiv s0, s0, s1 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fdiv: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-CVT-GI-NEXT: fdiv s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fdiv: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fdiv s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret %r = fdiv bfloat %a, %b ret bfloat %r } @@ -1212,20 +1346,20 @@ define bfloat @test_bitcast_f16tobfloat(half %a) #0 { } define bfloat @test_sqrt(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_sqrt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fsqrt s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_sqrt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fsqrt s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; 
CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_sqrt: ; CHECK-BF16: // %bb.0: @@ -1234,6 +1368,24 @@ define bfloat @test_sqrt(bfloat %a) #0 { ; CHECK-BF16-NEXT: fsqrt s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_sqrt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fsqrt s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.sqrt.bf16(bfloat %a) ret bfloat %r } @@ -1812,36 +1964,70 @@ define bfloat @test_log2(bfloat %a) #0 { } define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { -; CHECK-CVT-LABEL: test_fma: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmadd s0, s0, s1, s2 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 -; CHECK-CVT-NEXT: add w8, w8, w10 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fma: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h2 killed $h2 def 
$d2 +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w10, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmadd s0, s0, s1, s2 +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w8, w10 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fma: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fma: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s0, s0, s1, s2 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fma: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-GI-NEXT: mov w10, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll 
v2.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: fmadd s0, s0, s1, s2 +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-GI-NEXT: add w10, w8, w10 +; CHECK-CVT-GI-NEXT: orr w8, w8, #0x400000 +; CHECK-CVT-GI-NEXT: add w9, w10, w9 +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fma: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: fmadd s0, s0, s1, s2 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret %r = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %r } @@ -1873,107 +2059,167 @@ define bfloat @test_fneg(bfloat %a) #0 { } define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_minnum: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fminnm s0, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret -; -; CHECK-BF16-LABEL: test_minnum: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; 
CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fminnm s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret - %r = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) - ret bfloat %r -} - -define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_maxnum: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmaxnm s0, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret -; -; CHECK-BF16-LABEL: test_maxnum: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmaxnm s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret - %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) - ret bfloat %r -} - -define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-SD-LABEL: test_copysign: +; CHECK-CVT-SD-LABEL: test_minnum: ; CHECK-CVT-SD: // %bb.0: ; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff ; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-SD-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: fminnm s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; 
CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 ; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 ; CHECK-CVT-SD-NEXT: fmov s0, w8 ; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-SD-LABEL: test_copysign: +; CHECK-BF16-SD-LABEL: test_minnum: ; CHECK-BF16-SD: // %bb.0: ; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-SD-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-BF16-SD-NEXT: fminnm s0, s0, s1 ; CHECK-BF16-SD-NEXT: bfcvt h0, s0 ; CHECK-BF16-SD-NEXT: ret ; -; CHECK-CVT-GI-LABEL: test_copysign: +; CHECK-CVT-GI-LABEL: test_minnum: ; CHECK-CVT-GI: // %bb.0: -; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 -; CHECK-CVT-GI-NEXT: ret -; -; CHECK-BF16-GI-LABEL: test_copysign: -; CHECK-BF16-GI: // %bb.0: -; CHECK-BF16-GI-NEXT: mvni v2.4h, #128, lsl #8 -; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fminnm s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed 
$s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_minnum: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fminnm s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret + %r = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) + ret bfloat %r +} + +define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { +; CHECK-CVT-SD-LABEL: test_maxnum: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmaxnm s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_maxnum: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmaxnm s0, s0, s1 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_maxnum: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; 
CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_maxnum: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret + %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) + ret bfloat %r +} + +define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { +; CHECK-CVT-SD-LABEL: test_copysign: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_copysign: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_copysign: +; CHECK-CVT-GI: // %bb.0: +; 
CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_copysign: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-BF16-GI-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-BF16-GI-NEXT: ret @@ -2147,20 +2393,20 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { } define bfloat @test_floor(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_floor: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frintm s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_floor: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frintm s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_floor: ; CHECK-BF16: // %bb.0: @@ -2169,25 +2415,43 @@ define bfloat @test_floor(bfloat %a) #0 { ; 
CHECK-BF16-NEXT: frintm s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_floor: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frintm s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.floor.bf16(bfloat %a) ret bfloat %r } define bfloat @test_ceil(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_ceil: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frintp s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_ceil: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frintp s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_ceil: ; CHECK-BF16: 
// %bb.0: @@ -2196,25 +2460,43 @@ define bfloat @test_ceil(bfloat %a) #0 { ; CHECK-BF16-NEXT: frintp s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_ceil: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frintp s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.ceil.bf16(bfloat %a) ret bfloat %r } define bfloat @test_trunc(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_trunc: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frintz s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_trunc: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frintz s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed 
$s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_trunc: ; CHECK-BF16: // %bb.0: @@ -2223,25 +2505,43 @@ define bfloat @test_trunc(bfloat %a) #0 { ; CHECK-BF16-NEXT: frintz s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_trunc: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frintz s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.trunc.bf16(bfloat %a) ret bfloat %r } define bfloat @test_rint(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_rint: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frintx s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_rint: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frintx s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; 
CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_rint: ; CHECK-BF16: // %bb.0: @@ -2250,25 +2550,43 @@ define bfloat @test_rint(bfloat %a) #0 { ; CHECK-BF16-NEXT: frintx s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_rint: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frintx s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.rint.bf16(bfloat %a) ret bfloat %r } define bfloat @test_nearbyint(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_nearbyint: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frinti s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_nearbyint: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frinti s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add 
w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_nearbyint: ; CHECK-BF16: // %bb.0: @@ -2277,25 +2595,43 @@ define bfloat @test_nearbyint(bfloat %a) #0 { ; CHECK-BF16-NEXT: frinti s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_nearbyint: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frinti s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.nearbyint.bf16(bfloat %a) ret bfloat %r } define bfloat @test_round(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_round: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frinta s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_round: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frinta s0, s0 +; 
CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_round: ; CHECK-BF16: // %bb.0: @@ -2304,25 +2640,43 @@ define bfloat @test_round(bfloat %a) #0 { ; CHECK-BF16-NEXT: frinta s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_round: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frinta s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.round.bf16(bfloat %a) ret bfloat %r } define bfloat @test_roundeven(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_roundeven: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: frintn s0, s0 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_roundeven: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w8, #32767 
// =0x7fff +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: frintn s0, s0 +; CHECK-CVT-SD-NEXT: fmov w9, s0 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_roundeven: ; CHECK-BF16: // %bb.0: @@ -2331,52 +2685,118 @@ define bfloat @test_roundeven(bfloat %a) #0 { ; CHECK-BF16-NEXT: frintn s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_roundeven: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: frintn s0, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-GI-NEXT: add w8, w9, w8 +; CHECK-CVT-GI-NEXT: orr w9, w9, #0x400000 +; CHECK-CVT-GI-NEXT: add w8, w8, w10 +; CHECK-CVT-GI-NEXT: csel w8, w9, w8, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret %r = call bfloat @llvm.roundeven.bf16(bfloat %a) ret bfloat %r } define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { -; CHECK-CVT-LABEL: test_fmuladd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmul s0, s0, s1 -; CHECK-CVT-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 -; CHECK-CVT-NEXT: add w8, w8, w10 -; 
CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd s0, s0, s1 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 -; CHECK-CVT-NEXT: add w8, w8, w10 -; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmuladd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: mov w10, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmul s0, s0, s1 +; CHECK-CVT-SD-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w8, w10 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd s0, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w8, s0 +; CHECK-CVT-SD-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w8, w10 +; CHECK-CVT-SD-NEXT: add w8, w9, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fmuladd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmul s0, s0, s1 -; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: shll v0.4s, 
v0.4h, #16 -; CHECK-BF16-NEXT: fadd s0, s0, s1 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fmuladd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmul s0, s0, s1 +; CHECK-BF16-SD-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fadd s0, s0, s1 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fmuladd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: mov w10, #32767 // =0x7fff +; CHECK-CVT-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fmul s0, s0, s1 +; CHECK-CVT-GI-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-GI-NEXT: add w11, w8, w10 +; CHECK-CVT-GI-NEXT: orr w8, w8, #0x400000 +; CHECK-CVT-GI-NEXT: add w9, w11, w9 +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fadd s0, s0, s1 +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fcmp s0, #0.0 +; CHECK-CVT-GI-NEXT: ubfx w9, w8, #16, #1 +; CHECK-CVT-GI-NEXT: add w10, w8, w10 +; CHECK-CVT-GI-NEXT: orr w8, w8, #0x400000 +; CHECK-CVT-GI-NEXT: add w9, w10, w9 +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, vs +; CHECK-CVT-GI-NEXT: lsr w8, w8, #16 +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 
killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fmuladd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmul s0, s0, s1 +; CHECK-BF16-GI-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: fadd s0, s0, s1 +; CHECK-BF16-GI-NEXT: bfcvt h0, s0 +; CHECK-BF16-GI-NEXT: ret %r = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %r } diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll index c10436c93614c..823a245a3cc81 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll @@ -4,12 +4,7 @@ ; RUN: llc < %s -mtriple aarch64 -mattr=-bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI ; RUN: llc < %s -mtriple aarch64 -mattr=+bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-BF16-GI -; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_fadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fsub -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmul -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_frem ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une ; 
CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt @@ -40,7 +35,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sin ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_cos @@ -58,26 +52,10 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log10 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log2 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fma ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fabs ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_minnum -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_floor -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_ceil -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_trunc -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_rint -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint -; CHECK-CVT-GI-NEXT: warning: Instruction 
selection used fallback path for test_round -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd -; -; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_fadd -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fsub -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmul -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; +; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_frem ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt @@ -108,7 +86,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sin ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_cos @@ -126,19 +103,8 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log10 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback 
path for test_log2 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fma ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fabs ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fneg -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_minnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_floor -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_ceil -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_trunc -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_rint -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_round -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd define <4 x bfloat> @test_build(<4 x bfloat> %a) { ; CHECK-CVT-SD-LABEL: test_build: @@ -168,138 +134,289 @@ define <4 x bfloat> @test_build(<4 x bfloat> %a) { } define <4 x bfloat> @test_fadd(<4 x bfloat> %a, <4 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fadd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fadd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, 
v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fadd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fadd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fadd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fadd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %r } define <4 x 
bfloat> @test_fsub(<4 x bfloat> %a, <4 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fsub: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fsub: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fsub: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fsub v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fsub: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fsub: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, 
v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fsub: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %r } define <4 x bfloat> @test_fmul(<4 x bfloat> %a, <4 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fmul: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmul: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fmul: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fmul: +; CHECK-BF16-SD: // %bb.0: +; 
CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fmul: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fmul: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %r } define <4 x bfloat> @test_fmadd(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) #0 { -; CHECK-CVT-LABEL: test_fmadd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8 -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v3.16b, v3.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v4.4s -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: ushr v2.4s, 
v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v4.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmadd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v4.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v3.16b, v3.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v4.4s +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v4.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fmadd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fmadd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fmadd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; 
CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v5.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v7.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v5.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v7.4s, v4.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fmadd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %mul = fmul fast <4 x bfloat> %a, %b %r = fadd fast <4 x bfloat> %mul, %c ret <4 x bfloat> %r } define <4 x bfloat> @test_fdiv(<4 x bfloat> %a, <4 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fdiv: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: 
fdiv v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fdiv: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fdiv: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fdiv v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fdiv: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fdiv: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; 
CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fdiv: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = fdiv <4 x bfloat> %a, %b ret <4 x bfloat> %r } @@ -1154,74 +1271,99 @@ define <4 x bfloat> @test_bitcast_f16tobfloat(float, <4 x half> %a) { } define <4 x bfloat> @test_sqrt(<4 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_sqrt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-CVT-NEXT: mov h1, v0.h[1] -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: mov h3, v0.h[2] -; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: fsqrt s2, s2 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmov w10, s2 -; CHECK-CVT-NEXT: ubfx w12, w10, #16, #1 -; CHECK-CVT-NEXT: add w10, w10, w8 -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: fsqrt s1, s1 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: ubfx w11, w9, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: fsqrt s3, s3 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov w11, s3 -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: add w9, w11, w8 -; CHECK-CVT-NEXT: add w9, w12, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: fsqrt s4, s0 -; CHECK-CVT-NEXT: fmov s0, w10 -; CHECK-CVT-NEXT: mov v0.h[1], v1.h[0] -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: mov v0.h[2], v1.h[0] -; CHECK-CVT-NEXT: fmov w10, s4 -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: add w8, 
w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s1, w8 -; CHECK-CVT-NEXT: mov v0.h[3], v1.h[0] -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_sqrt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-CVT-SD-NEXT: mov h1, v0.h[1] +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: mov h3, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h0, v0.h[3] +; CHECK-CVT-SD-NEXT: fsqrt s2, s2 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w10, s2 +; CHECK-CVT-SD-NEXT: ubfx w12, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w10, w10, w8 +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: fsqrt s1, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s1 +; CHECK-CVT-SD-NEXT: ubfx w11, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: fsqrt s3, s3 +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: fmov w11, s3 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w11, w8 +; CHECK-CVT-SD-NEXT: add w9, w12, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: fsqrt s4, s0 +; CHECK-CVT-SD-NEXT: fmov s0, w10 +; CHECK-CVT-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-CVT-SD-NEXT: fmov w10, s4 +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s1, w8 +; CHECK-CVT-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_sqrt: -; CHECK-BF16: // %bb.0: -; 
CHECK-BF16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-BF16-NEXT: mov h1, v0.h[1] -; CHECK-BF16-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h3, v0.h[2] -; CHECK-BF16-NEXT: mov h0, v0.h[3] -; CHECK-BF16-NEXT: fsqrt s2, s2 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fsqrt s1, s1 -; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: fsqrt s3, s3 -; CHECK-BF16-NEXT: fsqrt s4, s0 -; CHECK-BF16-NEXT: bfcvt h0, s2 -; CHECK-BF16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-BF16-NEXT: bfcvt h1, s3 -; CHECK-BF16-NEXT: mov v0.h[2], v1.h[0] -; CHECK-BF16-NEXT: bfcvt h1, s4 -; CHECK-BF16-NEXT: mov v0.h[3], v1.h[0] -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_sqrt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-BF16-SD-NEXT: mov h1, v0.h[1] +; CHECK-BF16-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h3, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h0, v0.h[3] +; CHECK-BF16-SD-NEXT: fsqrt s2, s2 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fsqrt s1, s1 +; CHECK-BF16-SD-NEXT: bfcvt h1, s1 +; CHECK-BF16-SD-NEXT: fsqrt s3, s3 +; CHECK-BF16-SD-NEXT: fsqrt s4, s0 +; CHECK-BF16-SD-NEXT: bfcvt h0, s2 +; CHECK-BF16-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h1, s3 +; CHECK-BF16-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h1, s4 +; CHECK-BF16-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_sqrt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: 
fsqrt v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_sqrt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: fsqrt v0.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } @@ -3078,106 +3220,135 @@ define <4 x bfloat> @test_log2(<4 x bfloat> %a) #0 { } define <4 x bfloat> @test_fma(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) #0 { -; CHECK-CVT-LABEL: test_fma: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-CVT-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-CVT-NEXT: mov h3, v2.h[1] -; CHECK-CVT-NEXT: mov h4, v1.h[1] -; CHECK-CVT-NEXT: mov w9, #32767 // =0x7fff -; CHECK-CVT-NEXT: mov h5, v0.h[1] -; CHECK-CVT-NEXT: shll v6.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v7.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v16.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov h17, v2.h[2] -; CHECK-CVT-NEXT: mov h18, v1.h[2] -; CHECK-CVT-NEXT: mov h19, v0.h[2] -; CHECK-CVT-NEXT: mov h2, v2.h[3] -; CHECK-CVT-NEXT: mov h1, v1.h[3] -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: fmadd s3, s5, s4, s3 -; CHECK-CVT-NEXT: fmadd s4, s16, s7, s6 -; CHECK-CVT-NEXT: shll v5.4s, v17.4h, #16 -; CHECK-CVT-NEXT: 
shll v6.4s, v18.4h, #16 -; CHECK-CVT-NEXT: shll v7.4s, v19.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmov w8, s3 -; CHECK-CVT-NEXT: fmov w10, s4 -; CHECK-CVT-NEXT: fmadd s3, s7, s6, s5 -; CHECK-CVT-NEXT: fmadd s1, s0, s1, s2 -; CHECK-CVT-NEXT: ubfx w11, w8, #16, #1 -; CHECK-CVT-NEXT: ubfx w12, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w8, w9 -; CHECK-CVT-NEXT: add w10, w10, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: fmov w11, s3 -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w8 -; CHECK-CVT-NEXT: fmov s0, w10 -; CHECK-CVT-NEXT: add w8, w11, w9 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: add w8, w12, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s1, w8 -; CHECK-CVT-NEXT: add w8, w10, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: mov v0.h[2], v1.h[0] -; CHECK-CVT-NEXT: fmov s1, w8 -; CHECK-CVT-NEXT: mov v0.h[3], v1.h[0] -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fma: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-CVT-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-CVT-SD-NEXT: mov h3, v2.h[1] +; CHECK-CVT-SD-NEXT: mov h4, v1.h[1] +; CHECK-CVT-SD-NEXT: mov w9, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: mov h5, v0.h[1] +; CHECK-CVT-SD-NEXT: shll v6.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v7.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov h17, v2.h[2] +; CHECK-CVT-SD-NEXT: mov h18, v1.h[2] +; CHECK-CVT-SD-NEXT: mov h19, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h2, v2.h[3] +; CHECK-CVT-SD-NEXT: mov h1, v1.h[3] +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, 
#16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-CVT-SD-NEXT: mov h0, v0.h[3] +; CHECK-CVT-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fmadd s3, s5, s4, s3 +; CHECK-CVT-SD-NEXT: fmadd s4, s16, s7, s6 +; CHECK-CVT-SD-NEXT: shll v5.4s, v17.4h, #16 +; CHECK-CVT-SD-NEXT: shll v6.4s, v18.4h, #16 +; CHECK-CVT-SD-NEXT: shll v7.4s, v19.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w8, s3 +; CHECK-CVT-SD-NEXT: fmov w10, s4 +; CHECK-CVT-SD-NEXT: fmadd s3, s7, s6, s5 +; CHECK-CVT-SD-NEXT: fmadd s1, s0, s1, s2 +; CHECK-CVT-SD-NEXT: ubfx w11, w8, #16, #1 +; CHECK-CVT-SD-NEXT: ubfx w12, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w8, w9 +; CHECK-CVT-SD-NEXT: add w10, w10, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: fmov w11, s3 +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w8 +; CHECK-CVT-SD-NEXT: fmov s0, w10 +; CHECK-CVT-SD-NEXT: add w8, w11, w9 +; CHECK-CVT-SD-NEXT: fmov w10, s1 +; CHECK-CVT-SD-NEXT: add w8, w12, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[1], v2.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s1, w8 +; CHECK-CVT-SD-NEXT: add w8, w10, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-CVT-SD-NEXT: fmov s1, w8 +; CHECK-CVT-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fma: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-BF16-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-BF16-NEXT: mov h3, v2.h[1] -; CHECK-BF16-NEXT: mov h4, 
v1.h[1] -; CHECK-BF16-NEXT: mov h5, v0.h[1] -; CHECK-BF16-NEXT: shll v6.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v7.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h17, v2.h[2] -; CHECK-BF16-NEXT: mov h18, v1.h[2] -; CHECK-BF16-NEXT: mov h19, v0.h[2] -; CHECK-BF16-NEXT: mov h2, v2.h[3] -; CHECK-BF16-NEXT: mov h1, v1.h[3] -; CHECK-BF16-NEXT: fmadd s6, s16, s7, s6 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-BF16-NEXT: mov h16, v0.h[3] -; CHECK-BF16-NEXT: shll v7.4s, v19.4h, #16 -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: fmadd s3, s5, s4, s3 -; CHECK-BF16-NEXT: shll v4.4s, v17.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v18.4h, #16 -; CHECK-BF16-NEXT: bfcvt h0, s6 -; CHECK-BF16-NEXT: fmadd s4, s7, s5, s4 -; CHECK-BF16-NEXT: shll v5.4s, v16.4h, #16 -; CHECK-BF16-NEXT: bfcvt h3, s3 -; CHECK-BF16-NEXT: fmadd s1, s5, s1, s2 -; CHECK-BF16-NEXT: mov v0.h[1], v3.h[0] -; CHECK-BF16-NEXT: bfcvt h3, s4 -; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: mov v0.h[2], v3.h[0] -; CHECK-BF16-NEXT: mov v0.h[3], v1.h[0] -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fma: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-BF16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-BF16-SD-NEXT: mov h3, v2.h[1] +; CHECK-BF16-SD-NEXT: mov h4, v1.h[1] +; CHECK-BF16-SD-NEXT: mov h5, v0.h[1] +; CHECK-BF16-SD-NEXT: shll v6.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v7.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h17, v2.h[2] +; CHECK-BF16-SD-NEXT: mov h18, v1.h[2] +; CHECK-BF16-SD-NEXT: mov h19, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h2, v2.h[3] +; CHECK-BF16-SD-NEXT: mov h1, v1.h[3] +; 
CHECK-BF16-SD-NEXT: fmadd s6, s16, s7, s6 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16-SD-NEXT: mov h16, v0.h[3] +; CHECK-BF16-SD-NEXT: shll v7.4s, v19.4h, #16 +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s3, s5, s4, s3 +; CHECK-BF16-SD-NEXT: shll v4.4s, v17.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v18.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvt h0, s6 +; CHECK-BF16-SD-NEXT: fmadd s4, s7, s5, s4 +; CHECK-BF16-SD-NEXT: shll v5.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16-SD-NEXT: fmadd s1, s5, s1, s2 +; CHECK-BF16-SD-NEXT: mov v0.h[1], v3.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h3, s4 +; CHECK-BF16-SD-NEXT: bfcvt h1, s1 +; CHECK-BF16-SD-NEXT: mov v0.h[2], v3.h[0] +; CHECK-BF16-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fma: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: movi v0.4s, #1 +; CHECK-CVT-GI-NEXT: movi v1.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: bit v0.16b, v2.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fma: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: 
shll v2.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %r } @@ -3202,179 +3373,233 @@ define <4 x bfloat> @test_fneg(<4 x bfloat> %a) #0 { } define <4 x bfloat> @test_minnum(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-CVT-LABEL: test_minnum: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-CVT-NEXT: mov h2, v1.h[1] -; CHECK-CVT-NEXT: mov h3, v0.h[1] -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov h6, v1.h[2] -; CHECK-CVT-NEXT: mov h7, v0.h[2] -; CHECK-CVT-NEXT: mov h1, v1.h[3] -; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fminnm s2, s3, s2 -; CHECK-CVT-NEXT: fminnm s3, s5, s4 -; CHECK-CVT-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v7.4h, #16 -; CHECK-CVT-NEXT: fminnm s1, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s2 -; CHECK-CVT-NEXT: fmov w10, s3 -; CHECK-CVT-NEXT: fminnm s2, s5, s4 -; CHECK-CVT-NEXT: ubfx w11, w9, #16, #1 -; CHECK-CVT-NEXT: ubfx w12, w10, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: add w10, w10, w8 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: fmov w11, s2 -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: fmov s0, w10 -; CHECK-CVT-NEXT: add w9, w11, w8 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: add w9, w12, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; 
CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: mov v0.h[2], v1.h[0] -; CHECK-CVT-NEXT: fmov s1, w8 -; CHECK-CVT-NEXT: mov v0.h[3], v1.h[0] -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_minnum: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-CVT-SD-NEXT: mov h2, v1.h[1] +; CHECK-CVT-SD-NEXT: mov h3, v0.h[1] +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov h6, v1.h[2] +; CHECK-CVT-SD-NEXT: mov h7, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h1, v1.h[3] +; CHECK-CVT-SD-NEXT: mov h0, v0.h[3] +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fminnm s2, s3, s2 +; CHECK-CVT-SD-NEXT: fminnm s3, s5, s4 +; CHECK-CVT-SD-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: fminnm s1, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s2 +; CHECK-CVT-SD-NEXT: fmov w10, s3 +; CHECK-CVT-SD-NEXT: fminnm s2, s5, s4 +; CHECK-CVT-SD-NEXT: ubfx w11, w9, #16, #1 +; CHECK-CVT-SD-NEXT: ubfx w12, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: add w10, w10, w8 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: fmov w11, s2 +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: fmov s0, w10 +; CHECK-CVT-SD-NEXT: add w9, w11, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s1 +; CHECK-CVT-SD-NEXT: add w9, w12, w9 +; 
CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[1], v2.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-CVT-SD-NEXT: fmov s1, w8 +; CHECK-CVT-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_minnum: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-BF16-NEXT: mov h2, v1.h[1] -; CHECK-BF16-NEXT: mov h3, v0.h[1] -; CHECK-BF16-NEXT: mov h4, v1.h[2] -; CHECK-BF16-NEXT: shll v5.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v6.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h7, v0.h[2] -; CHECK-BF16-NEXT: mov h1, v1.h[3] -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: fminnm s2, s3, s2 -; CHECK-BF16-NEXT: fminnm s3, s6, s5 -; CHECK-BF16-NEXT: shll v5.4s, v7.4h, #16 -; CHECK-BF16-NEXT: mov h6, v0.h[3] -; CHECK-BF16-NEXT: fminnm s4, s5, s4 -; CHECK-BF16-NEXT: bfcvt h2, s2 -; CHECK-BF16-NEXT: bfcvt h0, s3 -; CHECK-BF16-NEXT: shll v3.4s, v6.4h, #16 -; CHECK-BF16-NEXT: mov v0.h[1], v2.h[0] -; CHECK-BF16-NEXT: bfcvt h2, s4 -; CHECK-BF16-NEXT: fminnm s1, s3, s1 -; CHECK-BF16-NEXT: mov v0.h[2], v2.h[0] -; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: mov v0.h[3], v1.h[0] -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_minnum: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-BF16-SD-NEXT: mov h2, v1.h[1] +; CHECK-BF16-SD-NEXT: mov h3, v0.h[1] +; CHECK-BF16-SD-NEXT: mov h4, 
v1.h[2] +; CHECK-BF16-SD-NEXT: shll v5.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v6.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h7, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h1, v1.h[3] +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fminnm s2, s3, s2 +; CHECK-BF16-SD-NEXT: fminnm s3, s6, s5 +; CHECK-BF16-SD-NEXT: shll v5.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: mov h6, v0.h[3] +; CHECK-BF16-SD-NEXT: fminnm s4, s5, s4 +; CHECK-BF16-SD-NEXT: bfcvt h2, s2 +; CHECK-BF16-SD-NEXT: bfcvt h0, s3 +; CHECK-BF16-SD-NEXT: shll v3.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: mov v0.h[1], v2.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h2, s4 +; CHECK-BF16-SD-NEXT: fminnm s1, s3, s1 +; CHECK-BF16-SD-NEXT: mov v0.h[2], v2.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h1, s1 +; CHECK-BF16-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_minnum: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_minnum: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fminnm v0.4s, 
v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %r } define <4 x bfloat> @test_maxnum(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-CVT-LABEL: test_maxnum: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-CVT-NEXT: mov h2, v1.h[1] -; CHECK-CVT-NEXT: mov h3, v0.h[1] -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov h6, v1.h[2] -; CHECK-CVT-NEXT: mov h7, v0.h[2] -; CHECK-CVT-NEXT: mov h1, v1.h[3] -; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmaxnm s2, s3, s2 -; CHECK-CVT-NEXT: fmaxnm s3, s5, s4 -; CHECK-CVT-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v7.4h, #16 -; CHECK-CVT-NEXT: fmaxnm s1, s0, s1 -; CHECK-CVT-NEXT: fmov w9, s2 -; CHECK-CVT-NEXT: fmov w10, s3 -; CHECK-CVT-NEXT: fmaxnm s2, s5, s4 -; CHECK-CVT-NEXT: ubfx w11, w9, #16, #1 -; CHECK-CVT-NEXT: ubfx w12, w10, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: add w10, w10, w8 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: fmov w11, s2 -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: fmov s0, w10 -; CHECK-CVT-NEXT: add w9, w11, w8 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: add w9, w12, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr 
w8, w8, #16 -; CHECK-CVT-NEXT: mov v0.h[2], v1.h[0] -; CHECK-CVT-NEXT: fmov s1, w8 -; CHECK-CVT-NEXT: mov v0.h[3], v1.h[0] -; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_maxnum: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-CVT-SD-NEXT: mov h2, v1.h[1] +; CHECK-CVT-SD-NEXT: mov h3, v0.h[1] +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov h6, v1.h[2] +; CHECK-CVT-SD-NEXT: mov h7, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h1, v1.h[3] +; CHECK-CVT-SD-NEXT: mov h0, v0.h[3] +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmaxnm s2, s3, s2 +; CHECK-CVT-SD-NEXT: fmaxnm s3, s5, s4 +; CHECK-CVT-SD-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: fmaxnm s1, s0, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s2 +; CHECK-CVT-SD-NEXT: fmov w10, s3 +; CHECK-CVT-SD-NEXT: fmaxnm s2, s5, s4 +; CHECK-CVT-SD-NEXT: ubfx w11, w9, #16, #1 +; CHECK-CVT-SD-NEXT: ubfx w12, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: add w10, w10, w8 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: fmov w11, s2 +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: fmov s0, w10 +; CHECK-CVT-SD-NEXT: add w9, w11, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s1 +; CHECK-CVT-SD-NEXT: add w9, w12, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[1], v2.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; 
CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-CVT-SD-NEXT: fmov s1, w8 +; CHECK-CVT-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-CVT-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_maxnum: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-BF16-NEXT: mov h2, v1.h[1] -; CHECK-BF16-NEXT: mov h3, v0.h[1] -; CHECK-BF16-NEXT: mov h4, v1.h[2] -; CHECK-BF16-NEXT: shll v5.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v6.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h7, v0.h[2] -; CHECK-BF16-NEXT: mov h1, v1.h[3] -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: fmaxnm s2, s3, s2 -; CHECK-BF16-NEXT: fmaxnm s3, s6, s5 -; CHECK-BF16-NEXT: shll v5.4s, v7.4h, #16 -; CHECK-BF16-NEXT: mov h6, v0.h[3] -; CHECK-BF16-NEXT: fmaxnm s4, s5, s4 -; CHECK-BF16-NEXT: bfcvt h2, s2 -; CHECK-BF16-NEXT: bfcvt h0, s3 -; CHECK-BF16-NEXT: shll v3.4s, v6.4h, #16 -; CHECK-BF16-NEXT: mov v0.h[1], v2.h[0] -; CHECK-BF16-NEXT: bfcvt h2, s4 -; CHECK-BF16-NEXT: fmaxnm s1, s3, s1 -; CHECK-BF16-NEXT: mov v0.h[2], v2.h[0] -; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: mov v0.h[3], v1.h[0] -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_maxnum: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-BF16-SD-NEXT: mov h2, v1.h[1] +; CHECK-BF16-SD-NEXT: mov h3, v0.h[1] +; CHECK-BF16-SD-NEXT: mov h4, v1.h[2] +; CHECK-BF16-SD-NEXT: shll v5.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v6.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h7, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h1, 
v1.h[3] +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fmaxnm s2, s3, s2 +; CHECK-BF16-SD-NEXT: fmaxnm s3, s6, s5 +; CHECK-BF16-SD-NEXT: shll v5.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: mov h6, v0.h[3] +; CHECK-BF16-SD-NEXT: fmaxnm s4, s5, s4 +; CHECK-BF16-SD-NEXT: bfcvt h2, s2 +; CHECK-BF16-SD-NEXT: bfcvt h0, s3 +; CHECK-BF16-SD-NEXT: shll v3.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: mov v0.h[1], v2.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h2, s4 +; CHECK-BF16-SD-NEXT: fmaxnm s1, s3, s1 +; CHECK-BF16-SD-NEXT: mov v0.h[2], v2.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h1, s1 +; CHECK-BF16-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-BF16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_maxnum: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_maxnum: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x 
bfloat> %r } @@ -3436,17 +3661,17 @@ define <4 x bfloat> @test_copysign_f32(<4 x bfloat> %a, <4 x float> %b) #0 { } define <4 x bfloat> @test_floor(<4 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_floor: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frintm v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_floor: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frintm v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_floor: ; CHECK-BF16: // %bb.0: @@ -3454,22 +3679,40 @@ define <4 x bfloat> @test_floor(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frintm v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_floor: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintm v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = 
call <4 x bfloat> @llvm.floor.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } define <4 x bfloat> @test_ceil(<4 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_ceil: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frintp v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_ceil: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frintp v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_ceil: ; CHECK-BF16: // %bb.0: @@ -3477,22 +3720,40 @@ define <4 x bfloat> @test_ceil(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frintp v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_ceil: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintp v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = call <4 x bfloat> @llvm.ceil.v4bf16(<4 x 
bfloat> %a) ret <4 x bfloat> %r } define <4 x bfloat> @test_trunc(<4 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_trunc: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frintz v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_trunc: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frintz v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_trunc: ; CHECK-BF16: // %bb.0: @@ -3500,22 +3761,40 @@ define <4 x bfloat> @test_trunc(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frintz v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_trunc: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintz v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = call <4 x bfloat> @llvm.trunc.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } 
define <4 x bfloat> @test_rint(<4 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_rint: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frintx v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_rint: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frintx v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_rint: ; CHECK-BF16: // %bb.0: @@ -3523,22 +3802,40 @@ define <4 x bfloat> @test_rint(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frintx v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_rint: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintx v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = call <4 x bfloat> @llvm.rint.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } define <4 x bfloat> @test_nearbyint(<4 x 
bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_nearbyint: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frinti v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_nearbyint: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frinti v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_nearbyint: ; CHECK-BF16: // %bb.0: @@ -3546,22 +3843,40 @@ define <4 x bfloat> @test_nearbyint(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frinti v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_nearbyint: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frinti v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = call <4 x bfloat> @llvm.nearbyint.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } define <4 x bfloat> @test_round(<4 x bfloat> %a) #0 
{ -; CHECK-CVT-LABEL: test_round: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frinta v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_round: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frinta v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_round: ; CHECK-BF16: // %bb.0: @@ -3569,22 +3884,40 @@ define <4 x bfloat> @test_round(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frinta v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_round: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frinta v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = call <4 x bfloat> @llvm.round.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } define <4 x bfloat> @test_roundeven(<4 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: 
test_roundeven: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: frintn v0.4s, v0.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_roundeven: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: frintn v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-LABEL: test_roundeven: ; CHECK-BF16: // %bb.0: @@ -3592,42 +3925,103 @@ define <4 x bfloat> @test_roundeven(<4 x bfloat> %a) #0 { ; CHECK-BF16-NEXT: frintn v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s ; CHECK-BF16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_roundeven: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: movi v2.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v5.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintn v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v5.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret %r = call <4 x bfloat> @llvm.roundeven.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %r } define <4 x bfloat> @test_fmuladd(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) #0 
{ -; CHECK-CVT-LABEL: test_fmuladd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8 -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v3.16b, v3.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v4.4s -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v4.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmuladd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: movi v4.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v3.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v3.16b, v3.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v4.4s +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v4.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fmuladd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; 
CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fmuladd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fmuladd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v5.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v7.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v5.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v7.4s, v4.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: ushr v2.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v4.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-CVT-GI-NEXT: add v2.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: mvn v3.16b, v4.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fmuladd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: 
fmul v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %r = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %r } diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index 7c0b4092a6f48..2b31e876487d9 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -5,12 +5,7 @@ ; RUN: llc < %s -mtriple aarch64 -mattr=-bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI ; RUN: llc < %s -mtriple aarch64 -mattr=+bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-BF16-GI -; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_fadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fsub -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmul -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_frem ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt @@ -45,7 +40,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-CVT-GI-NEXT: warning: Instruction selection 
used fallback path for test_uitofp_i32 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_sin ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_cos @@ -63,26 +57,10 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log10 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_log2 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fma ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fabs ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_minnum -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_floor -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_ceil -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_trunc -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_rint -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_round -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd ; -; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_fadd -; CHECK-BF16-GI-NEXT: warning: 
Instruction selection used fallback path for test_fsub -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmul -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_frem ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt @@ -117,7 +95,6 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i16 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i32 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_uitofp_i64 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_powi ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_sin ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_cos @@ -135,19 +112,8 @@ ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log10 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_log2 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fma ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fabs ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fneg -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback 
path for test_minnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_floor -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_ceil -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_trunc -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_rint -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_round -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd define <8 x bfloat> @test_build(<8 x bfloat> %a) { ; CHECK-CVT-SD-LABEL: test_build: @@ -183,97 +149,209 @@ define <8 x bfloat> @test_build(<8 x bfloat> %a) { } define <8 x bfloat> @test_fadd(<8 x bfloat> %a, <8 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fadd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v3.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v2.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: fadd v3.4s, v4.4s, v3.4s -; CHECK-CVT-NEXT: fadd v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: ushr v0.4s, v3.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fadd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 
v1.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v2.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fadd v3.4s, v4.4s, v3.4s +; CHECK-CVT-SD-NEXT: fadd v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: and v2.16b, v4.16b, v2.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fadd: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-BF16-NEXT: fadd v2.4s, v3.4s, v2.4s -; CHECK-BF16-NEXT: fadd v1.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fadd: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: fadd v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fadd v1.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fadd: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: fadd v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fadd v1.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; 
CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fadd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fadd v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v2.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fadd: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fadd v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fadd v1.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-GI-NEXT: ret %r = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %r } define <8 x bfloat> @test_fsub(<8 x bfloat> %a, <8 x bfloat> %b) { -; 
CHECK-CVT-LABEL: test_fsub: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v3.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v2.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: fsub v3.4s, v4.4s, v3.4s -; CHECK-CVT-NEXT: fsub v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: ushr v0.4s, v3.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fsub: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v2.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fsub v3.4s, v4.4s, v3.4s +; CHECK-CVT-SD-NEXT: fsub v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: and v2.16b, v4.16b, v2.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fsub: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-BF16-NEXT: fsub v2.4s, v3.4s, v2.4s -; CHECK-BF16-NEXT: fsub v1.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; 
CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fsub: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: fsub v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fsub v1.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fsub: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: fsub v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fsub v1.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fsub: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fsub v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, 
v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v2.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fsub: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fsub v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fsub v1.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-GI-NEXT: ret %r = fsub <8 x bfloat> %a, %b ret <8 x bfloat> %r } define <8 x bfloat> @test_fmul(<8 x bfloat> %a, <8 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fmul: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v3.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v2.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: fmul v3.4s, v4.4s, v3.4s -; CHECK-CVT-NEXT: fmul v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: ushr v0.4s, v3.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmul: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v2.4s, #1 +; 
CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fmul v3.4s, v4.4s, v3.4s +; CHECK-CVT-SD-NEXT: fmul v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: and v2.16b, v4.16b, v2.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-SD-LABEL: test_fmul: ; CHECK-BF16-SD: // %bb.0: @@ -299,13 +377,45 @@ define <8 x bfloat> @test_fmul(<8 x bfloat> %a, <8 x bfloat> %b) { ; CHECK-BF16SVE-SD-NEXT: mov v0.16b, v2.16b ; CHECK-BF16SVE-SD-NEXT: ret ; +; CHECK-CVT-GI-LABEL: test_fmul: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v2.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v7.16b +; 
CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; ; CHECK-BF16-GI-LABEL: test_fmul: ; CHECK-BF16-GI: // %bb.0: -; CHECK-BF16-GI-NEXT: shll v2.4s, v1.4h, #16 -; CHECK-BF16-GI-NEXT: shll v3.4s, v0.4h, #16 -; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 ; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-BF16-GI-NEXT: fmul v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-BF16-GI-NEXT: fmul v1.4s, v0.4s, v1.4s ; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s ; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v1.4s @@ -315,39 +425,39 @@ define <8 x bfloat> @test_fmul(<8 x bfloat> %a, <8 x bfloat> %b) { } define <8 x bfloat> @test_fmadd(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) #0 { -; CHECK-CVT-LABEL: test_fmadd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v3.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v6.4s, #127, msl #8 -; CHECK-CVT-NEXT: fmul v3.4s, v4.4s, v3.4s -; CHECK-CVT-NEXT: movi v4.4s, #1 -; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v5.16b, v4.16b -; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: and v3.16b, v5.16b, v4.16b -; CHECK-CVT-NEXT: addhn v1.4h, v1.4s, v6.4s -; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-CVT-NEXT: shll v3.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: fadd v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd v2.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: ushr v0.4s, v1.4s, #16 -; 
CHECK-CVT-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: and v1.16b, v3.16b, v4.16b -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v6.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmadd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v6.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fmul v3.4s, v4.4s, v3.4s +; CHECK-CVT-SD-NEXT: movi v4.4s, #1 +; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: ushr v5.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v5.16b, v4.16b +; CHECK-CVT-SD-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: and v3.16b, v5.16b, v4.16b +; CHECK-CVT-SD-NEXT: addhn v1.4h, v1.4s, v6.4s +; CHECK-CVT-SD-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: shll v3.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v6.4s +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v2.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v1.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-CVT-SD-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: and v1.16b, v3.16b, v4.16b +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v6.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v6.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-SD-LABEL: test_fmadd: ; CHECK-BF16-SD: // %bb.0: @@ -388,19 +498,75 @@ define <8 x bfloat> @test_fmadd(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> % ; CHECK-BF16SVE-SD-NEXT: 
bfcvtn2 v0.8h, v1.4s ; CHECK-BF16SVE-SD-NEXT: ret ; +; CHECK-CVT-GI-LABEL: test_fmadd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v7.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-CVT-GI-NEXT: movi v4.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v3.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v16.4s, v3.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v17.4s, v3.4s, v4.4s +; CHECK-CVT-GI-NEXT: ushr v6.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v19.4s, v0.4s, v4.4s +; CHECK-CVT-GI-NEXT: orr v3.16b, v3.16b, v7.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v6.16b, v6.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v16.16b, v16.16b +; CHECK-CVT-GI-NEXT: add v5.4s, v17.4s, v5.4s +; CHECK-CVT-GI-NEXT: mvn v17.16b, v18.16b +; CHECK-CVT-GI-NEXT: add v6.4s, v19.4s, v6.4s +; CHECK-CVT-GI-NEXT: bif v3.16b, v5.16b, v16.16b +; CHECK-CVT-GI-NEXT: shll v5.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-CVT-GI-NEXT: bif v0.16b, v6.16b, v17.16b +; CHECK-CVT-GI-NEXT: shrn v3.4h, v3.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fadd v3.4s, v3.4s, v5.4s +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: ushr v2.4s, v3.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v6.4s, v3.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v3.4s, v4.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-CVT-GI-NEXT: orr v3.16b, v3.16b, v7.16b +; CHECK-CVT-GI-NEXT: orr 
v0.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v2.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v6.16b +; CHECK-CVT-GI-NEXT: mvn v6.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-CVT-GI-NEXT: bit v2.16b, v3.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; ; CHECK-BF16-GI-LABEL: test_fmadd: ; CHECK-BF16-GI: // %bb.0: -; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 -; CHECK-BF16-GI-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v4.4s, v1.4h, #16 ; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-BF16-GI-NEXT: fmul v3.4s, v4.4s, v3.4s +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fmul v3.4s, v3.4s, v4.4s ; CHECK-BF16-GI-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-BF16-GI-NEXT: bfcvtn v3.4h, v3.4s -; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-GI-NEXT: shll v1.4s, v3.4h, #16 +; CHECK-BF16-GI-NEXT: bfcvtn v1.4h, v3.4s ; CHECK-BF16-GI-NEXT: shll v3.4s, v2.4h, #16 ; CHECK-BF16-GI-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-GI-NEXT: fadd v1.4s, v1.4s, v3.4s ; CHECK-BF16-GI-NEXT: fadd v2.4s, v0.4s, v2.4s @@ -413,37 +579,93 @@ define <8 x bfloat> @test_fmadd(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> % } define <8 x bfloat> @test_fdiv(<8 x bfloat> %a, <8 x bfloat> %b) { -; CHECK-CVT-LABEL: test_fdiv: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: 
movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: fdiv v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fdiv v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: movi v0.4s, #1 -; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 -; CHECK-CVT-NEXT: and v3.16b, v3.16b, v0.16b -; CHECK-CVT-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16 -; CHECK-CVT-NEXT: and v3.16b, v4.16b, v0.16b -; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v5.4s -; CHECK-CVT-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fdiv: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fdiv v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: movi v0.4s, #1 +; CHECK-CVT-SD-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-CVT-SD-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-SD-NEXT: and v3.16b, v4.16b, v0.16b +; CHECK-CVT-SD-NEXT: addhn v0.4h, v2.4s, v5.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fdiv: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-BF16-NEXT: fdiv v2.4s, v3.4s, v2.4s -; CHECK-BF16-NEXT: fdiv v1.4s, v0.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fdiv: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: 
shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: fdiv v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fdiv: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: fdiv v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fdiv: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fdiv v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v2.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; 
CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fdiv: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fdiv v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-GI-NEXT: ret %r = fdiv <8 x bfloat> %a, %b ret <8 x bfloat> %r } @@ -1863,140 +2085,222 @@ define <8 x bfloat> @test_bitcast_f16tobfloat(float, <8 x half> %a) { } define <8 x bfloat> @test_sqrt(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_sqrt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: mov h1, v0.h[1] -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: mov h3, v0.h[2] -; CHECK-CVT-NEXT: mov h4, v0.h[3] -; CHECK-CVT-NEXT: mov h5, v0.h[4] -; CHECK-CVT-NEXT: mov h6, v0.h[5] -; CHECK-CVT-NEXT: mov h7, v0.h[6] -; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fsqrt s2, s2 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-CVT-NEXT: shll v6.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fmov w10, s2 -; CHECK-CVT-NEXT: ubfx w12, w10, #16, #1 -; CHECK-CVT-NEXT: add w10, w10, w8 -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: fsqrt s1, s1 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: ubfx w11, w9, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: fsqrt s3, s3 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: fmov w11, s3 -; CHECK-CVT-NEXT: ubfx 
w12, w11, #16, #1 -; CHECK-CVT-NEXT: add w9, w11, w8 -; CHECK-CVT-NEXT: add w9, w12, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: fsqrt s4, s4 -; CHECK-CVT-NEXT: fsqrt s5, s5 -; CHECK-CVT-NEXT: fsqrt s6, s6 -; CHECK-CVT-NEXT: fsqrt s7, s7 -; CHECK-CVT-NEXT: fsqrt s1, s0 -; CHECK-CVT-NEXT: fmov s0, w10 -; CHECK-CVT-NEXT: fmov w10, s4 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s5 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v0.h[2], v2.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s6 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v0.h[3], v2.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s7 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v0.h[4], v2.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v0.h[5], v2.h[0] -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: mov v0.h[6], v1.h[0] -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s1, w8 -; CHECK-CVT-NEXT: mov v0.h[7], v1.h[0] -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_sqrt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: mov h1, v0.h[1] +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: mov h3, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h4, v0.h[3] +; CHECK-CVT-SD-NEXT: mov h5, 
v0.h[4] +; CHECK-CVT-SD-NEXT: mov h6, v0.h[5] +; CHECK-CVT-SD-NEXT: mov h7, v0.h[6] +; CHECK-CVT-SD-NEXT: mov h0, v0.h[7] +; CHECK-CVT-SD-NEXT: fsqrt s2, s2 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-CVT-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w10, s2 +; CHECK-CVT-SD-NEXT: ubfx w12, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w10, w10, w8 +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: fsqrt s1, s1 +; CHECK-CVT-SD-NEXT: fmov w9, s1 +; CHECK-CVT-SD-NEXT: ubfx w11, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: fsqrt s3, s3 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: fmov w11, s3 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w11, w8 +; CHECK-CVT-SD-NEXT: add w9, w12, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: fsqrt s4, s4 +; CHECK-CVT-SD-NEXT: fsqrt s5, s5 +; CHECK-CVT-SD-NEXT: fsqrt s6, s6 +; CHECK-CVT-SD-NEXT: fsqrt s7, s7 +; CHECK-CVT-SD-NEXT: fsqrt s1, s0 +; CHECK-CVT-SD-NEXT: fmov s0, w10 +; CHECK-CVT-SD-NEXT: fmov w10, s4 +; CHECK-CVT-SD-NEXT: mov v0.h[1], v2.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s5 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[2], v2.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s6 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[3], v2.h[0] +; 
CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s7 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[4], v2.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v0.h[5], v2.h[0] +; CHECK-CVT-SD-NEXT: fmov w10, s1 +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: mov v0.h[6], v1.h[0] +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: fmov s1, w8 +; CHECK-CVT-SD-NEXT: mov v0.h[7], v1.h[0] +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_sqrt: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: mov h1, v0.h[1] -; CHECK-BF16-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h3, v0.h[2] -; CHECK-BF16-NEXT: mov h4, v0.h[3] -; CHECK-BF16-NEXT: mov h5, v0.h[4] -; CHECK-BF16-NEXT: mov h6, v0.h[5] -; CHECK-BF16-NEXT: mov h7, v0.h[6] -; CHECK-BF16-NEXT: mov h0, v0.h[7] -; CHECK-BF16-NEXT: fsqrt s2, s2 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-BF16-NEXT: shll v6.4s, v6.4h, #16 -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v0.4h, #16 -; CHECK-BF16-NEXT: bfcvt h0, s2 -; CHECK-BF16-NEXT: fsqrt s1, s1 -; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-BF16-NEXT: fsqrt s3, s3 -; CHECK-BF16-NEXT: bfcvt h1, s3 -; CHECK-BF16-NEXT: mov v0.h[2], v1.h[0] -; CHECK-BF16-NEXT: fsqrt s4, s4 -; CHECK-BF16-NEXT: bfcvt h1, s4 -; CHECK-BF16-NEXT: mov v0.h[3], v1.h[0] -; CHECK-BF16-NEXT: fsqrt s5, s5 -; CHECK-BF16-NEXT: bfcvt h1, s5 -; 
CHECK-BF16-NEXT: mov v0.h[4], v1.h[0] -; CHECK-BF16-NEXT: fsqrt s6, s6 -; CHECK-BF16-NEXT: bfcvt h1, s6 -; CHECK-BF16-NEXT: mov v0.h[5], v1.h[0] -; CHECK-BF16-NEXT: fsqrt s7, s7 -; CHECK-BF16-NEXT: bfcvt h1, s7 -; CHECK-BF16-NEXT: mov v0.h[6], v1.h[0] -; CHECK-BF16-NEXT: fsqrt s2, s16 -; CHECK-BF16-NEXT: bfcvt h1, s2 -; CHECK-BF16-NEXT: mov v0.h[7], v1.h[0] -; CHECK-BF16-NEXT: ret - %r = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %a) - ret <8 x bfloat> %r -} - -define <8 x bfloat> @test_powi(<8 x bfloat> %a, i32 %b) #0 { -; CHECK-CVT-LABEL: test_powi: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: sub sp, sp, #64 -; CHECK-CVT-NEXT: mov h1, v0.h[1] -; CHECK-CVT-NEXT: str q0, [sp] // 16-byte Spill +; CHECK-BF16-SD-LABEL: test_sqrt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: mov h1, v0.h[1] +; CHECK-BF16-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h3, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h4, v0.h[3] +; CHECK-BF16-SD-NEXT: mov h5, v0.h[4] +; CHECK-BF16-SD-NEXT: mov h6, v0.h[5] +; CHECK-BF16-SD-NEXT: mov h7, v0.h[6] +; CHECK-BF16-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16-SD-NEXT: fsqrt s2, s2 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvt h0, s2 +; CHECK-BF16-SD-NEXT: fsqrt s1, s1 +; CHECK-BF16-SD-NEXT: bfcvt h1, s1 +; CHECK-BF16-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-BF16-SD-NEXT: fsqrt s3, s3 +; CHECK-BF16-SD-NEXT: bfcvt h1, s3 +; CHECK-BF16-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-BF16-SD-NEXT: fsqrt s4, s4 +; CHECK-BF16-SD-NEXT: bfcvt h1, s4 +; CHECK-BF16-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-BF16-SD-NEXT: fsqrt s5, s5 +; CHECK-BF16-SD-NEXT: bfcvt h1, s5 +; CHECK-BF16-SD-NEXT: mov v0.h[4], v1.h[0] +; CHECK-BF16-SD-NEXT: fsqrt s6, s6 +; 
CHECK-BF16-SD-NEXT: bfcvt h1, s6 +; CHECK-BF16-SD-NEXT: mov v0.h[5], v1.h[0] +; CHECK-BF16-SD-NEXT: fsqrt s7, s7 +; CHECK-BF16-SD-NEXT: bfcvt h1, s7 +; CHECK-BF16-SD-NEXT: mov v0.h[6], v1.h[0] +; CHECK-BF16-SD-NEXT: fsqrt s2, s16 +; CHECK-BF16-SD-NEXT: bfcvt h1, s2 +; CHECK-BF16-SD-NEXT: mov v0.h[7], v1.h[0] +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_sqrt: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: mov h1, v0.h[1] +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h3, v0.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h4, v0.h[3] +; CHECK-BF16SVE-SD-NEXT: mov h5, v0.h[4] +; CHECK-BF16SVE-SD-NEXT: mov h6, v0.h[5] +; CHECK-BF16SVE-SD-NEXT: mov h7, v0.h[6] +; CHECK-BF16SVE-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16SVE-SD-NEXT: fsqrt s2, s2 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: bfcvt h0, s2 +; CHECK-BF16SVE-SD-NEXT: fsqrt s1, s1 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, s1 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[1], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: fsqrt s3, s3 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, s3 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[2], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: fsqrt s4, s4 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, s4 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: fsqrt s5, s5 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, s5 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[4], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: fsqrt s6, s6 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, s6 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[5], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: fsqrt s7, s7 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, s7 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[6], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: fsqrt s2, s16 +; CHECK-BF16SVE-SD-NEXT: bfcvt h1, 
s2 +; CHECK-BF16SVE-SD-NEXT: mov v0.h[7], v1.h[0] +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_sqrt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fsqrt v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fsqrt v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_sqrt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: fsqrt v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: fsqrt v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret + %r = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %a) + ret <8 x bfloat> %r +} + +define <8 x bfloat> @test_powi(<8 x bfloat> %a, i32 %b) #0 { +; CHECK-CVT-LABEL: test_powi: +; CHECK-CVT: // %bb.0: +; CHECK-CVT-NEXT: sub sp, sp, #64 +; CHECK-CVT-NEXT: mov h1, v0.h[1] +; 
CHECK-CVT-NEXT: str q0, [sp] // 16-byte Spill ; CHECK-CVT-NEXT: str x30, [sp, #32] // 8-byte Spill ; CHECK-CVT-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; CHECK-CVT-NEXT: mov w19, w0 @@ -5451,192 +5755,314 @@ define <8 x bfloat> @test_log2(<8 x bfloat> %a) #0 { } define <8 x bfloat> @test_fma(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) #0 { -; CHECK-CVT-LABEL: test_fma: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: mov h3, v2.h[1] -; CHECK-CVT-NEXT: mov h4, v1.h[1] -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: mov h5, v0.h[1] -; CHECK-CVT-NEXT: shll v6.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v7.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v16.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov h17, v2.h[2] -; CHECK-CVT-NEXT: mov h18, v1.h[2] -; CHECK-CVT-NEXT: mov h19, v0.h[2] -; CHECK-CVT-NEXT: mov h20, v2.h[3] -; CHECK-CVT-NEXT: mov h21, v1.h[3] -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-CVT-NEXT: mov h22, v0.h[3] -; CHECK-CVT-NEXT: fmadd s6, s16, s7, s6 -; CHECK-CVT-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-CVT-NEXT: shll v7.4s, v17.4h, #16 -; CHECK-CVT-NEXT: shll v16.4s, v18.4h, #16 -; CHECK-CVT-NEXT: shll v17.4s, v19.4h, #16 -; CHECK-CVT-NEXT: shll v18.4s, v20.4h, #16 -; CHECK-CVT-NEXT: shll v19.4s, v21.4h, #16 -; CHECK-CVT-NEXT: fmadd s3, s5, s4, s3 -; CHECK-CVT-NEXT: shll v20.4s, v22.4h, #16 -; CHECK-CVT-NEXT: mov h21, v2.h[4] -; CHECK-CVT-NEXT: fmov w9, s6 -; CHECK-CVT-NEXT: mov h22, v1.h[4] -; CHECK-CVT-NEXT: mov h6, v0.h[4] -; CHECK-CVT-NEXT: fmadd s4, s17, s16, s7 -; CHECK-CVT-NEXT: mov h17, v2.h[5] -; CHECK-CVT-NEXT: fmadd s5, s20, s19, s18 -; CHECK-CVT-NEXT: mov h18, v1.h[5] -; CHECK-CVT-NEXT: mov h19, v0.h[5] -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: fmov w11, s3 -; CHECK-CVT-NEXT: shll v7.4s, v21.4h, #16 -; CHECK-CVT-NEXT: shll v16.4s, v22.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v6.4h, #16 -; CHECK-CVT-NEXT: add w9, w10, w9 -; CHECK-CVT-NEXT: 
shll v6.4s, v17.4h, #16 -; CHECK-CVT-NEXT: mov h17, v1.h[6] -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: add w10, w11, w8 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: fmov w11, s4 -; CHECK-CVT-NEXT: fmadd s4, s3, s16, s7 -; CHECK-CVT-NEXT: shll v7.4s, v18.4h, #16 -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: fmov s3, w9 -; CHECK-CVT-NEXT: fmov w9, s5 -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: shll v16.4s, v19.4h, #16 -; CHECK-CVT-NEXT: mov h18, v0.h[6] -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: add w11, w11, w8 -; CHECK-CVT-NEXT: mov h1, v1.h[7] -; CHECK-CVT-NEXT: fmov s5, w10 -; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: add w11, w12, w11 -; CHECK-CVT-NEXT: fmov w12, s4 -; CHECK-CVT-NEXT: fmadd s6, s16, s7, s6 -; CHECK-CVT-NEXT: lsr w10, w11, #16 -; CHECK-CVT-NEXT: ubfx w11, w9, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: mov v3.h[1], v5.h[0] -; CHECK-CVT-NEXT: mov h5, v2.h[6] -; CHECK-CVT-NEXT: shll v7.4s, v18.4h, #16 -; CHECK-CVT-NEXT: fmov s4, w10 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: mov h2, v2.h[7] -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: ubfx w10, w12, #16, #1 -; CHECK-CVT-NEXT: fmov w11, s6 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov v3.h[2], v4.h[0] -; CHECK-CVT-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v17.4h, #16 -; CHECK-CVT-NEXT: fmov s16, w9 -; CHECK-CVT-NEXT: add w9, w12, w8 -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: add w9, w10, w9 -; CHECK-CVT-NEXT: ubfx w10, w11, #16, #1 -; CHECK-CVT-NEXT: fmadd s4, s7, s5, s4 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v3.h[3], v16.h[0] -; CHECK-CVT-NEXT: fmadd s0, s0, s1, s2 -; CHECK-CVT-NEXT: fmov s5, w9 -; CHECK-CVT-NEXT: add w9, w11, w8 -; CHECK-CVT-NEXT: add w9, w10, w9 -; CHECK-CVT-NEXT: fmov w10, s4 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v3.h[4], v5.h[0] -; 
CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s0 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: mov v3.h[5], v1.h[0] -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: mov v3.h[6], v0.h[0] -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: mov v3.h[7], v0.h[0] -; CHECK-CVT-NEXT: mov v0.16b, v3.16b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fma: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: mov h3, v2.h[1] +; CHECK-CVT-SD-NEXT: mov h4, v1.h[1] +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: mov h5, v0.h[1] +; CHECK-CVT-SD-NEXT: shll v6.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v7.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov h17, v2.h[2] +; CHECK-CVT-SD-NEXT: mov h18, v1.h[2] +; CHECK-CVT-SD-NEXT: mov h19, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h20, v2.h[3] +; CHECK-CVT-SD-NEXT: mov h21, v1.h[3] +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-CVT-SD-NEXT: mov h22, v0.h[3] +; CHECK-CVT-SD-NEXT: fmadd s6, s16, s7, s6 +; CHECK-CVT-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-CVT-SD-NEXT: shll v7.4s, v17.4h, #16 +; CHECK-CVT-SD-NEXT: shll v16.4s, v18.4h, #16 +; CHECK-CVT-SD-NEXT: shll v17.4s, v19.4h, #16 +; CHECK-CVT-SD-NEXT: shll v18.4s, v20.4h, #16 +; CHECK-CVT-SD-NEXT: shll v19.4s, v21.4h, #16 +; CHECK-CVT-SD-NEXT: fmadd s3, s5, s4, s3 +; CHECK-CVT-SD-NEXT: shll v20.4s, v22.4h, #16 +; CHECK-CVT-SD-NEXT: mov h21, v2.h[4] +; CHECK-CVT-SD-NEXT: fmov w9, s6 +; CHECK-CVT-SD-NEXT: mov h22, v1.h[4] +; CHECK-CVT-SD-NEXT: mov h6, v0.h[4] +; CHECK-CVT-SD-NEXT: fmadd s4, s17, s16, s7 +; CHECK-CVT-SD-NEXT: mov h17, v2.h[5] +; CHECK-CVT-SD-NEXT: fmadd s5, s20, s19, s18 +; CHECK-CVT-SD-NEXT: mov 
h18, v1.h[5] +; CHECK-CVT-SD-NEXT: mov h19, v0.h[5] +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: fmov w11, s3 +; CHECK-CVT-SD-NEXT: shll v7.4s, v21.4h, #16 +; CHECK-CVT-SD-NEXT: shll v16.4s, v22.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w10, w9 +; CHECK-CVT-SD-NEXT: shll v6.4s, v17.4h, #16 +; CHECK-CVT-SD-NEXT: mov h17, v1.h[6] +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w10, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: fmov w11, s4 +; CHECK-CVT-SD-NEXT: fmadd s4, s3, s16, s7 +; CHECK-CVT-SD-NEXT: shll v7.4s, v18.4h, #16 +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: fmov s3, w9 +; CHECK-CVT-SD-NEXT: fmov w9, s5 +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: shll v16.4s, v19.4h, #16 +; CHECK-CVT-SD-NEXT: mov h18, v0.h[6] +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w11, w11, w8 +; CHECK-CVT-SD-NEXT: mov h1, v1.h[7] +; CHECK-CVT-SD-NEXT: fmov s5, w10 +; CHECK-CVT-SD-NEXT: mov h0, v0.h[7] +; CHECK-CVT-SD-NEXT: add w11, w12, w11 +; CHECK-CVT-SD-NEXT: fmov w12, s4 +; CHECK-CVT-SD-NEXT: fmadd s6, s16, s7, s6 +; CHECK-CVT-SD-NEXT: lsr w10, w11, #16 +; CHECK-CVT-SD-NEXT: ubfx w11, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: mov v3.h[1], v5.h[0] +; CHECK-CVT-SD-NEXT: mov h5, v2.h[6] +; CHECK-CVT-SD-NEXT: shll v7.4s, v18.4h, #16 +; CHECK-CVT-SD-NEXT: fmov s4, w10 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: mov h2, v2.h[7] +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: ubfx w10, w12, #16, #1 +; CHECK-CVT-SD-NEXT: fmov w11, s6 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov v3.h[2], v4.h[0] +; CHECK-CVT-SD-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v17.4h, #16 +; CHECK-CVT-SD-NEXT: fmov s16, w9 +; CHECK-CVT-SD-NEXT: add w9, w12, w8 
+; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w10, w9 +; CHECK-CVT-SD-NEXT: ubfx w10, w11, #16, #1 +; CHECK-CVT-SD-NEXT: fmadd s4, s7, s5, s4 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v3.h[3], v16.h[0] +; CHECK-CVT-SD-NEXT: fmadd s0, s0, s1, s2 +; CHECK-CVT-SD-NEXT: fmov s5, w9 +; CHECK-CVT-SD-NEXT: add w9, w11, w8 +; CHECK-CVT-SD-NEXT: add w9, w10, w9 +; CHECK-CVT-SD-NEXT: fmov w10, s4 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v3.h[4], v5.h[0] +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s0 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: mov v3.h[5], v1.h[0] +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: fmov s0, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov v3.h[6], v0.h[0] +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: mov v3.h[7], v0.h[0] +; CHECK-CVT-SD-NEXT: mov v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_fma: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: mov h3, v2.h[1] -; CHECK-BF16-NEXT: mov h4, v1.h[1] -; CHECK-BF16-NEXT: mov h5, v0.h[1] -; CHECK-BF16-NEXT: shll v6.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v7.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h17, v2.h[2] -; CHECK-BF16-NEXT: mov h18, v1.h[2] -; CHECK-BF16-NEXT: mov h19, v0.h[2] -; CHECK-BF16-NEXT: mov h20, v2.h[3] -; CHECK-BF16-NEXT: mov h21, v1.h[3] -; CHECK-BF16-NEXT: fmadd s6, s16, s7, s6 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-BF16-NEXT: mov h7, v0.h[3] -; CHECK-BF16-NEXT: shll v16.4s, v19.4h, #16 -; CHECK-BF16-NEXT: mov h19, v0.h[4] -; CHECK-BF16-NEXT: fmadd s4, s5, s4, s3 -; 
CHECK-BF16-NEXT: shll v3.4s, v17.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v18.4h, #16 -; CHECK-BF16-NEXT: mov h17, v2.h[4] -; CHECK-BF16-NEXT: mov h18, v1.h[4] -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: shll v19.4s, v19.4h, #16 -; CHECK-BF16-NEXT: fmadd s5, s16, s5, s3 -; CHECK-BF16-NEXT: bfcvt h3, s6 -; CHECK-BF16-NEXT: shll v6.4s, v20.4h, #16 -; CHECK-BF16-NEXT: bfcvt h4, s4 -; CHECK-BF16-NEXT: shll v16.4s, v21.4h, #16 -; CHECK-BF16-NEXT: shll v17.4s, v17.4h, #16 -; CHECK-BF16-NEXT: shll v18.4s, v18.4h, #16 -; CHECK-BF16-NEXT: fmadd s6, s7, s16, s6 -; CHECK-BF16-NEXT: bfcvt h5, s5 -; CHECK-BF16-NEXT: mov h7, v1.h[5] -; CHECK-BF16-NEXT: mov v3.h[1], v4.h[0] -; CHECK-BF16-NEXT: mov h4, v2.h[5] -; CHECK-BF16-NEXT: mov h16, v0.h[5] -; CHECK-BF16-NEXT: fmadd s17, s19, s18, s17 -; CHECK-BF16-NEXT: mov h18, v2.h[6] -; CHECK-BF16-NEXT: mov h19, v1.h[6] -; CHECK-BF16-NEXT: mov h2, v2.h[7] -; CHECK-BF16-NEXT: mov h1, v1.h[7] -; CHECK-BF16-NEXT: bfcvt h6, s6 -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: mov v3.h[2], v5.h[0] -; CHECK-BF16-NEXT: mov h5, v0.h[6] -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v16.4h, #16 -; CHECK-BF16-NEXT: mov h0, v0.h[7] -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: fmadd s4, s16, s7, s4 -; CHECK-BF16-NEXT: mov v3.h[3], v6.h[0] -; CHECK-BF16-NEXT: bfcvt h6, s17 -; CHECK-BF16-NEXT: shll v7.4s, v18.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v19.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmadd s5, s5, s16, s7 -; CHECK-BF16-NEXT: mov v3.h[4], v6.h[0] -; CHECK-BF16-NEXT: bfcvt h4, s4 -; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 -; CHECK-BF16-NEXT: mov v3.h[5], v4.h[0] -; CHECK-BF16-NEXT: bfcvt h4, s5 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: mov v3.h[6], v4.h[0] -; CHECK-BF16-NEXT: mov v3.h[7], v0.h[0] -; CHECK-BF16-NEXT: mov v0.16b, v3.16b -; 
CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_fma: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: mov h3, v2.h[1] +; CHECK-BF16-SD-NEXT: mov h4, v1.h[1] +; CHECK-BF16-SD-NEXT: mov h5, v0.h[1] +; CHECK-BF16-SD-NEXT: shll v6.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v7.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h17, v2.h[2] +; CHECK-BF16-SD-NEXT: mov h18, v1.h[2] +; CHECK-BF16-SD-NEXT: mov h19, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h20, v2.h[3] +; CHECK-BF16-SD-NEXT: mov h21, v1.h[3] +; CHECK-BF16-SD-NEXT: fmadd s6, s16, s7, s6 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16-SD-NEXT: mov h7, v0.h[3] +; CHECK-BF16-SD-NEXT: shll v16.4s, v19.4h, #16 +; CHECK-BF16-SD-NEXT: mov h19, v0.h[4] +; CHECK-BF16-SD-NEXT: fmadd s4, s5, s4, s3 +; CHECK-BF16-SD-NEXT: shll v3.4s, v17.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v18.4h, #16 +; CHECK-BF16-SD-NEXT: mov h17, v2.h[4] +; CHECK-BF16-SD-NEXT: mov h18, v1.h[4] +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: shll v19.4s, v19.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s5, s16, s5, s3 +; CHECK-BF16-SD-NEXT: bfcvt h3, s6 +; CHECK-BF16-SD-NEXT: shll v6.4s, v20.4h, #16 +; CHECK-BF16-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16-SD-NEXT: shll v16.4s, v21.4h, #16 +; CHECK-BF16-SD-NEXT: shll v17.4s, v17.4h, #16 +; CHECK-BF16-SD-NEXT: shll v18.4s, v18.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s6, s7, s16, s6 +; CHECK-BF16-SD-NEXT: bfcvt h5, s5 +; CHECK-BF16-SD-NEXT: mov h7, v1.h[5] +; CHECK-BF16-SD-NEXT: mov v3.h[1], v4.h[0] +; CHECK-BF16-SD-NEXT: mov h4, v2.h[5] +; CHECK-BF16-SD-NEXT: mov h16, v0.h[5] +; CHECK-BF16-SD-NEXT: fmadd s17, s19, s18, s17 +; CHECK-BF16-SD-NEXT: mov h18, v2.h[6] +; CHECK-BF16-SD-NEXT: mov h19, v1.h[6] +; CHECK-BF16-SD-NEXT: mov h2, v2.h[7] +; CHECK-BF16-SD-NEXT: mov h1, v1.h[7] +; CHECK-BF16-SD-NEXT: bfcvt h6, s6 +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 
+; CHECK-BF16-SD-NEXT: mov v3.h[2], v5.h[0] +; CHECK-BF16-SD-NEXT: mov h5, v0.h[6] +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s4, s16, s7, s4 +; CHECK-BF16-SD-NEXT: mov v3.h[3], v6.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h6, s17 +; CHECK-BF16-SD-NEXT: shll v7.4s, v18.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v19.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmadd s5, s5, s16, s7 +; CHECK-BF16-SD-NEXT: mov v3.h[4], v6.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16-SD-NEXT: fmadd s0, s0, s1, s2 +; CHECK-BF16-SD-NEXT: mov v3.h[5], v4.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h4, s5 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: mov v3.h[6], v4.h[0] +; CHECK-BF16-SD-NEXT: mov v3.h[7], v0.h[0] +; CHECK-BF16-SD-NEXT: mov v0.16b, v3.16b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fma: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: mov h3, v2.h[1] +; CHECK-BF16SVE-SD-NEXT: mov h4, v1.h[1] +; CHECK-BF16SVE-SD-NEXT: mov h5, v0.h[1] +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v2.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h17, v2.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h18, v1.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h19, v0.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h20, v2.h[3] +; CHECK-BF16SVE-SD-NEXT: mov h21, v1.h[3] +; CHECK-BF16SVE-SD-NEXT: fmadd s6, s16, s7, s6 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h7, v0.h[3] +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v19.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h19, v0.h[4] +; CHECK-BF16SVE-SD-NEXT: fmadd s4, s5, s4, 
s3 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v17.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v18.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h17, v2.h[4] +; CHECK-BF16SVE-SD-NEXT: mov h18, v1.h[4] +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v19.4s, v19.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmadd s5, s16, s5, s3 +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s6 +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v20.4h, #16 +; CHECK-BF16SVE-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v21.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v17.4s, v17.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v18.4s, v18.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmadd s6, s7, s16, s6 +; CHECK-BF16SVE-SD-NEXT: bfcvt h5, s5 +; CHECK-BF16SVE-SD-NEXT: mov h7, v1.h[5] +; CHECK-BF16SVE-SD-NEXT: mov v3.h[1], v4.h[0] +; CHECK-BF16SVE-SD-NEXT: mov h4, v2.h[5] +; CHECK-BF16SVE-SD-NEXT: mov h16, v0.h[5] +; CHECK-BF16SVE-SD-NEXT: fmadd s17, s19, s18, s17 +; CHECK-BF16SVE-SD-NEXT: mov h18, v2.h[6] +; CHECK-BF16SVE-SD-NEXT: mov h19, v1.h[6] +; CHECK-BF16SVE-SD-NEXT: mov h2, v2.h[7] +; CHECK-BF16SVE-SD-NEXT: mov h1, v1.h[7] +; CHECK-BF16SVE-SD-NEXT: bfcvt h6, s6 +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov v3.h[2], v5.h[0] +; CHECK-BF16SVE-SD-NEXT: mov h5, v0.h[6] +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmadd s4, s16, s7, s4 +; CHECK-BF16SVE-SD-NEXT: mov v3.h[3], v6.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h6, s17 +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v18.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v19.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmadd s5, s5, s16, s7 +; CHECK-BF16SVE-SD-NEXT: mov v3.h[4], v6.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h4, s4 +; 
CHECK-BF16SVE-SD-NEXT: fmadd s0, s0, s1, s2 +; CHECK-BF16SVE-SD-NEXT: mov v3.h[5], v4.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h4, s5 +; CHECK-BF16SVE-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16SVE-SD-NEXT: mov v3.h[6], v4.h[0] +; CHECK-BF16SVE-SD-NEXT: mov v3.h[7], v0.h[0] +; CHECK-BF16SVE-SD-NEXT: mov v0.16b, v3.16b +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fma: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll v5.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmla v5.4s, v4.4s, v3.4s +; CHECK-CVT-GI-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: movi v0.4s, #1 +; CHECK-CVT-GI-NEXT: movi v1.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v5.4s, v5.4s +; CHECK-CVT-GI-NEXT: ushr v3.4s, v5.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v5.4s, v1.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-CVT-GI-NEXT: and v0.16b, v4.16b, v0.16b +; CHECK-CVT-GI-NEXT: orr v4.16b, v5.16b, v6.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v3.4s, v16.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mov v1.16b, v5.16b +; CHECK-CVT-GI-NEXT: bsl v1.16b, v4.16b, v3.16b +; CHECK-CVT-GI-NEXT: bif v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fma: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v4.4s, 
v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll v5.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-BF16-GI-NEXT: fmla v5.4s, v4.4s, v3.4s +; CHECK-BF16-GI-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v5.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) ret <8 x bfloat> %r } @@ -5661,323 +6087,527 @@ define <8 x bfloat> @test_fneg(<8 x bfloat> %a) #0 { } define <8 x bfloat> @test_minnum(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-CVT-LABEL: test_minnum: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: mov h2, v1.h[1] -; CHECK-CVT-NEXT: mov h3, v0.h[1] -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov h6, v1.h[2] -; CHECK-CVT-NEXT: mov h7, v0.h[2] -; CHECK-CVT-NEXT: mov h16, v1.h[3] -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: fminnm s4, s5, s4 -; CHECK-CVT-NEXT: shll v5.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v6.4s, v7.4h, #16 -; CHECK-CVT-NEXT: mov h7, v1.h[4] -; CHECK-CVT-NEXT: shll v16.4s, v16.4h, #16 -; CHECK-CVT-NEXT: fminnm s2, s3, s2 -; CHECK-CVT-NEXT: mov h3, v0.h[3] -; CHECK-CVT-NEXT: fmov w9, s4 -; CHECK-CVT-NEXT: mov h4, v0.h[4] -; CHECK-CVT-NEXT: fminnm s5, s6, s5 -; CHECK-CVT-NEXT: shll v6.4s, v7.4h, #16 -; CHECK-CVT-NEXT: fmov w11, s2 -; CHECK-CVT-NEXT: shll v2.4s, v3.4h, #16 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: mov h3, v1.h[5] -; CHECK-CVT-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-CVT-NEXT: add w9, w10, w9 -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: add w10, w11, w8 -; CHECK-CVT-NEXT: fminnm s7, s2, s16 -; CHECK-CVT-NEXT: fmov w11, s5 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: 
mov h5, v0.h[5] -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: fminnm s4, s4, s6 -; CHECK-CVT-NEXT: mov h6, v1.h[6] -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: ubfx w9, w11, #16, #1 -; CHECK-CVT-NEXT: add w11, w11, w8 -; CHECK-CVT-NEXT: mov h1, v1.h[7] -; CHECK-CVT-NEXT: fmov s16, w10 -; CHECK-CVT-NEXT: fmov w10, s7 -; CHECK-CVT-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-CVT-NEXT: add w9, w9, w11 -; CHECK-CVT-NEXT: mov h7, v0.h[6] -; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: mov v2.h[1], v16.h[0] -; CHECK-CVT-NEXT: add w10, w10, w8 -; CHECK-CVT-NEXT: fmov s16, w9 -; CHECK-CVT-NEXT: fminnm s3, s5, s3 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: add w9, w11, w10 -; CHECK-CVT-NEXT: fmov w10, s4 -; CHECK-CVT-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v7.4h, #16 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov v2.h[2], v16.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s6, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s3 -; CHECK-CVT-NEXT: fminnm s3, s5, s4 -; CHECK-CVT-NEXT: fminnm s0, s0, s1 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v2.h[3], v6.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s3 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v2.h[4], v4.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s0 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v2.h[5], v1.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; 
CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: mov v2.h[6], v0.h[0] -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0] -; CHECK-CVT-NEXT: mov v0.16b, v2.16b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_minnum: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: mov h2, v1.h[1] +; CHECK-CVT-SD-NEXT: mov h3, v0.h[1] +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov h6, v1.h[2] +; CHECK-CVT-SD-NEXT: mov h7, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h16, v1.h[3] +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: fminnm s4, s5, s4 +; CHECK-CVT-SD-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v6.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: mov h7, v1.h[4] +; CHECK-CVT-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-CVT-SD-NEXT: fminnm s2, s3, s2 +; CHECK-CVT-SD-NEXT: mov h3, v0.h[3] +; CHECK-CVT-SD-NEXT: fmov w9, s4 +; CHECK-CVT-SD-NEXT: mov h4, v0.h[4] +; CHECK-CVT-SD-NEXT: fminnm s5, s6, s5 +; CHECK-CVT-SD-NEXT: shll v6.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w11, s2 +; CHECK-CVT-SD-NEXT: shll v2.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; CHECK-CVT-SD-NEXT: mov h3, v1.h[5] +; CHECK-CVT-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w10, w9 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w10, w11, w8 +; CHECK-CVT-SD-NEXT: fminnm s7, s2, s16 +; CHECK-CVT-SD-NEXT: fmov w11, s5 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov h5, v0.h[5] +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: fminnm s4, s4, s6 +; CHECK-CVT-SD-NEXT: mov h6, v1.h[6] +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: ubfx w9, w11, #16, 
#1 +; CHECK-CVT-SD-NEXT: add w11, w11, w8 +; CHECK-CVT-SD-NEXT: mov h1, v1.h[7] +; CHECK-CVT-SD-NEXT: fmov s16, w10 +; CHECK-CVT-SD-NEXT: fmov w10, s7 +; CHECK-CVT-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w9, w11 +; CHECK-CVT-SD-NEXT: mov h7, v0.h[6] +; CHECK-CVT-SD-NEXT: mov h0, v0.h[7] +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: mov v2.h[1], v16.h[0] +; CHECK-CVT-SD-NEXT: add w10, w10, w8 +; CHECK-CVT-SD-NEXT: fmov s16, w9 +; CHECK-CVT-SD-NEXT: fminnm s3, s5, s3 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w11, w10 +; CHECK-CVT-SD-NEXT: fmov w10, s4 +; CHECK-CVT-SD-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[2], v16.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s6, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s3 +; CHECK-CVT-SD-NEXT: fminnm s3, s5, s4 +; CHECK-CVT-SD-NEXT: fminnm s0, s0, s1 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[3], v6.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s4, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s3 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[4], v4.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s0 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[5], v1.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: fmov s0, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov 
v2.h[6], v0.h[0] +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-CVT-SD-NEXT: mov v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_minnum: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: mov h2, v1.h[1] -; CHECK-BF16-NEXT: mov h3, v0.h[1] -; CHECK-BF16-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h6, v1.h[2] -; CHECK-BF16-NEXT: mov h7, v0.h[2] -; CHECK-BF16-NEXT: mov h16, v1.h[3] -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: fminnm s4, s5, s4 -; CHECK-BF16-NEXT: mov h5, v0.h[3] -; CHECK-BF16-NEXT: shll v6.4s, v6.4h, #16 -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: fminnm s3, s3, s2 -; CHECK-BF16-NEXT: bfcvt h2, s4 -; CHECK-BF16-NEXT: fminnm s4, s7, s6 -; CHECK-BF16-NEXT: shll v6.4s, v16.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-BF16-NEXT: mov h7, v1.h[4] -; CHECK-BF16-NEXT: mov h16, v0.h[4] -; CHECK-BF16-NEXT: bfcvt h3, s3 -; CHECK-BF16-NEXT: fminnm s5, s5, s6 -; CHECK-BF16-NEXT: bfcvt h4, s4 -; CHECK-BF16-NEXT: mov h6, v0.h[5] -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v16.4h, #16 -; CHECK-BF16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-BF16-NEXT: mov h3, v1.h[5] -; CHECK-BF16-NEXT: bfcvt h5, s5 -; CHECK-BF16-NEXT: fminnm s7, s16, s7 -; CHECK-BF16-NEXT: mov h16, v0.h[6] -; CHECK-BF16-NEXT: shll v6.4s, v6.4h, #16 -; CHECK-BF16-NEXT: mov h0, v0.h[7] -; CHECK-BF16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-BF16-NEXT: mov h4, v1.h[6] -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: mov h1, v1.h[7] -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fminnm s3, s6, s3 -; CHECK-BF16-NEXT: shll v6.4s, v16.4h, #16 -; CHECK-BF16-NEXT: mov v2.h[3], v5.h[0] -; CHECK-BF16-NEXT: bfcvt h5, s7 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: fminnm s4, s6, s4 -; CHECK-BF16-NEXT: 
bfcvt h3, s3 -; CHECK-BF16-NEXT: mov v2.h[4], v5.h[0] -; CHECK-BF16-NEXT: fminnm s0, s0, s1 -; CHECK-BF16-NEXT: mov v2.h[5], v3.h[0] -; CHECK-BF16-NEXT: bfcvt h3, s4 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: mov v2.h[6], v3.h[0] -; CHECK-BF16-NEXT: mov v2.h[7], v0.h[0] -; CHECK-BF16-NEXT: mov v0.16b, v2.16b -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_minnum: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: mov h2, v1.h[1] +; CHECK-BF16-SD-NEXT: mov h3, v0.h[1] +; CHECK-BF16-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h6, v1.h[2] +; CHECK-BF16-SD-NEXT: mov h7, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h16, v1.h[3] +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: fminnm s4, s5, s4 +; CHECK-BF16-SD-NEXT: mov h5, v0.h[3] +; CHECK-BF16-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: fminnm s3, s3, s2 +; CHECK-BF16-SD-NEXT: bfcvt h2, s4 +; CHECK-BF16-SD-NEXT: fminnm s4, s7, s6 +; CHECK-BF16-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16-SD-NEXT: mov h7, v1.h[4] +; CHECK-BF16-SD-NEXT: mov h16, v0.h[4] +; CHECK-BF16-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16-SD-NEXT: fminnm s5, s5, s6 +; CHECK-BF16-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16-SD-NEXT: mov h6, v0.h[5] +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: mov v2.h[1], v3.h[0] +; CHECK-BF16-SD-NEXT: mov h3, v1.h[5] +; CHECK-BF16-SD-NEXT: bfcvt h5, s5 +; CHECK-BF16-SD-NEXT: fminnm s7, s16, s7 +; CHECK-BF16-SD-NEXT: mov h16, v0.h[6] +; CHECK-BF16-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16-SD-NEXT: mov v2.h[2], v4.h[0] +; CHECK-BF16-SD-NEXT: mov h4, v1.h[6] +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: mov h1, v1.h[7] +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; 
CHECK-BF16-SD-NEXT: fminnm s3, s6, s3 +; CHECK-BF16-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: mov v2.h[3], v5.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h5, s7 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fminnm s4, s6, s4 +; CHECK-BF16-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16-SD-NEXT: mov v2.h[4], v5.h[0] +; CHECK-BF16-SD-NEXT: fminnm s0, s0, s1 +; CHECK-BF16-SD-NEXT: mov v2.h[5], v3.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h3, s4 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: mov v2.h[6], v3.h[0] +; CHECK-BF16-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-BF16-SD-NEXT: mov v0.16b, v2.16b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_minnum: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: mov h2, v1.h[1] +; CHECK-BF16SVE-SD-NEXT: mov h3, v0.h[1] +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h6, v1.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h7, v0.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h16, v1.h[3] +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fminnm s4, s5, s4 +; CHECK-BF16SVE-SD-NEXT: mov h5, v0.h[3] +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fminnm s3, s3, s2 +; CHECK-BF16SVE-SD-NEXT: bfcvt h2, s4 +; CHECK-BF16SVE-SD-NEXT: fminnm s4, s7, s6 +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h7, v1.h[4] +; CHECK-BF16SVE-SD-NEXT: mov h16, v0.h[4] +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16SVE-SD-NEXT: fminnm s5, s5, s6 +; CHECK-BF16SVE-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16SVE-SD-NEXT: mov h6, v0.h[5] +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[1], v3.h[0] +; 
CHECK-BF16SVE-SD-NEXT: mov h3, v1.h[5] +; CHECK-BF16SVE-SD-NEXT: bfcvt h5, s5 +; CHECK-BF16SVE-SD-NEXT: fminnm s7, s16, s7 +; CHECK-BF16SVE-SD-NEXT: mov h16, v0.h[6] +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16SVE-SD-NEXT: mov v2.h[2], v4.h[0] +; CHECK-BF16SVE-SD-NEXT: mov h4, v1.h[6] +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h1, v1.h[7] +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fminnm s3, s6, s3 +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[3], v5.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h5, s7 +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fminnm s4, s6, s4 +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[4], v5.h[0] +; CHECK-BF16SVE-SD-NEXT: fminnm s0, s0, s1 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[5], v3.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s4 +; CHECK-BF16SVE-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[6], v3.h[0] +; CHECK-BF16SVE-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-BF16SVE-SD-NEXT: mov v0.16b, v2.16b +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_minnum: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; 
CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v2.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_minnum: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fminnm v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fminnm v1.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %r } define <8 x bfloat> @test_maxnum(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-CVT-LABEL: test_maxnum: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: mov h2, v1.h[1] -; CHECK-CVT-NEXT: mov h3, v0.h[1] -; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov h6, v1.h[2] -; CHECK-CVT-NEXT: mov h7, v0.h[2] -; CHECK-CVT-NEXT: mov h16, v1.h[3] -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: fmaxnm s4, s5, s4 -; CHECK-CVT-NEXT: shll v5.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v6.4s, v7.4h, #16 -; CHECK-CVT-NEXT: mov h7, v1.h[4] -; CHECK-CVT-NEXT: shll v16.4s, v16.4h, #16 -; CHECK-CVT-NEXT: fmaxnm s2, s3, s2 -; CHECK-CVT-NEXT: mov h3, v0.h[3] -; 
CHECK-CVT-NEXT: fmov w9, s4 -; CHECK-CVT-NEXT: mov h4, v0.h[4] -; CHECK-CVT-NEXT: fmaxnm s5, s6, s5 -; CHECK-CVT-NEXT: shll v6.4s, v7.4h, #16 -; CHECK-CVT-NEXT: fmov w11, s2 -; CHECK-CVT-NEXT: shll v2.4s, v3.4h, #16 -; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 -; CHECK-CVT-NEXT: add w9, w9, w8 -; CHECK-CVT-NEXT: mov h3, v1.h[5] -; CHECK-CVT-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-CVT-NEXT: add w9, w10, w9 -; CHECK-CVT-NEXT: ubfx w12, w11, #16, #1 -; CHECK-CVT-NEXT: add w10, w11, w8 -; CHECK-CVT-NEXT: fmaxnm s7, s2, s16 -; CHECK-CVT-NEXT: fmov w11, s5 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov h5, v0.h[5] -; CHECK-CVT-NEXT: add w10, w12, w10 -; CHECK-CVT-NEXT: fmaxnm s4, s4, s6 -; CHECK-CVT-NEXT: mov h6, v1.h[6] -; CHECK-CVT-NEXT: lsr w10, w10, #16 -; CHECK-CVT-NEXT: fmov s2, w9 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: ubfx w9, w11, #16, #1 -; CHECK-CVT-NEXT: add w11, w11, w8 -; CHECK-CVT-NEXT: mov h1, v1.h[7] -; CHECK-CVT-NEXT: fmov s16, w10 -; CHECK-CVT-NEXT: fmov w10, s7 -; CHECK-CVT-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-CVT-NEXT: add w9, w9, w11 -; CHECK-CVT-NEXT: mov h7, v0.h[6] -; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: mov v2.h[1], v16.h[0] -; CHECK-CVT-NEXT: add w10, w10, w8 -; CHECK-CVT-NEXT: fmov s16, w9 -; CHECK-CVT-NEXT: fmaxnm s3, s5, s3 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: add w9, w11, w10 -; CHECK-CVT-NEXT: fmov w10, s4 -; CHECK-CVT-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-CVT-NEXT: shll v5.4s, v7.4h, #16 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: mov v2.h[2], v16.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s6, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s3 -; CHECK-CVT-NEXT: fmaxnm s3, s5, s4 -; CHECK-CVT-NEXT: fmaxnm s0, s0, s1 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; 
CHECK-CVT-NEXT: mov v2.h[3], v6.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s3 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v2.h[4], v4.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: add w9, w10, w8 -; CHECK-CVT-NEXT: fmov w10, s0 -; CHECK-CVT-NEXT: add w9, w11, w9 -; CHECK-CVT-NEXT: lsr w9, w9, #16 -; CHECK-CVT-NEXT: mov v2.h[5], v1.h[0] -; CHECK-CVT-NEXT: ubfx w11, w10, #16, #1 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: add w8, w11, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: mov v2.h[6], v0.h[0] -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0] -; CHECK-CVT-NEXT: mov v0.16b, v2.16b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_maxnum: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: mov h2, v1.h[1] +; CHECK-CVT-SD-NEXT: mov h3, v0.h[1] +; CHECK-CVT-SD-NEXT: mov w8, #32767 // =0x7fff +; CHECK-CVT-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov h6, v1.h[2] +; CHECK-CVT-SD-NEXT: mov h7, v0.h[2] +; CHECK-CVT-SD-NEXT: mov h16, v1.h[3] +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: fmaxnm s4, s5, s4 +; CHECK-CVT-SD-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v6.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: mov h7, v1.h[4] +; CHECK-CVT-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-CVT-SD-NEXT: fmaxnm s2, s3, s2 +; CHECK-CVT-SD-NEXT: mov h3, v0.h[3] +; CHECK-CVT-SD-NEXT: fmov w9, s4 +; CHECK-CVT-SD-NEXT: mov h4, v0.h[4] +; CHECK-CVT-SD-NEXT: fmaxnm s5, s6, s5 +; CHECK-CVT-SD-NEXT: shll v6.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: fmov w11, s2 +; CHECK-CVT-SD-NEXT: shll v2.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: ubfx w10, w9, #16, #1 +; CHECK-CVT-SD-NEXT: add w9, w9, w8 +; 
CHECK-CVT-SD-NEXT: mov h3, v1.h[5] +; CHECK-CVT-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w10, w9 +; CHECK-CVT-SD-NEXT: ubfx w12, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w10, w11, w8 +; CHECK-CVT-SD-NEXT: fmaxnm s7, s2, s16 +; CHECK-CVT-SD-NEXT: fmov w11, s5 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov h5, v0.h[5] +; CHECK-CVT-SD-NEXT: add w10, w12, w10 +; CHECK-CVT-SD-NEXT: fmaxnm s4, s4, s6 +; CHECK-CVT-SD-NEXT: mov h6, v1.h[6] +; CHECK-CVT-SD-NEXT: lsr w10, w10, #16 +; CHECK-CVT-SD-NEXT: fmov s2, w9 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: ubfx w9, w11, #16, #1 +; CHECK-CVT-SD-NEXT: add w11, w11, w8 +; CHECK-CVT-SD-NEXT: mov h1, v1.h[7] +; CHECK-CVT-SD-NEXT: fmov s16, w10 +; CHECK-CVT-SD-NEXT: fmov w10, s7 +; CHECK-CVT-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w9, w11 +; CHECK-CVT-SD-NEXT: mov h7, v0.h[6] +; CHECK-CVT-SD-NEXT: mov h0, v0.h[7] +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: mov v2.h[1], v16.h[0] +; CHECK-CVT-SD-NEXT: add w10, w10, w8 +; CHECK-CVT-SD-NEXT: fmov s16, w9 +; CHECK-CVT-SD-NEXT: fmaxnm s3, s5, s3 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: add w9, w11, w10 +; CHECK-CVT-SD-NEXT: fmov w10, s4 +; CHECK-CVT-SD-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-CVT-SD-NEXT: shll v5.4s, v7.4h, #16 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[2], v16.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s6, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s3 +; CHECK-CVT-SD-NEXT: fmaxnm s3, s5, s4 +; CHECK-CVT-SD-NEXT: fmaxnm s0, s0, s1 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[3], v6.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s4, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; 
CHECK-CVT-SD-NEXT: fmov w10, s3 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[4], v4.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: fmov s1, w9 +; CHECK-CVT-SD-NEXT: add w9, w10, w8 +; CHECK-CVT-SD-NEXT: fmov w10, s0 +; CHECK-CVT-SD-NEXT: add w9, w11, w9 +; CHECK-CVT-SD-NEXT: lsr w9, w9, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[5], v1.h[0] +; CHECK-CVT-SD-NEXT: ubfx w11, w10, #16, #1 +; CHECK-CVT-SD-NEXT: add w8, w10, w8 +; CHECK-CVT-SD-NEXT: fmov s0, w9 +; CHECK-CVT-SD-NEXT: add w8, w11, w8 +; CHECK-CVT-SD-NEXT: lsr w8, w8, #16 +; CHECK-CVT-SD-NEXT: mov v2.h[6], v0.h[0] +; CHECK-CVT-SD-NEXT: fmov s0, w8 +; CHECK-CVT-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-CVT-SD-NEXT: mov v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_maxnum: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: mov h2, v1.h[1] -; CHECK-BF16-NEXT: mov h3, v0.h[1] -; CHECK-BF16-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-BF16-NEXT: mov h6, v1.h[2] -; CHECK-BF16-NEXT: mov h7, v0.h[2] -; CHECK-BF16-NEXT: mov h16, v1.h[3] -; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: fmaxnm s4, s5, s4 -; CHECK-BF16-NEXT: mov h5, v0.h[3] -; CHECK-BF16-NEXT: shll v6.4s, v6.4h, #16 -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: fmaxnm s3, s3, s2 -; CHECK-BF16-NEXT: bfcvt h2, s4 -; CHECK-BF16-NEXT: fmaxnm s4, s7, s6 -; CHECK-BF16-NEXT: shll v6.4s, v16.4h, #16 -; CHECK-BF16-NEXT: shll v5.4s, v5.4h, #16 -; CHECK-BF16-NEXT: mov h7, v1.h[4] -; CHECK-BF16-NEXT: mov h16, v0.h[4] -; CHECK-BF16-NEXT: bfcvt h3, s3 -; CHECK-BF16-NEXT: fmaxnm s5, s5, s6 -; CHECK-BF16-NEXT: bfcvt h4, s4 -; CHECK-BF16-NEXT: mov h6, v0.h[5] -; CHECK-BF16-NEXT: shll v7.4s, v7.4h, #16 -; CHECK-BF16-NEXT: shll v16.4s, v16.4h, #16 -; CHECK-BF16-NEXT: mov v2.h[1], v3.h[0] -; CHECK-BF16-NEXT: mov h3, v1.h[5] -; CHECK-BF16-NEXT: bfcvt h5, s5 -; 
CHECK-BF16-NEXT: fmaxnm s7, s16, s7 -; CHECK-BF16-NEXT: mov h16, v0.h[6] -; CHECK-BF16-NEXT: shll v6.4s, v6.4h, #16 -; CHECK-BF16-NEXT: mov h0, v0.h[7] -; CHECK-BF16-NEXT: mov v2.h[2], v4.h[0] -; CHECK-BF16-NEXT: mov h4, v1.h[6] -; CHECK-BF16-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-BF16-NEXT: mov h1, v1.h[7] -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmaxnm s3, s6, s3 -; CHECK-BF16-NEXT: shll v6.4s, v16.4h, #16 -; CHECK-BF16-NEXT: mov v2.h[3], v5.h[0] -; CHECK-BF16-NEXT: bfcvt h5, s7 -; CHECK-BF16-NEXT: shll v4.4s, v4.4h, #16 -; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-BF16-NEXT: fmaxnm s4, s6, s4 -; CHECK-BF16-NEXT: bfcvt h3, s3 -; CHECK-BF16-NEXT: mov v2.h[4], v5.h[0] -; CHECK-BF16-NEXT: fmaxnm s0, s0, s1 -; CHECK-BF16-NEXT: mov v2.h[5], v3.h[0] -; CHECK-BF16-NEXT: bfcvt h3, s4 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: mov v2.h[6], v3.h[0] -; CHECK-BF16-NEXT: mov v2.h[7], v0.h[0] -; CHECK-BF16-NEXT: mov v0.16b, v2.16b -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_maxnum: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: mov h2, v1.h[1] +; CHECK-BF16-SD-NEXT: mov h3, v0.h[1] +; CHECK-BF16-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: mov h6, v1.h[2] +; CHECK-BF16-SD-NEXT: mov h7, v0.h[2] +; CHECK-BF16-SD-NEXT: mov h16, v1.h[3] +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: fmaxnm s4, s5, s4 +; CHECK-BF16-SD-NEXT: mov h5, v0.h[3] +; CHECK-BF16-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: fmaxnm s3, s3, s2 +; CHECK-BF16-SD-NEXT: bfcvt h2, s4 +; CHECK-BF16-SD-NEXT: fmaxnm s4, s7, s6 +; CHECK-BF16-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16-SD-NEXT: mov h7, v1.h[4] +; CHECK-BF16-SD-NEXT: mov h16, v0.h[4] +; CHECK-BF16-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16-SD-NEXT: fmaxnm s5, s5, s6 +; 
CHECK-BF16-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16-SD-NEXT: mov h6, v0.h[5] +; CHECK-BF16-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: mov v2.h[1], v3.h[0] +; CHECK-BF16-SD-NEXT: mov h3, v1.h[5] +; CHECK-BF16-SD-NEXT: bfcvt h5, s5 +; CHECK-BF16-SD-NEXT: fmaxnm s7, s16, s7 +; CHECK-BF16-SD-NEXT: mov h16, v0.h[6] +; CHECK-BF16-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16-SD-NEXT: mov v2.h[2], v4.h[0] +; CHECK-BF16-SD-NEXT: mov h4, v1.h[6] +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: mov h1, v1.h[7] +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fmaxnm s3, s6, s3 +; CHECK-BF16-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16-SD-NEXT: mov v2.h[3], v5.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h5, s7 +; CHECK-BF16-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fmaxnm s4, s6, s4 +; CHECK-BF16-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16-SD-NEXT: mov v2.h[4], v5.h[0] +; CHECK-BF16-SD-NEXT: fmaxnm s0, s0, s1 +; CHECK-BF16-SD-NEXT: mov v2.h[5], v3.h[0] +; CHECK-BF16-SD-NEXT: bfcvt h3, s4 +; CHECK-BF16-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16-SD-NEXT: mov v2.h[6], v3.h[0] +; CHECK-BF16-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-BF16-SD-NEXT: mov v0.16b, v2.16b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_maxnum: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: mov h2, v1.h[1] +; CHECK-BF16SVE-SD-NEXT: mov h3, v0.h[1] +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h6, v1.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h7, v0.h[2] +; CHECK-BF16SVE-SD-NEXT: mov h16, v1.h[3] +; CHECK-BF16SVE-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s4, s5, s4 +; CHECK-BF16SVE-SD-NEXT: mov h5, v0.h[3] +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v6.4h, #16 +; 
CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s3, s3, s2 +; CHECK-BF16SVE-SD-NEXT: bfcvt h2, s4 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s4, s7, s6 +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h7, v1.h[4] +; CHECK-BF16SVE-SD-NEXT: mov h16, v0.h[4] +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s5, s5, s6 +; CHECK-BF16SVE-SD-NEXT: bfcvt h4, s4 +; CHECK-BF16SVE-SD-NEXT: mov h6, v0.h[5] +; CHECK-BF16SVE-SD-NEXT: shll v7.4s, v7.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v16.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[1], v3.h[0] +; CHECK-BF16SVE-SD-NEXT: mov h3, v1.h[5] +; CHECK-BF16SVE-SD-NEXT: bfcvt h5, s5 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s7, s16, s7 +; CHECK-BF16SVE-SD-NEXT: mov h16, v0.h[6] +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h0, v0.h[7] +; CHECK-BF16SVE-SD-NEXT: mov v2.h[2], v4.h[0] +; CHECK-BF16SVE-SD-NEXT: mov h4, v1.h[6] +; CHECK-BF16SVE-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov h1, v1.h[7] +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s3, s6, s3 +; CHECK-BF16SVE-SD-NEXT: shll v6.4s, v16.4h, #16 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[3], v5.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h5, s7 +; CHECK-BF16SVE-SD-NEXT: shll v4.4s, v4.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fmaxnm s4, s6, s4 +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s3 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[4], v5.h[0] +; CHECK-BF16SVE-SD-NEXT: fmaxnm s0, s0, s1 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[5], v3.h[0] +; CHECK-BF16SVE-SD-NEXT: bfcvt h3, s4 +; CHECK-BF16SVE-SD-NEXT: bfcvt h0, s0 +; CHECK-BF16SVE-SD-NEXT: mov v2.h[6], v3.h[0] +; CHECK-BF16SVE-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-BF16SVE-SD-NEXT: mov v0.16b, v2.16b +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_maxnum: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll 
v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v4.4s, v2.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v2.16b, v2.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-CVT-GI-NEXT: bif v2.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_maxnum: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fmaxnm v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fmaxnm v1.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v2.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %r } @@ -6057,263 +6687,613 @@ define <8 x bfloat> @test_copysign_f32(<8 x bfloat> %a, <8 x float> %b) #0 { } define <8 x bfloat> @test_floor(<8 
x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_floor: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frintm v2.4s, v2.4s -; CHECK-CVT-NEXT: frintm v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_floor: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frintm v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: frintm v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_floor: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frintm v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frintm v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_floor: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frintm v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; 
CHECK-BF16-SD-NEXT: frintm v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_floor: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frintm v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frintm v1.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_floor: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintm v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frintm v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_floor: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frintm v1.4s, v1.4s +; 
CHECK-BF16-GI-NEXT: frintm v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.floor.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_ceil(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_ceil: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frintp v2.4s, v2.4s -; CHECK-CVT-NEXT: frintp v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_ceil: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frintp v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: frintp v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_ceil: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frintp v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frintp v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; 
CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_ceil: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frintp v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-SD-NEXT: frintp v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_ceil: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frintp v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frintp v1.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_ceil: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintp v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frintp v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov 
v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_ceil: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frintp v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: frintp v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.ceil.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_trunc(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_trunc: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frintz v2.4s, v2.4s -; CHECK-CVT-NEXT: frintz v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_trunc: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frintz v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: frintz v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_trunc: -; CHECK-BF16: // %bb.0: -; 
CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frintz v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frintz v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_trunc: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frintz v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-SD-NEXT: frintz v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_trunc: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frintz v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frintz v1.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_trunc: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintz v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frintz v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; 
CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_trunc: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frintz v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: frintz v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.trunc.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_rint(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_rint: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frintx v2.4s, v2.4s -; CHECK-CVT-NEXT: frintx v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_rint: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frintx v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: frintx v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add 
v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_rint: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frintx v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frintx v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_rint: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frintx v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-SD-NEXT: frintx v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_rint: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frintx v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frintx v1.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_rint: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintx v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frintx v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, 
v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_rint: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frintx v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: frintx v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.rint.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_nearbyint(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_nearbyint: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frinti v2.4s, v2.4s -; CHECK-CVT-NEXT: frinti v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_nearbyint: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frinti v2.4s, 
v2.4s +; CHECK-CVT-SD-NEXT: frinti v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_nearbyint: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frinti v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frinti v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_nearbyint: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frinti v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-SD-NEXT: frinti v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_nearbyint: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frinti v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frinti v1.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_nearbyint: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frinti v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frinti v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, 
#16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_nearbyint: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frinti v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: frinti v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.nearbyint.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_round(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_round: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frinta v2.4s, v2.4s -; CHECK-CVT-NEXT: frinta v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret 
+; CHECK-CVT-SD-LABEL: test_round: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frinta v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: frinta v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_round: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frinta v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frinta v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_round: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frinta v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-SD-NEXT: frinta v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_round: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frinta v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frinta v1.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_round: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; 
CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frinta v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frinta v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_round: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frinta v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: frinta v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.round.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_roundeven(<8 x bfloat> %a) #0 { -; CHECK-CVT-LABEL: test_roundeven: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8 -; CHECK-CVT-NEXT: frintn v2.4s, v2.4s -; CHECK-CVT-NEXT: frintn v3.4s, v0.4s -; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v4.4s, v3.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, 
v0.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_roundeven: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v1.4s, #1 +; CHECK-CVT-SD-NEXT: movi v5.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: frintn v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: frintn v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v4.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-SD-NEXT: and v1.16b, v4.16b, v1.16b +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v5.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-BF16-LABEL: test_roundeven: -; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: shll v1.4s, v0.4h, #16 -; CHECK-BF16-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-BF16-NEXT: frintn v1.4s, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s -; CHECK-BF16-NEXT: frintn v1.4s, v2.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s -; CHECK-BF16-NEXT: ret +; CHECK-BF16-SD-LABEL: test_roundeven: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: frintn v1.4s, v1.4s +; CHECK-BF16-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-SD-NEXT: frintn v1.4s, v2.4s +; CHECK-BF16-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_roundeven: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: frintn v1.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16SVE-SD-NEXT: frintn v1.4s, v2.4s 
+; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_roundeven: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: movi v2.4s, #1 +; CHECK-CVT-GI-NEXT: movi v3.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: movi v6.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: frintn v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: frintn v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: ushr v4.4s, v1.4s, #16 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v7.4s, v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v1.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v1.16b, v6.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: and v4.16b, v4.16b, v2.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v5.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v7.16b +; CHECK-CVT-GI-NEXT: mvn v7.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v4.4s, v16.4s, v4.4s +; CHECK-CVT-GI-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: bif v1.16b, v4.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v2.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_roundeven: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: frintn v1.4s, v1.4s +; CHECK-BF16-GI-NEXT: frintn v2.4s, v0.4s +; CHECK-BF16-GI-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-GI-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-GI-NEXT: ret %r = call <8 x bfloat> @llvm.roundeven.v8bf16(<8 x bfloat> %a) ret <8 x bfloat> %r } define <8 x bfloat> @test_fmuladd(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) #0 { -; CHECK-CVT-LABEL: test_fmuladd: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shll v3.4s, v1.4h, #16 
-; CHECK-CVT-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-CVT-NEXT: shll2 v1.4s, v1.8h, #16 -; CHECK-CVT-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-CVT-NEXT: movi v6.4s, #127, msl #8 -; CHECK-CVT-NEXT: fmul v3.4s, v4.4s, v3.4s -; CHECK-CVT-NEXT: movi v4.4s, #1 -; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: ushr v1.4s, v3.4s, #16 -; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-CVT-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: and v3.16b, v5.16b, v4.16b -; CHECK-CVT-NEXT: addhn v1.4h, v1.4s, v6.4s -; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-CVT-NEXT: shll v3.4s, v2.4h, #16 -; CHECK-CVT-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: fadd v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fadd v2.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: ushr v0.4s, v1.4s, #16 -; CHECK-CVT-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 -; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: and v1.16b, v3.16b, v4.16b -; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v6.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fmuladd: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: movi v6.4s, #127, msl #8 +; CHECK-CVT-SD-NEXT: fmul v3.4s, v4.4s, v3.4s +; CHECK-CVT-SD-NEXT: movi v4.4s, #1 +; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: ushr v1.4s, v3.4s, #16 +; CHECK-CVT-SD-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-SD-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: and v3.16b, v5.16b, v4.16b +; CHECK-CVT-SD-NEXT: addhn v1.4h, v1.4s, v6.4s +; 
CHECK-CVT-SD-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-CVT-SD-NEXT: shll v3.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v6.4s +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v1.4s, v1.4s, v3.4s +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fadd v2.4s, v0.4s, v2.4s +; CHECK-CVT-SD-NEXT: ushr v0.4s, v1.4s, #16 +; CHECK-CVT-SD-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-CVT-SD-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: and v1.16b, v3.16b, v4.16b +; CHECK-CVT-SD-NEXT: addhn v0.4h, v0.4s, v6.4s +; CHECK-CVT-SD-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: addhn2 v0.8h, v1.4s, v6.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-BF16-SD-LABEL: test_fmuladd: ; CHECK-BF16-SD: // %bb.0: @@ -6354,13 +7334,69 @@ define <8 x bfloat> @test_fmuladd(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> ; CHECK-BF16SVE-SD-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16SVE-SD-NEXT: ret ; +; CHECK-CVT-GI-LABEL: test_fmuladd: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v4.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: movi v7.4s, #64, lsl #16 +; CHECK-CVT-GI-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-CVT-GI-NEXT: movi v4.4s, #127, msl #8 +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: movi v1.4s, #1 +; CHECK-CVT-GI-NEXT: ushr v5.4s, v3.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v16.4s, v3.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v17.4s, v3.4s, v4.4s +; CHECK-CVT-GI-NEXT: ushr v6.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v19.4s, v0.4s, v4.4s +; CHECK-CVT-GI-NEXT: orr v3.16b, v3.16b, v7.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v6.16b, v6.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn 
v16.16b, v16.16b +; CHECK-CVT-GI-NEXT: add v5.4s, v17.4s, v5.4s +; CHECK-CVT-GI-NEXT: mvn v17.16b, v18.16b +; CHECK-CVT-GI-NEXT: add v6.4s, v19.4s, v6.4s +; CHECK-CVT-GI-NEXT: bif v3.16b, v5.16b, v16.16b +; CHECK-CVT-GI-NEXT: shll v5.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-CVT-GI-NEXT: bif v0.16b, v6.16b, v17.16b +; CHECK-CVT-GI-NEXT: shrn v3.4h, v3.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fadd v3.4s, v3.4s, v5.4s +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-CVT-GI-NEXT: ushr v2.4s, v3.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v6.4s, v3.4s, v3.4s +; CHECK-CVT-GI-NEXT: add v16.4s, v3.4s, v4.4s +; CHECK-CVT-GI-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-CVT-GI-NEXT: fcmeq v17.4s, v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-CVT-GI-NEXT: orr v3.16b, v3.16b, v7.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v7.16b +; CHECK-CVT-GI-NEXT: and v2.16b, v2.16b, v1.16b +; CHECK-CVT-GI-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v5.16b, v6.16b +; CHECK-CVT-GI-NEXT: mvn v6.16b, v17.16b +; CHECK-CVT-GI-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-CVT-GI-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-CVT-GI-NEXT: bit v2.16b, v3.16b, v5.16b +; CHECK-CVT-GI-NEXT: bit v1.16b, v0.16b, v6.16b +; CHECK-CVT-GI-NEXT: shrn v0.4h, v2.4s, #16 +; CHECK-CVT-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-CVT-GI-NEXT: ret +; ; CHECK-BF16-GI-LABEL: test_fmuladd: ; CHECK-BF16-GI: // %bb.0: -; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 -; CHECK-BF16-GI-NEXT: shll v4.4s, v0.4h, #16 -; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v4.4s, v1.4h, #16 ; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-BF16-GI-NEXT: fmul v3.4s, v4.4s, v3.4s +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fmul 
v3.4s, v3.4s, v4.4s ; CHECK-BF16-GI-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-BF16-GI-NEXT: bfcvtn v1.4h, v3.4s ; CHECK-BF16-GI-NEXT: shll v3.4s, v2.4h, #16 From d55e108dbeca19c83679b3a54584fc99fd352c18 Mon Sep 17 00:00:00 2001 From: LumioseSil Date: Sat, 9 May 2026 04:35:58 -0400 Subject: [PATCH 131/538] [AArch64][NFC] Remove unused TRI member from class (#184363) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I’ve removed the TRI member and its initialization, leaving only MRI and TII as the stored pointers. --------- Co-authored-by: Benjamin Maxwell --- llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index e31b7c022c69b..5bac282fef887 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" @@ -36,7 +35,6 @@ class AArch64DeadRegisterDefinitionsImpl { bool run(MachineFunction &MF); private: - const TargetRegisterInfo *TRI; const MachineRegisterInfo *MRI; const TargetInstrInfo *TII; bool Changed; @@ -190,7 +188,6 @@ void AArch64DeadRegisterDefinitionsImpl::processMachineBasicBlock( // Scan the function for instructions that have a dead definition of a // register. Replace that register with the zero register when possible. 
bool AArch64DeadRegisterDefinitionsImpl::run(MachineFunction &MF) { - TRI = MF.getSubtarget().getRegisterInfo(); TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); LLVM_DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); From e361f28b758967480d52fb21150b43be79ec82b9 Mon Sep 17 00:00:00 2001 From: Haohai Wen Date: Sat, 9 May 2026 16:44:09 +0800 Subject: [PATCH 132/538] [ObjectYAML][NFC] Extract BBAddrMap YAML types into shared namespace (#196019) Move BBAddrMapEntry and PGOAnalysisMapEntry out of namespace ELFYAML into a new format-agnostic namespace BBAddrMapYAML so that COFF YAML support can reuse the same schema and MappingTraits. --- llvm/include/llvm/ObjectYAML/BBAddrMapYAML.h | 132 +++++++++++++++++++ llvm/include/llvm/ObjectYAML/ELFYAML.h | 96 +------------- llvm/lib/ObjectYAML/BBAddrMapYAML.cpp | 73 ++++++++++ llvm/lib/ObjectYAML/CMakeLists.txt | 1 + llvm/lib/ObjectYAML/ELFEmitter.cpp | 8 +- llvm/lib/ObjectYAML/ELFYAML.cpp | 51 ------- llvm/tools/obj2yaml/elf2yaml.cpp | 11 +- 7 files changed, 219 insertions(+), 153 deletions(-) create mode 100644 llvm/include/llvm/ObjectYAML/BBAddrMapYAML.h create mode 100644 llvm/lib/ObjectYAML/BBAddrMapYAML.cpp diff --git a/llvm/include/llvm/ObjectYAML/BBAddrMapYAML.h b/llvm/include/llvm/ObjectYAML/BBAddrMapYAML.h new file mode 100644 index 0000000000000..ddf139b025015 --- /dev/null +++ b/llvm/include/llvm/ObjectYAML/BBAddrMapYAML.h @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares the YAML representation of BB address maps +/// (SHT_LLVM_BB_ADDR_MAP / .llvm_bb_addr_map). 
The types here are +/// format-agnostic so they can be reused by ELFYAML and COFFYAML. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_BBADDRMAPYAML_H +#define LLVM_OBJECTYAML_BBADDRMAPYAML_H + +#include "llvm/Support/YAMLTraits.h" +#include +#include +#include + +namespace llvm { +namespace BBAddrMapYAML { + +struct BBAddrMapEntry { + struct BBEntry { + uint32_t ID; + llvm::yaml::Hex64 AddressOffset; + llvm::yaml::Hex64 Size; + llvm::yaml::Hex64 Metadata; + std::optional> CallsiteEndOffsets; + std::optional Hash; + }; + uint8_t Version; + llvm::yaml::Hex16 Feature; + + struct BBRangeEntry { + llvm::yaml::Hex64 BaseAddress; + std::optional NumBlocks; + std::optional> BBEntries; + }; + + std::optional NumBBRanges; + std::optional> BBRanges; + + llvm::yaml::Hex64 getFunctionAddress() const { + if (!BBRanges || BBRanges->empty()) + return 0; + return BBRanges->front().BaseAddress; + } + + // Returns if any BB entries have non-empty callsite offsets. 
+ bool hasAnyCallsiteEndOffsets() const { + if (!BBRanges) + return false; + for (const BBRangeEntry &BBR : *BBRanges) { + if (!BBR.BBEntries) + continue; + for (const BBEntry &BBE : *BBR.BBEntries) + if (BBE.CallsiteEndOffsets && !BBE.CallsiteEndOffsets->empty()) + return true; + } + return false; + } +}; + +struct PGOAnalysisMapEntry { + struct PGOBBEntry { + struct SuccessorEntry { + uint32_t ID; + llvm::yaml::Hex32 BrProb; + std::optional PostLinkBrFreq; + }; + std::optional BBFreq; + std::optional PostLinkBBFreq; + std::optional> Successors; + }; + std::optional FuncEntryCount; + std::optional> PGOBBEntries; +}; + +} // end namespace BBAddrMapYAML +} // end namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::BBAddrMapYAML::BBAddrMapEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::BBAddrMapYAML::BBAddrMapEntry::BBEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::BBAddrMapYAML::BBAddrMapEntry::BBRangeEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::BBAddrMapYAML::PGOAnalysisMapEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR( + llvm::BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR( + llvm::BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry) + +namespace llvm { +namespace yaml { + +template <> struct MappingTraits { + static void mapping(IO &IO, BBAddrMapYAML::BBAddrMapEntry &E); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, BBAddrMapYAML::BBAddrMapEntry::BBRangeEntry &E); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, BBAddrMapYAML::BBAddrMapEntry::BBEntry &E); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, BBAddrMapYAML::PGOAnalysisMapEntry &E); +}; + +template <> +struct MappingTraits { + static void mapping(IO &IO, + BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry &E); +}; + +template <> +struct MappingTraits< + BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry> { + static void + mapping(IO &IO, + BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry 
&E); +}; + +} // end namespace yaml +} // end namespace llvm + +#endif // LLVM_OBJECTYAML_BBADDRMAPYAML_H diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index a8236ca37b5ed..f5267ffd1b52a 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Object/ELFTypes.h" +#include "llvm/ObjectYAML/BBAddrMapYAML.h" #include "llvm/ObjectYAML/DWARFYAML.h" #include "llvm/ObjectYAML/YAML.h" #include "llvm/Support/YAMLTraits.h" @@ -156,63 +157,6 @@ struct DynamicEntry { llvm::yaml::Hex64 Val; }; -struct BBAddrMapEntry { - struct BBEntry { - uint32_t ID; - llvm::yaml::Hex64 AddressOffset; - llvm::yaml::Hex64 Size; - llvm::yaml::Hex64 Metadata; - std::optional> CallsiteEndOffsets; - std::optional Hash; - }; - uint8_t Version; - llvm::yaml::Hex16 Feature; - - struct BBRangeEntry { - llvm::yaml::Hex64 BaseAddress; - std::optional NumBlocks; - std::optional> BBEntries; - }; - - std::optional NumBBRanges; - std::optional> BBRanges; - - llvm::yaml::Hex64 getFunctionAddress() const { - if (!BBRanges || BBRanges->empty()) - return 0; - return BBRanges->front().BaseAddress; - } - - // Returns if any BB entries have non-empty callsite offsets. 
- bool hasAnyCallsiteEndOffsets() const { - if (!BBRanges) - return false; - for (const ELFYAML::BBAddrMapEntry::BBRangeEntry &BBR : *BBRanges) { - if (!BBR.BBEntries) - continue; - for (const ELFYAML::BBAddrMapEntry::BBEntry &BBE : *BBR.BBEntries) - if (BBE.CallsiteEndOffsets && !BBE.CallsiteEndOffsets->empty()) - return true; - } - return false; - } -}; - -struct PGOAnalysisMapEntry { - struct PGOBBEntry { - struct SuccessorEntry { - uint32_t ID; - llvm::yaml::Hex32 BrProb; - std::optional PostLinkBrFreq; - }; - std::optional BBFreq; - std::optional PostLinkBBFreq; - std::optional> Successors; - }; - std::optional FuncEntryCount; - std::optional> PGOBBEntries; -}; - struct StackSizeEntry { llvm::yaml::Hex64 Address; llvm::yaml::Hex64 Size; @@ -359,8 +303,8 @@ struct SectionHeaderTable : Chunk { }; struct BBAddrMapSection : Section { - std::optional> Entries; - std::optional> PGOAnalyses; + std::optional> Entries; + std::optional> PGOAnalyses; BBAddrMapSection() : Section(ChunkKind::BBAddrMap) {} @@ -780,13 +724,6 @@ bool shouldAllocateFileSpace(ArrayRef Phdrs, } // end namespace llvm LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::BBAddrMapEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::BBAddrMapEntry::BBEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::BBAddrMapEntry::BBRangeEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::PGOAnalysisMapEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::PGOAnalysisMapEntry::PGOBBEntry) -LLVM_YAML_IS_SEQUENCE_VECTOR( - llvm::ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::LinkerOption) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::CallGraphEntryWeight) @@ -947,33 +884,6 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::StackSizeEntry &Rel); }; -template <> struct MappingTraits { - static void mapping(IO &IO, ELFYAML::BBAddrMapEntry 
&E); -}; - -template <> struct MappingTraits { - static void mapping(IO &IO, ELFYAML::BBAddrMapEntry::BBRangeEntry &E); -}; - -template <> struct MappingTraits { - static void mapping(IO &IO, ELFYAML::BBAddrMapEntry::BBEntry &E); -}; - -template <> struct MappingTraits { - static void mapping(IO &IO, ELFYAML::PGOAnalysisMapEntry &Rel); -}; - -template <> struct MappingTraits { - static void mapping(IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &Rel); -}; - -template <> -struct MappingTraits { - static void - mapping(IO &IO, - ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry &Rel); -}; - template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::GnuHashHeader &Rel); }; diff --git a/llvm/lib/ObjectYAML/BBAddrMapYAML.cpp b/llvm/lib/ObjectYAML/BBAddrMapYAML.cpp new file mode 100644 index 0000000000000..fbeda0d7f4ef5 --- /dev/null +++ b/llvm/lib/ObjectYAML/BBAddrMapYAML.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the YAMLIO mappings for the format-agnostic BB address +/// map YAML types declared in BBAddrMapYAML.h. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/BBAddrMapYAML.h" + +namespace llvm { +namespace yaml { + +void MappingTraits::mapping( + IO &IO, BBAddrMapYAML::BBAddrMapEntry &E) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapRequired("Version", E.Version); + IO.mapOptional("Feature", E.Feature, Hex16(0)); + IO.mapOptional("NumBBRanges", E.NumBBRanges); + IO.mapOptional("BBRanges", E.BBRanges); +} + +void MappingTraits::mapping( + IO &IO, BBAddrMapYAML::BBAddrMapEntry::BBRangeEntry &E) { + IO.mapOptional("BaseAddress", E.BaseAddress, Hex64(0)); + IO.mapOptional("NumBlocks", E.NumBlocks); + IO.mapOptional("BBEntries", E.BBEntries); +} + +void MappingTraits::mapping( + IO &IO, BBAddrMapYAML::BBAddrMapEntry::BBEntry &E) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("ID", E.ID); + IO.mapRequired("AddressOffset", E.AddressOffset); + IO.mapRequired("Size", E.Size); + IO.mapRequired("Metadata", E.Metadata); + IO.mapOptional("CallsiteEndOffsets", E.CallsiteEndOffsets); + IO.mapOptional("Hash", E.Hash); +} + +void MappingTraits::mapping( + IO &IO, BBAddrMapYAML::PGOAnalysisMapEntry &E) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("FuncEntryCount", E.FuncEntryCount); + IO.mapOptional("PGOBBEntries", E.PGOBBEntries); +} + +void MappingTraits::mapping( + IO &IO, BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry &E) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("BBFreq", E.BBFreq); + IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq); + IO.mapOptional("Successors", E.Successors); +} + +void MappingTraits< + BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry>:: + mapping(IO &IO, + BBAddrMapYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry &E) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapRequired("ID", E.ID); + 
IO.mapRequired("BrProb", E.BrProb); + IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq); +} + +} // end namespace yaml +} // end namespace llvm diff --git a/llvm/lib/ObjectYAML/CMakeLists.txt b/llvm/lib/ObjectYAML/CMakeLists.txt index b36974d47d9f8..44c60ed6378e5 100644 --- a/llvm/lib/ObjectYAML/CMakeLists.txt +++ b/llvm/lib/ObjectYAML/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_component_library(LLVMObjectYAML ArchiveEmitter.cpp ArchiveYAML.cpp + BBAddrMapYAML.cpp CodeViewYAMLDebugSections.cpp CodeViewYAMLSymbols.cpp CodeViewYAMLTypeHashing.cpp diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 8530785d07c93..b60b27ff6f082 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1453,7 +1453,7 @@ void ELFState::writeSectionContent( return; } - const std::vector *PGOAnalyses = nullptr; + const std::vector *PGOAnalyses = nullptr; if (Section.PGOAnalyses) { if (Section.Entries->size() != Section.PGOAnalyses->size()) WithColor::warning() << "PGOAnalyses must be the same length as Entries " @@ -1504,7 +1504,7 @@ void ELFState::writeSectionContent( uint64_t TotalNumBlocks = 0; bool EmitCallsiteEndOffsets = FeatureOrErr->CallsiteEndOffsets || E.hasAnyCallsiteEndOffsets(); - for (const ELFYAML::BBAddrMapEntry::BBRangeEntry &BBR : *E.BBRanges) { + for (const BBAddrMapYAML::BBAddrMapEntry::BBRangeEntry &BBR : *E.BBRanges) { // Write the base address of the range. CBA.write(BBR.BaseAddress, ELFT::Endianness); // Write number of BBEntries (number of basic blocks in this basic block @@ -1516,7 +1516,7 @@ void ELFState::writeSectionContent( // Write all BBEntries in this BBRange. 
if (!BBR.BBEntries || FeatureOrErr->OmitBBEntries) continue; - for (const ELFYAML::BBAddrMapEntry::BBEntry &BBE : *BBR.BBEntries) { + for (const BBAddrMapYAML::BBAddrMapEntry::BBEntry &BBE : *BBR.BBEntries) { ++TotalNumBlocks; if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP && E.Version > 1) SHeader.sh_size += CBA.writeULEB128(BBE.ID); @@ -1542,7 +1542,7 @@ void ELFState::writeSectionContent( } if (!PGOAnalyses) continue; - const ELFYAML::PGOAnalysisMapEntry &PGOEntry = PGOAnalyses->at(Idx); + const BBAddrMapYAML::PGOAnalysisMapEntry &PGOEntry = PGOAnalyses->at(Idx); if (PGOEntry.FuncEntryCount) SHeader.sh_size += CBA.writeULEB128(*PGOEntry.FuncEntryCount); diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f61ad8089c71b..58013f7a4be0c 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1815,57 +1815,6 @@ void MappingTraits::mapping( IO.mapRequired("Size", E.Size); } -void MappingTraits::mapping( - IO &IO, ELFYAML::BBAddrMapEntry &E) { - assert(IO.getContext() && "The IO context is not initialized"); - IO.mapRequired("Version", E.Version); - IO.mapOptional("Feature", E.Feature, Hex16(0)); - IO.mapOptional("NumBBRanges", E.NumBBRanges); - IO.mapOptional("BBRanges", E.BBRanges); -} - -void MappingTraits::mapping( - IO &IO, ELFYAML::BBAddrMapEntry::BBRangeEntry &E) { - IO.mapOptional("BaseAddress", E.BaseAddress, Hex64(0)); - IO.mapOptional("NumBlocks", E.NumBlocks); - IO.mapOptional("BBEntries", E.BBEntries); -} - -void MappingTraits::mapping( - IO &IO, ELFYAML::BBAddrMapEntry::BBEntry &E) { - assert(IO.getContext() && "The IO context is not initialized"); - IO.mapOptional("ID", E.ID); - IO.mapRequired("AddressOffset", E.AddressOffset); - IO.mapRequired("Size", E.Size); - IO.mapRequired("Metadata", E.Metadata); - IO.mapOptional("CallsiteEndOffsets", E.CallsiteEndOffsets); - IO.mapOptional("Hash", E.Hash); -} - -void MappingTraits::mapping( - IO &IO, ELFYAML::PGOAnalysisMapEntry &E) { - 
assert(IO.getContext() && "The IO context is not initialized"); - IO.mapOptional("FuncEntryCount", E.FuncEntryCount); - IO.mapOptional("PGOBBEntries", E.PGOBBEntries); -} - -void MappingTraits::mapping( - IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) { - assert(IO.getContext() && "The IO context is not initialized"); - IO.mapOptional("BBFreq", E.BBFreq); - IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq); - IO.mapOptional("Successors", E.Successors); -} - -void MappingTraits:: - mapping(IO &IO, - ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry &E) { - assert(IO.getContext() && "The IO context is not initialized"); - IO.mapRequired("ID", E.ID); - IO.mapRequired("BrProb", E.BrProb); - IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq); -} - void MappingTraits::mapping(IO &IO, ELFYAML::GnuHashHeader &E) { assert(IO.getContext() && "The IO context is not initialized"); diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp index a69fd1b2d0695..34bc496beee8b 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -892,9 +892,9 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { unsigned AddressSize = ELFT::Is64Bits ? 
8 : 4; DataExtractor Data(Content, Obj.isLE()); - std::vector Entries; + std::vector Entries; bool HasAnyPGOAnalysisMapEntry = false; - std::vector PGOAnalyses; + std::vector PGOAnalyses; DataExtractor::Cursor Cur(0); uint8_t Version = 0; uint16_t Feature = 0; @@ -921,7 +921,7 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { Address = Data.getUnsigned(Cur, AddressSize); NumBlocks = Data.getULEB128(Cur); } - std::vector BBRanges; + std::vector BBRanges; uint64_t BaseAddress = 0; for (uint64_t BBRangeN = 0; Cur && BBRangeN != NumBBRanges; ++BBRangeN) { if (FeatureOrErr->MultiBBRange) { @@ -931,7 +931,7 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { BaseAddress = Address; } - std::vector BBEntries; + std::vector BBEntries; // Read the specified number of BB entries, or until decoding fails. for (uint64_t BlockIndex = 0; Cur && BlockIndex < NumBlocks; ++BlockIndex) { @@ -960,7 +960,8 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { Entries.push_back( {Version, Feature, /*NumBBRanges=*/{}, std::move(BBRanges)}); - ELFYAML::PGOAnalysisMapEntry &PGOAnalysis = PGOAnalyses.emplace_back(); + BBAddrMapYAML::PGOAnalysisMapEntry &PGOAnalysis = + PGOAnalyses.emplace_back(); if (FeatureOrErr->hasPGOAnalysis()) { HasAnyPGOAnalysisMapEntry = true; From 7c9f1d2128ca715ce9ee59d1d8066c68e5a1bf8a Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sat, 9 May 2026 12:49:56 +0400 Subject: [PATCH 133/538] [clang] Update `cxx_dr_status.html` (#196702) Updates from 2026-05-08 CWG telecon. --- clang/www/cxx_dr_status.html | 61 ++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b881cfbc5002e..ace1f3b9aba33 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -1364,12 +1364,12 @@

C++ defect report implementation status

Comma operator and rvalue conversion Clang 2.7 - + 189 [lex.operators] - open + C++20 Definition of operator and punctuator - Not resolved + Unknown 190 @@ -17575,12 +17575,12 @@

C++ defect report implementation status

Unspecified interpretation of numeric-escape-sequence Unknown - + 2541 [module.unit] - open + NAD Linkage specifications, module purview, and module attachment - Not resolved + Unknown 2542 @@ -21928,7 +21928,7 @@

C++ defect report implementation status

3159 [temp.inst] - open + tentatively ready Instantiation of variables with incomplete array types Not resolved @@ -21942,7 +21942,7 @@

C++ defect report implementation status

3161 [expr.const] - open + drafting Self-initialization of constexpr-unknown references Not resolved @@ -21970,7 +21970,7 @@

C++ defect report implementation status

3165 [basic.link] - open + tentatively ready Use "equivalent type" to support templated entities Not resolved @@ -21984,14 +21984,14 @@

C++ defect report implementation status

3167 [dcl.init] - open + tentatively ready Initializing typedefs Not resolved 3168 [conv.rank] - open + tentatively ready Issues with integer conversion ranks Not resolved @@ -22068,8 +22068,8 @@

C++ defect report implementation status

3179 [dcl.fct] - open - More edge cases for void parameters + tentatively ready + More edge cases for a void function parameter Not resolved @@ -22106,6 +22106,41 @@

C++ defect report implementation status

tentatively ready Pointer arithmetic with similar types Not resolved + + + 3185 + [module.interface] + open + Exporting a static_assert-declaration + Not resolved + + + 3186 + [basic.lookup.argdep] + open + Argument-dependent lookup is for a name + Not resolved + + + 3187 + [lex.phases] + open + Treatment of universal-character-names outside of string-literals + Not resolved + + + 3188 + [diff.cpp20] + open + Behavior change for class template argument deduction + Not resolved + + + 3189 + [class.dtor] + open + Implicitly deleted destructors for union-like classes + Not resolved From d3a4bb081f6c3d6b410af0639f7ad2bb652c1d03 Mon Sep 17 00:00:00 2001 From: Daniil Dudkin Date: Sat, 9 May 2026 12:16:36 +0300 Subject: [PATCH 134/538] [clang-tidy] Avoid `use-nodiscard` false positives for class templates (#196661) Do not suggest adding `[[nodiscard]]` to functions returning a class template specialization whose primary template is already marked `[[nodiscard]]`. Class template specializations do not carry the `[[nodiscard]]` attribute on their own declarations, so `modernize-use-nodiscard` previously missed this case and emitted redundant diagnostics for return types such as: ```cpp template struct [[nodiscard]] Result; Result f() const; ``` Fixes #163425. 
--- .../modernize/UseNodiscardCheck.cpp | 5 +++++ clang-tools-extra/docs/ReleaseNotes.rst | 5 +++++ .../checkers/modernize/use-nodiscard.cpp | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp index 4940b5590f803..caa3d8c00681e 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp @@ -84,6 +84,9 @@ void UseNodiscardCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { void UseNodiscardCheck::registerMatchers(MatchFinder *Finder) { auto FunctionObj = cxxRecordDecl(hasAnyName("::std::function", "::boost::function")); + auto NoDiscardClassTemplateSpecialization = + classTemplateSpecializationDecl(hasSpecializedTemplate(classTemplateDecl( + has(cxxRecordDecl(hasAttr(attr::WarnUnusedResult)))))); // Find all non-void const methods which have not already been marked to // warn on unused result. @@ -93,6 +96,8 @@ void UseNodiscardCheck::registerMatchers(MatchFinder *Finder) { unless(anyOf( returns(voidType()), returns(hasDeclaration(decl(hasAttr(attr::WarnUnusedResult)))), + returns(hasUnqualifiedDesugaredType(recordType( + hasDeclaration(NoDiscardClassTemplateSpecialization)))), isNoReturn(), isOverloadedOperator(), isVariadic(), hasTemplateReturnType(), hasClassMutableFields(), isConversionOperator(), hasAttr(attr::WarnUnusedResult), diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c464be5f6311a..783c919f8ce8e 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -440,6 +440,11 @@ Changes in existing checks private deleted functions, if they do not have a public overload or are a special member function. 
+- Improved :doc:`modernize-use-nodiscard + ` check by avoiding false + positives on functions returning specializations of class templates marked + ``[[nodiscard]]``. + - Improved :doc:`modernize-use-std-format ` check: diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nodiscard.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nodiscard.cpp index 73ea4e46f76b6..e9097a7b19fdd 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nodiscard.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nodiscard.cpp @@ -24,6 +24,12 @@ typedef const unsigned &my_unsigned_const_reference; struct NO_DISCARD NoDiscardStruct{}; +template +struct NO_DISCARD NoDiscardTemplate {}; + +using NoDiscardTemplateAlias = NoDiscardTemplate; +typedef NoDiscardTemplate NoDiscardTemplateTypedef; + class Foo { public: using size_type = unsigned; @@ -164,6 +170,20 @@ class Foo { // Do not add ``[[nodiscard]]`` to functions returning types marked [[nodiscard]]. NoDiscardStruct f50() const; + + // Do not add ``[[nodiscard]]`` to functions returning class template + // specializations whose primary template is marked [[nodiscard]]. + NoDiscardTemplate f51() const; + + NoDiscardTemplateAlias f52() const; + + NoDiscardTemplateTypedef f53() const; + + const NoDiscardTemplate f54() const; + + const NoDiscardTemplate &f55() const; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'f55' should be marked NO_DISCARD [modernize-use-nodiscard] + // CHECK-FIXES: NO_DISCARD const NoDiscardTemplate &f55() const; }; // Do not add ``[[nodiscard]]`` to Lambda. From ebf6a41a1888bbb5d731fd93b054d7b7ee62350a Mon Sep 17 00:00:00 2001 From: Zeyi Xu Date: Sat, 9 May 2026 17:25:40 +0800 Subject: [PATCH 135/538] [CI] Ignore TidyFastChecks.inc for formatter CI. NFC. (#196682) `TidyFastChecks.inc` is generated and its contents should not be checked by clang-format CI workflow. 
Add a local `.clang-format-ignore` entry so the PR formatting check does not report diffs for this file. Related run: https://github.com/llvm/llvm-project/pull/194516#issuecomment-4332061836 --- clang-tools-extra/clangd/.clang-format-ignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 clang-tools-extra/clangd/.clang-format-ignore diff --git a/clang-tools-extra/clangd/.clang-format-ignore b/clang-tools-extra/clangd/.clang-format-ignore new file mode 100644 index 0000000000000..c77a4b67b098a --- /dev/null +++ b/clang-tools-extra/clangd/.clang-format-ignore @@ -0,0 +1,2 @@ +# Keep these entries unformatted. +TidyFastChecks.inc From 5022a168a7a90e4223572251b5751454cab1acf2 Mon Sep 17 00:00:00 2001 From: Mao Chuanjun <10255501521@stu.ecnu.edu.cn> Date: Sat, 9 May 2026 18:32:48 +0800 Subject: [PATCH 136/538] [clang-tidy] Migrate explicit-constructor check from google to misc and add relative aliases (#194807) Fixes #126032 --- .../CppCoreGuidelinesTidyModule.cpp | 3 ++ .../clang-tidy/google/CMakeLists.txt | 1 - .../clang-tidy/google/GoogleTidyModule.cpp | 4 +- .../clang-tidy/hicpp/HICPPTidyModule.cpp | 4 +- .../clang-tidy/misc/CMakeLists.txt | 3 +- .../ExplicitConstructorCheck.cpp | 4 +- .../ExplicitConstructorCheck.h | 14 +++-- .../clang-tidy/misc/MiscTidyModule.cpp | 3 ++ clang-tools-extra/docs/ReleaseNotes.rst | 12 +++++ .../explicit-constructor.rst | 7 +++ .../checks/google/explicit-constructor.rst | 51 +------------------ .../checks/hicpp/explicit-conversions.rst | 2 +- .../docs/clang-tidy/checks/list.rst | 6 ++- .../checks/misc/explicit-constructor.rst | 51 +++++++++++++++++++ .../explicit-constructor-cxx20.cpp | 6 +-- .../{google => misc}/explicit-constructor.cpp | 14 ++--- .../unittests/clang-tidy/GoogleModuleTest.cpp | 8 +-- 17 files changed, 112 insertions(+), 81 deletions(-) rename clang-tools-extra/clang-tidy/{google => misc}/ExplicitConstructorCheck.cpp (98%) rename clang-tools-extra/clang-tidy/{google => misc}/ExplicitConstructorCheck.h (68%) 
create mode 100644 clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/explicit-constructor.rst create mode 100644 clang-tools-extra/docs/clang-tidy/checks/misc/explicit-constructor.rst rename clang-tools-extra/test/clang-tidy/checkers/{google => misc}/explicit-constructor-cxx20.cpp (77%) rename clang-tools-extra/test/clang-tidy/checkers/{google => misc}/explicit-constructor.cpp (93%) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp index fab4f92be22b6..402579adfb5d3 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp @@ -9,6 +9,7 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" #include "../bugprone/NarrowingConversionsCheck.h" +#include "../misc/ExplicitConstructorCheck.h" #include "../misc/NonPrivateMemberVariablesInClassesCheck.h" #include "../misc/UnconventionalAssignOperatorCheck.h" #include "../modernize/AvoidCArraysCheck.h" @@ -75,6 +76,8 @@ class CppCoreGuidelinesModule : public ClangTidyModule { "cppcoreguidelines-avoid-non-const-global-variables"); CheckFactories.registerCheck( "cppcoreguidelines-avoid-reference-coroutine-parameters"); + CheckFactories.registerCheck( + "cppcoreguidelines-explicit-constructor"); CheckFactories.registerCheck( "cppcoreguidelines-explicit-virtual-functions"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/google/CMakeLists.txt b/clang-tools-extra/clang-tidy/google/CMakeLists.txt index 71b555d5e538b..0ac12ababc74a 100644 --- a/clang-tools-extra/clang-tidy/google/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/google/CMakeLists.txt @@ -8,7 +8,6 @@ add_clang_library(clangTidyGoogleModule STATIC AvoidThrowingObjCExceptionCheck.cpp AvoidUnderscoreInGoogletestNameCheck.cpp DefaultArgumentsCheck.cpp - ExplicitConstructorCheck.cpp 
ExplicitMakePairCheck.cpp FloatTypesCheck.cpp FunctionNamingCheck.cpp diff --git a/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp b/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp index fd015e951e837..e21b6dec8d1ab 100644 --- a/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp @@ -9,6 +9,7 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" #include "../misc/AnonymousNamespaceInHeaderCheck.h" +#include "../misc/ExplicitConstructorCheck.h" #include "../modernize/AvoidCStyleCastCheck.h" #include "../readability/BracesAroundStatementsCheck.h" #include "../readability/FunctionSizeCheck.h" @@ -17,7 +18,6 @@ #include "AvoidThrowingObjCExceptionCheck.h" #include "AvoidUnderscoreInGoogletestNameCheck.h" #include "DefaultArgumentsCheck.h" -#include "ExplicitConstructorCheck.h" #include "ExplicitMakePairCheck.h" #include "FloatTypesCheck.h" #include "FunctionNamingCheck.h" @@ -46,7 +46,7 @@ class GoogleModule : public ClangTidyModule { "google-build-using-namespace"); CheckFactories.registerCheck( "google-default-arguments"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "google-explicit-constructor"); CheckFactories.registerCheck( "google-global-names-in-headers"); diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp index 501e7fc0e2d9b..c87056f9141ca 100644 --- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp @@ -19,7 +19,7 @@ #include "../cppcoreguidelines/ProTypeMemberInitCheck.h" #include "../cppcoreguidelines/ProTypeVarargCheck.h" #include "../cppcoreguidelines/SpecialMemberFunctionsCheck.h" -#include "../google/ExplicitConstructorCheck.h" +#include "../misc/ExplicitConstructorCheck.h" #include "../misc/NewDeleteOverloadsCheck.h" #include "../misc/StaticAssertCheck.h" #include "../modernize/AvoidCArraysCheck.h" @@ 
-63,7 +63,7 @@ class HICPPModule : public ClangTidyModule { "hicpp-multiway-paths-covered"); CheckFactories.registerCheck( "hicpp-signed-bitwise"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "hicpp-explicit-conversions"); CheckFactories.registerCheck( "hicpp-function-size"); diff --git a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt index e34b0cf687be3..83a23b65f86db 100644 --- a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt @@ -19,10 +19,11 @@ set_target_properties(genconfusable PROPERTIES FOLDER "Clang Tools Extra/Sourceg add_clang_library(clangTidyMiscModule STATIC AnonymousNamespaceInHeaderCheck.cpp + ConfusableIdentifierCheck.cpp ConstCorrectnessCheck.cpp CoroutineHostileRAIICheck.cpp DefinitionsInHeadersCheck.cpp - ConfusableIdentifierCheck.cpp + ExplicitConstructorCheck.cpp HeaderIncludeCycleCheck.cpp IncludeCleanerCheck.cpp MiscTidyModule.cpp diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp b/clang-tools-extra/clang-tidy/misc/ExplicitConstructorCheck.cpp similarity index 98% rename from clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp rename to clang-tools-extra/clang-tidy/misc/ExplicitConstructorCheck.cpp index 2c64c97a2e95d..8c6f8ef978991 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ExplicitConstructorCheck.cpp @@ -14,7 +14,7 @@ using namespace clang::ast_matchers; -namespace clang::tidy::google { +namespace clang::tidy::misc { void ExplicitConstructorCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( @@ -136,4 +136,4 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { Diag << FixItHint::CreateInsertion(Loc, "explicit "); } -} // namespace clang::tidy::google +} // namespace clang::tidy::misc diff --git 
a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h b/clang-tools-extra/clang-tidy/misc/ExplicitConstructorCheck.h similarity index 68% rename from clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h rename to clang-tools-extra/clang-tidy/misc/ExplicitConstructorCheck.h index 0954a83223b7c..44e37bdb08ca8 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ExplicitConstructorCheck.h @@ -6,19 +6,17 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_EXPLICITCONSTRUCTORCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_EXPLICITCONSTRUCTORCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_EXPLICITCONSTRUCTORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_EXPLICITCONSTRUCTORCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::google { +namespace clang::tidy::misc { /// Checks that all single-argument constructors are explicit. 
/// -/// See https://google.github.io/styleguide/cppguide.html#Explicit_Constructors -/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/google/explicit-constructor.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/explicit-constructor.html class ExplicitConstructorCheck : public ClangTidyCheck { public: ExplicitConstructorCheck(StringRef Name, ClangTidyContext *Context) @@ -30,6 +28,6 @@ class ExplicitConstructorCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::google +} // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_EXPLICITCONSTRUCTORCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_EXPLICITCONSTRUCTORCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp index f8550b30b9789..5a716606495db 100644 --- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp @@ -13,6 +13,7 @@ #include "ConstCorrectnessCheck.h" #include "CoroutineHostileRAIICheck.h" #include "DefinitionsInHeadersCheck.h" +#include "ExplicitConstructorCheck.h" #include "HeaderIncludeCycleCheck.h" #include "IncludeCleanerCheck.h" #include "MisleadingBidirectionalCheck.h" @@ -53,6 +54,8 @@ class MiscModule : public ClangTidyModule { "misc-coroutine-hostile-raii"); CheckFactories.registerCheck( "misc-definitions-in-headers"); + CheckFactories.registerCheck( + "misc-explicit-constructor"); CheckFactories.registerCheck( "misc-header-include-cycle"); CheckFactories.registerCheck("misc-include-cleaner"); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 783c919f8ce8e..51251eacbcd5e 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -207,6 +207,18 @@ New check aliases to 
:doc:`bugprone-assignment-in-selection-statement `. +- Renamed :doc:`cppcoreguidelines-explicit-constructor + ` + to :doc:`misc-explicit-constructor + `. The + `cppcoreguidelines-explicit-constructor` name is kept as an alias. + +- Renamed :doc:`google-explicit-constructor + ` + to :doc:`misc-explicit-constructor + `. The + `google-explicit-constructor` name is kept as an alias. + - Renamed :doc:`hicpp-exception-baseclass ` to :doc:`bugprone-std-exception-baseclass diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/explicit-constructor.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/explicit-constructor.rst new file mode 100644 index 0000000000000..a83c86f8a468a --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/explicit-constructor.rst @@ -0,0 +1,7 @@ +.. title:: clang-tidy - cppcoreguidelines-explicit-constructor + +cppcoreguidelines-explicit-constructor +====================================== + +This check is an alias for +:doc:`misc-explicit-constructor <../misc/explicit-constructor>`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/google/explicit-constructor.rst b/clang-tools-extra/docs/clang-tidy/checks/google/explicit-constructor.rst index 1bef2686139b3..c86fdccd5e6dc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/google/explicit-constructor.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/google/explicit-constructor.rst @@ -3,54 +3,7 @@ google-explicit-constructor =========================== - -Checks that constructors callable with a single argument and conversion -operators are marked explicit to avoid the risk of unintentional implicit -conversions. - -Consider this example: - -.. code-block:: c++ - - struct S { - int x; - operator bool() const { return true; } - }; - - bool f() { - S a{1}; - S b{2}; - return a == b; - } - -The function will return ``true``, since the objects are implicitly converted -to ``bool`` before comparison, which is unlikely to be the intent. 
- -The check will suggest inserting ``explicit`` before the constructor or -conversion operator declaration. However, copy and move constructors should not -be explicit, as well as constructors taking a single ``initializer_list`` -argument. - -This code: - -.. code-block:: c++ - - struct S { - S(int a); - explicit S(const S&); - operator bool() const; - ... - -will become - -.. code-block:: c++ - - struct S { - explicit S(int a); - S(const S&); - explicit operator bool() const; - ... - - +This check is an alias for +:doc:`misc-explicit-constructor <../misc/explicit-constructor>`. See https://google.github.io/styleguide/cppguide.html#Explicit_Constructors diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst index 927f7aaf015ff..5267b0c6b12e3 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst @@ -4,7 +4,7 @@ hicpp-explicit-conversions ========================== This check is an alias for -:doc:`google-explicit-constructor <../google/explicit-constructor>`. +:doc:`misc-explicit-constructor <../misc/explicit-constructor>`. Used to enforce parts of `rule 5.4.1 `_. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 053ce6f0779d9..3e3cd92374ee9 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -230,7 +230,6 @@ Clang-Tidy Checks :doc:`google-build-explicit-make-pair `, :doc:`google-build-using-namespace `, :doc:`google-default-arguments `, - :doc:`google-explicit-constructor `, "Yes" :doc:`google-global-names-in-headers `, :doc:`google-objc-avoid-nsobject-new `, :doc:`google-objc-avoid-throwing-exception `, @@ -265,6 +264,7 @@ Clang-Tidy Checks :doc:`misc-const-correctness `, "Yes" :doc:`misc-coroutine-hostile-raii `, :doc:`misc-definitions-in-headers `, "Yes" + :doc:`misc-explicit-constructor `, "Yes" :doc:`misc-header-include-cycle `, :doc:`misc-include-cleaner `, "Yes" :doc:`misc-misleading-bidirectional `, @@ -586,6 +586,7 @@ Check aliases :doc:`cppcoreguidelines-avoid-c-arrays `, :doc:`modernize-avoid-c-arrays `, :doc:`cppcoreguidelines-avoid-magic-numbers `, :doc:`readability-magic-numbers `, :doc:`cppcoreguidelines-c-copy-assignment-signature `, :doc:`misc-unconventional-assign-operator `, + :doc:`cppcoreguidelines-explicit-constructor `, :doc:`misc-explicit-constructor `, "Yes" :doc:`cppcoreguidelines-explicit-virtual-functions `, :doc:`modernize-use-override `, "Yes" :doc:`cppcoreguidelines-macro-to-enum `, :doc:`modernize-macro-to-enum `, "Yes" :doc:`cppcoreguidelines-narrowing-conversions `, :doc:`bugprone-narrowing-conversions `, @@ -597,6 +598,7 @@ Check aliases :doc:`fuchsia-header-anon-namespaces `, :doc:`misc-anonymous-namespace-in-header `, :doc:`fuchsia-multiple-inheritance `, :doc:`misc-multiple-inheritance `, :doc:`google-build-namespaces `, :doc:`misc-anonymous-namespace-in-header `, + :doc:`google-explicit-constructor `, :doc:`misc-explicit-constructor `, "Yes" :doc:`google-readability-braces-around-statements `, :doc:`readability-braces-around-statements `, "Yes" 
:doc:`google-readability-casting `, :doc:`modernize-avoid-c-style-cast `, "Yes" :doc:`google-readability-function-size `, :doc:`readability-function-size `, @@ -606,7 +608,7 @@ Check aliases :doc:`hicpp-braces-around-statements `, :doc:`readability-braces-around-statements `, "Yes" :doc:`hicpp-deprecated-headers `, :doc:`modernize-deprecated-headers `, "Yes" :doc:`hicpp-exception-baseclass `, :doc:`bugprone-std-exception-baseclass `, - :doc:`hicpp-explicit-conversions `, :doc:`google-explicit-constructor `, "Yes" + :doc:`hicpp-explicit-conversions `, :doc:`misc-explicit-constructor `, "Yes" :doc:`hicpp-function-size `, :doc:`readability-function-size `, :doc:`hicpp-ignored-remove-result `, :doc:`bugprone-unused-return-value `, :doc:`hicpp-invalid-access-moved `, :doc:`bugprone-use-after-move `, diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/explicit-constructor.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/explicit-constructor.rst new file mode 100644 index 0000000000000..faaae6adb17a3 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/explicit-constructor.rst @@ -0,0 +1,51 @@ +.. title:: clang-tidy - misc-explicit-constructor + +misc-explicit-constructor +========================= + +Checks that constructors callable with a single argument and conversion +operators are marked explicit to avoid the risk of unintentional implicit +conversions. + +Consider this example: + +.. code-block:: c++ + + struct S { + int x; + operator bool() const { return true; } + }; + + bool f() { + S a{1}; + S b{2}; + return a == b; + } + +The function will return ``true``, since the objects are implicitly converted +to ``bool`` before comparison, which is unlikely to be the intent. + +The check will suggest inserting ``explicit`` before the constructor or +conversion operator declaration. However, copy and move constructors should not +be explicit, as well as constructors taking a single ``initializer_list`` +argument. + +This code: + +.. 
code-block:: c++ + + struct S { + S(int a); + explicit S(const S&); + operator bool() const; + ... + +will become + +.. code-block:: c++ + + struct S { + explicit S(int a); + S(const S&); + explicit operator bool() const; + ... diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/explicit-constructor-cxx20.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/explicit-constructor-cxx20.cpp similarity index 77% rename from clang-tools-extra/test/clang-tidy/checkers/google/explicit-constructor-cxx20.cpp rename to clang-tools-extra/test/clang-tidy/checkers/misc/explicit-constructor-cxx20.cpp index 95206f1ef420c..1ef1721ec6352 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/explicit-constructor-cxx20.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/explicit-constructor-cxx20.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s google-explicit-constructor %t -std=c++20-or-later +// RUN: %check_clang_tidy %s misc-explicit-constructor %t -std=c++20-or-later namespace issue_81121 { @@ -20,7 +20,7 @@ struct C { struct D { explicit(ConstFalse) D(int); - // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: single-argument constructors explicit expression evaluates to 'false' [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: single-argument constructors explicit expression evaluates to 'false' }; template @@ -41,7 +41,7 @@ struct G { template struct H { explicit(ConstFalse) H(int); - // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: single-argument constructors explicit expression evaluates to 'false' [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: single-argument constructors explicit expression evaluates to 'false' }; template diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/explicit-constructor.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/explicit-constructor.cpp similarity index 93% rename from clang-tools-extra/test/clang-tidy/checkers/google/explicit-constructor.cpp rename to 
clang-tools-extra/test/clang-tidy/checkers/misc/explicit-constructor.cpp index e8174b1aebb20..4cde6b2958fc1 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/explicit-constructor.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/explicit-constructor.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s google-explicit-constructor %t +// RUN: %check_clang_tidy %s misc-explicit-constructor %t namespace std { typedef decltype(sizeof(int)) size_t; @@ -43,15 +43,15 @@ struct A { operator double() const = delete; explicit A(const A& a) {} - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: copy constructor should not be declared explicit [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: copy constructor should not be declared explicit // CHECK-FIXES: A(const A& a) {} A(int x1); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: single-argument constructors must be marked explicit to avoid unintentional implicit conversions [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: single-argument constructors must be marked explicit to avoid unintentional implicit conversions // CHECK-FIXES: explicit A(int x1); A(double x2, double y = 3.14) {} - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: constructors that are callable with a single argument must be marked explicit to avoid unintentional implicit conversions [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: constructors that are callable with a single argument must be marked explicit to avoid unintentional implicit conversions // CHECK-FIXES: explicit A(double x2, double y = 3.14) {} template @@ -68,15 +68,15 @@ struct B { B(std::initializer_list &&list3) {} operator bool() const { return true; } - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: 'operator bool' must be marked explicit to avoid unintentional implicit conversions [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: 'operator bool' must be marked explicit to 
avoid unintentional implicit conversions // CHECK-FIXES: explicit operator bool() const { return true; } operator double() const; - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: 'operator double' must be marked explicit to avoid unintentional implicit conversions [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: 'operator double' must be marked explicit to avoid unintentional implicit conversions // CHECK-FIXES: explicit operator double() const; explicit B(::std::initializer_list list4) {} - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: initializer-list constructor should not be declared explicit [google-explicit-constructor] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: initializer-list constructor should not be declared explicit // CHECK-FIXES: B(::std::initializer_list list4) {} explicit B(const ::std::initializer_list &list5) {} diff --git a/clang-tools-extra/unittests/clang-tidy/GoogleModuleTest.cpp b/clang-tools-extra/unittests/clang-tidy/GoogleModuleTest.cpp index e9ab987e493c4..7c45d1f193f56 100644 --- a/clang-tools-extra/unittests/clang-tidy/GoogleModuleTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/GoogleModuleTest.cpp @@ -1,9 +1,12 @@ #include "ClangTidyTest.h" -#include "google/ExplicitConstructorCheck.h" #include "google/GlobalNamesInHeadersCheck.h" +#include "misc/ExplicitConstructorCheck.h" #include "gtest/gtest.h" using namespace clang::tidy::google; +using namespace clang::tidy::misc; +// TODO: Deprecate this test file in favor of typical lit-tests to avoid +// cross-module dependencies. 
namespace clang { namespace tidy { @@ -16,8 +19,7 @@ TEST(ExplicitConstructorCheckTest, SingleArgumentConstructorsOnly) { EXPECT_NO_CHANGES(ExplicitConstructorCheck, "class C { C(C&&); };"); EXPECT_NO_CHANGES(ExplicitConstructorCheck, "class C { C(const C&) = delete; };"); - EXPECT_NO_CHANGES(ExplicitConstructorCheck, - "class C { C(int) = delete; };"); + EXPECT_NO_CHANGES(ExplicitConstructorCheck, "class C { C(int) = delete; };"); } TEST(ExplicitConstructorCheckTest, Basic) { From e78381d2e38a35d0cbc33e642d5e874b8f88f212 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 9 May 2026 12:04:25 +0100 Subject: [PATCH 137/538] [AArch64][GlobalISel] Promote BF16 G_FCMP (#196093) This adds bf16 legalization for floating point compares. --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 7 +- .../test/CodeGen/AArch64/bf16-instructions.ll | 992 ++++++++++---- .../CodeGen/AArch64/bf16-v4-instructions.ll | 559 ++++++-- .../CodeGen/AArch64/bf16-v8-instructions.ll | 1160 ++++++++++++++--- 4 files changed, 2164 insertions(+), 554 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 88d2455aae425..7d87847d7fc67 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -777,7 +777,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor(HasFP16, {{i32, f16}, {v4i16, v4f16}, {v8i16, v8f16}}) .widenScalarOrEltToNextPow2(1) .clampScalar(0, s32, s32) - .minScalarOrElt(1, MinFPScalar) + .widenScalarIf( + [=](const LegalityQuery &Q) { + return (!HasFP16 && Q.Types[1].getScalarType().isFloat16()) || + Q.Types[1].getScalarType().isBFloat16(); + }, + changeElementTo(1, f32)) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) .minScalarEltSameAsIf( [=](const LegalityQuery &Query) { diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index 
05bf7a3edbd45..add88a0b6f49a 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -5,24 +5,6 @@ ; RUN: llc < %s -mtriple aarch64 -mattr=+bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-BF16-GI ; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_frem -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f32_f16 -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ult -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_one -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oge -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fccmp -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for 
test_br_cc ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i32 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i64 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptoui_i32 @@ -55,24 +37,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg ; ; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_frem -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f32_f16 -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ult -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_one -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oge -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord -; CHECK-BF16-GI-NEXT: warning: Instruction selection used 
fallback path for test_fccmp -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_br_cc ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i32 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i64 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptoui_i32 @@ -582,33 +546,108 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 { } define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { -; CHECK-LABEL: test_select_cc: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 -; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-NEXT: fcmp s2, s3 -; CHECK-NEXT: fcsel s0, s0, s1, ne -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_select_cc: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-CVT-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $s1 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s2, s3 +; CHECK-CVT-SD-NEXT: fcsel s0, s0, s1, ne +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_select_cc: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-BF16-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $s1 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; 
CHECK-BF16-SD-NEXT: fcmp s2, s3 +; CHECK-BF16-SD-NEXT: fcsel s0, s0, s1, ne +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_select_cc: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-GI-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $s1 +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s1 +; CHECK-CVT-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s2, s3 +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, ne +; CHECK-CVT-GI-NEXT: fmov s0, w8 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_select_cc: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-GI-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $s1 +; CHECK-BF16-GI-NEXT: fmov w8, s0 +; CHECK-BF16-GI-NEXT: fmov w9, s1 +; CHECK-BF16-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s2, s3 +; CHECK-BF16-GI-NEXT: csel w8, w8, w9, ne +; CHECK-BF16-GI-NEXT: fmov s0, w8 +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-BF16-GI-NEXT: ret %cc = fcmp une bfloat %c, %d %r = select i1 %cc, bfloat %a, bfloat %b ret bfloat %r } define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 { -; CHECK-LABEL: test_select_cc_f32_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 -; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-NEXT: fcmp s2, s3 -; CHECK-NEXT: fcsel s0, s0, s1, ne -; CHECK-NEXT: ret +; 
CHECK-CVT-SD-LABEL: test_select_cc_f32_f16: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-CVT-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s2, s3 +; CHECK-CVT-SD-NEXT: fcsel s0, s0, s1, ne +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_select_cc_f32_f16: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-BF16-SD-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-SD-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-SD-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s2, s3 +; CHECK-BF16-SD-NEXT: fcsel s0, s0, s1, ne +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_select_cc_f32_f16: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-GI-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-CVT-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s2, s3 +; CHECK-CVT-GI-NEXT: fcsel s0, s0, s1, ne +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_select_cc_f32_f16: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-GI-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-BF16-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s2, s3 +; CHECK-BF16-GI-NEXT: fcsel s0, s0, s1, ne +; CHECK-BF16-GI-NEXT: ret %cc = fcmp une bfloat %c, %d %r = select i1 %cc, float %a, float %b ret float %r @@ -662,217 +701,695 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) } define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_une: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; 
CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, ne -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_une: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, ne +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_une: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, ne +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_une: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, ne +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_une: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, ne +; CHECK-BF16-GI-NEXT: ret %r = fcmp une bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ueq: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w8, eq -; CHECK-NEXT: csinc w0, w8, 
wzr, vc -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ueq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w8, eq +; CHECK-CVT-SD-NEXT: csinc w0, w8, wzr, vc +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ueq: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w8, eq +; CHECK-BF16-SD-NEXT: csinc w0, w8, wzr, vc +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ueq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w8, eq +; CHECK-CVT-GI-NEXT: cset w9, vs +; CHECK-CVT-GI-NEXT: orr w0, w8, w9 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ueq: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w8, eq +; CHECK-BF16-GI-NEXT: cset w9, vs +; CHECK-BF16-GI-NEXT: orr w0, w8, w9 +; CHECK-BF16-GI-NEXT: ret %r = fcmp ueq bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ugt: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; 
CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, hi -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ugt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, hi +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ugt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, hi +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ugt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, hi +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ugt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, hi +; CHECK-BF16-GI-NEXT: ret %r = fcmp ugt bfloat %a, %b ret i1 %r } define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_uge: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: 
cset w0, pl -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, pl +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_uge: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, pl +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, pl +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_uge: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, pl +; CHECK-BF16-GI-NEXT: ret %r = fcmp uge bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ult: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, lt -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ult: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: 
// kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, lt +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ult: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, lt +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ult: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, lt +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ult: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, lt +; CHECK-BF16-GI-NEXT: ret %r = fcmp ult bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ule: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, le -; CHECK-NEXT: ret - %r = fcmp ule bfloat %a, %b - ret i1 %r -} - -define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_uno: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; 
CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, vs -; CHECK-NEXT: ret - %r = fcmp uno bfloat %a, %b - ret i1 %r -} - -define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_one: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w8, mi -; CHECK-NEXT: csinc w0, w8, wzr, le -; CHECK-NEXT: ret - %r = fcmp one bfloat %a, %b - ret i1 %r -} +; CHECK-CVT-SD-LABEL: test_fcmp_ule: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, le +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ule: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, le +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ule: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, le +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ule: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; 
CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, le +; CHECK-BF16-GI-NEXT: ret + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { +; CHECK-CVT-SD-LABEL: test_fcmp_uno: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, vs +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_uno: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, vs +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uno: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, vs +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_uno: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, vs +; CHECK-BF16-GI-NEXT: ret + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { +; CHECK-CVT-SD-LABEL: test_fcmp_one: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; 
CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w8, mi +; CHECK-CVT-SD-NEXT: csinc w0, w8, wzr, le +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_one: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w8, mi +; CHECK-BF16-SD-NEXT: csinc w0, w8, wzr, le +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_one: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w8, mi +; CHECK-CVT-GI-NEXT: cset w9, gt +; CHECK-CVT-GI-NEXT: orr w0, w8, w9 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_one: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w8, mi +; CHECK-BF16-GI-NEXT: cset w9, gt +; CHECK-BF16-GI-NEXT: orr w0, w8, w9 +; CHECK-BF16-GI-NEXT: ret + %r = fcmp one bfloat %a, %b + ret i1 %r +} define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_oeq: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; 
CHECK-CVT-SD-LABEL: test_fcmp_oeq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, eq +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_oeq: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, eq +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oeq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, eq +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_oeq: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, eq +; CHECK-BF16-GI-NEXT: ret %r = fcmp oeq bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ogt: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, gt -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ogt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def 
$d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, gt +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ogt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, gt +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ogt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, gt +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ogt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, gt +; CHECK-BF16-GI-NEXT: ret %r = fcmp ogt bfloat %a, %b ret i1 %r } define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_oge: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, ge -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, ge +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_oge: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, ge +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, ge +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_oge: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, ge +; CHECK-BF16-GI-NEXT: ret %r = fcmp oge bfloat %a, %b ret i1 %r } define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_olt: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, mi -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_olt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, mi +; 
CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_olt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, mi +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_olt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, mi +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_olt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, mi +; CHECK-BF16-GI-NEXT: ret %r = fcmp olt bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ole: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, ls -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ole: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, ls +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ole: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // 
kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, ls +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ole: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, ls +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ole: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, ls +; CHECK-BF16-GI-NEXT: ret %r = fcmp ole bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 { -; CHECK-LABEL: test_fcmp_ord: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w0, vc -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ord: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: cset w0, vc +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ord: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: 
shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: cset w0, vc +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ord: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: cset w0, vc +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ord: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: cset w0, vc +; CHECK-BF16-GI-NEXT: ret %r = fcmp ord bfloat %a, %b ret i1 %r } define void @test_fccmp(bfloat %in, ptr %out) { -; CHECK-LABEL: test_fccmp: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2s, #69, lsl #24 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-NEXT: adrp x8, .LCPI29_0 -; CHECK-NEXT: movi v3.2s, #72, lsl #24 -; CHECK-NEXT: fcmp s2, s1 -; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] -; CHECK-NEXT: fccmp s2, s3, #4, mi -; CHECK-NEXT: fcsel s0, s0, s1, gt -; CHECK-NEXT: str h0, [x0] -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fccmp: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: movi v1.2s, #69, lsl #24 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: adrp x8, .LCPI29_0 +; CHECK-CVT-SD-NEXT: movi v3.2s, #72, lsl #24 +; CHECK-CVT-SD-NEXT: fcmp s2, s1 +; CHECK-CVT-SD-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] +; CHECK-CVT-SD-NEXT: fccmp s2, s3, #4, mi +; CHECK-CVT-SD-NEXT: fcsel s0, s0, s1, gt +; CHECK-CVT-SD-NEXT: str h0, [x0] +; CHECK-CVT-SD-NEXT: ret +; +; 
CHECK-BF16-SD-LABEL: test_fccmp: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: movi v1.2s, #69, lsl #24 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: adrp x8, .LCPI29_0 +; CHECK-BF16-SD-NEXT: movi v3.2s, #72, lsl #24 +; CHECK-BF16-SD-NEXT: fcmp s2, s1 +; CHECK-BF16-SD-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] +; CHECK-BF16-SD-NEXT: fccmp s2, s3, #4, mi +; CHECK-BF16-SD-NEXT: fcsel s0, s0, s1, gt +; CHECK-BF16-SD-NEXT: str h0, [x0] +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fccmp: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: movi v1.2s, #69, lsl #24 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0 +; CHECK-CVT-GI-NEXT: movi v3.2s, #72, lsl #24 +; CHECK-CVT-GI-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s2, s1 +; CHECK-CVT-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] +; CHECK-CVT-GI-NEXT: fmov w8, s0 +; CHECK-CVT-GI-NEXT: fmov w9, s1 +; CHECK-CVT-GI-NEXT: fccmp s4, s3, #4, mi +; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt +; CHECK-CVT-GI-NEXT: strh w8, [x0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fccmp: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: movi v1.2s, #69, lsl #24 +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: adrp x8, .LCPI29_0 +; CHECK-BF16-GI-NEXT: movi v3.2s, #72, lsl #24 +; CHECK-BF16-GI-NEXT: shll v4.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s2, s1 +; CHECK-BF16-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] +; CHECK-BF16-GI-NEXT: fmov w8, s0 +; CHECK-BF16-GI-NEXT: fmov w9, s1 +; CHECK-BF16-GI-NEXT: fccmp s4, s3, #4, mi +; CHECK-BF16-GI-NEXT: csel w8, w8, w9, gt +; CHECK-BF16-GI-NEXT: strh w8, [x0] +; CHECK-BF16-GI-NEXT: ret %cmp1 = fcmp ogt bfloat %in, 0xR4800 %cmp2 = fcmp olt bfloat %in, 0xR4500 %cond = and i1 %cmp1, %cmp2 @@ -882,16 +1399,49 @@ define void 
@test_fccmp(bfloat %in, ptr %out) { } define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 { -; CHECK-LABEL: test_br_cc: -; CHECK: // %bb.0: // %common.ret -; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: csel x8, x0, x1, pl -; CHECK-NEXT: str wzr, [x8] -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_br_cc: +; CHECK-CVT-SD: // %bb.0: // %common.ret +; CHECK-CVT-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmp s0, s1 +; CHECK-CVT-SD-NEXT: csel x8, x0, x1, pl +; CHECK-CVT-SD-NEXT: str wzr, [x8] +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_br_cc: +; CHECK-BF16-SD: // %bb.0: // %common.ret +; CHECK-BF16-SD-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-SD-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmp s0, s1 +; CHECK-BF16-SD-NEXT: csel x8, x0, x1, pl +; CHECK-BF16-SD-NEXT: str wzr, [x8] +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_br_cc: +; CHECK-CVT-GI: // %bb.0: // %common.ret +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmp s0, s1 +; CHECK-CVT-GI-NEXT: csel x8, x0, x1, pl +; CHECK-CVT-GI-NEXT: str wzr, [x8] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_br_cc: +; CHECK-BF16-GI: // %bb.0: // %common.ret +; CHECK-BF16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; 
CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmp s0, s1 +; CHECK-BF16-GI-NEXT: csel x8, x0, x1, pl +; CHECK-BF16-GI-NEXT: str wzr, [x8] +; CHECK-BF16-GI-NEXT: ret %c = fcmp uge bfloat %a, %b br i1 %c, label %then, label %else then: diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll index 823a245a3cc81..93b80b012a8b6 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll @@ -5,20 +5,6 @@ ; RUN: llc < %s -mtriple aarch64 -mattr=+bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-BF16-GI ; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_frem -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ult -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_one -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oge -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole -; CHECK-CVT-GI-NEXT: warning: Instruction selection used 
fallback path for test_fcmp_ord ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i8 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i16 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i32 @@ -56,20 +42,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg ; ; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_frem -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ult -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_one -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oge -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i8 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i16 ; CHECK-BF16-GI-NEXT: warning: Instruction 
selection used fallback path for test_fptosi_i32 @@ -584,146 +556,449 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 { } define <4 x i1> @test_fcmp_une(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_une: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_une: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_une: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_une: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_une: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp une <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ueq(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ueq: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v2.4s, v0.4s, 
v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ueq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ueq: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ueq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ueq: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ueq <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ugt(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ugt: -; CHECK: // %bb.0: -; 
CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ugt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ugt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ugt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ugt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ugt <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_uge(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_uge: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_uge: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_uge: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp uge <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ult(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ult: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ult: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ult: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; 
CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ult: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ult: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ult <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ule(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ule: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ule: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ule: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ule: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; 
CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ule: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ule <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_uno(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_uno: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uno: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_uno: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uno: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, 
v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_uno: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp uno <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_one(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_one: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v2.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_one: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_one: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_one: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s 
+; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_one: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp one <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_oeq(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_oeq: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oeq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_oeq: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oeq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_oeq: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp oeq <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ogt(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ogt: -; 
CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ogt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ogt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ogt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ogt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ogt <4 x bfloat> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_oge(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_oge: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_oge: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, 
v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_oge: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp oge <4 x bfloat> %a, %b ret <4 x i1> %1 } @@ -753,15 +1028,45 @@ define <4 x i1> @test_fcmp_ole(<4 x bfloat> %a, <4 x bfloat> %b) #0 { } define <4 x i1> @test_fcmp_ord(<4 x bfloat> %a, <4 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ord: -; CHECK: // %bb.0: -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ord: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ord: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-SD-NEXT: ret +; +; 
CHECK-CVT-GI-LABEL: test_fcmp_ord: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ord: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-BF16-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ord <4 x bfloat> %a, %b ret <4 x i1> %1 } diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index 2b31e876487d9..b18b44b46d11a 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -6,20 +6,6 @@ ; RUN: llc < %s -mtriple aarch64 -mattr=+bf16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-BF16-GI ; CHECK-CVT-GI: warning: Instruction selection used fallback path for test_frem -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ult -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno -; CHECK-CVT-GI-NEXT: warning: Instruction selection used 
fallback path for test_fcmp_one -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oge -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole -; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i8 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_v16i8 ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i16 @@ -61,20 +47,6 @@ ; CHECK-CVT-GI-NEXT: warning: Instruction selection used fallback path for test_fneg ; ; CHECK-BF16-GI: warning: Instruction selection used fallback path for test_frem -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ult -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_one -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt -; CHECK-BF16-GI-NEXT: warning: Instruction selection 
used fallback path for test_fcmp_oge -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole -; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i8 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_v16i8 ; CHECK-BF16-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i16 @@ -943,248 +915,1026 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 { } define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_une: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmeq v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_une: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_une: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmeq v0.4s, v0.4s, 
v1.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_une: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_une: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_une: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp une <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ueq: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 
v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v4.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmgt v3.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ueq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ueq: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ueq: +; CHECK-BF16SVE-SD: // %bb.0: +; 
CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16SVE-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ueq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: mvn v1.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ueq: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-GI-NEXT: mvn v1.16b, 
v1.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ueq <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ugt: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ugt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ugt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ugt: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-BF16SVE-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ugt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ugt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ugt <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_uge: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, 
v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_uge: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_uge: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; 
CHECK-BF16-GI-LABEL: test_fcmp_uge: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp uge <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ult: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ult: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ult: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; 
CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ult: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ult: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ult: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ult <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ule: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; 
CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ule: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ule: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ule: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ule: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; 
CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ule: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ule <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_uno: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v4.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmge v3.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uno: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: 
fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_uno: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_uno: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16SVE-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16SVE-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16SVE-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uno: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; 
CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: mvn v1.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_uno: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-GI-NEXT: mvn v1.16b, v1.16b +; CHECK-BF16-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp uno <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_one: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v4.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmgt v3.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_one: +; CHECK-CVT-SD: 
// %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_one: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_one: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16SVE-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: 
test_fcmp_one: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_one: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp one <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_oeq: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmeq v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oeq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_oeq: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_oeq: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oeq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_oeq: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; 
CHECK-BF16-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp oeq <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ogt: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ogt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ogt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ogt: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 
v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ogt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ogt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ogt <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_oge: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; 
CHECK-BF16-SD-LABEL: test_fcmp_oge: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_oge: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_oge: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp oge <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x 
bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_olt: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_olt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_olt: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_olt: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_olt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; 
CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_olt: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp olt <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ole: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ole: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ole: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: 
fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ole: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ole: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ole: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ole <8 x bfloat> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 { -; CHECK-LABEL: test_fcmp_ord: -; CHECK: // %bb.0: -; CHECK-NEXT: shll2 v2.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v3.4s, v0.8h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcmge v4.4s, v3.4s, v2.4s -; 
CHECK-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmge v3.4s, v0.4s, v1.4s -; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ord: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-CVT-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-CVT-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-BF16-SD-LABEL: test_fcmp_ord: +; CHECK-BF16-SD: // %bb.0: +; CHECK-BF16-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-BF16-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-BF16-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-BF16-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-SD-NEXT: ret +; +; CHECK-BF16SVE-SD-LABEL: test_fcmp_ord: +; CHECK-BF16SVE-SD: // %bb.0: +; CHECK-BF16SVE-SD-NEXT: shll2 v2.4s, v1.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll2 v3.4s, v0.8h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16SVE-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16SVE-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; 
CHECK-BF16SVE-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-BF16SVE-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16SVE-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16SVE-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16SVE-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BF16SVE-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16SVE-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ord: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-CVT-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-CVT-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-CVT-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-CVT-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-BF16-GI-LABEL: test_fcmp_ord: +; CHECK-BF16-GI: // %bb.0: +; CHECK-BF16-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-BF16-GI-NEXT: shll v3.4s, v1.4h, #16 +; CHECK-BF16-GI-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-BF16-GI-NEXT: shll2 v1.4s, v1.8h, #16 +; CHECK-BF16-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s +; CHECK-BF16-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-BF16-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-BF16-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-BF16-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-BF16-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-BF16-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BF16-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-BF16-GI-NEXT: ret %1 = fcmp ord <8 x bfloat> %a, %b ret <8 x i1> %1 } From 062ddf58b98fb82a13469be0c4c913605eb6463f Mon Sep 17 00:00:00 2001 From: Kiva Date: Sat, 9 May 2026 20:15:22 +0800 Subject: [PATCH 138/538] [RISCV][NFC] Rename `Zvvmm` instruction file to `Zvvm` (#196692) Renames `RISCVInstrInfoZvvmm.td` to `RISCVInstrInfoZvvm.td` so `Zvvmm` and `Zvvfmm` 
share the same IME instruction file according to the spec. And all future instructions from the `Zvvm family` will be placed here too. This PR is required for reviewing #196486 in order to make GitHub show the diff correcrly. --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 2 +- .../RISCV/{RISCVInstrInfoZvvmm.td => RISCVInstrInfoZvvm.td} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename llvm/lib/Target/RISCV/{RISCVInstrInfoZvvmm.td => RISCVInstrInfoZvvm.td} (100%) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 3295d18a2d352..541bf302bf8e5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -2356,7 +2356,7 @@ include "RISCVInstrInfoZvk.td" include "RISCVInstrInfoZvdot4a8i.td" include "RISCVInstrInfoZvfofp8min.td" include "RISCVInstrInfoZvzip.td" -include "RISCVInstrInfoZvvmm.td" +include "RISCVInstrInfoZvvm.td" // Packed SIMD include "RISCVInstrInfoP.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvvmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvvm.td similarity index 100% rename from llvm/lib/Target/RISCV/RISCVInstrInfoZvvmm.td rename to llvm/lib/Target/RISCV/RISCVInstrInfoZvvm.td From ae3d770866bc434daa0e7f3e1fbadf8bbc260bac Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Sat, 9 May 2026 13:44:32 +0100 Subject: [PATCH 139/538] [BPF] Support Stack Arguments (#189060) Currently, bpf program and kfunc only support 5 register parameters. As bpf community and use cases keep expanding, there are some need to extend 5 register parameters by allocating additional parameters on stack. There are two main use cases here: 1. Currently kfunc is limited to 5 register parameters. In some special situation, people may want to have more than 5 parameters. One of example is for sched_ext. 2. Allowing more stack parameters can make bpf prog writer easier since they do not need to carefully limit the number of parameters for their programs. 
The following is the high-level design: - Use bpf register R11 as the frame pointer to stack parameters. This is to avoid mixing stacks due to R10. - Stack parameters must be after 5 register parameters. - All parameters should be at most 16 bytes as ByVal parameters are not supported. - Support for cpu v1 to v4 so all cpu versions can use this. A feature macro __BPF_FEATURE_STACK_ARGUMENT is defined and users can check whether stack argument is supported or not. The below is a simple asm code example about stack parameters: ``` bar: /* Retrieve two parameters from the caller of bar(). */ rX = *(u64 *)(r11 + 8) // 1st arg rY = *(u64 *)(r11 + 16) // 2nd arg ... /* Prepare the single stack parameters for foo1 */ *(u64 *)(r11 - 8) = rZ // 1st arg call foo1 ... /* Prepare the single stack parameters for foo2 */ *(u64 *)(r11 - 8) = rX // 1st arg *(u64 *)(r11 - 16) = rY // 2nd arg call foo2 ... foo1: /* Retrieve parameter '*(u64 *)(r11 - 8) = rZ' from bar(), * and assign the value rZ to rX. */ rX = *(u64 *)(r11 + 8) // 1st arg ... foo2: /* Retrieve parameters '*(u64 *)(r11 - 8/16) = rZ' from bar(), * and assign values rX/rY to rU/rV. */ rU = *(u64 *)(r11 + 8) // 1st arg rV = *(u64 *)(r11 + 16) // 2nd arg ... ``` The code patterns in the above try to follow x86_64/arm64 calling conventions. That is, the first argument is in lower location than the second argument, etc. The r11 based load should retrieve the value directly from the caller stack. The r11 based store should push the value directly on the specificed stack location. Internally in bpf backend, pseudo insns are generated for load_stack_arg and store_stack_arg. The BPFMIPeephole pass changes pseudo insns into proper real bpf insns like the above. 
--- clang/lib/Basic/Targets/BPF.cpp | 1 + .../test/Preprocessor/bpf-predefined-macros.c | 8 + llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp | 32 +++ llvm/lib/Target/BPF/BPFISelLowering.cpp | 97 ++++++--- llvm/lib/Target/BPF/BPFISelLowering.h | 3 - llvm/lib/Target/BPF/BPFInstrInfo.td | 32 +++ llvm/lib/Target/BPF/BPFMIPeephole.cpp | 60 ++++++ .../BPF/Disassembler/BPFDisassembler.cpp | 8 +- llvm/test/CodeGen/BPF/many_args1.ll | 6 +- llvm/test/CodeGen/BPF/many_args2.ll | 6 +- llvm/test/CodeGen/BPF/many_args3.ll | 199 ++++++++++++++++++ llvm/test/CodeGen/BPF/many_args4.ll | 65 ++++++ llvm/test/CodeGen/BPF/many_args5.ll | 27 +++ llvm/test/CodeGen/BPF/many_args6.ll | 32 +++ llvm/test/CodeGen/BPF/many_args7.ll | 33 +++ llvm/test/CodeGen/BPF/many_args8.ll | 36 ++++ 16 files changed, 598 insertions(+), 47 deletions(-) create mode 100644 llvm/test/CodeGen/BPF/many_args3.ll create mode 100644 llvm/test/CodeGen/BPF/many_args4.ll create mode 100644 llvm/test/CodeGen/BPF/many_args5.ll create mode 100644 llvm/test/CodeGen/BPF/many_args6.ll create mode 100644 llvm/test/CodeGen/BPF/many_args7.ll create mode 100644 llvm/test/CodeGen/BPF/many_args8.ll diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp index 8de1083d758c7..100769ea4cdb1 100644 --- a/clang/lib/Basic/Targets/BPF.cpp +++ b/clang/lib/Basic/Targets/BPF.cpp @@ -46,6 +46,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__BPF_FEATURE_ADDR_SPACE_CAST"); Builder.defineMacro("__BPF_FEATURE_MAY_GOTO"); Builder.defineMacro("__BPF_FEATURE_ATOMIC_MEM_ORDERING"); + Builder.defineMacro("__BPF_FEATURE_STACK_ARGUMENT"); if (CPU.empty()) CPU = "v3"; diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c index b4e37fdd7de37..90287b7b24e95 100644 --- a/clang/test/Preprocessor/bpf-predefined-macros.c +++ b/clang/test/Preprocessor/bpf-predefined-macros.c @@ -76,6 +76,9 @@ int w; #ifdef __BPF_FEATURE_ATOMIC_MEM_ORDERING int x; 
#endif +#ifdef __BPF_FEATURE_STACK_ARGUMENT +int y; +#endif // CHECK: int b; // CHECK: int c; @@ -123,6 +126,11 @@ int x; // CPU_V3: int x; // CPU_V4: int x; +// CPU_V1: int y; +// CPU_V2: int y; +// CPU_V3: int y; +// CPU_V4: int y; + // CPU_GENERIC: int g; // CPU_PROBE: int f; diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index dadba52de4627..af034b134e632 100644 --- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -193,6 +193,38 @@ void BPFDAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case BPFISD::LOAD_STACK_ARG: { + SDValue Chain = Node->getOperand(0); + auto *CN = cast(Node->getOperand(1)); + SDValue Off = + CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(Node), MVT::i64); + EVT ValVT = Node->getValueType(0); + CurDAG->SelectNodeTo(Node, BPF::LOAD_STACK_ARG_PSEUDO, ValVT, MVT::Other, + Off, Chain); + return; + } + + case BPFISD::STORE_STACK_ARG: { + SDValue Chain = Node->getOperand(0); + auto *CN = cast(Node->getOperand(1)); + SDValue Off = + CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(Node), MVT::i64); + SDValue Val = Node->getOperand(2); + + // Use store-immediate when the value is a constant that fits in 32 bits. 
+ if (auto *ValCN = dyn_cast(Val); + ValCN && Subtarget->hasStoreImm() && isInt<32>(ValCN->getSExtValue())) { + SDValue Imm = CurDAG->getTargetConstant(ValCN->getSExtValue(), + SDLoc(Node), MVT::i64); + CurDAG->SelectNodeTo(Node, BPF::STORE_STACK_ARG_IMM_PSEUDO, MVT::Other, + Off, Imm, Chain); + } else { + CurDAG->SelectNodeTo(Node, BPF::STORE_STACK_ARG_PSEUDO, MVT::Other, Off, + Val, Chain); + } + return; + } + case ISD::FrameIndex: { int FI = cast(Node)->getIndex(); EVT VT = Node->getValueType(0); diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 6ffa9d1693bc0..d846e6378acef 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -380,6 +380,21 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Calling Convention Implementation #include "BPFGenCallingConv.inc" +// Apply AssertSext/AssertZext and truncate based on VA's LocInfo. +static SDValue convertLocValType(SelectionDAG &DAG, const SDLoc &DL, + const CCValAssign &VA, EVT RegVT, + SDValue ArgValue) { + if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + if (VA.getLocInfo() != CCValAssign::Full) + ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); + return ArgValue; +} + SDValue BPFTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, @@ -400,13 +415,12 @@ SDValue BPFTargetLowering::LowerFormalArguments( CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, getHasAlu32() ? 
CC_BPF32 : CC_BPF64); - bool HasMemArgs = false; for (size_t I = 0; I < ArgLocs.size(); ++I) { auto &VA = ArgLocs[I]; + EVT RegVT = VA.getLocVT(); if (VA.isRegLoc()) { // Arguments passed in registers - EVT RegVT = VA.getLocVT(); MVT::SimpleValueType SimpleTy = RegVT.getSimpleVT().SimpleTy; switch (SimpleTy) { default: { @@ -423,40 +437,38 @@ SDValue BPFTargetLowering::LowerFormalArguments( SimpleTy == MVT::i64 ? &BPF::GPRRegClass : &BPF::GPR32RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT); + InVals.push_back(convertLocValType(DAG, DL, VA, RegVT, ArgValue)); + break; + } + continue; + } - // If this is an value that has been promoted to wider types, insert an - // assert[sz]ext to capture this, then truncate to the right size. - if (VA.getLocInfo() == CCValAssign::SExt) - ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - else if (VA.getLocInfo() == CCValAssign::ZExt) - ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - - if (VA.getLocInfo() != CCValAssign::Full) - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - - InVals.push_back(ArgValue); - + if (VA.isMemLoc()) { + // For example, two stack arguments, + // arg1: Off = 8 + // arg2: off = 16 + int Off = VA.getLocMemOffset() + 8; + if (Off > INT16_MAX) { + fail(DL, DAG, "extra parameter stack depth exceeded limit"); break; } - } else { - if (VA.isMemLoc()) - HasMemArgs = true; - else - report_fatal_error("unhandled argument location"); - InVals.push_back(DAG.getConstant(0, DL, VA.getLocVT())); + + // Physical extra argument slot is always 64-bit. 
+ SDValue StackVal = DAG.getNode(BPFISD::LOAD_STACK_ARG, DL, + DAG.getVTList(MVT::i64, MVT::Other), Chain, + DAG.getConstant(Off, DL, MVT::i64)); + SDValue ArgValue = StackVal.getValue(0); + Chain = StackVal.getValue(1); + InVals.push_back(convertLocValType(DAG, DL, VA, MVT::i64, ArgValue)); + continue; } } - if (HasMemArgs) - fail(DL, DAG, "stack arguments are not supported"); + if (IsVarArg) fail(DL, DAG, "variadic functions are not supported"); return Chain; } -const size_t BPFTargetLowering::MaxArgs = 5; - static void resetRegMaskBit(const TargetRegisterInfo *TRI, uint32_t *RegMask, MCRegister Reg) { for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) @@ -504,9 +516,6 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned NumBytes = CCInfo.getStackSize(); - if (Outs.size() > MaxArgs) - fail(CLI.DL, DAG, "too many arguments", Callee); - for (auto &Arg : Outs) { ISD::ArgFlagsTy Flags = Arg.Flags; if (!Flags.isByVal()) @@ -518,10 +527,10 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, auto PtrVT = getPointerTy(MF.getDataLayout()); Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); - SmallVector, MaxArgs> RegsToPass; + SmallVector, 8> RegsToPass; // Walk arg assignments - for (size_t i = 0; i < std::min(ArgLocs.size(), MaxArgs); ++i) { + for (size_t i = 0; i < OutVals.size(); ++i) { CCValAssign &VA = ArgLocs[i]; SDValue &Arg = OutVals[i]; @@ -543,10 +552,30 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // Push arguments into RegsToPass vector - if (VA.isRegLoc()) + if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - else - report_fatal_error("stack arguments are not supported"); + continue; + } + + if (VA.isMemLoc()) { + int Off = -8 - VA.getLocMemOffset(); + if (Off < INT16_MIN) { + fail(CLI.DL, DAG, "extra parameter stack depth exceeded limit"); + break; + } + + // STORE_STACK_ARG requires i64 operands. 
With ALU32 mode, the CC + // promotion may only extend to i32, so extend to i64 if needed. + if (Arg.getValueType() != MVT::i64) + Arg = DAG.getNode(ISD::ANY_EXTEND, CLI.DL, MVT::i64, Arg); + + SDValue OffVal = DAG.getConstant(Off, CLI.DL, MVT::i64); + Chain = DAG.getNode(BPFISD::STORE_STACK_ARG, CLI.DL, MVT::Other, Chain, + OffVal, Arg); + continue; + } + + report_fatal_error("unhandled argument location"); } SDValue InGlue; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index b16fce0dd418f..fa6387754271f 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -98,9 +98,6 @@ class BPFTargetLowering : public TargetLowering { const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const; - // Maximum number of arguments to a call - static const size_t MaxArgs; - // Lower a call into CALLSEQ_START - BPFISD:CALL - CALLSEQ_END chain SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 3d2050e26ca0d..c86b45ee93cc6 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -43,6 +43,22 @@ def BPFcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_BPFCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, [SDNPHasChain]>; +def SDT_BPFLoadStackArg : SDTypeProfile<1, 1, [ + SDTCisVT<0, i64>, // result value + SDTCisVT<1, i64> // operand: offset +]>; +def SDT_BPFStoreStackArg : SDTypeProfile<0, 2, [ + SDTCisVT<0, i64>, // operand 0: offset + SDTCisVT<1, i64> // operand 1: stored value +]>; + +def BPFload_stack_arg : SDNode<"BPFISD::LOAD_STACK_ARG", + SDT_BPFLoadStackArg, + [SDNPHasChain, SDNPMayLoad]>; +def BPFstore_stack_arg : SDNode<"BPFISD::STORE_STACK_ARG", + SDT_BPFStoreStackArg, + [SDNPHasChain, SDNPMayStore]>; + def BPFselectcc : SDNode<"BPFISD::SELECT_CC", 
SDT_BPFSelectCC>; def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>; def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY, @@ -300,6 +316,22 @@ let Predicates = [BPFHasGotox] in { } } +let hasSideEffects = 1, mayLoad = 1 in { + def LOAD_STACK_ARG_PSEUDO + : Pseudo<(outs GPR:$dst), (ins s16imm:$off), + "load_stack_arg\t$dst, $off", []>; +} + +let hasSideEffects = 1, mayStore = 1 in { + def STORE_STACK_ARG_PSEUDO + : Pseudo<(outs), (ins s16imm:$off, GPR:$src), + "store_stack_arg\t$off, $src", []>; + + def STORE_STACK_ARG_IMM_PSEUDO + : Pseudo<(outs), (ins s16imm:$off, i32imm:$val), + "store_stack_arg_imm\t$off, $val", []>; +} + // ALU instructions class ALU_RI pattern> diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp index b8e4db78955f5..3e5924a6ac74c 100644 --- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp +++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp @@ -323,6 +323,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass { bool insertMissingCallerSavedSpills(); bool removeMayGotoZero(); bool addExitAfterUnreachable(); + bool expandStackArgPseudos(); public: @@ -340,6 +341,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass { Changed |= insertMissingCallerSavedSpills(); Changed |= removeMayGotoZero(); Changed |= addExitAfterUnreachable(); + Changed |= expandStackArgPseudos(); return Changed; } }; @@ -752,6 +754,64 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() { return true; } +bool BPFMIPreEmitPeephole::expandStackArgPseudos() { + bool Changed = false; + + for (MachineBasicBlock &MBB : *MF) { + for (auto It = MBB.begin(), End = MBB.end(); It != End;) { + MachineInstr &MI = *It++; + DebugLoc DL = MI.getDebugLoc(); + + switch (MI.getOpcode()) { + default: + break; + + case BPF::LOAD_STACK_ARG_PSEUDO: { + Register DstReg = MI.getOperand(0).getReg(); + int16_t Off = MI.getOperand(1).getImm(); + + BuildMI(MBB, MI, DL, TII->get(BPF::LDD), DstReg) + .addReg(BPF::R11) + .addImm(Off); + 
MI.eraseFromParent(); + Changed = true; + break; + } + + case BPF::STORE_STACK_ARG_PSEUDO: { + int16_t Off = MI.getOperand(0).getImm(); + const MachineOperand &SrcMO = MI.getOperand(1); + Register SrcReg = SrcMO.getReg(); + bool IsKill = SrcMO.isKill(); + + BuildMI(MBB, MI, DL, TII->get(BPF::STD)) + .addReg(SrcReg, getKillRegState(IsKill)) + .addReg(BPF::R11) + .addImm(Off); + MI.eraseFromParent(); + Changed = true; + break; + } + + case BPF::STORE_STACK_ARG_IMM_PSEUDO: { + int16_t Off = MI.getOperand(0).getImm(); + int32_t Val = MI.getOperand(1).getImm(); + + BuildMI(MBB, MI, DL, TII->get(BPF::STD_imm)) + .addImm(Val) + .addReg(BPF::R11) + .addImm(Off); + MI.eraseFromParent(); + Changed = true; + break; + } + } + } + } + + return Changed; +} + } // end default namespace INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole", diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index e776f9d32b3e5..fff72587714a4 100644 --- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -97,8 +97,8 @@ LLVMInitializeBPFDisassembler() { } static const unsigned GPRDecoderTable[] = { - BPF::R0, BPF::R1, BPF::R2, BPF::R3, BPF::R4, BPF::R5, - BPF::R6, BPF::R7, BPF::R8, BPF::R9, BPF::R10, BPF::R11}; + BPF::R0, BPF::R1, BPF::R2, BPF::R3, BPF::R4, BPF::R5, + BPF::R6, BPF::R7, BPF::R8, BPF::R9, BPF::R10, BPF::R11}; static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, @@ -112,8 +112,8 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, } static const unsigned GPR32DecoderTable[] = { - BPF::W0, BPF::W1, BPF::W2, BPF::W3, BPF::W4, BPF::W5, - BPF::W6, BPF::W7, BPF::W8, BPF::W9, BPF::W10, BPF::W11}; + BPF::W0, BPF::W1, BPF::W2, BPF::W3, BPF::W4, BPF::W5, + BPF::W6, BPF::W7, BPF::W8, BPF::W9, BPF::W10, BPF::W11}; static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, 
uint64_t /*Address*/, diff --git a/llvm/test/CodeGen/BPF/many_args1.ll b/llvm/test/CodeGen/BPF/many_args1.ll index 0e2ff1af06e55..e3a55deeecae6 100644 --- a/llvm/test/CodeGen/BPF/many_args1.ll +++ b/llvm/test/CodeGen/BPF/many_args1.ll @@ -1,6 +1,4 @@ -; RUN: not llc -mtriple=bpf -mcpu=v1 < %s 2> %t1 -; RUN: FileCheck %s < %t1 -; CHECK: error: :0:0: in function foo i32 (i32, i32, i32): {{t10|0x[0-f]+}}: i64 = GlobalAddress 0 too many arguments +; RUN: llc -mtriple=bpf -mcpu=v1 < %s | FileCheck %s ; Function Attrs: nounwind uwtable define i32 @foo(i32 %a, i32 %b, i32 %c) #0 { @@ -9,4 +7,6 @@ entry: ret i32 %call } +; CHECK: call bar + declare i32 @bar(i32, i32, i32, i32, i32, i32) #1 diff --git a/llvm/test/CodeGen/BPF/many_args2.ll b/llvm/test/CodeGen/BPF/many_args2.ll index d1feefc0e4047..f35fe5c9bf951 100644 --- a/llvm/test/CodeGen/BPF/many_args2.ll +++ b/llvm/test/CodeGen/BPF/many_args2.ll @@ -1,6 +1,4 @@ -; RUN: not llc -mtriple=bpf < %s 2> %t1 -; RUN: FileCheck %s < %t1 -; CHECK: error: :0:0: in function bar i32 (i32, i32, i32, i32, i32, i32): stack arguments are not supported +; RUN: llc -mtriple=bpf < %s | FileCheck %s ; Function Attrs: nounwind readnone uwtable define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f) #0 { @@ -8,6 +6,8 @@ entry: ret i32 1 } +; CHECK-LABEL: bar: + ; Function Attrs: nounwind readnone uwtable define i32 @foo(i32 %a, i32 %b, i32 %c) #0 { entry: diff --git a/llvm/test/CodeGen/BPF/many_args3.ll b/llvm/test/CodeGen/BPF/many_args3.ll new file mode 100644 index 0000000000000..cf2e79648c6b2 --- /dev/null +++ b/llvm/test/CodeGen/BPF/many_args3.ll @@ -0,0 +1,199 @@ +; RUN: llc < %s -mtriple=bpf -mcpu=v1 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v2 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v3 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v4 | FileCheck %s + +; Source code: +; struct t { long a; long b; }; +; long foo1(int a1, int a2, int a3, int a4, int a5, short a6, long a7) { +; return a1 + a2 + a3 + a4 + a5 
+ a6 + a7; +; } +; +; long foo2(int a1, int a2, int a3, int a4, int a5, struct t a6, int a7) { +; return a1 + a2 + a3 + a4 + a5 + a6.a + a6.b + a7; +; } +; +; long foo3(struct t a1, int a2, int a3, int a4, int a5, struct t a6) { +; return a1.a + a1.b + a2 + a3 + a4 + a5 + a6.a + a6.b; +; } +; +; long foo4(int a1, int a2, int a3, int a4, int a5, struct t a6, struct t a7) { +; return a1 + a2 + a3 + a4 + a5 + a6.a + a6.b + a7.a + a7.b; +; } +; +; long bar5(int a1, int a2, int a3, int a4, int a5, short a6, long a7); +; long foo5(int a1, int a2, int a3) { +; return bar5(a1, a2, a3, a2, a3, a1, a2); +; } +; +; long bar6(int a1, int a2, int a3, int a4, int a5, struct t a6, int a7); +; long foo6(int a1, int a2, int a3) { +; struct t tmp = {a1, a2}; +; return bar6(a1, a2, a3, a2, a3, tmp, a2); +; } +; +; long bar7(struct t a1, int a2, int a3, int a4, int a5, struct t a6); +; long foo7(int a1, int a2, int a3) { +; struct t tmp1 = {a1, a2}; +; struct t tmp2 = {a2, a3}; +; return bar7(tmp1, a3, a2, a1, a2, tmp2); +; } +; +; long bar8(int a1, int a2, int a3, int a4, int a5, struct t a6, struct t a7); +; long foo8(int a1, int a2, int a3) { +; struct t tmp1 = { a3, a2 }; +; struct t tmp2 = { a2, a3 }; +; return bar8(a1, a2, a3, a2, a3, tmp1, tmp2); +; } + +define dso_local i64 @foo1(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i16 noundef signext %5, i64 noundef %6) local_unnamed_addr { + %8 = add nsw i32 %1, %0 + %9 = add nsw i32 %8, %2 + %10 = add nsw i32 %9, %3 + %11 = add nsw i32 %10, %4 + %12 = sext i16 %5 to i32 + %13 = add nsw i32 %11, %12 + %14 = sext i32 %13 to i64 + %15 = add nsw i64 %6, %14 + ret i64 %15 +} + +; CHECK-LABEL: foo1: +; CHECK: r[[#]] = *(u64 *)(r11 + 8) +; CHECK: r[[#]] = *(u64 *)(r11 + 16) + +define dso_local i64 @foo2(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, [2 x i64] %5, i32 noundef %6) local_unnamed_addr { + %8 = extractvalue [2 x i64] %5, 0 + %9 = extractvalue [2 x i64] %5, 1 + 
%10 = add nsw i32 %1, %0 + %11 = add nsw i32 %10, %2 + %12 = add nsw i32 %11, %3 + %13 = add nsw i32 %12, %4 + %14 = sext i32 %13 to i64 + %15 = add nsw i64 %8, %14 + %16 = add nsw i64 %15, %9 + %17 = sext i32 %6 to i64 + %18 = add nsw i64 %16, %17 + ret i64 %18 +} + +; CHECK-LABEL: foo2: +; CHECK: r[[#]] = *(u64 *)(r11 + 8) +; CHECK: r[[#]] = *(u64 *)(r11 + 16) +; CHECK: r[[#]] = *(u64 *)(r11 + 24) + +define dso_local i64 @foo3([2 x i64] %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, [2 x i64] %5) local_unnamed_addr { + %7 = extractvalue [2 x i64] %0, 0 + %8 = extractvalue [2 x i64] %0, 1 + %9 = extractvalue [2 x i64] %5, 0 + %10 = extractvalue [2 x i64] %5, 1 + %11 = add nsw i64 %7, %8 + %12 = sext i32 %1 to i64 + %13 = add nsw i64 %11, %12 + %14 = sext i32 %2 to i64 + %15 = add nsw i64 %13, %14 + %16 = sext i32 %3 to i64 + %17 = add nsw i64 %15, %16 + %18 = sext i32 %4 to i64 + %19 = add nsw i64 %17, %18 + %20 = add nsw i64 %19, %9 + %21 = add nsw i64 %20, %10 + ret i64 %21 +} + +; CHECK-LABEL: foo3: +; CHECK: r[[#]] = *(u64 *)(r11 + 8) +; CHECK: r[[#]] = *(u64 *)(r11 + 16) +; CHECK: r[[#]] = *(u64 *)(r11 + 24) + +define dso_local i64 @foo4(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, [2 x i64] %5, [2 x i64] %6) local_unnamed_addr { + %8 = extractvalue [2 x i64] %5, 0 + %9 = extractvalue [2 x i64] %5, 1 + %10 = extractvalue [2 x i64] %6, 0 + %11 = extractvalue [2 x i64] %6, 1 + %12 = add nsw i32 %1, %0 + %13 = add nsw i32 %12, %2 + %14 = add nsw i32 %13, %3 + %15 = add nsw i32 %14, %4 + %16 = sext i32 %15 to i64 + %17 = add nsw i64 %8, %16 + %18 = add nsw i64 %17, %9 + %19 = add nsw i64 %18, %10 + %20 = add nsw i64 %19, %11 + ret i64 %20 +} + +; CHECK-LABEL: foo4: +; CHECK: r[[#]] = *(u64 *)(r11 + 8) +; CHECK: r[[#]] = *(u64 *)(r11 + 16) +; CHECK: r[[#]] = *(u64 *)(r11 + 24) +; CHECK: r[[#]] = *(u64 *)(r11 + 32) + +define dso_local i64 @foo5(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr 
{ + %4 = trunc i32 %0 to i16 + %5 = sext i32 %1 to i64 + %6 = tail call i64 @bar5(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %2, i16 noundef signext %4, i64 noundef %5) + ret i64 %6 +} + +; CHECK-LABEL: foo5: +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK: *(u64 *)(r11 - 16) = r[[#]] + +declare dso_local i64 @bar5(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i16 noundef signext, i64 noundef) local_unnamed_addr + +define dso_local i64 @foo6(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { + %4 = sext i32 %0 to i64 + %5 = sext i32 %1 to i64 + %6 = insertvalue [2 x i64] poison, i64 %4, 0 + %7 = insertvalue [2 x i64] %6, i64 %5, 1 + %8 = tail call i64 @bar6(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %2, [2 x i64] %7, i32 noundef %1) + ret i64 %8 +} + +; CHECK-LABEL: foo6: +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK: *(u64 *)(r11 - 16) = r[[#]] +; CHECK: *(u64 *)(r11 - 24) = r[[#]] + +declare dso_local i64 @bar6(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64], i32 noundef) local_unnamed_addr + +define dso_local i64 @foo7(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { + %4 = sext i32 %0 to i64 + %5 = sext i32 %1 to i64 + %6 = sext i32 %2 to i64 + %7 = insertvalue [2 x i64] poison, i64 %4, 0 + %8 = insertvalue [2 x i64] %7, i64 %5, 1 + %9 = insertvalue [2 x i64] poison, i64 %5, 0 + %10 = insertvalue [2 x i64] %9, i64 %6, 1 + %11 = tail call i64 @bar7([2 x i64] %8, i32 noundef %2, i32 noundef %1, i32 noundef %0, i32 noundef %1, [2 x i64] %10) + ret i64 %11 +} + +; CHECK-LABEL: foo7: +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK: *(u64 *)(r11 - 16) = r[[#]] +; CHECK: *(u64 *)(r11 - 24) = r[[#]] + +declare dso_local i64 @bar7([2 x i64], i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) local_unnamed_addr + +define dso_local i64 @foo8(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { + %4 = 
sext i32 %2 to i64 + %5 = sext i32 %1 to i64 + %6 = insertvalue [2 x i64] poison, i64 %4, 0 + %7 = insertvalue [2 x i64] %6, i64 %5, 1 + %8 = insertvalue [2 x i64] poison, i64 %5, 0 + %9 = insertvalue [2 x i64] %8, i64 %4, 1 + %10 = tail call i64 @bar8(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %2, [2 x i64] %7, [2 x i64] %9) + ret i64 %10 +} + +; CHECK-LABEL: foo8: +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK: *(u64 *)(r11 - 16) = r[[#]] +; CHECK: *(u64 *)(r11 - 24) = r[[#]] +; CHECK: *(u64 *)(r11 - 32) = r[[#]] + +; CHECK-NOT: *(u64 *)(r11 - 40) = r[[#]] + +declare dso_local i64 @bar8(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64], [2 x i64]) local_unnamed_addr diff --git a/llvm/test/CodeGen/BPF/many_args4.ll b/llvm/test/CodeGen/BPF/many_args4.ll new file mode 100644 index 0000000000000..19cfc91eaa089 --- /dev/null +++ b/llvm/test/CodeGen/BPF/many_args4.ll @@ -0,0 +1,65 @@ +; RUN: llc < %s -mtriple=bpf -mcpu=v1 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v2 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v3 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v4 | FileCheck %s + +; Source code: +; __attribute__((noinline)) long foo1(int a, int b, int c, int d, int e, int f) { +; return a + b + c + d + e + f; +; } +; +; __attribute__((noinline)) long foo2(int a, int b, int c, int d, int e, int f, int g) { +; return a + b + c + d + e + f + g; +; } +; +; long bar(int a, int b, int c, int d, int e, int f, int g) { +; long r1 = foo1(a, b, c, d, e, f + g); +; long r2 = foo2(a, b, c, d, e, f, g); +; return r1 + r2; +; } + + +define dso_local range(i64 -2147483648, 2147483648) i64 @foo1(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) local_unnamed_addr { + %7 = add nsw i32 %1, %0 + %8 = add nsw i32 %7, %2 + %9 = add nsw i32 %8, %3 + %10 = add nsw i32 %9, %4 + %11 = add nsw i32 %10, %5 + %12 = sext i32 %11 to i64 + ret i64 %12 +} + +; CHECK-LABEL: foo1: +; 
CHECK: r[[#]] = *(u64 *)(r11 + 8) + +define dso_local range(i64 -2147483648, 2147483648) i64 @foo2(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6) local_unnamed_addr { + %8 = add nsw i32 %1, %0 + %9 = add nsw i32 %8, %2 + %10 = add nsw i32 %9, %3 + %11 = add nsw i32 %10, %4 + %12 = add nsw i32 %11, %5 + %13 = add nsw i32 %12, %6 + %14 = sext i32 %13 to i64 + ret i64 %14 +} + +; CHECK-LABEL: foo2: +; CHECK: r[[#]] = *(u64 *)(r11 + 8) +; CHECK: r[[#]] = *(u64 *)(r11 + 16) + +define dso_local range(i64 -4294967296, 4294967295) i64 @bar(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6) local_unnamed_addr { + %8 = add nsw i32 %6, %5 + %9 = tail call i64 @foo1(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %8) + %10 = tail call i64 @foo2(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6) + %11 = add nsw i64 %10, %9 + ret i64 %11 +} + +; CHECK-LABEL: bar: +; CHECK: r[[#]] = *(u64 *)(r11 + 8) +; CHECK: r[[#]] = *(u64 *)(r11 + 16) +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK: call foo1 +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK: *(u64 *)(r11 - 16) = r[[#]] +; CHECK: call foo2 diff --git a/llvm/test/CodeGen/BPF/many_args5.ll b/llvm/test/CodeGen/BPF/many_args5.ll new file mode 100644 index 0000000000000..415c56222b7f4 --- /dev/null +++ b/llvm/test/CodeGen/BPF/many_args5.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=bpf -mcpu=v1 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v2 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v3 | FileCheck %s +; RUN: llc < %s -mtriple=bpf -mcpu=v4 | FileCheck --check-prefix=CHECK-V4 %s + +; Source code: +; long foo(int, int, int, int, int, long); +; long bar(int a, int b, int c, int d, int e) { +; return foo(a, b, c, d, e, 16) + foo(a, b, c, d, e, 0xffFFffFF); +; } + +define dso_local i64 @bar(i32 noundef 
%0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4) local_unnamed_addr { + %6 = tail call i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i64 noundef 16) + %7 = tail call i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, i32 noundef %4, i64 noundef 4294967295) + %8 = add nsw i64 %7, %6 + ret i64 %8 +} + +declare dso_local i64 @foo(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef) local_unnamed_addr + +; CHECK-LABEL: bar: +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK-V4: *(u64 *)(r11 - 8) = 16 +; CHECK: call foo +; CHECK-V4: call foo +; CHECK: *(u64 *)(r11 - 8) = r[[#]] +; CHECK-V4: *(u64 *)(r11 - 8) = r[[#]] diff --git a/llvm/test/CodeGen/BPF/many_args6.ll b/llvm/test/CodeGen/BPF/many_args6.ll new file mode 100644 index 0000000000000..8a2b4a939dc8d --- /dev/null +++ b/llvm/test/CodeGen/BPF/many_args6.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple=bpf -mcpu=v1 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v2 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v3 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v4 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v1 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s +; RUN: llc -mtriple=bpf -mcpu=v2 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s +; RUN: llc -mtriple=bpf -mcpu=v3 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s +; RUN: llc -mtriple=bpf -mcpu=v4 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s + +; Source code: +; struct t { long a; long b; }; +; +; long foo(int a1, int a2, int a3, int a4, struct t a5) { +; return a1 + a2 + a3 + a4 + a5.a + a5.b; +; } + +define dso_local i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3, [2 x i64] %4) local_unnamed_addr { + %6 = extractvalue [2 x i64] %4, 0 + %7 = extractvalue [2 x i64] %4, 1 + %8 = add nsw i32 %1, %0 + %9 = add nsw i32 
%8, %2 + %10 = add nsw i32 %9, %3 + %11 = sext i32 %10 to i64 + %12 = add nsw i64 %6, %11 + %13 = add nsw i64 %12, %7 + ret i64 %13 +} + +; The struct a5 is split: first half in r5, second half on stack. +; CHECK-LABEL: foo: +; CHECK-OFF-8: r[[#]] = *(u64 *)(r11 + 8) +; CHECK-OFF-16-NOT: r[[#]] = *(u64 *)(r11 + 16) diff --git a/llvm/test/CodeGen/BPF/many_args7.ll b/llvm/test/CodeGen/BPF/many_args7.ll new file mode 100644 index 0000000000000..3e170a53ef151 --- /dev/null +++ b/llvm/test/CodeGen/BPF/many_args7.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=bpf -mcpu=v1 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v2 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v3 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v4 < %s | FileCheck --check-prefix=CHECK-OFF-8 %s +; RUN: llc -mtriple=bpf -mcpu=v1 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s +; RUN: llc -mtriple=bpf -mcpu=v2 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s +; RUN: llc -mtriple=bpf -mcpu=v3 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s +; RUN: llc -mtriple=bpf -mcpu=v4 < %s | FileCheck --check-prefix=CHECK-OFF-16 %s + +; Source code: +; struct t { long a; long b; }; +; +; long bar(int a1, int a2, int a3, int a4, struct t a5); +; long foo(int a1, int a2, int a3) { +; struct t tmp = {a1, a2}; +; return bar(a1, a2, a3, a2, tmp); +; } + +define dso_local i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { + %4 = sext i32 %0 to i64 + %5 = sext i32 %1 to i64 + %6 = insertvalue [2 x i64] poison, i64 %4, 0 + %7 = insertvalue [2 x i64] %6, i64 %5, 1 + %8 = tail call i64 @bar(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, [2 x i64] %7) + ret i64 %8 +} + +; The struct a5 is split: first half in r5, second half on stack. 
+; CHECK-LABEL: foo: +; CHECK-OFF-8: *(u64 *)(r11 - 8) = r[[#]] +; CHECK-OFF-16-NOT: *(u64 *)(r11 - 16) = r[[#]] + +declare dso_local i64 @bar(i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64]) local_unnamed_addr diff --git a/llvm/test/CodeGen/BPF/many_args8.ll b/llvm/test/CodeGen/BPF/many_args8.ll new file mode 100644 index 0000000000000..dcb543886620b --- /dev/null +++ b/llvm/test/CodeGen/BPF/many_args8.ll @@ -0,0 +1,36 @@ +; RUN: not llc -mtriple=bpf -mcpu=v3 < %s 2> %t1 +; RUN: FileCheck %s < %t1 +; CHECK: error: :0:0: in function foo i64 (i32, i32, i32): {{(0x[0-9a-fA-F]+|t[0-9]+)}}: i64 = GlobalAddress 0 pass by value not supported + +; Source code: +; struct t { long a; long b; long c;}; +; +; long bar(int a1, int a2, int a3, int a4, int a5, struct t a6); +; long foo(int a1, int a2, int a3) { +; struct t tmp = {a1, a2, a3}; +; return bar(a1, a2, a3, a2, a1, tmp); +; } + +%struct.t = type { i64, i64, i64 } + +define dso_local i64 @foo(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr { + %4 = alloca %struct.t, align 8 + call void @llvm.lifetime.start.p0(ptr nonnull %4) + %5 = sext i32 %0 to i64 + store i64 %5, ptr %4, align 8 + %6 = getelementptr inbounds nuw i8, ptr %4, i64 8 + %7 = sext i32 %1 to i64 + store i64 %7, ptr %6, align 8 + %8 = getelementptr inbounds nuw i8, ptr %4, i64 16 + %9 = sext i32 %2 to i64 + store i64 %9, ptr %8, align 8 + %10 = tail call i64 @bar(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %1, i32 noundef %0, ptr noundef nonnull byval(%struct.t) align 8 %4) + call void @llvm.lifetime.end.p0(ptr nonnull %4) + ret i64 %10 +} + +declare void @llvm.lifetime.start.p0(ptr captures(none)) + +declare dso_local i64 @bar(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, ptr noundef byval(%struct.t) align 8) local_unnamed_addr + +declare void @llvm.lifetime.end.p0(ptr captures(none)) From a2942d472aac907af6f47f8c7658288609b6e1de Mon Sep 17 00:00:00 2001 From: AbdallahRashed 
<63146988+AbdallahRashed@users.noreply.github.com> Date: Sat, 9 May 2026 15:13:53 +0200 Subject: [PATCH 140/538] [VectorCombine] foldShuffleChainsToReduce - add support for partial vector reductions (#195119) Extend foldShuffleChainsToReduce to recognize partial reduction patterns where only a subvector of the full vector is being reduced. For example, a <16 x i16> vector where the shuffle chain only reduces the lower 8 elements can now be folded into: shufflevector (extract lower <8 x i16>) + vector.reduce.smax The detection works by noticing when the bottom-up walk through the shuffle/op chain ends before consuming the full vector. The number of levels visited determines the subvector size (2^levels), and an extract_subvector + scalar reduction replaces the original chain when profitable. Fixes #194617 --- .../Transforms/Vectorize/VectorCombine.cpp | 39 ++++++++++++--- .../X86/horizontal-reduce-smax.ll | 40 +++------------ .../X86/horizontal-reduce-smin.ll | 40 +++------------ .../X86/horizontal-reduce-umax.ll | 40 +++------------ .../X86/horizontal-reduce-umin.ll | 40 +++------------ .../fold-shuffle-chains-to-reduce.ll | 50 +++++++++++++++++++ 6 files changed, 115 insertions(+), 134 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 5ba344ea9a808..10ad3a71c73de 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3995,6 +3995,8 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { InstWorklist.push(VecOpEE); + bool IsPartialReduction = false; + while (!InstWorklist.empty()) { Value *CI = InstWorklist.front(); InstWorklist.pop(); @@ -4125,12 +4127,19 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { ShouldBeCallOrBinInst ^= 1; } else { + // Check if this is a partial reduction - the chain ended because + // the source vector is not a recognized op/shuffle. 
+ if (ShouldBeCallOrBinInst && VisitedCnt >= 1 && CI == PrevVecV[0]) { + IsPartialReduction = true; + break; + } return false; } } - // Pattern should end with a shuffle op. - if (ShouldBeCallOrBinInst) + // Full reduction pattern should end with a shuffle op. + // Partial reduction ends when the source vector is reached. + if (ShouldBeCallOrBinInst && !IsPartialReduction) return false; assert(VecSize != -1 && "Expected Match for Vector Size"); @@ -4147,14 +4156,32 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { if (!ReducedOp) return false; - IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV}); - InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind); + InstructionCost NewCost = 0; + FixedVectorType *ReduceVecTy = FinalVecVTy; + SmallVector ExtractMask; + + if (IsPartialReduction) { + unsigned SubVecSize = ShuffleMaskHalf; + ReduceVecTy = FixedVectorType::get(FVT->getElementType(), SubVecSize); + ExtractMask.resize(SubVecSize); + std::iota(ExtractMask.begin(), ExtractMask.end(), 0); + NewCost += + TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + ReduceVecTy, FinalVecVTy, ExtractMask, CostKind, 0); + } + + IntrinsicCostAttributes ICA(ReducedOp, ReduceVecTy, {ReduceVecTy}); + NewCost += TTI.getIntrinsicInstrCost(ICA, CostKind); if (NewCost >= OrigCost) return false; - auto *ReducedResult = - Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV}); + Value *ReduceInput = FinalVecV; + if (IsPartialReduction) + ReduceInput = Builder.CreateShuffleVector(FinalVecV, ExtractMask); + + auto *ReducedResult = Builder.CreateIntrinsic( + ReducedOp, {ReduceInput->getType()}, {ReduceInput}); replaceValue(I, *ReducedResult); return true; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll index 85186dba0891f..ec8cd82b96a37 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll +++ 
b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll @@ -314,13 +314,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -338,13 +333,8 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[TMP4]], <32 x i16> 
[[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -362,15 +352,8 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -391,15 +374,8 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call 
<64 x i8> @llvm.smax.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll index 80c2929b5d5cf..650947d240ace 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll @@ -314,13 +314,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x 
i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -338,13 +333,8 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -362,15 +352,8 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> 
[[A0]], <32 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -391,15 +374,8 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement 
<64 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll index dbb448c4b96e5..f7d5a99bc0da0 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll @@ -314,13 +314,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -338,13 +333,8 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -362,15 +352,8 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 +; 
CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -391,15 +374,8 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll index bd2366d49a951..e2fc523dd271c 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll @@ -314,13 +314,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) 
{ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -338,13 +333,8 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> 
[[A0:%.*]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -362,15 +352,8 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -391,15 +374,8 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x 
i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[TMP1]]) ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll index 403ce33b5344e..71809534016d1 100644 --- a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll +++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -193,3 +193,53 @@ define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) { %7 = extractelement <6 x i16> %6, i64 0 ret i16 %7 } + +; Partial reduction: reduce lower 8 elements of a 16-element vector using smax. 
+define i16 @test_partial_reduce_v16i16_v8i16_smax(<16 x i16> %a0) { +; CHECK-LABEL: define i16 @test_partial_reduce_v16i16_v8i16_smax( +; CHECK-SAME: <16 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: ret i16 [[TMP2]] +; + %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> + %2 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %a0, <16 x i16> %1) + %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> + %4 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %2, <16 x i16> %3) + %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> + %6 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %4, <16 x i16> %5) + %7 = extractelement <16 x i16> %6, i64 0 + ret i16 %7 +} + +; Partial reduction: reduce lower 4 elements of an 8-element vector using add. +define i32 @test_partial_reduce_v8i32_v4i32_add(<8 x i32> %a0) { +; CHECK-LABEL: define i32 @test_partial_reduce_v8i32_v4i32_add( +; CHECK-SAME: <8 x i32> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; + %1 = shufflevector <8 x i32> %a0, <8 x i32> poison, <8 x i32> + %2 = add <8 x i32> %a0, %1 + %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <8 x i32> + %4 = add <8 x i32> %2, %3 + %5 = extractelement <8 x i32> %4, i64 0 + ret i32 %5 +} + +; Partial reduction: reduce lower 4 elements of a 16-element vector using umin. 
+define i16 @test_partial_reduce_v16i16_v4i16_umin(<16 x i16> %a0) { +; CHECK-LABEL: define i16 @test_partial_reduce_v16i16_v4i16_umin( +; CHECK-SAME: <16 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[TMP1]]) +; CHECK-NEXT: ret i16 [[TMP2]] +; + %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> + %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1) + %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> + %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3) + %5 = extractelement <16 x i16> %4, i64 0 + ret i16 %5 +} From 6004c174cd548eb00fc003a30f90be85dd430827 Mon Sep 17 00:00:00 2001 From: flovent Date: Sat, 9 May 2026 21:59:04 +0800 Subject: [PATCH 141/538] [clang-tidy] Correct `std::has_one_bit` to `std::has_single_bit` in `modernize-use-std-bit` (#196721) There is no `std::has_one_bit` in the standard library; the function that checks whether a number is an integral power of 2 is `std::has_single_bit`.
https://en.cppreference.com/cpp/header/bit --- .../clang-tidy/modernize/UseStdBitCheck.cpp | 11 ++-- .../checks/modernize/use-std-bit.rst | 12 ++--- .../checkers/modernize/use-std-bit.cpp | 50 +++++++++---------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdBitCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdBitCheck.cpp index 0abe7d6323bdf..b534757e89434 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdBitCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdBitCheck.cpp @@ -71,13 +71,13 @@ void UseStdBitCheck::registerMatchers(MatchFinder *Finder) { }; // Determining if an integer is a power of 2 with following pattern: - // has_one_bit(v) = v && !(v & (v - 1)); + // has_single_bit(v) = v && !(v & (v - 1)); Finder->addMatcher( LogicalAnd(IsNonNull(BindDeclRef("v")), LogicalNot(BitwiseAnd( BoundDeclRef("v"), Sub(BoundDeclRef("v"), integerLiteral(equals(1)))))) - .bind("has_one_bit_expr"), + .bind("has_single_bit_expr"), this); // Computing popcount with following pattern: @@ -120,16 +120,17 @@ void UseStdBitCheck::check(const MatchFinder::MatchResult &Result) { const SourceManager &Source = Context.getSourceManager(); if (const auto *MatchedExpr = - Result.Nodes.getNodeAs("has_one_bit_expr")) { + Result.Nodes.getNodeAs("has_single_bit_expr")) { const auto *MatchedVarDecl = Result.Nodes.getNodeAs("v"); auto Diag = - diag(MatchedExpr->getBeginLoc(), "use 'std::has_one_bit' instead"); + diag(MatchedExpr->getBeginLoc(), "use 'std::has_single_bit' instead"); if (auto R = MatchedExpr->getSourceRange(); !R.getBegin().isMacroID() && !R.getEnd().isMacroID()) { Diag << FixItHint::CreateReplacement( MatchedExpr->getSourceRange(), - ("std::has_one_bit(" + MatchedVarDecl->getName() + ")").str()) + ("std::has_single_bit(" + MatchedVarDecl->getName() + ")") + .str()) << IncludeInserter.createIncludeInsertion( Source.getFileID(MatchedExpr->getBeginLoc()), ""); } diff --git 
a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-bit.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-bit.rst index 36848b7e3d228..9f90c1aa06cec 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-bit.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-bit.rst @@ -8,16 +8,16 @@ Finds common idioms which can be replaced by standard functions from the Covered scenarios: -============================== ======================= +============================== ========================== Expression Replacement ------------------------------- ----------------------- -``x && !(x & (x - 1))`` ``std::has_one_bit(x)`` -``(x != 0) && !(x & (x - 1))`` ``std::has_one_bit(x)`` -``(x > 0) && !(x & (x - 1))`` ``std::has_one_bit(x)`` +------------------------------ -------------------------- +``x && !(x & (x - 1))`` ``std::has_single_bit(x)`` +``(x != 0) && !(x & (x - 1))`` ``std::has_single_bit(x)`` +``(x > 0) && !(x & (x - 1))`` ``std::has_single_bit(x)`` ``std::bitset(x).count()`` ``std::popcount(x)`` ``x << 3 | x >> 61`` ``std::rotl(x, 3)`` ``x << 61 | x >> 3`` ``std::rotr(x, 3)`` -============================== ======================= +============================== ========================== Options ------- diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-bit.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-bit.cpp index 615d1202e8092..d1ca194424d20 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-bit.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-bit.cpp @@ -6,80 +6,80 @@ * has_one_bit pattern */ unsigned has_one_bit_bithack(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return 
std::has_single_bit(x); return x && !(x & (x - 1)); } unsigned long has_one_bit_bithack(unsigned long x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return x && !(x & (x - 1)); } unsigned short has_one_bit_bithack(unsigned short x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return x && !(x & (x - 1)); } unsigned has_one_bit_bithack_perm(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return x && !((x - 1) & (x)); } unsigned has_one_bit_bithack_otherperm(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return !((x - 1) & (x)) && x; } unsigned has_one_bit_bithack_variant_neq(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (x != 0) && !(x & (x - 1)); } unsigned 
has_one_bit_bithack_variant_neq_perm(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (x != 0) && !(x & (x - 1)); } unsigned has_one_bit_bithack_variant_gt(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (x > 0) && !(x & (x - 1)); } unsigned has_one_bit_bithacks_variant_gte(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (x >= 1) && !(x & (x - 1)); } unsigned has_one_bit_bithacks_variant_lt(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (0 < x) && !(x & (x - 1)); } unsigned has_one_bit_bithacks_variant_lte(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (1 <= x) && !(x & (x - 1)); } unsigned 
has_one_bit_bithack_variant_gt_perm(unsigned x) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] - // CHECK-FIXES: return std::has_one_bit(x); + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] + // CHECK-FIXES: return std::has_single_bit(x); return (x > 0) && !(x & (x - 1)); } #define HAS_ONE_BIT v && !(v & (v - 1)) unsigned has_one_bit_bithack_macro(unsigned v) { - // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_one_bit' instead [modernize-use-std-bit] + // CHECK-MESSAGES: :[[@LINE+2]]:10: warning: use 'std::has_single_bit' instead [modernize-use-std-bit] // No fixes, it comes from macro expansion. return HAS_ONE_BIT; } From c2f7e989431dea8eee88be3ba73fb6c89859ca6a Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Sat, 9 May 2026 23:00:21 +0800 Subject: [PATCH 142/538] [SelectionDAG] Don't convert sextload to zextload through a multi-use freeze (#196700) Resolves #196590. The patch https://github.com/llvm/llvm-project/pull/189317 to teach DAGCombiner to look through freeze incorrectly introduced a miscompile of sext -> zext. This resolves the miscompile. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++- .../CodeGen/X86/reduce-load-width-freeze.ll | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5a467a5a5ba53..a5f7a5ae330f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16545,7 +16545,9 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { // the freeze can depend on the full load value. But its still safe to change // the extension type from anyext to zext. 
if (FreezeNode && !FreezeNode.hasOneUse() && - (LN0->getMemoryVT().bitsGT(ExtVT) || ExtType != ISD::ZEXTLOAD)) + (LN0->getMemoryVT().bitsGT(ExtVT) || ExtType != ISD::ZEXTLOAD || + (LN0->getExtensionType() != ISD::EXTLOAD && + LN0->getExtensionType() != ISD::ZEXTLOAD))) return SDValue(); auto AdjustBigEndianShift = [&](unsigned ShAmt) { diff --git a/llvm/test/CodeGen/X86/reduce-load-width-freeze.ll b/llvm/test/CodeGen/X86/reduce-load-width-freeze.ll index 555ea5d069d85..8c6f463647fa2 100644 --- a/llvm/test/CodeGen/X86/reduce-load-width-freeze.ll +++ b/llvm/test/CodeGen/X86/reduce-load-width-freeze.ll @@ -354,3 +354,32 @@ define i16 @srl_freeze_load_i64_to_i16(ptr %p) { %trunc = trunc i64 %srl to i16 ret i16 %trunc } + +@g6 = global i8 0 +@g1 = global i16 0 + +; no incorrect sext -> zext +define i1 @issue196590() { +; CHECK-LABEL: issue196590: +; CHECK: # %bb.0: +; CHECK-NEXT: movq g6@GOTPCREL(%rip), %rax +; CHECK-NEXT: movsbl (%rax), %eax +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: movq g1@GOTPCREL(%rip), %rdx +; CHECK-NEXT: movw %cx, (%rdx) +; CHECK-NEXT: leal (%rax,%rax), %ecx +; CHECK-NEXT: cmpl %eax, %ecx +; CHECK-NEXT: setg %al +; CHECK-NEXT: retq + %a = load i8, ptr @g6 + %zx = zext i8 %a to i16 + store i16 %zx, ptr @g1 + %sx = sext i8 %a to i32 + %b = load i8, ptr @g6 + %fr = freeze i8 %b + %fr16 = sext i8 %fr to i16 + %add = add i16 %fr16, %fr16 + %selsx = sext i16 %add to i32 + %cmp = icmp sgt i32 %selsx, %sx + ret i1 %cmp +} From 2caea408ab90333d86efc857a3924315f3bc90ab Mon Sep 17 00:00:00 2001 From: "A. 
Jiang" Date: Sat, 9 May 2026 23:53:26 +0800 Subject: [PATCH 143/538] [libc++] LWG4324: `unique_ptr::operator*` is not SFINAE-friendly (#190919) --------- Co-authored-by: Hristo Hristov --- libcxx/docs/Status/Cxx2cIssues.csv | 2 +- libcxx/include/__memory/unique_ptr.h | 7 +++++++ .../dereference.single.pass.cpp | 13 +++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index c875f75528894..42ab96ef5657a 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -280,7 +280,7 @@ "`LWG4259 `__","P1148R0 changed the return values of searching functions of ``std::basic_string`` on some platforms","2026-03 (Croydon)","","","`#189822 `__","" "`LWG4290 `__","Missing *Mandates* clauses on ``is_sufficiently_aligned``","2026-03 (Croydon)","","","`#189823 `__","" "`LWG4314 `__","Missing move in ``mdspan`` layout ``mapping::operator()``","2026-03 (Croydon)","","","`#189824 `__","" -"`LWG4324 `__","``unique_ptr::operator*`` is not SFINAE-friendly","2026-03 (Croydon)","","","`#189825 `__","" +"`LWG4324 `__","``unique_ptr::operator*`` is not SFINAE-friendly","2026-03 (Croydon)","|Complete|","23","`#189825 `__","" "`LWG4325 `__","``std::indirect``'s ``operator==`` still does not support incomplete types","2026-03 (Croydon)","","","`#189826 `__","" "`LWG4339 `__","``task``'s coroutine frame may be released late","2026-03 (Croydon)","","","`#189827 `__","" "`LWG4347 `__","``task``'s stop source is always created","2026-03 (Croydon)","","","`#189828 `__","" diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index 6a4ec0a466ba7..8fba2a4a1509d 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -117,6 +117,12 @@ struct __unique_ptr_deleter_sfinae<_Deleter&> { typedef false_type __enable_rval_overload; }; +template +inline const bool __can_dereference = false; + +template +inline 
const bool __can_dereference<_Tp, decltype((void)*std::declval<_Tp>())> = true; + #if defined(_LIBCPP_ABI_ENABLE_UNIQUE_PTR_TRIVIAL_ABI) # define _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__)) #else @@ -258,6 +264,7 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr { return *this; } + template , int> = 0> [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __add_lvalue_reference_t<_Tp> operator*() const _NOEXCEPT_(_NOEXCEPT_(*std::declval())) { return *__ptr_; diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp index 4063190838a27..b9d8a1b563b6c 100644 --- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp +++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.observers/dereference.single.pass.cpp @@ -12,8 +12,11 @@ // test op*() +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include +#include #include #include @@ -31,6 +34,16 @@ struct Deleter { }; #endif +template +struct can_dereference : std::false_type {}; +template +struct can_dereference())> : std::true_type {}; + +static_assert(can_dereference >::value, ""); +static_assert(can_dereference&>::value, ""); +static_assert(!can_dereference >::value, ""); +static_assert(!can_dereference&>::value, ""); + TEST_CONSTEXPR_CXX23 bool test() { { std::unique_ptr p(new int(3)); From cb5d07624ef90d9b6a88987ef4316facde23c787 Mon Sep 17 00:00:00 2001 From: stativ Date: Sat, 9 May 2026 21:03:09 +0200 Subject: [PATCH 144/538] [clang-format] Add BreakFunctionDeclarationParameters option. (#196567) Adds an option to break function declaration parameters, always putting them on the next line after the function opening parentheses. 
This is an equivalent of `BreakFunctionDefinitionParameters`, but for function declarations. --------- Co-authored-by: Lukas Jirkovsky --- clang/docs/ClangFormatStyleOptions.rst | 15 +++++++++++ clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Format/Format.h | 16 ++++++++++++ clang/lib/Format/Format.cpp | 3 +++ clang/lib/Format/TokenAnnotator.cpp | 6 +++++ clang/unittests/Format/AlignBracketsTest.cpp | 9 +++++++ clang/unittests/Format/ConfigParseTest.cpp | 1 + clang/unittests/Format/FormatTest.cpp | 27 ++++++++++++++++++++ 8 files changed, 79 insertions(+) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index d492f2364cf74..61f27bcf9dbbc 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3858,6 +3858,21 @@ the configuration (without a prefix: ``Auto``). +.. _BreakFunctionDeclarationParameters: + +**BreakFunctionDeclarationParameters** (``Boolean``) :versionbadge:`clang-format 23` :ref:`¶ ` + If ``true``, clang-format will always break before function declaration + parameters. + + .. code-block:: c++ + + true: + void functionDeclaration( + int A, int B); + + false: + void functionDeclaration(int A, int B); + .. _BreakFunctionDefinitionParameters: **BreakFunctionDefinitionParameters** (``Boolean``) :versionbadge:`clang-format 19` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index fa19d4b576575..0d43f864653cc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -748,6 +748,8 @@ clang-format - Extend ``BreakBinaryOperations`` to accept a structured configuration with per-operator break rules and minimum chain length gating via ``PerOperator``. - Add ``AllowShortRecordOnASingleLine`` option and set it to ``EmptyAndAttached`` for LLVM style. +- Add ``BreakFunctionDeclarationParameters`` option to always break before function + declaration parameters. 
libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 98400a1609b6a..0e883837ac0e9 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2644,6 +2644,20 @@ struct FormatStyle { /// \version 5 BreakConstructorInitializersStyle BreakConstructorInitializers; + /// If ``true``, clang-format will always break before function declaration + /// parameters. + /// \code + /// true: + /// void functionDeclaration( + /// int A, int B); + /// + /// false: + /// void functionDeclaration(int A, int B); + /// + /// \endcode + /// \version 23 + bool BreakFunctionDeclarationParameters; + /// If ``true``, clang-format will always break before function definition /// parameters. /// \code @@ -6076,6 +6090,8 @@ struct FormatStyle { BreakBeforeTernaryOperators == R.BreakBeforeTernaryOperators && BreakBinaryOperations == R.BreakBinaryOperations && BreakConstructorInitializers == R.BreakConstructorInitializers && + BreakFunctionDeclarationParameters == + R.BreakFunctionDeclarationParameters && BreakFunctionDefinitionParameters == R.BreakFunctionDefinitionParameters && BreakInheritanceList == R.BreakInheritanceList && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 2147a812e27c1..74b31810843fc 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1318,6 +1318,8 @@ template <> struct MappingTraits { IO.mapOptional("BreakBinaryOperations", Style.BreakBinaryOperations); IO.mapOptional("BreakConstructorInitializers", Style.BreakConstructorInitializers); + IO.mapOptional("BreakFunctionDeclarationParameters", + Style.BreakFunctionDeclarationParameters); IO.mapOptional("BreakFunctionDefinitionParameters", Style.BreakFunctionDefinitionParameters); IO.mapOptional("BreakInheritanceList", Style.BreakInheritanceList); @@ -1885,6 +1887,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.BreakBeforeTernaryOperators = true; 
LLVMStyle.BreakBinaryOperations = {FormatStyle::BBO_Never, {}}; LLVMStyle.BreakConstructorInitializers = FormatStyle::BCIS_BeforeColon; + LLVMStyle.BreakFunctionDeclarationParameters = false; LLVMStyle.BreakFunctionDefinitionParameters = false; LLVMStyle.BreakInheritanceList = FormatStyle::BILS_BeforeColon; LLVMStyle.BreakStringLiterals = true; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 898759cb8ea1b..640f03a4ac130 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5758,6 +5758,12 @@ bool TokenAnnotator::mustBreakBefore(AnnotatedLine &Line, const FormatToken &Left = *Right.Previous; + if (Style.BreakFunctionDeclarationParameters && Line.MightBeFunctionDecl && + !Line.mightBeFunctionDefinition() && Left.MightBeFunctionDeclParen && + Left.ParameterCount > 0) { + return true; + } + if (Style.BreakFunctionDefinitionParameters && Line.MightBeFunctionDecl && Line.mightBeFunctionDefinition() && Left.MightBeFunctionDeclParen && Left.ParameterCount > 0) { diff --git a/clang/unittests/Format/AlignBracketsTest.cpp b/clang/unittests/Format/AlignBracketsTest.cpp index cd314305751e7..fcfcae20e3e11 100644 --- a/clang/unittests/Format/AlignBracketsTest.cpp +++ b/clang/unittests/Format/AlignBracketsTest.cpp @@ -731,6 +731,15 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) { "}", BreakAlways); + // Ensure BreakFunctionDeclarationParameters interacts correctly when + // PackParameters.BinPack is set to BPPS_AlwaysOnePerLine. + BreakAlways.BreakFunctionDeclarationParameters = true; + verifyFormat("void f(\n" + " int a,\n" + " int b);", + BreakAlways); + BreakAlways.BreakFunctionDeclarationParameters = false; + // Ensure BreakFunctionDefinitionParameters interacts correctly when // PackParameters.BinPack is set to BPPS_AlwaysOnePerLine. 
BreakAlways.BreakFunctionDefinitionParameters = true; diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 64b0e8702872c..ccb9c837d8362 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -183,6 +183,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(BreakBeforeCloseBracketSwitch); CHECK_PARSE_BOOL(BreakBeforeTemplateCloser); CHECK_PARSE_BOOL(BreakBeforeTernaryOperators); + CHECK_PARSE_BOOL(BreakFunctionDeclarationParameters); CHECK_PARSE_BOOL(BreakStringLiterals); CHECK_PARSE_BOOL(CompactNamespaces); CHECK_PARSE_BOOL(DerivePointerAlignment); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index f5e496652e15e..4245bd1c58153 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -8119,6 +8119,33 @@ TEST_F(FormatTest, AllowAllArgumentsOnNextLine) { Style); } +TEST_F(FormatTest, BreakFunctionDeclarationParameters) { + StringRef Input = "void functionDecl(int A, int B, int C);\n" + "void emptyFunctionDecl();\n" + "void functionDefinition(int A, int B, int C) {}"; + verifyFormat(Input); + + FormatStyle Style = getLLVMStyle(); + EXPECT_FALSE(Style.BreakFunctionDeclarationParameters); + Style.BreakFunctionDeclarationParameters = true; + verifyFormat("void functionDecl(\n" + " int A, int B, int C);\n" + "void emptyFunctionDecl();\n" + "void functionDefinition(int A, int B, int C) {}", + Input, Style); + + // Test the style where all parameters are on their own lines. 
+ Style.AllowAllParametersOfDeclarationOnNextLine = false; + Style.PackParameters.BinPack = FormatStyle::BPPS_OnePerLine; + verifyFormat("void functionDecl(\n" + " int A,\n" + " int B,\n" + " int C);\n" + "void emptyFunctionDecl();\n" + "void functionDefinition(int A, int B, int C) {}", + Input, Style); +} + TEST_F(FormatTest, BreakFunctionDefinitionParameters) { StringRef Input = "void functionDecl(paramA, paramB, paramC);\n" "void emptyFunctionDefinition() {}\n" From 615a7e0f772bb3ef09b5b008075ad665876ce27a Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Sat, 9 May 2026 21:03:35 +0200 Subject: [PATCH 145/538] [mlir][SPIR-V] Convert math.fpowi to spirv.CL.pown (#196701) --- .../Conversion/MathToSPIRV/MathToSPIRV.cpp | 22 ++++++++++++++++++- .../MathToSPIRV/math-to-opencl-spirv.mlir | 14 ++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp index ea6be76373573..8d850e01d5e62 100644 --- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp +++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp @@ -445,6 +445,26 @@ struct PowFOpPattern final : public OpConversionPattern { } }; +/// Converts math.fpowi to spirv.CL.pown. +struct PowIOpPattern final : public OpConversionPattern { + using Base::Base; + + LogicalResult + matchAndRewrite(math::FPowIOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (LogicalResult res = checkSourceOpTypes(rewriter, op); failed(res)) + return res; + + Type dstType = getTypeConverter()->convertType(op.getType()); + if (!dstType) + return failure(); + + rewriter.replaceOpWithNewOp(op, dstType, adaptor.getLhs(), + adaptor.getRhs()); + return success(); + } +}; + /// Converts math.round to GLSL SPIRV extended ops. 
struct RoundOpPattern final : public OpConversionPattern { using Base::Base; @@ -556,7 +576,7 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, PowIOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir index dae1b43402718..037f69e63a2dc 100644 --- a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir +++ b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir @@ -140,6 +140,20 @@ func.func @float32_binary_vector(%lhs: vector<4xf32>, %rhs: vector<4xf32>) { return } +// CHECK-LABEL: @fpowi_scalar +func.func @fpowi_scalar(%base: f32, %power: i32) -> f32 { + // CHECK: spirv.CL.pown %{{.*}}, %{{.*}} : f32, i32 -> f32 + %0 = math.fpowi %base, %power : f32, i32 + return %0 : f32 +} + +// CHECK-LABEL: @fpowi_vector +func.func @fpowi_vector(%base: vector<4xf32>, %power: vector<4xi32>) -> vector<4xf32> { + // CHECK: spirv.CL.pown %{{.*}}, %{{.*}} : vector<4xf32>, vector<4xi32> -> vector<4xf32> + %0 = math.fpowi %base, %power : vector<4xf32>, vector<4xi32> + return %0 : vector<4xf32> +} + // CHECK-LABEL: @float32_ternary_scalar func.func @float32_ternary_scalar(%a: f32, %b: f32, %c: f32) { // CHECK: spirv.CL.fma %{{.*}}: f32 From 2d8bcb5fab244bc263e2450857788ef1a5c2b9c1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 9 May 2026 21:16:01 +0200 Subject: [PATCH 146/538] [VPlan] Lift isUsedByLoadStoreAddr into vputils, operate on VPValue(NFC) (#196415) Extract the helper previously scoped to VPReplicateRecipe::computeCost and make it available from VPlanUtils so other transforms can query whether a VPValue is used as part of another load or store's address. 
Also relax the input type from VPUser * to VPValue *: the worklist now tracks VPValues directly, and traversal is gated on the user being a VPSingleDefRecipe before walking its own users. This is NFC for the existing caller. --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 52 +------------------ llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 51 ++++++++++++++++++ llvm/lib/Transforms/Vectorize/VPlanUtils.h | 4 ++ 3 files changed, 56 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d04b5edcfc212..11a91dcd46867 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3347,56 +3347,6 @@ static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr; } -/// Returns true if \p V is used as part of the address of another load or -/// store. -static bool isUsedByLoadStoreAddress(const VPUser *V) { - SmallPtrSet Seen; - SmallVector WorkList = {V}; - - while (!WorkList.empty()) { - auto *Cur = dyn_cast(WorkList.pop_back_val()); - if (!Cur || !Seen.insert(Cur).second) - continue; - - auto *Blend = dyn_cast(Cur); - // Skip blends that use V only through a compare by checking if any incoming - // value was already visited. 
- if (Blend && none_of(seq(0, Blend->getNumIncomingValues()), - [&](unsigned I) { - return Seen.contains( - Blend->getIncomingValue(I)->getDefiningRecipe()); - })) - continue; - - for (VPUser *U : Cur->users()) { - if (auto *InterleaveR = dyn_cast(U)) - if (InterleaveR->getAddr() == Cur) - return true; - if (auto *RepR = dyn_cast(U)) { - if (RepR->getOpcode() == Instruction::Load && - RepR->getOperand(0) == Cur) - return true; - if (RepR->getOpcode() == Instruction::Store && - RepR->getOperand(1) == Cur) - return true; - } - if (auto *MemR = dyn_cast(U)) { - if (MemR->getAddr() == Cur && MemR->isConsecutive()) - return true; - } - } - - // The legacy cost model only supports scalarization loads/stores with phi - // addresses, if the phi is directly used as load/store address. Don't - // traverse further for Blends. - if (Blend) - continue; - - append_range(WorkList, Cur->users()); - } - return false; -} - /// Return true if \p R is a predicated load/store with a loop-invariant address /// only masked by the header mask. static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, @@ -3539,7 +3489,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); bool UsedByLoadStoreAddress = - !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this); + !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(this); InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UsedByLoadStoreAddress ? 
UI : nullptr); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index b776d5c75a849..5b80fa15a5535 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -781,3 +781,54 @@ VPInstruction *vputils::findComputeReductionResult(VPReductionPHIRecipe *PhiR) { return vputils::findUserOf( cast(SelR)); } + +bool vputils::isUsedByLoadStoreAddress(const VPValue *V) { + SmallPtrSet Seen; + SmallVector WorkList = {V}; + + while (!WorkList.empty()) { + const VPValue *Cur = WorkList.pop_back_val(); + if (!Seen.insert(Cur).second) + continue; + + auto *Blend = dyn_cast(Cur); + // Skip blends that use V only through a compare by checking if any incoming + // value was already visited. + if (Blend && none_of(seq(0, Blend->getNumIncomingValues()), + [&](unsigned I) { + return Seen.contains(Blend->getIncomingValue(I)); + })) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + // The legacy cost model only supports scalarization loads/stores with phi + // addresses, if the phi is directly used as load/store address. Don't + // traverse further for Blends. + if (Blend) + continue; + + // Only traverse further through users that also define a value (and can + // thus have their own users walked). 
+ for (VPUser *U : Cur->users()) + if (auto *SDR = dyn_cast(U)) + WorkList.push_back(SDR); + } + return false; +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index ac3a1005c8f24..2a4b8566d8475 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -159,6 +159,10 @@ VPInstruction *findCanonicalIVIncrement(VPlan &Plan); /// mirroring Value::stripPointerCasts. GEPNoWrapFlags getGEPFlagsForPtr(VPValue *Ptr); +/// Returns true if \p V is used as part of the address of another load or +/// store. +bool isUsedByLoadStoreAddress(const VPValue *V); + /// Find the ComputeReductionResult recipe for \p PhiR, looking through selects /// inserted for predicated reductions or tail folding. VPInstruction *findComputeReductionResult(VPReductionPHIRecipe *PhiR); From ee29cb17c46d03ab938660c27ffba67f8255dd27 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 9 May 2026 20:16:23 +0100 Subject: [PATCH 147/538] clang: Fix using -march=amdgcn in some r600 run lines (#196745) --- clang/test/Preprocessor/predefined-arch-macros.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 395b663047747..e8e9dcc3341f7 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -4628,13 +4628,13 @@ // Begin r600 tests ---------------- -// RUN: %clang -march=amdgcn -E -dM %s -o - 2>&1 \ +// RUN: %clang -E -dM %s -o - 2>&1 \ // RUN: -target r600-unknown-unknown \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_R600 // CHECK_R600: #define __R600__ 1 // CHECK_R600-NOT: #define __HAS_FMAF__ 1 -// RUN: %clang -march=amdgcn -mcpu=cypress -E -dM %s -o - 2>&1 \ +// RUN: %clang -mcpu=cypress -E -dM %s -o - 2>&1 \ // RUN: -target r600-unknown-unknown \ // RUN: | FileCheck -match-full-lines %s 
-check-prefix=CHECK_R600_FP64 // CHECK_R600_FP64-DAG: #define __R600__ 1 From 900dd1d7c76f338fbd7a2c6b1738f6bd5db624d5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 9 May 2026 20:26:19 +0100 Subject: [PATCH 148/538] clang/AMDGPU: Use all_equal instead of building a temporary set (#196742) --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index b457ee2cde1c3..d2d8e56eb22be 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -739,8 +739,7 @@ AMDGPUToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch, << getArchName() << llvm::toString(GPUsOrErr.takeError()) << "-mcpu"; } else { auto &GPUs = *GPUsOrErr; - if (llvm::SmallSet(GPUs.begin(), GPUs.end()).size() > - 1) + if (!llvm::all_equal(GPUs)) getDriver().Diag(diag::warn_drv_multi_gpu_arch) << getArchName() << llvm::join(GPUs, ", ") << "-mcpu"; DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_mcpu_EQ), From 492d774292daa9b22ebfe7e38c4fbe8fbab249d6 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Sat, 9 May 2026 21:44:43 +0200 Subject: [PATCH 149/538] [mlir][SPIR-V] Support spirv.selection_control attribute on scf.if (#196510) --- .../mlir/Dialect/SPIRV/IR/TargetAndABI.h | 3 +++ mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp | 8 +++++-- mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp | 4 ++++ mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp | 4 ++++ mlir/test/Conversion/SCFToSPIRV/if.mlir | 22 +++++++++++++++++++ .../test/Dialect/SPIRV/IR/target-and-abi.mlir | 11 ++++++++++ 6 files changed, 50 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/TargetAndABI.h b/mlir/include/mlir/Dialect/SPIRV/IR/TargetAndABI.h index 7e11eb653c126..6e302542bde35 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/TargetAndABI.h +++ b/mlir/include/mlir/Dialect/SPIRV/IR/TargetAndABI.h @@ -112,6 +112,9 
@@ ResourceLimitsAttr getDefaultResourceLimits(MLIRContext *context); /// Returns the attribute name for specifying loop control. StringRef getLoopControlAttrName(); +/// Returns the attribute name for specifying selection control. +StringRef getSelectionControlAttrName(); + /// Returns the attribute name for specifying SPIR-V target environment. StringRef getTargetEnvAttrName(); diff --git a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp index a9c6f7db847d3..d5140f3faa6ff 100644 --- a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp +++ b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp @@ -249,8 +249,12 @@ struct IfOpConversion : SCFToSPIRVPattern { // Create `spirv.selection` operation, selection header block and merge // block. - auto selectionOp = spirv::SelectionOp::create( - rewriter, loc, spirv::SelectionControl::None); + auto selectionControl = spirv::SelectionControl::None; + if (auto attr = ifOp->getAttrOfType( + spirv::getSelectionControlAttrName())) + selectionControl = attr.getValue(); + auto selectionOp = + spirv::SelectionOp::create(rewriter, loc, selectionControl); auto *mergeBlock = rewriter.createBlock(&selectionOp.getBody(), selectionOp.getBody().end()); spirv::MergeOp::create(rewriter, loc); diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp index 2b5e7a571f42d..5821391b426cb 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -1054,6 +1054,10 @@ LogicalResult SPIRVDialect::verifyOperationAttribute(Operation *op, if (!isa(attr)) return op->emitError("'") << symbol << "' must be a spirv::LoopControlAttr"; + } else if (symbol == spirv::getSelectionControlAttrName()) { + if (!isa(attr)) + return op->emitError("'") + << symbol << "' must be a spirv::SelectionControlAttr"; } else { return op->emitError("found unsupported '") << symbol << "' attribute on operation"; diff --git 
a/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp b/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp index 270cb6df20415..c604fd087ba46 100644 --- a/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/TargetAndABI.cpp @@ -167,6 +167,10 @@ spirv::getDefaultResourceLimits(MLIRContext *context) { StringRef spirv::getLoopControlAttrName() { return "spirv.loop_control"; } +StringRef spirv::getSelectionControlAttrName() { + return "spirv.selection_control"; +} + StringRef spirv::getTargetEnvAttrName() { return "spirv.target_env"; } spirv::TargetEnvAttr spirv::getDefaultTargetEnv(MLIRContext *context) { diff --git a/mlir/test/Conversion/SCFToSPIRV/if.mlir b/mlir/test/Conversion/SCFToSPIRV/if.mlir index 2c18da41dc021..0b3df9a533302 100644 --- a/mlir/test/Conversion/SCFToSPIRV/if.mlir +++ b/mlir/test/Conversion/SCFToSPIRV/if.mlir @@ -167,4 +167,26 @@ func.func @unsupported_yield_type(%arg0 : memref<8xi32>, %arg1 : memref<8xi32>, return } +// CHECK-LABEL: @selection_flatten +func.func @selection_flatten(%arg2 : memref<10xf32, #spirv.storage_class>, %arg3 : i1) { + %value = arith.constant 0.0 : f32 + %i = arith.constant 0 : index + // CHECK: spirv.mlir.selection control(Flatten) { + scf.if %arg3 { + memref.store %value, %arg2[%i] : memref<10xf32, #spirv.storage_class> + } {spirv.selection_control = #spirv.selection_control} + return +} + +// CHECK-LABEL: @selection_dont_flatten +func.func @selection_dont_flatten(%arg2 : memref<10xf32, #spirv.storage_class>, %arg3 : i1) { + %value = arith.constant 0.0 : f32 + %i = arith.constant 0 : index + // CHECK: spirv.mlir.selection control(DontFlatten) { + scf.if %arg3 { + memref.store %value, %arg2[%i] : memref<10xf32, #spirv.storage_class> + } {spirv.selection_control = #spirv.selection_control} + return +} + } // end module diff --git a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir index 63dea6af83556..e634186d3b9a8 100644 --- 
a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir +++ b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir @@ -346,3 +346,14 @@ func.func @vce() attributes { // CHECK: #spirv.vce vce = #spirv.vce } { return } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.selection_control +//===----------------------------------------------------------------------===// + +// expected-error@+1 {{'spirv.selection_control' must be a spirv::SelectionControlAttr}} +func.func @selection_control_wrong_attr() attributes { + spirv.selection_control = 64 +} { return } From 7c0ae9c245315bdc078950d69585da8a761b5532 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Sat, 9 May 2026 16:06:41 -0400 Subject: [PATCH 150/538] [SLP][NFC]Add a test with scalable vector type in struct-returning intrinsic, NFC Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/196747 --- .../RISCV/scalable-type-as-input.ll | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/scalable-type-as-input.ll diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scalable-type-as-input.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scalable-type-as-input.ll new file mode 100644 index 0000000000000..a3f777709be7d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scalable-type-as-input.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s + +define void @test(i32 %lhsWords) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[LHSWORDS:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr null, zeroinitializer, i32 0) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i32( [[WIDE_VP_LOAD]]) +; CHECK-NEXT: [[TMP0:%.*]] = 
extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = zext [[TMP0]] to +; CHECK-NEXT: [[WIDE_TRIP_COUNT404:%.*]] = zext i32 [[LHSWORDS]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[WIDE_TRIP_COUNT404]], i32 1, i1 false) +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP1]], ptr null, zeroinitializer, i32 [[TMP2]]) +; CHECK-NEXT: ret void +; +entry: + %wide.vp.load = call @llvm.vp.load.nxv4i32.p0(ptr null, zeroinitializer, i32 0) + %strided.vec = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.vp.load) + %0 = extractvalue { , } %strided.vec, 0 + %1 = zext %0 to + %wide.trip.count404 = zext i32 %lhsWords to i64 + %2 = call i32 @llvm.experimental.get.vector.length.i64(i64 %wide.trip.count404, i32 1, i1 false) + call void @llvm.vp.store.nxv2i64.p0( %1, ptr null, zeroinitializer, i32 %2) + ret void +} + +declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg) +declare void @llvm.vp.store.nxv2i64.p0(, ptr captures(none), , i32) +declare @llvm.vp.load.nxv4i32.p0(ptr captures(none), , i32) +declare { , } @llvm.vector.deinterleave2.nxv4i32() + From 45e5bfb456409d171db3a8f7fa8a553dc7bde0d8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Sat, 9 May 2026 16:10:07 -0400 Subject: [PATCH 151/538] [SLP][NFC]Add a test with struct-returning intrinsics in different basic blocks, NFC Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/196748 --- .../X86/struct-return-different-bb.ll | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/struct-return-different-bb.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/struct-return-different-bb.ll b/llvm/test/Transforms/SLPVectorizer/X86/struct-return-different-bb.ll new file mode 100644 index 0000000000000..108edef7e0f0d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/struct-return-different-bb.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i1 @test(ptr %0, i8 %1) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: ptr [[TMP0:%.*]], i8 [[TMP1:%.*]]) { +; CHECK-NEXT: [[TMP3:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[TMP1]], i8 1) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i8, i1 } [[TMP3]], 1 +; CHECK-NEXT: br i1 [[TMP4]], label %[[BB8:.*]], label %[[BB5:.*]] +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[TMP1]], i8 1) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i8, i1 } [[TMP6]], 1 +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB8]], label %[[BB10:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, %[[BB10]] ], [ false, %[[BB5]] ], [ false, [[TMP2:%.*]] ] +; CHECK-NEXT: ret i1 [[TMP9]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i8, i1 } [[TMP6]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = sext i8 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i8, i1 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i16 +; CHECK-NEXT: [[TMP15:%.*]] = sdiv i16 [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc i16 [[TMP15]] to i8 +; CHECK-NEXT: store i8 [[TMP16]], ptr [[TMP0]], align 1 +; CHECK-NEXT: br label %[[BB8]] +; + %3 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %1, i8 1) + %4 = extractvalue { i8, i1 } %3, 1 + br i1 %4, label %8, label %5 + +5: ; preds = %2 + %6 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %1, i8 1) + %7 = extractvalue { i8, i1 } %6, 1 + br i1 %7, label %8, label %10 + +8: ; preds = %10, %5, %2 + %9 = phi i1 [ false, %10 ], [ false, %5 ], [ false, %2 ] + ret i1 %9 + +10: ; preds = %5 + %11 = extractvalue { i8, i1 } %6, 0 + %12 = sext i8 %11 to i16 + %13 = extractvalue { i8, i1 } %3, 0 + %14 = sext i8 %13 to i16 + %15 = sdiv i16 %12, %14 + %16 = trunc i16 %15 to i8 + store i8 %16, ptr %0, align 
1 + br label %8 +} + +declare { i8, i1 } @llvm.sadd.with.overflow.i8(i8, i8) From 6c979bbcfae4fa21c907f3d7d099cf8ac6543202 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 9 May 2026 14:10:02 -0700 Subject: [PATCH 152/538] [X86] Hoist ReservedIdentifiers to MCAsmInfo and shrink setup cost. NFC (#196699) PR #186570 added a per-MCAsmInfo `StringSet<>` populated with X86 register names plus Intel-syntax keywords, which caused a minor instructions:u increase. Avoid heap allocation and hoist `ReservedIdentifiers` to MCAsmInfo for other targets. For the register-name source, prefer `X86IntelInstPrinter::getRegisterName` over `MCRegisterInfo::getName`. The former is a TableGen-emitted accessor into a `static const char AsmStrs[]` pool in `X86GenAsmWriter1.inc`, populated from the lowercase asm-name argument of each `def XX : X86Reg<"xx", ...>;` in `X86RegisterInfo.td`. --- llvm/include/llvm/MC/MCAsmInfo.h | 14 +++++ .../Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 18 +++--- .../Target/X86/MCTargetDesc/X86MCAsmInfo.h | 5 -- .../X86/MCTargetDesc/X86MCTargetDesc.cpp | 55 ++++++++----------- 4 files changed, 45 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index f99f6463f35f9..e4ec45960a399 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -15,7 +15,9 @@ #ifndef LLVM_MC_MCASMINFO_H #define LLVM_MC_MCASMINFO_H +#include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCDirectives.h" @@ -433,6 +435,10 @@ class LLVM_ABI MCAsmInfo { llvm::StringMap NameToAtSpecifier; void initializeAtSpecifiers(ArrayRef); + // Lowercase identifiers (e.g. register names, dialect keywords) that must be + // quoted when used as a symbol name. 
+ llvm::DenseSet ReservedIdentifiers; + const MCTargetOptions &TargetOptions; public: @@ -492,6 +498,14 @@ class LLVM_ABI MCAsmInfo { /// syntactically correct. virtual bool isValidUnquotedName(StringRef Name) const; + llvm::DenseSet &getReservedIdentifiers() { + return ReservedIdentifiers; + } + const llvm::DenseSet & + getReservedIdentifiers() const { + return ReservedIdentifiers; + } + virtual void printSwitchToSection(const MCSection &, uint32_t Subsection, const Triple &, raw_ostream &) const {} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index f431654bb57f2..6cb815008291b 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -187,32 +187,32 @@ X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM( AllowAtAtStartOfIdentifier = true; } -static bool isValidX86UnquotedName(const MCAsmInfo &MAI, - const StringSet<> &ReservedIdentifiers, - StringRef Name) { +static bool isValidX86UnquotedName(const MCAsmInfo &MAI, StringRef Name) { if (!MAI.MCAsmInfo::isValidUnquotedName(Name)) return false; // Only Intel-syntax output needs to avoid register/keyword collisions; AT&T // disambiguates registers with '%' and doesn't treat `byte`, `ptr`, etc. as // keywords. 
- return MAI.getOutputAssemblerDialect() == 0 || - !ReservedIdentifiers.contains(Name.lower()); + if (MAI.getOutputAssemblerDialect() == 0) + return true; + return !MAI.getReservedIdentifiers().contains( + CachedHashStringRef(Name.lower())); } bool X86MCAsmInfoDarwin::isValidUnquotedName(StringRef Name) const { - return isValidX86UnquotedName(*this, ReservedIdentifiers, Name); + return isValidX86UnquotedName(*this, Name); } bool X86ELFMCAsmInfo::isValidUnquotedName(StringRef Name) const { - return isValidX86UnquotedName(*this, ReservedIdentifiers, Name); + return isValidX86UnquotedName(*this, Name); } bool X86MCAsmInfoMicrosoft::isValidUnquotedName(StringRef Name) const { - return isValidX86UnquotedName(*this, ReservedIdentifiers, Name); + return isValidX86UnquotedName(*this, Name); } bool X86MCAsmInfoGNUCOFF::isValidUnquotedName(StringRef Name) const { - return isValidX86UnquotedName(*this, ReservedIdentifiers, Name); + return isValidX86UnquotedName(*this, Name); } void X86MCAsmInfoGNUCOFF::anchor() { } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index 3939fe32d9ffd..a0bc7ed1f5802 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -14,7 +14,6 @@ #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H #include "MCTargetDesc/X86MCExpr.h" -#include "llvm/ADT/StringSet.h" #include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" @@ -27,7 +26,6 @@ class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { virtual void anchor(); public: - StringSet<> ReservedIdentifiers; explicit X86MCAsmInfoDarwin(const Triple &Triple, const MCTargetOptions &Options); bool isValidUnquotedName(StringRef Name) const override; @@ -45,7 +43,6 @@ class X86ELFMCAsmInfo : public MCAsmInfoELF { void anchor() override; public: - StringSet<> ReservedIdentifiers; explicit X86ELFMCAsmInfo(const Triple &Triple, const 
MCTargetOptions &Options); bool isValidUnquotedName(StringRef Name) const override; @@ -55,7 +52,6 @@ class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { void anchor() override; public: - StringSet<> ReservedIdentifiers; explicit X86MCAsmInfoMicrosoft(const Triple &Triple, const MCTargetOptions &Options); bool isValidUnquotedName(StringRef Name) const override; @@ -73,7 +69,6 @@ class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { void anchor() override; public: - StringSet<> ReservedIdentifiers; explicit X86MCAsmInfoGNUCOFF(const Triple &Triple, const MCTargetOptions &Options); bool isValidUnquotedName(StringRef Name) const override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 0ed8e65b18166..5ec4c836572ef 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -420,24 +420,28 @@ static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) { return X; } -static void populateReservedIdentifiers(StringSet<> &Set, +static void populateReservedIdentifiers(MCAsmInfo &MAI, const MCRegisterInfo &MRI) { - // Register names: `call rsi` is misassembled as an indirect call. + auto &Set = MAI.getReservedIdentifiers(); + // Register names: `call rsi` is misassembled as an indirect call. Use the + // Intel printer's table directly — it's the lowercase asm name in stable + // storage. MRI::getName() returns the uppercase enum name and would need + // an extra .lower() heap allocation per entry. 
for (unsigned i = 1, e = MRI.getNumRegs(); i < e; ++i) - if (const char *Name = MRI.getName(i)) + if (const char *Name = X86IntelInstPrinter::getRegisterName(i)) if (Name[0]) - Set.insert(StringRef(Name).lower()); + Set.insert(CachedHashStringRef(Name)); // Keywords that GAS Intel syntax misparses as constants, modifiers, or // pseudo-registers instead of symbol references (e.g., `call byte` calls // address 1, not symbol "byte"; `call flat` errors out). for (StringRef KW : {"byte", "word", "dword", "fword", "qword", "mmword", "tbyte", "oword", "xmmword", "ymmword", "zmmword", "offset", "flat", "near", "far", "short"}) - Set.insert(KW); + Set.insert(CachedHashStringRef(KW)); // Operator keywords parsed by GAS/X86AsmParser in Intel mode. for (StringRef KW : {"and", "eq", "ge", "gt", "le", "lt", "mod", "ne", "not", "or", "shl", "shr", "xor"}) - Set.insert(KW); + Set.insert(CachedHashStringRef(KW)); } static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, @@ -447,42 +451,27 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) { - if (is64Bit) { - auto *P = new X86_64MCAsmInfoDarwin(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; - } else { - auto *P = new X86MCAsmInfoDarwin(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; - } + if (is64Bit) + MAI = new X86_64MCAsmInfoDarwin(TheTriple, Options); + else + MAI = new X86MCAsmInfoDarwin(TheTriple, Options); } else if (TheTriple.isOSBinFormatELF()) { // Force the use of an ELF container. 
- auto *P = new X86ELFMCAsmInfo(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; + MAI = new X86ELFMCAsmInfo(TheTriple, Options); } else if (TheTriple.isWindowsMSVCEnvironment() || TheTriple.isWindowsCoreCLREnvironment() || TheTriple.isUEFI()) { - if (Options.getAssemblyLanguage().equals_insensitive("masm")) { - auto *P = new X86MCAsmInfoMicrosoftMASM(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; - } else { - auto *P = new X86MCAsmInfoMicrosoft(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; - } + if (Options.getAssemblyLanguage().equals_insensitive("masm")) + MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple, Options); + else + MAI = new X86MCAsmInfoMicrosoft(TheTriple, Options); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { - auto *P = new X86MCAsmInfoGNUCOFF(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; + MAI = new X86MCAsmInfoGNUCOFF(TheTriple, Options); } else { // The default is ELF. - auto *P = new X86ELFMCAsmInfo(TheTriple, Options); - populateReservedIdentifiers(P->ReservedIdentifiers, MRI); - MAI = P; + MAI = new X86ELFMCAsmInfo(TheTriple, Options); } + populateReservedIdentifiers(*MAI, MRI); // Initialize initial frame state. // Calculate amount of bytes used for return address storing From e07d245eed77beef46ce0d235983ec338225620e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 9 May 2026 14:29:52 -0700 Subject: [PATCH 153/538] [MCParser] .incbin: Don't retain the buffer, don't require NUL termination (#196696) processIncbinFile uses SourceMgr::AddIncludeFile, which * sets `RequiresNullTerminator=true` and disable `mmap` when the file size is a multiple of the page size, * and unnecessarily retains the throwaway buffer in `Buffers`. 
Switch to OpenIncludeFile so the buffer is freed when processIncbinFile returns, and pass RequiresNullTerminator=false. The buffer is consumed only by emitBytes; the lexer never scans it, so it does not need a trailing '\0' (different from #154972). Without that requirement, MemoryBuffer mmaps the file and RSS tracks only the touched pages. Stress test (1000 .incbin "blob.bin", 0, 16 against a 1 MiB blob): ``` Maximum RSS Before 1042944 KiB After 15360 KiB ``` Fix #62339 --- llvm/include/llvm/Support/SourceMgr.h | 3 ++- llvm/lib/MC/MCParser/AsmParser.cpp | 11 +++++++---- llvm/lib/Support/SourceMgr.cpp | 10 +++++++--- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h index 02e694cad8697..1bfbd1e7eaae7 100644 --- a/llvm/include/llvm/Support/SourceMgr.h +++ b/llvm/include/llvm/Support/SourceMgr.h @@ -200,7 +200,8 @@ class SourceMgr { /// buffer of the stacked file. The full path to the included file can be /// found in \p IncludedFile. LLVM_ABI ErrorOr> - OpenIncludeFile(const std::string &Filename, std::string &IncludedFile); + OpenIncludeFile(const std::string &Filename, std::string &IncludedFile, + bool RequiresNullTerminator = true); /// Return the ID of the buffer containing the specified location. /// diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 0d517fbd57472..482abd5bbbd55 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -860,14 +860,17 @@ bool AsmParser::processIncbinFile(const std::string &Filename, int64_t Skip, if (SymbolScanningMode) return false; + // The buffer is consumed only by emitBytes. Skip the NUL termination to + // enable mmap in more cases, reading only the touched pages instead of the + // whole file. 
std::string IncludedFile; - unsigned NewBuf = - SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile); - if (!NewBuf) + ErrorOr> BufOrErr = SrcMgr.OpenIncludeFile( + Filename, IncludedFile, /*RequiresNullTerminator=*/false); + if (!BufOrErr) return true; // Pick up the bytes from the file and emit them. - StringRef Bytes = SrcMgr.getMemoryBuffer(NewBuf)->getBuffer(); + StringRef Bytes = (*BufOrErr)->getBuffer(); Bytes = Bytes.drop_front(Skip); if (Count) { int64_t Res; diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp index 299615a6c8041..486537c7b0171 100644 --- a/llvm/lib/Support/SourceMgr.cpp +++ b/llvm/lib/Support/SourceMgr.cpp @@ -68,9 +68,13 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename, ErrorOr> SourceMgr::OpenIncludeFile(const std::string &Filename, - std::string &IncludedFile) { - auto GetFile = [this](StringRef Path) { - return FS ? FS->getBufferForFile(Path) : MemoryBuffer::getFile(Path); + std::string &IncludedFile, + bool RequiresNullTerminator) { + auto GetFile = [this, RequiresNullTerminator](StringRef Path) { + return FS ? FS->getBufferForFile(Path, /*FileSize=*/-1, + RequiresNullTerminator) + : MemoryBuffer::getFile(Path, /*IsText=*/false, + RequiresNullTerminator); }; ErrorOr> NewBufOrErr = GetFile(Filename); From 254259b4ed748ea637428e56de8a7e1d402f0afe Mon Sep 17 00:00:00 2001 From: Oliver Hunt Date: Sat, 9 May 2026 14:41:02 -0700 Subject: [PATCH 154/538] Revert "Avoid assert in substqualifier (#182707)" (#196755) This reverts commit e2def106757534b07a2d3ff15ddd48e14b69a66d. 
--- clang/docs/ReleaseNotes.rst | 1 - clang/lib/Sema/SemaDecl.cpp | 4 +--- clang/test/SemaTemplate/GH176152.cpp | 12 ------------ 3 files changed, 1 insertion(+), 16 deletions(-) delete mode 100644 clang/test/SemaTemplate/GH176152.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0d43f864653cc..c17143e3c0398 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -572,7 +572,6 @@ Bug Fixes to C++ Support - Correctly diagnose uses of ``co_await`` / ``co_yield`` in the default argument of nested function declarations. (#GH98923) - Fixed a crash when diagnosing an invalid static member function with an explicit object parameter (#GH177741) - Clang incorrectly instantiated variable specializations outside of the immediate context. (#GH54439) -- Fixed a crash when instantiating an invalid out-of-line static data member definition in a local class. (#GH176152) - Fixed a crash when pack expansions are used as arguments for non-pack parameters of built-in templates. (#GH180307) - Fix a problem where pack index expressions where incorrectly being regarded as equivalent. - Fixed a bug where captured variables in non-mutable lambdas were incorrectly treated as mutable diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 7c5bcd56b346c..a9a4cb89d115f 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7926,7 +7926,7 @@ NamedDecl *Sema::ActOnVariableDeclarator( if (CurContext->isRecord()) { if (SC == SC_Static) { - if (CXXRecordDecl *RD = dyn_cast(DC)) { + if (const CXXRecordDecl *RD = dyn_cast(DC)) { // Walk up the enclosing DeclContexts to check for any that are // incompatible with static data members. 
const DeclContext *FunctionOrMethod = nullptr; @@ -7948,8 +7948,6 @@ NamedDecl *Sema::ActOnVariableDeclarator( Diag(D.getIdentifierLoc(), diag::err_static_data_member_not_allowed_in_local_class) << Name << RD->getDeclName() << RD->getTagKind(); - Invalid = true; - RD->setInvalidDecl(); } else if (AnonStruct) { // C++ [class.static.data]p4: Unnamed classes and classes contained // directly or indirectly within unnamed classes shall not contain diff --git a/clang/test/SemaTemplate/GH176152.cpp b/clang/test/SemaTemplate/GH176152.cpp deleted file mode 100644 index 7d61aa292982d..0000000000000 --- a/clang/test/SemaTemplate/GH176152.cpp +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s - -template int f(T) { - struct MyClass { - static int staticField; - // expected-error@-1 {{static data member 'staticField' not allowed in local struct 'MyClass'}} - }; - int MyClass::staticField = 42; - return 0; -} - -int x = f(0); From 5d03beef627a0d733c83c562d9a09e7201d9e001 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 9 May 2026 23:23:41 +0100 Subject: [PATCH 155/538] [DAG] canCreateUndefOrPoison - out of range vector insert/extract element indices only generate poison (#196720) Matches ValueTracking / GISel implementations - although testing options are limited until DAG has actual uses of UndefPoisonKind::UndefOnly --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0b7d8b7946f99..a221df567a10f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5993,10 +5993,13 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::INSERT_VECTOR_ELT: case ISD::EXTRACT_VECTOR_ELT: { // Ensure that the element index is in bounds. 
- EVT VecVT = Op.getOperand(0).getValueType(); - SDValue Idx = Op.getOperand(Opcode == ISD::INSERT_VECTOR_ELT ? 2 : 1); - KnownBits KnownIdx = computeKnownBits(Idx, Depth + 1); - return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements()); + if (includesPoison(Kind)) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDValue Idx = Op.getOperand(Opcode == ISD::INSERT_VECTOR_ELT ? 2 : 1); + KnownBits KnownIdx = computeKnownBits(Idx, Depth + 1); + return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements()); + } + return false; } case ISD::VECTOR_SHUFFLE: { From 4eedcd87fd8d2b9ba31d4b19e162557161ef0550 Mon Sep 17 00:00:00 2001 From: Oliver Hunt Date: Sat, 9 May 2026 16:12:21 -0700 Subject: [PATCH 156/538] [clang][NFC] Actually add the testcase for #195416 (#196759) --- clang/test/SemaCXX/GH195416.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 clang/test/SemaCXX/GH195416.cpp diff --git a/clang/test/SemaCXX/GH195416.cpp b/clang/test/SemaCXX/GH195416.cpp new file mode 100644 index 0000000000000..85ed2961d0ef6 --- /dev/null +++ b/clang/test/SemaCXX/GH195416.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s + +constexpr void gh195416() { + struct U { + struct S {}; + static constexpr S S::bar; + // expected-error@-1 {{non-friend class member 'bar' cannot have a qualified name}} + // expected-error@-2 {{static data member 'bar' not allowed in local struct 'S'}} + }; +} From e6c316e374e7380415e7abb15e8816b9027307b1 Mon Sep 17 00:00:00 2001 From: Anshul Nigham Date: Sat, 9 May 2026 16:35:58 -0700 Subject: [PATCH 157/538] [Docs] Match body/toctree ordering on Reference and UserGuides (#195542) The `toctree` section is hidden but used for previous/next breadcrumbs. 
This was suggested in https://github.com/llvm/llvm-project/pull/184440#issuecomment-4351195402 --- llvm/docs/Reference.rst | 82 ++++++++++++++-------------- llvm/docs/UserGuides.rst | 114 ++++++++++++++++++++------------------- 2 files changed, 100 insertions(+), 96 deletions(-) diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index cfbb98b578e76..56e367388b1a8 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -9,61 +9,63 @@ LLVM and API reference documentation. .. toctree:: :hidden: - AIToolPolicy - Atomics - BitCodeFormat - BlockFrequencyTerminology - BranchWeightMetadata - CalleeTypeMetadata - CallGraphSection - CIBestPractices + HowToUseAttributes CommandGuide/index - ContentAddressableStorage - ConvergenceAndUniformity - ConvergentOperations - Coroutines - DependenceGraphs/index - ExceptionHandling - Extensions - FaultMaps - FuzzingLLVM + CommandGuide/llvm-reduce + OptBisect + SymbolizerMarkupFormat + PDB/index GarbageCollection - GetElementPtr + Statepoints + LibFuzzer + FuzzingLLVM + LangRef + UndefinedBehavior + InAlloca + BitCodeFormat + MIRLangRef GlobalISel/index + ConvergentOperations + TestingGuide + TestSuiteGuide GwpAsan + XRay + XRayExample + FaultMaps + Atomics + ExceptionHandling + Extensions HowToSetUpLLVMStyleRTTI - HowToUseAttributes - InAlloca - InterfaceExportAnnotations - LangRef - LibFuzzer - MarkedUpDisassembly - MIRLangRef - OptBisect - PCSectionsMetadata - PDB/index - PointerAuth - MLGO + BlockFrequencyTerminology + BranchWeightMetadata + GetElementPtr ScudoHardenedAllocator MemoryModelRelaxationAnnotations MemTagSanitizer + DependenceGraphs/index + SpeculativeLoadHardening + SegmentedStacks + MarkedUpDisassembly + StackMaps + Coroutines + PointerAuth + YamlIO + ConvergenceAndUniformity + MLGO + ContentAddressableStorage + CIBestPractices + AIToolPolicy + CalleeTypeMetadata + CallGraphSection + InterfaceExportAnnotations + PCSectionsMetadata QualGroup Security SecurityTransparencyReports - 
SegmentedStacks - StackMaps - SpeculativeLoadHardening - Statepoints - SymbolizerMarkupFormat SystemLibrary - TestingGuide TransformMetadata TypeMetadata - UndefinedBehavior - XRay - XRayExample XRayFDRFormat - YamlIO API Reference ------------- diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index b137ba4dd4830..c4b9293b39ea4 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -12,79 +12,81 @@ intermediate LLVM representation. .. toctree:: :hidden: - AArch64SME - AddingConstrainedIntrinsics - AdminTasks - AdvancedBuilds - AliasAnalysis - AMDGPUUsage - AMDGPUAsyncOperations - AMDGPUExecutionSynchronization - Benchmarking - BigEndianNEON - BuildingADistribution + HowToBuildOnARM + HowToBuildWithPGO + HowToCrossCompileLLVM + CoverageMappingFormat CFIVerify + BuildingADistribution CMake - CMakePrimer - CodeGenerator - CodeOfConduct - CommandLine - CompileCudaWithLLVM - CoverageMappingFormat - CycleTerminology - DebuggingJITedCode - DirectXUsage Docker + SupportLibrary + AdvancedBuilds + WritingAnLLVMNewPMPass + WritingAnLLVMPass + Passes + StackSafetyAnalysis + MergeFunctions + AliasAnalysis + MemorySSA + MemProf + LoopTerminology + CycleTerminology + Vectorizers + LinkTimeOptimization DTLTO - FatLTO - ExtendingLLVM - GitHub GoldPlugin - GlobalISel/MIRPatterns - HowToBuildOnARM - HowToBuildWithPGO - HowToBuildWindowsItaniumPrograms - HowToCrossCompileBuiltinsOnArm - HowToCrossCompileLLVM + Remarks + SourceLevelDebugging HowToUpdateDebugInfo - InstCombineContributorGuide - InstrProfileFormat InstrRefDebugInfo + RemoveDIsDebugInfo KeyInstructionsDebugInfo - LFI - LinkTimeOptimization - LoopTerminology - MarkdownQuickstartTemplate - MemorySSA - MemProf - MergeFunctions + InstrProfileFormat + InstCombineContributorGuide + WritingAnLLVMBackend + CodeGenerator + TableGen/index + GlobalISel/MIRPatterns MCJITDesignAndImplementation - MisExpect ORCv2 - OpaquePointers JITLink - NewPassManager + DebuggingJITedCode + CommandLine + 
ExtendingLLVM + AddingConstrainedIntrinsics + HowToBuildWindowsItaniumPrograms + HowToCrossCompileBuiltinsOnArm + BigEndianNEON + AArch64SME + CompileCudaWithLLVM NVPTXUsage - Passes - ReportingGuide - ResponseGuide - Remarks - RemoveDIsDebugInfo + AMDGPUUsage + AMDGPUAsyncOperations + AMDGPUDwarfExtensionsForHeterogeneousDebugging + AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack + AMDGPUExecutionSynchronization + SPIRVUsage + DirectXUsage RISCVUsage RISCV/RISCVVectorExtension RISCV/RISCVVCIX - SourceLevelDebugging - SPIRVUsage SandboxIR - StackSafetyAnalysis - SupportLibrary - TableGen/index - TableGenFundamentals Telemetry - Vectorizers - WritingAnLLVMPass - WritingAnLLVMNewPMPass - WritingAnLLVMBackend + LFI + AdminTasks + Benchmarking + CMakePrimer + CodeOfConduct + FatLTO + GitHub + MarkdownQuickstartTemplate + MisExpect + OpaquePointers + NewPassManager + ReportingGuide + ResponseGuide + TableGenFundamentals yaml2obj Clang From 3b4499c9eba52ca462a0a6b3764f6bbdfa399da6 Mon Sep 17 00:00:00 2001 From: argothiel Date: Sun, 10 May 2026 02:32:42 +0200 Subject: [PATCH 158/538] [clangd] Add InsertReplaceEdit for code completion (#187623) Handle new insertReplaceSupport capability (defined in LSP 3.16). Add the new option to the protocol layer and pass it around to the code completion logic. Update CompletionItem::textEdit to become the union type as per the LSP specification. Add a new helper function to the Lexer public API to find the end of an identifier with full context lexing, to avoid duplicating the logic. Use the helper both in the Sema flow and in the comment completion flow. Use a simpler ASCII-only scan in no-Sema mode. Add LIT tests to verify auto-triggered completions, mid-word replacement, Unicode, and snippets. Add unit tests to verify insert/replace ranges with and without Sema, including comments and the feature-off case. 
Update the release notes to document the new capability. Fixes https://github.com/clangd/clangd/issues/2190 --------- Co-authored-by: timon-ul --- clang-tools-extra/clangd/ClangdLSPServer.cpp | 1 + clang-tools-extra/clangd/CodeComplete.cpp | 183 +++++++++--- clang-tools-extra/clangd/CodeComplete.h | 16 +- clang-tools-extra/clangd/Protocol.cpp | 13 +- clang-tools-extra/clangd/Protocol.h | 27 +- .../test/completion-auto-trigger-replace.test | 113 ++++++++ .../clangd/test/completion-replace.test | 271 ++++++++++++++++++ .../test/completion-snippets-replace.test | 68 +++++ .../clangd/unittests/CodeCompleteTests.cpp | 200 ++++++++++++- clang-tools-extra/docs/ReleaseNotes.rst | 4 + clang/include/clang/Lex/Lexer.h | 8 + clang/lib/Lex/Lexer.cpp | 23 ++ clang/unittests/Lex/LexerTest.cpp | 38 +++ 13 files changed, 907 insertions(+), 58 deletions(-) create mode 100644 clang-tools-extra/clangd/test/completion-auto-trigger-replace.test create mode 100644 clang-tools-extra/clangd/test/completion-replace.test create mode 100644 clang-tools-extra/clangd/test/completion-snippets-replace.test diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 6e40a5278502c..04f58ab6446d1 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -518,6 +518,7 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, Opts.CodeComplete.EnableSnippets = Params.capabilities.CompletionSnippets; Opts.CodeComplete.IncludeFixIts = Params.capabilities.CompletionFixes; + Opts.CodeComplete.EnableInsertReplace = Params.capabilities.InsertReplace; Opts.CodeComplete.DocumentationFormat = Params.capabilities.CompletionDocumentationFormat; Opts.SignatureHelpDocumentationFormat = diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 9bd96ba96fcc0..71189ab9de8b2 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ 
b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1384,14 +1384,17 @@ void loadMainFilePreambleMacros(const Preprocessor &PP, bool semaCodeComplete(std::unique_ptr Consumer, const clang::CodeCompleteOptions &Options, const SemaCompleteInput &Input, - IncludeStructure *Includes = nullptr) { + IncludeStructure *Includes = nullptr, + std::unique_ptr CI = nullptr) { trace::Span Tracer("Sema completion"); IgnoreDiagnostics IgnoreDiags; - auto CI = buildCompilerInvocation(Input.ParseInput, IgnoreDiags); if (!CI) { - elog("Couldn't create CompilerInvocation"); - return false; + CI = buildCompilerInvocation(Input.ParseInput, IgnoreDiags); + if (!CI) { + elog("Couldn't create CompilerInvocation"); + return false; + } } auto &FrontendOpts = CI->getFrontendOpts(); FrontendOpts.SkipFunctionBodies = true; @@ -1621,7 +1624,8 @@ class CodeCompleteFlow { bool Incomplete = false; // Would more be available with a higher limit? CompletionPrefix HeuristicPrefix; std::optional Filter; // Initialized once Sema runs. - Range ReplacedRange; + Range InsertRange; + std::optional ReplaceRange; std::vector QueryScopes; // Initialized once Sema runs. std::vector AccessibleScopes; // Initialized once Sema runs. // Initialized once QueryScopes is initialized, if there are scopes. @@ -1756,8 +1760,19 @@ class CodeCompleteFlow { IsUsingDeclaration = false; Filter = FuzzyMatcher(HeuristicPrefix.Name); auto Pos = offsetToPosition(Content, Offset); - ReplacedRange.start = ReplacedRange.end = Pos; - ReplacedRange.start.character -= HeuristicPrefix.Name.size(); + InsertRange.start = InsertRange.end = Pos; + InsertRange.start.character -= HeuristicPrefix.Name.size(); + + if (Opts.EnableInsertReplace) { + ReplaceRange.emplace(); + ReplaceRange->start = InsertRange.start; + // Scan forward past ASCII identifier characters to find replace end. 
+ size_t ReplaceEnd = Offset; + while (ReplaceEnd < Content.size() && + isAsciiIdentifierContinue(Content[ReplaceEnd])) + ++ReplaceEnd; + ReplaceRange->end = offsetToPosition(Content, ReplaceEnd); + } llvm::StringMap ProxSources; ProxSources[FileName].Cost = 0; @@ -1838,19 +1853,26 @@ class CodeCompleteFlow { CodeCompleteResult runWithSema() { const auto &CodeCompletionRange = CharSourceRange::getCharRange( Recorder->CCSema->getPreprocessor().getCodeCompletionTokenRange()); + + const SourceManager &SM = Recorder->CCSema->getSourceManager(); + // When we are getting completions with an empty identifier, for example // std::vector asdf; // asdf.^; // Then the range will be invalid and we will be doing insertion, use // current cursor position in such cases as range. if (CodeCompletionRange.isValid()) { - ReplacedRange = halfOpenToRange(Recorder->CCSema->getSourceManager(), - CodeCompletionRange); + InsertRange = halfOpenToRange(SM, CodeCompletionRange); } else { const auto &Pos = sourceLocToPosition( - Recorder->CCSema->getSourceManager(), - Recorder->CCSema->getPreprocessor().getCodeCompletionLoc()); - ReplacedRange.start = ReplacedRange.end = Pos; + SM, Recorder->CCSema->getPreprocessor().getCodeCompletionLoc()); + InsertRange.start = InsertRange.end = Pos; + } + + if (Opts.EnableInsertReplace) { + ReplaceRange.emplace(); + ReplaceRange->start = InsertRange.start; + ReplaceRange->end = getEndOfCodeCompletionReplace(SM); } Filter = FuzzyMatcher( Recorder->CCSema->getPreprocessor().getCodeCompletionFilter()); @@ -1880,6 +1902,26 @@ class CodeCompleteFlow { return toCodeCompleteResult(Top); } + // Returns the LSP position at the end of the identifier suffix after the + // code completion cursor. 
+ Position getEndOfCodeCompletionReplace(const SourceManager &SM) { + const Preprocessor &PP = Recorder->CCSema->getPreprocessor(); + const LangOptions &LangOpts = Recorder->CCSema->getLangOpts(); + + // Skip past the code completion NUL byte and scan forward through + // identifier continuation characters (letters, digits, _, $, UCN, + // unicode). This handles all cases uniformly: with prefix ("vac^1abc"), + // without prefix ("vec.^asdf"), and digit-starting ("vec.^1abc"). + const SourceLocation SuffixBegin = + PP.getCodeCompletionLoc().getLocWithOffset(1); + Position End = sourceLocToPosition( + SM, Lexer::findEndOfIdentifierContinuation(SuffixBegin, SM, LangOpts)); + // Adjust for the NUL byte inserted at the cursor by code completion, + // which inflates the column by 1. + End.character--; + return End; + } + CodeCompleteResult toCodeCompleteResult(const std::vector &Scored) { CodeCompleteResult Output; @@ -1891,7 +1933,8 @@ class CodeCompleteFlow { for (auto &C : Scored) { Output.Completions.push_back(toCodeCompletion(C.first)); Output.Completions.back().Score = C.second; - Output.Completions.back().CompletionTokenRange = ReplacedRange; + Output.Completions.back().CompletionInsertRange = InsertRange; + Output.Completions.back().CompletionReplaceRange = ReplaceRange; if (Opts.Index && !Output.Completions.back().Documentation) { for (auto &Cand : C.first) { if (Cand.SemaResult && @@ -1915,7 +1958,8 @@ class CodeCompleteFlow { } Output.HasMore = Incomplete; Output.Context = CCContextKind; - Output.CompletionRange = ReplacedRange; + Output.InsertRange = InsertRange; + Output.ReplaceRange = ReplaceRange; // Look up documentation from the index. if (Opts.Index) { @@ -2236,16 +2280,54 @@ CompletionPrefix guessCompletionPrefix(llvm::StringRef Content, return Result; } +// If Offset is inside what looks like argument comment (e.g. +// "/*^*/" or "/* foo = ^*/"), returns the offset pointing past the closing +// "*/". 
+static std::optional +maybeFunctionArgumentCommentEnd(const PathRef FileName, const unsigned Offset, + const llvm::StringRef Content, + const LangOptions &LangOpts) { + if (Offset > Content.size()) + return std::nullopt; + + SourceManagerForFile FileSM(FileName, Content); + const SourceManager &SM = FileSM.get(); + const SourceLocation Cursor = SM.getComposedLoc(SM.getMainFileID(), Offset); + const SourceLocation EndOfSuffix = + Lexer::findEndOfIdentifierContinuation(Cursor, SM, LangOpts); + const unsigned EndOfSuffixOffset = SM.getFileOffset(EndOfSuffix); + + const llvm::StringRef Rest = Content.drop_front(EndOfSuffixOffset); + llvm::StringRef RestTrimmed = Rest.ltrim(); + // Comment argument pattern: `/* name = */` — skip past optional `=`. + if (RestTrimmed.starts_with("=")) + RestTrimmed = RestTrimmed.drop_front(1).ltrim(); + if (RestTrimmed.starts_with("*/")) + return EndOfSuffixOffset + (Rest.size() - RestTrimmed.size()) + 2; + return std::nullopt; +} + // Code complete the argument name on "/*" inside function call. -// Offset should be pointing to the start of the comment, i.e.: +// OutsideStartOffset should be pointing before the comment, i.e.: // foo(^/*, rather than foo(/*^) where the cursor probably is. -CodeCompleteResult codeCompleteComment(PathRef FileName, unsigned Offset, - llvm::StringRef Prefix, - const PreambleData *Preamble, - const ParseInputs &ParseInput) { +CodeCompleteResult +codeCompleteComment(PathRef FileName, const unsigned CursorOffset, + unsigned OutsideStartOffset, llvm::StringRef Prefix, + const PreambleData *Preamble, const ParseInputs &ParseInput, + const CodeCompleteOptions &Opts) { if (Preamble == nullptr) // Can't run without Sema. 
return CodeCompleteResult(); + IgnoreDiagnostics IgnoreDiags; + auto CI = buildCompilerInvocation(ParseInput, IgnoreDiags); + if (!CI) + return CodeCompleteResult(); + + std::optional OutsideEndOffset; + if (Opts.EnableInsertReplace) + OutsideEndOffset = maybeFunctionArgumentCommentEnd( + FileName, CursorOffset, ParseInput.Contents, CI->getLangOpts()); + clang::CodeCompleteOptions Options; Options.IncludeGlobals = false; Options.IncludeMacros = false; @@ -2256,20 +2338,31 @@ CodeCompleteResult codeCompleteComment(PathRef FileName, unsigned Offset, // full patch. semaCodeComplete( std::make_unique(Options, ParamNames), Options, - {FileName, Offset, *Preamble, + {FileName, OutsideStartOffset, *Preamble, PreamblePatch::createFullPatch(FileName, ParseInput, *Preamble), - ParseInput}); + ParseInput}, + /*Includes=*/nullptr, std::move(CI)); if (ParamNames.empty()) return CodeCompleteResult(); CodeCompleteResult Result; - Range CompletionRange; + Range InsertRange; // Skip /* - Offset += 2; - CompletionRange.start = offsetToPosition(ParseInput.Contents, Offset); - CompletionRange.end = - offsetToPosition(ParseInput.Contents, Offset + Prefix.size()); - Result.CompletionRange = CompletionRange; + const unsigned InsideStartOffset = OutsideStartOffset + 2; + InsertRange.start = offsetToPosition(ParseInput.Contents, InsideStartOffset); + InsertRange.end = + offsetToPosition(ParseInput.Contents, InsideStartOffset + Prefix.size()); + Result.InsertRange = InsertRange; + + if (Opts.EnableInsertReplace) { + Range ReplaceRange; + ReplaceRange.start = InsertRange.start; + ReplaceRange.end = OutsideEndOffset ? 
offsetToPosition(ParseInput.Contents, + *OutsideEndOffset) + : InsertRange.end; + Result.ReplaceRange = ReplaceRange; + } + Result.Context = CodeCompletionContext::CCC_NaturalLanguage; for (llvm::StringRef Name : ParamNames) { if (!Name.starts_with(Prefix)) @@ -2278,7 +2371,8 @@ CodeCompleteResult codeCompleteComment(PathRef FileName, unsigned Offset, Item.Name = Name.str() + "=*/"; Item.FilterText = Item.Name; Item.Kind = CompletionItemKind::Text; - Item.CompletionTokenRange = CompletionRange; + Item.CompletionInsertRange = InsertRange; + Item.CompletionReplaceRange = Result.ReplaceRange; Item.Origin = SymbolOrigin::AST; Result.Completions.push_back(Item); } @@ -2318,8 +2412,8 @@ CodeCompleteResult codeComplete(PathRef FileName, Position Pos, // parsing, so we must move back the position before running it, extract // information we need and construct completion items ourselves. auto CommentPrefix = Content.substr(*OffsetBeforeComment + 2).trim(); - return codeCompleteComment(FileName, *OffsetBeforeComment, CommentPrefix, - Preamble, ParseInput); + return codeCompleteComment(FileName, *Offset, *OffsetBeforeComment, + CommentPrefix, Preamble, ParseInput, Opts); } auto Flow = CodeCompleteFlow( @@ -2429,7 +2523,9 @@ CompletionItem CodeCompletion::render(const CodeCompleteOptions &Opts) const { } LSP.sortText = sortText(Score.Total, FilterText); LSP.filterText = FilterText; - LSP.textEdit = {CompletionTokenRange, RequiredQualifier + Name, ""}; + TextEdit Edit; + Edit.range = CompletionInsertRange; + Edit.newText = RequiredQualifier + Name; // Merge continuous additionalTextEdits into main edit. The main motivation // behind this is to help LSP clients, it seems most of them are confused when // they are provided with additionalTextEdits that are consecutive to main @@ -2438,19 +2534,34 @@ CompletionItem CodeCompletion::render(const CodeCompleteOptions &Opts) const { // is mainly to help LSP clients again, so that changes do not effect each // other. 
for (const auto &FixIt : FixIts) { - if (FixIt.range.end == LSP.textEdit->range.start) { - LSP.textEdit->newText = FixIt.newText + LSP.textEdit->newText; - LSP.textEdit->range.start = FixIt.range.start; + if (FixIt.range.end == Edit.range.start) { + Edit.newText = FixIt.newText + Edit.newText; + Edit.range.start = FixIt.range.start; } else { LSP.additionalTextEdits.push_back(FixIt); } } if (Opts.EnableSnippets) - LSP.textEdit->newText += SnippetSuffix; + Edit.newText += SnippetSuffix; // FIXME(kadircet): Do not even fill insertText after making sure textEdit is // compatible with most of the editors. - LSP.insertText = LSP.textEdit->newText; + LSP.insertText = Edit.newText; + if (Opts.EnableInsertReplace) { + assert(CompletionReplaceRange && + "CompletionReplaceRange must be already set before render() " + "when EnableInsertReplace is on"); + InsertReplaceEdit IRE; + IRE.newText = std::move(Edit.newText); + IRE.insert = Edit.range; + IRE.replace = *CompletionReplaceRange; + // FixIt merging may have extended the insert range start; keep replace + // range as a superset per LSP spec. + IRE.replace.start = IRE.insert.start; + LSP.textEdit = std::move(IRE); + } else { + LSP.textEdit = std::move(Edit); + } // Some clients support snippets but work better with plaintext. // So if the snippet is trivial, let the client know. // https://github.com/clangd/clangd/issues/922 diff --git a/clang-tools-extra/clangd/CodeComplete.h b/clang-tools-extra/clangd/CodeComplete.h index b466965cfff4e..a9cb294fcb083 100644 --- a/clang-tools-extra/clangd/CodeComplete.h +++ b/clang-tools-extra/clangd/CodeComplete.h @@ -68,6 +68,10 @@ struct CodeCompleteOptions { /// Whether to present doc comments as plain-text or markdown. MarkupKind DocumentationFormat = MarkupKind::PlainText; + /// Whether to present the completion as a single textEdit range or as two + /// ranges (insert/replace). 
+ bool EnableInsertReplace = false; + Config::HeaderInsertionPolicy InsertIncludes = Config::HeaderInsertionPolicy::IWYU; @@ -219,7 +223,9 @@ struct CodeCompletion { std::vector FixIts; /// Holds the range of the token we are going to replace with this completion. - Range CompletionTokenRange; + Range CompletionInsertRange; + /// If set, the range to use when the client's insert mode is "replace". + std::optional CompletionReplaceRange; // Scores are used to rank completion items. struct Scores { @@ -258,8 +264,12 @@ struct CodeCompleteResult { // The text that is being directly completed. // Example: foo.pb^ -> foo.push_back() // ~~ - // Typically matches the textEdit.range of Completions, but not guaranteed to. - std::optional CompletionRange; + // Typically matches the textEdit.range (or textEdit.insert range) of + // Completions, but not guaranteed to. + std::optional InsertRange; + // If not empty, typically matches the textEdit.replace range of Completions, + // but not guaranteed to. + std::optional ReplaceRange; // Usually the source will be parsed with a real C++ parser. // But heuristics may be used instead if e.g. the preamble is not ready. 
bool RanParser = true; diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index 793db7b052990..f77b0773d445a 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -202,6 +202,14 @@ llvm::json::Value toJSON(const TextEdit &P) { return Result; } +llvm::json::Value toJSON(const InsertReplaceEdit &P) { + return llvm::json::Object{ + {"newText", P.newText}, + {"insert", P.insert}, + {"replace", P.replace}, + }; +} + bool fromJSON(const llvm::json::Value &Params, ChangeAnnotation &R, llvm::json::Path P) { llvm::json::ObjectMapper O(Params, P); @@ -414,6 +422,8 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R, break; } } + if (auto IRSupport = Item->getBoolean("insertReplaceSupport")) + R.InsertReplace = *IRSupport; } if (auto *ItemKind = Completion->getObject("completionItemKind")) { if (auto *ValueSet = ItemKind->get("valueSet")) { @@ -1184,7 +1194,8 @@ llvm::json::Value toJSON(const CompletionItem &CI) { if (CI.insertTextFormat != InsertTextFormat::Missing) Result["insertTextFormat"] = static_cast(CI.insertTextFormat); if (CI.textEdit) - Result["textEdit"] = *CI.textEdit; + Result["textEdit"] = std::visit( + [](const auto &V) { return llvm::json::Value(V); }, *CI.textEdit); if (!CI.additionalTextEdits.empty()) Result["additionalTextEdits"] = llvm::json::Array(CI.additionalTextEdits); if (CI.deprecated) diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index 7a99721a1e856..9c1bb9d9bb059 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -34,6 +34,7 @@ #include #include #include +#include #include // This file is using the LSP syntax for identifier names which is different @@ -261,6 +262,18 @@ bool fromJSON(const llvm::json::Value &, TextEdit &, llvm::json::Path); llvm::json::Value toJSON(const TextEdit &); llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TextEdit &); +struct 
InsertReplaceEdit { + /// The string to be inserted. + std::string newText; + + /// The range if the insert is requested. + Range insert; + + /// The range if the replace is requested. + Range replace; +}; +llvm::json::Value toJSON(const InsertReplaceEdit &); + struct ChangeAnnotation { /// A human-readable string describing the actual change. The string /// is rendered prominent in the user interface. @@ -510,6 +523,10 @@ struct ClientCapabilities { /// textDocument.completion.completionItem.documentationFormat MarkupKind CompletionDocumentationFormat = MarkupKind::PlainText; + /// Client supports insert replace edit to control different behavior if a + /// completion item is inserted in the text or should replace text. + bool InsertReplace = false; + /// The client has support for completion item label details. /// textDocument.completion.completionItem.labelDetailsSupport. bool CompletionLabelDetail = false; @@ -1372,9 +1389,13 @@ struct CompletionItem { /// An edit which is applied to a document when selecting this completion. /// When an edit is provided `insertText` is ignored. /// - /// Note: The range of the edit must be a single line range and it must - /// contain the position at which completion has been requested. - std::optional textEdit; + /// Note 1: The text edit's range as well as both ranges from an insert + /// replace edit must be a single line range and must contain the position + /// at which completion has been requested. + /// Note 2: If an `InsertReplaceEdit` is returned, the edit's insert range + /// must be a prefix of the edit's replace range, meaning it must be + /// contained in and starting at the same position. + std::optional> textEdit; /// An optional array of additional text edits that are applied when selecting /// this completion. 
Edits must not overlap with the main edit nor with diff --git a/clang-tools-extra/clangd/test/completion-auto-trigger-replace.test b/clang-tools-extra/clangd/test/completion-auto-trigger-replace.test new file mode 100644 index 0000000000000..7201171c47641 --- /dev/null +++ b/clang-tools-extra/clangd/test/completion-auto-trigger-replace.test @@ -0,0 +1,113 @@ +# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s +# Tests InsertReplaceEdit with auto-triggered completions. +{ + "jsonrpc": "2.0", + "id": 0, + "method": "initialize", + "params": { + "processId": 123, + "rootPath": "clangd", + "capabilities": { + "textDocument": { + "completion": { + "completionItem": { + "insertReplaceSupport": true + } + } + } + }, + "trace": "off" + } +} +--- +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct vector { int size; };\nvoid test(vector *a, vector *b) {\n if (a > b) {} \n a->size = 10;\n a->\n}"}}} +--- +# Case 1 (rejected trigger): ">" in "a > b" is comparison, not "->". +# insertReplaceSupport doesn't break trigger rejection. +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":9},"context":{"triggerKind":2,"triggerCharacter":">"}}} +# CHECK: "id": 1, +# CHECK-NEXT: "jsonrpc": "2.0" +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [] +# CHECK-NEXT: } +--- +# Case 2 (trigger with word after cursor): "a->^size = 10;" +# Cursor right after "->", word "size" follows. Insert range is empty. 
+# insert: [5,5], replace: [5,9] +{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":3,"character":5},"context":{"triggerKind":2,"triggerCharacter":">"}}} +# CHECK: "id": 2, +# CHECK-NEXT: "jsonrpc": "2.0" +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK-NEXT: { +# CHECK-NEXT: "detail": "int", +# CHECK-NEXT: "filterText": "size", +# CHECK-NEXT: "insertText": "size", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " size", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}size", +# CHECK-NEXT: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 3 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 3 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "size", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 9, +# CHECK-NEXT: "line": 3 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 3 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } +--- +# Case 3 (trigger with nothing after cursor): "a->^" +# Cursor right after "->", nothing follows. insert == replace. 
+# insert: [5,5], replace: [5,5] +{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":4,"character":5},"context":{"triggerKind":2,"triggerCharacter":">"}}} +# CHECK: "id": 3, +# CHECK-NEXT: "jsonrpc": "2.0" +# CHECK-NEXT: "result": { +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 4 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 4 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": {{.*}}, +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 4 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 4 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +{"jsonrpc":"2.0","id":4,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/test/completion-replace.test b/clang-tools-extra/clangd/test/completion-replace.test new file mode 100644 index 0000000000000..909f6d992576e --- /dev/null +++ b/clang-tools-extra/clangd/test/completion-replace.test @@ -0,0 +1,271 @@ +# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s +# RUN: clangd -lit-test -pch-storage=memory < %s | FileCheck -strict-whitespace %s +# Tests InsertReplaceEdit ranges when insertReplaceSupport is true. +# insert range = [token_start, cursor), replace range = [token_start, token_end). +{ + "jsonrpc": "2.0", + "id": 0, + "method": "initialize", + "params": { + "processId": 123, + "rootPath": "clangd", + "capabilities": { + "textDocument": { + "completion": { + "completionItem": { + "insertReplaceSupport": true + } + } + } + }, + "trace": "off" + } +} +--- +# Case 1 (cursor at end of token): "S().a^" +# Prefix typed, nothing after cursor. insert == replace. 
+# insert: [4,5], replace: [4,5] +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().a;\n}"}}} +--- +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":5}}} +# CHECK: "id": 1 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "a", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 5, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +# Case 2 (mid-word cursor): "S().ab^c" +# Cursor mid-word. Replace range extends past cursor to cover suffix. 
+# insert: [4,6], replace: [4,7] +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp","version":2},"contentChanges":[{"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().abc;\n}"}]}} +--- +{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":6}}} +# CHECK: "id": 2 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "filterText": "abc", +# CHECK-NEXT: "insertText": "abc", +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 6, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "abc", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 7, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +# Case 3 (no prefix, word after cursor): "S().^xyz" +# Cursor right after dot, entire existing word is suffix. Insert range is empty. 
+# insert: [4,4], replace: [4,7] +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp","version":3},"contentChanges":[{"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().xyz;\n}"}]}} +--- +{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":4}}} +# CHECK: "id": 3 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": {{.*}}, +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 7, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +# Case 4 (unicode mid-word): "S().chrz^ąszcz" +# Cursor mid-word with multi-byte suffix. Tests that ranges handle unicode. +# "chrząszcz" is 9 characters starting at column 4. 
+# insert: [4,8], replace: [4,13] +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp","version":4},"contentChanges":[{"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().chrząszcz;\n}"}]}} +--- +{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":8}}} +# CHECK: "id": 4 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "filterText": "chrząszcz", +# CHECK-NEXT: "insertText": "chrząszcz", +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 8, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "chrząszcz", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 13, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +# Case 5 (replace stops at paren): "S().ab^c(123)" +# Replace range covers only the identifier, not the '(' or arguments. 
+# insert: [4,6], replace: [4,7] +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp","version":5},"contentChanges":[{"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().abc(123);\n}"}]}} +--- +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":6}}} +# CHECK: "id": 5 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "filterText": "abc", +# CHECK-NEXT: "insertText": "abc", +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 6, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "abc", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 7, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +# Case 6 (replace stops at angle bracket): "S().ab^c()" +# Replace range covers only the identifier, not '<' or template arguments. 
+# insert: [4,6], replace: [4,7] +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp","version":6},"contentChanges":[{"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().abc();\n}"}]}} +--- +{"jsonrpc":"2.0","id":6,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":6}}} +# CHECK: "id": 6 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "filterText": "abc", +# CHECK-NEXT: "insertText": "abc", +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 6, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "abc", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 7, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +# Case 7 (replace stops before '='): "S().ab^c = 1;" +# Replace range covers only the identifier, not the space or '='. 
+# insert: [4,6], replace: [4,7] +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp","version":7},"contentChanges":[{"text":"struct S { int a; int abc; int chrząszcz; };\nint main() {\nS().abc = 1;\n}"}]}} +--- +{"jsonrpc":"2.0","id":7,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":6}}} +# CHECK: "id": 7 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: "filterText": "abc", +# CHECK-NEXT: "insertText": "abc", +# CHECK: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 6, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "abc", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 7, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 4, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +--- +{"jsonrpc":"2.0","id":8,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/test/completion-snippets-replace.test b/clang-tools-extra/clangd/test/completion-snippets-replace.test new file mode 100644 index 0000000000000..6d694aca0793b --- /dev/null +++ b/clang-tools-extra/clangd/test/completion-snippets-replace.test @@ -0,0 +1,68 @@ +# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s +# RUN: clangd -lit-test -pch-storage=memory < %s | FileCheck -strict-whitespace %s +{ + "jsonrpc": "2.0", + "id": 0, + "method": "initialize", + "params": { + "processId": 123, + "rootPath": "clangd", + "capabilities": { + "textDocument": { + "completion": { + "completionItem": { + "snippetSupport": true, + "insertReplaceSupport": true + } + } + } + }, + "trace": "off" + } +} +--- 
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int func_with_args(int a, int b);\nint main() {\nfunc_with\n}"}}} +--- +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":2,"character":7}}} +# CHECK: "id": 1 +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": {{.*}} +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "func_with_args", +# CHECK-NEXT: "insertText": "func_with_args(${1:int a}, ${2:int b})", +# CHECK-NEXT: "insertTextFormat": 2, +# CHECK-NEXT: "kind": 3, +# CHECK-NEXT: "label": " func_with_args(int a, int b)", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}func_with_args" +# CHECK-NEXT: "textEdit": { +# CHECK-NEXT: "insert": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 7, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 0, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "newText": "func_with_args(${1:int a}, ${2:int b})", +# CHECK-NEXT: "replace": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 9, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 0, +# CHECK-NEXT: "line": 2 +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } +--- +{"jsonrpc":"2.0","id":4,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 386ffb54924a7..726fee9c2f0fe 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -92,7 +92,7 @@ MATCHER_P(snippetSuffix, Text, "") { return arg.SnippetSuffix == Text; } MATCHER_P(origin, OriginSet, "") { return 
arg.Origin == OriginSet; } MATCHER_P(signature, S, "") { return arg.Signature == S; } MATCHER_P(replacesRange, Range, "") { - return arg.CompletionTokenRange == Range; + return arg.CompletionInsertRange == Range; } // Shorthand for Contains(named(Name)). @@ -2619,14 +2619,14 @@ TEST(CompletionTest, RenderWithFixItMerged) { C.Name = "x"; C.RequiredQualifier = "Foo::"; C.FixIts = {FixIt}; - C.CompletionTokenRange.start.character = 5; + C.CompletionInsertRange.start.character = 5; CodeCompleteOptions Opts; Opts.IncludeFixIts = true; auto R = C.render(Opts); EXPECT_TRUE(R.textEdit); - EXPECT_EQ(R.textEdit->newText, "->Foo::x"); + EXPECT_EQ(std::get(*R.textEdit).newText, "->Foo::x"); EXPECT_TRUE(R.additionalTextEdits.empty()); } @@ -2639,18 +2639,18 @@ TEST(CompletionTest, RenderWithFixItNonMerged) { C.Name = "x"; C.RequiredQualifier = "Foo::"; C.FixIts = {FixIt}; - C.CompletionTokenRange.start.character = 5; + C.CompletionInsertRange.start.character = 5; CodeCompleteOptions Opts; Opts.IncludeFixIts = true; auto R = C.render(Opts); EXPECT_TRUE(R.textEdit); - EXPECT_EQ(R.textEdit->newText, "Foo::x"); + EXPECT_EQ(std::get(*R.textEdit).newText, "Foo::x"); EXPECT_THAT(R.additionalTextEdits, UnorderedElementsAre(FixIt)); } -TEST(CompletionTest, CompletionTokenRange) { +TEST(CompletionTest, CompletionInsertRange) { MockFS FS; MockCompilationDatabase CDB; TestTU TU; @@ -2692,7 +2692,7 @@ TEST(CompletionTest, CompletionTokenRange) { ADD_FAILURE() << "Results.Completions.size() != 1" << Text; continue; } - EXPECT_THAT(Results.Completions.front().CompletionTokenRange, + EXPECT_THAT(Results.Completions.front().CompletionInsertRange, TestCode.range()); } } @@ -4080,23 +4080,155 @@ TEST(CompletionTest, DelayedTemplateParsing) { TEST(CompletionTest, CompletionRange) { const char *WithRange = "auto x = [[abc]]^"; auto Completions = completions(WithRange); - EXPECT_EQ(Completions.CompletionRange, Annotations(WithRange).range()); + EXPECT_EQ(Completions.InsertRange, 
Annotations(WithRange).range()); Completions = completionsNoCompile(WithRange); - EXPECT_EQ(Completions.CompletionRange, Annotations(WithRange).range()); + EXPECT_EQ(Completions.InsertRange, Annotations(WithRange).range()); const char *EmptyRange = "auto x = [[]]^"; Completions = completions(EmptyRange); - EXPECT_EQ(Completions.CompletionRange, Annotations(EmptyRange).range()); + EXPECT_EQ(Completions.InsertRange, Annotations(EmptyRange).range()); Completions = completionsNoCompile(EmptyRange); - EXPECT_EQ(Completions.CompletionRange, Annotations(EmptyRange).range()); + EXPECT_EQ(Completions.InsertRange, Annotations(EmptyRange).range()); // Sema doesn't trigger at all here, while the no-sema completion runs // heuristics as normal and reports a range. It'd be nice to be consistent. const char *NoCompletion = "/* foo [[]]^ */"; Completions = completions(NoCompletion); - EXPECT_EQ(Completions.CompletionRange, std::nullopt); + EXPECT_EQ(Completions.InsertRange, std::nullopt); Completions = completionsNoCompile(NoCompletion); - EXPECT_EQ(Completions.CompletionRange, Annotations(NoCompletion).range()); + EXPECT_EQ(Completions.InsertRange, Annotations(NoCompletion).range()); +} + +TEST(CompletionTest, ReplaceRange) { + clangd::CodeCompleteOptions Opts; + Opts.EnableInsertReplace = true; + + // Cursor at end of token: insert == replace. + const char *EndOfToken = + "struct S { int abc; }; void f() { S s; s.[[abc]]^; }"; + CodeCompleteResult Completions = + completions(EndOfToken, /*IndexSymbols=*/{}, Opts); + Annotations A(EndOfToken); + EXPECT_EQ(Completions.InsertRange, A.range()); + EXPECT_EQ(Completions.ReplaceRange, A.range()); + + // Cursor mid-word: replace extends past cursor. 
+ const char *MidWord = "struct S { int abcd; }; void f() { S s; " + "s.$replace[[$insert[[ab^]]cd]]; }"; + Completions = completions(MidWord, /*IndexSymbols=*/{}, Opts); + A = Annotations(MidWord); + EXPECT_EQ(Completions.InsertRange, A.range("insert")); + EXPECT_EQ(Completions.ReplaceRange, A.range("replace")); + + // Empty prefix: insert range is empty, replace covers the word. + const char *EmptyPrefix = "struct S { int abcd; }; void f() { S s; " + "s.$replace[[$insert[[^]]abcd]]; }"; + Completions = completions(EmptyPrefix, /*IndexSymbols=*/{}, Opts); + A = Annotations(EmptyPrefix); + EXPECT_EQ(Completions.InsertRange, A.range("insert")); + EXPECT_EQ(Completions.ReplaceRange, A.range("replace")); + + // Cursor mid-word with UTF-8 continuation: replace extends past UTF-8. + const char *MidWordUTF8 = "struct S { int ab🙂cd; }; void f() { S s; " + "s.$replace[[$insert[[ab^]]🙂cd]]; }"; + Completions = completions(MidWordUTF8, /*IndexSymbols=*/{}, Opts); + A = Annotations(MidWordUTF8); + EXPECT_EQ(Completions.InsertRange, A.range("insert")); + EXPECT_EQ(Completions.ReplaceRange, A.range("replace")); + + // Replace range stops at '(' (method call). + const char *BeforeParen = "struct S { int abcd(); }; void f() { S s; " + "s.$replace[[$insert[[ab^]]cd]](123); }"; + Completions = completions(BeforeParen, /*IndexSymbols=*/{}, Opts); + A = Annotations(BeforeParen); + EXPECT_EQ(Completions.InsertRange, A.range("insert")); + EXPECT_EQ(Completions.ReplaceRange, A.range("replace")); + + // Replace range stops at '<' (template arguments). + const char *BeforeAngle = + "struct S { template int abcd(); }; void f() { S s; " + "s.$replace[[$insert[[ab^]]cd]](); }"; + Completions = completions(BeforeAngle, /*IndexSymbols=*/{}, Opts); + A = Annotations(BeforeAngle); + EXPECT_EQ(Completions.InsertRange, A.range("insert")); + EXPECT_EQ(Completions.ReplaceRange, A.range("replace")); + + // Replace range stops at ' ' (before '='). 
+ const char *BeforeEquals = + "void f() { int $replace[[$insert[[ab^]]cd]] = 1; }"; + Completions = completions(BeforeEquals, /*IndexSymbols=*/{}, Opts); + A = Annotations(BeforeEquals); + EXPECT_EQ(Completions.InsertRange, A.range("insert")); + EXPECT_EQ(Completions.ReplaceRange, A.range("replace")); + + // EnableInsertReplace off: ReplaceRange should not be set. + Opts.EnableInsertReplace = false; + const char *NoReplace = "auto x = [[abc]]^"; + Completions = completions(NoReplace, /*IndexSymbols=*/{}, Opts); + EXPECT_EQ(Completions.InsertRange, Annotations(NoReplace).range()); + EXPECT_EQ(Completions.ReplaceRange, std::nullopt); +} + +TEST(CompletionTest, ReplaceRangeNoCompile) { + clangd::CodeCompleteOptions Opts; + Opts.EnableInsertReplace = true; + + // Cursor at end of token: insert == replace. + const char *EndOfToken = "auto x = [[abc]]^"; + Annotations A(EndOfToken); + CodeCompleteResult Results = + completionsNoCompile(EndOfToken, /*IndexSymbols=*/{}, Opts); + EXPECT_EQ(Results.InsertRange, A.range()); + EXPECT_EQ(Results.ReplaceRange, A.range()); + + // Cursor mid-word: replace extends past cursor. + const char *MidWord = "auto x = $replace[[$insert[[ab^]]cd]]"; + Results = completionsNoCompile(MidWord, /*IndexSymbols=*/{}, Opts); + A = Annotations(MidWord); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + + // Empty prefix: insert range is empty, replace covers the word. + const char *EmptyPrefix = "auto x = $replace[[$insert[[^]]abcd]]"; + Results = completionsNoCompile(EmptyPrefix, /*IndexSymbols=*/{}, Opts); + A = Annotations(EmptyPrefix); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + + // ASCII heuristic stops at non-ASCII: replace doesn't extend past UTF-8. 
+ const char *MidWordUTF8 = "auto x = $replace[[$insert[[ab^]]]]🙂cd"; + Results = completionsNoCompile(MidWordUTF8, /*IndexSymbols=*/{}, Opts); + A = Annotations(MidWordUTF8); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + + // Replace range stops at '(' (method call). + const char *BeforeParen = "auto x = $replace[[$insert[[ab^]]cd]](123);"; + Results = completionsNoCompile(BeforeParen, /*IndexSymbols=*/{}, Opts); + A = Annotations(BeforeParen); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + + // Replace range stops at '<' (template arguments). + const char *BeforeAngle = "auto x = $replace[[$insert[[ab^]]cd]]();"; + Results = completionsNoCompile(BeforeAngle, /*IndexSymbols=*/{}, Opts); + A = Annotations(BeforeAngle); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + + // Replace range stops at ' ' (before '='). + const char *BeforeEquals = "auto $replace[[$insert[[ab^]]cd]] = 1;"; + Results = completionsNoCompile(BeforeEquals, /*IndexSymbols=*/{}, Opts); + A = Annotations(BeforeEquals); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + + // EnableInsertReplace off: ReplaceRange should not be set. 
+ Opts.EnableInsertReplace = false; + const char *NoReplace = "auto x = [[abc]]^"; + Results = completionsNoCompile(NoReplace, /*IndexSymbols=*/{}, Opts); + EXPECT_EQ(Results.InsertRange, Annotations(NoReplace).range()); + EXPECT_EQ(Results.ReplaceRange, std::nullopt); } TEST(NoCompileCompletionTest, Basic) { @@ -4399,7 +4531,7 @@ TEST(CompletionTest, CommentParamName) { { std::string CompletionRangeTest(Code + "fun(/*[[^]]"); auto Results = completions(CompletionRangeTest); - EXPECT_THAT(Results.CompletionRange, + EXPECT_THAT(Results.InsertRange, llvm::ValueIs(Annotations(CompletionRangeTest).range())); EXPECT_THAT( Results.Completions, @@ -4410,7 +4542,7 @@ TEST(CompletionTest, CommentParamName) { { std::string CompletionRangeTest(Code + "fun(/*[[fo^]]"); auto Results = completions(CompletionRangeTest); - EXPECT_THAT(Results.CompletionRange, + EXPECT_THAT(Results.InsertRange, llvm::ValueIs(Annotations(CompletionRangeTest).range())); EXPECT_THAT( Results.Completions, @@ -4418,6 +4550,44 @@ TEST(CompletionTest, CommentParamName) { AllOf(replacesRange(Annotations(CompletionRangeTest).range()), origin(SymbolOrigin::AST), kind(CompletionItemKind::Text)))); } + + // Test replace ranges (comment completion replaces up to */). + clangd::CodeCompleteOptions ReplaceOpts; + ReplaceOpts.EnableInsertReplace = true; + { + // With */ (no =): replace extends past suffix to */. + const std::string NoEquals(Code + "fun(/*$replace[[$insert[[fo^]]o*/]])"); + const CodeCompleteResult Results = completions(NoEquals, {}, ReplaceOpts); + const Annotations A(NoEquals); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + } + { + // With = and */: replace extends past = to */. 
+ const std::string WithEquals(Code + + "fun(/*$replace[[$insert[[fo^]]o=*/]])"); + const CodeCompleteResult Results = completions(WithEquals, {}, ReplaceOpts); + const Annotations A(WithEquals); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + } + { + // Without */: replace == insert. + const std::string NoClose(Code + "fun(/*[[fo^]]"); + const CodeCompleteResult Results = completions(NoClose, {}, ReplaceOpts); + const Annotations A(NoClose); + EXPECT_EQ(Results.InsertRange, A.range()); + EXPECT_EQ(Results.ReplaceRange, A.range()); + } + { + // With */ and UTF-8 suffix: replace extends past UTF-8 to */. + const std::string WithUTF8(Code + + "fun(/*$replace[[$insert[[fo^]]o🙂=*/]])"); + const CodeCompleteResult Results = completions(WithUTF8, {}, ReplaceOpts); + const Annotations A(WithUTF8); + EXPECT_EQ(Results.InsertRange, A.range("insert")); + EXPECT_EQ(Results.ReplaceRange, A.range("replace")); + } } TEST(CompletionTest, Concepts) { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 51251eacbcd5e..9821c7e74ebf7 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -79,6 +79,10 @@ Code completion - Now also provides include files without extension, if they are in a directory only called ``include``. +- Added support for ``InsertReplaceEdit`` in code completion (LSP 3.16), + allowing clients that advertise ``insertReplaceSupport`` to receive both + insert and replace ranges for completion items. + - Changed completion-style default to ``detailed``. This means function overloads will no longer be bundled together, but instead each have their own completion item. 
This gives the user a better overview of the diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h index 0459a863bc08d..fb65266c8b990 100644 --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -372,6 +372,14 @@ class Lexer : public PreprocessorLexer { const SourceManager &SM, const LangOptions &LangOpts); + /// Finds the end of an identifier-continuation sequence starting at \p Loc. + /// This consumes identifier continuation characters (letters, digits, + /// underscores, dollar signs if enabled, UCNs, and unicode), and returns + /// the source location immediately after the consumed sequence. + static SourceLocation + findEndOfIdentifierContinuation(SourceLocation Loc, const SourceManager &SM, + const LangOptions &LangOpts); + /// Relex the token at the specified location. /// \returns true if there was a failure, false on success. static bool getRawToken(SourceLocation Loc, Token &Result, diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index e85dc6679d508..d33ac2e271a22 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -515,6 +515,29 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc, return TheTok.getLength(); } +SourceLocation Lexer::findEndOfIdentifierContinuation( + SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { + Loc = SM.getExpansionLoc(Loc); + const FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc); + bool Invalid = false; + const StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); + if (Invalid) + return Loc; + + const char *StrData = Buffer.data() + LocInfo.second; + if (StrData >= Buffer.end()) + return Loc; + + // Use the lexer continuation rules directly, without requiring identifier + // start at Loc. 
+  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
+                 Buffer.begin(), StrData, Buffer.end());
+  Token Tok;
+  Tok.startToken();
+  TheLexer.LexIdentifierContinue(Tok, StrData);
+  return Loc.getLocWithOffset(Tok.getLength());
+}
+
 /// Relex the token at the specified location.
 /// \returns true if there was a failure, false on success.
 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index da335d6e81820..52c3522f1ed53 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -850,4 +850,42 @@ TEST_F(LexerTest, CheckFirstPPToken) {
       EXPECT_TRUE(Tok.getRawIdentifier() == "FOO");
     }
   }
+
+TEST_F(LexerTest, FindEndOfIdentifierContinuation) {
+  const auto Measure = [&](const StringRef Code,
+                           const unsigned Offset) -> unsigned {
+    auto Buf = llvm::MemoryBuffer::getMemBuffer(Code);
+    SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
+    const auto Loc = SourceMgr.getLocForStartOfFile(SourceMgr.getMainFileID())
+                         .getLocWithOffset(Offset);
+    const auto End =
+        Lexer::findEndOfIdentifierContinuation(Loc, SourceMgr, LangOpts);
+    const unsigned Length = SourceMgr.getFileOffset(End) - Offset;
+    SourceMgr.clearIDTables();
+    return Length;
+  };
+
+  // ASCII identifiers.
+  EXPECT_EQ(Measure("abcd", 0), 4u); // Full identifier.
+  EXPECT_EQ(Measure("abcd", 1), 3u); // Mid-identifier.
+  EXPECT_EQ(Measure("ab12", 1), 3u); // At digit.
+  EXPECT_EQ(Measure("ab cd", 1), 1u); // At space.
+  EXPECT_EQ(Measure("ab+cd", 1), 1u); // At non-identifier.
+  EXPECT_EQ(Measure("ab(cd)", 1), 1u); // At '('.
+  EXPECT_EQ(Measure("ab<cd>", 1), 1u); // At '<'.
+  EXPECT_EQ(Measure("ab{cd}", 1), 1u); // At '{'.
+  EXPECT_EQ(Measure("ab=cd;", 1), 1u); // At '='.
+
+  // UTF-8 identifier characters.
+  LangOpts.CPlusPlus = true;
+  EXPECT_EQ(Measure("ab🙂cd", 2), 6u); // '🙂' (4 bytes) + "cd".
+  EXPECT_EQ(Measure("🙂cd", 0), 6u); // Starts with '🙂'.
+ + // Dollar sign (requires DollarIdents). + LangOpts.DollarIdents = true; + EXPECT_EQ(Measure("ab$cd", 2), 3u); // '$' is identifier continue. + LangOpts.DollarIdents = false; + EXPECT_EQ(Measure("ab$cd", 2), 0u); // '$' is not identifier continue. +} + } // anonymous namespace From cabe3fac231f716dd117840d330968c5f6261535 Mon Sep 17 00:00:00 2001 From: sstwcw Date: Sun, 10 May 2026 01:18:42 +0000 Subject: [PATCH 159/538] Revert "[clang-format][NFC] Format with the new formatter" (#196771) Reverts llvm/llvm-project#196523 --- clang/lib/Format/BreakableToken.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 98d357c78bb7c..9571a64797a2d 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -680,7 +680,7 @@ const llvm::StringSet<> BreakableBlockComment::ContentIndentingJavadocAnnotations = { "@param", "@return", "@returns", "@throws", "@type", "@template", "@see", "@deprecated", "@define", "@exports", "@mods", "@private", - }; +}; unsigned BreakableBlockComment::getContentIndent(unsigned LineIndex) const { if (!Style.isJava() && !Style.isJavaScript()) From 46ef11b7d5a0f61c1d7ea80f24663229fc4a7635 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 9 May 2026 20:09:23 -0700 Subject: [PATCH 160/538] [ELF] Fix --reproduce non-determinism with parallel input loading (#196773) After #191690, LoadJob::Archive runs in parallel and getArchiveMembers() calls ctx.tar->append() from the parallel body. TarWriter::append is unsynchronized. Member order in the tar is also non-deterministic because parallelFor scheduling determines append order. Buffer per-job tar entries during the parallel pass and flush them in the existing serial post-pass, mirroring the thinBufs / files pattern. 
--- lld/ELF/Config.h | 1 + lld/ELF/Driver.cpp | 9 +++++++-- lld/test/ELF/reproduce-thin-archive.s | 18 +++++++++++++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 3c851bfae92b2..485f5bf657d3e 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -188,6 +188,7 @@ struct LoadJob { uint32_t groupId; SmallVector, 0> out; std::vector> thinBufs; + SmallVector, 0> tarEntries; }; class LinkerDriver { diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 10141c201083b..214cb5678f4d8 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -210,8 +210,8 @@ std::vector> static getArchiveMembers( mb.getBufferIdentifier() + ": could not get the buffer for a child of the archive"); if (addToTar) - ctx.tar->append(relativeToRoot(check(c.getFullName())), - mbref.getBuffer()); + job.tarEntries.emplace_back(relativeToRoot(check(c.getFullName())), + mbref.getBuffer()); v.push_back(std::make_pair(mbref, c.getChildOffset())); } if (err) @@ -247,6 +247,7 @@ void LinkerDriver::addFile(StringRef path, bool withLOption) { /*withLOption=*/false, nextGroupId, {}, + {}, {}}); } else { auto magic = identify_magic(mbref.getBuffer()); @@ -285,6 +286,7 @@ void LinkerDriver::addFile(StringRef path, bool withLOption) { withLOption, nextGroupId, {}, + {}, {}}); } if (!isInGroup) @@ -2202,6 +2204,9 @@ void LinkerDriver::loadFiles() { for (auto &job : loadJobs) { if (job.kind == LoadJob::Archive) archiveFiles.emplace_back(job.path, (unsigned)job.out.size()); + if (ctx.tar) + for (const auto &[path, data] : job.tarEntries) + ctx.tar->append(path, data); files.append(std::make_move_iterator(job.out.begin()), std::make_move_iterator(job.out.end())); ctx.memoryBuffers.append(std::make_move_iterator(job.thinBufs.begin()), diff --git a/lld/test/ELF/reproduce-thin-archive.s b/lld/test/ELF/reproduce-thin-archive.s index 6c33ac22515a4..f3dd2a87edd0d 100644 --- a/lld/test/ELF/reproduce-thin-archive.s +++ 
b/lld/test/ELF/reproduce-thin-archive.s @@ -12,11 +12,23 @@ # CHECK: [[PATH]]/foo.a # CHECK: [[PATH]]/foo.o -# RUN: ld.lld -m elf_x86_64 --whole-archive foo.a -o /dev/null --reproduce repro2.tar +## With multiple thin archives + --threads>1, ensure deterministic member order. +# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o n.o +# RUN: cp n.o b.o && cp n.o c.o && cp n.o d.o +# RUN: llvm-ar --format=gnu rcT b.a b.o +# RUN: llvm-ar --format=gnu rcT c.a c.o +# RUN: llvm-ar --format=gnu rcT d.a d.o +# RUN: ld.lld foo.a b.a --whole-archive c.a d.a --reproduce repro2.tar # RUN: tar tf repro2.tar | FileCheck -DPATH='repro2/%:t.dir' --check-prefix=CHECK2 %s -# CHECK2: [[PATH]]/foo.a -# CHECK2: [[PATH]]/foo.o +# CHECK2: [[PATH]]/foo.a +# CHECK2-NEXT: [[PATH]]/b.a +# CHECK2-NEXT: [[PATH]]/c.a +# CHECK2-NEXT: [[PATH]]/d.a +# CHECK2-NEXT: [[PATH]]/foo.o +# CHECK2-NEXT: [[PATH]]/b.o +# CHECK2-NEXT: [[PATH]]/c.o +# CHECK2-NEXT: [[PATH]]/d.o .globl _start _start: From f87250ec9a88079d35013adc04a828a278e9808e Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Sat, 9 May 2026 20:29:12 -0700 Subject: [PATCH 161/538] [Clang] Make matrix type trivially copyable (#193634) In order to simplify matrix casting and follow the existing pattern HLSL is doing, the matrix needs to be trivially copyable. related to: https://github.com/llvm/llvm-project/issues/184471 --------- Co-authored-by: Joao Saffran --- clang/lib/AST/Type.cpp | 5 +++-- clang/test/CodeGenHLSL/BoolMatrix.hlsl | 8 ++------ clang/test/SemaCXX/type-traits.cpp | 12 ++++++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 06023fc088a32..ca13f8f4fcfee 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2937,8 +2937,9 @@ static bool isTriviallyCopyableTypeImpl(const QualType &type, if (CanonicalType.hasAddressDiscriminatedPointerAuth()) return false; - // As an extension, Clang treats vector types as Scalar types. 
- if (CanonicalType->isScalarType() || CanonicalType->isVectorType()) + // As an extension, Clang treats vector and matrix types as Scalar types. + if (CanonicalType->isScalarType() || CanonicalType->isVectorType() || + CanonicalType->isMatrixType()) return true; // Mfloat8 type is a special case as it not scalar, but is still trivially diff --git a/clang/test/CodeGenHLSL/BoolMatrix.hlsl b/clang/test/CodeGenHLSL/BoolMatrix.hlsl index 0109d6029b5a2..233c56bd20757 100644 --- a/clang/test/CodeGenHLSL/BoolMatrix.hlsl +++ b/clang/test/CodeGenHLSL/BoolMatrix.hlsl @@ -75,9 +75,7 @@ bool fn3() { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 4 // CHECK-NEXT: [[ARR:%.*]] = alloca [2 x [2 x <2 x i32>]], align 4 -// CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[ARR]], align 4 -// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[ARR]], i32 1 -// CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[ARRAYINIT_ELEMENT]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARR]], ptr align 4 @constinit, i32 32, i1 false) // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [2 x <2 x i32>]], ptr [[ARR]], i32 0, i32 0 // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 @@ -129,9 +127,7 @@ void fn6() { // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ARR:%.*]] = alloca [2 x [2 x <2 x i32>]], align 4 -// CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[ARR]], align 4 -// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[ARR]], i32 1 -// CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[ARRAYINIT_ELEMENT]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARR]], ptr align 4 @constinit.1, i32 32, i1 false) // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [2 x <2 x i32>]], ptr [[ARR]], i32 0, i32 0 // 
CHECK-NEXT: [[TMP0:%.*]] = getelementptr <4 x i32>, ptr [[ARRAYIDX]], i32 0, i32 1 // CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4 diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 65c0729571f99..8decb1f61395e 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++11 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted -Wno-c++17-extensions %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++14 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted -Wno-c++17-extensions %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++17 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++20 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++11 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted -fenable-matrix -Wno-c++17-extensions %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++14 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted -fenable-matrix -Wno-c++17-extensions %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++17 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted -fenable-matrix %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=gnu++20 -fblocks -Wno-deprecated-builtins -Wno-defaulted-function-deleted -fenable-matrix %s struct NonPOD { NonPOD(int); }; @@ -45,6 +45,8 @@ struct HasAnonymousUnion { typedef int Vector __attribute__((vector_size(16))); typedef int VectorExt __attribute__((ext_vector_type(4))); +typedef float __attribute__((matrix_type(2, 3))) fm2x3; +typedef 
int __attribute__((matrix_type(4, 4))) im4x4; using ComplexFloat = _Complex float; using ComplexInt = _Complex int; @@ -1359,6 +1361,8 @@ void is_trivially_copyable2() static_assert(__is_trivially_copyable(NonTrivialStruct)); static_assert(__is_trivially_copyable(AllDefaulted)); static_assert(__is_trivially_copyable(AllDeleted)); + static_assert(__is_trivially_copyable(fm2x3)); + static_assert(__is_trivially_copyable(im4x4)); static_assert(!__is_trivially_copyable(void)); static_assert(!__is_trivially_copyable(SuperNonTrivialStruct)); From 525fab579da1504be3f09013f22e9ad499fb5bfc Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 9 May 2026 21:24:49 -0700 Subject: [PATCH 162/538] [ADT] Decouple xxhash.h from ADT. NFC (#196774) Move xxHash64, xxh3_64bits, and xxh3_128bits ArrayRef/StringRef overloads from llvm/Support/xxhash.h to inline overloads in llvm/ADT/ArrayRef.h and llvm/ADT/StringRef.h, so xxhash.h has no ADT dependencies. This is prerequisite for using xxh3 as the combine_bytes backend in llvm/ADT/Hashing.h (#194567), which would otherwise reintroduce a header dependency cycle. FoldingSet.h and StableHashing.h adjust to call the new pointer-and-length entry point. 
--- llvm/benchmarks/xxhash.cpp | 1 + llvm/include/llvm/ADT/ArrayRef.h | 14 ++++++++++++++ llvm/include/llvm/ADT/FoldingSet.h | 4 ++-- llvm/include/llvm/ADT/StableHashing.h | 5 ++--- llvm/include/llvm/ADT/StringRef.h | 12 ++++++++++++ llvm/include/llvm/Support/xxhash.h | 20 ++++++++++---------- llvm/lib/Support/xxhash.cpp | 19 ++++--------------- 7 files changed, 45 insertions(+), 30 deletions(-) diff --git a/llvm/benchmarks/xxhash.cpp b/llvm/benchmarks/xxhash.cpp index 429cbc0fa87d4..ab72ebd649293 100644 --- a/llvm/benchmarks/xxhash.cpp +++ b/llvm/benchmarks/xxhash.cpp @@ -1,5 +1,6 @@ #include "llvm/Support/xxhash.h" #include "benchmark/benchmark.h" +#include "llvm/ADT/ArrayRef.h" #include diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 86d4a6ec0e907..366233fdefd02 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -13,6 +13,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/xxhash.h" #include #include #include @@ -551,6 +552,19 @@ template hash_code hash_value(ArrayRef S) { return hash_combine_range(S); } +/// Inline ArrayRef overloads of the xxhash entry points declared +/// out-of-line in llvm/Support/xxhash.h. They live here so xxhash.h can stay +/// free of ADT dependencies. +inline uint64_t xxHash64(ArrayRef data) { + return xxHash64(data.data(), data.size()); +} +inline uint64_t xxh3_64bits(ArrayRef data) { + return xxh3_64bits(data.data(), data.size()); +} +inline XXH128_hash_t xxh3_128bits(ArrayRef data) { + return xxh3_128bits(data.data(), data.size()); +} + // Provide DenseMapInfo for ArrayRefs. 
template struct DenseMapInfo, void> { static inline ArrayRef getEmptyKey() { diff --git a/llvm/include/llvm/ADT/FoldingSet.h b/llvm/include/llvm/ADT/FoldingSet.h index ab501ad172be6..cde22fe35e390 100644 --- a/llvm/include/llvm/ADT/FoldingSet.h +++ b/llvm/include/llvm/ADT/FoldingSet.h @@ -185,8 +185,8 @@ class FoldingSetNodeIDRef { // Compute a deterministic hash value across processes that is suitable for // on-disk serialization. unsigned computeStableHash() const { - return static_cast(xxh3_64bits(ArrayRef( - reinterpret_cast(Data), sizeof(unsigned) * Size))); + return static_cast(xxh3_64bits( + reinterpret_cast(Data), sizeof(unsigned) * Size)); } LLVM_ABI bool operator==(FoldingSetNodeIDRef) const; diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h index 0dd83be639424..1beb5b85c9967 100644 --- a/llvm/include/llvm/ADT/StableHashing.h +++ b/llvm/include/llvm/ADT/StableHashing.h @@ -30,9 +30,8 @@ namespace llvm { using stable_hash = uint64_t; inline stable_hash stable_hash_combine(ArrayRef Buffer) { - const uint8_t *Ptr = reinterpret_cast(Buffer.data()); - size_t Size = Buffer.size() * sizeof(stable_hash); - return xxh3_64bits(ArrayRef(Ptr, Size)); + return xxh3_64bits(reinterpret_cast(Buffer.data()), + Buffer.size() * sizeof(stable_hash)); } inline stable_hash stable_hash_combine(stable_hash A, stable_hash B) { diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index 71aa7157dfd98..7ed0d6efdbeb3 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -13,6 +13,7 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/xxhash.h" #include #include #include @@ -940,6 +941,17 @@ inline std::string &operator+=(std::string &buffer, StringRef string) { /// Compute a hash_code for a StringRef. 
[[nodiscard]] LLVM_ABI hash_code hash_value(StringRef S); +/// Inline StringRef overloads of the xxhash entry points declared out-of-line +/// in llvm/Support/xxhash.h. They live here so xxhash.h can stay free of ADT +/// dependencies. +inline uint64_t xxHash64(StringRef data) { + return xxHash64(reinterpret_cast(data.data()), data.size()); +} +inline uint64_t xxh3_64bits(StringRef data) { + return xxh3_64bits(reinterpret_cast(data.data()), + data.size()); +} + // Provide DenseMapInfo for StringRefs. template <> struct DenseMapInfo { static inline StringRef getEmptyKey() { diff --git a/llvm/include/llvm/Support/xxhash.h b/llvm/include/llvm/Support/xxhash.h index b521adbef3456..15c4f1bfd4563 100644 --- a/llvm/include/llvm/Support/xxhash.h +++ b/llvm/include/llvm/Support/xxhash.h @@ -38,19 +38,18 @@ #ifndef LLVM_SUPPORT_XXHASH_H #define LLVM_SUPPORT_XXHASH_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" +#include +#include namespace llvm { -LLVM_ABI uint64_t xxHash64(llvm::StringRef Data); -LLVM_ABI uint64_t xxHash64(llvm::ArrayRef Data); +// Deprecated pre-xxh3 64-bit hash. +LLVM_ABI uint64_t xxHash64(const uint8_t *data, size_t len); -LLVM_ABI uint64_t xxh3_64bits(ArrayRef data); -inline uint64_t xxh3_64bits(StringRef data) { - return xxh3_64bits(ArrayRef(data.bytes_begin(), data.size())); -} +/// XXH3's 64-bit variant. Inline ArrayRef and StringRef overloads live in +/// llvm/ADT/ArrayRef.h and llvm/ADT/StringRef.h. +LLVM_ABI uint64_t xxh3_64bits(const uint8_t *data, size_t len); /*-********************************************************************** * XXH3 128-bit variant @@ -72,8 +71,9 @@ struct XXH128_hash_t { } }; -/// XXH3's 128-bit variant. -LLVM_ABI XXH128_hash_t xxh3_128bits(ArrayRef data); +/// XXH3's 128-bit variant. Inline ArrayRef overload lives in +/// llvm/ADT/ArrayRef.h. 
+LLVM_ABI XXH128_hash_t xxh3_128bits(const uint8_t *data, size_t len); } // namespace llvm diff --git a/llvm/lib/Support/xxhash.cpp b/llvm/lib/Support/xxhash.cpp index cdb76d57e2c1d..6997fed7e8336 100644 --- a/llvm/lib/Support/xxhash.cpp +++ b/llvm/lib/Support/xxhash.cpp @@ -100,11 +100,9 @@ static uint64_t XXH64_avalanche(uint64_t hash) { return hash; } -uint64_t llvm::xxHash64(StringRef Data) { - size_t Len = Data.size(); +uint64_t llvm::xxHash64(const uint8_t *P, size_t Len) { uint64_t Seed = 0; - const unsigned char *P = Data.bytes_begin(); - const unsigned char *const BEnd = Data.bytes_end(); + const uint8_t *const BEnd = P + Len; uint64_t H64; if (Len >= 32) { @@ -160,10 +158,6 @@ uint64_t llvm::xxHash64(StringRef Data) { return XXH64_avalanche(H64); } -uint64_t llvm::xxHash64(ArrayRef Data) { - return xxHash64({(const char *)Data.data(), Data.size()}); -} - constexpr size_t XXH3_SECRETSIZE_MIN = 136; constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192; @@ -550,9 +544,7 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len, (uint64_t)len * PRIME64_1); } -uint64_t llvm::xxh3_64bits(ArrayRef data) { - auto *in = data.data(); - size_t len = data.size(); +uint64_t llvm::xxh3_64bits(const uint8_t *in, size_t len) { if (len <= 16) return XXH3_len_0to16_64b(in, len, kSecret, 0); if (len <= 128) @@ -1020,10 +1012,7 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret, return h128; } -llvm::XXH128_hash_t llvm::xxh3_128bits(ArrayRef data) { - size_t len = data.size(); - const uint8_t *input = data.data(); - +llvm::XXH128_hash_t llvm::xxh3_128bits(const uint8_t *input, size_t len) { /* * If an action is to be taken if `secret` conditions are not respected, * it should be done here. 
From 71d78b2220e4dc4b022fd74aec16ed8d93fc419e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 9 May 2026 22:41:13 -0700 Subject: [PATCH 163/538] [Hashing] Replace CityHash mixers with xxh3 (#194567) Replace the CityHash-style mixer in hash_combine and (transitively) hash_value(std::basic_string), hash_value(StringRef), and therefore DenseMap lookups, with a flatten-and-call into xxh3_64bits, a modern hash superior to CityHash. hash_value(int) / hash_value(ptr) keep the existing Murmur-style hash_16_bytes mixer; those are the dominant DenseMap key paths and a fully-inline 16-byte mix beats inlining xxh3's larger 0..16-byte short path. To break dependency cycle: xxHash64, xxh3_64bits, and xxh3_128bits ArrayRef/StringRef overloads move from llvm/Support/xxhash.h to inline overloads in llvm/ADT/ArrayRef.h and llvm/ADT/StringRef.h, so xxhash.h has no ADT dependencies. A variant that inlined xxh3's 0..16-byte fast path at every combine_bytes call site (vs. always calling out-of-line xxh3_64bits) showed no measurable compile-time improvement on the tracker, so combine_bytes is a one-liner over the out-of-line entry point. llvm-compile-time-tracker.com (CTMark, instructions:u) ``` stage1-O0-g -1.76% (sqlite3 -3.78%) stage1-aarch64-O0-g -1.40% (sqlite3 -2.86%) stage1-ReleaseLTO-g -1.13% stage1-ReleaseThinLTO -0.45% stage1-O3 -0.43% stage1-aarch64-O3 -0.42% stage2-O0-g -0.42% stage2-O3 -0.15% clang build -0.71% (wall -0.42%) ``` DenseMap-of-pointer paths (dominant at -O3) are untouched, so higher- optimization configs see smaller wins as expected. opt's .text shrinks ~92 KB. Subsumes the StringRef-only carve-out proposed in #191115. Notes on properties not introduced by this patch: - Endianness: hash_combine over native integers was already not cross-host stable. memcpy of a native integer into the buffer is host-encoded; fetch32 normalized the read but not the underlying bytes, so on LE vs BE the value fed to the mixer already differed. 
xxh3 inherits the same property: same byte stream, different mixer. - Process seed: combine_bytes XORs get_execution_seed into the result, which cancels under hash_combine(x) ^ hash_combine(y). The pre-patch short/state paths fed the seed through hash_16_bytes / shift_mix non-linearly, so this is a regression in seed effectiveness under that pattern. Default seed is constant, so this only matters under LLVM_ENABLE_ABI_BREAKING_CHECKS. Follow-up: add a seeded xxh3 entry point in libSupport. Aided by Claude opus 4.7 --- llvm/include/llvm/ADT/Hashing.h | 405 +++++------------------------ llvm/unittests/ADT/HashingTest.cpp | 19 ++ 2 files changed, 90 insertions(+), 334 deletions(-) diff --git a/llvm/include/llvm/ADT/Hashing.h b/llvm/include/llvm/ADT/Hashing.h index 0f96b857bff16..c3cc37683c79a 100644 --- a/llvm/include/llvm/ADT/Hashing.h +++ b/llvm/include/llvm/ADT/Hashing.h @@ -33,11 +33,10 @@ // a single hash_code for their object. They should only logically be used // within the implementation of a 'hash_value' routine or similar context. // -// Note that 'hash_combine_range' contains very special logic for hashing -// a contiguous array of integers or pointers. This logic is *extremely* fast, -// on a modern Intel "Gainestown" Xeon (Nehalem uarch) @2.2 GHz, these were -// benchmarked at over 6.5 GiB/s for large keys, and <20 cycles/hash for keys -// under 32-bytes. +// 'hash_combine_range' hashes the byte stream of the range via xxh3. The +// contiguous-array overload hashes the range in place; the iterator overload +// materializes the byte stream into a 256-byte on-stack buffer, falling back +// to the heap for ranges that exceed it. 
// //===----------------------------------------------------------------------===// @@ -50,9 +49,12 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/type_traits.h" +#include "llvm/Support/xxhash.h" #include +#include #include #include +#include #include #include #include @@ -136,14 +138,6 @@ template hash_code hash_value(const std::optional &arg); namespace hashing { namespace detail { -inline uint64_t fetch64(const char *p) { - uint64_t result; - std::memcpy(&result, p, sizeof(result)); - if (sys::IsBigEndianHost) - sys::swapByteOrder(result); - return result; -} - inline uint32_t fetch32(const char *p) { uint32_t result; std::memcpy(&result, p, sizeof(result)); @@ -152,22 +146,6 @@ inline uint32_t fetch32(const char *p) { return result; } -/// Some primes between 2^63 and 2^64 for various uses. -static constexpr uint64_t k0 = 0xc3a5c85c97cb3127ULL; -static constexpr uint64_t k1 = 0xb492b66fbe98f273ULL; -static constexpr uint64_t k2 = 0x9ae16a3b2f90404fULL; -static constexpr uint64_t k3 = 0xc949d7c7509e6557ULL; - -/// Bitwise right rotate. -/// Normally this will compile to a single instruction, especially if the -/// shift is a manifest constant. -constexpr uint64_t rotate(uint64_t val, size_t shift) { - // Avoid shifting by 64: doing so yields an undefined result. - return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); -} - -constexpr uint64_t shift_mix(uint64_t val) { return val ^ (val >> 47); } - constexpr uint64_t hash_16_bytes(uint64_t low, uint64_t high) { // Murmur-inspired hashing. 
const uint64_t kMul = 0x9ddfea08eb382d69ULL; @@ -179,134 +157,6 @@ constexpr uint64_t hash_16_bytes(uint64_t low, uint64_t high) { return b; } -constexpr uint64_t hash_1to3_bytes(const char *s, size_t len, uint64_t seed) { - uint8_t a = s[0]; - uint8_t b = s[len >> 1]; - uint8_t c = s[len - 1]; - uint32_t y = static_cast(a) + (static_cast(b) << 8); - uint32_t z = static_cast(len) + (static_cast(c) << 2); - return shift_mix(y * k2 ^ z * k3 ^ seed) * k2; -} - -inline uint64_t hash_4to8_bytes(const char *s, size_t len, uint64_t seed) { - uint64_t a = fetch32(s); - return hash_16_bytes(len + (a << 3), seed ^ fetch32(s + len - 4)); -} - -inline uint64_t hash_9to16_bytes(const char *s, size_t len, uint64_t seed) { - uint64_t a = fetch64(s); - uint64_t b = fetch64(s + len - 8); - return hash_16_bytes(seed ^ a, rotate(b + len, len)) ^ b; -} - -inline uint64_t hash_17to32_bytes(const char *s, size_t len, uint64_t seed) { - uint64_t a = fetch64(s) * k1; - uint64_t b = fetch64(s + 8); - uint64_t c = fetch64(s + len - 8) * k2; - uint64_t d = fetch64(s + len - 16) * k0; - return hash_16_bytes(llvm::rotr(a - b, 43) + - llvm::rotr(c ^ seed, 30) + d, - a + llvm::rotr(b ^ k3, 20) - c + len + seed); -} - -inline uint64_t hash_33to64_bytes(const char *s, size_t len, uint64_t seed) { - uint64_t z = fetch64(s + 24); - uint64_t a = fetch64(s) + (len + fetch64(s + len - 16)) * k0; - uint64_t b = llvm::rotr(a + z, 52); - uint64_t c = llvm::rotr(a, 37); - a += fetch64(s + 8); - c += llvm::rotr(a, 7); - a += fetch64(s + 16); - uint64_t vf = a + z; - uint64_t vs = b + llvm::rotr(a, 31) + c; - a = fetch64(s + 16) + fetch64(s + len - 32); - z = fetch64(s + len - 8); - b = llvm::rotr(a + z, 52); - c = llvm::rotr(a, 37); - a += fetch64(s + len - 24); - c += llvm::rotr(a, 7); - a += fetch64(s + len - 16); - uint64_t wf = a + z; - uint64_t ws = b + llvm::rotr(a, 31) + c; - uint64_t r = shift_mix((vf + ws) * k2 + (wf + vs) * k0); - return shift_mix((seed ^ (r * k0)) + vs) * k2; -} - -inline 
uint64_t hash_short(const char *s, size_t length, uint64_t seed) { - if (length >= 4 && length <= 8) - return hash_4to8_bytes(s, length, seed); - if (length > 8 && length <= 16) - return hash_9to16_bytes(s, length, seed); - if (length > 16 && length <= 32) - return hash_17to32_bytes(s, length, seed); - if (length > 32) - return hash_33to64_bytes(s, length, seed); - if (length != 0) - return hash_1to3_bytes(s, length, seed); - - return k2 ^ seed; -} - -/// The intermediate state used during hashing. -/// Currently, the algorithm for computing hash codes is based on CityHash and -/// keeps 56 bytes of arbitrary state. -struct hash_state { - uint64_t h0 = 0, h1 = 0, h2 = 0, h3 = 0, h4 = 0, h5 = 0, h6 = 0; - - /// Create a new hash_state structure and initialize it based on the - /// seed and the first 64-byte chunk. - /// This effectively performs the initial mix. - static hash_state create(const char *s, uint64_t seed) { - hash_state state = {0, - seed, - hash_16_bytes(seed, k1), - llvm::rotr(seed ^ k1, 49), - seed * k1, - shift_mix(seed), - 0}; - state.h6 = hash_16_bytes(state.h4, state.h5); - state.mix(s); - return state; - } - - /// Mix 32-bytes from the input sequence into the 16-bytes of 'a' - /// and 'b', including whatever is already in 'a' and 'b'. - static void mix_32_bytes(const char *s, uint64_t &a, uint64_t &b) { - a += fetch64(s); - uint64_t c = fetch64(s + 24); - b = llvm::rotr(b + a + c, 21); - uint64_t d = a; - a += fetch64(s + 8) + fetch64(s + 16); - b += llvm::rotr(a, 44) + d; - a += c; - } - - /// Mix in a 64-byte buffer of data. - /// We mix all 64 bytes even when the chunk length is smaller, but we - /// record the actual length. 
- void mix(const char *s) { - h0 = llvm::rotr(h0 + h1 + h3 + fetch64(s + 8), 37) * k1; - h1 = llvm::rotr(h1 + h4 + fetch64(s + 48), 42) * k1; - h0 ^= h6; - h1 += h3 + fetch64(s + 40); - h2 = llvm::rotr(h2 + h5, 33) * k1; - h3 = h4 * k1; - h4 = h0 + h5; - mix_32_bytes(s, h3, h4); - h5 = h2 + h6; - h6 = h1 + fetch64(s + 16); - mix_32_bytes(s + 32, h5, h6); - std::swap(h2, h0); - } - - /// Compute the final 64-bit hash code value based on the current - /// state and the length of bytes hashed. - constexpr uint64_t finalize(size_t length) { - return hash_16_bytes(hash_16_bytes(h3, h5) + shift_mix(h1) * k1 + h2, - hash_16_bytes(h4, h6) + shift_mix(length) * k1 + h0); - } -}; - /// In LLVM_ENABLE_ABI_BREAKING_CHECKS builds, the seed is non-deterministic /// per process (address of a function in LLVMSupport) to prevent having users /// depend on the particular hash values. On platforms without ASLR, this is @@ -320,6 +170,17 @@ inline uint64_t get_execution_seed() { #endif } +/// Hash a contiguous byte buffer to a hash_code. The execution seed is XORed +/// into the result (not propagated through the avalanche), so a given byte +/// stream produces the same xxh3 output modulo the per-process seed. +// +// TODO: post-XOR allows `hash_combine(x) ^ hash_combine(y)` to cancel the +// process seed. Follow-up: add a seeded xxh3 entry in +// llvm/lib/Support/xxhash.cpp. +inline hash_code combine_bytes(const char *data, size_t len) { + return xxh3_64bits(reinterpret_cast(data), len) ^ + get_execution_seed(); +} /// Trait to indicate whether a type's bits can be hashed directly. /// @@ -362,63 +223,43 @@ template auto get_hashable_data(const T &value) { } } -/// Helper to store data from a value into a buffer and advance the -/// pointer into that buffer. -/// -/// This routine first checks whether there is enough space in the provided -/// buffer, and if not immediately returns false. 
If there is space, it -/// copies the underlying bytes of value into the buffer, advances the -/// buffer_ptr past the copied bytes, and returns true. -template -bool store_and_advance(char *&buffer_ptr, char *buffer_end, const T& value, - size_t offset = 0) { - size_t store_size = sizeof(value) - offset; - if (buffer_ptr + store_size > buffer_end) - return false; - const char *value_data = reinterpret_cast(&value); - std::memcpy(buffer_ptr, value_data + offset, store_size); - buffer_ptr += store_size; - return true; -} - /// Implement the combining of integral values into a hash_code. /// /// This overload is selected when the value type of the iterator is /// integral. Rather than computing a hash_code for each object and then /// combining them, this (as an optimization) directly combines the integers. +/// +/// xxh3 has no streaming entry point in libLLVMSupport, so the byte stream is +/// flattened to a buffer and hashed in one shot. The 256-byte on-stack buffer +/// holds 32 pointer-sized values, which covers virtually all in-tree +/// non-contiguous callers. The prior chunked CityHash impl streamed and never +/// allocated. template hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { - const uint64_t seed = get_execution_seed(); - char buffer[64], *buffer_ptr = buffer; - char *const buffer_end = std::end(buffer); - while (first != last && store_and_advance(buffer_ptr, buffer_end, - get_hashable_data(*first))) - ++first; - if (first == last) - return hash_short(buffer, buffer_ptr - buffer, seed); - assert(buffer_ptr == buffer_end); - - hash_state state = state.create(buffer, seed); - size_t length = 64; - while (first != last) { - // Fill up the buffer. We don't clear it, which re-mixes the last round - // when only a partial 64-byte chunk is left. 
- buffer_ptr = buffer; - while (first != last && store_and_advance(buffer_ptr, buffer_end, - get_hashable_data(*first))) - ++first; - - // Rotate the buffer if we did a partial fill in order to simulate doing - // a mix of the last 64-bytes. That is how the algorithm works when we - // have a contiguous byte sequence, and we want to emulate that here. - std::rotate(buffer, buffer_ptr, buffer_end); - - // Mix this chunk into the current state. - state.mix(buffer); - length += buffer_ptr - buffer; - }; - - return state.finalize(length); + alignas(uint64_t) char stack_buf[256]; + std::unique_ptr heap_buf; + char *buf = stack_buf; + size_t cap = sizeof(stack_buf); + size_t len = 0; + for (; first != last; ++first) { + auto data = get_hashable_data(*first); + if (len + sizeof(data) > cap) { + size_t new_cap = cap * 2; + while (new_cap < len + sizeof(data)) + new_cap *= 2; + // `new char[]` default-initializes (no zero-fill); make_unique would + // value-initialize, which is wasted work for a buffer about to be + // overwritten. + std::unique_ptr new_buf(new char[new_cap]); + std::memcpy(new_buf.get(), buf, len); + heap_buf = std::move(new_buf); + buf = heap_buf.get(); + cap = new_cap; + } + std::memcpy(buf + len, &data, sizeof(data)); + len += sizeof(data); + } + return combine_bytes(buf, len); } /// Implement the combining of integral values into a hash_code. 
@@ -432,24 +273,22 @@ hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { template std::enable_if_t::value, hash_code> hash_combine_range_impl(ValueT *first, ValueT *last) { - const uint64_t seed = get_execution_seed(); - const char *s_begin = reinterpret_cast(first); - const char *s_end = reinterpret_cast(last); - const size_t length = std::distance(s_begin, s_end); - if (length <= 64) - return hash_short(s_begin, length, seed); - - const char *s_aligned_end = s_begin + (length & ~63); - hash_state state = state.create(s_begin, seed); - s_begin += 64; - while (s_begin != s_aligned_end) { - state.mix(s_begin); - s_begin += 64; - } - if (length & 63) - state.mix(s_end - 64); + return combine_bytes(reinterpret_cast(first), + size_t(last - first) * sizeof(ValueT)); +} + +/// Sum of `sizeof(get_hashable_data(arg))` across a parameter pack. +template constexpr size_t total_hashable_size() { + return (size_t(0) + ... + + sizeof(decltype(get_hashable_data(std::declval())))); +} - return state.finalize(length); +/// Copy `get_hashable_data(arg)` into `buf` at offset `off`, advancing `off`. +template +inline void store_hashable_data(char *buf, size_t &off, const T &arg) { + auto data = get_hashable_data(arg); + std::memcpy(buf + off, &data, sizeof(data)); + off += sizeof(data); } } // namespace detail @@ -472,112 +311,6 @@ template hash_code hash_combine_range(RangeT &&R) { return hash_combine_range(adl_begin(R), adl_end(R)); } -// Implementation details for hash_combine. -namespace hashing { -namespace detail { - -/// Helper class to manage the recursive combining of hash_combine -/// arguments. -/// -/// This class exists to manage the state and various calls involved in the -/// recursive combining of arguments used in hash_combine. It is particularly -/// useful at minimizing the code in the recursive calls to ease the pain -/// caused by a lack of variadic functions. 
-struct hash_combine_recursive_helper { - char buffer[64] = {}; - hash_state state; - const uint64_t seed; - -public: - /// Construct a recursive hash combining helper. - /// - /// This sets up the state for a recursive hash combine, including getting - /// the seed and buffer setup. - hash_combine_recursive_helper() - : seed(get_execution_seed()) {} - - /// Combine one chunk of data into the current in-flight hash. - /// - /// This merges one chunk of data into the hash. First it tries to buffer - /// the data. If the buffer is full, it hashes the buffer into its - /// hash_state, empties it, and then merges the new chunk in. This also - /// handles cases where the data straddles the end of the buffer. - template - char *combine_data(size_t &length, char *buffer_ptr, char *buffer_end, T data) { - if (!store_and_advance(buffer_ptr, buffer_end, data)) { - // Check for skew which prevents the buffer from being packed, and do - // a partial store into the buffer to fill it. This is only a concern - // with the variadic combine because that formation can have varying - // argument types. - size_t partial_store_size = buffer_end - buffer_ptr; - std::memcpy(buffer_ptr, &data, partial_store_size); - - // If the store fails, our buffer is full and ready to hash. We have to - // either initialize the hash state (on the first full buffer) or mix - // this buffer into the existing hash state. Length tracks the *hashed* - // length, not the buffered length. - if (length == 0) { - state = state.create(buffer, seed); - length = 64; - } else { - // Mix this chunk into the current state and bump length up by 64. - state.mix(buffer); - length += 64; - } - // Reset the buffer_ptr to the head of the buffer for the next chunk of - // data. - buffer_ptr = buffer; - - // Try again to store into the buffer -- this cannot fail as we only - // store types smaller than the buffer. 
- if (!store_and_advance(buffer_ptr, buffer_end, data, - partial_store_size)) - llvm_unreachable("buffer smaller than stored type"); - } - return buffer_ptr; - } - - /// Recursive, variadic combining method. - /// - /// This function recurses through each argument, combining that argument - /// into a single hash. - template - hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, - const T &arg, const Ts &...args) { - buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg)); - - // Recurse to the next argument. - return combine(length, buffer_ptr, buffer_end, args...); - } - - /// Base case for recursive, variadic combining. - /// - /// The base case when combining arguments recursively is reached when all - /// arguments have been handled. It flushes the remaining buffer and - /// constructs a hash_code. - hash_code combine(size_t length, char *buffer_ptr, char *buffer_end) { - // Check whether the entire set of values fit in the buffer. If so, we'll - // use the optimized short hashing routine and skip state entirely. - if (length == 0) - return hash_short(buffer, buffer_ptr - buffer, seed); - - // Mix the final buffer, rotating it if we did a partial fill in order to - // simulate doing a mix of the last 64-bytes. That is how the algorithm - // works when we have a contiguous byte sequence, and we want to emulate - // that here. - std::rotate(buffer, buffer_ptr, buffer_end); - - // Mix this chunk into the current state. - state.mix(buffer); - length += buffer_ptr - buffer; - - return state.finalize(length); - } -}; - -} // namespace detail -} // namespace hashing - /// Combine values into a single hash_code. /// /// This routine accepts a varying number of arguments of any type. It will @@ -589,10 +322,14 @@ struct hash_combine_recursive_helper { /// The result is suitable for returning from a user's hash_value /// *implementation* for their user-defined type. 
Consumers of a type should /// *not* call this routine, they should instead call 'hash_value'. -template hash_code hash_combine(const Ts &...args) { - // Recursively hash each argument using a helper class. - ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(0, helper.buffer, helper.buffer + 64, args...); +template hash_code hash_combine(const Ts &...args) { + constexpr size_t Total = hashing::detail::total_hashable_size(); + // Round up so `data()` is non-null when Total == 0; combine_bytes won't + // read the buffer in that case (len=0 short-circuits in xxh3_64bits). + std::array(1, Total)> buf; + size_t off = 0; + (hashing::detail::store_hashable_data(buf.data(), off, args), ...); + return hashing::detail::combine_bytes(buf.data(), Total); } // Implementation details for implementations of hash_value overloads provided diff --git a/llvm/unittests/ADT/HashingTest.cpp b/llvm/unittests/ADT/HashingTest.cpp index e116ee934a0a4..b6daf8f4695f8 100644 --- a/llvm/unittests/ADT/HashingTest.cpp +++ b/llvm/unittests/ADT/HashingTest.cpp @@ -202,6 +202,25 @@ TEST(HashingTest, HashCombineRangeBasicTest) { EXPECT_EQ(arr5_hash, d_arr5_hash); } +TEST(HashingTest, HashCombineRangeIteratorOverInlineBuffer) { + // Drive the non-pointer iterator overload past the inline stack buffer + // (>256 bytes of hashable data) into the heap-grow path. + constexpr size_t N = 100; // 100 * sizeof(size_t) = 800 bytes + std::vector v(N); + for (size_t i = 0; i < N; ++i) + v[i] = i * 0x9E3779B97F4A7C15ULL; + std::list l(v.begin(), v.end()); + + // Iterator and pointer paths see the same byte stream and agree past the + // inline buffer threshold. 
+  EXPECT_EQ(hash_combine_range(v), hash_combine_range(l));
+  EXPECT_EQ(hash_combine_range(l), hash_combine_range(l));
+
+  std::list l2 = l;
+  l2.push_back(0xDEADBEEFu);
+  EXPECT_NE(hash_combine_range(l), hash_combine_range(l2));
+}
+
 TEST(HashingTest, HashCombineRangeLengthDiff) {
   // Test that as only the length varies, we compute different hash codes for
   // sequences.

From 29e545d3ea7c1a39fa4929e339609dadf1f4215b Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sat, 9 May 2026 22:55:38 -0700
Subject: [PATCH 164/538] [MC] Remove deprecated lookupTarget overload (#196778)

This has been deprecated for a while and was slated for removal after the
branching of LLVM 22. Remove it since I'm on the Google integrate rotation
this week and can take care of any failures on our end.

---
 llvm/include/llvm/MC/TargetRegistry.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h
index f7ecc71132e94..b111342aec140 100644
--- a/llvm/include/llvm/MC/TargetRegistry.h
+++ b/llvm/include/llvm/MC/TargetRegistry.h
@@ -723,17 +723,6 @@ struct TargetRegistry {
 
   LLVM_ABI static iterator_range targets();
 
-  /// lookupTarget - Lookup a target based on a target triple.
-  ///
-  /// \param TripleStr - The triple to use for finding a target.
-  /// \param Error - On failure, an error string describing why no target was
-  /// found.
-  // TODO(boomanaiden154): Remove this function after LLVM 22 branches.
-  [[deprecated("Use overload accepting Triple instead")]]
-  static const Target *lookupTarget(StringRef TripleStr, std::string &Error) {
-    return lookupTarget(Triple(TripleStr), Error);
-  }
-
   /// lookupTarget - Lookup a target based on a target triple.
   ///
   /// \param Triple - The triple to use for finding a target.
From a1d21ccf78e7893134c8c8cccd78e7d7d545a9c3 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 9 May 2026 23:08:49 -0700 Subject: [PATCH 165/538] [libc] Add barebones dl_iterate_phdr implementation (#194196) Add a basic dl_iterate_phdr implementation so that we can get libunwind building. This implementation is bare and not fully compliant with the man page for fully static binaries (which are all that we support currently with the lack of a dynamic linker) due to the lack of TLS info, but that can be added at a future date if it is needed, as it is not needed by libunwind. Add some very basic smoke tests. --- libc/config/linux/aarch64/entrypoints.txt | 3 ++ libc/config/linux/riscv/entrypoints.txt | 3 ++ libc/config/linux/x86_64/entrypoints.txt | 3 ++ libc/src/link/CMakeLists.txt | 2 + libc/src/link/dl_iterate_phdr.cpp | 57 +++++++++++++++++++-- libc/test/src/CMakeLists.txt | 1 + libc/test/src/link/CMakeLists.txt | 11 ++++ libc/test/src/link/dl_iterate_phdr_test.cpp | 42 +++++++++++++++ 8 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 libc/test/src/link/CMakeLists.txt create mode 100644 libc/test/src/link/dl_iterate_phdr_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index a0fb4663c1e54..643bba2aae694 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -1243,6 +1243,9 @@ if(LLVM_LIBC_FULL_BUILD) # sys/select.h entrypoints libc.src.sys.select.select + # link.h entrypoints + libc.src.link.dl_iterate_phdr + # wchar.h entrypoints # libc.src.wchar.fgetwc # libc.src.wchar.fgetws diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f67984d4f6484..bbb7aca7f39b6 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1377,6 +1377,9 @@ if(LLVM_LIBC_FULL_BUILD) # sys/select.h entrypoints libc.src.sys.select.select + # link.h entrypoints + 
libc.src.link.dl_iterate_phdr + # wchar.h entrypoints # libc.src.wchar.fgetwc # libc.src.wchar.fgetws diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b6247c1172150..2a0e43744ec0d 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1478,6 +1478,9 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.nl_types.catopen libc.src.nl_types.catclose libc.src.nl_types.catgets + + # link.h entrypoints + libc.src.link.dl_iterate_phdr ) endif() diff --git a/libc/src/link/CMakeLists.txt b/libc/src/link/CMakeLists.txt index 55f5edfab7d93..ae2b1df6ea71a 100644 --- a/libc/src/link/CMakeLists.txt +++ b/libc/src/link/CMakeLists.txt @@ -6,4 +6,6 @@ add_entrypoint_object( dl_iterate_phdr.h DEPENDS libc.hdr.stdint_proxy + libc.src.__support.CPP.span + libc.src.__support.OSUtil.linux.auxv ) diff --git a/libc/src/link/dl_iterate_phdr.cpp b/libc/src/link/dl_iterate_phdr.cpp index 7964411598d4a..166f50fdfee86 100644 --- a/libc/src/link/dl_iterate_phdr.cpp +++ b/libc/src/link/dl_iterate_phdr.cpp @@ -5,21 +5,68 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// +/// +/// \file +/// The dl_iterate_phdr implementation. +/// +//===----------------------------------------------------------------------===/ #include "dl_iterate_phdr.h" +#include "llvm-libc-macros/link-macros.h" +#include "src/__support/CPP/span.h" +#include "src/__support/OSUtil/linux/auxv.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include + +extern "C" void *__ehdr_start; + namespace LIBC_NAMESPACE_DECL { +struct dl_phdr_info create_executable_info(ElfW(Ehdr) * executable_header) { + // TODO: Calculate dlpi_addr in the PIE case and set dlpi_name for VDSO. 
+ struct dl_phdr_info to_return; + to_return.dlpi_addr = 0; + to_return.dlpi_name = ""; + to_return.dlpi_phdr = reinterpret_cast( + reinterpret_cast(executable_header) + + executable_header->e_phoff); + to_return.dlpi_phnum = executable_header->e_phnum; + to_return.dlpi_adds = 0; + to_return.dlpi_subs = 0; + to_return.dlpi_tls_modid = 0; + to_return.dlpi_tls_data = nullptr; + return to_return; +} + LLVM_LIBC_FUNCTION(int, dl_iterate_phdr, (__dl_iterate_phdr_callback_t callback, void *arg)) { - // FIXME: For pure static linking, this can report just the executable with - // info from __ehdr_start or AT_{PHDR,PHNUM} decoding, and its PT_TLS; and it - // could report the vDSO. - (void)callback, (void)arg; - return 0; + ElfW(Ehdr) *executable_header = reinterpret_cast(&__ehdr_start); + struct dl_phdr_info executable_info = + create_executable_info(executable_header); + int executable_return_code = + callback(&executable_info, sizeof(executable_info), arg); + if (executable_return_code != 0) + return executable_return_code; + + cpp::optional vdso_start_address = auxv::get(AT_SYSINFO_EHDR); + if (!vdso_start_address) + return 0; + ElfW(Ehdr) *vdso_header = reinterpret_cast(*vdso_start_address); + if (vdso_header == nullptr) + return 0; + struct dl_phdr_info vdso_info = create_executable_info(vdso_header); + for (auto elf_headers : + cpp::span(vdso_info.dlpi_phdr, vdso_info.dlpi_phnum)) { + if (elf_headers.p_type == PT_LOAD) { + vdso_info.dlpi_addr = + reinterpret_cast(vdso_header) - elf_headers.p_vaddr; + break; + } + } + return callback(&vdso_info, sizeof(vdso_info), arg); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 7bdafc6a85706..b877c7455fc34 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -62,6 +62,7 @@ add_subdirectory(complex) add_subdirectory(ctype) add_subdirectory(errno) add_subdirectory(fenv) +add_subdirectory(link) add_subdirectory(math) 
add_subdirectory(search) add_subdirectory(setjmp) diff --git a/libc/test/src/link/CMakeLists.txt b/libc/test/src/link/CMakeLists.txt new file mode 100644 index 0000000000000..efae9cf7e3d9b --- /dev/null +++ b/libc/test/src/link/CMakeLists.txt @@ -0,0 +1,11 @@ +add_custom_target(libc_link_unittests) + +add_libc_unittest( + dl_iterate_phdr_test + SUITE + libc_link_unittests + SRCS + dl_iterate_phdr_test.cpp + DEPENDS + libc.src.link.dl_iterate_phdr +) diff --git a/libc/test/src/link/dl_iterate_phdr_test.cpp b/libc/test/src/link/dl_iterate_phdr_test.cpp new file mode 100644 index 0000000000000..c3c8a5e89663c --- /dev/null +++ b/libc/test/src/link/dl_iterate_phdr_test.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===/ +/// +/// \file +/// Tests for the dl_iterate_phdr implementation. 
+/// +//===----------------------------------------------------------------------===/ + +#include "hdr/types/size_t.h" +#include "src/link/dl_iterate_phdr.h" +#include "test/UnitTest/Test.h" + +int save_return_1(struct dl_phdr_info *info, [[maybe_unused]] size_t info_size, + void *arg) { + *static_cast(arg) = info->dlpi_phnum; + return 1; +} + +TEST(LlvmLibcLinkDlIteratePhdrTest, OnlyExecutable) { + int program_header_count = 0; + EXPECT_EQ( + LIBC_NAMESPACE::dl_iterate_phdr(save_return_1, &program_header_count), 1); + EXPECT_GT(program_header_count, 0); +} + +int save_return_0(struct dl_phdr_info *info, [[maybe_unused]] size_t info_size, + void *arg) { + *static_cast(arg) = info->dlpi_phnum; + return 0; +} + +TEST(LlvmLibcLinkDlIteratePhdrTest, BothExecutableAndVDSO) { + int program_header_count = 0; + EXPECT_EQ( + LIBC_NAMESPACE::dl_iterate_phdr(save_return_0, &program_header_count), 0); + EXPECT_GT(program_header_count, 0); +} From 305469117a2daad5582d7d90f0f4f029ea3aa3a8 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 9 May 2026 23:38:50 -0700 Subject: [PATCH 166/538] [ADT] Remove xxHash64 ArrayRef/StringRef overloads. NFC (#196781) xxHash64 is a legacy, pre-XXH3 hash whose only non-test caller in the monorepo is llvm::getKCFITypeID. #196774 accidentally exposed the API. --- llvm/include/llvm/ADT/ArrayRef.h | 3 --- llvm/include/llvm/ADT/StringRef.h | 3 --- llvm/lib/Support/Hash.cpp | 4 +++- llvm/unittests/Support/xxhashTest.cpp | 12 ++++++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 366233fdefd02..cf2c6d85dc272 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -555,9 +555,6 @@ template hash_code hash_value(ArrayRef S) { /// Inline ArrayRef overloads of the xxhash entry points declared /// out-of-line in llvm/Support/xxhash.h. They live here so xxhash.h can stay /// free of ADT dependencies. 
-inline uint64_t xxHash64(ArrayRef data) { - return xxHash64(data.data(), data.size()); -} inline uint64_t xxh3_64bits(ArrayRef data) { return xxh3_64bits(data.data(), data.size()); } diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index 7ed0d6efdbeb3..5421224e6a1d3 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -944,9 +944,6 @@ inline std::string &operator+=(std::string &buffer, StringRef string) { /// Inline StringRef overloads of the xxhash entry points declared out-of-line /// in llvm/Support/xxhash.h. They live here so xxhash.h can stay free of ADT /// dependencies. -inline uint64_t xxHash64(StringRef data) { - return xxHash64(reinterpret_cast(data.data()), data.size()); -} inline uint64_t xxh3_64bits(StringRef data) { return xxh3_64bits(reinterpret_cast(data.data()), data.size()); diff --git a/llvm/lib/Support/Hash.cpp b/llvm/lib/Support/Hash.cpp index d241a8a4cc753..6b5d000ee27c9 100644 --- a/llvm/lib/Support/Hash.cpp +++ b/llvm/lib/Support/Hash.cpp @@ -38,7 +38,9 @@ uint32_t llvm::getKCFITypeID(StringRef MangledTypeName, switch (Algorithm) { case KCFIHashAlgorithm::xxHash64: // Use lower 32 bits of xxHash64 - return static_cast(xxHash64(MangledTypeName)); + return static_cast( + xxHash64(reinterpret_cast(MangledTypeName.data()), + MangledTypeName.size())); case KCFIHashAlgorithm::FNV1a: // FNV-1a hash (32-bit) uint32_t Hash = 2166136261u; // FNV offset basis diff --git a/llvm/unittests/Support/xxhashTest.cpp b/llvm/unittests/Support/xxhashTest.cpp index 6764efe8e5415..9f91fc79e1f62 100644 --- a/llvm/unittests/Support/xxhashTest.cpp +++ b/llvm/unittests/Support/xxhashTest.cpp @@ -33,11 +33,15 @@ static void fillTestBuffer(uint8_t *buffer, size_t len) { } TEST(xxhashTest, Basic) { - EXPECT_EQ(0xef46db3751d8e999U, xxHash64(StringRef())); - EXPECT_EQ(0x33bf00a859c4ba3fU, xxHash64("foo")); - EXPECT_EQ(0x48a37c90ad27a659U, xxHash64("bar")); + EXPECT_EQ(0xef46db3751d8e999U, 
xxHash64(nullptr, 0)); + EXPECT_EQ(0x33bf00a859c4ba3fU, + xxHash64(reinterpret_cast("foo"), 3)); + EXPECT_EQ(0x48a37c90ad27a659U, + xxHash64(reinterpret_cast("bar"), 3)); EXPECT_EQ(0x69196c1b3af0bff9U, - xxHash64("0123456789abcdefghijklmnopqrstuvwxyz")); + xxHash64(reinterpret_cast( + "0123456789abcdefghijklmnopqrstuvwxyz"), + 36)); } TEST(xxhashTest, xxh3) { From 7e2821e025f8ee4add31693ddf462947d7618016 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Sun, 10 May 2026 15:55:16 +0800 Subject: [PATCH 167/538] [Clang] Transform lambda's constraints when instantiating parameter mapping (#195995) This way we can remove a few workarounds of lambda expressions where outer template arguments of concepts have to be preserved through ImplicitConceptSpecializationDecls. Fixes #193944 --- clang/lib/Parse/ParseTemplate.cpp | 6 ---- clang/lib/Sema/SemaConcept.cpp | 30 +++++----------- clang/lib/Sema/SemaTemplate.cpp | 2 +- clang/lib/Sema/SemaTemplateDeduction.cpp | 14 -------- clang/lib/Sema/SemaTemplateInstantiate.cpp | 36 ++++++++++++++++--- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 7 ++++ clang/lib/Sema/TreeTransform.h | 14 ++++++-- clang/test/SemaTemplate/concepts-lambda.cpp | 36 +++++++++++++++++-- 8 files changed, 93 insertions(+), 52 deletions(-) diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 330a9c6aea0c5..dbc7cbc6cdc0c 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -533,12 +533,6 @@ bool Parser::isTypeConstraintAnnotation() { bool Parser::TryAnnotateTypeConstraint() { if (!getLangOpts().CPlusPlus20) return false; - // The type constraint may declare template parameters, notably - // if it contains a generic lambda, so we need to increment - // the template depth as these parameters would not be instantiated - // at the current depth. 
- TemplateParameterDepthRAII CurTemplateDepthTracker(TemplateParameterDepth); - ++CurTemplateDepthTracker; CXXScopeSpec SS; bool WasScopeAnnotation = Tok.is(tok::annot_cxxscope); if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index e07d8832131d6..bb3de89c42ccc 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -482,11 +482,6 @@ class ConstraintSatisfactionChecker { ConstraintSatisfaction &Satisfaction; bool BuildExpression; - // The most closest concept declaration when evaluating atomic constriants. - // This is to make sure that lambdas in the atomic expression live in the - // right context. - ConceptDecl *ParentConcept = nullptr; - // This is for TemplateInstantiator to not instantiate the same template // parameter mapping many times, in order to improve substitution performance. llvm::DenseMap @@ -730,20 +725,6 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( return ExprEmpty(); } - // Note that generic lambdas inside requires body require a lambda context - // decl from which to fetch correct template arguments. But we don't have any - // proper decls because the constraints are already normalized. - if (ParentConcept) { - // FIXME: the evaluation context should learn to track template arguments - // separately from a Decl. 
- EvaluationContext.emplace( - S, Sema::ExpressionEvaluationContext::ConstantEvaluated, - /*LambdaContextDecl=*/ - ImplicitConceptSpecializationDecl::Create( - S.Context, ParentConcept->getDeclContext(), - ParentConcept->getBeginLoc(), SubstitutedOutermost)); - } - Sema::ArgPackSubstIndexRAII SubstIndex(S, PackSubstitutionIndex); ExprResult SubstitutedAtomicExpr = EvaluateAtomicConstraint( Constraint.getConstraintExpr(), *SubstitutedArgs); @@ -1052,9 +1033,6 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( if (InstTemplate.isInvalid()) return ExprError(); - llvm::SaveAndRestore PushConceptDecl( - ParentConcept, cast(ConceptId->getNamedConcept())); - unsigned Size = Satisfaction.Details.size(); ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL); @@ -2312,6 +2290,14 @@ bool SubstituteParameterMappings::substitute(NormalizedConstraint &N) { } assert(!ArgsAsWritten); const ConceptSpecializationExpr *CSE = CC.getConceptSpecializationExpr(); + // Make sure that lambdas within template arguments live in a + // dependent context such that they are assured to be transformed during + // constraint evaluation. 
+ EnterExpressionEvaluationContext EECtx( + SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated, + /*LambdaContextDecl=*/ + const_cast( + CSE->getSpecializationDecl())); SmallVector InnerArgs(CSE->getTemplateArguments()); ConceptDecl *Concept = CSE->getNamedConcept(); if (RemovePacksForFoldExpr) { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 174f42caac506..71c2928b22d53 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4932,7 +4932,7 @@ ExprResult Sema::CheckConceptTemplateId( LocalInstantiationScope Scope(*this); EnterExpressionEvaluationContext EECtx{ - *this, ExpressionEvaluationContext::Unevaluated, CSD}; + *this, ExpressionEvaluationContext::Unevaluated}; Error = CheckConstraintSatisfaction( NamedConcept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL, diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index defdd9ca6968a..ac6dc5bcefb7e 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -5159,20 +5159,6 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, return true; MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.SugaredConverted, /*Final=*/true); - // Build up an EvaluationContext with an ImplicitConceptSpecializationDecl so - // that the template arguments of the constraint can be preserved. For - // example: - // - // template - // concept C = []() { return true; }(); - // - // We need the argument for T while evaluating type constraint D in - // building the CallExpr to the lambda. 
- EnterExpressionEvaluationContext EECtx( - S, Sema::ExpressionEvaluationContext::Unevaluated, - ImplicitConceptSpecializationDecl::Create( - S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(), - CTAI.SugaredConverted)); if (S.CheckConstraintSatisfaction( Concept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL, TypeLoc.getLocalSourceRange(), Satisfaction)) diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index c0a2c9f747bc4..7a039b68c0ffb 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1763,9 +1763,24 @@ namespace { if (TA.isDependent()) return CXXRecordDecl::LambdaDependencyKind::LDK_AlwaysDependent; } + if (auto *CD = dyn_cast_if_present( + LSI->Lambda->getLambdaContextDecl())) { + if (llvm::any_of(CD->getTemplateArguments(), + [](const auto &TA) { return TA.isDependent(); })) + return CXXRecordDecl::LambdaDependencyKind::LDK_AlwaysDependent; + } return inherited::ComputeLambdaDependency(LSI); } + ExprResult TransformConstraint(Expr *AC) { + // We don't want the template argument substitution into parameter + // mappings to preserve the outer depths. + if (AC && SemaRef.inConstraintSubstitution()) + return TransformExpr(const_cast(AC)); + + return AC; + } + ExprResult TransformLambdaExpr(LambdaExpr *E) { // Do not rebuild lambdas to avoid creating a new type. // Lambdas have already been processed inside their eval contexts. @@ -1876,11 +1891,24 @@ namespace { TemplateParameterList *OrigTPL) { if (!OrigTPL || !OrigTPL->size()) return OrigTPL; + std::optional OldMLTAL; + // We need to preserve the lambda depth in parameter mapping. + // Otherwise the template argument deduction would fail, if we reduced the + // depth too early. 
+ if (SemaRef.inParameterMappingSubstitution() && + OrigTPL->getDepth() >= TemplateArgs.getNumSubstitutedLevels()) + OldMLTAL = ForgetSubstitution(); + DeclContext *Owner = OrigTPL->getParam(0)->getDeclContext(); - TemplateDeclInstantiator DeclInstantiator(getSema(), - /* DeclContext *Owner */ Owner, TemplateArgs); - DeclInstantiator.setEvaluateConstraints(EvaluateConstraints); - return DeclInstantiator.SubstTemplateParams(OrigTPL); + TemplateDeclInstantiator DeclInstantiator(getSema(), Owner, TemplateArgs); + // We don't want the template argument substitution into parameter + // mappings to preserve the outer depths. + DeclInstantiator.setEvaluateConstraints( + SemaRef.inConstraintSubstitution() || EvaluateConstraints); + auto *Transformed = DeclInstantiator.SubstTemplateParams(OrigTPL); + if (OldMLTAL) + RememberSubstitution(std::move(*OldMLTAL)); + return Transformed; } concepts::TypeRequirement * diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c9bc613a7c4ea..ce0e390e371e1 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -4862,6 +4862,13 @@ TemplateDeclInstantiator::SubstTemplateParams(TemplateParameterList *L) { return nullptr; Expr *InstRequiresClause = L->getRequiresClause(); + if (InstRequiresClause && EvaluateConstraints) { + ExprResult E = + SemaRef.SubstConstraintExpr(InstRequiresClause, TemplateArgs); + if (E.isInvalid()) + return nullptr; + InstRequiresClause = E.get(); + } TemplateParameterList *InstL = TemplateParameterList::Create(SemaRef.Context, L->getTemplateLoc(), diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 40187f71231bd..444795c3b67b9 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -836,6 +836,8 @@ class TreeTransform { LSI->Lambda->getLambdaDependencyKind()); } + ExprResult TransformConstraint(Expr *AC) { return AC; } + QualType 
TransformReferenceType(TypeLocBuilder &TLB, ReferenceTypeLoc TL); StmtResult TransformCompoundStmt(CompoundStmt *S, bool IsStmtExpr); @@ -16066,8 +16068,16 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { assert(FPTL && "Not a FunctionProtoType?"); AssociatedConstraint TRC = E->getCallOperator()->getTrailingRequiresClause(); - // If the concept refers to any outer parameter packs, we track the SubstIndex - // for evaluation. + if (TRC) { + ExprResult E = getDerived().TransformConstraint( + const_cast(TRC.ConstraintExpr)); + if (E.isInvalid()) + return E; + TRC.ConstraintExpr = E.get(); + } + // If the concept refers to any outer parameter packs, we track the + // SubstIndex for evaluation. + // FIXME: This seems unnecessary after transforming lambda constraints. if (TRC && TRC.ConstraintExpr->containsUnexpandedParameterPack() && !TRC.ArgPackSubstIndex) TRC.ArgPackSubstIndex = SemaRef.ArgPackSubstIndex; diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp index ddee39b162c63..a583589340bd0 100644 --- a/clang/test/SemaTemplate/concepts-lambda.cpp +++ b/clang/test/SemaTemplate/concepts-lambda.cpp @@ -56,9 +56,7 @@ namespace GH57971 { function_ptr ptr = f; } -// GH58368: A lambda defined in a concept requires we store -// the concept as a part of the lambda context. -namespace LambdaInConcept { +namespace GH58368 { using size_t = unsigned long; template @@ -367,3 +365,35 @@ void test() { f<42>(); } } + +namespace GH193944 { + +template +concept pass_a_concept_inside_a_lambda = requires { L.template operator()(); }; // #requires_pass_a_concept_inside_a_lambda + +template +concept PredicateFor_bad = pass_a_concept_inside_a_lambda<[] // #pass_a_concept_inside_a_lambda + requires(__is_same(decltype(Pred.template operator()()), bool) and ...) + {}, + Ts...>; + +template + requires PredicateFor_bad // #PredicateFor_bad +constexpr const unsigned count_if_v_bad = + [] { return (Pred.template operator()() + ... 
+ 0); }(); + +constexpr const auto L = [] +{ return __is_same(T, long); }; + +constexpr const auto L2 = [] +{ return 114514; }; + +static_assert(count_if_v_bad == 1); + +static_assert(count_if_v_bad == 1); +// expected-error@-1 {{constraints not satisfied}} +// expected-note@#PredicateFor_bad {{evaluated to false}} +// expected-note@#pass_a_concept_inside_a_lambda {{evaluated to false}} +// expected-note@#requires_pass_a_concept_inside_a_lambda {{no matching member function}} + +} From 3be2eaafd277113a68667cfae52b6405b66869f2 Mon Sep 17 00:00:00 2001 From: Andrew Marshall Date: Sun, 10 May 2026 09:04:41 +0100 Subject: [PATCH 168/538] [cmake] use target names instead of legacy variables (#185463) Use the [name of the imported targets](https://cmake.org/cmake/help/latest/module/CheckSymbolExists.html) when testing the libraries during cmake configuration. This removes the need to also set `CMAKE_REQUIRED_INCLUDES` and `CMAKE_REQUIRED_DEFINITIONS` and reflects more modern CMake usage where targets are preferred over variables. This is already the case when checking libcurl in the same file. --- llvm/cmake/config-ix.cmake | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 0ae3e25b5b43f..415b09e298075 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -172,8 +172,7 @@ if(LLVM_ENABLE_ZLIB) # Check if zlib we found is usable; for example, we may have found a 32-bit # library on a 64-bit system which would result in a link-time failure. 
cmake_push_check_state() - list(APPEND CMAKE_REQUIRED_INCLUDES ${ZLIB_INCLUDE_DIRS}) - list(APPEND CMAKE_REQUIRED_LIBRARIES ${ZLIB_LIBRARY}) + list(APPEND CMAKE_REQUIRED_LIBRARIES ZLIB::ZLIB) check_symbol_exists(compress2 zlib.h HAVE_ZLIB) cmake_pop_check_state() if(LLVM_ENABLE_ZLIB STREQUAL FORCE_ON AND NOT HAVE_ZLIB) @@ -219,9 +218,7 @@ if(LLVM_ENABLE_LIBXML2) # Check if libxml2 we found is usable; for example, we may have found a 32-bit # library on a 64-bit system which would result in a link-time failure. cmake_push_check_state() - list(APPEND CMAKE_REQUIRED_INCLUDES ${LIBXML2_INCLUDE_DIRS}) - list(APPEND CMAKE_REQUIRED_LIBRARIES ${LIBXML2_LIBRARIES}) - list(APPEND CMAKE_REQUIRED_DEFINITIONS ${LIBXML2_DEFINITIONS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES LibXml2::LibXml2) check_symbol_exists(xmlReadMemory libxml/xmlreader.h HAVE_LIBXML2) cmake_pop_check_state() if(LLVM_ENABLE_LIBXML2 STREQUAL FORCE_ON AND NOT HAVE_LIBXML2) From 155861873606c3be95e02ecd9247b5ce29cb88ac Mon Sep 17 00:00:00 2001 From: Zeyi Xu Date: Sun, 10 May 2026 16:08:33 +0800 Subject: [PATCH 169/538] [clang-tidy] Remove hicpp module [1/4] (#194516) This is part one of removing the `hicpp-*` checks. 
RFC: https://discourse.llvm.org/t/rfc-regarding-the-current-status-of-hicpp-checks/89883 Part of https://github.com/llvm/llvm-project/issues/183462 --- .../clang-tidy/hicpp/HICPPTidyModule.cpp | 34 ----------------- clang-tools-extra/clangd/TidyFastChecks.inc | 8 ---- clang-tools-extra/docs/ReleaseNotes.rst | 38 ++++++++++++------- .../checks/bugprone/unused-return-value.rst | 5 --- .../checks/cppcoreguidelines/avoid-goto.rst | 4 +- .../checks/hicpp/avoid-c-arrays.rst | 9 ----- .../clang-tidy/checks/hicpp/avoid-goto.rst | 9 ----- .../checks/hicpp/braces-around-statements.rst | 11 ------ .../checks/hicpp/deprecated-headers.rst | 9 ----- .../checks/hicpp/exception-baseclass.rst | 8 ---- .../checks/hicpp/explicit-conversions.rst | 23 ----------- .../clang-tidy/checks/hicpp/function-size.rst | 12 ------ .../checks/hicpp/ignored-remove-result.rst | 8 ---- .../docs/clang-tidy/checks/list.rst | 8 ---- .../checks/modernize/avoid-c-arrays.rst | 2 - .../unused-return-value-remove.cpp} | 6 ++- 16 files changed, 30 insertions(+), 164 deletions(-) delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-c-arrays.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-goto.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/braces-around-statements.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/deprecated-headers.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/exception-baseclass.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/function-size.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/ignored-remove-result.rst rename clang-tools-extra/test/clang-tidy/checkers/{hicpp/ignored-remove-result.cpp => bugprone/unused-return-value-remove.cpp} (80%) diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp 
b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp index c87056f9141ca..e628c81db6955 100644 --- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp @@ -9,21 +9,15 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" #include "../bugprone/SignedBitwiseCheck.h" -#include "../bugprone/StdExceptionBaseclassCheck.h" #include "../bugprone/UndelegatedConstructorCheck.h" -#include "../bugprone/UnusedReturnValueCheck.h" #include "../bugprone/UseAfterMoveCheck.h" -#include "../cppcoreguidelines/AvoidGotoCheck.h" #include "../cppcoreguidelines/NoMallocCheck.h" #include "../cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h" #include "../cppcoreguidelines/ProTypeMemberInitCheck.h" #include "../cppcoreguidelines/ProTypeVarargCheck.h" #include "../cppcoreguidelines/SpecialMemberFunctionsCheck.h" -#include "../misc/ExplicitConstructorCheck.h" #include "../misc/NewDeleteOverloadsCheck.h" #include "../misc/StaticAssertCheck.h" -#include "../modernize/AvoidCArraysCheck.h" -#include "../modernize/DeprecatedHeadersCheck.h" #include "../modernize/UseAutoCheck.h" #include "../modernize/UseEmplaceCheck.h" #include "../modernize/UseEqualsDefaultCheck.h" @@ -34,8 +28,6 @@ #include "../performance/MoveConstArgCheck.h" #include "../performance/NoexceptMoveConstructorCheck.h" #include "../portability/NoAssemblerCheck.h" -#include "../readability/BracesAroundStatementsCheck.h" -#include "../readability/FunctionSizeCheck.h" #include "../readability/NamedParameterCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" #include "MultiwayPathsCoveredCheck.h" @@ -47,26 +39,10 @@ namespace { class HICPPModule : public ClangTidyModule { public: void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { - CheckFactories.registerCheck( - "hicpp-avoid-c-arrays"); - CheckFactories.registerCheck( - "hicpp-avoid-goto"); - CheckFactories.registerCheck( - "hicpp-braces-around-statements"); - 
CheckFactories.registerCheck( - "hicpp-deprecated-headers"); - CheckFactories.registerCheck( - "hicpp-exception-baseclass"); - CheckFactories.registerCheck( - "hicpp-ignored-remove-result"); CheckFactories.registerCheck( "hicpp-multiway-paths-covered"); CheckFactories.registerCheck( "hicpp-signed-bitwise"); - CheckFactories.registerCheck( - "hicpp-explicit-conversions"); - CheckFactories.registerCheck( - "hicpp-function-size"); CheckFactories.registerCheck( "hicpp-named-parameter"); CheckFactories.registerCheck( @@ -111,16 +87,6 @@ class HICPPModule : public ClangTidyModule { CheckFactories.registerCheck( "hicpp-vararg"); } - - ClangTidyOptions getModuleOptions() override { - ClangTidyOptions Options; - ClangTidyOptions::OptionMap &Opts = Options.CheckOptions; - Opts["hicpp-ignored-remove-result.CheckedFunctions"] = - "^::std::remove$;^::std::remove_if$;^::std::unique$"; - Opts["hicpp-ignored-remove-result.CheckedReturnTypes"] = ""; - Opts["hicpp-ignored-remove-result.AllowCastToVoid"] = "true"; - return Options; - } }; } // namespace diff --git a/clang-tools-extra/clangd/TidyFastChecks.inc b/clang-tools-extra/clangd/TidyFastChecks.inc index 15fc27cf81c01..c1a72bae035b7 100644 --- a/clang-tools-extra/clangd/TidyFastChecks.inc +++ b/clang-tools-extra/clangd/TidyFastChecks.inc @@ -272,14 +272,6 @@ FAST(google-runtime-float, 1.0) FAST(google-runtime-int, 2.0) FAST(google-runtime-operator, 1.0) FAST(google-upgrade-googletest-case, 1.0) -FAST(hicpp-avoid-c-arrays, 2.0) -FAST(hicpp-avoid-goto, -0.0) -FAST(hicpp-braces-around-statements, 1.0) -FAST(hicpp-deprecated-headers, -0.0) -FAST(hicpp-exception-baseclass, 1.0) -FAST(hicpp-explicit-conversions, 2.0) -FAST(hicpp-function-size, 1.0) -FAST(hicpp-ignored-remove-result, 2.0) FAST(hicpp-invalid-access-moved, 9.0) FAST(hicpp-member-init, 2.0) FAST(hicpp-move-const-arg, 2.0) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 9821c7e74ebf7..be5315ac3c181 100644 --- 
a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -55,6 +55,30 @@ Potentially Breaking Changes `. The original check will be removed in the 25th release. +- Removed the :program:`clang-tidy` ``hicpp`` module. All checks have been moved + to the other modules. Use the replacement checks instead: + + ================================== ========================================================= + Removed check Replacement check + ================================== ========================================================= + ``hicpp-avoid-c-arrays`` :doc:`modernize-avoid-c-arrays + ` + ``hicpp-avoid-goto`` :doc:`cppcoreguidelines-avoid-goto + ` + ``hicpp-braces-around-statements`` :doc:`readability-braces-around-statements + ` + ``hicpp-deprecated-headers`` :doc:`modernize-deprecated-headers + ` + ``hicpp-exception-baseclass`` :doc:`bugprone-std-exception-baseclass + ` + ``hicpp-explicit-conversions`` :doc:`misc-explicit-constructor + ` + ``hicpp-function-size`` :doc:`readability-function-size + ` + ``hicpp-ignored-remove-result`` :doc:`bugprone-unused-return-value + ` + ================================== ========================================================= + Improvements to clangd ---------------------- @@ -207,7 +231,7 @@ New checks New check aliases ^^^^^^^^^^^^^^^^^ -- New alias :doc:`cert-exp45-c ` +- Renamed :doc:`cert-exp45-c ` to :doc:`bugprone-assignment-in-selection-statement `. @@ -223,18 +247,6 @@ New check aliases `. The `google-explicit-constructor` name is kept as an alias. -- Renamed :doc:`hicpp-exception-baseclass - ` - to :doc:`bugprone-std-exception-baseclass - `. - The `hicpp-exception-baseclass` name is kept as an alias. - -- Renamed :doc:`hicpp-ignored-remove-result - ` - to :doc:`bugprone-unused-return-value - `. - The `hicpp-ignored-remove-result` name is kept as an alias. - - Renamed :doc:`hicpp-no-assembler ` to :doc:`portability-no-assembler `. 
The `hicpp-no-assembler` diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst index 3e7c51a9b1ac4..725403a6eb818 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst @@ -67,8 +67,3 @@ Options :doc:`cert-err33-c <../cert/err33-c>` is an alias of this check that checks a fixed and large set of standard library functions. - -:doc:`hicpp-ignored-remove-result <../hicpp/ignored-remove-result>` is an -alias of this check that checks a restricted set of functions: -``std::remove``, ``std::remove_if``, and ``std::unique``. -The `AllowCastToVoid` option is set to `true` by default. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst index 87e14bbe8a850..f8a57df1cb61d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst @@ -8,9 +8,7 @@ with looping constructs. Only forward jumps in nested loops are accepted. This check implements `ES.76 `_ -from the C++ Core Guidelines and -`6.3.1 `_ -from High Integrity C++ Coding Standard. +from the C++ Core Guidelines. For more information on why to avoid programming with ``goto`` you can read the famous paper `A Case against the GO TO Statement. `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-c-arrays.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-c-arrays.rst deleted file mode 100644 index ad7198831d0d0..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-c-arrays.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. 
title:: clang-tidy - hicpp-avoid-c-arrays - -hicpp-avoid-c-arrays -==================== - -The hicpp-avoid-c-arrays check is an alias, please see -:doc:`modernize-avoid-c-arrays <../modernize/avoid-c-arrays>` -for more information. -It partly enforces the `rule 4.1.1 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-goto.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-goto.rst deleted file mode 100644 index eb5d303477cab..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/avoid-goto.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-avoid-goto - -hicpp-avoid-goto -================ - -The `hicpp-avoid-goto` check is an alias, please see -:doc:`cppcoreguidelines-avoid-goto <../cppcoreguidelines/avoid-goto>` -for more information. -It enforces the `rule 6.3.1 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/braces-around-statements.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/braces-around-statements.rst deleted file mode 100644 index 9584126c2bb88..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/braces-around-statements.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. title:: clang-tidy - hicpp-braces-around-statements - -hicpp-braces-around-statements -============================== - -The `hicpp-braces-around-statements` check is an alias, please see -:doc:`readability-braces-around-statements -<../readability/braces-around-statements>` -for more information. -It enforces the `rule 6.1.1 -`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/deprecated-headers.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/deprecated-headers.rst deleted file mode 100644 index a90ec85e6d0e1..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/deprecated-headers.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. 
title:: clang-tidy - hicpp-deprecated-headers - -hicpp-deprecated-headers -======================== - -The `hicpp-deprecated-headers` check is an alias, please see -:doc:`modernize-deprecated-headers <../modernize/deprecated-headers>` -for more information. -It enforces the `rule 1.3.3 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/exception-baseclass.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/exception-baseclass.rst deleted file mode 100644 index 5f54565f5f9c6..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/exception-baseclass.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-exception-baseclass - -hicpp-exception-baseclass -========================= - -The `hicpp-exception-baseclass` check is an alias, please see -`bugprone-std-exception-baseclass <../bugprone/std-exception-baseclass.html>`_ -for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst deleted file mode 100644 index 5267b0c6b12e3..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/explicit-conversions.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. title:: clang-tidy - hicpp-explicit-conversions - -hicpp-explicit-conversions -========================== - -This check is an alias for -:doc:`misc-explicit-constructor <../misc/explicit-constructor>`. - -Used to enforce parts of `rule 5.4.1 -`_. -This check will enforce that constructors and conversion operators are -marked ``explicit``. Other forms of casting checks are implemented in -other places. 
The following checks can be used to check for more forms -of casting: - -- :doc:`cppcoreguidelines-pro-type-static-cast-downcast - <../cppcoreguidelines/pro-type-static-cast-downcast>` -- :doc:`cppcoreguidelines-pro-type-reinterpret-cast - <../cppcoreguidelines/pro-type-reinterpret-cast>` -- :doc:`cppcoreguidelines-pro-type-const-cast - <../cppcoreguidelines/pro-type-const-cast>` -- :doc:`cppcoreguidelines-pro-type-cstyle-cast - <../cppcoreguidelines/pro-type-cstyle-cast>` diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/function-size.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/function-size.rst deleted file mode 100644 index d54310f0e3624..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/function-size.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. title:: clang-tidy - hicpp-function-size - -hicpp-function-size -=================== - -This check is an alias for -:doc:`readability-function-size <../readability/function-size>`. -Useful to enforce multiple sections on function complexity. - -- `rule 8.2.2 `_ -- `rule 8.3.1 `_ -- `rule 8.3.2 `_ diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/ignored-remove-result.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/ignored-remove-result.rst deleted file mode 100644 index bb3aee9ebaaae..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/ignored-remove-result.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-ignored-remove-result - -hicpp-ignored-remove-result -=========================== - -The hicpp-ignored-remove-result check is an alias, please see -:doc:`bugprone-unused-return-value <../bugprone/unused-return-value>` -for more information. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 3e3cd92374ee9..6d91b0297ee5c 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -603,14 +603,6 @@ Check aliases :doc:`google-readability-casting `, :doc:`modernize-avoid-c-style-cast `, "Yes" :doc:`google-readability-function-size `, :doc:`readability-function-size `, :doc:`google-readability-namespace-comments `, :doc:`llvm-namespace-comment `, - :doc:`hicpp-avoid-c-arrays `, :doc:`modernize-avoid-c-arrays `, - :doc:`hicpp-avoid-goto `, :doc:`cppcoreguidelines-avoid-goto `, - :doc:`hicpp-braces-around-statements `, :doc:`readability-braces-around-statements `, "Yes" - :doc:`hicpp-deprecated-headers `, :doc:`modernize-deprecated-headers `, "Yes" - :doc:`hicpp-exception-baseclass `, :doc:`bugprone-std-exception-baseclass `, - :doc:`hicpp-explicit-conversions `, :doc:`misc-explicit-constructor `, "Yes" - :doc:`hicpp-function-size `, :doc:`readability-function-size `, - :doc:`hicpp-ignored-remove-result `, :doc:`bugprone-unused-return-value `, :doc:`hicpp-invalid-access-moved `, :doc:`bugprone-use-after-move `, :doc:`hicpp-member-init `, :doc:`cppcoreguidelines-pro-type-member-init `, "Yes" :doc:`hicpp-move-const-arg `, :doc:`performance-move-const-arg `, "Yes" diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst index b7a87bf23967b..fec282740dd53 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst @@ -5,8 +5,6 @@ modernize-avoid-c-arrays `cppcoreguidelines-avoid-c-arrays` redirects here as an alias for this check. -`hicpp-avoid-c-arrays` redirects here as an alias for this check. - Finds C-style array types and recommend to use ``std::array<>`` / ``std::vector<>``. 
All types of C arrays are diagnosed. diff --git a/clang-tools-extra/test/clang-tidy/checkers/hicpp/ignored-remove-result.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-remove.cpp similarity index 80% rename from clang-tools-extra/test/clang-tidy/checkers/hicpp/ignored-remove-result.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-remove.cpp index fc431024303ab..2934db6dd2d2c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/hicpp/ignored-remove-result.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-remove.cpp @@ -1,5 +1,7 @@ -// RUN: %check_clang_tidy %s hicpp-ignored-remove-result %t -// RUN: %check_clang_tidy -check-suffixes=NOCAST %s hicpp-ignored-remove-result %t -- -config='{CheckOptions: {hicpp-ignored-remove-result.AllowCastToVoid: false}}' +// RUN: %check_clang_tidy %s bugprone-unused-return-value %t -- \ +// RUN: -config='{CheckOptions: {bugprone-unused-return-value.CheckedFunctions: "^::std::remove$;^::std::remove_if$;^::std::unique$", bugprone-unused-return-value.CheckedReturnTypes: "", bugprone-unused-return-value.AllowCastToVoid: true}}' +// RUN: %check_clang_tidy -check-suffixes=NOCAST %s bugprone-unused-return-value %t -- \ +// RUN: -config='{CheckOptions: {bugprone-unused-return-value.CheckedFunctions: "^::std::remove$;^::std::remove_if$;^::std::unique$", bugprone-unused-return-value.CheckedReturnTypes: "", bugprone-unused-return-value.AllowCastToVoid: false}}' namespace std { From 9076ffffdca4ca9573936ce6f7565ff628a12dc1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 10 May 2026 10:01:17 +0100 Subject: [PATCH 170/538] [DAG][GISel] Rename CTTZ_ZERO_UNDEF/CTLZ_ZERO_UNDEF/CTTZ_ELTS_ZERO_UNDEF -> CTTZ_ZERO_POISON/CTLZ_ZERO_POISON/CTTZ_ELTS_ZERO_POISON (#196732) DAG/GISel are ambiguous about whether zero-input results in UNDEF/POISON, unlike the rest of LLVM which makes it clear its POISON. 
I've tried to clean this up once and for all by ensuring SelectionDAG::canCreateUndefOrPoison does a includesPoison(Kind) check, renaming the opcodes (including the VP variants) and updating as many comments/tests as possible (I may still have missed some...). --- llvm/docs/GlobalISel/GenericOpcode.rst | 14 +- llvm/docs/ReleaseNotes.md | 14 + .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 2 +- .../CodeGen/GlobalISel/MachineIRBuilder.h | 14 +- llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 2 +- llvm/include/llvm/CodeGen/ISDOpcodes.h | 6 +- llvm/include/llvm/CodeGen/TargetLowering.h | 10 +- llvm/include/llvm/IR/VPIntrinsics.def | 14 +- llvm/include/llvm/Support/TargetOpcodes.def | 8 +- llvm/include/llvm/Target/GenericOpcodes.td | 4 +- .../include/llvm/Target/GlobalISel/Combine.td | 6 +- .../Target/GlobalISel/SelectionDAGCompat.td | 4 +- .../include/llvm/Target/TargetSelectionDAG.td | 4 +- llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp | 4 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +- .../CodeGen/GlobalISel/GISelValueTracking.cpp | 4 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 10 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 68 ++-- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 4 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 62 ++-- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 20 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 30 +- .../SelectionDAG/LegalizeVectorOps.cpp | 12 +- .../SelectionDAG/LegalizeVectorTypes.cpp | 26 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 19 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 16 +- .../SelectionDAG/SelectionDAGDumper.cpp | 4 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 26 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 40 +-- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 4 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 16 +- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 4 +- 
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 10 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 4 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 22 +- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 22 +- llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 4 +- llvm/lib/Target/BPF/BPFISelLowering.cpp | 4 +- llvm/lib/Target/Mips/MipsLegalizerInfo.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +- .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 8 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 62 ++-- llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 +- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 4 +- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 2 +- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 24 +- llvm/lib/Target/Sparc/SparcInstrVIS.td | 4 +- .../Target/SystemZ/SystemZISelLowering.cpp | 4 +- llvm/lib/Target/VE/VEISelLowering.cpp | 2 +- llvm/lib/Target/VE/VEInstrInfo.td | 2 +- .../GISel/WebAssemblyLegalizerInfo.cpp | 2 +- .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 56 +-- llvm/lib/Target/X86/X86InstrCompiler.td | 14 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 21 +- llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 4 +- .../AArch64/GlobalISel/arm64-irtranslator.ll | 6 +- .../combine-constant-fold-unary-int.mir | 12 +- .../AArch64/GlobalISel/legalize-ctlz.mir | 14 +- ...ndef.mir => legalize-cttz-zero-poison.mir} | 8 +- .../GlobalISel/legalizer-info-validation.mir | 4 +- ...f.mir => inst-select-ctlz-zero-poison.mir} | 24 +- ...f.mir => inst-select-cttz-zero-poison.mir} | 24 +- ...ndef.mir => legalize-ctlz-zero-poison.mir} | 92 ++--- ...ndef.mir => legalize-cttz-zero-poison.mir} | 108 +++--- .../AMDGPU/GlobalISel/legalize-cttz.mir | 20 +- ...mir => regbankselect-ctlz-zero-poison.mir} | 36 +- ...mir => regbankselect-cttz-zero-poison.mir} | 36 +- llvm/test/CodeGen/AMDGPU/ctls.ll | 16 +- ...ctlz_zero_undef.ll => 
ctlz_zero_poison.ll} | 340 +++++++++--------- ...cttz_zero_undef.ll => cttz_zero_poison.ll} | 120 +++---- .../ARM/GlobalISel/arm-legalize-bitcounts.mir | 18 +- llvm/test/CodeGen/ARM/cttz.ll | 40 +-- llvm/test/CodeGen/ARM/cttz_vector.ll | 56 +-- llvm/test/CodeGen/Lanai/i32.ll | 8 +- .../Mips/GlobalISel/legalizer/cttz.mir | 4 +- .../GlobalISel/legalizer-info-validation.mir | 4 +- .../legalizer/legalize-ctlz-rv32.mir | 32 +- .../legalizer/legalize-ctlz-rv64.mir | 32 +- .../legalizer/legalize-cttz-rv32.mir | 32 +- .../legalizer/legalize-cttz-rv64.mir | 32 +- llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll | 6 +- llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 128 +++---- llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 8 +- llvm/test/CodeGen/RISCV/rv64zbb.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll | 216 +++++------ llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 306 ++++++++-------- llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll | 216 +++++------ llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll | 294 +++++++-------- .../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 144 ++++---- .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll | 84 ++--- .../RISCV/rvv/fixed-vectors-cttz-vp.ll | 144 ++++---- .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll | 84 ++--- .../CodeGen/RISCV/rvv/known-never-zero.ll | 2 +- .../test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll | 64 ++-- llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll | 8 +- llvm/test/CodeGen/SystemZ/vec-cttz-01.ll | 8 +- .../GlobalISel/instructions/ctlz.mir | 18 +- .../GlobalISel/instructions/cttz.mir | 18 +- llvm/test/CodeGen/WebAssembly/i128.ll | 12 +- llvm/test/CodeGen/WebAssembly/i32.ll | 12 +- llvm/test/CodeGen/WebAssembly/i64.ll | 12 +- .../X86/GlobalISel/legalize-leading-zeros.mir | 10 +- ...mir => legalize-trailing-zeros-poison.mir} | 44 +-- .../GlobalISel/legalize-trailing-zeros.mir | 20 +- llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 270 +++++++------- llvm/test/CodeGen/X86/freeze-unary.ll | 36 +- llvm/test/CodeGen/X86/known-never-zero.ll | 2 +- 
llvm/test/CodeGen/X86/widen_bitcnt.ll | 58 +-- llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll | 8 +- .../InstCombine/X86/x86-sse4a-inseltpoison.ll | 2 +- llvm/test/Transforms/InstCombine/add2.ll | 4 +- llvm/test/Transforms/InstCombine/cttz.ll | 24 +- .../InstCombine/select-cmp-cttz-ctlz.ll | 2 +- .../Transforms/InstCombine/umin_cttz_ctlz.ll | 108 +++--- .../test/Transforms/SLPVectorizer/X86/ctlz.ll | 62 ++-- .../test/Transforms/SLPVectorizer/X86/cttz.ll | 62 ++-- .../GlobalISel/LegalizerHelperTest.cpp | 64 ++-- .../GlobalISel/MachineIRBuilderTest.cpp | 8 +- 123 files changed, 2242 insertions(+), 2226 deletions(-) rename llvm/test/CodeGen/AArch64/GlobalISel/{legalize-cttz-zero-undef.mir => legalize-cttz-zero-poison.mir} (95%) rename llvm/test/CodeGen/AMDGPU/GlobalISel/{inst-select-ctlz-zero-undef.mir => inst-select-ctlz-zero-poison.mir} (80%) rename llvm/test/CodeGen/AMDGPU/GlobalISel/{inst-select-cttz-zero-undef.mir => inst-select-cttz-zero-poison.mir} (79%) rename llvm/test/CodeGen/AMDGPU/GlobalISel/{legalize-ctlz-zero-undef.mir => legalize-ctlz-zero-poison.mir} (67%) rename llvm/test/CodeGen/AMDGPU/GlobalISel/{legalize-cttz-zero-undef.mir => legalize-cttz-zero-poison.mir} (54%) rename llvm/test/CodeGen/AMDGPU/GlobalISel/{regbankselect-ctlz-zero-undef.mir => regbankselect-ctlz-zero-poison.mir} (66%) rename llvm/test/CodeGen/AMDGPU/GlobalISel/{regbankselect-cttz-zero-undef.mir => regbankselect-cttz-zero-poison.mir} (66%) rename llvm/test/CodeGen/AMDGPU/{ctlz_zero_undef.ll => ctlz_zero_poison.ll} (90%) rename llvm/test/CodeGen/AMDGPU/{cttz_zero_undef.ll => cttz_zero_poison.ll} (94%) rename llvm/test/CodeGen/X86/GlobalISel/{legalize-trailing-zeros-undef.mir => legalize-trailing-zeros-poison.mir} (78%) diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index f9872c8926d6d..d212b39720c5a 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -486,20 +486,20 @@ Count leading zeros, 
trailing zeros, or number of set bits. .. code-block:: none - %2:_(s33) = G_CTLZ_ZERO_UNDEF %1 - %2:_(s33) = G_CTTZ_ZERO_UNDEF %1 + %2:_(s33) = G_CTLZ_ZERO_POISON %1 + %2:_(s33) = G_CTTZ_ZERO_POISON %1 %2:_(s33) = G_CTPOP %1 -G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +G_CTLZ_ZERO_POISON, G_CTTZ_ZERO_POISON +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Count leading zeros or trailing zeros. If the value is zero then the result is -undefined. +poison. .. code-block:: none - %2:_(s33) = G_CTLZ_ZERO_UNDEF %1 - %2:_(s33) = G_CTTZ_ZERO_UNDEF %1 + %2:_(s33) = G_CTLZ_ZERO_POISON %1 + %2:_(s33) = G_CTTZ_ZERO_POISON %1 G_CTLS ^^^^^^ diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 98f2205bc06a7..7363c2fd8c52b 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -231,6 +231,20 @@ Makes programs 10x faster by doing Special New Thing. ### Changes to the CodeGen infrastructure +* Renamed ISD::CTLZ_ZERO_UNDEF to ISD::CTLZ_ZERO_POISON opcode to make it clear that + a zero input results in poison. + +* Renamed ISD::CTTZ_ZERO_UNDEF to ISD::CTTZ_ZERO_POISON opcode to make it clear that + a zero input results in poison. + +### Changes to the GlobalISel infrastructure + +* Renamed G_CTLZ_ZERO_UNDEF to G_CTLZ_ZERO_POISON opcode to make it clear that + a zero input results in poison. + +* Renamed G_CTTZ_ZERO_UNDEF to G_CTTZ_ZERO_POISON opcode to make it clear that + a zero input results in poison. 
+ ### Changes to the Metadata Info ### Changes to the Debug Info diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 97f29015c6911..3687adbe60f1a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -410,7 +410,7 @@ class CombinerHelper { const ConstantFP *Cst) const; /// Constant fold a unary integer op (G_CTLZ, G_CTTZ, G_CTPOP and their - /// _ZERO_UNDEF variants, G_ABS, G_BSWAP, G_BITREVERSE) when the operand is + /// _ZERO_POISON variants, G_ABS, G_BSWAP, G_BITREVERSE) when the operand is /// a scalar constant or a G_BUILD_VECTOR of constants. bool matchConstantFoldUnaryIntOp(MachineInstr &MI, BuildFnTy &MatchInfo) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index c7973a8a9fb00..511bc17161e03 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -2061,9 +2061,10 @@ class LLVM_ABI MachineIRBuilder { return buildInstr(TargetOpcode::G_CTLZ, {Dst}, {Src0}); } - /// Build and insert \p Res = G_CTLZ_ZERO_UNDEF \p Op0, \p Src0 - MachineInstrBuilder buildCTLZ_ZERO_UNDEF(const DstOp &Dst, const SrcOp &Src0) { - return buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, {Dst}, {Src0}); + /// Build and insert \p Res = G_CTLZ_ZERO_POISON \p Op0, \p Src0 + MachineInstrBuilder buildCTLZ_ZERO_POISON(const DstOp &Dst, + const SrcOp &Src0) { + return buildInstr(TargetOpcode::G_CTLZ_ZERO_POISON, {Dst}, {Src0}); } /// Build and insert \p Res = G_CTTZ \p Op0, \p Src0 @@ -2071,9 +2072,10 @@ class LLVM_ABI MachineIRBuilder { return buildInstr(TargetOpcode::G_CTTZ, {Dst}, {Src0}); } - /// Build and insert \p Res = G_CTTZ_ZERO_UNDEF \p Op0, \p Src0 - MachineInstrBuilder buildCTTZ_ZERO_UNDEF(const DstOp &Dst, const SrcOp &Src0) { - return 
buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, {Dst}, {Src0}); + /// Build and insert \p Res = G_CTTZ_ZERO_POISON \p Op0, \p Src0 + MachineInstrBuilder buildCTTZ_ZERO_POISON(const DstOp &Dst, + const SrcOp &Src0) { + return buildInstr(TargetOpcode::G_CTTZ_ZERO_POISON, {Dst}, {Src0}); } /// Build and insert \p Res = G_CTLS \p Op0, \p Src0 diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index a861a60ecfcd3..a78bbe80b2f61 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -320,7 +320,7 @@ ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy, Register Src, const MachineRegisterInfo &MRI); /// Tries to constant fold a unary integer operation (G_CTLZ, G_CTTZ, G_CTPOP -/// and their _ZERO_UNDEF variants, G_ABS, G_BSWAP, G_BITREVERSE) on \p Src. +/// and their _ZERO_POISON variants, G_ABS, G_BSWAP, G_BITREVERSE) on \p Src. /// If \p Src is a vector then it tries to do an element-wise constant fold. LLVM_ABI SmallVector ConstantFoldUnaryIntOp(unsigned Opcode, LLT DstTy, Register Src, diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 8a8a9ee71ca02..f78e9fb77afeb 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -783,9 +783,9 @@ enum NodeType { BITREVERSE, PARITY, - /// Bit counting operators with an undefined result for zero inputs. - CTTZ_ZERO_UNDEF, - CTLZ_ZERO_UNDEF, + /// Bit counting operators with a poisoned result for zero inputs. + CTTZ_ZERO_POISON, + CTLZ_ZERO_POISON, /// Count leading redundant sign bits. Equivalent to /// (sub (ctlz (x < 0 ? ~x : x)), 1). 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index f1119e4acce58..318763113fb42 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5663,13 +5663,13 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTPOP(SDNode *N, SelectionDAG &DAG) const; - /// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes, + /// Expand CTLZ/CTLZ_ZERO_POISON nodes. Expands vector/scalar CTLZ nodes, /// vector nodes can only succeed if all operations are legal/custom. /// \param N Node to expand /// \returns The expansion result or SDValue() if it fails. SDValue expandCTLZ(SDNode *N, SelectionDAG &DAG) const; - /// Expand VP_CTLZ/VP_CTLZ_ZERO_UNDEF nodes. + /// Expand VP_CTLZ/VP_CTLZ_ZERO_POISON nodes. /// \param N Node to expand /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTLZ(SDNode *N, SelectionDAG &DAG) const; @@ -5686,18 +5686,18 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { SDValue CTTZTableLookup(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, SDValue Op, unsigned NumBitsPerElt) const; - /// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes, + /// Expand CTTZ/CTTZ_ZERO_POISON nodes. Expands vector/scalar CTTZ nodes, /// vector nodes can only succeed if all operations are legal/custom. /// \param N Node to expand /// \returns The expansion result or SDValue() if it fails. SDValue expandCTTZ(SDNode *N, SelectionDAG &DAG) const; - /// Expand VP_CTTZ/VP_CTTZ_ZERO_UNDEF nodes. + /// Expand VP_CTTZ/VP_CTTZ_ZERO_POISON nodes. /// \param N Node to expand /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTTZ(SDNode *N, SelectionDAG &DAG) const; - /// Expand VP_CTTZ_ELTS/VP_CTTZ_ELTS_ZERO_UNDEF nodes. + /// Expand VP_CTTZ_ELTS/VP_CTTZ_ELTS_ZERO_POISON nodes. 
/// \param N Node to expand /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTTZElements(SDNode *N, SelectionDAG &DAG) const; diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 0b0c744487b92..7015bd5a6f0ce 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -240,9 +240,9 @@ BEGIN_REGISTER_VP_SDNODE(VP_CTLZ, -1, vp_ctlz, 1, 2) VP_PROPERTY_FUNCTIONAL_INTRINSIC(ctlz) VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ) END_REGISTER_VP_SDNODE(VP_CTLZ) -BEGIN_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF, -1, vp_ctlz_zero_undef, 1, 2) -VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ_ZERO_UNDEF) -END_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF) +BEGIN_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_POISON, -1, vp_ctlz_zero_poison, 1, 2) +VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ_ZERO_POISON) +END_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_POISON) END_REGISTER_VP_INTRINSIC(vp_ctlz) // llvm.vp.cttz(x,is_zero_poison,mask,vlen) @@ -251,8 +251,8 @@ BEGIN_REGISTER_VP_SDNODE(VP_CTTZ, -1, vp_cttz, 1, 2) VP_PROPERTY_FUNCTIONAL_INTRINSIC(cttz) VP_PROPERTY_FUNCTIONAL_SDOPC(CTTZ) END_REGISTER_VP_SDNODE(VP_CTTZ) -BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_UNDEF, -1, vp_cttz_zero_undef, 1, 2) -END_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_UNDEF) +BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_POISON, -1, vp_cttz_zero_poison, 1, 2) +END_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_POISON) END_REGISTER_VP_INTRINSIC(vp_cttz) // llvm.vp.cttz.elts(x,is_zero_poison,mask,vl) @@ -260,8 +260,8 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_cttz_elts, 2, 3) VP_PROPERTY_NO_FUNCTIONAL BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ELTS, 0, vp_cttz_elts, 1, 2) END_REGISTER_VP_SDNODE(VP_CTTZ_ELTS) -BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ELTS_ZERO_UNDEF, 0, vp_cttz_elts_zero_undef, 1, 2) -END_REGISTER_VP_SDNODE(VP_CTTZ_ELTS_ZERO_UNDEF) +BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ELTS_ZERO_POISON, 0, vp_cttz_elts_zero_poison, 1, 2) +END_REGISTER_VP_SDNODE(VP_CTTZ_ELTS_ZERO_POISON) END_REGISTER_VP_INTRINSIC(vp_cttz_elts) // 
llvm.vp.fshl(x,y,z,mask,vlen) diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index a8db3c1d75aa0..68a80dde6b3fe 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -838,14 +838,14 @@ HANDLE_TARGET_OPCODE(G_VECTOR_COMPRESS) /// Generic count trailing zeroes. HANDLE_TARGET_OPCODE(G_CTTZ) -/// Same as above, undefined for zero inputs. -HANDLE_TARGET_OPCODE(G_CTTZ_ZERO_UNDEF) +/// Same as above, poisoned for zero inputs. +HANDLE_TARGET_OPCODE(G_CTTZ_ZERO_POISON) /// Generic count leading zeroes. HANDLE_TARGET_OPCODE(G_CTLZ) -/// Same as above, undefined for zero inputs. -HANDLE_TARGET_OPCODE(G_CTLZ_ZERO_UNDEF) +/// Same as above, poisoned for zero inputs. +HANDLE_TARGET_OPCODE(G_CTLZ_ZERO_POISON) /// Generic count extra sign bits. HANDLE_TARGET_OPCODE(G_CTLS) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 3612e3c458f90..37a185a226b63 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -202,7 +202,7 @@ def G_CTLZ : GenericInstruction { let hasSideEffects = false; } -def G_CTLZ_ZERO_UNDEF : GenericInstruction { +def G_CTLZ_ZERO_POISON : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$src); let hasSideEffects = false; @@ -214,7 +214,7 @@ def G_CTTZ : GenericInstruction { let hasSideEffects = false; } -def G_CTTZ_ZERO_UNDEF : GenericInstruction { +def G_CTTZ_ZERO_POISON : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$src); let hasSideEffects = false; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 2091554304771..5d9025b5cc9c8 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1404,7 +1404,7 @@ def constant_fold_cast_op : 
GICombineRule< def unary_int_op_frags : GICombinePatFrag< (outs root:$dst), (ins), !foreach(op, - [G_CTLZ, G_CTLZ_ZERO_UNDEF, G_CTTZ, G_CTTZ_ZERO_UNDEF, G_CTPOP, + [G_CTLZ, G_CTLZ_ZERO_POISON, G_CTTZ, G_CTTZ_ZERO_POISON, G_CTPOP, G_ABS, G_BSWAP, G_BITREVERSE], (pattern (op $dst, $src)))>; @@ -2230,11 +2230,11 @@ class ctlz_to_ctls_op : GICombineRule < (apply [{Helper.applyBuildFn(*${root}, ${matchinfo});}])>; def ctlz_to_ctls : ctlz_to_ctls_op; -def ctlz_zero_undef_to_ctls : ctlz_to_ctls_op; +def ctlz_zero_poison_to_ctls : ctlz_to_ctls_op; def ctls_combines : GICombineGroup<[ ctlz_to_ctls, - ctlz_zero_undef_to_ctls, + ctlz_zero_poison_to_ctls, ]>; def narrow_binop_add : narrow_binop_opcode; diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index abfd41864c8bf..6933176b0fb53 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -157,8 +157,8 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 573342846b4cf..ec8fbd84d5166 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -535,8 +535,8 @@ def ctlz : SDNode<"ISD::CTLZ" , SDTIntBitCountUnaryOp>; def cttz : SDNode<"ISD::CTTZ" , SDTIntBitCountUnaryOp>; def ctpop : SDNode<"ISD::CTPOP" , SDTIntBitCountUnaryOp>; def ctls : SDNode<"ISD::CTLS" , SDTIntBitCountUnaryOp>; -def ctlz_zero_undef : SDNode<"ISD::CTLZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>; -def cttz_zero_undef : SDNode<"ISD::CTTZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>; +def ctlz_zero_poison : SDNode<"ISD::CTLZ_ZERO_POISON", SDTIntBitCountUnaryOp>; +def cttz_zero_poison : 
SDNode<"ISD::CTTZ_ZERO_POISON", SDTIntBitCountUnaryOp>; def sext : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>; def zext : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>; def anyext : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>; diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index ff96d7a3cbd9a..8e5bf2120dd40 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -282,9 +282,9 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc, break; } case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTTZ_ZERO_POISON: case TargetOpcode::G_CTPOP: case TargetOpcode::G_ABS: case TargetOpcode::G_BSWAP: diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index c58fb84a3890e..0f2617b9b5b1c 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -8642,7 +8642,7 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI, bool CombinerHelper::matchCtls(MachineInstr &CtlzMI, BuildFnTy &MatchInfo) const { assert((CtlzMI.getOpcode() == TargetOpcode::G_CTLZ || - CtlzMI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) && + CtlzMI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_POISON) && "Expected G_CTLZ variant"); const Register Dst = CtlzMI.getOperand(0).getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 5fcc84d0d76a1..44298258da11c 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -793,7 +793,7 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, break; } case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: { + case 
TargetOpcode::G_CTTZ_ZERO_POISON: { KnownBits SrcOpKnown; computeKnownBitsImpl(MI.getOperand(1).getReg(), SrcOpKnown, DemandedElts, Depth + 1); @@ -804,7 +804,7 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, break; } case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: { + case TargetOpcode::G_CTLZ_ZERO_POISON: { KnownBits SrcOpKnown; computeKnownBitsImpl(MI.getOperand(1).getReg(), SrcOpKnown, DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 7db3e2d4e02b7..8c8e08865744a 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2482,11 +2482,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::ctlz: { ConstantInt *Cst = cast(CI.getArgOperand(1)); bool isTrailing = ID == Intrinsic::cttz; - unsigned Opcode = isTrailing - ? Cst->isZero() ? TargetOpcode::G_CTTZ - : TargetOpcode::G_CTTZ_ZERO_UNDEF - : Cst->isZero() ? TargetOpcode::G_CTLZ - : TargetOpcode::G_CTLZ_ZERO_UNDEF; + unsigned Opcode = isTrailing ? Cst->isZero() + ? TargetOpcode::G_CTTZ + : TargetOpcode::G_CTTZ_ZERO_POISON + : Cst->isZero() ? 
TargetOpcode::G_CTLZ + : TargetOpcode::G_CTLZ_ZERO_POISON; MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(CI)}, {getOrCreateVReg(*CI.getArgOperand(0))}); return true; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 71cda8a480dd7..909decfb015b5 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -427,7 +427,7 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE_INT(SREM_I); case TargetOpcode::G_UREM: RTLIBCASE_INT(UREM_I); - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: RTLIBCASE_INT(CTLZ_I); case TargetOpcode::G_FADD: RTLIBCASE(ADD_F); @@ -1318,7 +1318,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { case TargetOpcode::G_UDIV: case TargetOpcode::G_SREM: case TargetOpcode::G_UREM: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: { + case TargetOpcode::G_CTLZ_ZERO_POISON: { LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = LLTy.getSizeInBits(); Type *HLTy = IntegerType::get(Ctx, Size); @@ -1781,18 +1781,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, case TargetOpcode::G_ASHR: return narrowScalarShift(MI, TypeIdx, NarrowTy); case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTTZ_ZERO_POISON: case TargetOpcode::G_CTLS: case TargetOpcode::G_CTPOP: if (TypeIdx == 1) switch (MI.getOpcode()) { case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: return narrowScalarCTLZ(MI, TypeIdx, NarrowTy); case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTTZ_ZERO_POISON: return narrowScalarCTTZ(MI, TypeIdx, NarrowTy); case TargetOpcode::G_CTPOP: return 
narrowScalarCTPOP(MI, TypeIdx, NarrowTy); @@ -2822,9 +2822,9 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_USHLSAT: return widenScalarAddSubShlSat(MI, TypeIdx, WideTy); case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTTZ_ZERO_POISON: case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: case TargetOpcode::G_CTLS: case TargetOpcode::G_CTPOP: { if (TypeIdx == 0) { @@ -2840,8 +2840,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { unsigned ExtOpc; switch (Opcode) { case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: // undef bits shifted out below + case TargetOpcode::G_CTTZ_ZERO_POISON: + case TargetOpcode::G_CTLZ_ZERO_POISON: // poison shifted out below ExtOpc = TargetOpcode::G_ANYEXT; break; case TargetOpcode::G_CTLS: @@ -2863,13 +2863,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MIBSrc = MIRBuilder.buildOr( WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit)); // Now we know the operand is non-zero, use the more relaxed opcode. 
- NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF; + NewOpc = TargetOpcode::G_CTTZ_ZERO_POISON; } unsigned SizeDiff = WideTy.getScalarSizeInBits() - CurTy.getScalarSizeInBits(); - if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) { + if (Opcode == TargetOpcode::G_CTLZ_ZERO_POISON) { // An optimization where the result is the CTLZ after the left shift by // (Difference in widety and current ty), that is, // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy)) @@ -4727,8 +4727,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerLoad(cast(MI)); case TargetOpcode::G_STORE: return lowerStore(cast(MI)); - case TargetOpcode::G_CTLZ_ZERO_UNDEF: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: + case TargetOpcode::G_CTTZ_ZERO_POISON: case TargetOpcode::G_CTLZ: case TargetOpcode::G_CTTZ: case TargetOpcode::G_CTPOP: @@ -5682,9 +5682,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_SSHLSAT: case G_USHLSAT: case G_CTLZ: - case G_CTLZ_ZERO_UNDEF: + case G_CTLZ_ZERO_POISON: case G_CTTZ: - case G_CTTZ_ZERO_UNDEF: + case G_CTTZ_ZERO_POISON: case G_CTPOP: case G_CTLS: case G_FCOPYSIGN: @@ -7577,7 +7577,7 @@ LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, unsigned NarrowSize = NarrowTy.getSizeInBits(); if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { - const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF; + const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_POISON; MachineIRBuilder &B = MIRBuilder; auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg); @@ -7585,12 +7585,11 @@ LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, auto C_0 = B.buildConstant(NarrowTy, 0); auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::integer(1), UnmergeSrc.getReg(1), C_0); - auto LoCTLZ = IsUndef ? 
- B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) : - B.buildCTLZ(DstTy, UnmergeSrc.getReg(0)); + auto LoCTLZ = IsUndef ? B.buildCTLZ_ZERO_POISON(DstTy, UnmergeSrc.getReg(0)) + : B.buildCTLZ(DstTy, UnmergeSrc.getReg(0)); auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize); auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize); - auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)); + auto HiCTLZ = B.buildCTLZ_ZERO_POISON(DstTy, UnmergeSrc.getReg(1)); B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ); MI.eraseFromParent(); @@ -7610,7 +7609,7 @@ LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, unsigned NarrowSize = NarrowTy.getSizeInBits(); if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { - const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF; + const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_POISON; MachineIRBuilder &B = MIRBuilder; auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg); @@ -7618,12 +7617,11 @@ LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, auto C_0 = B.buildConstant(NarrowTy, 0); auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), UnmergeSrc.getReg(0), C_0); - auto HiCTTZ = IsUndef ? - B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) : - B.buildCTTZ(DstTy, UnmergeSrc.getReg(1)); + auto HiCTTZ = IsUndef ? 
B.buildCTTZ_ZERO_POISON(DstTy, UnmergeSrc.getReg(1)) + : B.buildCTTZ(DstTy, UnmergeSrc.getReg(1)); auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize); auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize); - auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)); + auto LoCTTZ = B.buildCTTZ_ZERO_POISON(DstTy, UnmergeSrc.getReg(0)); B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ); MI.eraseFromParent(); @@ -7734,7 +7732,7 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { switch (Opc) { default: return UnableToLegalize; - case TargetOpcode::G_CTLZ_ZERO_UNDEF: { + case TargetOpcode::G_CTLZ_ZERO_POISON: { // This trivially expands to CTLZ. Observer.changingInstr(MI); MI.setDesc(TII.get(TargetOpcode::G_CTLZ)); @@ -7745,9 +7743,9 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned Len = SrcTy.getScalarSizeInBits(); - if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) { - // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero. - auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg); + if (isSupported({TargetOpcode::G_CTLZ_ZERO_POISON, {DstTy, SrcTy}})) { + // If CTLZ_ZERO_POISON is supported, emit that and a select for zero. + auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_POISON(DstTy, SrcReg); auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0); auto ICmp = MIRBuilder.buildICmp( CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc); @@ -7781,7 +7779,7 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { MI.eraseFromParent(); return Legalized; } - case TargetOpcode::G_CTTZ_ZERO_UNDEF: { + case TargetOpcode::G_CTTZ_ZERO_POISON: { // This trivially expands to CTTZ. 
Observer.changingInstr(MI); MI.setDesc(TII.get(TargetOpcode::G_CTTZ)); @@ -7792,10 +7790,10 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned Len = SrcTy.getScalarSizeInBits(); - if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) { - // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with + if (isSupported({TargetOpcode::G_CTTZ_ZERO_POISON, {DstTy, SrcTy}})) { + // If CTTZ_ZERO_POISON is legal or custom, emit that and a select with // zero. - auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg); + auto CttzZU = MIRBuilder.buildCTTZ_ZERO_POISON(DstTy, SrcReg); auto Zero = MIRBuilder.buildConstant(SrcTy, 0); auto ICmp = MIRBuilder.buildICmp( CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero); @@ -8261,7 +8259,7 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) { auto Zero32 = MIRBuilder.buildConstant(S32, 0); auto Zero64 = MIRBuilder.buildConstant(S64, 0); - auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src); + auto LZ = MIRBuilder.buildCTLZ_ZERO_POISON(S32, Src); auto K = MIRBuilder.buildConstant(S32, 127U + 63U); auto Sub = MIRBuilder.buildSub(S32, K, LZ); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index f552fb7de45dd..5d1e088fcb066 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -948,10 +948,10 @@ llvm::ConstantFoldUnaryIntOp(unsigned Opcode, LLT DstTy, Register Src, auto Fold = [Opcode, EltBits](const APInt &V) -> APInt { switch (Opcode) { case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: return APInt(EltBits, V.countl_zero()); case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTTZ_ZERO_POISON: return APInt(EltBits, V.countr_zero()); case TargetOpcode::G_CTPOP: return APInt(EltBits, V.popcount()); diff --git 
a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a5f7a5ae330f0..bf4d4cc261e42 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -476,9 +476,9 @@ namespace { SDValue visitBSWAP(SDNode *N); SDValue visitBITREVERSE(SDNode *N); SDValue visitCTLZ(SDNode *N); - SDValue visitCTLZ_ZERO_UNDEF(SDNode *N); + SDValue visitCTLZ_ZERO_POISON(SDNode *N); SDValue visitCTTZ(SDNode *N); - SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); + SDValue visitCTTZ_ZERO_POISON(SDNode *N); SDValue visitCTPOP(SDNode *N); SDValue visitSELECT(SDNode *N); SDValue visitVSELECT(SDNode *N); @@ -1997,9 +1997,9 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); case ISD::CTLZ: return visitCTLZ(N); - case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N); + case ISD::CTLZ_ZERO_POISON: return visitCTLZ_ZERO_POISON(N); case ISD::CTTZ: return visitCTTZ(N); - case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); + case ISD::CTTZ_ZERO_POISON: return visitCTTZ_ZERO_POISON(N); case ISD::CTPOP: return visitCTPOP(N); case ISD::SELECT: return visitSELECT(N); case ISD::VSELECT: return visitVSELECT(N); @@ -4104,8 +4104,8 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) { // if BitWidthDiff == BitWidth(Node) - BitWidth(Src) // --> // -// (ctlz_zero_undef (not (shl (anyextend Src) -// BitWidthDiff))) +// (ctlz_zero_poison (not (shl (anyextend Src) +// BitWidthDiff))) // // * Type Legalisation Pattern: // @@ -4118,7 +4118,7 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) { // and XorMask has more trailing ones than AndMask // --> // -// (ctlz_zero_undef (not (shl Src BitWidthDiff))) +// (ctlz_zero_poison (not (shl Src BitWidthDiff))) template static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) { const SDLoc DL(N); @@ -4164,7 +4164,7 @@ static SDValue 
foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) { SDValue Not = Matcher.getNode(ISD::XOR, DL, VT, LShift, DAG.getAllOnesConstant(DL, VT)); - return Matcher.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, Not); + return Matcher.getNode(ISD::CTLZ_ZERO_POISON, DL, VT, Not); } // Fold sub(x, mul(divrem(x,y)[0], y)) to divrem(x, y)[1] @@ -11048,7 +11048,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // target. if (((N1.getOpcode() == ISD::CTTZ && VT.getScalarSizeInBits() <= ShiftVT.getScalarSizeInBits()) || - N1.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && + N1.getOpcode() == ISD::CTTZ_ZERO_POISON) && N1.hasOneUse() && !TLI.isOperationLegalOrCustom(ISD::CTTZ, ShiftVT) && TLI.isOperationLegalOrCustom(ISD::MUL, VT)) { SDValue Y = N1.getOperand(0); @@ -12308,10 +12308,10 @@ SDValue DAGCombiner::visitCTLZ(SDNode *N) { if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTLZ, DL, VT, {N0})) return C; - // If the value is known never to be zero, switch to the undef version. - if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) + // If the value is known never to be zero, switch to the poison version. 
+ if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_POISON, VT)) if (DAG.isKnownNeverZero(N0)) - return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, N0); + return DAG.getNode(ISD::CTLZ_ZERO_POISON, DL, VT, N0); if (SDValue V = foldCTLZToCTLS(N0, DL)) return V; @@ -12319,14 +12319,14 @@ SDValue DAGCombiner::visitCTLZ(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { +SDValue DAGCombiner::visitCTLZ_ZERO_POISON(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc DL(N); - // fold (ctlz_zero_undef c1) -> c2 + // fold (ctlz_zero_poison c1) -> c2 if (SDValue C = - DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0})) + DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_POISON, DL, VT, {N0})) return C; if (SDValue V = foldCTLZToCTLS(N0, DL)) @@ -12344,22 +12344,22 @@ SDValue DAGCombiner::visitCTTZ(SDNode *N) { if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTTZ, DL, VT, {N0})) return C; - // If the value is known never to be zero, switch to the undef version. - if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) + // If the value is known never to be zero, switch to the poison version. + if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_POISON, VT)) if (DAG.isKnownNeverZero(N0)) - return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, DL, VT, N0); + return DAG.getNode(ISD::CTTZ_ZERO_POISON, DL, VT, N0); return SDValue(); } -SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { +SDValue DAGCombiner::visitCTTZ_ZERO_POISON(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc DL(N); - // fold (cttz_zero_undef c1) -> c2 + // fold (cttz_zero_poison c1) -> c2 if (SDValue C = - DAG.FoldConstantArithmetic(ISD::CTTZ_ZERO_UNDEF, DL, VT, {N0})) + DAG.FoldConstantArithmetic(ISD::CTTZ_ZERO_POISON, DL, VT, {N0})) return C; return SDValue(); } @@ -17966,8 +17966,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // operands via the operand numbers. 
The typical scenario is that we have // something like this // t262: i32 = freeze t181 - // t150: i32 = ctlz_zero_undef t262 - // t184: i32 = ctlz_zero_undef t181 + // t150: i32 = ctlz_zero_poison t262 + // t184: i32 = ctlz_zero_poison t181 // t268: i32 = select_cc t181, Constant:i32<0>, t184, t186, setne:ch // When freezing the t181 operand we get t262 back, and then the // ReplaceAllUsesOfValueWith call will not only replace t181 by t262, but @@ -30482,13 +30482,13 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, } // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X) - // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X) + // select_cc seteq X, 0, sizeof(X), ctlz_zero_poison(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X) - // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X) + // select_cc seteq X, 0, sizeof(X), cttz_zero_poison(X) -> cttz(X) // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X) - // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X) + // select_cc setne X, 0, ctlz_zero_poison(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X) - // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X) + // select_cc setne X, 0, cttz_zero_poison(X), sizeof(X) -> cttz(X) if (N1C && N1C->isZero() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue ValueOnZero = N2; SDValue Count = N3; @@ -30498,17 +30498,17 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, // Check if the value on zero is a constant equal to the bits in the type. if (auto *ValueOnZeroC = dyn_cast(ValueOnZero)) { if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) { - // If the other operand is cttz/cttz_zero_undef of N0, and cttz is + // If the other operand is cttz/cttz_zero_poison of N0, and cttz is // legal, combine to just cttz. 
if ((Count.getOpcode() == ISD::CTTZ || - Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && + Count.getOpcode() == ISD::CTTZ_ZERO_POISON) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT))) return DAG.getNode(ISD::CTTZ, DL, VT, N0); - // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is + // If the other operand is ctlz/ctlz_zero_poison of N0, and ctlz is // legal, combine to just ctlz. if ((Count.getOpcode() == ISD::CTLZ || - Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) && + Count.getOpcode() == ISD::CTLZ_ZERO_POISON) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT))) return DAG.getNode(ISD::CTLZ, DL, VT, N0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 4550d9e67d8f3..9e4f169cd4f3f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1283,7 +1283,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::CTTZ_ELTS: case ISD::CTTZ_ELTS_ZERO_POISON: case ISD::VP_CTTZ_ELTS: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case ISD::VP_CTTZ_ELTS_ZERO_POISON: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; @@ -3261,7 +3261,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: if ((Tmp1 = TLI.expandCTLZ(Node, DAG))) Results.push_back(Tmp1); break; @@ -3270,7 +3270,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: if ((Tmp1 = TLI.expandCTTZ(Node, DAG))) Results.push_back(Tmp1); break; @@ -4623,7 +4623,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(TLI.expandVecReduce(Node, DAG)); break; case ISD::VP_CTTZ_ELTS: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case 
ISD::VP_CTTZ_ELTS_ZERO_POISON: Results.push_back(TLI.expandVPCTTZElements(Node, DAG)); break; case ISD::CLEAR_CACHE: @@ -5373,7 +5373,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::MUL_I16, RTLIB::MUL_I32, RTLIB::MUL_I64, RTLIB::MUL_I128)); break; - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: Results.push_back(ExpandBitCountingLibCall( Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64, RTLIB::CTLZ_I128)); break; @@ -5510,12 +5510,12 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { SDValue Tmp1, Tmp2, Tmp3, Tmp4; switch (Node->getOpcode()) { case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::CTLZ: case ISD::CTPOP: { // Zero extend the argument unless its cttz, then use any_extend. if (Node->getOpcode() == ISD::CTTZ || - Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF) + Node->getOpcode() == ISD::CTTZ_ZERO_POISON) Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0)); else Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); @@ -5529,9 +5529,9 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { OVT.getSizeInBits()); Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1, DAG.getConstant(TopBit, dl, NVT)); - NewOpc = ISD::CTTZ_ZERO_UNDEF; + NewOpc = ISD::CTTZ_ZERO_POISON; } - // Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is + // Perform the larger operation. For CTPOP and CTTZ_ZERO_POISON, this is // already the correct result. 
Tmp1 = DAG.getNode(NewOpc, dl, NVT, Tmp1); if (NewOpc == ISD::CTLZ) { @@ -5544,7 +5544,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1, SDNodeFlags::NoWrap)); break; } - case ISD::CTLZ_ZERO_UNDEF: { + case ISD::CTLZ_ZERO_POISON: { // We know that the argument is unlikely to be zero, hence we can take a // different approach as compared to ISD::CTLZ diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index e7bf5a74cba34..070c366556186 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -66,21 +66,21 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; - case ISD::VP_CTLZ_ZERO_UNDEF: + case ISD::VP_CTLZ_ZERO_POISON: case ISD::VP_CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; case ISD::CTLS: Res = PromoteIntRes_CTLS(N); break; case ISD::PARITY: case ISD::VP_CTPOP: case ISD::CTPOP: Res = PromoteIntRes_CTPOP_PARITY(N); break; - case ISD::VP_CTTZ_ZERO_UNDEF: + case ISD::VP_CTTZ_ZERO_POISON: case ISD::VP_CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::CTTZ_ELTS_ZERO_POISON: case ISD::CTTZ_ELTS: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case ISD::VP_CTTZ_ELTS_ZERO_POISON: case ISD::VP_CTTZ_ELTS: Res = PromoteIntRes_VP_CttzElements(N); break; @@ -737,7 +737,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { // original type. 
if (!OVT.isVector() && TLI.isTypeLegal(NVT) && !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ, NVT) && - !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ_ZERO_UNDEF, NVT)) { + !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ_ZERO_POISON, NVT)) { if (SDValue Result = TLI.expandCTLZ(N, DAG)) { Result = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Result); return Result; @@ -765,8 +765,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { ExtractLeadingBits, Mask, EVL, SDNodeFlags::NoUnsignedWrap); } - if (CtlzOpcode == ISD::CTLZ_ZERO_UNDEF || - CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) { + if (CtlzOpcode == ISD::CTLZ_ZERO_POISON || + CtlzOpcode == ISD::VP_CTLZ_ZERO_POISON) { // Any Extend the argument SDValue Op = GetPromotedInteger(N->getOperand(0)); // Op = Op << (sizeinbits(NVT) - sizeinbits(Old VT)) @@ -839,7 +839,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { // larger type. if (!OVT.isVector() && TLI.isTypeLegal(NVT) && !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ, NVT) && - !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ_ZERO_UNDEF, NVT) && + !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ_ZERO_POISON, NVT) && !TLI.isOperationLegal(ISD::CTPOP, NVT) && !TLI.isOperationLegal(ISD::CTLZ, NVT)) { if (SDValue Result = TLI.expandCTTZ(N, DAG)) { @@ -857,12 +857,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { OVT.getScalarSizeInBits()); if (NewOpc == ISD::CTTZ) { Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT)); - NewOpc = ISD::CTTZ_ZERO_UNDEF; + NewOpc = ISD::CTTZ_ZERO_POISON; } else { Op = DAG.getNode(ISD::VP_OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT), N->getOperand(1), N->getOperand(2)); - NewOpc = ISD::VP_CTTZ_ZERO_UNDEF; + NewOpc = ISD::VP_CTTZ_ZERO_POISON; } } if (!N->isVPOpcode()) @@ -3139,11 +3139,11 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ABS: ExpandIntRes_ABS(N, Lo, Hi); break; case ISD::ABDS: case ISD::ABDU: ExpandIntRes_ABD(N, Lo, Hi); break; - 
case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break; case ISD::CTLS: ExpandIntRes_CTLS(N, Lo, Hi); break; case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break; - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break; case ISD::GET_ROUNDING:ExpandIntRes_GET_ROUNDING(N, Lo, Hi); break; case ISD::STRICT_FP_TO_SINT: @@ -4240,7 +4240,7 @@ void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N, DAG.getConstant(0, dl, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo); - SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi); + SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_POISON, dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ, DAG.getNode(ISD::ADD, dl, NVT, LoLZ, @@ -4321,7 +4321,7 @@ void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N, SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, DAG.getConstant(0, dl, NVT), ISD::SETNE); - SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo); + SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_POISON, dl, NVT, Lo); SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index f054ab3fdc911..0a9c3dda7f330 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -389,8 +389,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::BITREVERSE: case ISD::CTLZ: case ISD::CTTZ: - case ISD::CTLZ_ZERO_UNDEF: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: + case ISD::CTTZ_ZERO_POISON: case ISD::CTPOP: case ISD::CLMUL: case ISD::CLMULH: @@ -1151,28 +1151,28 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { } break; case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: if (SDValue Expanded 
= TLI.expandCTLZ(Node, DAG)) { Results.push_back(Expanded); return; } break; case ISD::VP_CTLZ: - case ISD::VP_CTLZ_ZERO_UNDEF: + case ISD::VP_CTLZ_ZERO_POISON: if (SDValue Expanded = TLI.expandVPCTLZ(Node, DAG)) { Results.push_back(Expanded); return; } break; case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: if (SDValue Expanded = TLI.expandCTTZ(Node, DAG)) { Results.push_back(Expanded); return; } break; case ISD::VP_CTTZ: - case ISD::VP_CTTZ_ZERO_UNDEF: + case ISD::VP_CTTZ_ZERO_POISON: if (SDValue Expanded = TLI.expandVPCTTZ(Node, DAG)) { Results.push_back(Expanded); return; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index cbd67675aab96..d363cf91f0e6c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -96,10 +96,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: case ISD::CTPOP: case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::FABS: case ISD::FACOS: case ISD::FASIN: @@ -1412,10 +1412,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_CTLZ: case ISD::CTTZ: case ISD::VP_CTTZ: - case ISD::CTLZ_ZERO_UNDEF: - case ISD::VP_CTLZ_ZERO_UNDEF: - case ISD::CTTZ_ZERO_UNDEF: - case ISD::VP_CTTZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: + case ISD::VP_CTLZ_ZERO_POISON: + case ISD::CTTZ_ZERO_POISON: + case ISD::VP_CTTZ_ZERO_POISON: case ISD::CTPOP: case ISD::VP_CTPOP: case ISD::FABS: case ISD::VP_FABS: @@ -3841,7 +3841,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { Res = SplitVecOp_CttzElts(N); break; case ISD::VP_CTTZ_ELTS: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case ISD::VP_CTTZ_ELTS_ZERO_POISON: Res = SplitVecOp_VP_CttzElements(N); break; case 
ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: @@ -4967,7 +4967,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_CttzElements(SDNode *N) { SDValue VLo = DAG.getZExtOrTrunc(EVLLo, DL, ResVT); // if VP_CTTZ_ELTS(Lo) != EVLLo => VP_CTTZ_ELTS(Lo). - // else => EVLLo + (VP_CTTZ_ELTS(Hi) or VP_CTTZ_ELTS_ZERO_UNDEF(Hi)). + // else => EVLLo + (VP_CTTZ_ELTS(Hi) or VP_CTTZ_ELTS_ZERO_POISON(Hi)). SDValue ResLo = DAG.getNode(ISD::VP_CTTZ_ELTS, DL, ResVT, Lo, MaskLo, EVLLo); SDValue ResLoNotEVL = DAG.getSetCC(DL, getSetCCResultType(ResVT), ResLo, VLo, ISD::SETNE); @@ -5356,14 +5356,14 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_BSWAP: case ISD::CTLZ: case ISD::VP_CTLZ: - case ISD::CTLZ_ZERO_UNDEF: - case ISD::VP_CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: + case ISD::VP_CTLZ_ZERO_POISON: case ISD::CTPOP: case ISD::VP_CTPOP: case ISD::CTTZ: case ISD::VP_CTTZ: - case ISD::CTTZ_ZERO_UNDEF: - case ISD::VP_CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: + case ISD::VP_CTTZ_ZERO_POISON: case ISD::FNEG: case ISD::VP_FNEG: case ISD::FABS: case ISD::VP_FABS: case ISD::VP_SQRT: @@ -7500,7 +7500,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { Res = WidenVecOp_VP_REDUCE(N); break; case ISD::VP_CTTZ_ELTS: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case ISD::VP_CTTZ_ELTS_ZERO_POISON: Res = WidenVecOp_VP_CttzElements(N); break; case ISD::VECTOR_FIND_LAST_ACTIVE: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index a221df567a10f..f0b03b89ec1d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3914,7 +3914,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; } case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: { + case ISD::CTTZ_ZERO_POISON: { Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. 
unsigned PossibleTZ = Known2.countMaxTrailingZeros(); @@ -3923,7 +3923,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; } case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: { + case ISD::CTLZ_ZERO_POISON: { Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. unsigned PossibleLZ = Known2.countMaxLeadingZeros(); @@ -5980,11 +5980,12 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, return includesPoison(Kind) && !getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1); - case ISD::CTTZ_ZERO_UNDEF: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: + case ISD::CTLZ_ZERO_POISON: // If the amount is zero then the result will be poison. // TODO: Add isKnownNeverZero DemandedElts handling. - return !isKnownNeverZero(Op.getOperand(0), Depth + 1); + return includesPoison(Kind) && + !isKnownNeverZero(Op.getOperand(0), Depth + 1); case ISD::SCALAR_TO_VECTOR: // Check if we demand any upper (undef) elements. 
@@ -7017,9 +7018,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::CTPOP: case ISD::CTLS: case ISD::STEP_VECTOR: { @@ -7554,11 +7555,11 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, return getConstant(Val.popcount(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: return getConstant(Val.countl_zero(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: return getConstant(Val.countr_zero(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTLS: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 894f04bb4668d..e88a07901e289 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7341,7 +7341,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); - setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF, + setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_POISON, sdl, Ty, Arg)); return; } @@ -7349,7 +7349,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); - setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF, + setValue(&I, DAG.getNode(CI->isZero() ? 
ISD::CTLZ : ISD::CTLZ_ZERO_POISON, sdl, Ty, Arg)); return; } @@ -8622,17 +8622,17 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { switch (VPIntrin.getIntrinsicID()) { case Intrinsic::vp_ctlz: { bool IsZeroUndef = cast(VPIntrin.getArgOperand(1))->isOne(); - ResOPC = IsZeroUndef ? ISD::VP_CTLZ_ZERO_UNDEF : ISD::VP_CTLZ; + ResOPC = IsZeroUndef ? ISD::VP_CTLZ_ZERO_POISON : ISD::VP_CTLZ; break; } case Intrinsic::vp_cttz: { bool IsZeroUndef = cast(VPIntrin.getArgOperand(1))->isOne(); - ResOPC = IsZeroUndef ? ISD::VP_CTTZ_ZERO_UNDEF : ISD::VP_CTTZ; + ResOPC = IsZeroUndef ? ISD::VP_CTTZ_ZERO_POISON : ISD::VP_CTTZ; break; } case Intrinsic::vp_cttz_elts: { bool IsZeroPoison = cast(VPIntrin.getArgOperand(1))->isOne(); - ResOPC = IsZeroPoison ? ISD::VP_CTTZ_ELTS_ZERO_UNDEF : ISD::VP_CTTZ_ELTS; + ResOPC = IsZeroPoison ? ISD::VP_CTTZ_ELTS_ZERO_POISON : ISD::VP_CTTZ_ELTS; break; } #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \ @@ -9025,10 +9025,10 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic( } case ISD::VP_ABS: case ISD::VP_CTLZ: - case ISD::VP_CTLZ_ZERO_UNDEF: + case ISD::VP_CTLZ_ZERO_POISON: case ISD::VP_CTTZ: - case ISD::VP_CTTZ_ZERO_UNDEF: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case ISD::VP_CTTZ_ZERO_POISON: + case ISD::VP_CTTZ_ELTS_ZERO_POISON: case ISD::VP_CTTZ_ELTS: { SDValue Result = DAG.getNode(Opcode, DL, VTs, {OpValues[0], OpValues[2], OpValues[3]}); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index ce78072d21114..4aab651978bce 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -521,9 +521,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::BSWAP: return "bswap"; case ISD::CTPOP: return "ctpop"; case ISD::CTTZ: return "cttz"; - case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; + case ISD::CTTZ_ZERO_POISON: return "cttz_zero_poison"; case 
ISD::CTLZ: return "ctlz"; - case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; + case ISD::CTLZ_ZERO_POISON: return "ctlz_zero_poison"; case ISD::CTLS: return "ctls"; case ISD::PARITY: return "parity"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 7161caab53da3..ce1493200b9b1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9161,7 +9161,7 @@ TargetLowering::expandCONVERT_FROM_ARBITRARY_FP(SDNode *Node, { const unsigned IntVTBits = DstBits; SDValue LeadingZeros = - DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, IntVT, MantField); + DAG.getNode(ISD::CTLZ_ZERO_POISON, dl, IntVT, MantField); const int DenormExpConst = (int)IntVTBits + DstBias - SrcBias - (int)SrcMant; @@ -10225,16 +10225,16 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const { SDValue Op = Node->getOperand(0); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - // If the non-ZERO_UNDEF version is supported we can use that instead. - if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF && + // If the non-ZERO_POISON version is supported we can use that instead. + if (Node->getOpcode() == ISD::CTLZ_ZERO_POISON && isOperationLegalOrCustom(ISD::CTLZ, VT)) return DAG.getNode(ISD::CTLZ, dl, VT, Op); - // If the ZERO_UNDEF version is supported use that and handle the zero case. - if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { + // If the ZERO_POISON version is supported use that and handle the zero case. 
+ if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_POISON, VT)) { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op); + SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_POISON, dl, VT, Op); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); return DAG.getSelect(dl, VT, SrcIsZero, @@ -10309,7 +10309,7 @@ SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const { SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, DAG.getShiftAmountConstant(1, VT, dl)); SDValue Or = DAG.getNode(ISD::OR, dl, VT, Shl, DAG.getConstant(1, dl, VT)); - return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Or); + return DAG.getNode(ISD::CTLZ_ZERO_POISON, dl, VT, Or); } SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, @@ -10349,7 +10349,7 @@ SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getEntryNode(), DAG.getMemBasePlusOffset(CPIdx, Lookup, DL), PtrInfo, MVT::i8); - if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF) + if (Node->getOpcode() == ISD::CTTZ_ZERO_POISON) return ExtLoad; EVT SetCCVT = @@ -10366,16 +10366,16 @@ SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const { SDValue Op = Node->getOperand(0); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - // If the non-ZERO_UNDEF version is supported we can use that instead. - if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF && + // If the non-ZERO_POISON version is supported we can use that instead. + if (Node->getOpcode() == ISD::CTTZ_ZERO_POISON && isOperationLegalOrCustom(ISD::CTTZ, VT)) return DAG.getNode(ISD::CTTZ, dl, VT, Op); - // If the ZERO_UNDEF version is supported use that and handle the zero case. - if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { + // If the ZERO_POISON version is supported use that and handle the zero case. 
+ if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_POISON, VT)) { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op); + SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_POISON, dl, VT, Op); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); return DAG.getSelect(dl, VT, SrcIsZero, diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 7dcdf7f952e27..e50e1832e37a6 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1194,7 +1194,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::TRUNCATE_USAT_U, VT, Expand); // These default to Expand so they will be expanded to CTLZ/CTTZ by default. - setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + setOperationAction({ISD::CTLZ_ZERO_POISON, ISD::CTTZ_ZERO_POISON}, VT, Expand); setOperationAction(ISD::CTLS, VT, Expand); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 7d87847d7fc67..14a3f7547fb09 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -397,7 +397,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarizeIf(scalarOrEltWiderThan(0, 32), 0) .scalarSameSizeAs(0, 1); - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower(); + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON).lower(); getActionDefinitionsBuilder(G_CTTZ) .lowerIf(isVector(0)) @@ -407,7 +407,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor(HasCSSC, {s32, s64}) .customFor(!HasCSSC, {s32, s64}); - getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower(); + getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON).lower(); getActionDefinitionsBuilder(G_BITREVERSE) .legalFor({i32, i64, 
v8i8, v16i8}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 058ca9c29625f..34e85ed2aa170 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -514,11 +514,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, Legal); setOperationAction( - {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, + {ISD::CTTZ, ISD::CTTZ_ZERO_POISON, ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, MVT::i64, Custom); for (auto VT : {MVT::i8, MVT::i16}) - setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom); + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, VT, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32, @@ -1475,9 +1475,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: return LowerCTLZ_CTTZ(Op, DAG); case ISD::CTLS: return LowerCTLS(Op, DAG); @@ -1517,7 +1517,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Lowered); return; case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG)) Results.push_back(Lowered); return; @@ -3329,11 +3329,11 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { } static bool isCtlzOpc(unsigned Opc) { - return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON; } static bool isCttzOpc(unsigned Opc) { - return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; + return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_POISON; } SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, @@ -3353,7 +3353,7 @@ 
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32); SDValue NewOp; - if (Opc == ISD::CTLZ_ZERO_UNDEF) { + if (Opc == ISD::CTLZ_ZERO_POISON) { NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg); NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits); NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); @@ -3374,21 +3374,21 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons bool Ctlz = isCtlzOpc(Op.getOpcode()); unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32; - bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF || - Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF; + bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_POISON || + Op.getOpcode() == ISD::CTTZ_ZERO_POISON; bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64; if (Src.getValueType() == MVT::i32 || Is64BitScalar) { // (ctlz hi:lo) -> (umin (ffbh src), 32) // (cttz hi:lo) -> (umin (ffbl src), 32) - // (ctlz_zero_undef src) -> (ffbh src) - // (cttz_zero_undef src) -> (ffbl src) + // (ctlz_zero_poison src) -> (ffbh src) + // (cttz_zero_poison src) -> (ffbl src) // 64-bit scalar version produce 32-bit result // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64) // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64) - // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src) - // (cttz_zero_undef src) -> (S_FF1_I32_B64 src) + // (ctlz_zero_poison src) -> (S_FLBIT_I32_B64 src) + // (cttz_zero_poison src) -> (S_FF1_I32_B64 src) SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src); if (!ZeroUndef) { const SDValue ConstVal = DAG.getConstant( @@ -3406,8 +3406,8 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64) // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64) - // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) - // (cttz_zero_undef hi:lo) -> (umin 
(add (ffbl hi), 32), (ffbl lo)) + // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) + // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT; const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32); @@ -4994,8 +4994,8 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C ISD::CondCode CCOpcode = cast(Cond.getOperand(2))->get(); SDValue CmpLHS = Cond.getOperand(0); - // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x - // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x + // select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x + // select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x if (CCOpcode == ISD::SETEQ && (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) { @@ -5004,8 +5004,8 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C return getFFBX_U32(DAG, CmpLHS, SL, Opc); } - // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x - // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x + // select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x + // select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x if (CCOpcode == ISD::SETNE && (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 1b9a8869d18b6..ae6546939d0a7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -495,11 +495,11 @@ def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2), def AMDGPUffbh_u32 : PatFrags<(ops node:$src), - [(ctlz_zero_undef node:$src), + [(ctlz_zero_poison node:$src), (AMDGPUffbh_u32_impl node:$src)]>; def AMDGPUffbl_b32 : PatFrags<(ops 
node:$src), - [(cttz_zero_undef node:$src), + [(cttz_zero_poison node:$src), (AMDGPUffbl_b32_impl node:$src)]>; def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 36365111f15a1..43db1ead84c80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1376,7 +1376,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .custom(); // The 64-bit versions produce 32-bit results, but only on the SALU. - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON) .legalFor({{S32, S32}, {S32, S64}}) .customIf(scalarNarrowerThan(1, 32)) .clampScalar(0, S32, S32) @@ -1385,7 +1385,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); - getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON) .legalFor({{S32, S32}, {S32, S64}}) .clampScalar(0, S32, S32) .clampScalar(1, S32, S64) @@ -2324,8 +2324,8 @@ bool AMDGPULegalizerInfo::legalizeCustom( return legalizeCTLZ_CTTZ(MI, MRI, B); case TargetOpcode::G_CTLS: return legalizeCTLS(MI, MRI, B); - case TargetOpcode::G_CTLZ_ZERO_UNDEF: - return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); + case TargetOpcode::G_CTLZ_ZERO_POISON: + return legalizeCTLZ_ZERO_POISON(MI, MRI, B); case TargetOpcode::G_STACKSAVE: return legalizeStackSave(MI, B); case TargetOpcode::G_GET_FPENV: @@ -4658,7 +4658,7 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, } // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to -// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input +// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input // case with a single min instruction instead of a compare+select. 
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -4678,9 +4678,9 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_POISON(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); LLT SrcTy = MRI.getType(Src); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 36ce5d9740762..53cd6d786ee2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -117,8 +117,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; - bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; + bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 090ffed1e1e53..a6e302d48c008 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -828,12 +828,12 @@ bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) { // Split 64-bit find-first-bit operations into 32-bit halves: // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32)) // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32)) - // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32)) - // (cttz_zero_undef 
hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32)) + // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32)) + // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32)) unsigned Opc = MI.getOpcode(); // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid - // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add + // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add // is fine. unsigned FFBOpc; unsigned AddOpc; @@ -849,12 +849,12 @@ bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) { AddOpc = AMDGPU::G_UADDSAT; SearchFromMSB = false; break; - case AMDGPU::G_CTLZ_ZERO_UNDEF: + case AMDGPU::G_CTLZ_ZERO_POISON: FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32; AddOpc = AMDGPU::G_ADD; SearchFromMSB = true; break; - case AMDGPU::G_CTTZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ_ZERO_POISON: FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32; AddOpc = AMDGPU::G_ADD; SearchFromMSB = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5cc0d073df17b..061b8dc070ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1284,8 +1284,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {Sgpr64}}) .Div(S64, {{Vgpr64}, {Vgpr64}}); - addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_UNDEF, - G_CTTZ_ZERO_UNDEF}) + addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_POISON, + G_CTTZ_ZERO_POISON}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}) .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index ebdd709c34f08..15eae3cd6d088 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2707,8 
+2707,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( } case AMDGPU::G_AMDGPU_FFBH_U32: case AMDGPU::G_AMDGPU_FFBL_B32: - case AMDGPU::G_CTLZ_ZERO_UNDEF: - case AMDGPU::G_CTTZ_ZERO_UNDEF: { + case AMDGPU::G_CTLZ_ZERO_POISON: + case AMDGPU::G_CTTZ_ZERO_POISON: { const RegisterBank *DstBank = OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; if (DstBank == &AMDGPU::SGPRRegBank) @@ -2722,22 +2722,22 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // We can narrow this more efficiently than Helper can by using ffbh/ffbl // which return -1 when the input is zero: - // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) - // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) + // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) + // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); SmallVector SrcRegs(OpdMapper.getVRegs(1)); - unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF + unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_POISON ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 - : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF - ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 - : Opc; + : Opc == AMDGPU::G_CTTZ_ZERO_POISON + ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 + : Opc; unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); unsigned AddOpc = - Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF + Opc == AMDGPU::G_CTLZ_ZERO_POISON || Opc == AMDGPU::G_CTTZ_ZERO_POISON ? 
AMDGPU::G_ADD : AMDGPU::G_UADDSAT; Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); @@ -4294,8 +4294,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_AMDGPU_FFBH_U32: case AMDGPU::G_AMDGPU_FFBL_B32: - case AMDGPU::G_CTLZ_ZERO_UNDEF: - case AMDGPU::G_CTTZ_ZERO_UNDEF: { + case AMDGPU::G_CTLZ_ZERO_POISON: + case AMDGPU::G_CTTZ_ZERO_POISON: { unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 4ebe154166b3f..ae8e73eb5f329 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -177,10 +177,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::i64, Expand); if (Subtarget->hasFFBH()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, Custom); if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Custom); // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we // need it for R600. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 95b214f0da4c8..8b89366f89c5a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -518,8 +518,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMadMacF32Insts()) setOperationAction(ISD::FMAD, MVT::f32, Legal); - setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); - setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, MVT::i32, Custom); + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_POISON}, MVT::i32, Custom); setOperationAction(ISD::CTLS, MVT::i32, Custom); // We only really have 32-bit BFE instructions (and 16-bit on VI). @@ -592,7 +592,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::BITREVERSE, ISD::CTTZ, - ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, + ISD::CTTZ_ZERO_POISON, ISD::CTLZ, ISD::CTLZ_ZERO_POISON, ISD::CTPOP}, MVT::i16, Promote); @@ -17420,7 +17420,7 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, } static bool isCtlzOpc(unsigned Opc) { - return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON; } SDValue SITargetLowering::performSubCombine(SDNode *N, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index ff8df9b33b3cc..259467f78afb4 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -775,15 +775,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); - 
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v1i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::v2i64, Custom); for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::MULHS, VT, Expand); @@ -1025,7 +1025,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::CTPOP, MVT::i64, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { setOperationAction(ISD::CTLZ, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, LibCall); } // @llvm.readcyclecounter requires the Performance Monitors extension. 
@@ -6285,7 +6285,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, } if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && - (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + (N->getOpcode() == ISD::CTTZ_ZERO_POISON)) { // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 unsigned NumBits = ElemTy.getSizeInBits(); SDValue WidthMinus1 = @@ -10466,7 +10466,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTTZ_ZERO_POISON: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index ad3e4f116321b..8b334fe84be45 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -211,12 +211,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) : ST(ST) { .legalFor({s32, s32}) .clampScalar(1, s32, s32) .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON) .lowerFor({s32, s32}) .clampScalar(1, s32, s32) .clampScalar(0, s32, s32); } else { - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON) .libcallFor({s32, s32}) .clampScalar(1, s32, s32) .clampScalar(0, s32, s32); diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index d846e6378acef..7bdb17e7d92b4 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -132,8 +132,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, VT, Expand); 
setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_POISON, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index e487d80e845f5..95074a1efc2ef 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -259,14 +259,14 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({{s32, s32}}) .maxScalar(0, s32) .maxScalar(1, s32); - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON) .lowerFor({{s32, s32}}); getActionDefinitionsBuilder(G_CTTZ) .lowerFor({{s32, s32}}) .maxScalar(0, s32) .maxScalar(1, s32); - getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON) .lowerFor({{s32, s32}, {s64, s64}}); getActionDefinitionsBuilder(G_CTPOP) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index edd372a9db344..bc673859ff03f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -805,7 +805,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, {MVT::i16, MVT::i32, MVT::i64}, Legal); - setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i16, + setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, MVT::i16, Promote); setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal); setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i64, Custom); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp 
b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index b7efee8bf4c83..6a690402696e6 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -219,8 +219,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) BSWAPActions.maxScalar(0, sXLen).lower(); auto &CountZerosActions = getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}); - auto &CountZerosUndefActions = - getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}); + auto &CountZerosPoisonActions = + getActionDefinitionsBuilder({G_CTLZ_ZERO_POISON, G_CTTZ_ZERO_POISON}); if (ST.hasStdExtZbb()) { CountZerosActions.legalFor({{sXLen, sXLen}}) .customFor({{s32, s32}}) @@ -229,9 +229,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .scalarSameSizeAs(1, 0); } else { CountZerosActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower(); - CountZerosUndefActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0); + CountZerosPoisonActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0); } - CountZerosUndefActions.lower(); + CountZerosPoisonActions.lower(); auto &CountSignActions = getActionDefinitionsBuilder(G_CTLS); if (ST.hasStdExtP()) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2191d189fea49..93e820b4713ec 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -427,7 +427,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasCTZLike()) { if (Subtarget.is64Bit()) - setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_POISON}, MVT::i32, Custom); } else { setOperationAction(ISD::CTTZ, XLenVT, Expand); } @@ -448,7 +448,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Use default promotion for i32 without Zbb. 
if (Subtarget.is64Bit() && (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP())) - setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, MVT::i32, Custom); } else { setOperationAction(ISD::CTLZ, XLenVT, Expand); } @@ -873,7 +873,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN, ISD::VP_MERGE, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, - ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}; + ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_POISON}; static const unsigned FloatingPointVPOps[] = { ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, @@ -929,7 +929,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VP_MERGE, VT, Custom); setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON, - ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}, + ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_POISON}, VT, Custom); setOperationAction( @@ -1082,12 +1082,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITREVERSE, VT, Expand); setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand); - // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the + // Lower CTLZ_ZERO_POISON and CTTZ_ZERO_POISON if element of VT in the // range of f32. 
EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); if (isTypeLegal(FloatVT)) { setOperationAction( - {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + {ISD::CTLZ, ISD::CTLZ_ZERO_POISON, ISD::CTTZ_ZERO_POISON}, VT, Custom); } } @@ -1587,16 +1587,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::BSWAP, ISD::ROTL, ISD::ROTR}, VT, Custom); if (Subtarget.hasStdExtZvbb()) { - setOperationAction({ISD::BITREVERSE, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, - ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP}, + setOperationAction({ISD::BITREVERSE, ISD::CTLZ, ISD::CTLZ_ZERO_POISON, + ISD::CTTZ, ISD::CTTZ_ZERO_POISON, ISD::CTPOP}, VT, Custom); } else { - // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the + // Lower CTLZ_ZERO_POISON and CTTZ_ZERO_POISON if element of VT in the // range of f32. EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); if (isTypeLegal(FloatVT)) setOperationAction( - {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + {ISD::CTLZ, ISD::CTLZ_ZERO_POISON, ISD::CTTZ_ZERO_POISON}, VT, Custom); } @@ -6867,11 +6867,11 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { isInterleaveShuffle(M, SVT, Dummy1, Dummy2, Subtarget); } -// Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting +// Lower CTLZ_ZERO_POISON or CTTZ_ZERO_POISON by converting to FP and extracting // the exponent. 
SDValue -RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, - SelectionDAG &DAG) const { +RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_POISON(SDValue Op, + SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); SDValue Src = Op.getOperand(0); @@ -6892,9 +6892,9 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, assert(DAG.getTargetLoweringInfo().isTypeLegal(FloatVT) && "Expected legal float type!"); - // For CTTZ_ZERO_UNDEF, we need to extract the lowest set bit using X & -X. + // For CTTZ_ZERO_POISON, we need to extract the lowest set bit using X & -X. // The trailing zero count is equal to log2 of this single bit value. - if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { + if (Op.getOpcode() == ISD::CTTZ_ZERO_POISON) { SDValue Neg = DAG.getNegative(Src, DL, VT); Src = DAG.getNode(ISD::AND, DL, VT, Src, Neg); } @@ -6935,7 +6935,7 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, // The exponent contains log2 of the value in biased form. unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127; // For trailing zeros, we just need to subtract the bias. - if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) + if (Op.getOpcode() == ISD::CTTZ_ZERO_POISON) return DAG.getNode(ISD::SUB, DL, VT, Exp, DAG.getConstant(ExponentBias, DL, VT)); @@ -6980,7 +6980,7 @@ SDValue RISCVTargetLowering::lowerVPCttzElements(SDValue Op, } SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Source, Mask, EVL); - if (Op->getOpcode() == ISD::VP_CTTZ_ELTS_ZERO_UNDEF) + if (Op->getOpcode() == ISD::VP_CTTZ_ELTS_ZERO_POISON) // In this case, we can interpret poison as -1, so nothing to do further. 
return Res; @@ -7563,9 +7563,9 @@ static unsigned getRISCVVLOp(SDValue Op) { VP_CASE(SREM) // VP_SREM VP_CASE(UDIV) // VP_UDIV VP_CASE(UREM) // VP_UREM - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: return RISCVISD::CTLZ_VL; - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: return RISCVISD::CTTZ_VL; case ISD::FMA: return RISCVISD::VFMADD_VL; @@ -8377,7 +8377,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ true); return lowerVPREDUCE(Op, DAG); case ISD::VP_CTTZ_ELTS: - case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: + case ISD::VP_CTTZ_ELTS_ZERO_POISON: return lowerVPCttzElements(Op, DAG); case ISD::UNDEF: { MVT ContainerVT = getContainerForFixedLengthVector(Op.getSimpleValueType()); @@ -8892,13 +8892,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::ABS: return lowerABS(Op, DAG); case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: if (Subtarget.hasStdExtZvbb()) return lowerToScalableOp(Op, DAG); assert(Op.getOpcode() != ISD::CTTZ); - return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG); + return lowerCTLZ_CTTZ_ZERO_POISON(Op, DAG); case ISD::CLMUL: { MVT VT = Op.getSimpleValueType(); assert(VT.isScalableVector() && Subtarget.hasStdExtZvbc() && @@ -15003,9 +15003,9 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(customLegalizeToWOp(N, DAG)); break; case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: case ISD::CTLS: { assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -15016,11 +15016,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: Opc = RISCVISD::CTZW; 
break; case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_POISON: Opc = RISCVISD::CLZW; break; case ISD::CTLS: @@ -19698,9 +19698,9 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) { CountZeroes = CountZeroes.getOperand(0); if (CountZeroes.getOpcode() != ISD::CTTZ && - CountZeroes.getOpcode() != ISD::CTTZ_ZERO_UNDEF && + CountZeroes.getOpcode() != ISD::CTTZ_ZERO_POISON && CountZeroes.getOpcode() != ISD::CTLZ && - CountZeroes.getOpcode() != ISD::CTLZ_ZERO_UNDEF) + CountZeroes.getOpcode() != ISD::CTLZ_ZERO_POISON) return SDValue(); if (!isNullConstant(ValOnZero)) @@ -19714,10 +19714,10 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) { if (!isPowerOf2_32(BitWidth)) return SDValue(); - if (CountZeroes.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { + if (CountZeroes.getOpcode() == ISD::CTTZ_ZERO_POISON) { CountZeroes = DAG.getNode(ISD::CTTZ, SDLoc(CountZeroes), CountZeroes.getValueType(), CountZeroesArgument); - } else if (CountZeroes.getOpcode() == ISD::CTLZ_ZERO_UNDEF) { + } else if (CountZeroes.getOpcode() == ISD::CTLZ_ZERO_POISON) { CountZeroes = DAG.getNode(ISD::CTLZ, SDLoc(CountZeroes), CountZeroes.getValueType(), CountZeroesArgument); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 54802cf62a13e..7ef15bc2c5a00 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -552,7 +552,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCTLZ_CTTZ_ZERO_POISON(SDValue Op, SelectionDAG &DAG) const; SDValue lowerStrictFPExtendOrRoundLike(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 
0a3c1489c40c0..338f53c8d791d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1183,10 +1183,10 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::sqrt, GL::Sqrt); case TargetOpcode::G_CTTZ: - case TargetOpcode::G_CTTZ_ZERO_UNDEF: + case TargetOpcode::G_CTTZ_ZERO_POISON: return selectExtInst(ResVReg, ResType, I, CL::ctz); case TargetOpcode::G_CTLZ: - case TargetOpcode::G_CTLZ_ZERO_UNDEF: + case TargetOpcode::G_CTLZ_ZERO_POISON: return selectExtInst(ResVReg, ResType, I, CL::clz); case TargetOpcode::G_INTRINSIC_ROUND: diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 47ffecc4085ab..378b0846fb106 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -502,7 +502,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { if (ST.canUseExtInstSet(SPIRV::InstructionSet::OpenCL_std)) { getActionDefinitionsBuilder( - {G_CTTZ, G_CTTZ_ZERO_UNDEF, G_CTLZ, G_CTLZ_ZERO_UNDEF}) + {G_CTTZ, G_CTTZ_ZERO_POISON, G_CTLZ, G_CTLZ_ZERO_POISON}) .legalForCartesianProduct(allIntScalarsAndVectors, allIntScalarsAndVectors); diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index bfca95c9b1a38..3d37517a65eda 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1990,38 +1990,38 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, if (Subtarget->isVIS3()) { setOperationAction(ISD::CTLZ, MVT::i32, Legal); setOperationAction(ISD::CTLZ, MVT::i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i64, Legal); 
setOperationAction(ISD::CTTZ, MVT::i32, Subtarget->is64Bit() ? Promote : Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Subtarget->is64Bit() ? Promote : Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i64, Expand); } else if (Subtarget->usePopc()) { setOperationAction(ISD::CTLZ, MVT::i32, Expand); setOperationAction(ISD::CTLZ, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i64, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ, MVT::i32, Expand); setOperationAction(ISD::CTLZ, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, Subtarget->is64Bit() ? Promote : LibCall); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, LibCall); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i64, LibCall); // FIXME here we don't have any ISA extensions that could help us, so to // prevent large expansions those should be made into LibCalls. 
setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i64, Expand); } setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td index 4a0907fc64bea..353f6fc299e6a 100644 --- a/llvm/lib/Target/Sparc/SparcInstrVIS.td +++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td @@ -318,14 +318,14 @@ def : Pat<(i64 (mulhs i64:$lhs, i64:$rhs)), (ANDrr (SRAXri $rhs, 63), $lhs)))>; def : Pat<(i64 (ctlz i64:$src)), (LZCNT $src)>; -def : Pat<(i64 (ctlz_zero_undef i64:$src)), (LZCNT $src)>; +def : Pat<(i64 (ctlz_zero_poison i64:$src)), (LZCNT $src)>; // 32-bit LZCNT. // The zero extension will leave us with 32 extra leading zeros, // so we need to compensate for it. // FIXME remove this when the codegen supports using 64-bit values directly // in V8+ mode. def : Pat<(i32 (ctlz i32:$src)), (ADDri (LZCNT (SRLri $src, 0)), (i32 -32))>; -def : Pat<(i32 (ctlz_zero_undef i32:$src)), (ADDri (LZCNT (SRLri $src, 0)), (i32 -32))>; +def : Pat<(i32 (ctlz_zero_poison i32:$src)), (ADDri (LZCNT (SRLri $src, 0)), (i32 -32))>; def : Pat<(i32 (bitconvert f32:$src)), (MOVSTOUW $src)>; def : Pat<(i64 (zanyext (i32 (bitconvert f32:$src)))), (MOVSTOUW $src)>; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 8084aebe0ba52..26cc921b6b169 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -350,13 +350,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // We have native support for a 64-bit CTLZ, via FLOGR. 
setOperationAction(ISD::CTLZ, MVT::i32, Promote); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, Promote); setOperationAction(ISD::CTLZ, MVT::i64, Legal); // On z17 we have native support for a 64-bit CTTZ. if (Subtarget.hasMiscellaneousExtensions4()) { setOperationAction(ISD::CTTZ, MVT::i32, Promote); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Promote); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Promote); setOperationAction(ISD::CTTZ, MVT::i64, Legal); } diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 141196c332074..23eb3034fe3e6 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -194,7 +194,7 @@ void VETargetLowering::initSPUActions() { LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal; setOperationAction(ISD::BITREVERSE, IntVT, Act); setOperationAction(ISD::CTLZ, IntVT, Act); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act); + setOperationAction(ISD::CTLZ_ZERO_POISON, IntVT, Act); setOperationAction(ISD::CTPOP, IntVT, Act); // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations. 
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 9869f95ae5661..a6b1a3a60aaf1 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -1289,7 +1289,7 @@ defm MRG : RRMRGm<"mrg", 0x56, I64>; // Section 8.5.7 - LDZ (Leading Zero Count) def ctlz_pat : PatFrags<(ops node:$src), [(ctlz node:$src), - (ctlz_zero_undef node:$src)]>; + (ctlz_zero_poison node:$src)]>; defm LDZ : RRI1m<"ldz", 0x67, I64, i64, ctlz_pat>; // Section 8.5.8 - PCNT (Population Count) diff --git a/llvm/lib/Target/WebAssembly/GISel/WebAssemblyLegalizerInfo.cpp b/llvm/lib/Target/WebAssembly/GISel/WebAssemblyLegalizerInfo.cpp index 7178934ccd1a5..fb499d0db1733 100644 --- a/llvm/lib/Target/WebAssembly/GISel/WebAssemblyLegalizerInfo.cpp +++ b/llvm/lib/Target/WebAssembly/GISel/WebAssemblyLegalizerInfo.cpp @@ -57,7 +57,7 @@ WebAssemblyLegalizerInfo::WebAssemblyLegalizerInfo( .clampScalar(1, s32, s64) .scalarSameSizeAs(0, 1); - getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}).lower(); + getActionDefinitionsBuilder({G_CTLZ_ZERO_POISON, G_CTTZ_ZERO_POISON}).lower(); getActionDefinitionsBuilder({G_ROTL, G_ROTR}) .legalFor({{i32, i32}, {i64, i64}}) diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index c342511e99b77..ce922538cfdc9 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -300,7 +300,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .scalarSameSizeAs(0, 1); // count trailing zeros - getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON) .legalFor({{s16, s16}, {s32, s32}}) .legalFor(Is64Bit, {{s64, s64}}) .widenScalarToNextPow2(1, /*Min=*/16) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e580715151186..4b193c8db3303 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ 
b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -399,34 +399,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. - setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); - setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32); + setOperationPromotedToType(ISD::CTTZ_ZERO_POISON, MVT::i8, MVT::i32); // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to // promote that too. - setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32); - setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32); + setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32); + setOperationPromotedToType(ISD::CTTZ_ZERO_POISON, MVT::i16, MVT::i32); if (!Subtarget.hasBMI()) { - setOperationAction(ISD::CTTZ , MVT::i32 , Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); + setOperationAction(ISD::CTTZ, MVT::i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Legal); if (Subtarget.is64Bit()) { - setOperationAction(ISD::CTTZ , MVT::i64 , Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); + setOperationAction(ISD::CTTZ, MVT::i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. 
- setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); - setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32); + setOperationPromotedToType(ISD::CTLZ_ZERO_POISON, MVT::i8, MVT::i32); } else { for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; - setOperationAction(ISD::CTLZ , VT, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_POISON, VT, Custom); } } @@ -2266,8 +2266,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, continue; setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_POISON, VT, Custom); + setOperationAction(ISD::CTTZ_ZERO_POISON, VT, Custom); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); @@ -25832,12 +25832,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = Cond.getConstantOperandVal(0); // Special handling for __builtin_ffs(X) - 1 pattern which looks like - // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special + // (select (seteq X, 0), -1, (cttz_zero_poison X)). Disable the special // handle to keep the CMP with 0. This should be removed by // optimizeCompareInst by using the flags from the BSR/TZCNT used for the - // cttz_zero_undef. + // cttz_zero_poison. 
auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) { - return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && + return (Op1.getOpcode() == ISD::CTTZ_ZERO_POISON && Op1.hasOneUse() && Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); }; if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) && @@ -34485,9 +34485,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG); case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG); case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_POISON: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG); + case ISD::CTTZ_ZERO_POISON: return LowerCTTZ(Op, Subtarget, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); @@ -34964,8 +34964,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::CTLZ: case ISD::CTTZ: - case ISD::CTLZ_ZERO_UNDEF: - case ISD::CTTZ_ZERO_UNDEF: { + case ISD::CTLZ_ZERO_POISON: + case ISD::CTTZ_ZERO_POISON: { // Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 // vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. // Compute the CTLZ/CTTZ of each element, add the element's bit offset, @@ -34994,7 +34994,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // CTLZ - reverse the elements as we want the top non-zero element at the // bottom for compression. unsigned VecOpc = ISD::CTTZ; - if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) { + if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON) { VecOpc = ISD::CTLZ; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, Vec, RevMask); } @@ -50200,7 +50200,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant. 
if (isa(Const) && Add.getOpcode() == ISD::ADD && Add.hasOneUse() && isa(Add.getOperand(1)) && - (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF || + (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_POISON || Add.getOperand(0).getOpcode() == ISD::CTTZ) && Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) { // This should constant fold. @@ -56273,14 +56273,14 @@ static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF && - N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF) + if (N0.getOpcode() != ISD::CTLZ_ZERO_POISON && + N1.getOpcode() != ISD::CTLZ_ZERO_POISON) return SDValue(); SDValue OpCTLZ; SDValue OpSizeTM1; - if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) { + if (N1.getOpcode() == ISD::CTLZ_ZERO_POISON) { OpCTLZ = N1; OpSizeTM1 = N0; } else if (N->getOpcode() == ISD::SUB) { @@ -60653,8 +60653,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, case ISD::CTPOP: case ISD::CTTZ: case ISD::CTLZ: - case ISD::CTTZ_ZERO_UNDEF: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_POISON: + case ISD::CTLZ_ZERO_POISON: if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || (VT.is512BitVector() && Subtarget.useBWIRegs()))) { return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0)); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ebbfa48d2660c..48291cdf91f72 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -2284,13 +2284,13 @@ def : Pat<(mul (loadi32 addr:$src1), imm:$src2), def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; -// Bit scan instruction patterns to match explicit zero-undef behavior. 
-def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr (i16 (IMPLICIT_DEF)), GR16:$src)>; -def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr (i32 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr (i64 (IMPLICIT_DEF)), GR64:$src)>; -def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm (i16 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm (i32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm (i64 (IMPLICIT_DEF)), addr:$src)>; +// Bit scan instruction patterns to match explicit zero-poison behavior. +def : Pat<(cttz_zero_poison GR16:$src), (BSF16rr (i16 (IMPLICIT_DEF)), GR16:$src)>; +def : Pat<(cttz_zero_poison GR32:$src), (BSF32rr (i32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(cttz_zero_poison GR64:$src), (BSF64rr (i64 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(cttz_zero_poison (loadi16 addr:$src)), (BSF16rm (i16 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(cttz_zero_poison (loadi32 addr:$src)), (BSF32rm (i32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(cttz_zero_poison (loadi64 addr:$src)), (BSF64rm (i64 (IMPLICIT_DEF)), addr:$src)>; // When HasMOVBE is enabled it is possible to get a non-legalized // register-register 16 bit bswap. This maps it to a ROL instruction. 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index b2b77265b06a5..698be1615a04b 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4442,12 +4442,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR - { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR + { ISD::CTLZ_ZERO_POISON,MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF - { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF + { ISD::CTTZ_ZERO_POISON,MVT::i64,{ 1, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } }, @@ -4478,15 +4478,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV - { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR - { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR - { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR + { ISD::CTLZ_ZERO_POISON,MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR + { ISD::CTLZ_ZERO_POISON,MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR + { ISD::CTLZ_ZERO_POISON,MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH - { 
ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF - { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF - { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF + { ISD::CTTZ_ZERO_POISON,MVT::i32,{ 1, 2, 1, 2 } }, // BSF + { ISD::CTTZ_ZERO_POISON,MVT::i16,{ 2, 2, 1, 2 } }, // BSF + { ISD::CTTZ_ZERO_POISON,MVT::i8, { 2, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } }, @@ -4694,14 +4694,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, std::pair LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; - // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. + // Without BMI/LZCNT see if we're only looking for a *_ZERO_POISON cost. if (((ISD == ISD::CTTZ && !ST->hasBMI()) || (ISD == ISD::CTLZ && !ST->hasLZCNT())) && !MTy.isVector() && !ICA.isTypeBasedOnly()) { const SmallVectorImpl &Args = ICA.getArgs(); if (auto *Cst = dyn_cast(Args[1])) if (Cst->isAllOnesValue()) - ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; + ISD = + ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_POISON : ISD::CTLZ_ZERO_POISON; } // FSQRT is a single instruction. 
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index d9ffc8571f960..090037c28704f 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -174,8 +174,8 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTLZ, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_POISON, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_POISON, MVT::i32, Expand); setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, MVT::i32, Subtarget.hasMINMAX() ? Legal : Expand); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index be18264454efa..8548f63bd1150 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1408,10 +1408,10 @@ define i32 @test_ctlz_intrinsic_zero_not_undef(i32 %a) { } declare i32 @llvm.cttz.i32(i32, i1) -define i32 @test_cttz_intrinsic_zero_undef(i32 %a) { -; CHECK-LABEL: name: test_cttz_intrinsic_zero_undef +define i32 @test_cttz_intrinsic_zero_poison(i32 %a) { +; CHECK-LABEL: name: test_cttz_intrinsic_zero_poison ; CHECK: [[A:%[0-9]+]]:_(i32) = COPY $w0 -; CHECK: [[RES:%[0-9]+]]:_(i32) = G_CTTZ_ZERO_UNDEF [[A]] +; CHECK: [[RES:%[0-9]+]]:_(i32) = G_CTTZ_ZERO_POISON [[A]] ; CHECK: $w0 = COPY [[RES]] %res = call i32 @llvm.cttz.i32(i32 %a, i1 1) ret i32 %res diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-constant-fold-unary-int.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-constant-fold-unary-int.mir index cba19b49d7687..8dc8d3d69209b 100644 --- 
a/llvm/test/CodeGen/AArch64/GlobalISel/combine-constant-fold-unary-int.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-constant-fold-unary-int.mir @@ -41,15 +41,15 @@ body: | RET_ReallyLR implicit $x0 ... --- -name: ctlz_zero_undef_s32 +name: ctlz_zero_poison_s32 body: | bb.0: - ; CHECK-LABEL: name: ctlz_zero_undef_s32 + ; CHECK-LABEL: name: ctlz_zero_poison_s32 ; CHECK: %res:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: $w0 = COPY %res(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %a:_(s32) = G_CONSTANT i32 128 - %res:_(s32) = G_CTLZ_ZERO_UNDEF %a + %res:_(s32) = G_CTLZ_ZERO_POISON %a $w0 = COPY %res(s32) RET_ReallyLR implicit $w0 ... @@ -93,15 +93,15 @@ body: | RET_ReallyLR implicit $x0 ... --- -name: cttz_zero_undef_s32 +name: cttz_zero_poison_s32 body: | bb.0: - ; CHECK-LABEL: name: cttz_zero_undef_s32 + ; CHECK-LABEL: name: cttz_zero_poison_s32 ; CHECK: %res:_(s32) = G_CONSTANT i32 5 ; CHECK-NEXT: $w0 = COPY %res(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %a:_(s32) = G_CONSTANT i32 96 - %res:_(s32) = G_CTTZ_ZERO_UNDEF %a + %res:_(s32) = G_CTTZ_ZERO_POISON %a $w0 = COPY %res(s32) RET_ReallyLR implicit $w0 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctlz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctlz.mir index 954adcf98d38a..58b5df6ec55d5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctlz.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctlz.mir @@ -172,15 +172,15 @@ body: | ... -# The ZERO_UNDEF variants just lower into the vanilla ones. +# The ZERO_POISON variants just lower into the vanilla ones. 
--- -name: test_s32_zeroundef +name: test_s32_zeropoison alignment: 4 tracksRegLiveness: true body: | bb.0: liveins: $s0 - ; CHECK-LABEL: name: test_s32_zeroundef + ; CHECK-LABEL: name: test_s32_zeropoison ; CHECK: liveins: $s0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 @@ -190,20 +190,20 @@ body: | ; CHECK-NEXT: RET_ReallyLR implicit $s0 %0:_(s32) = COPY $s0 %2:_(s32) = COPY %0(s32) - %1:_(s32) = G_CTLZ_ZERO_UNDEF %2(s32) + %1:_(s32) = G_CTLZ_ZERO_POISON %2(s32) $s0 = COPY %1(s32) RET_ReallyLR implicit $s0 ... --- -name: test_s64_zeroundef +name: test_s64_zeropoison alignment: 4 tracksRegLiveness: true body: | bb.0: liveins: $d0 - ; CHECK-LABEL: name: test_s64_zeroundef + ; CHECK-LABEL: name: test_s64_zeropoison ; CHECK: liveins: $d0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 @@ -213,7 +213,7 @@ body: | ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 %2:_(s64) = COPY %0(s64) - %1:_(s64) = G_CTLZ_ZERO_UNDEF %2(s64) + %1:_(s64) = G_CTLZ_ZERO_POISON %2(s64) $d0 = COPY %1(s64) RET_ReallyLR implicit $d0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz-zero-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz-zero-poison.mir similarity index 95% rename from llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz-zero-undef.mir rename to llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz-zero-poison.mir index cf51a3776790f..a0aa32f1a698d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz-zero-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz-zero-poison.mir @@ -29,7 +29,7 @@ body: | ; CHECK-CSSC-NEXT: $w0 = COPY [[CTTZ]](s32) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $w0 %val:_(s8) = G_IMPLICIT_DEF - %cttz:_(s8) = G_CTTZ_ZERO_UNDEF %val(s8) + %cttz:_(s8) = G_CTTZ_ZERO_POISON %val(s8) %ext:_(s32) = G_ANYEXT %cttz(s8) $w0 = COPY %ext(s32) RET_ReallyLR implicit $w0 @@ -62,7 +62,7 @@ body: | ; CHECK-CSSC-NEXT: $w0 = COPY [[CTTZ]](s32) ; CHECK-CSSC-NEXT: 
RET_ReallyLR implicit $w0 %val:_(s16) = G_IMPLICIT_DEF - %cttz:_(s16) = G_CTTZ_ZERO_UNDEF %val(s16) + %cttz:_(s16) = G_CTTZ_ZERO_POISON %val(s16) %ext:_(s32) = G_ANYEXT %cttz(s16) $w0 = COPY %ext(s32) RET_ReallyLR implicit $w0 @@ -93,7 +93,7 @@ body: | ; CHECK-CSSC-NEXT: $w0 = COPY [[CTTZ]](s32) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $w0 %val:_(s32) = COPY $w0 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %val(s32) + %1:_(s32) = G_CTTZ_ZERO_POISON %val(s32) $w0 = COPY %1(s32) RET_ReallyLR implicit $w0 @@ -123,7 +123,7 @@ body: | ; CHECK-CSSC-NEXT: $x0 = COPY [[CTTZ]](s64) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $x0 %val:_(s64) = COPY $x0 - %1:_(s64) = G_CTTZ_ZERO_UNDEF %val(s64) + %1:_(s64) = G_CTTZ_ZERO_POISON %val(s64) $x0 = COPY %1(s64) RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 17bb373070568..2e0b781785785 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -748,13 +748,13 @@ # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: G_CTTZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: G_CTTZ_ZERO_POISON (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTLZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK -# DEBUG-NEXT: G_CTLZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: G_CTLZ_ZERO_POISON (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTLS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctlz-zero-poison.mir similarity index 80% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctlz-zero-undef.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctlz-zero-poison.mir index 6820077ad4870..53519493d503e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctlz-zero-poison.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- -name: ctlz_zero_undef_s32_ss +name: ctlz_zero_poison_s32_ss legalized: true regBankSelected: true tracksRegLiveness: true @@ -11,19 +11,19 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_ss + ; CHECK-LABEL: name: ctlz_zero_poison_s32_ss ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[S_FLBIT_I32_B32_:%[0-9]+]]:sreg_32 = S_FLBIT_I32_B32 [[COPY]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_FLBIT_I32_B32_]] %0:sgpr(s32) = COPY $sgpr0 - %1:sgpr(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:sgpr(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
--- -name: ctlz_zero_undef_s32_vs +name: ctlz_zero_poison_s32_vs legalized: true regBankSelected: true tracksRegLiveness: true @@ -32,19 +32,19 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_vs + ; CHECK-LABEL: name: ctlz_zero_poison_s32_vs ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[V_FFBH_U32_e64_:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e64 [[COPY]], implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_FFBH_U32_e64_]] %0:sgpr(s32) = COPY $sgpr0 - %1:vgpr(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:vgpr(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: ctlz_zero_undef_s32_vv +name: ctlz_zero_poison_s32_vv legalized: true regBankSelected: true tracksRegLiveness: true @@ -53,19 +53,19 @@ body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_vv + ; CHECK-LABEL: name: ctlz_zero_poison_s32_vv ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_FFBH_U32_e64_:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e64 [[COPY]], implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_FFBH_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:vgpr(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: ctlz_zero_undef_s64_ss +name: ctlz_zero_poison_s64_ss legalized: true regBankSelected: true tracksRegLiveness: true @@ -74,13 +74,13 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s64_ss + ; CHECK-LABEL: name: ctlz_zero_poison_s64_ss ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_FLBIT_I32_B64_:%[0-9]+]]:sreg_32 = S_FLBIT_I32_B64 [[COPY]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_FLBIT_I32_B64_]] %0:sgpr(s64) = COPY $sgpr0_sgpr1 - %1:sgpr(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:sgpr(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-cttz-zero-poison.mir similarity index 79% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-cttz-zero-undef.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-cttz-zero-poison.mir index a0d5db38813a3..9cc1577c5bfbb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-cttz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-cttz-zero-poison.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- -name: cttz_zero_undef_s32_ss +name: cttz_zero_poison_s32_ss legalized: true regBankSelected: true tracksRegLiveness: true @@ -11,19 +11,19 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s32_ss + ; CHECK-LABEL: name: cttz_zero_poison_s32_ss ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[S_FF1_I32_B32_:%[0-9]+]]:sreg_32 = S_FF1_I32_B32 [[COPY]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_FF1_I32_B32_]] %0:sgpr(s32) = COPY $sgpr0 - %1:sgpr(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:sgpr(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: cttz_zero_undef_s32_vs +name: cttz_zero_poison_s32_vs legalized: true regBankSelected: true tracksRegLiveness: true @@ -32,19 +32,19 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s32_vs + ; CHECK-LABEL: name: cttz_zero_poison_s32_vs ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY]], implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_FFBL_B32_e64_]] %0:sgpr(s32) = COPY $sgpr0 - %1:vgpr(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:vgpr(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
--- -name: cttz_zero_undef_s32_vv +name: cttz_zero_poison_s32_vv legalized: true regBankSelected: true tracksRegLiveness: true @@ -53,19 +53,19 @@ body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s32_vv + ; CHECK-LABEL: name: cttz_zero_poison_s32_vv ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY]], implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_FFBL_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:vgpr(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: cttz_zero_undef_s64_ss +name: cttz_zero_poison_s64_ss legalized: true regBankSelected: true tracksRegLiveness: true @@ -74,13 +74,13 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s64_ss + ; CHECK-LABEL: name: cttz_zero_poison_s64_ss ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[COPY]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_FF1_I32_B64_]] %0:sgpr(s64) = COPY $sgpr0_sgpr1 - %1:sgpr(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:sgpr(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-poison.mir similarity index 67% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-poison.mir index f0f62dd2f98be..ab8129d6016c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-poison.mir @@ -2,82 +2,82 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s --- -name: ctlz_zero_undef_s32_s32 +name: ctlz_zero_poison_s32_s32 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_s32 + ; CHECK-LABEL: name: ctlz_zero_poison_s32_s32 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_POISON]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTLZ_ZERO_POISON %0 $vgpr0 = COPY %1 ... 
--- -name: ctlz_zero_undef_s32_s64 +name: ctlz_zero_poison_s32_s64 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_s64 + ; CHECK-LABEL: name: ctlz_zero_poison_s32_s64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_POISON]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTLZ_ZERO_POISON %0 $vgpr0 = COPY %1 ... --- -name: ctlz_zero_undef_s64_s64 +name: ctlz_zero_poison_s64_s64 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s64_s64 + ; CHECK-LABEL: name: ctlz_zero_poison_s64_s64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTLZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 - %1:_(s64) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s64) = G_CTLZ_ZERO_POISON %0 $vgpr0_vgpr1 = COPY %1 ... 
--- -name: ctlz_zero_undef_s16_s32 +name: ctlz_zero_poison_s16_s32 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s16_s32 + ; CHECK-LABEL: name: ctlz_zero_poison_s16_s32 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_POISON]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s16) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s16) = G_CTLZ_ZERO_POISON %0 %2:_(s32) = G_ZEXT %1 $vgpr0 = COPY %2 ... --- -name: ctlz_zero_undef_s16_s16 +name: ctlz_zero_poison_s16_s16 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s16_s16 + ; CHECK-LABEL: name: ctlz_zero_poison_s16_s16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -89,58 +89,58 @@ body: | ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_TRUNC %0 - %2:_(s16) = G_CTLZ_ZERO_UNDEF %1 + %2:_(s16) = G_CTLZ_ZERO_POISON %1 %3:_(s32) = G_ZEXT %2 $vgpr0 = COPY %3 ... 
--- -name: ctlz_zero_undef_v2s32_v2s32 +name: ctlz_zero_poison_v2s32_v2s32 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_v2s32_v2s32 + ; CHECK-LABEL: name: ctlz_zero_poison_v2s32_v2s32 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV]](s32) - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTLZ_ZERO_UNDEF]](s32), [[CTLZ_ZERO_UNDEF1]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[UV]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[UV1]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTLZ_ZERO_POISON]](s32), [[CTLZ_ZERO_POISON1]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 - %1:_(<2 x s32>) = G_CTLZ_ZERO_UNDEF %0 + %1:_(<2 x s32>) = G_CTLZ_ZERO_POISON %0 $vgpr0_vgpr1 = COPY %1 ... 
--- -name: ctlz_zero_undef_v2s32_v2s64 +name: ctlz_zero_poison_v2s32_v2s64 body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK-LABEL: name: ctlz_zero_undef_v2s32_v2s64 + ; CHECK-LABEL: name: ctlz_zero_poison_v2s32_v2s64 ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV]](s64) - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s64) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTLZ_ZERO_UNDEF]](s32), [[CTLZ_ZERO_UNDEF1]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[UV]](s64) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[UV1]](s64) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTLZ_ZERO_POISON]](s32), [[CTLZ_ZERO_POISON1]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - %1:_(<2 x s32>) = G_CTLZ_ZERO_UNDEF %0 + %1:_(<2 x s32>) = G_CTLZ_ZERO_POISON %0 $vgpr0_vgpr1 = COPY %1 ... --- -name: ctlz_zero_undef_v2s16_v2s16 +name: ctlz_zero_poison_v2s16_v2s16 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_v2s16_v2s16 + ; CHECK-LABEL: name: ctlz_zero_poison_v2s16_v2s16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 @@ -159,18 +159,18 @@ body: | ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 - %1:_(<2 x s16>) = G_CTLZ_ZERO_UNDEF %0 + %1:_(<2 x s16>) = G_CTLZ_ZERO_POISON %0 $vgpr0 = COPY %1 ... 
--- -name: ctlz_zero_undef_s7_s7 +name: ctlz_zero_poison_s7_s7 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s7_s7 + ; CHECK-LABEL: name: ctlz_zero_poison_s7_s7 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -182,30 +182,30 @@ body: | ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s7) = G_TRUNC %0 - %2:_(s7) = G_CTLZ_ZERO_UNDEF %1 + %2:_(s7) = G_CTLZ_ZERO_POISON %1 %3:_(s32) = G_ZEXT %2 $vgpr0 = COPY %3 ... --- -name: ctlz_zero_undef_s33_s33 +name: ctlz_zero_poison_s33_s33 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s33_s33 + ; CHECK-LABEL: name: ctlz_zero_poison_s33_s33 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[SHL]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTLZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s33) = G_TRUNC %0 - %2:_(s33) = G_CTLZ_ZERO_UNDEF %1 + %2:_(s33) = G_CTLZ_ZERO_POISON %1 %3:_(s64) = G_ANYEXT %2 $vgpr0_vgpr1 = COPY %3 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz-zero-poison.mir similarity index 54% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz-zero-undef.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz-zero-poison.mir index ecad6cb8bbd18..fb78db82e447d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz-zero-poison.mir @@ -2,199 +2,199 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s --- -name: cttz_zero_undef_s32_s32 +name: cttz_zero_poison_s32_s32 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s32_s32 + ; CHECK-LABEL: name: cttz_zero_poison_s32_s32 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[CTTZ_ZERO_POISON]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 $vgpr0 = COPY %1 ... 
--- -name: cttz_zero_undef_s32_s64 +name: cttz_zero_poison_s32_s64 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s32_s64 + ; CHECK-LABEL: name: cttz_zero_poison_s32_s64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: $vgpr0 = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[CTTZ_ZERO_POISON]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 $vgpr0 = COPY %1 ... --- -name: cttz_zero_undef_s64_s64 +name: cttz_zero_poison_s64_s64 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s64_s64 + ; CHECK-LABEL: name: cttz_zero_poison_s64_s64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 - %1:_(s64) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s64) = G_CTTZ_ZERO_POISON %0 $vgpr0_vgpr1 = COPY %1 ... 
--- -name: cttz_zero_undef_s16_s32 +name: cttz_zero_poison_s16_s32 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s16_s32 + ; CHECK-LABEL: name: cttz_zero_poison_s16_s32 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[CTTZ_ZERO_POISON]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s16) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s16) = G_CTTZ_ZERO_POISON %0 %2:_(s32) = G_ZEXT %1 $vgpr0 = COPY %2 ... --- -name: cttz_zero_undef_s16_s16 +name: cttz_zero_poison_s16_s16 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s16_s16 + ; CHECK-LABEL: name: cttz_zero_poison_s16_s16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_TRUNC %0 - %2:_(s16) = G_CTTZ_ZERO_UNDEF %1 + %2:_(s16) = G_CTTZ_ZERO_POISON %1 %3:_(s32) = G_ZEXT %2 $vgpr0 = COPY %3 ... 
--- -name: cttz_zero_undef_v2s32_v2s32 +name: cttz_zero_poison_v2s32_v2s32 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_v2s32_v2s32 + ; CHECK-LABEL: name: cttz_zero_poison_v2s32_v2s32 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTTZ_ZERO_UNDEF]](s32), [[CTTZ_ZERO_UNDEF1]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV1]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTTZ_ZERO_POISON]](s32), [[CTTZ_ZERO_POISON1]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 - %1:_(<2 x s32>) = G_CTTZ_ZERO_UNDEF %0 + %1:_(<2 x s32>) = G_CTTZ_ZERO_POISON %0 $vgpr0_vgpr1 = COPY %1 ... 
--- -name: cttz_zero_undef_v2s32_v2s64 +name: cttz_zero_poison_v2s32_v2s64 body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK-LABEL: name: cttz_zero_undef_v2s32_v2s64 + ; CHECK-LABEL: name: cttz_zero_poison_v2s32_v2s64 ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s64) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s64) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTTZ_ZERO_UNDEF]](s32), [[CTTZ_ZERO_UNDEF1]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV]](s64) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV1]](s64) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[CTTZ_ZERO_POISON]](s32), [[CTTZ_ZERO_POISON1]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - %1:_(<2 x s32>) = G_CTTZ_ZERO_UNDEF %0 + %1:_(<2 x s32>) = G_CTTZ_ZERO_POISON %0 $vgpr0_vgpr1 = COPY %1 ... 
--- -name: cttz_zero_undef_v2s16_v2s16 +name: cttz_zero_poison_v2s16_v2s16 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_v2s16_v2s16 + ; CHECK-LABEL: name: cttz_zero_poison_v2s16_v2s16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[BITCAST]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[LSHR]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF1]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[BITCAST]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[LSHR]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON1]](s32) ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[SHL]] ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 - %1:_(<2 x s16>) = G_CTTZ_ZERO_UNDEF %0 + %1:_(<2 x s16>) = G_CTTZ_ZERO_POISON %0 $vgpr0 = COPY %1 ... 
--- -name: cttz_zero_undef_s7_s7 +name: cttz_zero_poison_s7_s7 body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s7_s7 + ; CHECK-LABEL: name: cttz_zero_poison_s7_s7 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s7) = G_TRUNC %0 - %2:_(s7) = G_CTTZ_ZERO_UNDEF %1 + %2:_(s7) = G_CTTZ_ZERO_POISON %1 %3:_(s32) = G_ZEXT %2 $vgpr0 = COPY %3 ... --- -name: cttz_zero_undef_s33_s33 +name: cttz_zero_poison_s33_s33 body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s33_s33 + ; CHECK-LABEL: name: cttz_zero_poison_s33_s33 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s33) = G_TRUNC %0 - %2:_(s33) = G_CTTZ_ZERO_UNDEF %1 + %2:_(s33) = G_CTTZ_ZERO_POISON %1 %3:_(s64) = G_ANYEXT %2 $vgpr0_vgpr1 = COPY %3 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir index 46d5bdbd309c4..00b8ee312e170 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir @@ -91,8 +91,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65536 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[C]] - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_TRUNC %0 @@ -162,11 +162,11 @@ body: | ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65536 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[BITCAST]], [[C1]] - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[C1]] - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF1]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON1]](s32) ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[SHL]] ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST 
[[OR2]](s32) @@ -189,8 +189,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 128 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[C]] - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s7) = G_TRUNC %0 @@ -212,8 +212,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934592 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]] - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s64) - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s33) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-poison.mir similarity index 66% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-poison.mir index 733d1342ff186..1828424c578d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-poison.mir @@ -3,67 +3,67 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- -name: ctlz_zero_undef_s32_s +name: ctlz_zero_poison_s32_s legalized: true 
body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_s + ; CHECK-LABEL: name: ctlz_zero_poison_s32_s ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:sgpr(s32) = G_CTLZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTLZ_ZERO_POISON]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: ctlz_zero_undef_s32_v +name: ctlz_zero_poison_s32_v legalized: true body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s32_v + ; CHECK-LABEL: name: ctlz_zero_poison_s32_v ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTLZ_ZERO_POISON]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
--- -name: ctlz_zero_undef_s64_s +name: ctlz_zero_poison_s64_s legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s64_s + ; CHECK-LABEL: name: ctlz_zero_poison_s64_s ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTLZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:sgpr(s32) = G_CTLZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTLZ_ZERO_POISON]](s32) %0:_(s64) = COPY $sgpr0_sgpr1 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: ctlz_zero_undef_s64_v +name: ctlz_zero_poison_s64_v legalized: true body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: ctlz_zero_undef_s64_v + ; CHECK-LABEL: name: ctlz_zero_poison_s64_v ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 @@ -75,6 +75,6 @@ body: | ; CHECK-NEXT: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[ADD]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTLZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-poison.mir similarity index 66% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-poison.mir index 8dfcefbcd32df..c0421b784c600 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-poison.mir @@ -3,67 +3,67 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- -name: cttz_zero_undef_s32_s +name: cttz_zero_poison_s32_s legalized: true body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: cttz_zero_undef_s32_s + ; CHECK-LABEL: name: cttz_zero_poison_s32_s ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:sgpr(s32) = G_CTTZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTTZ_ZERO_POISON]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
--- -name: cttz_zero_undef_s32_v +name: cttz_zero_poison_s32_v legalized: true body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s32_v + ; CHECK-LABEL: name: cttz_zero_poison_s32_v ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_POISON [[COPY]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTTZ_ZERO_POISON]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... --- -name: cttz_zero_undef_s64_s +name: cttz_zero_poison_s64_s legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s64_s + ; CHECK-LABEL: name: cttz_zero_poison_s64_s ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:sgpr(s32) = G_CTTZ_ZERO_POISON [[COPY]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[CTTZ_ZERO_POISON]](s32) %0:_(s64) = COPY $sgpr0_sgpr1 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... 
--- -name: cttz_zero_undef_s64_v +name: cttz_zero_poison_s64_v legalized: true body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: cttz_zero_undef_s64_v + ; CHECK-LABEL: name: cttz_zero_poison_s64_v ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 @@ -75,6 +75,6 @@ body: | ; CHECK-NEXT: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[ADD]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 S_ENDPGM 0, implicit %1 ... diff --git a/llvm/test/CodeGen/AMDGPU/ctls.ll b/llvm/test/CodeGen/AMDGPU/ctls.ll index 3181b0032875d..00673b4a6d288 100644 --- a/llvm/test/CodeGen/AMDGPU/ctls.ll +++ b/llvm/test/CodeGen/AMDGPU/ctls.ll @@ -85,8 +85,8 @@ define i32 @ctls_i32_xor_commuted(i32 %x) { ret i32 %d } -define i32 @ctls_i32_zero_undef(i32 %x) { -; GFX6-LABEL: ctls_i32_zero_undef: +define i32 @ctls_i32_zero_poison(i32 %x) { +; GFX6-LABEL: ctls_i32_zero_poison: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 @@ -94,7 +94,7 @@ define i32 @ctls_i32_zero_undef(i32 %x) { ; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: ctls_i32_zero_undef: +; GFX11-LABEL: ctls_i32_zero_poison: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cls_i32_e32 v0, v0 @@ -104,7 +104,7 @@ define i32 @ctls_i32_zero_undef(i32 %x) { ; GFX11-NEXT: s_setpc_b64 s[30:31] %a = ashr i32 %x, 31 %b = xor i32 %x, %a - %c = call i32 @llvm.ctlz.i32(i32 %b, i1 true) ; zero_undef = true + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 true) ; zero_poison = true %d = sub i32 %c, 1 ret i32 %d } @@ -362,9 +362,9 @@ define <2 x i32> @ctls_v2i32_known_mixed_bits(<2 x i32> %x) { ret <2 x i32> %d } -; Vector with ctlz_zero_undef. 
-define <2 x i32> @ctls_v2i32_zero_undef(<2 x i32> %x) { -; GFX6-LABEL: ctls_v2i32_zero_undef: +; Vector with ctlz_zero_poison. +define <2 x i32> @ctls_v2i32_zero_poison(<2 x i32> %x) { +; GFX6-LABEL: ctls_v2i32_zero_poison: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 @@ -377,7 +377,7 @@ define <2 x i32> @ctls_v2i32_zero_undef(<2 x i32> %x) { ; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: ctls_v2i32_zero_undef: +; GFX11-LABEL: ctls_v2i32_zero_poison: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_poison.ll similarity index 90% rename from llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll rename to llvm/test/CodeGen/AMDGPU/ctlz_zero_poison.ll index 0f459959310e1..8c288ec551063 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_poison.ll @@ -26,8 +26,8 @@ declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone -define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i32: +define amdgpu_kernel void @s_ctlz_zero_poison_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -39,7 +39,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i32: +; VI-LABEL: s_ctlz_zero_poison_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -51,7 +51,7 @@ define amdgpu_kernel void 
@s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i32: +; EG-LABEL: s_ctlz_zero_poison_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 @@ -62,7 +62,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z, ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -77,8 +77,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32: +define amdgpu_kernel void @v_ctlz_zero_poison_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -97,7 +97,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32: +; VI-LABEL: v_ctlz_zero_poison_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32: +; EG-LABEL: v_ctlz_zero_poison_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -132,7 +132,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; EG-NEXT: LSHR * 
T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,8 +151,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_v2i32: +define amdgpu_kernel void @v_ctlz_zero_poison_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -172,7 +172,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_v2i32: +; VI-LABEL: v_ctlz_zero_poison_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -189,7 +189,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_v2i32: +; EG-LABEL: v_ctlz_zero_poison_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -209,7 +209,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v2i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr 
addrspace(1) noalias %out ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_v4i32: +define amdgpu_kernel void @v_ctlz_zero_poison_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_v4i32: +; VI-LABEL: v_ctlz_zero_poison_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_v4i32: +; EG-LABEL: v_ctlz_zero_poison_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -293,7 +293,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v4i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ret void } -define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i8_with_select: +define amdgpu_kernel void @s_ctlz_zero_poison_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i8_with_select: ; SI: ; %bb.0: 
; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -329,7 +329,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i8_with_select: +; VI-LABEL: s_ctlz_zero_poison_i8_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -342,7 +342,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i8_with_select: +; EG-LABEL: s_ctlz_zero_poison_i8_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[], KC1[] ; EG-NEXT: TEX 0 @6 @@ -371,7 +371,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i8_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -389,8 +389,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ret void } -define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i16_with_select: +define amdgpu_kernel void @s_ctlz_zero_poison_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i16_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -403,7 +403,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i16_with_select: +; VI-LABEL: 
s_ctlz_zero_poison_i16_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -416,7 +416,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i16_with_select: +; EG-LABEL: s_ctlz_zero_poison_i16_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[], KC1[] ; EG-NEXT: TEX 0 @6 @@ -445,7 +445,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i16_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -463,8 +463,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i32_with_select: +define amdgpu_kernel void @s_ctlz_zero_poison_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i32_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -476,7 +476,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i32_with_select: +; VI-LABEL: s_ctlz_zero_poison_i32_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -488,7 +488,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: 
s_ctlz_zero_undef_i32_with_select: +; EG-LABEL: s_ctlz_zero_poison_i32_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z, ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i32_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -516,8 +516,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i64_with_select: +define amdgpu_kernel void @s_ctlz_zero_poison_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i64_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -531,7 +531,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i64_with_select: +; VI-LABEL: s_ctlz_zero_poison_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v3, 0 @@ -543,7 +543,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i64_with_select: +; EG-LABEL: s_ctlz_zero_poison_i64_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 @@ -559,7 +559,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; 
EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i64_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 @@ -577,8 +577,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i8_with_select: +define amdgpu_kernel void @v_ctlz_zero_poison_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i8_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -599,7 +599,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i8_with_select: +; VI-LABEL: v_ctlz_zero_poison_i8_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -617,7 +617,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i8_with_select: +; EG-LABEL: v_ctlz_zero_poison_i8_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -648,7 +648,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i8_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -671,8 +671,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i16_with_select: +define amdgpu_kernel void @v_ctlz_zero_poison_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i16_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i16_with_select: +; VI-LABEL: v_ctlz_zero_poison_i16_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -725,7 +725,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i16_with_select: +; EG-LABEL: v_ctlz_zero_poison_i16_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i16_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -784,8 +784,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture 
readonly %arrayidx) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_with_select: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -816,7 +816,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_with_select: +; VI-LABEL: v_ctlz_zero_poison_i32_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -854,7 +854,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_with_select: +; EG-LABEL: v_ctlz_zero_poison_i32_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -912,8 +912,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i64_with_select: +define amdgpu_kernel void @v_ctlz_zero_poison_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: 
v_ctlz_zero_poison_i64_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -958,7 +958,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i64_with_select: +; VI-LABEL: v_ctlz_zero_poison_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i64_with_select: +; EG-LABEL: v_ctlz_zero_poison_i64_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i64_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1119,8 +1119,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i8: +define amdgpu_kernel void @v_ctlz_zero_poison_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1139,7 +1139,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: 
v_ctlz_zero_undef_i8: +; VI-LABEL: v_ctlz_zero_poison_i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1155,7 +1155,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i8: +; EG-LABEL: v_ctlz_zero_poison_i8: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1184,7 +1184,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i8: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 @@ -1208,8 +1208,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ret void } -define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i64: +define amdgpu_kernel void @s_ctlz_zero_poison_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i64: +; VI-LABEL: s_ctlz_zero_poison_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 @@ -1235,7 +1235,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i64: +; EG-LABEL: s_ctlz_zero_poison_i64: ; 
EG: ; %bb.0: ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 @@ -1251,7 +1251,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 @@ -1268,8 +1268,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ret void } -define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i64_trunc: +define amdgpu_kernel void @s_ctlz_zero_poison_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i64_trunc: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i64_trunc: +; VI-LABEL: s_ctlz_zero_poison_i64_trunc: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i64_trunc: +; EG-LABEL: s_ctlz_zero_poison_i64_trunc: ; EG: ; %bb.0: ; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: +; 
GFX9-GISEL-LABEL: s_ctlz_zero_poison_i64_trunc: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -1323,8 +1323,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i64: +define amdgpu_kernel void @v_ctlz_zero_poison_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1344,7 +1344,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i64: +; VI-LABEL: v_ctlz_zero_poison_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i64: +; EG-LABEL: v_ctlz_zero_poison_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1390,7 +1390,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1413,8 +1413,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias 
%in) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i64_trunc: +define amdgpu_kernel void @v_ctlz_zero_poison_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i64_trunc: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1434,7 +1434,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i64_trunc: +; VI-LABEL: v_ctlz_zero_poison_i64_trunc: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1455,7 +1455,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i64_trunc: +; EG-LABEL: v_ctlz_zero_poison_i64_trunc: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i64_trunc: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1504,8 +1504,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 @@ -1524,7 +1524,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: +; VI-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1560,7 +1560,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1583,8 +1583,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_ne_neg1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1603,7 +1603,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: +; VI-LABEL: 
v_ctlz_zero_poison_i32_sel_ne_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1619,7 +1619,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_ne_neg1: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1662,8 +1662,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: +define amdgpu_kernel void @v_ctlz_zero_poison_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i8_sel_eq_neg1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1681,7 +1681,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: +; VI-LABEL: v_ctlz_zero_poison_i8_sel_eq_neg1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,7 +1696,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm 
; -; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: +; EG-LABEL: v_ctlz_zero_poison_i8_sel_eq_neg1: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 @@ -1752,8 +1752,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ret void } -define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1777,7 +1777,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: +; VI-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1_two_use: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1826,7 +1826,7 @@ define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1855,8 +1855,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa } ; Selected on wrong constant -define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_eq_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: +; VI-LABEL: v_ctlz_zero_poison_i32_sel_eq_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1895,7 +1895,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_eq_0: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1915,7 +1915,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_eq_0: ; GFX9-GISEL: ; 
%bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1939,8 +1939,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali } ; Selected on wrong constant -define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_ne_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -1961,7 +1961,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: +; VI-LABEL: v_ctlz_zero_poison_i32_sel_ne_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1979,7 +1979,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_ne_0: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -1999,7 +1999,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2023,8 +2023,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali } ; Compare on wrong constant -define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: +define amdgpu_kernel void @v_ctlz_zero_poison_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -2045,7 +2045,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: +; VI-LABEL: v_ctlz_zero_poison_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2063,7 +2063,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_eq_cmp_non0: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2108,8 +2108,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 } ; Selected on wrong constant -define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: +define amdgpu_kernel void 
@v_ctlz_zero_poison_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_ctlz_zero_poison_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: +; VI-LABEL: v_ctlz_zero_poison_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2148,7 +2148,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: +; EG-LABEL: v_ctlz_zero_poison_i32_sel_ne_cmp_non0: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -2169,7 +2169,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2192,27 +2192,27 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ret void } -define i7 @v_ctlz_zero_undef_i7(i7 %val) { -; SI-LABEL: v_ctlz_zero_undef_i7: +define i7 @v_ctlz_zero_poison_i7(i7 %val) { +; SI-LABEL: v_ctlz_zero_poison_i7: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_i7: +; VI-LABEL: v_ctlz_zero_poison_i7: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_i7: +; EG-LABEL: v_ctlz_zero_poison_i7: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i7: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0 @@ -2222,8 +2222,8 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { ret i7 %ctlz } -define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { -; SI-LABEL: s_ctlz_zero_undef_i18: +define amdgpu_kernel void @s_ctlz_zero_poison_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { +; SI-LABEL: s_ctlz_zero_poison_i18: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -2240,7 +2240,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_ctlz_zero_undef_i18: +; VI-LABEL: s_ctlz_zero_poison_i18: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_ctlz_zero_undef_i18: +; EG-LABEL: s_ctlz_zero_poison_i18: ; EG: ; %bb.0: ; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X @@ -2297,7 +2297,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: +; GFX9-GISEL-LABEL: s_ctlz_zero_poison_i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2317,27 +2317,27 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ret void } -define i18 @v_ctlz_zero_undef_i18(i18 %val) { -; SI-LABEL: v_ctlz_zero_undef_i18: +define i18 @v_ctlz_zero_poison_i18(i18 %val) { +; SI-LABEL: v_ctlz_zero_poison_i18: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_i18: +; VI-LABEL: v_ctlz_zero_poison_i18: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_i18: +; EG-LABEL: v_ctlz_zero_poison_i18: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0 @@ -2347,8 +2347,8 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) { ret i18 %ctlz } -define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { -; SI-LABEL: v_ctlz_zero_undef_v2i18: +define <2 x i18> @v_ctlz_zero_poison_v2i18(<2 x i18> %val) { +; SI-LABEL: v_ctlz_zero_poison_v2i18: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 @@ -2357,7 +2357,7 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_v2i18: +; VI-LABEL: v_ctlz_zero_poison_v2i18: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 @@ -2366,12 +2366,12 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_v2i18: +; 
EG-LABEL: v_ctlz_zero_poison_v2i18: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v2i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0 @@ -2383,8 +2383,8 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ret <2 x i18> %ctlz } -define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { -; SI-LABEL: v_ctlz_zero_undef_v2i16: +define <2 x i16> @v_ctlz_zero_poison_v2i16(<2 x i16> %val) { +; SI-LABEL: v_ctlz_zero_poison_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 @@ -2395,7 +2395,7 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_v2i16: +; VI-LABEL: v_ctlz_zero_poison_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 @@ -2405,12 +2405,12 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_v2i16: +; EG-LABEL: v_ctlz_zero_poison_v2i16: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v2i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -2425,8 +2425,8 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ret <2 x i16> %ctlz } -define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { -; SI-LABEL: v_ctlz_zero_undef_v3i16: +define <3 x i16> @v_ctlz_zero_poison_v3i16(<3 x i16> %val) { +; SI-LABEL: v_ctlz_zero_poison_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2439,7 +2439,7 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_v3i16: +; VI-LABEL: v_ctlz_zero_poison_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -2451,12 +2451,12 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_v3i16: +; EG-LABEL: v_ctlz_zero_poison_v3i16: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v3i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -2475,8 +2475,8 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ret <3 x i16> %ctlz } -define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { -; SI-LABEL: v_ctlz_zero_undef_v4i16: +define <4 x i16> @v_ctlz_zero_poison_v4i16(<4 x i16> %val) { +; SI-LABEL: v_ctlz_zero_poison_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 @@ -2494,7 +2494,7 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_v4i16: +; VI-LABEL: v_ctlz_zero_poison_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -2509,12 +2509,12 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_v4i16: +; EG-LABEL: 
v_ctlz_zero_poison_v4i16: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v4i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -2536,8 +2536,8 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ret <4 x i16> %ctlz } -define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { -; SI-LABEL: v_ctlz_zero_undef_v2i8: +define <2 x i8> @v_ctlz_zero_poison_v2i8(<2 x i8> %val) { +; SI-LABEL: v_ctlz_zero_poison_v2i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -2548,7 +2548,7 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_v2i8: +; VI-LABEL: v_ctlz_zero_poison_v2i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -2560,12 +2560,12 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { ; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_v2i8: +; EG-LABEL: v_ctlz_zero_poison_v2i8: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v2i8: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 @@ -2577,8 +2577,8 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { ret <2 x i8> %ctlz } -define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { -; SI-LABEL: v_ctlz_zero_undef_v2i7: +define <2 x i7> @v_ctlz_zero_poison_v2i7(<2 x i7> %val) { +; SI-LABEL: v_ctlz_zero_poison_v2i7: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 @@ -2587,7 +2587,7 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> 
%val) { ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_ctlz_zero_undef_v2i7: +; VI-LABEL: v_ctlz_zero_poison_v2i7: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 @@ -2596,12 +2596,12 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_ctlz_zero_undef_v2i7: +; EG-LABEL: v_ctlz_zero_poison_v2i7: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD ; -; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7: +; GFX9-GISEL-LABEL: v_ctlz_zero_poison_v2i7: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_poison.ll similarity index 94% rename from llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll rename to llvm/test/CodeGen/AMDGPU/cttz_zero_poison.ll index cbfda51a61c29..0f5ee9d407f25 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_poison.ll @@ -13,8 +13,8 @@ declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone -define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { -; SI-LABEL: s_cttz_zero_undef_i32: +define amdgpu_kernel void @s_cttz_zero_poison_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { +; SI-LABEL: s_cttz_zero_poison_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -26,7 +26,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_cttz_zero_undef_i32: +; VI-LABEL: s_cttz_zero_poison_i32: ; VI: ; %bb.0: ; VI-NEXT: 
s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -38,7 +38,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_cttz_zero_undef_i32: +; EG-LABEL: s_cttz_zero_poison_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z, ; -; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: +; GFX9-GISEL-LABEL: s_cttz_zero_poison_i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -64,8 +64,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ret void } -define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_cttz_zero_undef_i32: +define amdgpu_kernel void @v_cttz_zero_poison_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_cttz_zero_poison_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -84,7 +84,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_i32: +; VI-LABEL: v_cttz_zero_poison_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_i32: +; EG-LABEL: v_cttz_zero_poison_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 
@6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: +; GFX9-GISEL-LABEL: v_cttz_zero_poison_i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -138,8 +138,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ret void } -define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_cttz_zero_undef_v2i32: +define amdgpu_kernel void @v_cttz_zero_poison_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_cttz_zero_poison_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -159,7 +159,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_v2i32: +; VI-LABEL: v_cttz_zero_poison_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_v2i32: +; EG-LABEL: v_cttz_zero_poison_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -196,7 +196,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: +; GFX9-GISEL-LABEL: v_cttz_zero_poison_v2i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; 
GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -216,8 +216,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ret void } -define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { -; SI-LABEL: v_cttz_zero_undef_v4i32: +define amdgpu_kernel void @v_cttz_zero_poison_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { +; SI-LABEL: v_cttz_zero_poison_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -239,7 +239,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_v4i32: +; VI-LABEL: v_cttz_zero_poison_v4i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_v4i32: +; EG-LABEL: v_cttz_zero_poison_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -280,7 +280,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: +; GFX9-GISEL-LABEL: v_cttz_zero_poison_v4i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -302,8 +302,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ret void } -define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { -; SI-LABEL: s_cttz_zero_undef_i8_with_select: +define amdgpu_kernel void 
@s_cttz_zero_poison_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { +; SI-LABEL: s_cttz_zero_poison_i8_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -315,7 +315,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_cttz_zero_undef_i8_with_select: +; VI-LABEL: s_cttz_zero_poison_i8_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -327,7 +327,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_cttz_zero_undef_i8_with_select: +; EG-LABEL: s_cttz_zero_poison_i8_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[], KC1[] ; EG-NEXT: TEX 0 @6 @@ -354,7 +354,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: +; GFX9-GISEL-LABEL: s_cttz_zero_poison_i8_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -371,8 +371,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ret void } -define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { -; SI-LABEL: s_cttz_zero_undef_i16_with_select: +define amdgpu_kernel void @s_cttz_zero_poison_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { +; SI-LABEL: s_cttz_zero_poison_i16_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -384,7 +384,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no 
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_cttz_zero_undef_i16_with_select: +; VI-LABEL: s_cttz_zero_poison_i16_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -396,7 +396,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_cttz_zero_undef_i16_with_select: +; EG-LABEL: s_cttz_zero_poison_i16_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[], KC1[] ; EG-NEXT: TEX 0 @6 @@ -423,7 +423,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: +; GFX9-GISEL-LABEL: s_cttz_zero_poison_i16_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -440,8 +440,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { -; SI-LABEL: s_cttz_zero_undef_i32_with_select: +define amdgpu_kernel void @s_cttz_zero_poison_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { +; SI-LABEL: s_cttz_zero_poison_i32_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -453,7 +453,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_cttz_zero_undef_i32_with_select: +; VI-LABEL: s_cttz_zero_poison_i32_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -465,7 +465,7 @@ define amdgpu_kernel void 
@s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_cttz_zero_undef_i32_with_select: +; EG-LABEL: s_cttz_zero_poison_i32_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 @@ -476,7 +476,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z, ; -; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: +; GFX9-GISEL-LABEL: s_cttz_zero_poison_i32_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -493,8 +493,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { -; SI-LABEL: s_cttz_zero_undef_i64_with_select: +define amdgpu_kernel void @s_cttz_zero_poison_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { +; SI-LABEL: s_cttz_zero_poison_i64_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -508,7 +508,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: s_cttz_zero_undef_i64_with_select: +; VI-LABEL: s_cttz_zero_poison_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v3, 0 @@ -520,7 +520,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; -; EG-LABEL: s_cttz_zero_undef_i64_with_select: +; EG-LABEL: s_cttz_zero_poison_i64_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.XY, T1.X, 1 @@ -536,7 +536,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: +; GFX9-GISEL-LABEL: s_cttz_zero_poison_i64_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 @@ -554,8 +554,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_cttz_zero_undef_i8_with_select: +define amdgpu_kernel void @v_cttz_zero_poison_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_cttz_zero_poison_i8_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -575,7 +575,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_i8_with_select: +; VI-LABEL: v_cttz_zero_poison_i8_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -592,7 +592,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_i8_with_select: +; EG-LABEL: v_cttz_zero_poison_i8_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -621,7 +621,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: +; 
GFX9-GISEL-LABEL: v_cttz_zero_poison_i8_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -643,8 +643,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ret void } -define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_cttz_zero_undef_i16_with_select: +define amdgpu_kernel void @v_cttz_zero_poison_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_cttz_zero_poison_i16_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_i16_with_select: +; VI-LABEL: v_cttz_zero_poison_i16_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -693,7 +693,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_i16_with_select: +; EG-LABEL: v_cttz_zero_poison_i16_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 @@ -722,7 +722,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: +; GFX9-GISEL-LABEL: v_cttz_zero_poison_i16_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -749,8 +749,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no 
ret void } -define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_cttz_zero_undef_i32_with_select: +define amdgpu_kernel void @v_cttz_zero_poison_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_cttz_zero_poison_i32_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -781,7 +781,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_i32_with_select: +; VI-LABEL: v_cttz_zero_poison_i32_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -819,7 +819,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_i32_with_select: +; EG-LABEL: v_cttz_zero_poison_i32_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 @@ -841,7 +841,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: +; GFX9-GISEL-LABEL: v_cttz_zero_poison_i32_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -877,8 +877,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ret void } -define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { -; SI-LABEL: v_cttz_zero_undef_i64_with_select: +define amdgpu_kernel void 
@v_cttz_zero_poison_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_cttz_zero_poison_i64_with_select: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -923,7 +923,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_cttz_zero_undef_i64_with_select: +; VI-LABEL: v_cttz_zero_poison_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -994,7 +994,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; -; EG-LABEL: v_cttz_zero_undef_i64_with_select: +; EG-LABEL: v_cttz_zero_poison_i64_with_select: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: +; GFX9-GISEL-LABEL: v_cttz_zero_poison_i64_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir index 42c3ae3a889ba..b3ee77360798f 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir @@ -3,11 +3,11 @@ # RUN: llc -O0 -mtriple arm-linux-gnueabi -mattr=-v5t -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,LIBCALLS --- | define void @test_ctlz_s32() { ret void } - define void @test_ctlz_zero_undef_s32() { ret void } + define void 
@test_ctlz_zero_poison_s32() { ret void } ; same as above but with extensions define void @test_ctlz_s16() { ret void } - define void @test_ctlz_zero_undef_s8() { ret void } + define void @test_ctlz_zero_poison_s8() { ret void } ... --- name: test_ctlz_s32 @@ -47,8 +47,8 @@ body: | BX_RET 14, $noreg, implicit $r0 ... --- -name: test_ctlz_zero_undef_s32 -# CHECK-LABEL: name: test_ctlz_zero_undef_s32 +name: test_ctlz_zero_poison_s32 +# CHECK-LABEL: name: test_ctlz_zero_poison_s32 legalized: false # CHECK: legalized: true regBankSelected: false @@ -72,7 +72,7 @@ body: | ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = COPY $r0 ; LIBCALLS: ADJCALLSTACKUP ; LIBCALLS-NOT: G_CTLZ - %1(s32) = G_CTLZ_ZERO_UNDEF %0 + %1(s32) = G_CTLZ_ZERO_POISON %0 ; CHECK: $r0 = COPY [[R]] $r0 = COPY %1(s32) @@ -128,8 +128,8 @@ body: | BX_RET 14, $noreg, implicit $r0 ... --- -name: test_ctlz_zero_undef_s8 -# CHECK-LABEL: name: test_ctlz_zero_undef_s8 +name: test_ctlz_zero_poison_s8 +# CHECK-LABEL: name: test_ctlz_zero_poison_s8 legalized: false # CHECK: legalized: true regBankSelected: false @@ -159,8 +159,8 @@ body: | ; LIBCALLS: ADJCALLSTACKUP ; LIBCALLS-NOT: G_CTLZ ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ [[R32]] - ; CLZ-NOT: G_CTLZ_ZERO_UNDEF - %2(s8) = G_CTLZ_ZERO_UNDEF %1 + ; CLZ-NOT: G_CTLZ_ZERO_POISON + %2(s8) = G_CTLZ_ZERO_POISON %1 ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[COUNT]], [[BITDIFF]] ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll index ae76452b4d2cd..b6c410b2639e0 100644 --- a/llvm/test/CodeGen/ARM/cttz.ll +++ b/llvm/test/CodeGen/ARM/cttz.ll @@ -366,8 +366,8 @@ define i64 @test_i64(i64 %a) { ;------------------------------------------------------------------------------ -define i8 @test_i8_zero_undef(i8 %a) { -; CHECK-5-LABEL: test_i8_zero_undef: +define i8 @test_i8_zero_poison(i8 %a) { +; CHECK-5-LABEL: test_i8_zero_poison: ; CHECK-5: @ %bb.0: ; CHECK-5-NEXT: sub r1, r0, #1 ; 
CHECK-5-NEXT: bic r0, r1, r0 @@ -375,13 +375,13 @@ define i8 @test_i8_zero_undef(i8 %a) { ; CHECK-5-NEXT: rsb r0, r0, #32 ; CHECK-5-NEXT: bx lr ; -; CHECK-LABEL: test_i8_zero_undef: +; CHECK-LABEL: test_i8_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: rbit r0, r0 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: bx lr ; -; CHECK-6M-LABEL: test_i8_zero_undef: +; CHECK-6M-LABEL: test_i8_zero_poison: ; CHECK-6M: @ %bb.0: ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: beq .LBB4_2 @@ -404,7 +404,7 @@ define i8 @test_i8_zero_undef(i8 %a) { ; CHECK-6M-NEXT: .LCPI4_1: ; CHECK-6M-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t" ; -; CHECK-8MBASE-LABEL: test_i8_zero_undef: +; CHECK-8MBASE-LABEL: test_i8_zero_poison: ; CHECK-8MBASE: @ %bb.0: ; CHECK-8MBASE-NEXT: cbz r0, .LBB4_2 ; CHECK-8MBASE-NEXT: @ %bb.1: @@ -428,8 +428,8 @@ define i8 @test_i8_zero_undef(i8 %a) { ret i8 %tmp } -define i16 @test_i16_zero_undef(i16 %a) { -; CHECK-5-LABEL: test_i16_zero_undef: +define i16 @test_i16_zero_poison(i16 %a) { +; CHECK-5-LABEL: test_i16_zero_poison: ; CHECK-5: @ %bb.0: ; CHECK-5-NEXT: sub r1, r0, #1 ; CHECK-5-NEXT: bic r0, r1, r0 @@ -437,13 +437,13 @@ define i16 @test_i16_zero_undef(i16 %a) { ; CHECK-5-NEXT: rsb r0, r0, #32 ; CHECK-5-NEXT: bx lr ; -; CHECK-LABEL: test_i16_zero_undef: +; CHECK-LABEL: test_i16_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: rbit r0, r0 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: bx lr ; -; CHECK-6M-LABEL: test_i16_zero_undef: +; CHECK-6M-LABEL: test_i16_zero_poison: ; CHECK-6M: @ %bb.0: ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: beq .LBB5_2 @@ -466,7 +466,7 @@ define i16 @test_i16_zero_undef(i16 %a) { ; CHECK-6M-NEXT: .LCPI5_1: ; CHECK-6M-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t" ; -; CHECK-8MBASE-LABEL: test_i16_zero_undef: +; CHECK-8MBASE-LABEL: test_i16_zero_poison: ; CHECK-8MBASE: @ %bb.0: ; 
CHECK-8MBASE-NEXT: cbz r0, .LBB5_2 ; CHECK-8MBASE-NEXT: @ %bb.1: @@ -491,8 +491,8 @@ define i16 @test_i16_zero_undef(i16 %a) { } -define i32 @test_i32_zero_undef(i32 %a) { -; CHECK-5-LABEL: test_i32_zero_undef: +define i32 @test_i32_zero_poison(i32 %a) { +; CHECK-5-LABEL: test_i32_zero_poison: ; CHECK-5: @ %bb.0: ; CHECK-5-NEXT: sub r1, r0, #1 ; CHECK-5-NEXT: bic r0, r1, r0 @@ -500,13 +500,13 @@ define i32 @test_i32_zero_undef(i32 %a) { ; CHECK-5-NEXT: rsb r0, r0, #32 ; CHECK-5-NEXT: bx lr ; -; CHECK-LABEL: test_i32_zero_undef: +; CHECK-LABEL: test_i32_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: rbit r0, r0 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: bx lr ; -; CHECK-6M-LABEL: test_i32_zero_undef: +; CHECK-6M-LABEL: test_i32_zero_poison: ; CHECK-6M: @ %bb.0: ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: beq .LBB6_2 @@ -529,7 +529,7 @@ define i32 @test_i32_zero_undef(i32 %a) { ; CHECK-6M-NEXT: .LCPI6_1: ; CHECK-6M-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t" ; -; CHECK-8MBASE-LABEL: test_i32_zero_undef: +; CHECK-8MBASE-LABEL: test_i32_zero_poison: ; CHECK-8MBASE: @ %bb.0: ; CHECK-8MBASE-NEXT: cbz r0, .LBB6_2 ; CHECK-8MBASE-NEXT: @ %bb.1: @@ -553,8 +553,8 @@ define i32 @test_i32_zero_undef(i32 %a) { ret i32 %tmp } -define i64 @test_i64_zero_undef(i64 %a) { -; CHECK-5-LABEL: test_i64_zero_undef: +define i64 @test_i64_zero_poison(i64 %a) { +; CHECK-5-LABEL: test_i64_zero_poison: ; CHECK-5: @ %bb.0: ; CHECK-5-NEXT: sub r3, r1, #1 ; CHECK-5-NEXT: sub r2, r0, #1 @@ -569,7 +569,7 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-5-NEXT: mov r1, #0 ; CHECK-5-NEXT: bx lr ; -; CHECK-LABEL: test_i64_zero_undef: +; CHECK-LABEL: test_i64_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: rbit r2, r0 @@ -581,7 +581,7 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: bx lr ; -; CHECK-6M-LABEL: test_i64_zero_undef: +; CHECK-6M-LABEL: 
test_i64_zero_poison: ; CHECK-6M: @ %bb.0: ; CHECK-6M-NEXT: .save {r4, r5, r7, lr} ; CHECK-6M-NEXT: push {r4, r5, r7, lr} @@ -625,7 +625,7 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-6M-NEXT: .LCPI7_1: ; CHECK-6M-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t" ; -; CHECK-8MBASE-LABEL: test_i64_zero_undef: +; CHECK-8MBASE-LABEL: test_i64_zero_poison: ; CHECK-8MBASE: @ %bb.0: ; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr} ; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr} diff --git a/llvm/test/CodeGen/ARM/cttz_vector.ll b/llvm/test/CodeGen/ARM/cttz_vector.ll index 988ea5d4acb12..ba4fb074db9c2 100644 --- a/llvm/test/CodeGen/ARM/cttz_vector.ll +++ b/llvm/test/CodeGen/ARM/cttz_vector.ll @@ -282,8 +282,8 @@ define void @test_v2i64(ptr %p) { ;------------------------------------------------------------------------------ -define void @test_v1i8_zero_undef(ptr %p) { -; CHECK-LABEL: test_v1i8_zero_undef: +define void @test_v1i8_zero_poison(ptr %p) { +; CHECK-LABEL: test_v1i8_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldrb r1, [r0] ; CHECK-NEXT: rbit r1, r1 @@ -296,8 +296,8 @@ define void @test_v1i8_zero_undef(ptr %p) { ret void } -define void @test_v2i8_zero_undef(ptr %p) { -; CHECK-LABEL: test_v2i8_zero_undef: +define void @test_v2i8_zero_poison(ptr %p) { +; CHECK-LABEL: test_v2i8_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.16 {d16[0]}, [r0:16] ; CHECK-NEXT: vmovl.u8 q8, d16 @@ -318,8 +318,8 @@ define void @test_v2i8_zero_undef(ptr %p) { ret void } -define void @test_v4i8_zero_undef(ptr %p) { -; CHECK-LABEL: test_v4i8_zero_undef: +define void @test_v4i8_zero_poison(ptr %p) { +; CHECK-LABEL: test_v4i8_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: vmovl.u8 q8, d16 @@ -337,8 +337,8 @@ define void @test_v4i8_zero_undef(ptr %p) { ret void } -define void @test_v8i8_zero_undef(ptr %p) { -; CHECK-LABEL: test_v8i8_zero_undef: +define void 
@test_v8i8_zero_poison(ptr %p) { +; CHECK-LABEL: test_v8i8_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vmov.i8 d18, #0x1 @@ -354,8 +354,8 @@ define void @test_v8i8_zero_undef(ptr %p) { ret void } -define void @test_v16i8_zero_undef(ptr %p) { -; CHECK-LABEL: test_v16i8_zero_undef: +define void @test_v16i8_zero_poison(ptr %p) { +; CHECK-LABEL: test_v16i8_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vmov.i8 q10, #0x1 @@ -371,8 +371,8 @@ define void @test_v16i8_zero_undef(ptr %p) { ret void } -define void @test_v1i16_zero_undef(ptr %p) { -; CHECK-LABEL: test_v1i16_zero_undef: +define void @test_v1i16_zero_poison(ptr %p) { +; CHECK-LABEL: test_v1i16_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldrh r1, [r0] ; CHECK-NEXT: rbit r1, r1 @@ -385,8 +385,8 @@ define void @test_v1i16_zero_undef(ptr %p) { ret void } -define void @test_v2i16_zero_undef(ptr %p) { -; CHECK-LABEL: test_v2i16_zero_undef: +define void @test_v2i16_zero_poison(ptr %p) { +; CHECK-LABEL: test_v2i16_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: vmovl.u16 q8, d16 @@ -404,8 +404,8 @@ define void @test_v2i16_zero_undef(ptr %p) { ret void } -define void @test_v4i16_zero_undef(ptr %p) { -; CHECK-LABEL: test_v4i16_zero_undef: +define void @test_v4i16_zero_poison(ptr %p) { +; CHECK-LABEL: test_v4i16_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vneg.s16 d17, d16 @@ -421,8 +421,8 @@ define void @test_v4i16_zero_undef(ptr %p) { ret void } -define void @test_v8i16_zero_undef(ptr %p) { -; CHECK-LABEL: test_v8i16_zero_undef: +define void @test_v8i16_zero_poison(ptr %p) { +; CHECK-LABEL: test_v8i16_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vneg.s16 q9, q8 @@ -438,8 +438,8 @@ define void @test_v8i16_zero_undef(ptr %p) { ret void } -define void @test_v1i32_zero_undef(ptr %p) { -; CHECK-LABEL: test_v1i32_zero_undef: +define void 
@test_v1i32_zero_poison(ptr %p) { +; CHECK-LABEL: test_v1i32_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: rbit r1, r1 @@ -452,8 +452,8 @@ define void @test_v1i32_zero_undef(ptr %p) { ret void } -define void @test_v2i32_zero_undef(ptr %p) { -; CHECK-LABEL: test_v2i32_zero_undef: +define void @test_v2i32_zero_poison(ptr %p) { +; CHECK-LABEL: test_v2i32_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vneg.s32 d17, d16 @@ -469,8 +469,8 @@ define void @test_v2i32_zero_undef(ptr %p) { ret void } -define void @test_v4i32_zero_undef(ptr %p) { -; CHECK-LABEL: test_v4i32_zero_undef: +define void @test_v4i32_zero_poison(ptr %p) { +; CHECK-LABEL: test_v4i32_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vneg.s32 q9, q8 @@ -486,8 +486,8 @@ define void @test_v4i32_zero_undef(ptr %p) { ret void } -define void @test_v1i64_zero_undef(ptr %p) { -; CHECK-LABEL: test_v1i64_zero_undef: +define void @test_v1i64_zero_poison(ptr %p) { +; CHECK-LABEL: test_v1i64_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov.i32 d16, #0x0 ; CHECK-NEXT: vldr d17, [r0] @@ -507,8 +507,8 @@ define void @test_v1i64_zero_undef(ptr %p) { ret void } -define void @test_v2i64_zero_undef(ptr %p) { -; CHECK-LABEL: test_v2i64_zero_undef: +define void @test_v2i64_zero_poison(ptr %p) { +; CHECK-LABEL: test_v2i64_zero_poison: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov.i32 q8, #0x0 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0] diff --git a/llvm/test/CodeGen/Lanai/i32.ll b/llvm/test/CodeGen/Lanai/i32.ll index 632cc467d6811..a8a588ccbf506 100644 --- a/llvm/test/CodeGen/Lanai/i32.ll +++ b/llvm/test/CodeGen/Lanai/i32.ll @@ -114,10 +114,10 @@ define i32 @clz32(i32 %x) { ret i32 %a } -; CHECK-LABEL: clz32_zero_undef: +; CHECK-LABEL: clz32_zero_poison: ; CHECK-NOT: sub.f ; CHECK: leadz %r{{[0-9]+}}, %rv -define i32 @clz32_zero_undef(i32 %x) { +define i32 @clz32_zero_poison(i32 %x) { %a = call i32 @llvm.ctlz.i32(i32 %x, i1 true) ret i32 %a } @@ -129,10 
+129,10 @@ define i32 @ctz32(i32 %x) { ret i32 %a } -; CHECK-LABEL: ctz32_zero_undef: +; CHECK-LABEL: ctz32_zero_poison: ; CHECK-NOT: sub.f ; CHECK: trailz %r{{[0-9]+}}, %rv -define i32 @ctz32_zero_undef(i32 %x) { +define i32 @ctz32_zero_poison(i32 %x) { %a = call i32 @llvm.cttz.i32(i32 %x, i1 true) ret i32 %a } diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir index a06bb6da45d23..64cb476d961e2 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir @@ -98,7 +98,7 @@ body: | %0:_(s32) = COPY $a0 %2:_(s32) = G_CONSTANT i32 1 %4:_(s32) = G_CONSTANT i32 0 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0(s32) + %1:_(s32) = G_CTTZ_ZERO_POISON %0(s32) %3:_(s32) = nuw nsw G_ADD %1, %2 %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 %6:_(s32) = G_SELECT %5(s1), %4, %3 @@ -158,7 +158,7 @@ body: | %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) %4:_(s64) = G_CONSTANT i64 1 %6:_(s64) = G_CONSTANT i64 0 - %3:_(s64) = G_CTTZ_ZERO_UNDEF %0(s64) + %3:_(s64) = G_CTTZ_ZERO_POISON %0(s64) %5:_(s64) = nuw nsw G_ADD %3, %4 %7:_(s1) = G_ICMP intpred(eq), %0(s64), %6 %8:_(s64) = G_SELECT %7(s1), %6, %5 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir index f3e00ed6521c4..edc8bb70ef21e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir @@ -737,14 +737,14 @@ # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK -# DEBUG-NEXT: G_CTTZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: G_CTTZ_ZERO_POISON (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTLZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK -# DEBUG-NEXT: G_CTLZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: G_CTLZ_ZERO_POISON (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTLS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir index cb6a0da298fe8..f2d8683959222 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir @@ -303,12 +303,12 @@ body: | ... --- -name: ctlz_zero_undef_i8 +name: ctlz_zero_poison_i8 body: | bb.1: liveins: $x10 - ; RV32I-LABEL: name: ctlz_zero_undef_i8 + ; RV32I-LABEL: name: ctlz_zero_poison_i8 ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -348,7 +348,7 @@ body: | ; RV32I-NEXT: $x10 = COPY [[SUB1]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: ctlz_zero_undef_i8 + ; RV32ZBB-LABEL: name: ctlz_zero_poison_i8 ; RV32ZBB: liveins: $x10 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -362,19 +362,19 @@ body: | ; RV32ZBB-NEXT: PseudoRET implicit $x10 %1:_(s32) = COPY $x10 %0:_(s8) = G_TRUNC %1(s32) - %2:_(s8) = G_CTLZ_ZERO_UNDEF %0(s8) + %2:_(s8) = G_CTLZ_ZERO_POISON %0(s8) %3:_(s32) = G_ANYEXT %2(s8) $x10 = COPY %3(s32) PseudoRET implicit $x10 ... 
--- -name: ctlz_zero_undef_i16 +name: ctlz_zero_poison_i16 body: | bb.1: liveins: $x10 - ; RV32I-LABEL: name: ctlz_zero_undef_i16 + ; RV32I-LABEL: name: ctlz_zero_poison_i16 ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -419,7 +419,7 @@ body: | ; RV32I-NEXT: $x10 = COPY [[SUB1]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: ctlz_zero_undef_i16 + ; RV32ZBB-LABEL: name: ctlz_zero_poison_i16 ; RV32ZBB: liveins: $x10 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -433,19 +433,19 @@ body: | ; RV32ZBB-NEXT: PseudoRET implicit $x10 %1:_(s32) = COPY $x10 %0:_(s16) = G_TRUNC %1(s32) - %2:_(s16) = G_CTLZ_ZERO_UNDEF %0(s16) + %2:_(s16) = G_CTLZ_ZERO_POISON %0(s16) %3:_(s32) = G_ANYEXT %2(s16) $x10 = COPY %3(s32) PseudoRET implicit $x10 ... --- -name: ctlz_zero_undef_i32 +name: ctlz_zero_poison_i32 body: | bb.1: liveins: $x10 - ; RV32I-LABEL: name: ctlz_zero_undef_i32 + ; RV32I-LABEL: name: ctlz_zero_poison_i32 ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -486,7 +486,7 @@ body: | ; RV32I-NEXT: $x10 = COPY [[SUB1]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: ctlz_zero_undef_i32 + ; RV32ZBB-LABEL: name: ctlz_zero_poison_i32 ; RV32ZBB: liveins: $x10 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -494,18 +494,18 @@ body: | ; RV32ZBB-NEXT: $x10 = COPY [[CTLZ]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 - %1:_(s32) = G_CTLZ_ZERO_UNDEF %0(s32) + %1:_(s32) = G_CTLZ_ZERO_POISON %0(s32) $x10 = COPY %1(s32) PseudoRET implicit $x10 ... 
--- -name: ctlz_zero_undef_i64 +name: ctlz_zero_poison_i64 body: | bb.1: liveins: $x10, $x11 - ; RV32I-LABEL: name: ctlz_zero_undef_i64 + ; RV32I-LABEL: name: ctlz_zero_poison_i64 ; RV32I: liveins: $x10, $x11 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -575,7 +575,7 @@ body: | ; RV32I-NEXT: $x11 = COPY [[C]](s32) ; RV32I-NEXT: PseudoRET implicit $x10, implicit $x11 ; - ; RV32ZBB-LABEL: name: ctlz_zero_undef_i64 + ; RV32ZBB-LABEL: name: ctlz_zero_poison_i64 ; RV32ZBB: liveins: $x10, $x11 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -593,7 +593,7 @@ body: | %1:_(s32) = COPY $x10 %2:_(s32) = COPY $x11 %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) - %3:_(s64) = G_CTLZ_ZERO_UNDEF %0(s64) + %3:_(s64) = G_CTLZ_ZERO_POISON %0(s64) %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3(s64) $x10 = COPY %4(s32) $x11 = COPY %5(s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir index d65b1f26aa900..32288a44dcfac 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir @@ -280,12 +280,12 @@ body: | ... 
--- -name: ctlz_zero_undef_i8 +name: ctlz_zero_poison_i8 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: ctlz_zero_undef_i8 + ; RV64I-LABEL: name: ctlz_zero_poison_i8 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -325,7 +325,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: ctlz_zero_undef_i8 + ; RV64ZBB-LABEL: name: ctlz_zero_poison_i8 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -339,19 +339,19 @@ body: | ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %0:_(s8) = G_TRUNC %1(s64) - %2:_(s8) = G_CTLZ_ZERO_UNDEF %0(s8) + %2:_(s8) = G_CTLZ_ZERO_POISON %0(s8) %3:_(s64) = G_ANYEXT %2(s8) $x10 = COPY %3(s64) PseudoRET implicit $x10 ... --- -name: ctlz_zero_undef_i16 +name: ctlz_zero_poison_i16 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: ctlz_zero_undef_i16 + ; RV64I-LABEL: name: ctlz_zero_poison_i16 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -396,7 +396,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: ctlz_zero_undef_i16 + ; RV64ZBB-LABEL: name: ctlz_zero_poison_i16 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -410,19 +410,19 @@ body: | ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %0:_(s16) = G_TRUNC %1(s64) - %2:_(s16) = G_CTLZ_ZERO_UNDEF %0(s16) + %2:_(s16) = G_CTLZ_ZERO_POISON %0(s16) %3:_(s64) = G_ANYEXT %2(s16) $x10 = COPY %3(s64) PseudoRET implicit $x10 ... 
--- -name: ctlz_zero_undef_i32 +name: ctlz_zero_poison_i32 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: ctlz_zero_undef_i32 + ; RV64I-LABEL: name: ctlz_zero_poison_i32 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -476,7 +476,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: ctlz_zero_undef_i32 + ; RV64ZBB-LABEL: name: ctlz_zero_poison_i32 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -485,19 +485,19 @@ body: | ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %1(s64) - %2:_(s32) = G_CTLZ_ZERO_UNDEF %0(s32) + %2:_(s32) = G_CTLZ_ZERO_POISON %0(s32) %3:_(s64) = G_ANYEXT %2(s32) $x10 = COPY %3(s64) PseudoRET implicit $x10 ... --- -name: ctlz_zero_undef_i64 +name: ctlz_zero_poison_i64 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: ctlz_zero_undef_i64 + ; RV64I-LABEL: name: ctlz_zero_poison_i64 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -541,7 +541,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: ctlz_zero_undef_i64 + ; RV64ZBB-LABEL: name: ctlz_zero_poison_i64 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -549,7 +549,7 @@ body: | ; RV64ZBB-NEXT: $x10 = COPY [[CTLZ]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 - %1:_(s64) = G_CTLZ_ZERO_UNDEF %0(s64) + %1:_(s64) = G_CTLZ_ZERO_POISON %0(s64) $x10 = COPY %1(s64) PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir index a4a06854fa18d..df05cf770e07d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir +++ 
b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir @@ -255,12 +255,12 @@ body: | ... --- -name: cttz_zero_undef_i8 +name: cttz_zero_poison_i8 body: | bb.1: liveins: $x10 - ; RV32I-LABEL: name: cttz_zero_undef_i8 + ; RV32I-LABEL: name: cttz_zero_poison_i8 ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -293,7 +293,7 @@ body: | ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: cttz_zero_undef_i8 + ; RV32ZBB-LABEL: name: cttz_zero_poison_i8 ; RV32ZBB: liveins: $x10 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -305,19 +305,19 @@ body: | ; RV32ZBB-NEXT: PseudoRET implicit $x10 %1:_(s32) = COPY $x10 %0:_(s8) = G_TRUNC %1(s32) - %2:_(s8) = G_CTTZ_ZERO_UNDEF %0(s8) + %2:_(s8) = G_CTTZ_ZERO_POISON %0(s8) %3:_(s32) = G_ANYEXT %2(s8) $x10 = COPY %3(s32) PseudoRET implicit $x10 ... --- -name: cttz_zero_undef_i16 +name: cttz_zero_poison_i16 body: | bb.1: liveins: $x10 - ; RV32I-LABEL: name: cttz_zero_undef_i16 + ; RV32I-LABEL: name: cttz_zero_poison_i16 ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -352,7 +352,7 @@ body: | ; RV32I-NEXT: $x10 = COPY [[AND7]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: cttz_zero_undef_i16 + ; RV32ZBB-LABEL: name: cttz_zero_poison_i16 ; RV32ZBB: liveins: $x10 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -364,19 +364,19 @@ body: | ; RV32ZBB-NEXT: PseudoRET implicit $x10 %1:_(s32) = COPY $x10 %0:_(s16) = G_TRUNC %1(s32) - %2:_(s16) = G_CTTZ_ZERO_UNDEF %0(s16) + %2:_(s16) = G_CTTZ_ZERO_POISON %0(s16) %3:_(s32) = G_ANYEXT %2(s16) $x10 = COPY %3(s32) PseudoRET implicit $x10 ... 
--- -name: cttz_zero_undef_i32 +name: cttz_zero_poison_i32 body: | bb.1: liveins: $x10 - ; RV32I-LABEL: name: cttz_zero_undef_i32 + ; RV32I-LABEL: name: cttz_zero_poison_i32 ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -407,7 +407,7 @@ body: | ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: cttz_zero_undef_i32 + ; RV32ZBB-LABEL: name: cttz_zero_poison_i32 ; RV32ZBB: liveins: $x10 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -415,18 +415,18 @@ body: | ; RV32ZBB-NEXT: $x10 = COPY [[CTTZ]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0(s32) + %1:_(s32) = G_CTTZ_ZERO_POISON %0(s32) $x10 = COPY %1(s32) PseudoRET implicit $x10 ... --- -name: cttz_zero_undef_i64 +name: cttz_zero_poison_i64 body: | bb.1: liveins: $x10, $x11 - ; RV32I-LABEL: name: cttz_zero_undef_i64 + ; RV32I-LABEL: name: cttz_zero_poison_i64 ; RV32I: liveins: $x10, $x11 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -479,7 +479,7 @@ body: | ; RV32I-NEXT: $x11 = COPY [[C]](s32) ; RV32I-NEXT: PseudoRET implicit $x10, implicit $x11 ; - ; RV32ZBB-LABEL: name: cttz_zero_undef_i64 + ; RV32ZBB-LABEL: name: cttz_zero_poison_i64 ; RV32ZBB: liveins: $x10, $x11 ; RV32ZBB-NEXT: {{ $}} ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 @@ -497,7 +497,7 @@ body: | %1:_(s32) = COPY $x10 %2:_(s32) = COPY $x11 %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) - %3:_(s64) = G_CTTZ_ZERO_UNDEF %0(s64) + %3:_(s64) = G_CTTZ_ZERO_POISON %0(s64) %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3(s64) $x10 = COPY %4(s32) $x11 = COPY %5(s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir index 82220a27d015d..289fce339f707 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir +++ 
b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir @@ -229,12 +229,12 @@ body: | ... --- -name: cttz_zero_undef_i8 +name: cttz_zero_poison_i8 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: cttz_zero_undef_i8 + ; RV64I-LABEL: name: cttz_zero_poison_i8 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -267,7 +267,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[LSHR3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: cttz_zero_undef_i8 + ; RV64ZBB-LABEL: name: cttz_zero_poison_i8 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -278,19 +278,19 @@ body: | ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %0:_(s8) = G_TRUNC %1(s64) - %2:_(s8) = G_CTTZ_ZERO_UNDEF %0(s8) + %2:_(s8) = G_CTTZ_ZERO_POISON %0(s8) %3:_(s64) = G_ANYEXT %2(s8) $x10 = COPY %3(s64) PseudoRET implicit $x10 ... --- -name: cttz_zero_undef_i16 +name: cttz_zero_poison_i16 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: cttz_zero_undef_i16 + ; RV64I-LABEL: name: cttz_zero_poison_i16 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -325,7 +325,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[AND7]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: cttz_zero_undef_i16 + ; RV64ZBB-LABEL: name: cttz_zero_poison_i16 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -336,19 +336,19 @@ body: | ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %0:_(s16) = G_TRUNC %1(s64) - %2:_(s16) = G_CTTZ_ZERO_UNDEF %0(s16) + %2:_(s16) = G_CTTZ_ZERO_POISON %0(s16) %3:_(s64) = G_ANYEXT %2(s16) $x10 = COPY %3(s64) PseudoRET implicit $x10 ... 
--- -name: cttz_zero_undef_i32 +name: cttz_zero_poison_i32 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: cttz_zero_undef_i32 + ; RV64I-LABEL: name: cttz_zero_poison_i32 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -387,7 +387,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[LSHR3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: cttz_zero_undef_i32 + ; RV64ZBB-LABEL: name: cttz_zero_poison_i32 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -396,19 +396,19 @@ body: | ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %1(s64) - %2:_(s32) = G_CTTZ_ZERO_UNDEF %0(s32) + %2:_(s32) = G_CTTZ_ZERO_POISON %0(s32) %3:_(s64) = G_ANYEXT %2(s32) $x10 = COPY %3(s64) PseudoRET implicit $x10 ... --- -name: cttz_zero_undef_i64 +name: cttz_zero_poison_i64 body: | bb.1: liveins: $x10 - ; RV64I-LABEL: name: cttz_zero_undef_i64 + ; RV64I-LABEL: name: cttz_zero_poison_i64 ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -439,7 +439,7 @@ body: | ; RV64I-NEXT: $x10 = COPY [[LSHR3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: cttz_zero_undef_i64 + ; RV64ZBB-LABEL: name: cttz_zero_poison_i64 ; RV64ZBB: liveins: $x10 ; RV64ZBB-NEXT: {{ $}} ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 @@ -447,7 +447,7 @@ body: | ; RV64ZBB-NEXT: $x10 = COPY [[CTTZ]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 - %1:_(s64) = G_CTTZ_ZERO_UNDEF %0(s64) + %1:_(s64) = G_CTTZ_ZERO_POISON %0(s64) $x10 = COPY %1(s64) PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index daac8440e5763..466f855d9aff4 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -435,8 +435,8 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { 
ret i32 %1 } -define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { -; RV64I-LABEL: cttz_zero_undef_i32: +define signext i32 @cttz_zero_poison_i32(i32 signext %a) nounwind { +; RV64I-LABEL: cttz_zero_poison_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill @@ -467,7 +467,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64ZBB-LABEL: cttz_zero_undef_i32: +; RV64ZBB-LABEL: cttz_zero_poison_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctzw a0, a0 ; RV64ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 7f089782d87e0..fd21901d99186 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -558,8 +558,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ret i64 %tmp } -define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { -; RV32_NOZBB-LABEL: test_cttz_i8_zero_undef: +define i8 @test_cttz_i8_zero_poison(i8 %a) nounwind { +; RV32_NOZBB-LABEL: test_cttz_i8_zero_poison: ; RV32_NOZBB: # %bb.0: ; RV32_NOZBB-NEXT: addi a1, a0, -1 ; RV32_NOZBB-NEXT: not a0, a0 @@ -576,7 +576,7 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ; RV32_NOZBB-NEXT: andi a0, a0, 15 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_cttz_i8_zero_undef: +; RV64NOZBB-LABEL: test_cttz_i8_zero_poison: ; RV64NOZBB: # %bb.0: ; RV64NOZBB-NEXT: addi a1, a0, -1 ; RV64NOZBB-NEXT: not a0, a0 @@ -593,17 +593,17 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ; RV64NOZBB-NEXT: andi a0, a0, 15 ; RV64NOZBB-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i8_zero_undef: +; RV32ZBB-LABEL: test_cttz_i8_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: ctz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_cttz_i8_zero_undef: +; RV64ZBB-LABEL: test_cttz_i8_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctz a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: 
test_cttz_i8_zero_undef: +; RV32XTHEADBB-LABEL: test_cttz_i8_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: addi a1, a0, -1 ; RV32XTHEADBB-NEXT: not a0, a0 @@ -613,7 +613,7 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ; RV32XTHEADBB-NEXT: sub a0, a1, a0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_cttz_i8_zero_undef: +; RV64XTHEADBB-LABEL: test_cttz_i8_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NEXT: not a0, a0 @@ -626,8 +626,8 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ret i8 %tmp } -define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { -; RV32_NOZBB-LABEL: test_cttz_i16_zero_undef: +define i16 @test_cttz_i16_zero_poison(i16 %a) nounwind { +; RV32_NOZBB-LABEL: test_cttz_i16_zero_poison: ; RV32_NOZBB: # %bb.0: ; RV32_NOZBB-NEXT: addi a1, a0, -1 ; RV32_NOZBB-NEXT: not a0, a0 @@ -651,7 +651,7 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_cttz_i16_zero_undef: +; RV64NOZBB-LABEL: test_cttz_i16_zero_poison: ; RV64NOZBB: # %bb.0: ; RV64NOZBB-NEXT: addi a1, a0, -1 ; RV64NOZBB-NEXT: not a0, a0 @@ -675,17 +675,17 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i16_zero_undef: +; RV32ZBB-LABEL: test_cttz_i16_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: ctz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_cttz_i16_zero_undef: +; RV64ZBB-LABEL: test_cttz_i16_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctz a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: test_cttz_i16_zero_undef: +; RV32XTHEADBB-LABEL: test_cttz_i16_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: addi a1, a0, -1 ; RV32XTHEADBB-NEXT: not a0, a0 @@ -695,7 +695,7 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV32XTHEADBB-NEXT: sub a0, a1, a0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: 
test_cttz_i16_zero_undef: +; RV64XTHEADBB-LABEL: test_cttz_i16_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NEXT: not a0, a0 @@ -708,8 +708,8 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ret i16 %tmp } -define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { -; RV32I-LABEL: test_cttz_i32_zero_undef: +define i32 @test_cttz_i32_zero_poison(i32 %a) nounwind { +; RV32I-LABEL: test_cttz_i32_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill @@ -727,7 +727,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV64I-LABEL: test_cttz_i32_zero_undef: +; RV64I-LABEL: test_cttz_i32_zero_poison: ; RV64I: # %bb.0: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 @@ -758,7 +758,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_cttz_i32_zero_undef: +; RV32M-LABEL: test_cttz_i32_zero_poison: ; RV32M: # %bb.0: ; RV32M-NEXT: neg a1, a0 ; RV32M-NEXT: and a0, a0, a1 @@ -772,7 +772,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; RV32M-NEXT: lbu a0, 0(a0) ; RV32M-NEXT: ret ; -; RV64M-LABEL: test_cttz_i32_zero_undef: +; RV64M-LABEL: test_cttz_i32_zero_poison: ; RV64M: # %bb.0: ; RV64M-NEXT: neg a1, a0 ; RV64M-NEXT: and a0, a0, a1 @@ -786,17 +786,17 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; RV64M-NEXT: lbu a0, 0(a0) ; RV64M-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i32_zero_undef: +; RV32ZBB-LABEL: test_cttz_i32_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: ctz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_cttz_i32_zero_undef: +; RV64ZBB-LABEL: test_cttz_i32_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctzw a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: test_cttz_i32_zero_undef: +; RV32XTHEADBB-LABEL: test_cttz_i32_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: addi 
a1, a0, -1 ; RV32XTHEADBB-NEXT: not a0, a0 @@ -806,7 +806,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; RV32XTHEADBB-NEXT: sub a0, a1, a0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_cttz_i32_zero_undef: +; RV64XTHEADBB-LABEL: test_cttz_i32_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NEXT: not a0, a0 @@ -819,8 +819,8 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ret i32 %tmp } -define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { -; RV32I-LABEL: test_cttz_i64_zero_undef: +define i64 @test_cttz_i64_zero_poison(i64 %a) nounwind { +; RV32I-LABEL: test_cttz_i64_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill @@ -866,7 +866,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; -; RV64I-LABEL: test_cttz_i64_zero_undef: +; RV64I-LABEL: test_cttz_i64_zero_poison: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill @@ -884,7 +884,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_cttz_i64_zero_undef: +; RV32M-LABEL: test_cttz_i64_zero_poison: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a3, 30667 ; RV32M-NEXT: addi a3, a3, 1329 @@ -911,7 +911,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret ; -; RV64M-LABEL: test_cttz_i64_zero_undef: +; RV64M-LABEL: test_cttz_i64_zero_poison: ; RV64M: # %bb.0: ; RV64M-NEXT: lui a1, %hi(.LCPI7_0) ; RV64M-NEXT: ld a1, %lo(.LCPI7_0)(a1) @@ -925,7 +925,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV64M-NEXT: lbu a0, 0(a0) ; RV64M-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i64_zero_undef: +; RV32ZBB-LABEL: test_cttz_i64_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: bnez a0, .LBB7_2 ; RV32ZBB-NEXT: # %bb.1: @@ -938,12 +938,12 @@ define i64 
@test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32ZBB-NEXT: li a1, 0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_cttz_i64_zero_undef: +; RV64ZBB-LABEL: test_cttz_i64_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctz a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: test_cttz_i64_zero_undef: +; RV32XTHEADBB-LABEL: test_cttz_i64_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: bnez a0, .LBB7_2 ; RV32XTHEADBB-NEXT: # %bb.1: @@ -964,7 +964,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32XTHEADBB-NEXT: li a1, 0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_cttz_i64_zero_undef: +; RV64XTHEADBB-LABEL: test_cttz_i64_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NEXT: not a0, a0 @@ -1658,8 +1658,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ret i64 %tmp } -define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { -; RV32_NOZBB-LABEL: test_ctlz_i8_zero_undef: +define i8 @test_ctlz_i8_zero_poison(i8 %a) nounwind { +; RV32_NOZBB-LABEL: test_ctlz_i8_zero_poison: ; RV32_NOZBB: # %bb.0: ; RV32_NOZBB-NEXT: slli a1, a0, 24 ; RV32_NOZBB-NEXT: srli a1, a1, 25 @@ -1683,7 +1683,7 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { ; RV32_NOZBB-NEXT: andi a0, a0, 15 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctlz_i8_zero_undef: +; RV64NOZBB-LABEL: test_ctlz_i8_zero_poison: ; RV64NOZBB: # %bb.0: ; RV64NOZBB-NEXT: slli a1, a0, 56 ; RV64NOZBB-NEXT: srli a1, a1, 57 @@ -1707,25 +1707,25 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { ; RV64NOZBB-NEXT: andi a0, a0, 15 ; RV64NOZBB-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i8_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i8_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: slli a0, a0, 24 ; RV32ZBB-NEXT: clz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_ctlz_i8_zero_undef: +; RV64ZBB-LABEL: test_ctlz_i8_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 56 ; RV64ZBB-NEXT: clz a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: 
test_ctlz_i8_zero_undef: +; RV32XTHEADBB-LABEL: test_ctlz_i8_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: slli a0, a0, 24 ; RV32XTHEADBB-NEXT: th.ff1 a0, a0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_ctlz_i8_zero_undef: +; RV64XTHEADBB-LABEL: test_ctlz_i8_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: slli a0, a0, 56 ; RV64XTHEADBB-NEXT: th.ff1 a0, a0 @@ -1734,8 +1734,8 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { ret i8 %tmp } -define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { -; RV32_NOZBB-LABEL: test_ctlz_i16_zero_undef: +define i16 @test_ctlz_i16_zero_poison(i16 %a) nounwind { +; RV32_NOZBB-LABEL: test_ctlz_i16_zero_poison: ; RV32_NOZBB: # %bb.0: ; RV32_NOZBB-NEXT: slli a1, a0, 16 ; RV32_NOZBB-NEXT: lui a2, 5 @@ -1769,7 +1769,7 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef: +; RV64NOZBB-LABEL: test_ctlz_i16_zero_poison: ; RV64NOZBB: # %bb.0: ; RV64NOZBB-NEXT: slli a1, a0, 48 ; RV64NOZBB-NEXT: lui a2, 5 @@ -1803,25 +1803,25 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i16_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i16_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: slli a0, a0, 16 ; RV32ZBB-NEXT: clz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_ctlz_i16_zero_undef: +; RV64ZBB-LABEL: test_ctlz_i16_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 48 ; RV64ZBB-NEXT: clz a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: test_ctlz_i16_zero_undef: +; RV32XTHEADBB-LABEL: test_ctlz_i16_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: slli a0, a0, 16 ; RV32XTHEADBB-NEXT: th.ff1 a0, a0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_ctlz_i16_zero_undef: +; RV64XTHEADBB-LABEL: test_ctlz_i16_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: slli a0, a0, 48 ; 
RV64XTHEADBB-NEXT: th.ff1 a0, a0 @@ -1830,8 +1830,8 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ret i16 %tmp } -define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { -; RV32I-LABEL: test_ctlz_i32_zero_undef: +define i32 @test_ctlz_i32_zero_poison(i32 %a) nounwind { +; RV32I-LABEL: test_ctlz_i32_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 @@ -1867,7 +1867,7 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: ret ; -; RV64I-LABEL: test_ctlz_i32_zero_undef: +; RV64I-LABEL: test_ctlz_i32_zero_poison: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 @@ -1903,7 +1903,7 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_ctlz_i32_zero_undef: +; RV32M-LABEL: test_ctlz_i32_zero_poison: ; RV32M: # %bb.0: ; RV32M-NEXT: srli a1, a0, 1 ; RV32M-NEXT: lui a2, 349525 @@ -1938,7 +1938,7 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32M-NEXT: srli a0, a0, 24 ; RV32M-NEXT: ret ; -; RV64M-LABEL: test_ctlz_i32_zero_undef: +; RV64M-LABEL: test_ctlz_i32_zero_poison: ; RV64M: # %bb.0: ; RV64M-NEXT: srliw a1, a0, 1 ; RV64M-NEXT: lui a2, 349525 @@ -1973,22 +1973,22 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64M-NEXT: srliw a0, a0, 24 ; RV64M-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i32_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i32_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: clz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_ctlz_i32_zero_undef: +; RV64ZBB-LABEL: test_ctlz_i32_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: clzw a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: test_ctlz_i32_zero_undef: +; RV32XTHEADBB-LABEL: test_ctlz_i32_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: th.ff1 a0, a0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_ctlz_i32_zero_undef: +; RV64XTHEADBB-LABEL: 
test_ctlz_i32_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: slli a0, a0, 32 ; RV64XTHEADBB-NEXT: th.ff1 a0, a0 @@ -1997,8 +1997,8 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ret i32 %tmp } -define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { -; RV32I-LABEL: test_ctlz_i64_zero_undef: +define i64 @test_ctlz_i64_zero_poison(i64 %a) nounwind { +; RV32I-LABEL: test_ctlz_i64_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: lui a3, 209715 @@ -2067,7 +2067,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: test_ctlz_i64_zero_undef: +; RV64I-LABEL: test_ctlz_i64_zero_poison: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 @@ -2113,7 +2113,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ret ; -; RV32M-LABEL: test_ctlz_i64_zero_undef: +; RV32M-LABEL: test_ctlz_i64_zero_poison: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a2, 349525 ; RV32M-NEXT: lui a3, 209715 @@ -2178,7 +2178,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret ; -; RV64M-LABEL: test_ctlz_i64_zero_undef: +; RV64M-LABEL: test_ctlz_i64_zero_poison: ; RV64M: # %bb.0: ; RV64M-NEXT: srli a1, a0, 1 ; RV64M-NEXT: lui a2, 349525 @@ -2223,7 +2223,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i64_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i64_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: bnez a1, .LBB15_2 ; RV32ZBB-NEXT: # %bb.1: @@ -2236,12 +2236,12 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32ZBB-NEXT: li a1, 0 ; RV32ZBB-NEXT: ret ; -; RV64ZBB-LABEL: test_ctlz_i64_zero_undef: +; RV64ZBB-LABEL: test_ctlz_i64_zero_poison: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: clz a0, a0 ; RV64ZBB-NEXT: ret ; -; RV32XTHEADBB-LABEL: test_ctlz_i64_zero_undef: +; RV32XTHEADBB-LABEL: 
test_ctlz_i64_zero_poison: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: bnez a1, .LBB15_2 ; RV32XTHEADBB-NEXT: # %bb.1: @@ -2254,7 +2254,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32XTHEADBB-NEXT: li a1, 0 ; RV32XTHEADBB-NEXT: ret ; -; RV64XTHEADBB-LABEL: test_ctlz_i64_zero_undef: +; RV64XTHEADBB-LABEL: test_ctlz_i64_zero_poison: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: th.ff1 a0, a0 ; RV64XTHEADBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index c62fb0ae63555..4331d7ca79123 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -444,8 +444,8 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ret i32 %1 } -define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { -; RV64I-LABEL: cttz_zero_undef_i32: +define signext i32 @cttz_zero_poison_i32(i32 signext %a) nounwind { +; RV64I-LABEL: cttz_zero_poison_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 @@ -476,7 +476,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: ret ; -; RV64XTHEADBB-NOB-LABEL: cttz_zero_undef_i32: +; RV64XTHEADBB-NOB-LABEL: cttz_zero_poison_i32: ; RV64XTHEADBB-NOB: # %bb.0: ; RV64XTHEADBB-NOB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NOB-NEXT: not a0, a0 @@ -486,7 +486,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64XTHEADBB-NOB-NEXT: sub a0, a1, a0 ; RV64XTHEADBB-NOB-NEXT: ret ; -; RV64XTHEADBB-B-LABEL: cttz_zero_undef_i32: +; RV64XTHEADBB-B-LABEL: cttz_zero_poison_i32: ; RV64XTHEADBB-B: # %bb.0: ; RV64XTHEADBB-B-NEXT: ctzw a0, a0 ; RV64XTHEADBB-B-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index e840605710f21..759bf84262461 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -381,8 +381,8 @@ define signext i32 @cttz_i32(i32 signext %a) 
nounwind { ret i32 %1 } -define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { -; RV64I-LABEL: cttz_zero_undef_i32: +define signext i32 @cttz_zero_poison_i32(i32 signext %a) nounwind { +; RV64I-LABEL: cttz_zero_poison_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 @@ -413,7 +413,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: ret ; -; RV64ZBB-LABEL: cttz_zero_undef_i32: +; RV64ZBB-LABEL: cttz_zero_poison_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctzw a0, a0 ; RV64ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index 1f45e45f23164..b2e62251f630f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1666,8 +1666,8 @@ define @ctlz_nxv8i64( %va) { ret %a } -define @ctlz_zero_undef_nxv1i8( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv1i8: +define @ctlz_zero_poison_nxv1i8( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv1i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -1691,7 +1691,7 @@ define @ctlz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv1i8: +; CHECK-F-LABEL: ctlz_zero_poison_nxv1i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vzext.vf2 v9, v8 @@ -1703,7 +1703,7 @@ define @ctlz_zero_undef_nxv1i8( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv1i8: +; CHECK-D-LABEL: ctlz_zero_poison_nxv1i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vzext.vf2 v9, v8 @@ -1715,7 +1715,7 @@ define @ctlz_zero_undef_nxv1i8( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv1i8: +; CHECK-ZVBB-LABEL: 
ctlz_zero_poison_nxv1i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1724,8 +1724,8 @@ define @ctlz_zero_undef_nxv1i8( %va) { ret %a } -define @ctlz_zero_undef_nxv2i8( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv2i8: +define @ctlz_zero_poison_nxv2i8( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv2i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -1749,7 +1749,7 @@ define @ctlz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv2i8: +; CHECK-F-LABEL: ctlz_zero_poison_nxv2i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vzext.vf2 v9, v8 @@ -1761,7 +1761,7 @@ define @ctlz_zero_undef_nxv2i8( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv2i8: +; CHECK-D-LABEL: ctlz_zero_poison_nxv2i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-D-NEXT: vzext.vf2 v9, v8 @@ -1773,7 +1773,7 @@ define @ctlz_zero_undef_nxv2i8( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv2i8: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv2i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1782,8 +1782,8 @@ define @ctlz_zero_undef_nxv2i8( %va) { ret %a } -define @ctlz_zero_undef_nxv4i8( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv4i8: +define @ctlz_zero_poison_nxv4i8( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv4i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -1807,7 +1807,7 @@ define @ctlz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv4i8: +; CHECK-F-LABEL: 
ctlz_zero_poison_nxv4i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-F-NEXT: vzext.vf2 v10, v8 @@ -1819,7 +1819,7 @@ define @ctlz_zero_undef_nxv4i8( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv4i8: +; CHECK-D-LABEL: ctlz_zero_poison_nxv4i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vzext.vf2 v10, v8 @@ -1831,7 +1831,7 @@ define @ctlz_zero_undef_nxv4i8( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv4i8: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv4i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1840,8 +1840,8 @@ define @ctlz_zero_undef_nxv4i8( %va) { ret %a } -define @ctlz_zero_undef_nxv8i8( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv8i8: +define @ctlz_zero_poison_nxv8i8( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv8i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -1865,7 +1865,7 @@ define @ctlz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv8i8: +; CHECK-F-LABEL: ctlz_zero_poison_nxv8i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vzext.vf2 v12, v8 @@ -1877,7 +1877,7 @@ define @ctlz_zero_undef_nxv8i8( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv8i8: +; CHECK-D-LABEL: ctlz_zero_poison_nxv8i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vzext.vf2 v12, v8 @@ -1889,7 +1889,7 @@ define @ctlz_zero_undef_nxv8i8( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i8: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv8i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: 
vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1898,8 +1898,8 @@ define @ctlz_zero_undef_nxv8i8( %va) { ret %a } -define @ctlz_zero_undef_nxv16i8( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv16i8: +define @ctlz_zero_poison_nxv16i8( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv16i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 @@ -1923,7 +1923,7 @@ define @ctlz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv16i8: +; CHECK-F-LABEL: ctlz_zero_poison_nxv16i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vzext.vf2 v16, v8 @@ -1935,7 +1935,7 @@ define @ctlz_zero_undef_nxv16i8( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv16i8: +; CHECK-D-LABEL: ctlz_zero_poison_nxv16i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vzext.vf2 v16, v8 @@ -1947,7 +1947,7 @@ define @ctlz_zero_undef_nxv16i8( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv16i8: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv16i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1956,8 +1956,8 @@ define @ctlz_zero_undef_nxv16i8( %va) { ret %a } -define @ctlz_zero_undef_nxv32i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv32i8: +define @ctlz_zero_poison_nxv32i8( %va) { +; CHECK-LABEL: ctlz_zero_poison_nxv32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 @@ -1981,7 +1981,7 @@ define @ctlz_zero_undef_nxv32i8( %va) { ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv32i8: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv32i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m4, 
ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1990,8 +1990,8 @@ define @ctlz_zero_undef_nxv32i8( %va) { ret %a } -define @ctlz_zero_undef_nxv64i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv64i8: +define @ctlz_zero_poison_nxv64i8( %va) { +; CHECK-LABEL: ctlz_zero_poison_nxv64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 @@ -2015,7 +2015,7 @@ define @ctlz_zero_undef_nxv64i8( %va) { ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv64i8: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv64i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2024,8 +2024,8 @@ define @ctlz_zero_undef_nxv64i8( %va) { ret %a } -define @ctlz_zero_undef_nxv1i16( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv1i16: +define @ctlz_zero_poison_nxv1i16( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv1i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -2058,7 +2058,7 @@ define @ctlz_zero_undef_nxv1i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv1i16: +; CHECK-F-LABEL: ctlz_zero_poison_nxv1i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 @@ -2067,7 +2067,7 @@ define @ctlz_zero_undef_nxv1i16( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv1i16: +; CHECK-D-LABEL: ctlz_zero_poison_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v9, v8 @@ -2076,7 +2076,7 @@ define @ctlz_zero_undef_nxv1i16( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv1i16: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv1i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; 
CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2085,8 +2085,8 @@ define @ctlz_zero_undef_nxv1i16( %va) { ret %a } -define @ctlz_zero_undef_nxv2i16( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv2i16: +define @ctlz_zero_poison_nxv2i16( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv2i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -2119,7 +2119,7 @@ define @ctlz_zero_undef_nxv2i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv2i16: +; CHECK-F-LABEL: ctlz_zero_poison_nxv2i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 @@ -2128,7 +2128,7 @@ define @ctlz_zero_undef_nxv2i16( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv2i16: +; CHECK-D-LABEL: ctlz_zero_poison_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v9, v8 @@ -2137,7 +2137,7 @@ define @ctlz_zero_undef_nxv2i16( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv2i16: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv2i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2146,8 +2146,8 @@ define @ctlz_zero_undef_nxv2i16( %va) { ret %a } -define @ctlz_zero_undef_nxv4i16( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv4i16: +define @ctlz_zero_poison_nxv4i16( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv4i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -2180,7 +2180,7 @@ define @ctlz_zero_undef_nxv4i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv4i16: +; CHECK-F-LABEL: ctlz_zero_poison_nxv4i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, 
zero, e16, m1, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8 @@ -2189,7 +2189,7 @@ define @ctlz_zero_undef_nxv4i16( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv4i16: +; CHECK-D-LABEL: ctlz_zero_poison_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v8 @@ -2198,7 +2198,7 @@ define @ctlz_zero_undef_nxv4i16( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv4i16: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv4i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2207,8 +2207,8 @@ define @ctlz_zero_undef_nxv4i16( %va) { ret %a } -define @ctlz_zero_undef_nxv8i16( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv8i16: +define @ctlz_zero_poison_nxv8i16( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv8i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 @@ -2241,7 +2241,7 @@ define @ctlz_zero_undef_nxv8i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv8i16: +; CHECK-F-LABEL: ctlz_zero_poison_nxv8i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8 @@ -2250,7 +2250,7 @@ define @ctlz_zero_undef_nxv8i16( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv8i16: +; CHECK-D-LABEL: ctlz_zero_poison_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v8 @@ -2259,7 +2259,7 @@ define @ctlz_zero_undef_nxv8i16( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i16: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv8i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; 
CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2268,8 +2268,8 @@ define @ctlz_zero_undef_nxv8i16( %va) { ret %a } -define @ctlz_zero_undef_nxv16i16( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv16i16: +define @ctlz_zero_poison_nxv16i16( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv16i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 @@ -2302,7 +2302,7 @@ define @ctlz_zero_undef_nxv16i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv16i16: +; CHECK-F-LABEL: ctlz_zero_poison_nxv16i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8 @@ -2311,7 +2311,7 @@ define @ctlz_zero_undef_nxv16i16( %va) { ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv16i16: +; CHECK-D-LABEL: ctlz_zero_poison_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v8 @@ -2320,7 +2320,7 @@ define @ctlz_zero_undef_nxv16i16( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv16i16: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv16i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2329,8 +2329,8 @@ define @ctlz_zero_undef_nxv16i16( %va) { ret %a } -define @ctlz_zero_undef_nxv32i16( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv32i16: +define @ctlz_zero_poison_nxv32i16( %va) { +; CHECK-LABEL: ctlz_zero_poison_nxv32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 @@ -2363,7 +2363,7 @@ define @ctlz_zero_undef_nxv32i16( %va) { ; CHECK-NEXT: vsrl.vi v8, v8, 8 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv32i16: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv32i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m8, ta, 
ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2372,8 +2372,8 @@ define @ctlz_zero_undef_nxv32i16( %va) { ret %a } -define @ctlz_zero_undef_nxv1i32( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv1i32: +define @ctlz_zero_poison_nxv1i32( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv1i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -2409,7 +2409,7 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv1i32: +; CHECK-F-LABEL: ctlz_zero_poison_nxv1i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma @@ -2420,7 +2420,7 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv1i32: +; CHECK-D-LABEL: ctlz_zero_poison_nxv1i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v9, v8 @@ -2430,7 +2430,7 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv1i32: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv1i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2439,8 +2439,8 @@ define @ctlz_zero_undef_nxv1i32( %va) { ret %a } -define @ctlz_zero_undef_nxv2i32( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv2i32: +define @ctlz_zero_poison_nxv2i32( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv2i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 @@ -2476,7 +2476,7 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv2i32: +; CHECK-F-LABEL: ctlz_zero_poison_nxv2i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: 
vsetvli a1, zero, e32, m1, ta, ma @@ -2487,7 +2487,7 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv2i32: +; CHECK-D-LABEL: ctlz_zero_poison_nxv2i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v8 @@ -2497,7 +2497,7 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv2i32: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv2i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2506,8 +2506,8 @@ define @ctlz_zero_undef_nxv2i32( %va) { ret %a } -define @ctlz_zero_undef_nxv4i32( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv4i32: +define @ctlz_zero_poison_nxv4i32( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv4i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 @@ -2543,7 +2543,7 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv4i32: +; CHECK-F-LABEL: ctlz_zero_poison_nxv4i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma @@ -2554,7 +2554,7 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv4i32: +; CHECK-D-LABEL: ctlz_zero_poison_nxv4i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v8 @@ -2564,7 +2564,7 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv4i32: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv4i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2573,8 +2573,8 @@ define 
@ctlz_zero_undef_nxv4i32( %va) { ret %a } -define @ctlz_zero_undef_nxv8i32( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv8i32: +define @ctlz_zero_poison_nxv8i32( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv8i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 @@ -2610,7 +2610,7 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv8i32: +; CHECK-F-LABEL: ctlz_zero_poison_nxv8i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma @@ -2621,7 +2621,7 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv8i32: +; CHECK-D-LABEL: ctlz_zero_poison_nxv8i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v8 @@ -2631,7 +2631,7 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i32: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv8i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2640,8 +2640,8 @@ define @ctlz_zero_undef_nxv8i32( %va) { ret %a } -define @ctlz_zero_undef_nxv16i32( %va) { -; CHECK-ZVE64X-LABEL: ctlz_zero_undef_nxv16i32: +define @ctlz_zero_poison_nxv16i32( %va) { +; CHECK-ZVE64X-LABEL: ctlz_zero_poison_nxv16i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 @@ -2677,7 +2677,7 @@ define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv16i32: +; CHECK-F-LABEL: ctlz_zero_poison_nxv16i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m8, ta, ma @@ -2688,7 +2688,7 @@ 
define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32: +; CHECK-D-LABEL: ctlz_zero_poison_nxv16i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e32, m8, ta, ma @@ -2699,7 +2699,7 @@ define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv16i32: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv16i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2708,8 +2708,8 @@ define @ctlz_zero_undef_nxv16i32( %va) { ret %a } -define @ctlz_zero_undef_nxv1i64( %va) { -; RV32I-LABEL: ctlz_zero_undef_nxv1i64: +define @ctlz_zero_poison_nxv1i64( %va) { +; RV32I-LABEL: ctlz_zero_poison_nxv1i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32I-NEXT: vsrl.vi v9, v8, 1 @@ -2761,7 +2761,7 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: ctlz_zero_undef_nxv1i64: +; RV64I-LABEL: ctlz_zero_poison_nxv1i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64I-NEXT: vsrl.vi v9, v8, 1 @@ -2809,7 +2809,7 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64: +; CHECK-F-LABEL: ctlz_zero_poison_nxv1i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 ; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma @@ -2821,7 +2821,7 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64: +; CHECK-D-LABEL: ctlz_zero_poison_nxv1i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m1, ta, ma @@ -2833,7 +2833,7 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv1i64: +; CHECK-ZVBB-LABEL: 
ctlz_zero_poison_nxv1i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2842,8 +2842,8 @@ define @ctlz_zero_undef_nxv1i64( %va) { ret %a } -define @ctlz_zero_undef_nxv2i64( %va) { -; RV32I-LABEL: ctlz_zero_undef_nxv2i64: +define @ctlz_zero_poison_nxv2i64( %va) { +; RV32I-LABEL: ctlz_zero_poison_nxv2i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32I-NEXT: vsrl.vi v10, v8, 1 @@ -2895,7 +2895,7 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: ctlz_zero_undef_nxv2i64: +; RV64I-LABEL: ctlz_zero_poison_nxv2i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV64I-NEXT: vsrl.vi v10, v8, 1 @@ -2943,7 +2943,7 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64: +; CHECK-F-LABEL: ctlz_zero_poison_nxv2i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma @@ -2955,7 +2955,7 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64: +; CHECK-D-LABEL: ctlz_zero_poison_nxv2i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m2, ta, ma @@ -2967,7 +2967,7 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv2i64: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv2i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2976,8 +2976,8 @@ define @ctlz_zero_undef_nxv2i64( %va) { ret %a } -define @ctlz_zero_undef_nxv4i64( %va) { -; RV32I-LABEL: ctlz_zero_undef_nxv4i64: +define @ctlz_zero_poison_nxv4i64( %va) { +; RV32I-LABEL: ctlz_zero_poison_nxv4i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32I-NEXT: vsrl.vi v12, 
v8, 1 @@ -3029,7 +3029,7 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: ctlz_zero_undef_nxv4i64: +; RV64I-LABEL: ctlz_zero_poison_nxv4i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64I-NEXT: vsrl.vi v12, v8, 1 @@ -3077,7 +3077,7 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64: +; CHECK-F-LABEL: ctlz_zero_poison_nxv4i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma @@ -3089,7 +3089,7 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64: +; CHECK-D-LABEL: ctlz_zero_poison_nxv4i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m4, ta, ma @@ -3101,7 +3101,7 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv4i64: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv4i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -3110,8 +3110,8 @@ define @ctlz_zero_undef_nxv4i64( %va) { ret %a } -define @ctlz_zero_undef_nxv8i64( %va) { -; RV32I-LABEL: ctlz_zero_undef_nxv8i64: +define @ctlz_zero_poison_nxv8i64( %va) { +; RV32I-LABEL: ctlz_zero_poison_nxv8i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32I-NEXT: vsrl.vi v16, v8, 1 @@ -3163,7 +3163,7 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: ctlz_zero_undef_nxv8i64: +; RV64I-LABEL: ctlz_zero_poison_nxv8i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64I-NEXT: vsrl.vi v16, v8, 1 @@ -3211,7 +3211,7 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64: +; 
CHECK-F-LABEL: ctlz_zero_poison_nxv8i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma @@ -3223,7 +3223,7 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64: +; CHECK-D-LABEL: ctlz_zero_poison_nxv8i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -3235,7 +3235,7 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i64: +; CHECK-ZVBB-LABEL: ctlz_zero_poison_nxv8i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index 26b778f061e95..f266fcc8787f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -1200,8 +1200,8 @@ define @vp_ctlz_nxv16i64_unmasked( %va, i ret %v } -define @vp_ctlz_zero_undef_nxv1i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i8: +define @vp_ctlz_zero_poison_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -1213,7 +1213,7 @@ define @vp_ctlz_zero_undef_nxv1i8( %va, @vp_ctlz_zero_undef_nxv1i8( %va, %v } -define @vp_ctlz_zero_undef_nxv1i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i8_unmasked: +define @vp_ctlz_zero_poison_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -1235,7 +1235,7 @@ define @vp_ctlz_zero_undef_nxv1i8_unmasked( % ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i8_unmasked: +; CHECK-ZVBB-LABEL: 
vp_ctlz_zero_poison_nxv1i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1244,8 +1244,8 @@ define @vp_ctlz_zero_undef_nxv1i8_unmasked( % ret %v } -define @vp_ctlz_zero_undef_nxv2i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i8: +define @vp_ctlz_zero_poison_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -1257,7 +1257,7 @@ define @vp_ctlz_zero_undef_nxv2i8( %va, @vp_ctlz_zero_undef_nxv2i8( %va, %v } -define @vp_ctlz_zero_undef_nxv2i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i8_unmasked: +define @vp_ctlz_zero_poison_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv2i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -1279,7 +1279,7 @@ define @vp_ctlz_zero_undef_nxv2i8_unmasked( % ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i8_unmasked: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv2i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1288,8 +1288,8 @@ define @vp_ctlz_zero_undef_nxv2i8_unmasked( % ret %v } -define @vp_ctlz_zero_undef_nxv4i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i8: +define @vp_ctlz_zero_poison_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 @@ -1301,7 +1301,7 @@ define @vp_ctlz_zero_undef_nxv4i8( %va, @vp_ctlz_zero_undef_nxv4i8( %va, %v } -define @vp_ctlz_zero_undef_nxv4i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i8_unmasked: +define @vp_ctlz_zero_poison_nxv4i8_unmasked( %va, i32 
zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 @@ -1323,7 +1323,7 @@ define @vp_ctlz_zero_undef_nxv4i8_unmasked( % ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i8_unmasked: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv4i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1332,8 +1332,8 @@ define @vp_ctlz_zero_undef_nxv4i8_unmasked( % ret %v } -define @vp_ctlz_zero_undef_nxv8i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i8: +define @vp_ctlz_zero_poison_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 @@ -1345,7 +1345,7 @@ define @vp_ctlz_zero_undef_nxv8i8( %va, @vp_ctlz_zero_undef_nxv8i8( %va, %v } -define @vp_ctlz_zero_undef_nxv8i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i8_unmasked: +define @vp_ctlz_zero_poison_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 @@ -1367,7 +1367,7 @@ define @vp_ctlz_zero_undef_nxv8i8_unmasked( % ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i8_unmasked: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv8i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1376,8 +1376,8 @@ define @vp_ctlz_zero_undef_nxv8i8_unmasked( % ret %v } -define @vp_ctlz_zero_undef_nxv16i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i8: +define @vp_ctlz_zero_poison_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i8: ; 
CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vzext.vf2 v16, v8 @@ -1389,7 +1389,7 @@ define @vp_ctlz_zero_undef_nxv16i8( %va, @vp_ctlz_zero_undef_nxv16i8( %va, %v } -define @vp_ctlz_zero_undef_nxv16i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i8_unmasked: +define @vp_ctlz_zero_poison_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vzext.vf2 v16, v8 @@ -1411,7 +1411,7 @@ define @vp_ctlz_zero_undef_nxv16i8_unmasked( @vp_ctlz_zero_undef_nxv16i8_unmasked( %v } -define @vp_ctlz_zero_undef_nxv32i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i8: +define @vp_ctlz_zero_poison_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 @@ -1445,7 +1445,7 @@ define @vp_ctlz_zero_undef_nxv32i8( %va, @vp_ctlz_zero_undef_nxv32i8( %va, %v } -define @vp_ctlz_zero_undef_nxv32i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i8_unmasked: +define @vp_ctlz_zero_poison_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv32i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 @@ -1479,7 +1479,7 @@ define @vp_ctlz_zero_undef_nxv32i8_unmasked( @vp_ctlz_zero_undef_nxv32i8_unmasked( %v } -define @vp_ctlz_zero_undef_nxv64i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv64i8: +define @vp_ctlz_zero_poison_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 @@ -1513,7 +1513,7 @@ define @vp_ctlz_zero_undef_nxv64i8( %va, @vp_ctlz_zero_undef_nxv64i8( %va, %v } -define 
@vp_ctlz_zero_undef_nxv64i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv64i8_unmasked: +define @vp_ctlz_zero_poison_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv64i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 @@ -1547,7 +1547,7 @@ define @vp_ctlz_zero_undef_nxv64i8_unmasked( @vp_ctlz_zero_undef_nxv64i8_unmasked( %v } -define @vp_ctlz_zero_undef_nxv1i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i16: +define @vp_ctlz_zero_poison_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -1566,7 +1566,7 @@ define @vp_ctlz_zero_undef_nxv1i16( %va, @vp_ctlz_zero_undef_nxv1i16( %va, %v } -define @vp_ctlz_zero_undef_nxv1i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i16_unmasked: +define @vp_ctlz_zero_poison_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -1585,7 +1585,7 @@ define @vp_ctlz_zero_undef_nxv1i16_unmasked( @vp_ctlz_zero_undef_nxv1i16_unmasked( %v } -define @vp_ctlz_zero_undef_nxv2i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i16: +define @vp_ctlz_zero_poison_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -1604,7 +1604,7 @@ define @vp_ctlz_zero_undef_nxv2i16( %va, @vp_ctlz_zero_undef_nxv2i16( %va, %v } -define @vp_ctlz_zero_undef_nxv2i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i16_unmasked: +define @vp_ctlz_zero_poison_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vp_ctlz_zero_poison_nxv2i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -1623,7 +1623,7 @@ define @vp_ctlz_zero_undef_nxv2i16_unmasked( @vp_ctlz_zero_undef_nxv2i16_unmasked( %v } -define @vp_ctlz_zero_undef_nxv4i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i16: +define @vp_ctlz_zero_poison_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -1642,7 +1642,7 @@ define @vp_ctlz_zero_undef_nxv4i16( %va, @vp_ctlz_zero_undef_nxv4i16( %va, %v } -define @vp_ctlz_zero_undef_nxv4i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i16_unmasked: +define @vp_ctlz_zero_poison_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -1661,7 +1661,7 @@ define @vp_ctlz_zero_undef_nxv4i16_unmasked( @vp_ctlz_zero_undef_nxv4i16_unmasked( %v } -define @vp_ctlz_zero_undef_nxv8i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i16: +define @vp_ctlz_zero_poison_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -1680,7 +1680,7 @@ define @vp_ctlz_zero_undef_nxv8i16( %va, @vp_ctlz_zero_undef_nxv8i16( %va, %v } -define @vp_ctlz_zero_undef_nxv8i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i16_unmasked: +define @vp_ctlz_zero_poison_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -1699,7 +1699,7 @@ define @vp_ctlz_zero_undef_nxv8i16_unmasked( 
@vp_ctlz_zero_undef_nxv8i16_unmasked( %v } -define @vp_ctlz_zero_undef_nxv16i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i16: +define @vp_ctlz_zero_poison_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 @@ -1718,7 +1718,7 @@ define @vp_ctlz_zero_undef_nxv16i16( %va, ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i16: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv16i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1727,8 +1727,8 @@ define @vp_ctlz_zero_undef_nxv16i16( %va, ret %v } -define @vp_ctlz_zero_undef_nxv16i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i16_unmasked: +define @vp_ctlz_zero_poison_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 @@ -1737,7 +1737,7 @@ define @vp_ctlz_zero_undef_nxv16i16_unmasked( @vp_ctlz_zero_undef_nxv16i16_unmasked( %v } -define @vp_ctlz_zero_undef_nxv32i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i16: +define @vp_ctlz_zero_poison_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 @@ -1780,7 +1780,7 @@ define @vp_ctlz_zero_undef_nxv32i16( %va, ; CHECK-NEXT: vsrl.vi v8, v8, 8 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv32i16: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv32i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -1789,8 +1789,8 @@ define @vp_ctlz_zero_undef_nxv32i16( %va, ret %v } -define 
@vp_ctlz_zero_undef_nxv32i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv32i16_unmasked: +define @vp_ctlz_zero_poison_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv32i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 @@ -1823,7 +1823,7 @@ define @vp_ctlz_zero_undef_nxv32i16_unmasked( @vp_ctlz_zero_undef_nxv32i16_unmasked( %v } -define @vp_ctlz_zero_undef_nxv1i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i32: +define @vp_ctlz_zero_poison_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -1843,7 +1843,7 @@ define @vp_ctlz_zero_undef_nxv1i32( %va, @vp_ctlz_zero_undef_nxv1i32( %va, %v } -define @vp_ctlz_zero_undef_nxv1i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i32_unmasked: +define @vp_ctlz_zero_poison_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -1863,7 +1863,7 @@ define @vp_ctlz_zero_undef_nxv1i32_unmasked( @vp_ctlz_zero_undef_nxv1i32_unmasked( %v } -define @vp_ctlz_zero_undef_nxv2i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i32: +define @vp_ctlz_zero_poison_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -1883,7 +1883,7 @@ define @vp_ctlz_zero_undef_nxv2i32( %va, @vp_ctlz_zero_undef_nxv2i32( %va, %v } -define @vp_ctlz_zero_undef_nxv2i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i32_unmasked: +define @vp_ctlz_zero_poison_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vp_ctlz_zero_poison_nxv2i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -1903,7 +1903,7 @@ define @vp_ctlz_zero_undef_nxv2i32_unmasked( @vp_ctlz_zero_undef_nxv2i32_unmasked( %v } -define @vp_ctlz_zero_undef_nxv4i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i32: +define @vp_ctlz_zero_poison_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -1923,7 +1923,7 @@ define @vp_ctlz_zero_undef_nxv4i32( %va, @vp_ctlz_zero_undef_nxv4i32( %va, %v } -define @vp_ctlz_zero_undef_nxv4i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i32_unmasked: +define @vp_ctlz_zero_poison_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -1943,7 +1943,7 @@ define @vp_ctlz_zero_undef_nxv4i32_unmasked( @vp_ctlz_zero_undef_nxv4i32_unmasked( %v } -define @vp_ctlz_zero_undef_nxv8i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i32: +define @vp_ctlz_zero_poison_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 @@ -1963,7 +1963,7 @@ define @vp_ctlz_zero_undef_nxv8i32( %va, @vp_ctlz_zero_undef_nxv8i32( %va, %v } -define @vp_ctlz_zero_undef_nxv8i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i32_unmasked: +define @vp_ctlz_zero_poison_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 @@ -1983,7 +1983,7 @@ define @vp_ctlz_zero_undef_nxv8i32_unmasked( 
@vp_ctlz_zero_undef_nxv8i32_unmasked( %v } -define @vp_ctlz_zero_undef_nxv16i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i32: +define @vp_ctlz_zero_poison_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma @@ -2004,7 +2004,7 @@ define @vp_ctlz_zero_undef_nxv16i32( %va, ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv16i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2013,8 +2013,8 @@ define @vp_ctlz_zero_undef_nxv16i32( %va, ret %v } -define @vp_ctlz_zero_undef_nxv16i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i32_unmasked: +define @vp_ctlz_zero_poison_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma @@ -2025,7 +2025,7 @@ define @vp_ctlz_zero_undef_nxv16i32_unmasked( @vp_ctlz_zero_undef_nxv16i32_unmasked( %v } -define @vp_ctlz_zero_undef_nxv1i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i64: +define @vp_ctlz_zero_poison_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma @@ -2047,7 +2047,7 @@ define @vp_ctlz_zero_undef_nxv1i64( %va, @vp_ctlz_zero_undef_nxv1i64( %va, %v } -define @vp_ctlz_zero_undef_nxv1i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked: +define @vp_ctlz_zero_poison_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma @@ -2069,7 +2069,7 @@ define 
@vp_ctlz_zero_undef_nxv1i64_unmasked( @vp_ctlz_zero_undef_nxv1i64_unmasked( %v } -define @vp_ctlz_zero_undef_nxv2i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i64: +define @vp_ctlz_zero_poison_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma @@ -2091,7 +2091,7 @@ define @vp_ctlz_zero_undef_nxv2i64( %va, @vp_ctlz_zero_undef_nxv2i64( %va, %v } -define @vp_ctlz_zero_undef_nxv2i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked: +define @vp_ctlz_zero_poison_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv2i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma @@ -2113,7 +2113,7 @@ define @vp_ctlz_zero_undef_nxv2i64_unmasked( @vp_ctlz_zero_undef_nxv2i64_unmasked( %v } -define @vp_ctlz_zero_undef_nxv4i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i64: +define @vp_ctlz_zero_poison_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma @@ -2135,7 +2135,7 @@ define @vp_ctlz_zero_undef_nxv4i64( %va, @vp_ctlz_zero_undef_nxv4i64( %va, %v } -define @vp_ctlz_zero_undef_nxv4i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked: +define @vp_ctlz_zero_poison_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv4i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma @@ -2157,7 +2157,7 @@ define @vp_ctlz_zero_undef_nxv4i64_unmasked( @vp_ctlz_zero_undef_nxv4i64_unmasked( %v } -define @vp_ctlz_zero_undef_nxv7i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv7i64: +define @vp_ctlz_zero_poison_nxv7i64( %va, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vp_ctlz_zero_poison_nxv7i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -2179,7 +2179,7 @@ define @vp_ctlz_zero_undef_nxv7i64( %va, @vp_ctlz_zero_undef_nxv7i64( %va, %v } -define @vp_ctlz_zero_undef_nxv7i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked: +define @vp_ctlz_zero_poison_nxv7i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv7i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -2201,7 +2201,7 @@ define @vp_ctlz_zero_undef_nxv7i64_unmasked( @vp_ctlz_zero_undef_nxv7i64_unmasked( %v } -define @vp_ctlz_zero_undef_nxv8i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i64: +define @vp_ctlz_zero_poison_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -2223,7 +2223,7 @@ define @vp_ctlz_zero_undef_nxv8i64( %va, @vp_ctlz_zero_undef_nxv8i64( %va, %v } -define @vp_ctlz_zero_undef_nxv8i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked: +define @vp_ctlz_zero_poison_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv8i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -2245,7 +2245,7 @@ define @vp_ctlz_zero_undef_nxv8i64_unmasked( @vp_ctlz_zero_undef_nxv8i64_unmasked( %v } -define @vp_ctlz_zero_undef_nxv16i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64: +define @vp_ctlz_zero_poison_nxv16i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -2270,7 +2270,7 @@ define @vp_ctlz_zero_undef_nxv16i64( %va, ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: ret ; -; 
CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64: +; CHECK-ZVBB-LABEL: vp_ctlz_zero_poison_nxv16i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v8, v8 @@ -2280,8 +2280,8 @@ define @vp_ctlz_zero_undef_nxv16i64( %va, ret %v } -define @vp_ctlz_zero_undef_nxv16i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked: +define @vp_ctlz_zero_poison_nxv16i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv16i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma @@ -2296,7 +2296,7 @@ define @vp_ctlz_zero_undef_nxv16i64_unmasked( @vp_ctlz_nxv1i9( %va, @llvm.vp.ctlz.nxv1i9( %va, i1 false, %m, i32 %evl) ret %v } -define @vp_ctlz_zero_undef_nxv1i9( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i9: +define @vp_ctlz_zero_poison_nxv1i9( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_nxv1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vsll.vi v8, v8, 7 @@ -2344,7 +2344,7 @@ define @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctlo_nxv1i9( %va, @llvm.vp.ctlz.nxv1i9( %va.not, i1 false, %m, i32 %evl) ret %v } -define @vp_ctlo_zero_undef_nxv1i9( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9: +define @vp_ctlo_zero_poison_nxv1i9( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlo_zero_poison_nxv1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 511 ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma @@ -2389,7 +2389,7 @@ define @vp_ctlo_zero_undef_nxv1i9( %va, @vp_ctlo_nxv1i9_vp_xor( %va, %v } -define @vp_ctlo_zero_undef_nxv1i9_vp_xor( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor: +define @vp_ctlo_zero_poison_nxv1i9_vp_xor( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlo_zero_poison_nxv1i9_vp_xor: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 511 ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma @@ -2439,7 
+2439,7 @@ define @vp_ctlo_zero_undef_nxv1i9_vp_xor( %va ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor: +; CHECK-ZVBB-LABEL: vp_ctlo_zero_poison_nxv1i9_vp_xor: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: li a0, 511 ; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll index 79af06db4171e..7a4e948634333 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll @@ -1588,8 +1588,8 @@ define @cttz_nxv8i64( %va) { ret %a } -define @cttz_zero_undef_nxv1i8( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv1i8: +define @cttz_zero_poison_nxv1i8( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv1i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -1609,7 +1609,7 @@ define @cttz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv1i8: +; CHECK-F-LABEL: cttz_zero_poison_nxv1i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -1624,7 +1624,7 @@ define @cttz_zero_undef_nxv1i8( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv1i8: +; CHECK-D-LABEL: cttz_zero_poison_nxv1i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -1639,7 +1639,7 @@ define @cttz_zero_undef_nxv1i8( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv1i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv1i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1648,8 +1648,8 @@ define @cttz_zero_undef_nxv1i8( %va) { ret %a } -define @cttz_zero_undef_nxv2i8( %va) { -; 
CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv2i8: +define @cttz_zero_poison_nxv2i8( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv2i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -1669,7 +1669,7 @@ define @cttz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv2i8: +; CHECK-F-LABEL: cttz_zero_poison_nxv2i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -1684,7 +1684,7 @@ define @cttz_zero_undef_nxv2i8( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv2i8: +; CHECK-D-LABEL: cttz_zero_poison_nxv2i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -1699,7 +1699,7 @@ define @cttz_zero_undef_nxv2i8( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv2i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1708,8 +1708,8 @@ define @cttz_zero_undef_nxv2i8( %va) { ret %a } -define @cttz_zero_undef_nxv4i8( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv4i8: +define @cttz_zero_poison_nxv4i8( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv4i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -1729,7 +1729,7 @@ define @cttz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv4i8: +; CHECK-F-LABEL: cttz_zero_poison_nxv4i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -1744,7 +1744,7 @@ define @cttz_zero_undef_nxv4i8( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; 
-; CHECK-D-LABEL: cttz_zero_undef_nxv4i8: +; CHECK-D-LABEL: cttz_zero_poison_nxv4i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -1759,7 +1759,7 @@ define @cttz_zero_undef_nxv4i8( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv4i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1768,8 +1768,8 @@ define @cttz_zero_undef_nxv4i8( %va) { ret %a } -define @cttz_zero_undef_nxv8i8( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv8i8: +define @cttz_zero_poison_nxv8i8( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv8i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -1789,7 +1789,7 @@ define @cttz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv8i8: +; CHECK-F-LABEL: cttz_zero_poison_nxv8i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -1804,7 +1804,7 @@ define @cttz_zero_undef_nxv8i8( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv8i8: +; CHECK-D-LABEL: cttz_zero_poison_nxv8i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -1819,7 +1819,7 @@ define @cttz_zero_undef_nxv8i8( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv8i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1828,8 +1828,8 @@ define @cttz_zero_undef_nxv8i8( %va) { ret %a } -define @cttz_zero_undef_nxv16i8( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv16i8: +define 
@cttz_zero_poison_nxv16i8( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv16i8: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v10, v8, -1 @@ -1849,7 +1849,7 @@ define @cttz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv16i8: +; CHECK-F-LABEL: cttz_zero_poison_nxv16i8: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 @@ -1864,7 +1864,7 @@ define @cttz_zero_undef_nxv16i8( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv16i8: +; CHECK-D-LABEL: cttz_zero_poison_nxv16i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 @@ -1879,7 +1879,7 @@ define @cttz_zero_undef_nxv16i8( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv16i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1888,8 +1888,8 @@ define @cttz_zero_undef_nxv16i8( %va) { ret %a } -define @cttz_zero_undef_nxv32i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv32i8: +define @cttz_zero_poison_nxv32i8( %va) { +; CHECK-LABEL: cttz_zero_poison_nxv32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vadd.vi v12, v8, -1 @@ -1909,7 +1909,7 @@ define @cttz_zero_undef_nxv32i8( %va) { ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv32i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv32i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1918,8 +1918,8 @@ define @cttz_zero_undef_nxv32i8( %va) { ret %a } -define @cttz_zero_undef_nxv64i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv64i8: +define @cttz_zero_poison_nxv64i8( %va) 
{ +; CHECK-LABEL: cttz_zero_poison_nxv64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v8, -1 @@ -1939,7 +1939,7 @@ define @cttz_zero_undef_nxv64i8( %va) { ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv64i8: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv64i8: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1948,8 +1948,8 @@ define @cttz_zero_undef_nxv64i8( %va) { ret %a } -define @cttz_zero_undef_nxv1i16( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv1i16: +define @cttz_zero_poison_nxv1i16( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv1i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -1976,7 +1976,7 @@ define @cttz_zero_undef_nxv1i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv1i16: +; CHECK-F-LABEL: cttz_zero_poison_nxv1i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -1987,7 +1987,7 @@ define @cttz_zero_undef_nxv1i16( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv1i16: +; CHECK-D-LABEL: cttz_zero_poison_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -1998,7 +1998,7 @@ define @cttz_zero_undef_nxv1i16( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv1i16: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv1i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2007,8 +2007,8 @@ define @cttz_zero_undef_nxv1i16( %va) { ret %a } -define @cttz_zero_undef_nxv2i16( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv2i16: +define @cttz_zero_poison_nxv2i16( %va) { +; 
CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv2i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -2035,7 +2035,7 @@ define @cttz_zero_undef_nxv2i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv2i16: +; CHECK-F-LABEL: cttz_zero_poison_nxv2i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -2046,7 +2046,7 @@ define @cttz_zero_undef_nxv2i16( %va) { ; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv2i16: +; CHECK-D-LABEL: cttz_zero_poison_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -2057,7 +2057,7 @@ define @cttz_zero_undef_nxv2i16( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i16: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv2i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2066,8 +2066,8 @@ define @cttz_zero_undef_nxv2i16( %va) { ret %a } -define @cttz_zero_undef_nxv4i16( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv4i16: +define @cttz_zero_poison_nxv4i16( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv4i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -2094,7 +2094,7 @@ define @cttz_zero_undef_nxv4i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv4i16: +; CHECK-F-LABEL: cttz_zero_poison_nxv4i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -2105,7 +2105,7 @@ define @cttz_zero_undef_nxv4i16( %va) { ; CHECK-F-NEXT: vsub.vx v8, v10, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv4i16: +; CHECK-D-LABEL: 
cttz_zero_poison_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -2116,7 +2116,7 @@ define @cttz_zero_undef_nxv4i16( %va) { ; CHECK-D-NEXT: vsub.vx v8, v10, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i16: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv4i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2125,8 +2125,8 @@ define @cttz_zero_undef_nxv4i16( %va) { ret %a } -define @cttz_zero_undef_nxv8i16( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv8i16: +define @cttz_zero_poison_nxv8i16( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv8i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v10, v8, -1 @@ -2153,7 +2153,7 @@ define @cttz_zero_undef_nxv8i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv8i16: +; CHECK-F-LABEL: cttz_zero_poison_nxv8i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 @@ -2164,7 +2164,7 @@ define @cttz_zero_undef_nxv8i16( %va) { ; CHECK-F-NEXT: vsub.vx v8, v12, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv8i16: +; CHECK-D-LABEL: cttz_zero_poison_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 @@ -2175,7 +2175,7 @@ define @cttz_zero_undef_nxv8i16( %va) { ; CHECK-D-NEXT: vsub.vx v8, v12, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i16: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv8i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2184,8 +2184,8 @@ define @cttz_zero_undef_nxv8i16( %va) { ret %a } -define @cttz_zero_undef_nxv16i16( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv16i16: +define @cttz_zero_poison_nxv16i16( %va) { +; 
CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv16i16: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v12, v8, -1 @@ -2212,7 +2212,7 @@ define @cttz_zero_undef_nxv16i16( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 8 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv16i16: +; CHECK-F-LABEL: cttz_zero_poison_nxv16i16: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 @@ -2223,7 +2223,7 @@ define @cttz_zero_undef_nxv16i16( %va) { ; CHECK-F-NEXT: vsub.vx v8, v16, a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv16i16: +; CHECK-D-LABEL: cttz_zero_poison_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 @@ -2234,7 +2234,7 @@ define @cttz_zero_undef_nxv16i16( %va) { ; CHECK-D-NEXT: vsub.vx v8, v16, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i16: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv16i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2243,8 +2243,8 @@ define @cttz_zero_undef_nxv16i16( %va) { ret %a } -define @cttz_zero_undef_nxv32i16( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv32i16: +define @cttz_zero_poison_nxv32i16( %va) { +; CHECK-LABEL: cttz_zero_poison_nxv32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v8, -1 @@ -2271,7 +2271,7 @@ define @cttz_zero_undef_nxv32i16( %va) { ; CHECK-NEXT: vsrl.vi v8, v8, 8 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv32i16: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv32i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2280,8 +2280,8 @@ define @cttz_zero_undef_nxv32i16( %va) { ret %a } -define @cttz_zero_undef_nxv1i32( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv1i32: +define @cttz_zero_poison_nxv1i32( %va) { +; 
CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv1i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -2309,7 +2309,7 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv1i32: +; CHECK-F-LABEL: cttz_zero_poison_nxv1i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -2322,7 +2322,7 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv1i32: +; CHECK-D-LABEL: cttz_zero_poison_nxv1i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -2334,7 +2334,7 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv1i32: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv1i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2343,8 +2343,8 @@ define @cttz_zero_undef_nxv1i32( %va) { ret %a } -define @cttz_zero_undef_nxv2i32( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv2i32: +define @cttz_zero_poison_nxv2i32( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv2i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v9, v8, -1 @@ -2372,7 +2372,7 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv2i32: +; CHECK-F-LABEL: cttz_zero_poison_nxv2i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -2385,7 +2385,7 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv2i32: +; CHECK-D-LABEL: cttz_zero_poison_nxv2i32: 
; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -2397,7 +2397,7 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-D-NEXT: vsub.vx v8, v10, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i32: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv2i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2406,8 +2406,8 @@ define @cttz_zero_undef_nxv2i32( %va) { ret %a } -define @cttz_zero_undef_nxv4i32( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv4i32: +define @cttz_zero_poison_nxv4i32( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv4i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v10, v8, -1 @@ -2435,7 +2435,7 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv4i32: +; CHECK-F-LABEL: cttz_zero_poison_nxv4i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 @@ -2448,7 +2448,7 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv4i32: +; CHECK-D-LABEL: cttz_zero_poison_nxv4i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 @@ -2460,7 +2460,7 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-D-NEXT: vsub.vx v8, v12, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i32: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv4i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2469,8 +2469,8 @@ define @cttz_zero_undef_nxv4i32( %va) { ret %a } -define @cttz_zero_undef_nxv8i32( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv8i32: +define @cttz_zero_poison_nxv8i32( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv8i32: ; 
CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v12, v8, -1 @@ -2498,7 +2498,7 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv8i32: +; CHECK-F-LABEL: cttz_zero_poison_nxv8i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 @@ -2511,7 +2511,7 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv8i32: +; CHECK-D-LABEL: cttz_zero_poison_nxv8i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 @@ -2523,7 +2523,7 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-D-NEXT: vsub.vx v8, v16, a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i32: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv8i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2532,8 +2532,8 @@ define @cttz_zero_undef_nxv8i32( %va) { ret %a } -define @cttz_zero_undef_nxv16i32( %va) { -; CHECK-ZVE64X-LABEL: cttz_zero_undef_nxv16i32: +define @cttz_zero_poison_nxv16i32( %va) { +; CHECK-ZVE64X-LABEL: cttz_zero_poison_nxv16i32: ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVE64X-NEXT: vadd.vi v16, v8, -1 @@ -2561,7 +2561,7 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 24 ; CHECK-ZVE64X-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv16i32: +; CHECK-F-LABEL: cttz_zero_poison_nxv16i32: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 @@ -2574,7 +2574,7 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv16i32: +; CHECK-D-LABEL: cttz_zero_poison_nxv16i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: 
vsetvli a0, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 @@ -2587,7 +2587,7 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i32: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv16i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2596,8 +2596,8 @@ define @cttz_zero_undef_nxv16i32( %va) { ret %a } -define @cttz_zero_undef_nxv1i64( %va) { -; RV32I-LABEL: cttz_zero_undef_nxv1i64: +define @cttz_zero_poison_nxv1i64( %va) { +; RV32I-LABEL: cttz_zero_poison_nxv1i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32I-NEXT: vadd.vi v9, v8, -1 @@ -2638,7 +2638,7 @@ define @cttz_zero_undef_nxv1i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: cttz_zero_undef_nxv1i64: +; RV64I-LABEL: cttz_zero_poison_nxv1i64: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, 349525 ; RV64I-NEXT: lui a1, 209715 @@ -2675,7 +2675,7 @@ define @cttz_zero_undef_nxv1i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv1i64: +; CHECK-F-LABEL: cttz_zero_poison_nxv1i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 @@ -2689,7 +2689,7 @@ define @cttz_zero_undef_nxv1i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv1i64: +; CHECK-D-LABEL: cttz_zero_poison_nxv1i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 @@ -2703,7 +2703,7 @@ define @cttz_zero_undef_nxv1i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv1i64: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv1i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2712,8 +2712,8 @@ define @cttz_zero_undef_nxv1i64( %va) { ret %a } -define 
@cttz_zero_undef_nxv2i64( %va) { -; RV32I-LABEL: cttz_zero_undef_nxv2i64: +define @cttz_zero_poison_nxv2i64( %va) { +; RV32I-LABEL: cttz_zero_poison_nxv2i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32I-NEXT: vadd.vi v10, v8, -1 @@ -2754,7 +2754,7 @@ define @cttz_zero_undef_nxv2i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: cttz_zero_undef_nxv2i64: +; RV64I-LABEL: cttz_zero_poison_nxv2i64: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, 349525 ; RV64I-NEXT: lui a1, 209715 @@ -2791,7 +2791,7 @@ define @cttz_zero_undef_nxv2i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv2i64: +; CHECK-F-LABEL: cttz_zero_poison_nxv2i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 @@ -2805,7 +2805,7 @@ define @cttz_zero_undef_nxv2i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv2i64: +; CHECK-D-LABEL: cttz_zero_poison_nxv2i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 @@ -2819,7 +2819,7 @@ define @cttz_zero_undef_nxv2i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i64: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv2i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2828,8 +2828,8 @@ define @cttz_zero_undef_nxv2i64( %va) { ret %a } -define @cttz_zero_undef_nxv4i64( %va) { -; RV32I-LABEL: cttz_zero_undef_nxv4i64: +define @cttz_zero_poison_nxv4i64( %va) { +; RV32I-LABEL: cttz_zero_poison_nxv4i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32I-NEXT: vadd.vi v12, v8, -1 @@ -2870,7 +2870,7 @@ define @cttz_zero_undef_nxv4i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: cttz_zero_undef_nxv4i64: +; RV64I-LABEL: cttz_zero_poison_nxv4i64: ; RV64I: # 
%bb.0: ; RV64I-NEXT: lui a0, 349525 ; RV64I-NEXT: lui a1, 209715 @@ -2907,7 +2907,7 @@ define @cttz_zero_undef_nxv4i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv4i64: +; CHECK-F-LABEL: cttz_zero_poison_nxv4i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 @@ -2921,7 +2921,7 @@ define @cttz_zero_undef_nxv4i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv4i64: +; CHECK-D-LABEL: cttz_zero_poison_nxv4i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 @@ -2935,7 +2935,7 @@ define @cttz_zero_undef_nxv4i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i64: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv4i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2944,8 +2944,8 @@ define @cttz_zero_undef_nxv4i64( %va) { ret %a } -define @cttz_zero_undef_nxv8i64( %va) { -; RV32I-LABEL: cttz_zero_undef_nxv8i64: +define @cttz_zero_poison_nxv8i64( %va) { +; RV32I-LABEL: cttz_zero_poison_nxv8i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32I-NEXT: vadd.vi v16, v8, -1 @@ -2986,7 +2986,7 @@ define @cttz_zero_undef_nxv8i64( %va) { ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret ; -; RV64I-LABEL: cttz_zero_undef_nxv8i64: +; RV64I-LABEL: cttz_zero_poison_nxv8i64: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, 349525 ; RV64I-NEXT: lui a1, 209715 @@ -3023,7 +3023,7 @@ define @cttz_zero_undef_nxv8i64( %va) { ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret ; -; CHECK-F-LABEL: cttz_zero_undef_nxv8i64: +; CHECK-F-LABEL: cttz_zero_poison_nxv8i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 @@ -3037,7 +3037,7 @@ define @cttz_zero_undef_nxv8i64( %va) { ; CHECK-F-NEXT: fsrm a0 ; 
CHECK-F-NEXT: ret ; -; CHECK-D-LABEL: cttz_zero_undef_nxv8i64: +; CHECK-D-LABEL: cttz_zero_poison_nxv8i64: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 @@ -3051,7 +3051,7 @@ define @cttz_zero_undef_nxv8i64( %va) { ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; -; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i64: +; CHECK-ZVBB-LABEL: cttz_zero_poison_nxv8i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 9b2f220dd802e..fe371817bf494 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -1306,8 +1306,8 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ret %v } -define @vp_cttz_zero_undef_nxv1i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i8: +define @vp_cttz_zero_poison_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1322,7 +1322,7 @@ define @vp_cttz_zero_undef_nxv1i8( %va, @vp_cttz_zero_undef_nxv1i8( %va, %v } -define @vp_cttz_zero_undef_nxv1i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i8_unmasked: +define @vp_cttz_zero_poison_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1347,7 +1347,7 @@ define @vp_cttz_zero_undef_nxv1i8_unmasked( % ; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i8_unmasked: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv1i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1356,8 +1356,8 @@ define @vp_cttz_zero_undef_nxv1i8_unmasked( % ret 
%v } -define @vp_cttz_zero_undef_nxv2i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i8: +define @vp_cttz_zero_poison_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1372,7 +1372,7 @@ define @vp_cttz_zero_undef_nxv2i8( %va, @vp_cttz_zero_undef_nxv2i8( %va, %v } -define @vp_cttz_zero_undef_nxv2i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i8_unmasked: +define @vp_cttz_zero_poison_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1397,7 +1397,7 @@ define @vp_cttz_zero_undef_nxv2i8_unmasked( % ; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i8_unmasked: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv2i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1406,8 +1406,8 @@ define @vp_cttz_zero_undef_nxv2i8_unmasked( % ret %v } -define @vp_cttz_zero_undef_nxv4i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i8: +define @vp_cttz_zero_poison_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1422,7 +1422,7 @@ define @vp_cttz_zero_undef_nxv4i8( %va, @vp_cttz_zero_undef_nxv4i8( %va, %v } -define @vp_cttz_zero_undef_nxv4i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i8_unmasked: +define @vp_cttz_zero_poison_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1447,7 +1447,7 @@ define 
@vp_cttz_zero_undef_nxv4i8_unmasked( % ; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i8_unmasked: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv4i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1456,8 +1456,8 @@ define @vp_cttz_zero_undef_nxv4i8_unmasked( % ret %v } -define @vp_cttz_zero_undef_nxv8i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i8: +define @vp_cttz_zero_poison_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1472,7 +1472,7 @@ define @vp_cttz_zero_undef_nxv8i8( %va, @vp_cttz_zero_undef_nxv8i8( %va, %v } -define @vp_cttz_zero_undef_nxv8i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i8_unmasked: +define @vp_cttz_zero_poison_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1497,7 +1497,7 @@ define @vp_cttz_zero_undef_nxv8i8_unmasked( % ; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i8_unmasked: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv8i8_unmasked: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1506,8 +1506,8 @@ define @vp_cttz_zero_undef_nxv8i8_unmasked( % ret %v } -define @vp_cttz_zero_undef_nxv16i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i8: +define @vp_cttz_zero_poison_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1522,7 +1522,7 @@ define @vp_cttz_zero_undef_nxv16i8( %va, @vp_cttz_zero_undef_nxv16i8( %va, %v } 
-define @vp_cttz_zero_undef_nxv16i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i8_unmasked: +define @vp_cttz_zero_poison_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1547,7 +1547,7 @@ define @vp_cttz_zero_undef_nxv16i8_unmasked( @vp_cttz_zero_undef_nxv16i8_unmasked( %v } -define @vp_cttz_zero_undef_nxv32i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv32i8: +define @vp_cttz_zero_poison_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vadd.vi v12, v8, -1 @@ -1577,7 +1577,7 @@ define @vp_cttz_zero_undef_nxv32i8( %va, @vp_cttz_zero_undef_nxv32i8( %va, %v } -define @vp_cttz_zero_undef_nxv32i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv32i8_unmasked: +define @vp_cttz_zero_poison_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv32i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vadd.vi v12, v8, -1 @@ -1607,7 +1607,7 @@ define @vp_cttz_zero_undef_nxv32i8_unmasked( @vp_cttz_zero_undef_nxv32i8_unmasked( %v } -define @vp_cttz_zero_undef_nxv64i8( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv64i8: +define @vp_cttz_zero_poison_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v8, -1 @@ -1637,7 +1637,7 @@ define @vp_cttz_zero_undef_nxv64i8( %va, @vp_cttz_zero_undef_nxv64i8( %va, %v } -define @vp_cttz_zero_undef_nxv64i8_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv64i8_unmasked: +define @vp_cttz_zero_poison_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vp_cttz_zero_poison_nxv64i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v8, -1 @@ -1667,7 +1667,7 @@ define @vp_cttz_zero_undef_nxv64i8_unmasked( @vp_cttz_zero_undef_nxv64i8_unmasked( %v } -define @vp_cttz_zero_undef_nxv1i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i16: +define @vp_cttz_zero_poison_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1688,7 +1688,7 @@ define @vp_cttz_zero_undef_nxv1i16( %va, @vp_cttz_zero_undef_nxv1i16( %va, %v } -define @vp_cttz_zero_undef_nxv1i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i16_unmasked: +define @vp_cttz_zero_poison_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1709,7 +1709,7 @@ define @vp_cttz_zero_undef_nxv1i16_unmasked( @vp_cttz_zero_undef_nxv1i16_unmasked( %v } -define @vp_cttz_zero_undef_nxv2i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i16: +define @vp_cttz_zero_poison_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1730,7 +1730,7 @@ define @vp_cttz_zero_undef_nxv2i16( %va, @vp_cttz_zero_undef_nxv2i16( %va, %v } -define @vp_cttz_zero_undef_nxv2i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i16_unmasked: +define @vp_cttz_zero_poison_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1751,7 +1751,7 @@ define @vp_cttz_zero_undef_nxv2i16_unmasked( 
@vp_cttz_zero_undef_nxv2i16_unmasked( %v } -define @vp_cttz_zero_undef_nxv4i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i16: +define @vp_cttz_zero_poison_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1772,7 +1772,7 @@ define @vp_cttz_zero_undef_nxv4i16( %va, @vp_cttz_zero_undef_nxv4i16( %va, %v } -define @vp_cttz_zero_undef_nxv4i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i16_unmasked: +define @vp_cttz_zero_poison_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1793,7 +1793,7 @@ define @vp_cttz_zero_undef_nxv4i16_unmasked( @vp_cttz_zero_undef_nxv4i16_unmasked( %v } -define @vp_cttz_zero_undef_nxv8i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i16: +define @vp_cttz_zero_poison_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1814,7 +1814,7 @@ define @vp_cttz_zero_undef_nxv8i16( %va, @vp_cttz_zero_undef_nxv8i16( %va, %v } -define @vp_cttz_zero_undef_nxv8i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i16_unmasked: +define @vp_cttz_zero_poison_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1835,7 +1835,7 @@ define @vp_cttz_zero_undef_nxv8i16_unmasked( @vp_cttz_zero_undef_nxv8i16_unmasked( %v } -define @vp_cttz_zero_undef_nxv16i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i16: +define @vp_cttz_zero_poison_nxv16i16( %va, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vp_cttz_zero_poison_nxv16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -1856,7 +1856,7 @@ define @vp_cttz_zero_undef_nxv16i16( %va, ; CHECK-NEXT: vsub.vx v8, v16, a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i16: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv16i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1865,8 +1865,8 @@ define @vp_cttz_zero_undef_nxv16i16( %va, ret %v } -define @vp_cttz_zero_undef_nxv16i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i16_unmasked: +define @vp_cttz_zero_poison_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -1877,7 +1877,7 @@ define @vp_cttz_zero_undef_nxv16i16_unmasked( @vp_cttz_zero_undef_nxv16i16_unmasked( %v } -define @vp_cttz_zero_undef_nxv32i16( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv32i16: +define @vp_cttz_zero_poison_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v8, -1 @@ -1914,7 +1914,7 @@ define @vp_cttz_zero_undef_nxv32i16( %va, ; CHECK-NEXT: vsrl.vi v8, v8, 8 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv32i16: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv32i16: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -1923,8 +1923,8 @@ define @vp_cttz_zero_undef_nxv32i16( %va, ret %v } -define @vp_cttz_zero_undef_nxv32i16_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv32i16_unmasked: +define @vp_cttz_zero_poison_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv32i16_unmasked: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v8, -1 @@ -1951,7 +1951,7 @@ define @vp_cttz_zero_undef_nxv32i16_unmasked( @vp_cttz_zero_undef_nxv32i16_unmasked( %v } -define @vp_cttz_zero_undef_nxv1i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i32: +define @vp_cttz_zero_poison_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1973,7 +1973,7 @@ define @vp_cttz_zero_undef_nxv1i32( %va, @vp_cttz_zero_undef_nxv1i32( %va, %v } -define @vp_cttz_zero_undef_nxv1i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i32_unmasked: +define @vp_cttz_zero_poison_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1995,7 +1995,7 @@ define @vp_cttz_zero_undef_nxv1i32_unmasked( @vp_cttz_zero_undef_nxv1i32_unmasked( %v } -define @vp_cttz_zero_undef_nxv2i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i32: +define @vp_cttz_zero_poison_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -2017,7 +2017,7 @@ define @vp_cttz_zero_undef_nxv2i32( %va, @vp_cttz_zero_undef_nxv2i32( %va, %v } -define @vp_cttz_zero_undef_nxv2i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i32_unmasked: +define @vp_cttz_zero_poison_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -2039,7 +2039,7 @@ define @vp_cttz_zero_undef_nxv2i32_unmasked( @vp_cttz_zero_undef_nxv2i32_unmasked( %v } -define 
@vp_cttz_zero_undef_nxv4i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i32: +define @vp_cttz_zero_poison_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -2061,7 +2061,7 @@ define @vp_cttz_zero_undef_nxv4i32( %va, @vp_cttz_zero_undef_nxv4i32( %va, %v } -define @vp_cttz_zero_undef_nxv4i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i32_unmasked: +define @vp_cttz_zero_poison_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -2083,7 +2083,7 @@ define @vp_cttz_zero_undef_nxv4i32_unmasked( @vp_cttz_zero_undef_nxv4i32_unmasked( %v } -define @vp_cttz_zero_undef_nxv8i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i32: +define @vp_cttz_zero_poison_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -2105,7 +2105,7 @@ define @vp_cttz_zero_undef_nxv8i32( %va, @vp_cttz_zero_undef_nxv8i32( %va, %v } -define @vp_cttz_zero_undef_nxv8i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i32_unmasked: +define @vp_cttz_zero_poison_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -2127,7 +2127,7 @@ define @vp_cttz_zero_undef_nxv8i32_unmasked( @vp_cttz_zero_undef_nxv8i32_unmasked( %v } -define @vp_cttz_zero_undef_nxv16i32( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i32: +define @vp_cttz_zero_poison_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i32: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -2150,7 +2150,7 @@ define @vp_cttz_zero_undef_nxv16i32( %va, ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i32: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv16i32: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2159,8 +2159,8 @@ define @vp_cttz_zero_undef_nxv16i32( %va, ret %v } -define @vp_cttz_zero_undef_nxv16i32_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i32_unmasked: +define @vp_cttz_zero_poison_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -2173,7 +2173,7 @@ define @vp_cttz_zero_undef_nxv16i32_unmasked( @vp_cttz_zero_undef_nxv16i32_unmasked( %v } -define @vp_cttz_zero_undef_nxv1i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i64: +define @vp_cttz_zero_poison_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -2197,7 +2197,7 @@ define @vp_cttz_zero_undef_nxv1i64( %va, @vp_cttz_zero_undef_nxv1i64( %va, %v } -define @vp_cttz_zero_undef_nxv1i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv1i64_unmasked: +define @vp_cttz_zero_poison_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv1i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -2221,7 +2221,7 @@ define @vp_cttz_zero_undef_nxv1i64_unmasked( @vp_cttz_zero_undef_nxv1i64_unmasked( %v } -define @vp_cttz_zero_undef_nxv2i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i64: +define @vp_cttz_zero_poison_nxv2i64( %va, %m, i32 zeroext 
%evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -2245,7 +2245,7 @@ define @vp_cttz_zero_undef_nxv2i64( %va, @vp_cttz_zero_undef_nxv2i64( %va, %v } -define @vp_cttz_zero_undef_nxv2i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv2i64_unmasked: +define @vp_cttz_zero_poison_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv2i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -2269,7 +2269,7 @@ define @vp_cttz_zero_undef_nxv2i64_unmasked( @vp_cttz_zero_undef_nxv2i64_unmasked( %v } -define @vp_cttz_zero_undef_nxv4i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i64: +define @vp_cttz_zero_poison_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -2293,7 +2293,7 @@ define @vp_cttz_zero_undef_nxv4i64( %va, @vp_cttz_zero_undef_nxv4i64( %va, %v } -define @vp_cttz_zero_undef_nxv4i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv4i64_unmasked: +define @vp_cttz_zero_poison_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv4i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -2317,7 +2317,7 @@ define @vp_cttz_zero_undef_nxv4i64_unmasked( @vp_cttz_zero_undef_nxv4i64_unmasked( %v } -define @vp_cttz_zero_undef_nxv7i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv7i64: +define @vp_cttz_zero_poison_nxv7i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv7i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -2341,7 +2341,7 @@ define @vp_cttz_zero_undef_nxv7i64( %va, 
@vp_cttz_zero_undef_nxv7i64( %va, %v } -define @vp_cttz_zero_undef_nxv7i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv7i64_unmasked: +define @vp_cttz_zero_poison_nxv7i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv7i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -2365,7 +2365,7 @@ define @vp_cttz_zero_undef_nxv7i64_unmasked( @vp_cttz_zero_undef_nxv7i64_unmasked( %v } -define @vp_cttz_zero_undef_nxv8i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i64: +define @vp_cttz_zero_poison_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -2389,7 +2389,7 @@ define @vp_cttz_zero_undef_nxv8i64( %va, @vp_cttz_zero_undef_nxv8i64( %va, %v } -define @vp_cttz_zero_undef_nxv8i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv8i64_unmasked: +define @vp_cttz_zero_poison_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv8i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -2413,7 +2413,7 @@ define @vp_cttz_zero_undef_nxv8i64_unmasked( @vp_cttz_zero_undef_nxv8i64_unmasked( %v } -define @vp_cttz_zero_undef_nxv16i64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64: +define @vp_cttz_zero_poison_nxv16i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v24, v8, 0 @@ -2442,7 +2442,7 @@ define @vp_cttz_zero_undef_nxv16i64( %va, ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: ret ; -; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i64: +; CHECK-ZVBB-LABEL: vp_cttz_zero_poison_nxv16i64: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; 
CHECK-ZVBB-NEXT: vctz.v v8, v8 @@ -2452,8 +2452,8 @@ define @vp_cttz_zero_undef_nxv16i64( %va, ret %v } -define @vp_cttz_zero_undef_nxv16i64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked: +define @vp_cttz_zero_poison_nxv16i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_nxv16i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v24, v8, 0 @@ -2472,7 +2472,7 @@ define @vp_cttz_zero_undef_nxv16i64_unmasked( @vp_cttz_nxv1i9( %va, @llvm.vp.cttz.nxv1i9( %va, i1 false, %m, i32 %evl) ret %v } -define @vp_zero_undef_cttz_nxv1i9( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_zero_undef_cttz_nxv1i9: +define @vp_zero_poison_cttz_nxv1i9( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_zero_poison_cttz_nxv1i9: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -2519,7 +2519,7 @@ define @vp_zero_undef_cttz_nxv1i9( %va, @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i8: +define <2 x i8> @vp_ctlz_zero_poison_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -636,8 +636,8 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext ret <2 x i8> %v } -define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i8_unmasked: +define <2 x i8> @vp_ctlz_zero_poison_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -652,8 +652,8 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> 
%va, i32 zeroext %evl ret <2 x i8> %v } -define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i8: +define <4 x i8> @vp_ctlz_zero_poison_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -668,8 +668,8 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext ret <4 x i8> %v } -define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i8_unmasked: +define <4 x i8> @vp_ctlz_zero_poison_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 @@ -684,8 +684,8 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl ret <4 x i8> %v } -define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i8: +define <8 x i8> @vp_ctlz_zero_poison_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 @@ -700,8 +700,8 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext ret <8 x i8> %v } -define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i8_unmasked: +define <8 x i8> @vp_ctlz_zero_poison_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 @@ -716,8 +716,8 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl ret <8 x i8> %v } -define <16 x i8> 
@vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i8: +define <16 x i8> @vp_ctlz_zero_poison_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 @@ -732,8 +732,8 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero ret <16 x i8> %v } -define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i8_unmasked: +define <16 x i8> @vp_ctlz_zero_poison_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 @@ -748,8 +748,8 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext % ret <16 x i8> %v } -define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i16: +define <2 x i16> @vp_ctlz_zero_poison_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -761,8 +761,8 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe ret <2 x i16> %v } -define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i16_unmasked: +define <2 x i16> @vp_ctlz_zero_poison_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -774,8 +774,8 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext % ret <2 x i16> %v } -define <4 x i16> 
@vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i16: +define <4 x i16> @vp_ctlz_zero_poison_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -787,8 +787,8 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe ret <4 x i16> %v } -define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i16_unmasked: +define <4 x i16> @vp_ctlz_zero_poison_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -800,8 +800,8 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext % ret <4 x i16> %v } -define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i16: +define <8 x i16> @vp_ctlz_zero_poison_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -813,8 +813,8 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe ret <8 x i16> %v } -define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i16_unmasked: +define <8 x i16> @vp_ctlz_zero_poison_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -826,8 +826,8 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext % ret <8 x i16> %v } -define <16 x i16> 
@vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i16: +define <16 x i16> @vp_ctlz_zero_poison_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -839,8 +839,8 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z ret <16 x i16> %v } -define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i16_unmasked: +define <16 x i16> @vp_ctlz_zero_poison_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -852,8 +852,8 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex ret <16 x i16> %v } -define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i32: +define <2 x i32> @vp_ctlz_zero_poison_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -866,8 +866,8 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe ret <2 x i32> %v } -define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i32_unmasked: +define <2 x i32> @vp_ctlz_zero_poison_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 @@ -880,8 +880,8 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext % ret <2 x i32> %v } -define 
<4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i32: +define <4 x i32> @vp_ctlz_zero_poison_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -894,8 +894,8 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe ret <4 x i32> %v } -define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i32_unmasked: +define <4 x i32> @vp_ctlz_zero_poison_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 @@ -908,8 +908,8 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext % ret <4 x i32> %v } -define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i32: +define <8 x i32> @vp_ctlz_zero_poison_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -922,8 +922,8 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe ret <8 x i32> %v } -define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i32_unmasked: +define <8 x i32> @vp_ctlz_zero_poison_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 @@ -936,8 +936,8 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext % ret <8 x i32> %v } -define <16 x i32> 
@vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i32: +define <16 x i32> @vp_ctlz_zero_poison_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 @@ -950,8 +950,8 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z ret <16 x i32> %v } -define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i32_unmasked: +define <16 x i32> @vp_ctlz_zero_poison_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 @@ -964,8 +964,8 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex ret <16 x i32> %v } -define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i64: +define <2 x i64> @vp_ctlz_zero_poison_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -980,8 +980,8 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ret <2 x i64> %v } -define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v2i64_unmasked: +define <2 x i64> @vp_ctlz_zero_poison_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v2i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -996,8 +996,8 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ret <2 x i64> %v } -define <4 x i64> 
@vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i64: +define <4 x i64> @vp_ctlz_zero_poison_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -1012,8 +1012,8 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ret <4 x i64> %v } -define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v4i64_unmasked: +define <4 x i64> @vp_ctlz_zero_poison_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v4i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -1028,8 +1028,8 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ret <4 x i64> %v } -define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i64: +define <8 x i64> @vp_ctlz_zero_poison_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -1044,8 +1044,8 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ret <8 x i64> %v } -define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v8i64_unmasked: +define <8 x i64> @vp_ctlz_zero_poison_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v8i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -1060,8 +1060,8 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ret <8 x i64> %v } -define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, 
<15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v15i64: +define <15 x i64> @vp_ctlz_zero_poison_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v15i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma @@ -1076,8 +1076,8 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ret <15 x i64> %v } -define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v15i64_unmasked: +define <15 x i64> @vp_ctlz_zero_poison_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v15i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma @@ -1092,8 +1092,8 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ret <15 x i64> %v } -define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i64: +define <16 x i64> @vp_ctlz_zero_poison_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma @@ -1108,8 +1108,8 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ret <16 x i64> %v } -define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v16i64_unmasked: +define <16 x i64> @vp_ctlz_zero_poison_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v16i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma @@ -1124,8 +1124,8 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ret <16 x i64> %v } -define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, 
<32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v32i64: +define <32 x i64> @vp_ctlz_zero_poison_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v32i64: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma @@ -1143,8 +1143,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ret <32 x i64> %v } -define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ctlz_zero_undef_v32i64_unmasked: +define <32 x i64> @vp_ctlz_zero_poison_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ctlz_zero_poison_v32i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index 02e1ec8da49fe..6340f0b71c563 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -778,8 +778,8 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { -; RVI-LABEL: ctlz_zero_undef_v16i8: +define void @ctlz_zero_poison_v16i8(ptr %x, ptr %y) nounwind { +; RVI-LABEL: ctlz_zero_poison_v16i8: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVI-NEXT: vle8.v v8, (a0) @@ -805,7 +805,7 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse8.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v16i8: +; RVF-LABEL: ctlz_zero_poison_v16i8: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle8.v v8, (a0) @@ -819,7 +819,7 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse8.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v16i8: +; RVD-LABEL: ctlz_zero_poison_v16i8: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 
16, e16, m2, ta, ma ; RVD-NEXT: vle8.v v8, (a0) @@ -833,7 +833,7 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse8.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v16i8: +; ZVBB-LABEL: ctlz_zero_poison_v16i8: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; ZVBB-NEXT: vle8.v v8, (a0) @@ -847,8 +847,8 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { -; RVI-LABEL: ctlz_zero_undef_v8i16: +define void @ctlz_zero_poison_v8i16(ptr %x, ptr %y) nounwind { +; RVI-LABEL: ctlz_zero_poison_v8i16: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVI-NEXT: vle16.v v8, (a0) @@ -883,7 +883,7 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse16.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v8i16: +; RVF-LABEL: ctlz_zero_poison_v8i16: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVF-NEXT: vle16.v v10, (a0) @@ -894,7 +894,7 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse16.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v8i16: +; RVD-LABEL: ctlz_zero_poison_v8i16: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVD-NEXT: vle16.v v10, (a0) @@ -905,7 +905,7 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse16.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v8i16: +; ZVBB-LABEL: ctlz_zero_poison_v8i16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVBB-NEXT: vle16.v v8, (a0) @@ -919,8 +919,8 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { -; RVI-LABEL: ctlz_zero_undef_v4i32: +define void @ctlz_zero_poison_v4i32(ptr %x, ptr %y) nounwind { +; RVI-LABEL: ctlz_zero_poison_v4i32: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVI-NEXT: 
vle32.v v8, (a0) @@ -958,7 +958,7 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse32.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v4i32: +; RVF-LABEL: ctlz_zero_poison_v4i32: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle32.v v8, (a0) @@ -971,7 +971,7 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse32.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v4i32: +; RVD-LABEL: ctlz_zero_poison_v4i32: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVD-NEXT: vle32.v v10, (a0) @@ -983,7 +983,7 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse32.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v4i32: +; ZVBB-LABEL: ctlz_zero_poison_v4i32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVBB-NEXT: vle32.v v8, (a0) @@ -997,8 +997,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { -; RV32I-LABEL: ctlz_zero_undef_v2i64: +define void @ctlz_zero_poison_v2i64(ptr %x, ptr %y) nounwind { +; RV32I-LABEL: ctlz_zero_poison_v2i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) @@ -1052,7 +1052,7 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vse64.v v8, (a0) ; RV32I-NEXT: ret ; -; RV64I-LABEL: ctlz_zero_undef_v2i64: +; RV64I-LABEL: ctlz_zero_poison_v2i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) @@ -1102,7 +1102,7 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v2i64: +; RVF-LABEL: ctlz_zero_poison_v2i64: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RVF-NEXT: vle64.v v8, (a0) @@ -1116,7 +1116,7 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind 
{ ; RVF-NEXT: vse64.v v10, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v2i64: +; RVD-LABEL: ctlz_zero_poison_v2i64: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVD-NEXT: vle64.v v8, (a0) @@ -1130,7 +1130,7 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse64.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v2i64: +; ZVBB-LABEL: ctlz_zero_poison_v2i64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; ZVBB-NEXT: vle64.v v8, (a0) @@ -1144,8 +1144,8 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { -; RVI-LABEL: ctlz_zero_undef_v32i8: +define void @ctlz_zero_poison_v32i8(ptr %x, ptr %y) nounwind { +; RVI-LABEL: ctlz_zero_poison_v32i8: ; RVI: # %bb.0: ; RVI-NEXT: li a1, 32 ; RVI-NEXT: vsetvli zero, a1, e8, m2, ta, ma @@ -1172,7 +1172,7 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse8.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v32i8: +; RVF-LABEL: ctlz_zero_poison_v32i8: ; RVF: # %bb.0: ; RVF-NEXT: li a1, 32 ; RVF-NEXT: vsetvli zero, a1, e16, m4, ta, ma @@ -1187,7 +1187,7 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse8.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v32i8: +; RVD-LABEL: ctlz_zero_poison_v32i8: ; RVD: # %bb.0: ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vsetvli zero, a1, e16, m4, ta, ma @@ -1202,7 +1202,7 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse8.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v32i8: +; ZVBB-LABEL: ctlz_zero_poison_v32i8: ; ZVBB: # %bb.0: ; ZVBB-NEXT: li a1, 32 ; ZVBB-NEXT: vsetvli zero, a1, e8, m2, ta, ma @@ -1217,8 +1217,8 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { -; RVI-LABEL: ctlz_zero_undef_v16i16: +define void 
@ctlz_zero_poison_v16i16(ptr %x, ptr %y) nounwind { +; RVI-LABEL: ctlz_zero_poison_v16i16: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVI-NEXT: vle16.v v8, (a0) @@ -1253,7 +1253,7 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse16.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v16i16: +; RVF-LABEL: ctlz_zero_poison_v16i16: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle16.v v12, (a0) @@ -1264,7 +1264,7 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse16.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v16i16: +; RVD-LABEL: ctlz_zero_poison_v16i16: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVD-NEXT: vle16.v v12, (a0) @@ -1275,7 +1275,7 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse16.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v16i16: +; ZVBB-LABEL: ctlz_zero_poison_v16i16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVBB-NEXT: vle16.v v8, (a0) @@ -1289,8 +1289,8 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { -; RVI-LABEL: ctlz_zero_undef_v8i32: +define void @ctlz_zero_poison_v8i32(ptr %x, ptr %y) nounwind { +; RVI-LABEL: ctlz_zero_poison_v8i32: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVI-NEXT: vle32.v v8, (a0) @@ -1328,7 +1328,7 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse32.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v8i32: +; RVF-LABEL: ctlz_zero_poison_v8i32: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVF-NEXT: vle32.v v8, (a0) @@ -1341,7 +1341,7 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse32.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v8i32: +; RVD-LABEL: ctlz_zero_poison_v8i32: ; RVD: # 
%bb.0: ; RVD-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVD-NEXT: vle32.v v12, (a0) @@ -1353,7 +1353,7 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse32.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v8i32: +; ZVBB-LABEL: ctlz_zero_poison_v8i32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; ZVBB-NEXT: vle32.v v8, (a0) @@ -1367,8 +1367,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ret void } -define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { -; RV32I-LABEL: ctlz_zero_undef_v4i64: +define void @ctlz_zero_poison_v4i64(ptr %x, ptr %y) nounwind { +; RV32I-LABEL: ctlz_zero_poison_v4i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) @@ -1422,7 +1422,7 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vse64.v v8, (a0) ; RV32I-NEXT: ret ; -; RV64I-LABEL: ctlz_zero_undef_v4i64: +; RV64I-LABEL: ctlz_zero_poison_v4i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) @@ -1472,7 +1472,7 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RVF-LABEL: ctlz_zero_undef_v4i64: +; RVF-LABEL: ctlz_zero_poison_v4i64: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle64.v v8, (a0) @@ -1486,7 +1486,7 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: ctlz_zero_undef_v4i64: +; RVD-LABEL: ctlz_zero_poison_v4i64: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVD-NEXT: vle64.v v8, (a0) @@ -1500,7 +1500,7 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse64.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: ctlz_zero_undef_v4i64: +; ZVBB-LABEL: ctlz_zero_poison_v4i64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; ZVBB-NEXT: vle64.v v8, (a0) diff 
--git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index 0d54224154494..65582b3f5d972 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -736,8 +736,8 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ret <32 x i64> %v } -define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i8: +define <2 x i8> @vp_cttz_zero_poison_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -755,8 +755,8 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext ret <2 x i8> %v } -define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i8_unmasked: +define <2 x i8> @vp_cttz_zero_poison_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -774,8 +774,8 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl ret <2 x i8> %v } -define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i8: +define <4 x i8> @vp_cttz_zero_poison_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -793,8 +793,8 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext ret <4 x i8> %v } -define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i8_unmasked: +define <4 x i8> 
@vp_cttz_zero_poison_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v4i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -812,8 +812,8 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl ret <4 x i8> %v } -define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i8: +define <8 x i8> @vp_cttz_zero_poison_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -831,8 +831,8 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext ret <8 x i8> %v } -define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i8_unmasked: +define <8 x i8> @vp_cttz_zero_poison_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -850,8 +850,8 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl ret <8 x i8> %v } -define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i8: +define <16 x i8> @vp_cttz_zero_poison_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -869,8 +869,8 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero ret <16 x i8> %v } -define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i8_unmasked: +define <16 x i8> @vp_cttz_zero_poison_v16i8_unmasked(<16 x 
i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i8_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -888,8 +888,8 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext % ret <16 x i8> %v } -define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i16: +define <2 x i16> @vp_cttz_zero_poison_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -903,8 +903,8 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe ret <2 x i16> %v } -define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i16_unmasked: +define <2 x i16> @vp_cttz_zero_poison_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -918,8 +918,8 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext % ret <2 x i16> %v } -define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i16: +define <4 x i16> @vp_cttz_zero_poison_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -933,8 +933,8 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe ret <4 x i16> %v } -define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i16_unmasked: +define <4 x i16> @vp_cttz_zero_poison_v4i16_unmasked(<4 x i16> %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v4i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -948,8 +948,8 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext % ret <4 x i16> %v } -define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i16: +define <8 x i16> @vp_cttz_zero_poison_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -963,8 +963,8 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe ret <8 x i16> %v } -define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i16_unmasked: +define <8 x i16> @vp_cttz_zero_poison_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -978,8 +978,8 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext % ret <8 x i16> %v } -define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i16: +define <16 x i16> @vp_cttz_zero_poison_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -993,8 +993,8 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z ret <16 x i16> %v } -define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i16_unmasked: +define <16 x i16> @vp_cttz_zero_poison_v16i16_unmasked(<16 x i16> %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1008,8 +1008,8 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex ret <16 x i16> %v } -define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i32: +define <2 x i32> @vp_cttz_zero_poison_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1024,8 +1024,8 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe ret <2 x i32> %v } -define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i32_unmasked: +define <2 x i32> @vp_cttz_zero_poison_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1040,8 +1040,8 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext % ret <2 x i32> %v } -define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i32: +define <4 x i32> @vp_cttz_zero_poison_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1056,8 +1056,8 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe ret <4 x i32> %v } -define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i32_unmasked: +define <4 x i32> @vp_cttz_zero_poison_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { 
+; CHECK-LABEL: vp_cttz_zero_poison_v4i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1072,8 +1072,8 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext % ret <4 x i32> %v } -define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i32: +define <8 x i32> @vp_cttz_zero_poison_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1088,8 +1088,8 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe ret <8 x i32> %v } -define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i32_unmasked: +define <8 x i32> @vp_cttz_zero_poison_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1104,8 +1104,8 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext % ret <8 x i32> %v } -define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i32: +define <16 x i32> @vp_cttz_zero_poison_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -1120,8 +1120,8 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z ret <16 x i32> %v } -define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i32_unmasked: +define <16 x i32> @vp_cttz_zero_poison_v16i32_unmasked(<16 x i32> %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -1136,8 +1136,8 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex ret <16 x i32> %v } -define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i64: +define <2 x i64> @vp_cttz_zero_poison_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1154,8 +1154,8 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ret <2 x i64> %v } -define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v2i64_unmasked: +define <2 x i64> @vp_cttz_zero_poison_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v2i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 @@ -1172,8 +1172,8 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ret <2 x i64> %v } -define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i64: +define <4 x i64> @vp_cttz_zero_poison_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1190,8 +1190,8 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ret <4 x i64> %v } -define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v4i64_unmasked: +define <4 x i64> @vp_cttz_zero_poison_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; 
CHECK-LABEL: vp_cttz_zero_poison_v4i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vrsub.vi v10, v8, 0 @@ -1208,8 +1208,8 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ret <4 x i64> %v } -define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i64: +define <8 x i64> @vp_cttz_zero_poison_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -1226,8 +1226,8 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ret <8 x i64> %v } -define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v8i64_unmasked: +define <8 x i64> @vp_cttz_zero_poison_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v8i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0 @@ -1244,8 +1244,8 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ret <8 x i64> %v } -define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v15i64: +define <15 x i64> @vp_cttz_zero_poison_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v15i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -1262,8 +1262,8 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ret <15 x i64> %v } -define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v15i64_unmasked: +define <15 x i64> @vp_cttz_zero_poison_v15i64_unmasked(<15 x i64> %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v15i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -1280,8 +1280,8 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ret <15 x i64> %v } -define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i64: +define <16 x i64> @vp_cttz_zero_poison_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -1298,8 +1298,8 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ret <16 x i64> %v } -define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v16i64_unmasked: +define <16 x i64> @vp_cttz_zero_poison_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v16i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0 @@ -1316,8 +1316,8 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ret <16 x i64> %v } -define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v32i64: +define <32 x i64> @vp_cttz_zero_poison_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v32i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v24, v8, 0 @@ -1339,8 +1339,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ret <32 x i64> %v } -define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_cttz_zero_undef_v32i64_unmasked: +define <32 x i64> 
@vp_cttz_zero_poison_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_cttz_zero_poison_v32i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v24, v8, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index ad51cab1ba8d2..259561fbfcf4a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -750,8 +750,8 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { -; RVI-LABEL: cttz_zero_undef_v16i8: +define void @cttz_zero_poison_v16i8(ptr %x, ptr %y) nounwind { +; RVI-LABEL: cttz_zero_poison_v16i8: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVI-NEXT: vle8.v v8, (a0) @@ -773,7 +773,7 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse8.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v16i8: +; RVF-LABEL: cttz_zero_poison_v16i8: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVF-NEXT: vle8.v v8, (a0) @@ -790,7 +790,7 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse8.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v16i8: +; RVD-LABEL: cttz_zero_poison_v16i8: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVD-NEXT: vle8.v v8, (a0) @@ -807,7 +807,7 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse8.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v16i8: +; ZVBB-LABEL: cttz_zero_poison_v16i8: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; ZVBB-NEXT: vle8.v v8, (a0) @@ -821,8 +821,8 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { -; RVI-LABEL: cttz_zero_undef_v8i16: +define void @cttz_zero_poison_v8i16(ptr %x, ptr %y) 
nounwind { +; RVI-LABEL: cttz_zero_poison_v8i16: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVI-NEXT: vle16.v v8, (a0) @@ -851,7 +851,7 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse16.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v8i16: +; RVF-LABEL: cttz_zero_poison_v8i16: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVF-NEXT: vle16.v v8, (a0) @@ -864,7 +864,7 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse16.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v8i16: +; RVD-LABEL: cttz_zero_poison_v8i16: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVD-NEXT: vle16.v v8, (a0) @@ -877,7 +877,7 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse16.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v8i16: +; ZVBB-LABEL: cttz_zero_poison_v8i16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVBB-NEXT: vle16.v v8, (a0) @@ -891,8 +891,8 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { -; RVI-LABEL: cttz_zero_undef_v4i32: +define void @cttz_zero_poison_v4i32(ptr %x, ptr %y) nounwind { +; RVI-LABEL: cttz_zero_poison_v4i32: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVI-NEXT: vle32.v v8, (a0) @@ -922,7 +922,7 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse32.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v4i32: +; RVF-LABEL: cttz_zero_poison_v4i32: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle32.v v8, (a0) @@ -937,7 +937,7 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse32.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v4i32: +; RVD-LABEL: cttz_zero_poison_v4i32: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVD-NEXT: vle32.v 
v8, (a0) @@ -951,7 +951,7 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse32.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v4i32: +; ZVBB-LABEL: cttz_zero_poison_v4i32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVBB-NEXT: vle32.v v8, (a0) @@ -965,8 +965,8 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { -; RV32I-LABEL: cttz_zero_undef_v2i64: +define void @cttz_zero_poison_v2i64(ptr %x, ptr %y) nounwind { +; RV32I-LABEL: cttz_zero_poison_v2i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) @@ -1009,7 +1009,7 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vse64.v v8, (a0) ; RV32I-NEXT: ret ; -; RV64I-LABEL: cttz_zero_undef_v2i64: +; RV64I-LABEL: cttz_zero_poison_v2i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) @@ -1048,7 +1048,7 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v2i64: +; RVF-LABEL: cttz_zero_poison_v2i64: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVF-NEXT: vle64.v v8, (a0) @@ -1064,7 +1064,7 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse64.v v9, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v2i64: +; RVD-LABEL: cttz_zero_poison_v2i64: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVD-NEXT: vle64.v v8, (a0) @@ -1080,7 +1080,7 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse64.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v2i64: +; ZVBB-LABEL: cttz_zero_poison_v2i64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; ZVBB-NEXT: vle64.v v8, (a0) @@ -1094,8 +1094,8 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { 
ret void } -define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { -; RVI-LABEL: cttz_zero_undef_v32i8: +define void @cttz_zero_poison_v32i8(ptr %x, ptr %y) nounwind { +; RVI-LABEL: cttz_zero_poison_v32i8: ; RVI: # %bb.0: ; RVI-NEXT: li a1, 32 ; RVI-NEXT: vsetvli zero, a1, e8, m2, ta, ma @@ -1118,7 +1118,7 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse8.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v32i8: +; RVF-LABEL: cttz_zero_poison_v32i8: ; RVF: # %bb.0: ; RVF-NEXT: li a1, 32 ; RVF-NEXT: vsetvli zero, a1, e8, m2, ta, ma @@ -1136,7 +1136,7 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse8.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v32i8: +; RVD-LABEL: cttz_zero_poison_v32i8: ; RVD: # %bb.0: ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vsetvli zero, a1, e8, m2, ta, ma @@ -1154,7 +1154,7 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse8.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v32i8: +; ZVBB-LABEL: cttz_zero_poison_v32i8: ; ZVBB: # %bb.0: ; ZVBB-NEXT: li a1, 32 ; ZVBB-NEXT: vsetvli zero, a1, e8, m2, ta, ma @@ -1169,8 +1169,8 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { -; RVI-LABEL: cttz_zero_undef_v16i16: +define void @cttz_zero_poison_v16i16(ptr %x, ptr %y) nounwind { +; RVI-LABEL: cttz_zero_poison_v16i16: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVI-NEXT: vle16.v v8, (a0) @@ -1199,7 +1199,7 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse16.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v16i16: +; RVF-LABEL: cttz_zero_poison_v16i16: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle16.v v8, (a0) @@ -1212,7 +1212,7 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse16.v v8, (a0) ; RVF-NEXT: ret ; -; 
RVD-LABEL: cttz_zero_undef_v16i16: +; RVD-LABEL: cttz_zero_poison_v16i16: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVD-NEXT: vle16.v v8, (a0) @@ -1225,7 +1225,7 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse16.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v16i16: +; ZVBB-LABEL: cttz_zero_poison_v16i16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVBB-NEXT: vle16.v v8, (a0) @@ -1239,8 +1239,8 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { -; RVI-LABEL: cttz_zero_undef_v8i32: +define void @cttz_zero_poison_v8i32(ptr %x, ptr %y) nounwind { +; RVI-LABEL: cttz_zero_poison_v8i32: ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVI-NEXT: vle32.v v8, (a0) @@ -1270,7 +1270,7 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vse32.v v8, (a0) ; RVI-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v8i32: +; RVF-LABEL: cttz_zero_poison_v8i32: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVF-NEXT: vle32.v v8, (a0) @@ -1285,7 +1285,7 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse32.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v8i32: +; RVD-LABEL: cttz_zero_poison_v8i32: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVD-NEXT: vle32.v v8, (a0) @@ -1299,7 +1299,7 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse32.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v8i32: +; ZVBB-LABEL: cttz_zero_poison_v8i32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; ZVBB-NEXT: vle32.v v8, (a0) @@ -1313,8 +1313,8 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ret void } -define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { -; RV32I-LABEL: cttz_zero_undef_v4i64: +define void @cttz_zero_poison_v4i64(ptr %x, 
ptr %y) nounwind { +; RV32I-LABEL: cttz_zero_poison_v4i64: ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) @@ -1357,7 +1357,7 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vse64.v v8, (a0) ; RV32I-NEXT: ret ; -; RV64I-LABEL: cttz_zero_undef_v4i64: +; RV64I-LABEL: cttz_zero_poison_v4i64: ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) @@ -1396,7 +1396,7 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vse64.v v8, (a0) ; RV64I-NEXT: ret ; -; RVF-LABEL: cttz_zero_undef_v4i64: +; RVF-LABEL: cttz_zero_poison_v4i64: ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVF-NEXT: vle64.v v8, (a0) @@ -1412,7 +1412,7 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; -; RVD-LABEL: cttz_zero_undef_v4i64: +; RVD-LABEL: cttz_zero_poison_v4i64: ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVD-NEXT: vle64.v v8, (a0) @@ -1428,7 +1428,7 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vse64.v v8, (a0) ; RVD-NEXT: ret ; -; ZVBB-LABEL: cttz_zero_undef_v4i64: +; ZVBB-LABEL: cttz_zero_poison_v4i64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; ZVBB-NEXT: vle64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll index 9687dced7eb2e..7778f127dea80 100644 --- a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll +++ b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s ; Use cttz to test if we properly prove never-zero. There is a very -; simple transform from cttz -> cttz_zero_undef if its operand is +; simple transform from cttz -> cttz_zero_poison if its operand is ; known never zero. 
; Even without vscale_range, vscale is always guaranteed to be non-zero. diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll index a25ed44b805d1..2080bfd0cb518 100644 --- a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll +++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll @@ -225,8 +225,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ret i64 %tmp } -define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { -; RV32I-LABEL: test_cttz_i8_zero_undef: +define i8 @test_cttz_i8_zero_poison(i8 %a) nounwind { +; RV32I-LABEL: test_cttz_i8_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a0 ; RV32I-NEXT: addi a1, a1, -1 @@ -243,13 +243,13 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ; RV32I-NEXT: andi a0, a0, 15 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i8_zero_undef: +; RV32ZBB-LABEL: test_cttz_i8_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: ctz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_cttz_i8_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_cttz_i8_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 ; RV32ZBBXQCIBM-NEXT: ret @@ -258,8 +258,8 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ret i8 %tmp } -define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { -; RV32I-LABEL: test_cttz_i16_zero_undef: +define i16 @test_cttz_i16_zero_poison(i16 %a) nounwind { +; RV32I-LABEL: test_cttz_i16_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a0 ; RV32I-NEXT: lui a2, 5 @@ -283,13 +283,13 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i16_zero_undef: +; RV32ZBB-LABEL: test_cttz_i16_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: ctz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_cttz_i16_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_cttz_i16_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 
; RV32ZBBXQCIBM-NEXT: ret @@ -298,8 +298,8 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ret i16 %tmp } -define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { -; RV32I-LABEL: test_cttz_i32_zero_undef: +define i32 @test_cttz_i32_zero_poison(i32 %a) nounwind { +; RV32I-LABEL: test_cttz_i32_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill @@ -318,13 +318,13 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i32_zero_undef: +; RV32ZBB-LABEL: test_cttz_i32_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: ctz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_cttz_i32_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_cttz_i32_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 ; RV32ZBBXQCIBM-NEXT: ret @@ -333,8 +333,8 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ret i32 %tmp } -define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { -; RV32I-LABEL: test_cttz_i64_zero_undef: +define i64 @test_cttz_i64_zero_poison(i64 %a) nounwind { +; RV32I-LABEL: test_cttz_i64_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill @@ -380,7 +380,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_cttz_i64_zero_undef: +; RV32ZBB-LABEL: test_cttz_i64_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: bnez a0, .LBB7_2 @@ -395,7 +395,7 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32ZBB-NEXT: li a1, 0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_cttz_i64_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_cttz_i64_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: not a2, a0 ; RV32ZBBXQCIBM-NEXT: bnez a2, .LBB7_2 @@ -690,8 +690,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ret i64 
%tmp } -define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { -; RV32I-LABEL: test_ctlz_i8_zero_undef: +define i8 @test_ctlz_i8_zero_poison(i8 %a) nounwind { +; RV32I-LABEL: test_ctlz_i8_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: slli a1, a0, 24 @@ -716,14 +716,14 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { ; RV32I-NEXT: andi a0, a0, 15 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i8_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i8_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: slli a0, a0, 24 ; RV32ZBB-NEXT: clz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_ctlz_i8_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_ctlz_i8_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: not a0, a0 ; RV32ZBBXQCIBM-NEXT: slli a0, a0, 24 @@ -734,8 +734,8 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { ret i8 %tmp } -define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { -; RV32I-LABEL: test_ctlz_i16_zero_undef: +define i16 @test_ctlz_i16_zero_poison(i16 %a) nounwind { +; RV32I-LABEL: test_ctlz_i16_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: lui a1, 5 @@ -770,14 +770,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i16_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i16_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: slli a0, a0, 16 ; RV32ZBB-NEXT: clz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_ctlz_i16_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_ctlz_i16_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: not a0, a0 ; RV32ZBBXQCIBM-NEXT: slli a0, a0, 16 @@ -788,8 +788,8 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ret i16 %tmp } -define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { -; RV32I-LABEL: test_ctlz_i32_zero_undef: +define i32 @test_ctlz_i32_zero_poison(i32 %a) nounwind { +; RV32I-LABEL: test_ctlz_i32_zero_poison: ; 
RV32I: # %bb.0: ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: lui a1, 349525 @@ -826,13 +826,13 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i32_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i32_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a0, a0 ; RV32ZBB-NEXT: clz a0, a0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_ctlz_i32_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_ctlz_i32_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 ; RV32ZBBXQCIBM-NEXT: ret @@ -841,8 +841,8 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ret i32 %tmp } -define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { -; RV32I-LABEL: test_ctlz_i64_zero_undef: +define i64 @test_ctlz_i64_zero_poison(i64 %a) nounwind { +; RV32I-LABEL: test_ctlz_i64_zero_poison: ; RV32I: # %bb.0: ; RV32I-NEXT: not a4, a1 ; RV32I-NEXT: lui a1, 349525 @@ -913,7 +913,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: ret ; -; RV32ZBB-LABEL: test_ctlz_i64_zero_undef: +; RV32ZBB-LABEL: test_ctlz_i64_zero_poison: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: not a1, a1 ; RV32ZBB-NEXT: bnez a1, .LBB15_2 @@ -928,7 +928,7 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32ZBB-NEXT: li a1, 0 ; RV32ZBB-NEXT: ret ; -; RV32ZBBXQCIBM-LABEL: test_ctlz_i64_zero_undef: +; RV32ZBBXQCIBM-LABEL: test_ctlz_i64_zero_poison: ; RV32ZBBXQCIBM: # %bb.0: ; RV32ZBBXQCIBM-NEXT: not a2, a1 ; RV32ZBBXQCIBM-NEXT: bnez a2, .LBB15_2 diff --git a/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll b/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll index f6502202ef582..4c59dad8953bf 100644 --- a/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-ctlz-01.ll @@ -2,10 +2,10 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %src, i1 %is_zero_undef) -declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %src, i1 
%is_zero_undef) -declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %src, i1 %is_zero_undef) -declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %src, i1 %is_zero_undef) +declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %src, i1 %is_zero_poison) +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %src, i1 %is_zero_poison) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %src, i1 %is_zero_poison) +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %src, i1 %is_zero_poison) define <16 x i8> @f1(<16 x i8> %a) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll b/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll index 00a0d21b42fec..6e34ce12fba6b 100644 --- a/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-cttz-01.ll @@ -2,10 +2,10 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -declare <16 x i8> @llvm.cttz.v16i8(<16 x i8> %src, i1 %is_zero_undef) -declare <8 x i16> @llvm.cttz.v8i16(<8 x i16> %src, i1 %is_zero_undef) -declare <4 x i32> @llvm.cttz.v4i32(<4 x i32> %src, i1 %is_zero_undef) -declare <2 x i64> @llvm.cttz.v2i64(<2 x i64> %src, i1 %is_zero_undef) +declare <16 x i8> @llvm.cttz.v16i8(<16 x i8> %src, i1 %is_zero_poison) +declare <8 x i16> @llvm.cttz.v8i16(<8 x i16> %src, i1 %is_zero_poison) +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32> %src, i1 %is_zero_poison) +declare <2 x i64> @llvm.cttz.v2i64(<2 x i64> %src, i1 %is_zero_poison) define <16 x i8> @f1(<16 x i8> %a) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/ctlz.mir b/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/ctlz.mir index e8bdb5cb911c2..4a5238cc535d5 100644 --- a/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/ctlz.mir +++ b/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/ctlz.mir @@ -59,31 +59,31 @@ body: | --- -name: ctlz_zundef_i32_i8 +name: ctlz_zpoison_i32_i8 tracksRegLiveness: true body: | bb.1.entry: liveins: $arguments - ; CHECK-LABEL: name: ctlz_zundef_i32_i8 + ; CHECK-LABEL: name: 
ctlz_zpoison_i32_i8 ; CHECK: liveins: $arguments ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[ARGUMENT_i32_:%[0-9]+]]:i32 = ARGUMENT_i32 0, implicit $arguments ; CHECK-NEXT: [[CLZ_I32_:%[0-9]+]]:i32 = CLZ_I32 [[ARGUMENT_i32_]], implicit-def dead $arguments ; CHECK-NEXT: RETURN [[CLZ_I32_]], implicit-def $arguments %0:i32(i32) = ARGUMENT_i32 0, implicit $arguments - %1:_(i8) = G_CTLZ_ZERO_UNDEF %0 + %1:_(i8) = G_CTLZ_ZERO_POISON %0 %2:_(i32) = G_ANYEXT %1(i8) RETURN %2(i32), implicit-def $arguments ... --- -name: ctlz_zundef_i64_i8 +name: ctlz_zpoison_i64_i8 tracksRegLiveness: true body: | bb.1.entry: liveins: $arguments - ; CHECK-LABEL: name: ctlz_zundef_i64_i8 + ; CHECK-LABEL: name: ctlz_zpoison_i64_i8 ; CHECK: liveins: $arguments ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[ARGUMENT_i64_:%[0-9]+]]:i64 = ARGUMENT_i64 0, implicit $arguments @@ -91,18 +91,18 @@ body: | ; CHECK-NEXT: [[I32_WRAP_I64_:%[0-9]+]]:i32 = I32_WRAP_I64 [[CLZ_I64_]], implicit-def dead $arguments ; CHECK-NEXT: RETURN [[I32_WRAP_I64_]], implicit-def $arguments %0:i64(i64) = ARGUMENT_i64 0, implicit $arguments - %1:_(i8) = G_CTLZ_ZERO_UNDEF %0 + %1:_(i8) = G_CTLZ_ZERO_POISON %0 %2:_(i32) = G_ANYEXT %1(i8) RETURN %2(i32), implicit-def $arguments ... --- -name: ctlz_zundef_i64_i32 +name: ctlz_zpoison_i64_i32 tracksRegLiveness: true body: | bb.1.entry: liveins: $arguments - ; CHECK-LABEL: name: ctlz_zundef_i64_i32 + ; CHECK-LABEL: name: ctlz_zpoison_i64_i32 ; CHECK: liveins: $arguments ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[ARGUMENT_i64_:%[0-9]+]]:i64 = ARGUMENT_i64 0, implicit $arguments @@ -110,6 +110,6 @@ body: | ; CHECK-NEXT: [[I32_WRAP_I64_:%[0-9]+]]:i32 = I32_WRAP_I64 [[CLZ_I64_]], implicit-def dead $arguments ; CHECK-NEXT: RETURN [[I32_WRAP_I64_]], implicit-def $arguments %0:i64(i64) = ARGUMENT_i64 0, implicit $arguments - %1:_(i32) = G_CTLZ_ZERO_UNDEF %0 + %1:_(i32) = G_CTLZ_ZERO_POISON %0 RETURN %1(i32), implicit-def $arguments ... 
diff --git a/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/cttz.mir b/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/cttz.mir index 465b4a051cf73..cb4e099c66d60 100644 --- a/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/cttz.mir +++ b/llvm/test/CodeGen/WebAssembly/GlobalISel/instructions/cttz.mir @@ -59,31 +59,31 @@ body: | --- -name: cttz_zundef_i32_i8 +name: cttz_zpoison_i32_i8 tracksRegLiveness: true body: | bb.1.entry: liveins: $arguments - ; CHECK-LABEL: name: cttz_zundef_i32_i8 + ; CHECK-LABEL: name: cttz_zpoison_i32_i8 ; CHECK: liveins: $arguments ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[ARGUMENT_i32_:%[0-9]+]]:i32 = ARGUMENT_i32 0, implicit $arguments ; CHECK-NEXT: [[CTZ_I32_:%[0-9]+]]:i32 = CTZ_I32 [[ARGUMENT_i32_]], implicit-def dead $arguments ; CHECK-NEXT: RETURN [[CTZ_I32_]], implicit-def $arguments %0:i32(i32) = ARGUMENT_i32 0, implicit $arguments - %1:_(i8) = G_CTTZ_ZERO_UNDEF %0 + %1:_(i8) = G_CTTZ_ZERO_POISON %0 %2:_(i32) = G_ANYEXT %1(i8) RETURN %2(i32), implicit-def $arguments ... --- -name: cttz_zundef_i64_i8 +name: cttz_zpoison_i64_i8 tracksRegLiveness: true body: | bb.1.entry: liveins: $arguments - ; CHECK-LABEL: name: cttz_zundef_i64_i8 + ; CHECK-LABEL: name: cttz_zpoison_i64_i8 ; CHECK: liveins: $arguments ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[ARGUMENT_i64_:%[0-9]+]]:i64 = ARGUMENT_i64 0, implicit $arguments @@ -91,18 +91,18 @@ body: | ; CHECK-NEXT: [[I32_WRAP_I64_:%[0-9]+]]:i32 = I32_WRAP_I64 [[CTZ_I64_]], implicit-def dead $arguments ; CHECK-NEXT: RETURN [[I32_WRAP_I64_]], implicit-def $arguments %0:i64(i64) = ARGUMENT_i64 0, implicit $arguments - %1:_(i8) = G_CTTZ_ZERO_UNDEF %0 + %1:_(i8) = G_CTTZ_ZERO_POISON %0 %2:_(i32) = G_ANYEXT %1(i8) RETURN %2(i32), implicit-def $arguments ... 
--- -name: cttz_zundef_i64_i32 +name: cttz_zpoison_i64_i32 tracksRegLiveness: true body: | bb.1.entry: liveins: $arguments - ; CHECK-LABEL: name: cttz_zundef_i64_i32 + ; CHECK-LABEL: name: cttz_zpoison_i64_i32 ; CHECK: liveins: $arguments ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[ARGUMENT_i64_:%[0-9]+]]:i64 = ARGUMENT_i64 0, implicit $arguments @@ -110,6 +110,6 @@ body: | ; CHECK-NEXT: [[I32_WRAP_I64_:%[0-9]+]]:i32 = I32_WRAP_I64 [[CTZ_I64_]], implicit-def dead $arguments ; CHECK-NEXT: RETURN [[I32_WRAP_I64_]], implicit-def $arguments %0:i64(i64) = ARGUMENT_i64 0, implicit $arguments - %1:_(i32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(i32) = G_CTTZ_ZERO_POISON %0 RETURN %1(i32), implicit-def $arguments ... diff --git a/llvm/test/CodeGen/WebAssembly/i128.ll b/llvm/test/CodeGen/WebAssembly/i128.ll index d9bec9b8ae887..353c38473b2e8 100644 --- a/llvm/test/CodeGen/WebAssembly/i128.ll +++ b/llvm/test/CodeGen/WebAssembly/i128.ll @@ -403,9 +403,9 @@ define i128 @clz128(i128 %x) { ret i128 %a } -define i128 @clz128_zero_undef(i128 %x) { -; CHECK-LABEL: clz128_zero_undef: -; CHECK: .functype clz128_zero_undef (i32, i64, i64) -> () +define i128 @clz128_zero_poison(i128 %x) { +; CHECK-LABEL: clz128_zero_poison: +; CHECK: .functype clz128_zero_poison (i32, i64, i64) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get $push8=, 0 ; CHECK-NEXT: i64.const $push0=, 0 @@ -451,9 +451,9 @@ define i128 @ctz128(i128 %x) { ret i128 %a } -define i128 @ctz128_zero_undef(i128 %x) { -; CHECK-LABEL: ctz128_zero_undef: -; CHECK: .functype ctz128_zero_undef (i32, i64, i64) -> () +define i128 @ctz128_zero_poison(i128 %x) { +; CHECK-LABEL: ctz128_zero_poison: +; CHECK: .functype ctz128_zero_poison (i32, i64, i64) -> () ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get $push8=, 0 ; CHECK-NEXT: i64.const $push0=, 0 diff --git a/llvm/test/CodeGen/WebAssembly/i32.ll b/llvm/test/CodeGen/WebAssembly/i32.ll index be5f779ae8102..8aa4aa582b436 100644 --- a/llvm/test/CodeGen/WebAssembly/i32.ll +++ 
b/llvm/test/CodeGen/WebAssembly/i32.ll @@ -161,12 +161,12 @@ define i32 @clz32(i32 %x) { ret i32 %a } -; CHECK-LABEL: clz32_zero_undef: -; CHECK-NEXT: .functype clz32_zero_undef (i32) -> (i32){{$}} +; CHECK-LABEL: clz32_zero_poison: +; CHECK-NEXT: .functype clz32_zero_poison (i32) -> (i32){{$}} ; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.clz $push0=, $pop[[L0]]{{$}} ; CHECK-NEXT: return $pop0{{$}} -define i32 @clz32_zero_undef(i32 %x) { +define i32 @clz32_zero_poison(i32 %x) { %a = call i32 @llvm.ctlz.i32(i32 %x, i1 true) ret i32 %a } @@ -181,12 +181,12 @@ define i32 @ctz32(i32 %x) { ret i32 %a } -; CHECK-LABEL: ctz32_zero_undef: -; CHECK-NEXT: .functype ctz32_zero_undef (i32) -> (i32){{$}} +; CHECK-LABEL: ctz32_zero_poison: +; CHECK-NEXT: .functype ctz32_zero_poison (i32) -> (i32){{$}} ; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.ctz $push0=, $pop[[L0]]{{$}} ; CHECK-NEXT: return $pop0{{$}} -define i32 @ctz32_zero_undef(i32 %x) { +define i32 @ctz32_zero_poison(i32 %x) { %a = call i32 @llvm.cttz.i32(i32 %x, i1 true) ret i32 %a } diff --git a/llvm/test/CodeGen/WebAssembly/i64.ll b/llvm/test/CodeGen/WebAssembly/i64.ll index c93f43080c409..cbceceee18d40 100644 --- a/llvm/test/CodeGen/WebAssembly/i64.ll +++ b/llvm/test/CodeGen/WebAssembly/i64.ll @@ -161,12 +161,12 @@ define i64 @clz64(i64 %x) { ret i64 %a } -; CHECK-LABEL: clz64_zero_undef: -; CHECK-NEXT: .functype clz64_zero_undef (i64) -> (i64){{$}} +; CHECK-LABEL: clz64_zero_poison: +; CHECK-NEXT: .functype clz64_zero_poison (i64) -> (i64){{$}} ; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.clz $push0=, $pop[[L0]]{{$}} ; CHECK-NEXT: return $pop0{{$}} -define i64 @clz64_zero_undef(i64 %x) { +define i64 @clz64_zero_poison(i64 %x) { %a = call i64 @llvm.ctlz.i64(i64 %x, i1 true) ret i64 %a } @@ -181,12 +181,12 @@ define i64 @ctz64(i64 %x) { ret i64 %a } -; CHECK-LABEL: ctz64_zero_undef: -; CHECK-NEXT: .functype ctz64_zero_undef (i64) -> (i64){{$}} 
+; CHECK-LABEL: ctz64_zero_poison: +; CHECK-NEXT: .functype ctz64_zero_poison (i64) -> (i64){{$}} ; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.ctz $push0=, $pop[[L0]]{{$}} ; CHECK-NEXT: return $pop0{{$}} -define i64 @ctz64_zero_undef(i64 %x) { +define i64 @ctz64_zero_poison(i64 %x) { %a = call i64 @llvm.cttz.i64(i64 %x, i1 true) ret i64 %a } diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir index 2e4adaaa4e48e..c877f33cc9822 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir @@ -3,7 +3,7 @@ # RUN: llc -mtriple=i386-linux-gnu -mattr=+lzcnt -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s 2>%t -o - | FileCheck %s --check-prefixes=CHECK,X86 # RUN: FileCheck -check-prefix=ERR32 %s < %t -# ERR32: remark: :0:0: unable to legalize instruction: %10:_(s64) = G_CTLZ_ZERO_UNDEF %4:_(s32) (in function: test_ctlz64) +# ERR32: remark: :0:0: unable to legalize instruction: %10:_(s64) = G_CTLZ_ZERO_POISON %4:_(s32) (in function: test_ctlz64) # test count leading zeros for s16, s32, and s64 @@ -35,10 +35,10 @@ body: | ; X86-NEXT: [[CTLZ:%[0-9]+]]:_(s64) = G_CTLZ [[UV]](s32) ; X86-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; X86-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[CTLZ]], [[C1]] - ; X86-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) + ; X86-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_POISON [[UV1]](s32) ; X86-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ADD]](s64) - ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CTLZ_ZERO_UNDEF]](s64) + ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CTLZ_ZERO_POISON]](s64) ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT 
[[ZEXT1]](s32), [[UV2]], [[UV4]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT1]](s32), [[UV3]], [[UV5]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) @@ -109,10 +109,10 @@ body: | ; X86-NEXT: [[CTLZ:%[0-9]+]]:_(s64) = G_CTLZ [[UV]](s32) ; X86-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; X86-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[CTLZ]], [[C1]] - ; X86-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) + ; X86-NEXT: [[CTLZ_ZERO_POISON:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_POISON [[UV1]](s32) ; X86-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ADD]](s64) - ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CTLZ_ZERO_UNDEF]](s64) + ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CTLZ_ZERO_POISON]](s64) ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT]](s32), [[UV2]], [[UV4]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT]](s32), [[UV3]], [[UV5]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-poison.mir similarity index 78% rename from llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir rename to llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-poison.mir index 148aac1ab46fe..cf3a246b8fcf9 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-poison.mir @@ -17,8 +17,8 @@ body: | ; X64: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx ; X64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 34359738368 ; X64-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]] - ; X64-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTTZ_ZERO_UNDEF [[OR]](s64) - ; X64-NEXT: RET 0, implicit 
[[CTTZ_ZERO_UNDEF]](s64) + ; X64-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s64) = G_CTTZ_ZERO_POISON [[OR]](s64) + ; X64-NEXT: RET 0, implicit [[CTTZ_ZERO_POISON]](s64) ; ; X86-LABEL: name: test_cttz35 ; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx @@ -28,15 +28,15 @@ body: | ; X86-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[C]] ; X86-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV1]], [[C1]] ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[OR]](s32), [[C]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR1]](s32) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_POISON]], [[C2]] ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; X86-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] - ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_UNDEF1]] + ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_POISON1]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDE]], [[C]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) ; X86-NEXT: RET 0, implicit [[MV]](s64) @@ -59,12 +59,12 @@ body: | ; CHECK-LABEL: name: test_cttz8 ; CHECK: [[DEF:%[0-9]+]]:_(s8) = IMPLICIT_DEF ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[DEF]](s8) - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_UNDEF [[ANYEXT]](s16) - ; CHECK-NEXT: 
[[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[CTTZ_ZERO_UNDEF]](s16) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_POISON [[ANYEXT]](s16) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[CTTZ_ZERO_POISON]](s16) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8) ; CHECK-NEXT: RET 0, implicit [[COPY]](s8) %0:_(s8) = IMPLICIT_DEF - %1:_(s8) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s8) = G_CTTZ_ZERO_POISON %0 %2:_(s8) = COPY %1(s8) RET 0, implicit %2 ... @@ -80,8 +80,8 @@ body: | bb.1: ; X64-LABEL: name: test_cttz64 ; X64: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF - ; X64-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTTZ_ZERO_UNDEF [[DEF]](s64) - ; X64-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[CTTZ_ZERO_UNDEF]](s64) + ; X64-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s64) = G_CTTZ_ZERO_POISON [[DEF]](s64) + ; X64-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[CTTZ_ZERO_POISON]](s64) ; X64-NEXT: RET 0, implicit [[COPY]](s64) ; ; X86-LABEL: name: test_cttz64 @@ -89,21 +89,21 @@ body: | ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64) ; X86-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[UV]](s32), [[C]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV1]](s32) ; X86-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_POISON]], [[C1]] ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 1 ; X86-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] - ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_UNDEF1]] + ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_POISON1]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDE]], [[C]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) ; X86-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; X86-NEXT: RET 0, implicit [[COPY]](s64) %0:_(s64) = IMPLICIT_DEF - %1:_(s64) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s64) = G_CTTZ_ZERO_POISON %0 %2:_(s64) = COPY %1(s64) RET 0, implicit %2 ... @@ -119,11 +119,11 @@ body: | bb.1: ; CHECK-LABEL: name: test_cttz32 ; CHECK: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[DEF]](s32) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_UNDEF]](s32) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[DEF]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[CTTZ_ZERO_POISON]](s32) ; CHECK-NEXT: RET 0, implicit [[COPY]](s32) %0:_(s32) = IMPLICIT_DEF - %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s32) = G_CTTZ_ZERO_POISON %0 %2:_(s32) = COPY %1(s32) RET 0, implicit %2 ... @@ -139,11 +139,11 @@ body: | bb.1: ; CHECK-LABEL: name: test_cttz16 ; CHECK: [[DEF:%[0-9]+]]:_(s16) = IMPLICIT_DEF - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_UNDEF [[DEF]](s16) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[CTTZ_ZERO_UNDEF]](s16) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_POISON [[DEF]](s16) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[CTTZ_ZERO_POISON]](s16) ; CHECK-NEXT: RET 0, implicit [[COPY]](s16) %0:_(s16) = IMPLICIT_DEF - %1:_(s16) = G_CTTZ_ZERO_UNDEF %0 + %1:_(s16) = G_CTTZ_ZERO_POISON %0 %2:_(s16) = COPY %1(s16) RET 0, implicit %2 ... 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir index 17469289821f1..32b3061256de7 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir @@ -17,8 +17,8 @@ body: | ; X64: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx ; X64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 34359738368 ; X64-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]] - ; X64-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTTZ_ZERO_UNDEF [[OR]](s64) - ; X64-NEXT: RET 0, implicit [[CTTZ_ZERO_UNDEF]](s64) + ; X64-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s64) = G_CTTZ_ZERO_POISON [[OR]](s64) + ; X64-NEXT: RET 0, implicit [[CTTZ_ZERO_POISON]](s64) ; ; X86-LABEL: name: test_cttz35 ; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx @@ -28,15 +28,15 @@ body: | ; X86-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV]], [[C]] ; X86-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV1]], [[C1]] ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[OR]](s32), [[C]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR1]](s32) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_POISON]], [[C2]] ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[OR]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; X86-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] - ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT 
[[AND]](s32), [[UADDO]], [[CTTZ_ZERO_UNDEF1]] + ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_POISON1]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDE]], [[C]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) ; X86-NEXT: RET 0, implicit [[MV]](s64) @@ -61,8 +61,8 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[DEF]](s8) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[ANYEXT]], [[C]] - ; CHECK-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_UNDEF [[OR]](s16) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[CTTZ_ZERO_UNDEF]](s16) + ; CHECK-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_POISON [[OR]](s16) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[CTTZ_ZERO_POISON]](s16) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8) ; CHECK-NEXT: RET 0, implicit [[COPY]](s8) %0:_(s8) = IMPLICIT_DEF @@ -95,11 +95,11 @@ body: | ; X86-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ]], [[C1]] ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] - ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) + ; X86-NEXT: [[CTTZ_ZERO_POISON:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UV]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; X86-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] - ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_UNDEF]] + ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDO]], [[CTTZ_ZERO_POISON]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[UADDE]], [[C]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) ; X86-NEXT: 
[[COPY:%[0-9]+]]:_(s64) = COPY [[MV]](s64) diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 31adbb71f271e..6dcdd697e639a 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -3366,11 +3366,11 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind { } ; -; CTLZ_ZERO_UNDEF +; CTLZ_ZERO_POISON ; -define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { -; SSE-LABEL: test_ctlz_undef_i128: +define i32 @test_ctlz_poison_i128(i128 %a0) nounwind { +; SSE-LABEL: test_ctlz_poison_i128: ; SSE: # %bb.0: ; SSE-NEXT: bsrq %rsi, %rcx ; SSE-NEXT: xorl $63, %ecx @@ -3382,7 +3382,7 @@ define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_ctlz_undef_i128: +; AVX2-LABEL: test_ctlz_poison_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: lzcntq %rsi, %rcx ; AVX2-NEXT: lzcntq %rdi, %rax @@ -3392,7 +3392,7 @@ define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ctlz_undef_i128: +; AVX512-LABEL: test_ctlz_poison_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: lzcntq %rsi, %rcx ; AVX512-NEXT: lzcntq %rdi, %rax @@ -3406,8 +3406,8 @@ define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { ret i32 %res } -define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { -; SSE-LABEL: load_ctlz_undef_i128: +define i32 @load_ctlz_poison_i128(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_poison_i128: ; SSE: # %bb.0: ; SSE-NEXT: movq 8(%rdi), %rcx ; SSE-NEXT: bsrq %rcx, %rdx @@ -3420,7 +3420,7 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_ctlz_undef_i128: +; AVX2-LABEL: load_ctlz_poison_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: movq 8(%rdi), %rcx ; AVX2-NEXT: lzcntq %rcx, %rdx @@ -3431,7 +3431,7 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind 
{ ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctlz_undef_i128: +; AVX512-LABEL: load_ctlz_poison_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movq 8(%rdi), %rcx ; AVX512-NEXT: lzcntq %rcx, %rdx @@ -3447,8 +3447,8 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { ret i32 %res } -define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { -; SSE-LABEL: vector_ctlz_undef_i128: +define i32 @vector_ctlz_poison_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_poison_i128: ; SSE: # %bb.0: ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: pextrq $1, %xmm0, %rcx @@ -3462,7 +3462,7 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: vector_ctlz_undef_i128: +; AVX2-LABEL: vector_ctlz_poison_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx @@ -3474,7 +3474,7 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vector_ctlz_undef_i128: +; AVX512F-LABEL: vector_ctlz_poison_i128: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx @@ -3486,7 +3486,7 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: vector_ctlz_undef_i128: +; AVX512POPCNT-LABEL: vector_ctlz_poison_i128: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %xmm0, %rax ; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx @@ -3498,7 +3498,7 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: vector_ctlz_undef_i128: +; AVX512VL-LABEL: vector_ctlz_poison_i128: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ 
-3510,7 +3510,7 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: vector_ctlz_undef_i128: +; AVX512VLPOPCNT-LABEL: vector_ctlz_poison_i128: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512VLPOPCNT-NEXT: vmovq %xmm0, %rax @@ -3527,8 +3527,8 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ret i32 %res } -define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { -; SSE-LABEL: test_ctlz_undef_i256: +define i32 @test_ctlz_poison_i256(i256 %a0) nounwind { +; SSE-LABEL: test_ctlz_poison_i256: ; SSE: # %bb.0: ; SSE-NEXT: bsrq %rcx, %rax ; SSE-NEXT: xorl $63, %eax @@ -3550,7 +3550,7 @@ define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_ctlz_undef_i256: +; AVX2-LABEL: test_ctlz_poison_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: lzcntq %rcx, %rax ; AVX2-NEXT: lzcntq %rdx, %r8 @@ -3569,7 +3569,7 @@ define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ctlz_undef_i256: +; AVX512-LABEL: test_ctlz_poison_i256: ; AVX512: # %bb.0: ; AVX512-NEXT: lzcntq %rcx, %rax ; AVX512-NEXT: lzcntq %rdx, %r8 @@ -3591,8 +3591,8 @@ define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { ret i32 %res } -define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { -; SSE-LABEL: load_ctlz_undef_i256: +define i32 @load_ctlz_poison_i256(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_poison_i256: ; SSE: # %bb.0: ; SSE-NEXT: movq 8(%rdi), %rdx ; SSE-NEXT: movq 16(%rdi), %rcx @@ -3617,7 +3617,7 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_ctlz_undef_i256: +; AVX2-LABEL: load_ctlz_poison_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: movq 16(%rdi), %rcx ; AVX2-NEXT: movq 24(%rdi), %rdx 
@@ -3639,7 +3639,7 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_ctlz_undef_i256: +; AVX512F-LABEL: load_ctlz_poison_i256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] ; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 @@ -3651,7 +3651,7 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: load_ctlz_undef_i256: +; AVX512POPCNT-LABEL: load_ctlz_poison_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] ; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 @@ -3663,7 +3663,7 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: load_ctlz_undef_i256: +; AVX512VL-LABEL: load_ctlz_poison_i256: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] ; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1 @@ -3674,7 +3674,7 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: load_ctlz_undef_i256: +; AVX512VLPOPCNT-LABEL: load_ctlz_poison_i256: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] ; AVX512VLPOPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 @@ -3690,8 +3690,8 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ret i32 %res } -define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { -; SSE-LABEL: vector_ctlz_undef_i256: +define i32 @vector_ctlz_poison_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_poison_i256: ; SSE: # %bb.0: ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: movq %xmm0, %rax @@ -3717,7 +3717,7 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: vector_ctlz_undef_i256: +; AVX2-LABEL: vector_ctlz_poison_i256: ; 
AVX2: # %bb.0: ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx @@ -3742,7 +3742,7 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vector_ctlz_undef_i256: +; AVX512F-LABEL: vector_ctlz_poison_i256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 @@ -3754,7 +3754,7 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: vector_ctlz_undef_i256: +; AVX512POPCNT-LABEL: vector_ctlz_poison_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 @@ -3766,7 +3766,7 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: vector_ctlz_undef_i256: +; AVX512VL-LABEL: vector_ctlz_poison_i256: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1 @@ -3777,7 +3777,7 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: vector_ctlz_undef_i256: +; AVX512VLPOPCNT-LABEL: vector_ctlz_poison_i256: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512VLPOPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 @@ -3793,8 +3793,8 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ret i32 %res } -define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { -; SSE-LABEL: test_ctlz_undef_i512: +define i32 @test_ctlz_poison_i512(i512 %a0) nounwind { +; SSE-LABEL: test_ctlz_poison_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %r15 ; SSE-NEXT: pushq %r14 @@ -3847,7 +3847,7 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; SSE-NEXT: popq %r15 ; SSE-NEXT: retq ; -; AVX2-LABEL: test_ctlz_undef_i512: +; 
AVX2-LABEL: test_ctlz_poison_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 @@ -3899,7 +3899,7 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_ctlz_undef_i512: +; AVX512F-LABEL: test_ctlz_poison_i512: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %rdi, %xmm0 ; AVX512F-NEXT: vmovq %rsi, %xmm1 @@ -3921,7 +3921,7 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: test_ctlz_undef_i512: +; AVX512POPCNT-LABEL: test_ctlz_poison_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 ; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 @@ -3943,7 +3943,7 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: test_ctlz_undef_i512: +; AVX512VL-LABEL: test_ctlz_poison_i512: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq %rdi, %xmm0 ; AVX512VL-NEXT: vmovq %rsi, %xmm1 @@ -3966,7 +3966,7 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: test_ctlz_undef_i512: +; AVX512VLPOPCNT-LABEL: test_ctlz_poison_i512: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovq %rdi, %xmm0 ; AVX512VLPOPCNT-NEXT: vmovq %rsi, %xmm1 @@ -3993,8 +3993,8 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ret i32 %res } -define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { -; SSE-LABEL: load_ctlz_undef_i512: +define i32 @load_ctlz_poison_i512(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_poison_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbx ; SSE-NEXT: movdqa 32(%rdi), %xmm0 @@ -4046,7 +4046,7 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; SSE-NEXT: popq %rbx ; SSE-NEXT: retq ; -; AVX2-LABEL: load_ctlz_undef_i512: +; AVX2-LABEL: load_ctlz_poison_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: movq 16(%rdi), %rcx ; AVX2-NEXT: movq 24(%rdi), %rdx @@ -4093,7 
+4093,7 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_ctlz_undef_i512: +; AVX512F-LABEL: load_ctlz_poison_i512: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 @@ -4104,7 +4104,7 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: load_ctlz_undef_i512: +; AVX512POPCNT-LABEL: load_ctlz_poison_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] ; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 @@ -4115,7 +4115,7 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: load_ctlz_undef_i512: +; AVX512VL-LABEL: load_ctlz_poison_i512: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] ; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 @@ -4127,7 +4127,7 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: load_ctlz_undef_i512: +; AVX512VLPOPCNT-LABEL: load_ctlz_poison_i512: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] ; AVX512VLPOPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 @@ -4144,8 +4144,8 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ret i32 %res } -define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { -; SSE-LABEL: vector_ctlz_undef_i512: +define i32 @vector_ctlz_poison_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_poison_i512: ; SSE: # %bb.0: ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: pextrq $1, %xmm1, %rax @@ -4196,7 +4196,7 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: vector_ctlz_undef_i512: 
+; AVX2-LABEL: vector_ctlz_poison_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx @@ -4245,7 +4245,7 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vector_ctlz_undef_i512: +; AVX512F-LABEL: vector_ctlz_poison_i512: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 @@ -4256,7 +4256,7 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: vector_ctlz_undef_i512: +; AVX512POPCNT-LABEL: vector_ctlz_poison_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 @@ -4267,7 +4267,7 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: vector_ctlz_undef_i512: +; AVX512VL-LABEL: vector_ctlz_poison_i512: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 @@ -4279,7 +4279,7 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: vector_ctlz_undef_i512: +; AVX512VLPOPCNT-LABEL: vector_ctlz_poison_i512: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; AVX512VLPOPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 @@ -4296,8 +4296,8 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ret i32 %res } -define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { -; SSE-LABEL: test_ctlz_undef_i1024: +define i32 @test_ctlz_poison_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_ctlz_poison_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx @@ -4401,7 +4401,7 @@ define i32 
@test_ctlz_undef_i1024(i1024 %a0) nounwind { ; SSE-NEXT: popq %r14 ; SSE-NEXT: retq ; -; AVX2-LABEL: test_ctlz_undef_i1024: +; AVX2-LABEL: test_ctlz_poison_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 @@ -4508,7 +4508,7 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_ctlz_undef_i1024: +; AVX512F-LABEL: test_ctlz_poison_i1024: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 ; AVX512F-NEXT: vmovq %rdi, %xmm1 @@ -4543,7 +4543,7 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; AVX512F-NEXT: cmovel %ecx, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: test_ctlz_undef_i1024: +; AVX512POPCNT-LABEL: test_ctlz_poison_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 ; AVX512POPCNT-NEXT: vmovq %rdi, %xmm1 @@ -4578,7 +4578,7 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; AVX512POPCNT-NEXT: cmovel %ecx, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: test_ctlz_undef_i1024: +; AVX512VL-LABEL: test_ctlz_poison_i1024: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 ; AVX512VL-NEXT: vmovq %rdi, %xmm1 @@ -4614,7 +4614,7 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: test_ctlz_undef_i1024: +; AVX512VLPOPCNT-LABEL: test_ctlz_poison_i1024: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0 ; AVX512VLPOPCNT-NEXT: vmovq %rdi, %xmm1 @@ -4654,8 +4654,8 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ret i32 %res } -define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { -; SSE-LABEL: load_ctlz_undef_i1024: +define i32 @load_ctlz_poison_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_poison_i1024: ; SSE: # %bb.0: ; SSE-NEXT: movq 8(%rdi), %rcx ; SSE-NEXT: movq 72(%rdi), %rax @@ -4758,7 +4758,7 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) 
nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_ctlz_undef_i1024: +; AVX2-LABEL: load_ctlz_poison_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: movq 16(%rdi), %rcx @@ -4858,7 +4858,7 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_ctlz_undef_i1024: +; AVX512F-LABEL: load_ctlz_poison_i1024: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] @@ -4881,7 +4881,7 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { ; AVX512F-NEXT: cmovnel %ecx, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: load_ctlz_undef_i1024: +; AVX512POPCNT-LABEL: load_ctlz_poison_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] @@ -4904,7 +4904,7 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: cmovnel %ecx, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: load_ctlz_undef_i1024: +; AVX512VL-LABEL: load_ctlz_poison_i1024: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0 ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] @@ -4928,7 +4928,7 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: load_ctlz_undef_i1024: +; AVX512VLPOPCNT-LABEL: load_ctlz_poison_i1024: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 ; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] @@ -6588,11 +6588,11 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { } ; -; CTTZ_ZERO_UNDEF +; CTTZ_ZERO_POISON ; -define i32 @test_cttz_undef_i128(i128 %a0) nounwind { -; SSE-LABEL: test_cttz_undef_i128: +define i32 @test_cttz_poison_i128(i128 %a0) nounwind { +; SSE-LABEL: test_cttz_poison_i128: ; SSE: # %bb.0: ; SSE-NEXT: rep bsfq %rdi, %rcx ; 
SSE-NEXT: rep bsfq %rsi, %rax @@ -6602,7 +6602,7 @@ define i32 @test_cttz_undef_i128(i128 %a0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_undef_i128: +; AVX2-LABEL: test_cttz_poison_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: tzcntq %rdi, %rcx ; AVX2-NEXT: tzcntq %rsi, %rax @@ -6612,7 +6612,7 @@ define i32 @test_cttz_undef_i128(i128 %a0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_undef_i128: +; AVX512-LABEL: test_cttz_poison_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: tzcntq %rdi, %rcx ; AVX512-NEXT: tzcntq %rsi, %rax @@ -6626,8 +6626,8 @@ define i32 @test_cttz_undef_i128(i128 %a0) nounwind { ret i32 %res } -define i32 @load_cttz_undef_i128(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_undef_i128: +define i32 @load_cttz_poison_i128(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_poison_i128: ; SSE: # %bb.0: ; SSE-NEXT: movq (%rdi), %rcx ; SSE-NEXT: rep bsfq %rcx, %rdx @@ -6638,7 +6638,7 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_undef_i128: +; AVX2-LABEL: load_cttz_poison_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: movq (%rdi), %rcx ; AVX2-NEXT: tzcntq %rcx, %rdx @@ -6649,7 +6649,7 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_undef_i128: +; AVX512-LABEL: load_cttz_poison_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movq (%rdi), %rcx ; AVX512-NEXT: tzcntq %rcx, %rdx @@ -6665,8 +6665,8 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind { ret i32 %res } -define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { -; SSE-LABEL: vector_cttz_undef_i128: +define i32 @vector_cttz_poison_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_poison_i128: ; SSE: # %bb.0: ; SSE-NEXT: pextrq $1, %xmm0, %rax ; SSE-NEXT: movq %xmm0, %rcx @@ -6678,7 
+6678,7 @@ define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: vector_cttz_undef_i128: +; AVX2-LABEL: vector_cttz_poison_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vmovq %xmm0, %rcx @@ -6690,7 +6690,7 @@ define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: vector_cttz_undef_i128: +; AVX512-LABEL: vector_cttz_poison_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: vpextrq $1, %xmm0, %rax ; AVX512-NEXT: vmovq %xmm0, %rcx @@ -6707,8 +6707,8 @@ define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { ret i32 %res } -define i32 @test_cttz_undef_i256(i256 %a0) nounwind { -; SSE-LABEL: test_cttz_undef_i256: +define i32 @test_cttz_poison_i256(i256 %a0) nounwind { +; SSE-LABEL: test_cttz_poison_i256: ; SSE: # %bb.0: ; SSE-NEXT: rep bsfq %rdi, %rax ; SSE-NEXT: rep bsfq %rsi, %r8 @@ -6726,7 +6726,7 @@ define i32 @test_cttz_undef_i256(i256 %a0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_undef_i256: +; AVX2-LABEL: test_cttz_poison_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: tzcntq %rdi, %rax ; AVX2-NEXT: tzcntq %rsi, %r8 @@ -6745,7 +6745,7 @@ define i32 @test_cttz_undef_i256(i256 %a0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_undef_i256: +; AVX512-LABEL: test_cttz_poison_i256: ; AVX512: # %bb.0: ; AVX512-NEXT: tzcntq %rdi, %rax ; AVX512-NEXT: tzcntq %rsi, %r8 @@ -6767,8 +6767,8 @@ define i32 @test_cttz_undef_i256(i256 %a0) nounwind { ret i32 %res } -define i32 @load_cttz_undef_i256(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_undef_i256: +define i32 @load_cttz_poison_i256(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_poison_i256: ; SSE: # %bb.0: ; SSE-NEXT: movq 16(%rdi), %rcx ; SSE-NEXT: movq (%rdi), %rdx @@ -6789,7 +6789,7 @@ define i32 
@load_cttz_undef_i256(ptr %p0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_undef_i256: +; AVX2-LABEL: load_cttz_poison_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: movq (%rdi), %rcx ; AVX2-NEXT: movq 8(%rdi), %rdx @@ -6811,7 +6811,7 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_cttz_undef_i256: +; AVX512F-LABEL: load_cttz_poison_i256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6827,7 +6827,7 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: load_cttz_undef_i256: +; AVX512POPCNT-LABEL: load_cttz_poison_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6842,7 +6842,7 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: load_cttz_undef_i256: +; AVX512VL-LABEL: load_cttz_poison_i256: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6857,7 +6857,7 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: load_cttz_undef_i256: +; AVX512VLPOPCNT-LABEL: load_cttz_poison_i256: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512VLPOPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6876,8 +6876,8 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ret i32 %res } -define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { -; SSE-LABEL: vector_cttz_undef_i256: +define i32 @vector_cttz_poison_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_poison_i256: ; SSE: # %bb.0: ; SSE-NEXT: pextrq $1, %xmm1, %rax ; SSE-NEXT: movq %xmm1, %rcx @@ 
-6899,7 +6899,7 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: vector_cttz_undef_i256: +; AVX2-LABEL: vector_cttz_poison_i256: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax @@ -6924,7 +6924,7 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vector_cttz_undef_i256: +; AVX512F-LABEL: vector_cttz_poison_i256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6940,7 +6940,7 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: vector_cttz_undef_i256: +; AVX512POPCNT-LABEL: vector_cttz_poison_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6955,7 +6955,7 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: vector_cttz_undef_i256: +; AVX512VL-LABEL: vector_cttz_poison_i256: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 @@ -6969,7 +6969,7 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: vector_cttz_undef_i256: +; AVX512VLPOPCNT-LABEL: vector_cttz_poison_i256: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VLPOPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 @@ -6987,8 +6987,8 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ret i32 %res } -define i32 @test_cttz_undef_i512(i512 %a0) nounwind { -; SSE-LABEL: test_cttz_undef_i512: +define i32 @test_cttz_poison_i512(i512 %a0) nounwind { +; 
SSE-LABEL: test_cttz_poison_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx @@ -7030,7 +7030,7 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; SSE-NEXT: popq %r14 ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_undef_i512: +; AVX2-LABEL: test_cttz_poison_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx @@ -7077,7 +7077,7 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_cttz_undef_i512: +; AVX512F-LABEL: test_cttz_poison_i512: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %rcx, %xmm0 ; AVX512F-NEXT: vmovq %rdx, %xmm1 @@ -7102,7 +7102,7 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: test_cttz_undef_i512: +; AVX512POPCNT-LABEL: test_cttz_poison_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 ; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 @@ -7126,7 +7126,7 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: test_cttz_undef_i512: +; AVX512VL-LABEL: test_cttz_poison_i512: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq %rcx, %xmm0 ; AVX512VL-NEXT: vmovq %rdx, %xmm1 @@ -7152,7 +7152,7 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: test_cttz_undef_i512: +; AVX512VLPOPCNT-LABEL: test_cttz_poison_i512: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovq %rcx, %xmm0 ; AVX512VLPOPCNT-NEXT: vmovq %rdx, %xmm1 @@ -7181,8 +7181,8 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ret i32 %res } -define i32 @load_cttz_undef_i512(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_undef_i512: +define i32 @load_cttz_poison_i512(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_poison_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbx ; SSE-NEXT: movdqa (%rdi), %xmm0 @@ -7226,7 +7226,7 @@ define i32 
@load_cttz_undef_i512(ptr %p0) nounwind { ; SSE-NEXT: popq %rbx ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_undef_i512: +; AVX2-LABEL: load_cttz_poison_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: movq 40(%rdi), %rcx ; AVX2-NEXT: movq 32(%rdi), %rdx @@ -7273,7 +7273,7 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_cttz_undef_i512: +; AVX512F-LABEL: load_cttz_poison_i512: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 @@ -7287,7 +7287,7 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: load_cttz_undef_i512: +; AVX512POPCNT-LABEL: load_cttz_poison_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 @@ -7300,7 +7300,7 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: load_cttz_undef_i512: +; AVX512VL-LABEL: load_cttz_poison_i512: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 @@ -7315,7 +7315,7 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: load_cttz_undef_i512: +; AVX512VLPOPCNT-LABEL: load_cttz_poison_i512: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512VLPOPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 @@ -7334,8 +7334,8 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ret i32 %res } -define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { -; SSE-LABEL: vector_cttz_undef_i512: +define i32 @vector_cttz_poison_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_poison_i512: ; SSE: # %bb.0: ; SSE-NEXT: pextrq $1, %xmm3, %rax ; SSE-NEXT: pextrq $1, %xmm2, %rdx @@ -7378,7 
+7378,7 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: vector_cttz_undef_i512: +; AVX2-LABEL: vector_cttz_poison_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax @@ -7427,7 +7427,7 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vector_cttz_undef_i512: +; AVX512F-LABEL: vector_cttz_poison_i512: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 @@ -7440,7 +7440,7 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: vector_cttz_undef_i512: +; AVX512POPCNT-LABEL: vector_cttz_poison_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 @@ -7452,7 +7452,7 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: vector_cttz_undef_i512: +; AVX512VL-LABEL: vector_cttz_poison_i512: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 @@ -7466,7 +7466,7 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: vector_cttz_undef_i512: +; AVX512VLPOPCNT-LABEL: vector_cttz_poison_i512: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512VLPOPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 @@ -7484,8 +7484,8 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ret i32 %res } -define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { -; SSE-LABEL: test_cttz_undef_i1024: +define i32 @test_cttz_poison_i1024(i1024 %a0) nounwind { +; SSE-LABEL: 
test_cttz_poison_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -7588,7 +7588,7 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_undef_i1024: +; AVX2-LABEL: test_cttz_poison_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -7707,7 +7707,7 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_cttz_undef_i1024: +; AVX512F-LABEL: test_cttz_poison_i1024: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %rcx, %xmm0 ; AVX512F-NEXT: vmovq %rdx, %xmm1 @@ -7749,7 +7749,7 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; AVX512F-NEXT: cmovnel %r10d, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: test_cttz_undef_i1024: +; AVX512POPCNT-LABEL: test_cttz_poison_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 ; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 @@ -7791,7 +7791,7 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; AVX512POPCNT-NEXT: cmovnel %r10d, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: test_cttz_undef_i1024: +; AVX512VL-LABEL: test_cttz_poison_i1024: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq %rcx, %xmm0 ; AVX512VL-NEXT: vmovq %rdx, %xmm1 @@ -7834,7 +7834,7 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: test_cttz_undef_i1024: +; AVX512VLPOPCNT-LABEL: test_cttz_poison_i1024: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovq %rcx, %xmm0 ; AVX512VLPOPCNT-NEXT: vmovq %rdx, %xmm1 @@ -7881,8 +7881,8 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ret i32 %res } -define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_undef_i1024: +define i32 @load_cttz_poison_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_poison_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbx ; SSE-NEXT: movq 104(%rdi), %rcx @@ -7971,7 
+7971,7 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; SSE-NEXT: popq %rbx ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_undef_i1024: +; AVX2-LABEL: load_cttz_poison_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: movq 104(%rdi), %rcx @@ -8071,7 +8071,7 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_cttz_undef_i1024: +; AVX512F-LABEL: load_cttz_poison_i1024: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 @@ -8097,7 +8097,7 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; AVX512F-NEXT: cmovnel %ecx, %eax ; AVX512F-NEXT: retq ; -; AVX512POPCNT-LABEL: load_cttz_undef_i1024: +; AVX512POPCNT-LABEL: load_cttz_poison_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm1 @@ -8123,7 +8123,7 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: cmovnel %ecx, %eax ; AVX512POPCNT-NEXT: retq ; -; AVX512VL-LABEL: load_cttz_undef_i1024: +; AVX512VL-LABEL: load_cttz_poison_i1024: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm1 @@ -8150,7 +8150,7 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VLPOPCNT-LABEL: load_cttz_undef_i1024: +; AVX512VLPOPCNT-LABEL: load_cttz_poison_i1024: ; AVX512VLPOPCNT: # %bb.0: ; AVX512VLPOPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm1 diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll index 03b003f52ffd6..56b0ceb34fe18 100644 --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -110,8 +110,8 @@ define <4 x i32> @freeze_abs_vec(<4 x i32> %a0) nounwind { ret <4 x i32> %r } -define i32 @freeze_abs_undef(i32 %a0) nounwind { -; X86-LABEL: freeze_abs_undef: +define 
i32 @freeze_abs_poison(i32 %a0) nounwind { +; X86-LABEL: freeze_abs_poison: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax @@ -119,7 +119,7 @@ define i32 @freeze_abs_undef(i32 %a0) nounwind { ; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: freeze_abs_undef: +; X64-LABEL: freeze_abs_poison: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: negl %eax @@ -131,8 +131,8 @@ define i32 @freeze_abs_undef(i32 %a0) nounwind { ret i32 %r } -define <4 x i32> @freeze_abs_undef_vec(<4 x i32> %a0) nounwind { -; X86-LABEL: freeze_abs_undef_vec: +define <4 x i32> @freeze_abs_poison_vec(<4 x i32> %a0) nounwind { +; X86-LABEL: freeze_abs_poison_vec: ; X86: # %bb.0: ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $31, %xmm1 @@ -140,7 +140,7 @@ define <4 x i32> @freeze_abs_undef_vec(<4 x i32> %a0) nounwind { ; X86-NEXT: psubd %xmm1, %xmm0 ; X86-NEXT: retl ; -; X64-LABEL: freeze_abs_undef_vec: +; X64-LABEL: freeze_abs_poison_vec: ; X64: # %bb.0: ; X64-NEXT: pabsd %xmm0, %xmm0 ; X64-NEXT: retq @@ -228,8 +228,8 @@ define i32 @freeze_ctlz(i32 %a0) nounwind { ret i32 %r } -define i32 @freeze_ctlz_undef(i32 %a0) nounwind { -; X86-LABEL: freeze_ctlz_undef: +define i32 @freeze_ctlz_poison(i32 %a0) nounwind { +; X86-LABEL: freeze_ctlz_poison: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bsrl %eax, %ecx @@ -239,7 +239,7 @@ define i32 @freeze_ctlz_undef(i32 %a0) nounwind { ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: freeze_ctlz_undef: +; X64-LABEL: freeze_ctlz_poison: ; X64: # %bb.0: ; X64-NEXT: bsrl %edi, %ecx ; X64-NEXT: xorl $31, %ecx @@ -254,8 +254,8 @@ define i32 @freeze_ctlz_undef(i32 %a0) nounwind { ret i32 %r } -define i32 @freeze_ctlz_undef_nonzero(i32 %a0) nounwind { -; X86-LABEL: freeze_ctlz_undef_nonzero: +define i32 @freeze_ctlz_poison_nonzero(i32 %a0) nounwind { +; X86-LABEL: freeze_ctlz_poison_nonzero: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl $1, 
%eax @@ -263,7 +263,7 @@ define i32 @freeze_ctlz_undef_nonzero(i32 %a0) nounwind { ; X86-NEXT: xorl $31, %eax ; X86-NEXT: retl ; -; X64-LABEL: freeze_ctlz_undef_nonzero: +; X64-LABEL: freeze_ctlz_poison_nonzero: ; X64: # %bb.0: ; X64-NEXT: orl $1, %edi ; X64-NEXT: bsrl %edi, %eax @@ -297,8 +297,8 @@ define i32 @freeze_cttz(i32 %a0) nounwind { ret i32 %r } -define i32 @freeze_cttz_undef(i32 %a0) nounwind { -; X86-LABEL: freeze_cttz_undef: +define i32 @freeze_cttz_poison(i32 %a0) nounwind { +; X86-LABEL: freeze_cttz_poison: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bsfl %eax, %ecx @@ -306,7 +306,7 @@ define i32 @freeze_cttz_undef(i32 %a0) nounwind { ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: freeze_cttz_undef: +; X64-LABEL: freeze_cttz_poison: ; X64: # %bb.0: ; X64-NEXT: bsfl %edi, %ecx ; X64-NEXT: movl $32, %eax @@ -319,15 +319,15 @@ define i32 @freeze_cttz_undef(i32 %a0) nounwind { ret i32 %r } -define i32 @freeze_cttz_undef_nonzero(i32 %a0) nounwind { -; X86-LABEL: freeze_cttz_undef_nonzero: +define i32 @freeze_cttz_poison_nonzero(i32 %a0) nounwind { +; X86-LABEL: freeze_cttz_poison_nonzero: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl $1, %eax ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl ; -; X64-LABEL: freeze_cttz_undef_nonzero: +; X64-LABEL: freeze_cttz_poison_nonzero: ; X64: # %bb.0: ; X64-NEXT: orl $1, %edi ; X64-NEXT: rep bsfl %edi, %eax diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index 910f1375ed1ca..496b97f9e17e1 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64 ;; Use cttz to test if we properly prove never-zero. 
There is a very -;; simple transform from cttz -> cttz_zero_undef if its operand is +;; simple transform from cttz -> cttz_zero_poison if its operand is ;; known never zero. define i32 @or_known_nonzero(i32 %x) { diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 56001468898e4..6635d9a7030dc 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -670,11 +670,11 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> } ; -; CTLZ_ZERO_UNDEF +; CTLZ_ZERO_POISON ; -define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { -; SSE42-LABEL: widen_ctlz_undef_v2i32_v4i32: +define <4 x i32> @widen_ctlz_poison_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { +; SSE42-LABEL: widen_ctlz_poison_v2i32_v4i32: ; SSE42: # %bb.0: ; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 @@ -723,7 +723,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE42-NEXT: retq ; -; AVX2-LABEL: widen_ctlz_undef_v2i32_v4i32: +; AVX2-LABEL: widen_ctlz_poison_v2i32_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 @@ -765,7 +765,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: retq ; -; AVX512-LABEL: widen_ctlz_undef_v2i32_v4i32: +; AVX512-LABEL: widen_ctlz_poison_v2i32_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vplzcntd %xmm0, %xmm0 @@ -776,8 +776,8 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ret <4 x i32> %res } -define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { -; SSE42-LABEL: widen_ctlz_undef_v4i32_v8i32: +define <8 x 
i32> @widen_ctlz_poison_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { +; SSE42-LABEL: widen_ctlz_poison_v4i32_v8i32: ; SSE42: # %bb.0: ; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 @@ -825,7 +825,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-NEXT: paddd %xmm6, %xmm1 ; SSE42-NEXT: retq ; -; AVX2-LABEL: widen_ctlz_undef_v4i32_v8i32: +; AVX2-LABEL: widen_ctlz_poison_v4i32_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -851,7 +851,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: widen_ctlz_undef_v4i32_v8i32: +; AVX512-LABEL: widen_ctlz_poison_v4i32_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -863,8 +863,8 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ret <8 x i32> %res } -define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { -; SSE42-LABEL: widen_ctlz_undef_v2i32_v8i32: +define <8 x i32> @widen_ctlz_poison_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { +; SSE42-LABEL: widen_ctlz_poison_v2i32_v8i32: ; SSE42: # %bb.0: ; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm5, %xmm8 @@ -956,7 +956,7 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE42-NEXT: retq ; -; AVX2-LABEL: widen_ctlz_undef_v2i32_v8i32: +; AVX2-LABEL: widen_ctlz_poison_v2i32_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 @@ -1003,7 +1003,7 @@ define <8 x i32> 
@widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: retq ; -; AVX512-LABEL: widen_ctlz_undef_v2i32_v8i32: +; AVX512-LABEL: widen_ctlz_poison_v2i32_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 @@ -1353,11 +1353,11 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> } ; -; CTTZ_ZERO_UNDEF +; CTTZ_ZERO_POISON ; -define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { -; SSE42-LABEL: widen_cttz_undef_v2i32_v4i32: +define <4 x i32> @widen_cttz_poison_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { +; SSE42-LABEL: widen_cttz_poison_v2i32_v4i32: ; SSE42: # %bb.0: ; SSE42-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE42-NEXT: movdqa %xmm0, %xmm3 @@ -1392,7 +1392,7 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-NEXT: packuswb %xmm1, %xmm0 ; SSE42-NEXT: retq ; -; AVX2-LABEL: widen_cttz_undef_v2i32_v4i32: +; AVX2-LABEL: widen_cttz_poison_v2i32_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm3 @@ -1421,7 +1421,7 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: widen_cttz_undef_v2i32_v4i32: +; AVX512VL-LABEL: widen_cttz_poison_v2i32_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm3 @@ -1434,7 +1434,7 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX512VL-NEXT: vpsubd %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: retq ; -; AVX512VPOPCNT-LABEL: widen_cttz_undef_v2i32_v4i32: +; AVX512VPOPCNT-LABEL: widen_cttz_poison_v2i32_v4i32: ; AVX512VPOPCNT: # %bb.0: ; AVX512VPOPCNT-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512VPOPCNT-NEXT: vpaddd %xmm2, %xmm0, %xmm3 
@@ -1450,8 +1450,8 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ret <4 x i32> %res } -define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { -; SSE42-LABEL: widen_cttz_undef_v4i32_v8i32: +define <8 x i32> @widen_cttz_poison_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { +; SSE42-LABEL: widen_cttz_poison_v4i32_v8i32: ; SSE42: # %bb.0: ; SSE42-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE42-NEXT: movdqa %xmm0, %xmm2 @@ -1491,7 +1491,7 @@ define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-NEXT: packuswb %xmm2, %xmm1 ; SSE42-NEXT: retq ; -; AVX2-LABEL: widen_cttz_undef_v4i32_v8i32: +; AVX2-LABEL: widen_cttz_poison_v4i32_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -1515,7 +1515,7 @@ define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: widen_cttz_undef_v4i32_v8i32: +; AVX512VL-LABEL: widen_cttz_poison_v4i32_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -1527,7 +1527,7 @@ define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX512VL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; -; AVX512VPOPCNT-LABEL: widen_cttz_undef_v4i32_v8i32: +; AVX512VPOPCNT-LABEL: widen_cttz_poison_v4i32_v8i32: ; AVX512VPOPCNT: # %bb.0: ; AVX512VPOPCNT-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -1542,8 +1542,8 @@ define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ret <8 x i32> %res } -define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { -; SSE42-LABEL: widen_cttz_undef_v2i32_v8i32: +define <8 x i32> @widen_cttz_poison_v2i32_v8i32(<2 x i32> %a0, <2 x 
i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { +; SSE42-LABEL: widen_cttz_poison_v2i32_v8i32: ; SSE42: # %bb.0: ; SSE42-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm4 @@ -1607,7 +1607,7 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; SSE42-NEXT: packuswb %xmm2, %xmm1 ; SSE42-NEXT: retq ; -; AVX2-LABEL: widen_cttz_undef_v2i32_v8i32: +; AVX2-LABEL: widen_cttz_poison_v2i32_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 @@ -1641,7 +1641,7 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: widen_cttz_undef_v2i32_v8i32: +; AVX512VL-LABEL: widen_cttz_poison_v2i32_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 @@ -1658,7 +1658,7 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX512VL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; -; AVX512VPOPCNT-LABEL: widen_cttz_undef_v2i32_v8i32: +; AVX512VPOPCNT-LABEL: widen_cttz_poison_v2i32_v8i32: ; AVX512VPOPCNT: # %bb.0: ; AVX512VPOPCNT-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VPOPCNT-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 diff --git a/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll index 2524a333556e2..737ab07f35a44 100644 --- a/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll @@ -41,8 +41,8 @@ define i32 @test_cttz_i32(i32 %a) nounwind { ret i32 %tmp } -define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { -; XTENSA-LABEL: test_cttz_i32_zero_undef: +define i32 @test_cttz_i32_zero_poison(i32 %a) nounwind { +; XTENSA-LABEL: test_cttz_i32_zero_poison: ; XTENSA: # %bb.0: ; XTENSA-NEXT: movi a8, -1 ; XTENSA-NEXT: xor a8, a2, a8 @@ -114,8 
+114,8 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ret i32 %tmp } -define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { -; XTENSA-LABEL: test_ctlz_i32_zero_undef: +define i32 @test_ctlz_i32_zero_poison(i32 %a) nounwind { +; XTENSA-LABEL: test_ctlz_i32_zero_poison: ; XTENSA: # %bb.0: ; XTENSA-NEXT: srli a8, a2, 1 ; XTENSA-NEXT: or a8, a2, a8 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll index 8b92eb4c59cc5..f47ff46073d53 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll @@ -237,7 +237,7 @@ define <2 x i64> @test_insertqi_call_constexpr(<2 x i64> %x) { } ; The result of this insert is the second arg, since the top 64 bits of -; the result are poisonined, and we copy the bottom 64 bits from the +; the result are poisoned, and we copy the bottom 64 bits from the ; second arg define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) { ; CHECK-LABEL: @testInsert64Bits( diff --git a/llvm/test/Transforms/InstCombine/add2.ll b/llvm/test/Transforms/InstCombine/add2.ll index 2b4ad85bfd995..565718f9665b2 100644 --- a/llvm/test/Transforms/InstCombine/add2.ll +++ b/llvm/test/Transforms/InstCombine/add2.ll @@ -345,7 +345,7 @@ define i16 @add_cttz(i16 %a) { ; CHECK-NEXT: [[B:%.*]] = or disjoint i16 [[CTTZ]], -8 ; CHECK-NEXT: ret i16 [[B]] ; - ; llvm.cttz.i16(..., /*is_zero_undefined=*/true) implies the value returned + ; llvm.cttz.i16(..., /*is_zero_poison=*/true) implies the value returned ; is in [0, 16). The range metadata indicates the value returned is in [0, 8). ; Intersecting these ranges, we know the value returned is in [0, 8). 
; Therefore, InstCombine will transform @@ -367,7 +367,7 @@ define i16 @add_cttz_2(i16 %a) { ; CHECK-NEXT: [[B:%.*]] = or disjoint i16 [[CTTZ]], -16 ; CHECK-NEXT: ret i16 [[B]] ; - ; llvm.cttz.i16(..., /*is_zero_undefined=*/true) implies the value returned + ; llvm.cttz.i16(..., /*is_zero_poison=*/true) implies the value returned ; is in [0, 16). The range metadata indicates the value returned is in ; [0, 32). Intersecting these ranges, we know the value returned is in ; [0, 16). Therefore, InstCombine will transform diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll index b3291e7058896..7af67faab1d47 100644 --- a/llvm/test/Transforms/InstCombine/cttz.ll +++ b/llvm/test/Transforms/InstCombine/cttz.ll @@ -6,8 +6,8 @@ declare i32 @llvm.ctlz.i32(i32, i1) declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) declare void @use(i32) -define i32 @cttz_zext_zero_undef(i16 %x) { -; CHECK-LABEL: @cttz_zext_zero_undef( +define i32 @cttz_zext_zero_poison(i16 %x) { +; CHECK-LABEL: @cttz_zext_zero_poison( ; CHECK-NEXT: [[TMP1:%.*]] = call range(i16 0, 17) i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true) ; CHECK-NEXT: [[TZ:%.*]] = zext nneg i16 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[TZ]] @@ -28,8 +28,8 @@ define i32 @cttz_zext_zero_def(i16 %x) { ret i32 %tz } -define i32 @cttz_zext_zero_undef_extra_use(i16 %x) { -; CHECK-LABEL: @cttz_zext_zero_undef_extra_use( +define i32 @cttz_zext_zero_poison_extra_use(i16 %x) { +; CHECK-LABEL: @cttz_zext_zero_poison_extra_use( ; CHECK-NEXT: [[Z:%.*]] = zext i16 [[X:%.*]] to i32 ; CHECK-NEXT: call void @use(i32 [[Z]]) ; CHECK-NEXT: [[TZ:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[Z]], i1 true) @@ -41,8 +41,8 @@ define i32 @cttz_zext_zero_undef_extra_use(i16 %x) { ret i32 %tz } -define <2 x i64> @cttz_zext_zero_undef_vec(<2 x i32> %x) { -; CHECK-LABEL: @cttz_zext_zero_undef_vec( +define <2 x i64> @cttz_zext_zero_poison_vec(<2 x i32> %x) { +; CHECK-LABEL: @cttz_zext_zero_poison_vec( ; CHECK-NEXT: 
[[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[X:%.*]], i1 true) ; CHECK-NEXT: [[TZ:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[TZ]] @@ -63,8 +63,8 @@ define <2 x i64> @cttz_zext_zero_def_vec(<2 x i32> %x) { ret <2 x i64> %tz } -define i32 @cttz_sext_zero_undef(i16 %x) { -; CHECK-LABEL: @cttz_sext_zero_undef( +define i32 @cttz_sext_zero_poison(i16 %x) { +; CHECK-LABEL: @cttz_sext_zero_poison( ; CHECK-NEXT: [[TMP1:%.*]] = call range(i16 0, 17) i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true) ; CHECK-NEXT: [[TZ:%.*]] = zext nneg i16 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[TZ]] @@ -85,8 +85,8 @@ define i32 @cttz_sext_zero_def(i16 %x) { ret i32 %tz } -define i32 @cttz_sext_zero_undef_extra_use(i16 %x) { -; CHECK-LABEL: @cttz_sext_zero_undef_extra_use( +define i32 @cttz_sext_zero_poison_extra_use(i16 %x) { +; CHECK-LABEL: @cttz_sext_zero_poison_extra_use( ; CHECK-NEXT: [[S:%.*]] = sext i16 [[X:%.*]] to i32 ; CHECK-NEXT: call void @use(i32 [[S]]) ; CHECK-NEXT: [[TZ:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[S]], i1 true) @@ -98,8 +98,8 @@ define i32 @cttz_sext_zero_undef_extra_use(i16 %x) { ret i32 %tz } -define <2 x i64> @cttz_sext_zero_undef_vec(<2 x i32> %x) { -; CHECK-LABEL: @cttz_sext_zero_undef_vec( +define <2 x i64> @cttz_sext_zero_poison_vec(<2 x i32> %x) { +; CHECK-LABEL: @cttz_sext_zero_poison_vec( ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[X:%.*]], i1 true) ; CHECK-NEXT: [[TZ:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[TZ]] diff --git a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll index 0cb3aeaa14b3e..c1afdfef97a1c 100644 --- a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll +++ b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll @@ -3,7 +3,7 @@ ; This test is to verify that the instruction combiner is able to fold ; a 
cttz/ctlz followed by a icmp + select into a single cttz/ctlz with -; the 'is_zero_undef' flag cleared. +; the 'is_zero_poison' flag cleared. define i16 @test1(i16 %x) { ; CHECK-LABEL: @test1( diff --git a/llvm/test/Transforms/InstCombine/umin_cttz_ctlz.ll b/llvm/test/Transforms/InstCombine/umin_cttz_ctlz.ll index a3da6ae25ae4f..910dd9f14bc2c 100644 --- a/llvm/test/Transforms/InstCombine/umin_cttz_ctlz.ll +++ b/llvm/test/Transforms/InstCombine/umin_cttz_ctlz.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -define i8 @umin_cttz_i8_zero_undefined(i8 %X) { -; CHECK-LABEL: define i8 @umin_cttz_i8_zero_undefined( +define i8 @umin_cttz_i8_zero_poisoned(i8 %X) { +; CHECK-LABEL: define i8 @umin_cttz_i8_zero_poisoned( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[X]], 64 ; CHECK-NEXT: [[RET:%.*]] = call range(i8 0, 7) i8 @llvm.cttz.i8(i8 [[TMP1]], i1 true) @@ -25,8 +25,8 @@ define i8 @umin_cttz_i8_zero_defined(i8 %X) { ret i8 %ret } -define i8 @umin_cttz_i8_commuted_zero_undefined(i8 %X) { -; CHECK-LABEL: define i8 @umin_cttz_i8_commuted_zero_undefined( +define i8 @umin_cttz_i8_commuted_zero_poisoned(i8 %X) { +; CHECK-LABEL: define i8 @umin_cttz_i8_commuted_zero_poisoned( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[X]], 64 ; CHECK-NEXT: [[RET:%.*]] = call range(i8 0, 7) i8 @llvm.cttz.i8(i8 [[TMP1]], i1 true) @@ -37,8 +37,8 @@ define i8 @umin_cttz_i8_commuted_zero_undefined(i8 %X) { ret i8 %ret } -define i8 @umin_cttz_i8_negative_ge_bitwidth_zero_undefined(i8 %X) { -; CHECK-LABEL: define i8 @umin_cttz_i8_negative_ge_bitwidth_zero_undefined( +define i8 @umin_cttz_i8_negative_ge_bitwidth_zero_poisoned(i8 %X) { +; CHECK-LABEL: define i8 @umin_cttz_i8_negative_ge_bitwidth_zero_poisoned( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[CTTZ:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[X]], i1 true) ; CHECK-NEXT: ret 
i8 [[CTTZ]] @@ -48,8 +48,8 @@ define i8 @umin_cttz_i8_negative_ge_bitwidth_zero_undefined(i8 %X) { ret i8 %ret } -define i16 @umin_cttz_i16_zero_undefined(i16 %X) { -; CHECK-LABEL: define i16 @umin_cttz_i16_zero_undefined( +define i16 @umin_cttz_i16_zero_poisoned(i16 %X) { +; CHECK-LABEL: define i16 @umin_cttz_i16_zero_poisoned( ; CHECK-SAME: i16 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i16 [[X]], 64 ; CHECK-NEXT: [[RET:%.*]] = call range(i16 0, 7) i16 @llvm.cttz.i16(i16 [[TMP1]], i1 true) @@ -60,8 +60,8 @@ define i16 @umin_cttz_i16_zero_undefined(i16 %X) { ret i16 %ret } -define i32 @umin_cttz_i32_zero_undefined(i32 %X) { -; CHECK-LABEL: define i32 @umin_cttz_i32_zero_undefined( +define i32 @umin_cttz_i32_zero_poisoned(i32 %X) { +; CHECK-LABEL: define i32 @umin_cttz_i32_zero_poisoned( ; CHECK-SAME: i32 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[X]], 64 ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 7) i32 @llvm.cttz.i32(i32 [[TMP1]], i1 true) @@ -72,8 +72,8 @@ define i32 @umin_cttz_i32_zero_undefined(i32 %X) { ret i32 %ret } -define i64 @umin_cttz_i64_zero_undefined(i64 %X) { -; CHECK-LABEL: define i64 @umin_cttz_i64_zero_undefined( +define i64 @umin_cttz_i64_zero_poisoned(i64 %X) { +; CHECK-LABEL: define i64 @umin_cttz_i64_zero_poisoned( ; CHECK-SAME: i64 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[X]], 64 ; CHECK-NEXT: [[RET:%.*]] = call range(i64 0, 7) i64 @llvm.cttz.i64(i64 [[TMP1]], i1 true) @@ -84,8 +84,8 @@ define i64 @umin_cttz_i64_zero_undefined(i64 %X) { ret i64 %ret } -define i1 @umin_cttz_i1_zero_undefined(i1 %X) { -; CHECK-LABEL: define i1 @umin_cttz_i1_zero_undefined( +define i1 @umin_cttz_i1_zero_poisoned(i1 %X) { +; CHECK-LABEL: define i1 @umin_cttz_i1_zero_poisoned( ; CHECK-SAME: i1 [[X:%.*]]) { ; CHECK-NEXT: ret i1 false ; @@ -105,8 +105,8 @@ define i1 @umin_cttz_i1_zero_defined(i1 %X) { ret i1 %ret } -define <2 x i32> @umin_cttz_2xi32_splat_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> 
@umin_cttz_2xi32_splat_zero_undefined( +define <2 x i32> @umin_cttz_2xi32_splat_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_splat_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[X]], splat (i32 64) ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 7) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP1]], i1 true) @@ -117,8 +117,8 @@ define <2 x i32> @umin_cttz_2xi32_splat_zero_undefined(<2 x i32> %X) { ret <2 x i32> %ret } -define <2 x i32> @umin_cttz_2xi32_splat_poison_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_splat_poison_zero_undefined( +define <2 x i32> @umin_cttz_2xi32_splat_poison_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_splat_poison_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[X]], ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 7) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP1]], i1 true) @@ -129,8 +129,8 @@ define <2 x i32> @umin_cttz_2xi32_splat_poison_zero_undefined(<2 x i32> %X) { ret <2 x i32> %ret } -define <2 x i32> @umin_cttz_2xi32_no_splat_all_lt_bitwidth_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_no_splat_all_lt_bitwidth_zero_undefined( +define <2 x i32> @umin_cttz_2xi32_no_splat_all_lt_bitwidth_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_no_splat_all_lt_bitwidth_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[X]], ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP1]], i1 true) @@ -141,8 +141,8 @@ define <2 x i32> @umin_cttz_2xi32_no_splat_all_lt_bitwidth_zero_undefined(<2 x i ret <2 x i32> %ret } -define <2 x i32> @umin_cttz_2xi32_negative_no_splat_some_lt_bitwidth_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> 
@umin_cttz_2xi32_negative_no_splat_some_lt_bitwidth_zero_undefined( +define <2 x i32> @umin_cttz_2xi32_negative_no_splat_some_lt_bitwidth_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_negative_no_splat_some_lt_bitwidth_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[X]], i1 true) ; CHECK-NEXT: [[RET1:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[RET]], <2 x i32> ) @@ -153,8 +153,8 @@ define <2 x i32> @umin_cttz_2xi32_negative_no_splat_some_lt_bitwidth_zero_undefi ret <2 x i32> %ret } -define <2 x i32> @umin_cttz_2xi32_negative_no_splat_none_lt_bitwidth_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_negative_no_splat_none_lt_bitwidth_zero_undefined( +define <2 x i32> @umin_cttz_2xi32_negative_no_splat_none_lt_bitwidth_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_cttz_2xi32_negative_no_splat_none_lt_bitwidth_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[X]], i1 true) ; CHECK-NEXT: [[RET1:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[RET]], <2 x i32> ) @@ -191,8 +191,8 @@ define i16 @umin_cttz_i16_negative_two_uses(i16 %X) { ret i16 %ret } -define i8 @umin_ctlz_i8_zero_undefined(i8 %X) { -; CHECK-LABEL: define i8 @umin_ctlz_i8_zero_undefined( +define i8 @umin_ctlz_i8_zero_poisoned(i8 %X) { +; CHECK-LABEL: define i8 @umin_ctlz_i8_zero_poisoned( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[X]], 2 ; CHECK-NEXT: [[RET:%.*]] = call range(i8 0, 7) i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 true) @@ -215,8 +215,8 @@ define i8 @umin_ctlz_i8_zero_defined(i8 %X) { ret i8 %ret } -define i8 @umin_ctlz_i8_commuted_zero_undefined(i8 %X) { -; CHECK-LABEL: define i8 @umin_ctlz_i8_commuted_zero_undefined( +define i8 @umin_ctlz_i8_commuted_zero_poisoned(i8 %X) { +; CHECK-LABEL: define i8 
@umin_ctlz_i8_commuted_zero_poisoned( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[X]], 2 ; CHECK-NEXT: [[RET:%.*]] = call range(i8 0, 7) i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 true) @@ -227,8 +227,8 @@ define i8 @umin_ctlz_i8_commuted_zero_undefined(i8 %X) { ret i8 %ret } -define i8 @umin_ctlz_i8_negative_ge_bitwidth_zero_undefined(i8 %X) { -; CHECK-LABEL: define i8 @umin_ctlz_i8_negative_ge_bitwidth_zero_undefined( +define i8 @umin_ctlz_i8_negative_ge_bitwidth_zero_poisoned(i8 %X) { +; CHECK-LABEL: define i8 @umin_ctlz_i8_negative_ge_bitwidth_zero_poisoned( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[CTLZ:%.*]] = call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[X]], i1 true) ; CHECK-NEXT: ret i8 [[CTLZ]] @@ -238,8 +238,8 @@ define i8 @umin_ctlz_i8_negative_ge_bitwidth_zero_undefined(i8 %X) { ret i8 %ret } -define i16 @umin_ctlz_i16_zero_undefined(i16 %X) { -; CHECK-LABEL: define i16 @umin_ctlz_i16_zero_undefined( +define i16 @umin_ctlz_i16_zero_poisoned(i16 %X) { +; CHECK-LABEL: define i16 @umin_ctlz_i16_zero_poisoned( ; CHECK-SAME: i16 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i16 [[X]], 512 ; CHECK-NEXT: [[RET:%.*]] = call range(i16 0, 7) i16 @llvm.ctlz.i16(i16 [[TMP1]], i1 true) @@ -250,8 +250,8 @@ define i16 @umin_ctlz_i16_zero_undefined(i16 %X) { ret i16 %ret } -define i32 @umin_ctlz_i32_zero_undefined(i32 %X) { -; CHECK-LABEL: define i32 @umin_ctlz_i32_zero_undefined( +define i32 @umin_ctlz_i32_zero_poisoned(i32 %X) { +; CHECK-LABEL: define i32 @umin_ctlz_i32_zero_poisoned( ; CHECK-SAME: i32 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[X]], 33554432 ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 7) i32 @llvm.ctlz.i32(i32 [[TMP1]], i1 true) @@ -262,8 +262,8 @@ define i32 @umin_ctlz_i32_zero_undefined(i32 %X) { ret i32 %ret } -define i64 @umin_ctlz_i64_zero_undefined(i64 %X) { -; CHECK-LABEL: define i64 @umin_ctlz_i64_zero_undefined( +define i64 @umin_ctlz_i64_zero_poisoned(i64 %X) { +; CHECK-LABEL: define i64 @umin_ctlz_i64_zero_poisoned( 
; CHECK-SAME: i64 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[X]], 144115188075855872 ; CHECK-NEXT: [[RET:%.*]] = call range(i64 0, 7) i64 @llvm.ctlz.i64(i64 [[TMP1]], i1 true) @@ -274,8 +274,8 @@ define i64 @umin_ctlz_i64_zero_undefined(i64 %X) { ret i64 %ret } -define i1 @umin_ctlz_i1_zero_undefined(i1 %X) { -; CHECK-LABEL: define i1 @umin_ctlz_i1_zero_undefined( +define i1 @umin_ctlz_i1_zero_poisoned(i1 %X) { +; CHECK-LABEL: define i1 @umin_ctlz_i1_zero_poisoned( ; CHECK-SAME: i1 [[X:%.*]]) { ; CHECK-NEXT: ret i1 false ; @@ -295,8 +295,8 @@ define i1 @umin_ctlz_i1_zero_defined(i1 %X) { ret i1 %ret } -define <2 x i32> @umin_ctlz_2xi32_splat_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_splat_zero_undefined( +define <2 x i32> @umin_ctlz_2xi32_splat_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_splat_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[X]], splat (i32 33554432) ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 7) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP1]], i1 true) @@ -307,8 +307,8 @@ define <2 x i32> @umin_ctlz_2xi32_splat_zero_undefined(<2 x i32> %X) { ret <2 x i32> %ret } -define <2 x i32> @umin_ctlz_2xi32_splat_poison_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_splat_poison_zero_undefined( +define <2 x i32> @umin_ctlz_2xi32_splat_poison_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_splat_poison_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[X]], ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 7) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP1]], i1 true) @@ -319,8 +319,8 @@ define <2 x i32> @umin_ctlz_2xi32_splat_poison_zero_undefined(<2 x i32> %X) { ret <2 x i32> %ret } -define <2 x i32> @umin_ctlz_2xi32_no_splat_all_lt_bitwidth_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> 
@umin_ctlz_2xi32_no_splat_all_lt_bitwidth_zero_undefined( +define <2 x i32> @umin_ctlz_2xi32_no_splat_all_lt_bitwidth_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_no_splat_all_lt_bitwidth_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[X]], ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP1]], i1 true) @@ -331,8 +331,8 @@ define <2 x i32> @umin_ctlz_2xi32_no_splat_all_lt_bitwidth_zero_undefined(<2 x i ret <2 x i32> %ret } -define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_some_lt_bitwidth_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_some_lt_bitwidth_zero_undefined( +define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_some_lt_bitwidth_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_some_lt_bitwidth_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[X]], i1 true) ; CHECK-NEXT: [[RET1:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[RET]], <2 x i32> ) @@ -343,8 +343,8 @@ define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_some_lt_bitwidth_zero_undefi ret <2 x i32> %ret } -define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_none_lt_bitwidth_zero_undefined(<2 x i32> %X) { -; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_none_lt_bitwidth_zero_undefined( +define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_none_lt_bitwidth_zero_poisoned(<2 x i32> %X) { +; CHECK-LABEL: define <2 x i32> @umin_ctlz_2xi32_negative_no_splat_none_lt_bitwidth_zero_poisoned( ; CHECK-SAME: <2 x i32> [[X:%.*]]) { ; CHECK-NEXT: [[RET:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[X]], i1 true) ; CHECK-NEXT: [[RET1:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[RET]], <2 x i32> ) @@ -381,8 +381,8 @@ define i16 
@umin_ctlz_i16_negative_two_uses(i16 %X) { ret i16 %ret } -define i32 @umin_cttz_i32_zero_undef(i32 %X, i32 %Y) { -; CHECK-LABEL: define i32 @umin_cttz_i32_zero_undef( +define i32 @umin_cttz_i32_zero_poison(i32 %X, i32 %Y) { +; CHECK-LABEL: define i32 @umin_cttz_i32_zero_poison( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[X]], [[Y]] ; CHECK-NEXT: [[RES:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[TMP1]], i1 false) @@ -420,8 +420,8 @@ define i32 @umin_cttz_i32_zero_def_undef(i32 %X, i32 %Y) { ret i32 %res } -define i32 @umin_ctlz_i32_zero_undef(i32 %X, i32 %Y) { -; CHECK-LABEL: define i32 @umin_ctlz_i32_zero_undef( +define i32 @umin_ctlz_i32_zero_poison(i32 %X, i32 %Y) { +; CHECK-LABEL: define i32 @umin_ctlz_i32_zero_poison( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[X]], [[Y]] ; CHECK-NEXT: [[RES:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[TMP1]], i1 false) @@ -505,8 +505,8 @@ define i32 @neg_umin_cttz_ctlz_i32_zero_def(i32 %X, i32 %Y) { ret i32 %res } -define i32 @neg_umin_cttz_ctlz_i32_zero_undef(i32 %X, i32 %Y) { -; CHECK-LABEL: define i32 @neg_umin_cttz_ctlz_i32_zero_undef( +define i32 @neg_umin_cttz_ctlz_i32_zero_poison(i32 %X, i32 %Y) { +; CHECK-LABEL: define i32 @neg_umin_cttz_ctlz_i32_zero_poison( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[CTTZ_X:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 false) ; CHECK-NEXT: [[CTLZ_Y:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[Y]], i1 false) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 9bf2ade3176d6..1dd7a7fc2ba17 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -562,11 +562,11 @@ define void @ctlz_32i8() #0 { } ; -; CTLZ_ZERO_UNDEF +; CTLZ_ZERO_POISON ; -define void @ctlz_undef_2i64() #0 { -; SSE-LABEL: @ctlz_undef_2i64( +define void 
@ctlz_poison_2i64() #0 { +; SSE-LABEL: @ctlz_poison_2i64( ; SSE-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 8 ; SSE-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8 ; SSE-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) @@ -575,7 +575,7 @@ define void @ctlz_undef_2i64() #0 { ; SSE-NEXT: store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ctlz_undef_2i64( +; AVX1-LABEL: @ctlz_poison_2i64( ; AVX1-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 8 ; AVX1-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8 ; AVX1-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) @@ -584,7 +584,7 @@ define void @ctlz_undef_2i64() #0 { ; AVX1-NEXT: store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @ctlz_undef_2i64( +; AVX2-LABEL: @ctlz_poison_2i64( ; AVX2-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 8 ; AVX2-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8 ; AVX2-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true) @@ -593,7 +593,7 @@ define void @ctlz_undef_2i64() #0 { ; AVX2-NEXT: store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @ctlz_undef_2i64( +; AVX512-LABEL: @ctlz_poison_2i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 8 ; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 true) ; AVX512-NEXT: store <2 x i64> [[TMP2]], ptr @dst64, align 8 @@ -608,8 +608,8 @@ define void @ctlz_undef_2i64() #0 { ret void } -define void @ctlz_undef_4i64() #0 { -; SSE-LABEL: @ctlz_undef_4i64( +define void @ctlz_poison_4i64() #0 { +; SSE-LABEL: @ctlz_poison_4i64( ; SSE-NEXT: [[LD0:%.*]] 
= load i64, ptr @src64, align 4 ; SSE-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4 ; SSE-NEXT: [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4 @@ -624,7 +624,7 @@ define void @ctlz_undef_4i64() #0 { ; SSE-NEXT: store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ctlz_undef_4i64( +; AVX1-LABEL: @ctlz_poison_4i64( ; AVX1-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 4 ; AVX1-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4 ; AVX1-NEXT: [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4 @@ -639,7 +639,7 @@ define void @ctlz_undef_4i64() #0 { ; AVX1-NEXT: store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @ctlz_undef_4i64( +; AVX2-LABEL: @ctlz_poison_4i64( ; AVX2-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 4 ; AVX2-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4 ; AVX2-NEXT: [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4 @@ -654,7 +654,7 @@ define void @ctlz_undef_4i64() #0 { ; AVX2-NEXT: store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @ctlz_undef_4i64( +; AVX512-LABEL: @ctlz_poison_4i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4 ; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 true) ; AVX512-NEXT: store <4 x i64> [[TMP2]], ptr @dst64, align 4 @@ -675,8 +675,8 @@ define void @ctlz_undef_4i64() #0 { ret void } -define void @ctlz_undef_4i32() #0 { -; SSE-LABEL: @ctlz_undef_4i32( +define void @ctlz_poison_4i32() #0 { +; SSE-LABEL: 
@ctlz_poison_4i32( ; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 ; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 ; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 @@ -691,7 +691,7 @@ define void @ctlz_undef_4i32() #0 { ; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ctlz_undef_4i32( +; AVX1-LABEL: @ctlz_poison_4i32( ; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 ; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 ; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 @@ -706,13 +706,13 @@ define void @ctlz_undef_4i32() #0 { ; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @ctlz_undef_4i32( +; AVX2-LABEL: @ctlz_poison_4i32( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 ; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) ; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @ctlz_undef_4i32( +; AVX512-LABEL: @ctlz_poison_4i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 ; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) ; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 @@ -733,8 +733,8 @@ define void @ctlz_undef_4i32() #0 { ret void } -define void @ctlz_undef_8i32() #0 { -; SSE-LABEL: @ctlz_undef_8i32( +define void @ctlz_poison_8i32() #0 { +; SSE-LABEL: @ctlz_poison_8i32( ; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 ; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 ; SSE-NEXT: 
[[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 @@ -761,7 +761,7 @@ define void @ctlz_undef_8i32() #0 { ; SSE-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @ctlz_undef_8i32( +; AVX1-LABEL: @ctlz_poison_8i32( ; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 ; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 ; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 @@ -788,13 +788,13 @@ define void @ctlz_undef_8i32() #0 { ; AVX1-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @ctlz_undef_8i32( +; AVX2-LABEL: @ctlz_poison_8i32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 ; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true) ; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @ctlz_undef_8i32( +; AVX512-LABEL: @ctlz_poison_8i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true) ; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 @@ -827,8 +827,8 @@ define void @ctlz_undef_8i32() #0 { ret void } -define void @ctlz_undef_8i16() #0 { -; CHECK-LABEL: @ctlz_undef_8i16( +define void @ctlz_poison_8i16() #0 { +; CHECK-LABEL: @ctlz_poison_8i16( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2 ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) ; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr @dst16, align 2 @@ -861,8 +861,8 @@ define void @ctlz_undef_8i16() #0 { ret void } -define void @ctlz_undef_16i16() #0 { -; SSE-LABEL: @ctlz_undef_16i16( +define void 
@ctlz_poison_16i16() #0 { +; SSE-LABEL: @ctlz_poison_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2 ; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) ; SSE-NEXT: store <8 x i16> [[TMP2]], ptr @dst16, align 2 @@ -871,7 +871,7 @@ define void @ctlz_undef_16i16() #0 { ; SSE-NEXT: store <8 x i16> [[TMP4]], ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 8), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @ctlz_undef_16i16( +; AVX-LABEL: @ctlz_poison_16i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 2 ; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 true) ; AVX-NEXT: store <16 x i16> [[TMP2]], ptr @dst16, align 2 @@ -928,8 +928,8 @@ define void @ctlz_undef_16i16() #0 { ret void } -define void @ctlz_undef_16i8() #0 { -; CHECK-LABEL: @ctlz_undef_16i8( +define void @ctlz_poison_16i8() #0 { +; CHECK-LABEL: @ctlz_poison_16i8( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1 ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) ; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr @dst8, align 1 @@ -986,8 +986,8 @@ define void @ctlz_undef_16i8() #0 { ret void } -define void @ctlz_undef_32i8() #0 { -; SSE-LABEL: @ctlz_undef_32i8( +define void @ctlz_poison_32i8() #0 { +; SSE-LABEL: @ctlz_poison_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1 ; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) ; SSE-NEXT: store <16 x i8> [[TMP2]], ptr @dst8, align 1 @@ -996,7 +996,7 @@ define void @ctlz_undef_32i8() #0 { ; SSE-NEXT: store <16 x i8> [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1 ; SSE-NEXT: ret void ; -; AVX-LABEL: @ctlz_undef_32i8( +; AVX-LABEL: @ctlz_poison_32i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @src8, align 1 ; AVX-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> [[TMP1]], i1 true) ; AVX-NEXT: 
store <32 x i8> [[TMP2]], ptr @dst8, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll index 896be6f2fe213..688e22eb64fd3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -553,11 +553,11 @@ define void @cttz_32i8() #0 { } ; -; CTTZ_ZERO_UNDEF +; CTTZ_ZERO_POISON ; -define void @cttz_undef_2i64() #0 { -; SSE-LABEL: @cttz_undef_2i64( +define void @cttz_poison_2i64() #0 { +; SSE-LABEL: @cttz_poison_2i64( ; SSE-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 8 ; SSE-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8 ; SSE-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) @@ -566,7 +566,7 @@ define void @cttz_undef_2i64() #0 { ; SSE-NEXT: store i64 [[CTTZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @cttz_undef_2i64( +; AVX1-LABEL: @cttz_poison_2i64( ; AVX1-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 8 ; AVX1-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8 ; AVX1-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) @@ -575,7 +575,7 @@ define void @cttz_undef_2i64() #0 { ; AVX1-NEXT: store i64 [[CTTZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @cttz_undef_2i64( +; AVX2-LABEL: @cttz_poison_2i64( ; AVX2-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 8 ; AVX2-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8 ; AVX2-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true) @@ -584,7 +584,7 @@ define void @cttz_undef_2i64() #0 { ; AVX2-NEXT: store i64 [[CTTZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @cttz_undef_2i64( +; 
AVX512-LABEL: @cttz_poison_2i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 8 ; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[TMP1]], i1 true) ; AVX512-NEXT: store <2 x i64> [[TMP2]], ptr @dst64, align 8 @@ -599,8 +599,8 @@ define void @cttz_undef_2i64() #0 { ret void } -define void @cttz_undef_4i64() #0 { -; SSE-LABEL: @cttz_undef_4i64( +define void @cttz_poison_4i64() #0 { +; SSE-LABEL: @cttz_poison_4i64( ; SSE-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 4 ; SSE-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4 ; SSE-NEXT: [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4 @@ -615,7 +615,7 @@ define void @cttz_undef_4i64() #0 { ; SSE-NEXT: store i64 [[CTTZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @cttz_undef_4i64( +; AVX1-LABEL: @cttz_poison_4i64( ; AVX1-NEXT: [[LD0:%.*]] = load i64, ptr @src64, align 4 ; AVX1-NEXT: [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4 ; AVX1-NEXT: [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4 @@ -630,13 +630,13 @@ define void @cttz_undef_4i64() #0 { ; AVX1-NEXT: store i64 [[CTTZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @cttz_undef_4i64( +; AVX2-LABEL: @cttz_poison_4i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4 ; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> [[TMP1]], i1 true) ; AVX2-NEXT: store <4 x i64> [[TMP2]], ptr @dst64, align 4 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @cttz_undef_4i64( +; AVX512-LABEL: @cttz_poison_4i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4 ; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> [[TMP1]], i1 true) ; 
AVX512-NEXT: store <4 x i64> [[TMP2]], ptr @dst64, align 4 @@ -657,8 +657,8 @@ define void @cttz_undef_4i64() #0 { ret void } -define void @cttz_undef_4i32() #0 { -; SSE-LABEL: @cttz_undef_4i32( +define void @cttz_poison_4i32() #0 { +; SSE-LABEL: @cttz_poison_4i32( ; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 ; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 ; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 @@ -673,7 +673,7 @@ define void @cttz_undef_4i32() #0 { ; SSE-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @cttz_undef_4i32( +; AVX1-LABEL: @cttz_poison_4i32( ; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 ; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 ; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 @@ -688,13 +688,13 @@ define void @cttz_undef_4i32() #0 { ; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @cttz_undef_4i32( +; AVX2-LABEL: @cttz_poison_4i32( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 ; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) ; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @cttz_undef_4i32( +; AVX512-LABEL: @cttz_poison_4i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 ; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) ; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 @@ -715,8 +715,8 @@ define void @cttz_undef_4i32() #0 { ret void } -define void @cttz_undef_8i32() #0 { -; SSE-LABEL: @cttz_undef_8i32( +define 
void @cttz_poison_8i32() #0 { +; SSE-LABEL: @cttz_poison_8i32( ; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 ; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 ; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 @@ -743,7 +743,7 @@ define void @cttz_undef_8i32() #0 { ; SSE-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @cttz_undef_8i32( +; AVX1-LABEL: @cttz_poison_8i32( ; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 ; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 ; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 @@ -770,13 +770,13 @@ define void @cttz_undef_8i32() #0 { ; AVX1-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; AVX1-NEXT: ret void ; -; AVX2-LABEL: @cttz_undef_8i32( +; AVX2-LABEL: @cttz_poison_8i32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 ; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 true) ; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @cttz_undef_8i32( +; AVX512-LABEL: @cttz_poison_8i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 true) ; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 @@ -809,8 +809,8 @@ define void @cttz_undef_8i32() #0 { ret void } -define void @cttz_undef_8i16() #0 { -; CHECK-LABEL: @cttz_undef_8i16( +define void @cttz_poison_8i16() #0 { +; CHECK-LABEL: @cttz_poison_8i16( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2 ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> 
[[TMP1]], i1 true) ; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr @dst16, align 2 @@ -843,8 +843,8 @@ define void @cttz_undef_8i16() #0 { ret void } -define void @cttz_undef_16i16() #0 { -; SSE-LABEL: @cttz_undef_16i16( +define void @cttz_poison_16i16() #0 { +; SSE-LABEL: @cttz_poison_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2 ; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true) ; SSE-NEXT: store <8 x i16> [[TMP2]], ptr @dst16, align 2 @@ -853,7 +853,7 @@ define void @cttz_undef_16i16() #0 { ; SSE-NEXT: store <8 x i16> [[TMP4]], ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 8), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_undef_16i16( +; AVX-LABEL: @cttz_poison_16i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 2 ; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 true) ; AVX-NEXT: store <16 x i16> [[TMP2]], ptr @dst16, align 2 @@ -910,8 +910,8 @@ define void @cttz_undef_16i16() #0 { ret void } -define void @cttz_undef_16i8() #0 { -; CHECK-LABEL: @cttz_undef_16i8( +define void @cttz_poison_16i8() #0 { +; CHECK-LABEL: @cttz_poison_16i8( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1 ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true) ; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr @dst8, align 1 @@ -968,8 +968,8 @@ define void @cttz_undef_16i8() #0 { ret void } -define void @cttz_undef_32i8() #0 { -; SSE-LABEL: @cttz_undef_32i8( +define void @cttz_poison_32i8() #0 { +; SSE-LABEL: @cttz_poison_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1 ; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true) ; SSE-NEXT: store <16 x i8> [[TMP2]], ptr @dst8, align 1 @@ -978,7 +978,7 @@ define void @cttz_undef_32i8() #0 { ; SSE-NEXT: store <16 x i8> [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1 ; SSE-NEXT: ret void 
; -; AVX-LABEL: @cttz_undef_32i8( +; AVX-LABEL: @cttz_poison_32i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @src8, align 1 ; AVX-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> [[TMP1]], i1 true) ; AVX-NEXT: store <32 x i8> [[TMP2]], ptr @dst8, align 1 diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index cef2c59e26fb5..809d8880faaed 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -180,8 +180,8 @@ TEST_F(AArch64GISelMITest, LowerRotatesVector) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -// Test CTTZ expansion when CTTZ_ZERO_UNDEF is legal or custom, -// in which case it becomes CTTZ_ZERO_UNDEF with select. +// Test CTTZ expansion when CTTZ_ZERO_POISON is legal or custom, +// in which case it becomes CTTZ_ZERO_POISON with select. TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ0) { setUp(); if (!TM) @@ -189,7 +189,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ0) { // Declare your legalization info DefineLegalizerInfo(A, { - getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).legalFor({{s32, s64}}); + getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON).legalFor({{s32, s64}}); }); // Build Instr auto MIBCTTZ = @@ -202,7 +202,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ0) { Helper.lower(*MIBCTTZ, 0, LLT::scalar(64))); auto CheckStr = R"( - CHECK: [[CZU:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF %0 + CHECK: [[CZU:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON %0 CHECK: [[ZERO:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 CHECK: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), %0:_(s64), [[ZERO]] CHECK: [[SIXTY4:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 @@ -274,7 +274,7 @@ TEST_F(AArch64GISelMITest, NarrowScalarCTLZ) { CHECK: [[CTLZ_LO:%[0-9]+]]:_(s32) = G_CTLZ [[UNMERGE_LO]]:_(s32) CHECK: [[THIRTYTWO:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[CTLZ_LO]]:_, 
[[THIRTYTWO]]:_ - CHECK: [[CTLZ_HI:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UNMERGE_HI]]:_(s32) + CHECK: [[CTLZ_HI:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[UNMERGE_HI]]:_(s32) CHECK: %{{[0-9]+}}:_(s32) = G_SELECT [[CMP]]:_(i1), [[ADD]]:_, [[CTLZ_HI]]:_ )"; @@ -309,7 +309,7 @@ TEST_F(AArch64GISelMITest, NarrowScalarCTTZ) { CHECK: [[CTTZ_HI:%[0-9]+]]:_(s32) = G_CTTZ [[UNMERGE_HI]]:_(s32) CHECK: [[THIRTYTWO:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[CTTZ_HI]]:_, [[THIRTYTWO]]:_ - CHECK: [[CTTZ_LO:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UNMERGE_LO]]:_(s32) + CHECK: [[CTTZ_LO:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[UNMERGE_LO]]:_(s32) CHECK: %{{[0-9]+}}:_(s32) = G_SELECT [[CMP]]:_(s1), [[ADD]]:_, [[CTTZ_LO]]:_ )"; @@ -419,7 +419,7 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTPOP2) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -// CTTZ_ZERO_UNDEF expansion in terms of CTTZ +// CTTZ_ZERO_POISON expansion in terms of CTTZ TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ3) { setUp(); if (!TM) @@ -430,7 +430,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ3) { getActionDefinitionsBuilder(G_CTTZ).legalFor({{s64, s64}}); }); // Build - auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, + auto MIBCTTZ = B.buildInstr(TargetOpcode::G_CTTZ_ZERO_POISON, {LLT::scalar(64)}, {Copies[0]}); AInfo Info(MF->getSubtarget()); DummyGISelObserver Observer; @@ -446,7 +446,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ3) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -// CTLZ expansion in terms of CTLZ_ZERO_UNDEF +// CTLZ expansion in terms of CTLZ_ZERO_POISON TEST_F(AArch64GISelMITest, LowerBitCountingCTLZ0) { setUp(); if (!TM) @@ -454,7 +454,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTLZ0) { // Declare your legalization info DefineLegalizerInfo(A, { - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).legalFor({{s64, s64}}); + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON).legalFor({{s64, s64}}); }); // Build 
auto MIBCTLZ = @@ -466,7 +466,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTLZ0) { LegalizerHelper::LegalizeResult::Legalized); auto CheckStr = R"( - CHECK: [[CZU:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_UNDEF %0 + CHECK: [[CZU:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_POISON %0 CHECK: [[ZERO:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 CHECK: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), %0:_(s64), [[ZERO]] CHECK: [[SIXTY4:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 @@ -477,7 +477,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTLZ0) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -// CTLZ expansion in terms of CTLZ_ZERO_UNDEF if the latter is a libcall +// CTLZ expansion in terms of CTLZ_ZERO_POISON if the latter is a libcall TEST_F(AArch64GISelMITest, LowerBitCountingCTLZLibcall) { setUp(); if (!TM) @@ -485,7 +485,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTLZLibcall) { // Declare your legalization info DefineLegalizerInfo(A, { - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).libcallFor({{s32, s64}}); + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON).libcallFor({{s32, s64}}); }); // Build auto MIBCTLZ = @@ -497,7 +497,7 @@ TEST_F(AArch64GISelMITest, LowerBitCountingCTLZLibcall) { Helper.lower(*MIBCTLZ, 0, LLT::scalar(32))); auto CheckStr = R"( - CHECK: [[CZU:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF %0 + CHECK: [[CZU:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON %0 CHECK: [[ZERO:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 CHECK: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), %0:_(s64), [[ZERO]] CHECK: [[THIRTY2:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 @@ -584,15 +584,15 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTLZ) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -// CTLZ_ZERO_UNDEF widening. -TEST_F(AArch64GISelMITest, WidenBitCountingCTLZZeroUndef) { +// CTLZ_ZERO_POISON widening. 
+TEST_F(AArch64GISelMITest, WidenBitCountingCTLZZeroPoison) { setUp(); if (!TM) GTEST_SKIP(); // Declare your legalization info DefineLegalizerInfo(A, { - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).legalFor({{s16, s16}}); + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON).legalFor({{s16, s16}}); }); // Build // Trunc it to s8. @@ -600,7 +600,7 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTLZZeroUndef) { LLT s16{LLT::scalar(16)}; auto MIBTrunc = B.buildTrunc(s8, Copies[0]); auto MIBCTLZ_ZU = - B.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, {s8}, {MIBTrunc}); + B.buildInstr(TargetOpcode::G_CTLZ_ZERO_POISON, {s8}, {MIBTrunc}); AInfo Info(MF->getSubtarget()); DummyGISelObserver Observer; LegalizerHelper Helper(*MF, Info, Observer, B); @@ -612,7 +612,7 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTLZZeroUndef) { CHECK: [[Zext:%[0-9]+]]:_(s16) = G_ANYEXT [[Trunc]] CHECK: [[Cst8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 CHECK: [[Shl:%[0-9]+]]:_(s16) = G_SHL [[Zext]]:_, [[Cst8]]:_ - CHECK: [[CtlzZu:%[0-9]+]]:_(s16) = G_CTLZ_ZERO_UNDEF [[Shl]] + CHECK: [[CtlzZu:%[0-9]+]]:_(s16) = G_CTLZ_ZERO_POISON [[Shl]] CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC [[CtlzZu]] )"; @@ -653,33 +653,33 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTPOP) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -// CTTZ_ZERO_UNDEF widening. -TEST_F(AArch64GISelMITest, WidenBitCountingCTTZ_ZERO_UNDEF) { +// CTTZ_ZERO_POISON widening. +TEST_F(AArch64GISelMITest, WidenBitCountingCTTZ_ZERO_POISON) { setUp(); if (!TM) GTEST_SKIP(); // Declare your legalization info DefineLegalizerInfo(A, { - getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).legalFor({{s16, s16}}); + getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON).legalFor({{s16, s16}}); }); // Build // Trunc it to s8. 
LLT s8{LLT::scalar(8)}; LLT s16{LLT::scalar(16)}; auto MIBTrunc = B.buildTrunc(s8, Copies[0]); - auto MIBCTTZ_ZERO_UNDEF = - B.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, {s8}, {MIBTrunc}); + auto MIBCTTZ_ZERO_POISON = + B.buildInstr(TargetOpcode::G_CTTZ_ZERO_POISON, {s8}, {MIBTrunc}); AInfo Info(MF->getSubtarget()); DummyGISelObserver Observer; LegalizerHelper Helper(*MF, Info, Observer, B); - EXPECT_TRUE(Helper.widenScalar(*MIBCTTZ_ZERO_UNDEF, 1, s16) == + EXPECT_TRUE(Helper.widenScalar(*MIBCTTZ_ZERO_POISON, 1, s16) == LegalizerHelper::LegalizeResult::Legalized); auto CheckStr = R"( CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC CHECK: [[AnyExt:%[0-9]+]]:_(s16) = G_ANYEXT [[Trunc]] - CHECK: [[CttzZu:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_UNDEF [[AnyExt]] + CHECK: [[CttzZu:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_POISON [[AnyExt]] CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC [[CttzZu]] )"; @@ -714,7 +714,7 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTTZ) { CHECK: [[AnyExt:%[0-9]+]]:_(s16) = G_ANYEXT [[Trunc]] CHECK: [[Cst:%[0-9]+]]:_(s16) = G_CONSTANT i16 256 CHECK: [[Or:%[0-9]+]]:_(s16) = G_OR [[AnyExt]]:_, [[Cst]] - CHECK: [[Cttz:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_UNDEF [[Or]] + CHECK: [[Cttz:%[0-9]+]]:_(s16) = G_CTTZ_ZERO_POISON [[Or]] CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC [[Cttz]] )"; @@ -2310,14 +2310,14 @@ TEST_F(AArch64GISelMITest, LibcallURem) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(AArch64GISelMITest, LibcallCtlzZeroUndef) { +TEST_F(AArch64GISelMITest, LibcallCtlzZeroPoison) { setUp(); if (!TM) GTEST_SKIP(); // Declare your legalization info DefineLegalizerInfo(A, { - getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON) .libcallFor({{s32, s32}, {s64, s64}, {s128, s128}}); }); @@ -2328,11 +2328,11 @@ TEST_F(AArch64GISelMITest, LibcallCtlzZeroUndef) { auto MIBExt = B.buildAnyExt(S128, Copies[0]); auto MIBCtlz32 = - B.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, {S32}, {MIBTrunc}); + 
B.buildInstr(TargetOpcode::G_CTLZ_ZERO_POISON, {S32}, {MIBTrunc}); auto MIBCtlz64 = - B.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, {S64}, {Copies[0]}); + B.buildInstr(TargetOpcode::G_CTLZ_ZERO_POISON, {S64}, {Copies[0]}); auto MIBCtlz128 = - B.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, {S128}, {MIBExt}); + B.buildInstr(TargetOpcode::G_CTLZ_ZERO_POISON, {S128}, {MIBExt}); AInfo Info(MF->getSubtarget()); DummyGISelObserver Observer; diff --git a/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp b/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp index caff8ed761ab2..a9e3c81eceb2b 100644 --- a/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp @@ -222,18 +222,18 @@ TEST_F(AArch64GISelMITest, BuildBitCounts) { B.buildCTPOP(S32, Copies[0]); B.buildCTLZ(S32, Copies[0]); - B.buildCTLZ_ZERO_UNDEF(S32, Copies[1]); + B.buildCTLZ_ZERO_POISON(S32, Copies[1]); B.buildCTTZ(S32, Copies[0]); - B.buildCTTZ_ZERO_UNDEF(S32, Copies[1]); + B.buildCTTZ_ZERO_POISON(S32, Copies[1]); auto CheckStr = R"( ; CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 ; CHECK: [[CTPOP:%[0-9]+]]:_(s32) = G_CTPOP [[COPY0]]:_ ; CHECK: [[CTLZ0:%[0-9]+]]:_(s32) = G_CTLZ [[COPY0]]:_ - ; CHECK: [[CTLZ_UNDEF0:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY1]]:_ + ; CHECK: [[CTLZ_POISON0:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_POISON [[COPY1]]:_ ; CHECK: [[CTTZ:%[0-9]+]]:_(s32) = G_CTTZ [[COPY0]]:_ - ; CHECK: [[CTTZ_UNDEF0:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY1]]:_ + ; CHECK: [[CTTZ_POISON0:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_POISON [[COPY1]]:_ )"; EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; From d6f6cdaddadb9929dd6778697b3ea422acae2c08 Mon Sep 17 00:00:00 2001 From: Michal Paszkowski Date: Sun, 10 May 2026 02:38:34 -0700 Subject: [PATCH 171/538] [SPIR-V] Fix inttoptr type deduction with ptr.annotation (#189219) Opaque pointer inttoptr was recording ptr as a pointee type, so 
OpConvertUToPtr was emitted as pointer-to-pointer and then bitcasted back. Please see an example below. LLVM IR: ``` %p = inttoptr i64 %x to ptr addrspace(1) %a = call ptr addrspace(1) @llvm.ptr.annotation(... %p ...) call spir_func void @prefetch(ptr addrspace(1) %a, ...) ``` SPIR-V (before the change): ``` %p2 = OpConvertUToPtr %_ptr_CrossWorkgroup__ptr_CrossWorkgroup_uchar %x %p1 = OpBitcast %_ptr_CrossWorkgroup_uchar %p2 OpFunctionCall ... %p1 ... ``` Skip assigning pointee type for inttoptr when the destination is untyped, fallback later recovers the correct single pointer type. --- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 2 + .../inttoptr-no-double-pointer.ll | 37 +++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/inttoptr-no-double-pointer.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index c9b66ccfbdbd1..b6e71c7b76348 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -747,6 +747,8 @@ void SPIRVEmitIntrinsics::maybeAssignPtrType(Type *&Ty, Value *Op, Type *RefTy, if (!UnknownElemTypeI8) return; insertTodoType(Op); + if (isa(Op)) + return; } Ty = RefTy; } diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/inttoptr-no-double-pointer.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/inttoptr-no-double-pointer.ll new file mode 100644 index 0000000000000..ded7d19a4dc50 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/inttoptr-no-double-pointer.ll @@ -0,0 +1,37 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls,+SPV_INTEL_variable_length_array %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls,+SPV_INTEL_variable_length_array 
%s -o - -filetype=obj | spirv-val %} + +; Check that inttoptr followed by ptr.annotation does not produce a +; double-pointer type for OpConvertUToPtr (and there is no OpBitcast back to +; single-pointer type). + +; CHECK-DAG: %[[#UCHAR:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR]] +; CHECK: %[[#CONV:]] = OpConvertUToPtr %[[#PTR]] +; CHECK-NOT: OpBitcast %[[#PTR]] %[[#CONV]] +; CHECK: OpFunctionCall %[[#]] %[[#]] %[[#CONV]] + +@.str.file = private unnamed_addr addrspace(1) constant [1 x i8] zeroinitializer, section "llvm.metadata", align 1 +@.str.cachecontrol = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\220,1\22}\00", section "llvm.metadata", align 1 +@.str.cachecontrol.1 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\221,1\22}\00", section "llvm.metadata", align 1 + +declare spir_func void @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(ptr addrspace(1) nonnull, i32, i32, i32, <2 x i32>) #0 + +define spir_kernel void @test_inttoptr_annotation(ptr addrspace(1) %base) { +entry: + %int = ptrtoint ptr addrspace(1) %base to i64 + %vec = insertelement <4 x i64> zeroinitializer, i64 %int, i64 0 + %cast = bitcast <4 x i64> %vec to <8 x i32> + %recast = bitcast <8 x i32> %cast to <4 x i64> + %extracted = extractelement <4 x i64> %recast, i64 0 + %int2ptr = inttoptr i64 %extracted to ptr addrspace(1) + %coords = insertelement <2 x i32> zeroinitializer, i32 42, i32 1 + %a1 = call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %int2ptr, ptr addrspace(1) @.str.cachecontrol, ptr addrspace(1) @.str.file, i32 0, ptr addrspace(1) null) + %a2 = call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %a1, ptr addrspace(1) @.str.cachecontrol.1, ptr addrspace(1) @.str.file, i32 0, ptr addrspace(1) null) + call spir_func void @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(ptr addrspace(1) nonnull %a2, i32 8192, i32 4096, i32 8192, <2 x i32> %coords) 
#0 + ret void +} + +declare ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1)) + +attributes #0 = { nounwind memory(argmem: read) } From c522ad0a41c73ef2a2bdbae401a9c480c697548e Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Sun, 10 May 2026 13:54:30 +0300 Subject: [PATCH 172/538] [clang-tidy] Fix FP in readability-container-size-empty with compairing to unrelated type (#190535) Fixes https://github.com/llvm/llvm-project/issues/162287. --- .../readability/ContainerSizeEmptyCheck.cpp | 28 ++++--- clang-tools-extra/docs/ReleaseNotes.rst | 3 + .../readability/container-size-empty.cpp | 74 +++++++++++++++++++ 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp index 2101fd2248e8a..cbad3d244d841 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp @@ -144,15 +144,18 @@ void ContainerSizeEmptyCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } void ContainerSizeEmptyCheck::registerMatchers(MatchFinder *Finder) { - const auto ValidContainerRecord = cxxRecordDecl(isSameOrDerivedFrom(namedDecl( - has(cxxMethodDecl( - isConst(), parameterCountIs(0), isPublic(), - hasAnyName("size", "length"), - returns(qualType(isIntegralType(), unless(booleanType())))) - .bind("size")), - has(cxxMethodDecl(isConst(), parameterCountIs(0), isPublic(), - hasName("empty"), returns(booleanType())) - .bind("empty"))))); + const auto ValidContainerRecord = + cxxRecordDecl( + isSameOrDerivedFrom(namedDecl( + has(cxxMethodDecl(isConst(), parameterCountIs(0), isPublic(), + hasAnyName("size", "length"), + returns(qualType(isIntegralType(), + unless(booleanType())))) + .bind("size")), + has(cxxMethodDecl(isConst(), parameterCountIs(0), isPublic(), + hasName("empty"), 
returns(booleanType())) + .bind("empty"))))) + .bind("ContainerDecl"); const auto ValidContainerNonTemplateType = qualType(hasUnqualifiedDesugaredType( @@ -235,6 +238,13 @@ void ContainerSizeEmptyCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( binaryOperation( hasAnyOperatorName("==", "!="), hasOperands(WrongComparend, STLArg), + unless(hasEitherOperand(cxxConstructExpr( + argumentCountIs(0), + unless(hasType(qualType(hasCanonicalType(hasDeclaration( + // 'equalsBoundNode' needs the 'ContainerDecl' binding + // from 'STLArg' to already exist, so this constraint must + // appear after 'hasOperands' matcher + namedDecl(equalsBoundNode("ContainerDecl")))))))))), unless(allOf(hasLHS(hasType(ExcludedComparisonTypesMatcher)), hasRHS(hasType(SameExcludedComparisonTypesMatcher)))), NotInEmptyMethodOfContainer) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index be5315ac3c181..0d3adedeea0f8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -541,6 +541,9 @@ Changes in existing checks - Reduce verbosity by removing the note indicating source location of the ``empty`` function. + - Fixed a false positive with suggesting ``empty`` when comparing a container + to a default-constructed object of an unrelated type. 
+ - Improved :doc:`readability-convert-member-functions-to-static ` check: diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp index 2b8b3261ac765..cd2eebb16138b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp @@ -954,3 +954,77 @@ struct DestructorUser { } }; } + +namespace GH162287 { + +struct Label { + virtual ~Label(); +}; +bool operator==(std::string, const Label&); +bool operator==(std::string, std::vector); +bool operator==(const Label&, std::string); + +void testUnrelatedType() { + std::string s{"aa"}; + if (s == Label{}) + ; + if (s == Label()) + ; + if (s == std::vector{}) + ; + if (Label() == s) + ; +} + +void testValid() { + std::string s{"aa"}; + std::vector v; + Container c; + + // CHECK-MESSAGES: :[[@LINE+2]]:7: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: if (s.empty()) + if (s == std::string{}) + ; + // CHECK-MESSAGES: :[[@LINE+2]]:7: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: if (s.empty()) + if (std::string() == s) + ; + // CHECK-MESSAGES: :[[@LINE+2]]:7: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: if (c.empty()) + if (c == Container()) + ; + Container *p = nullptr; + // CHECK-MESSAGES: :[[@LINE+2]]:7: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: if (p->empty()) + if (*p == Container()) + ; + using MyString = std::string; + MyString ms{"aa"}; + // CHECK-MESSAGES: :[[@LINE+2]]:7: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + 
// CHECK-FIXES: if (ms.empty()) + if (ms == std::string()) + ; + bool b1 = s == Label(); + // CHECK-MESSAGES: :[[@LINE+2]]:13: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: bool b2 = c.empty(); + bool b2 = c == Container(); + // CHECK-MESSAGES: :[[@LINE+2]]:7: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: if (v.empty()) + if (v == std::vector()) + ; +} + +template +bool testUnrelatedInTemplate(std::string s) { + return s == Label{}; +} +template bool testUnrelatedInTemplate(std::string); + +template +bool testDependentValidContainer(TemplatedContainer c) { + return c == TemplatedContainer(); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: the 'empty' method should be used to check for emptiness instead of comparing to an empty object + // CHECK-FIXES: return c.empty(); +} +template bool testDependentValidContainer(TemplatedContainer); +} From 79fa36f9ceeb325b4c946a5a52b3c8775d7e7ddd Mon Sep 17 00:00:00 2001 From: Ayush Sahay Date: Sun, 10 May 2026 18:30:18 +0530 Subject: [PATCH 173/538] [lldb][Windows] Invalidate cached register values on thread stop (#192430) Invalidate cached values in register context data structures on every thread stop. NativeRegisterContextRegisterInfo::InvalidateAllRegisters performs no operation by default. Subclasses may override it to clear cached values within their register context data structures whenever a thread stops. This change intends to set up the necessary infrastructure to support caching of the thread context in NativeRegisterContextWindows_arm64, which will improve read performance. Currently, the thread context is retrieved for every read or write operation. 
--- lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux.h | 3 --- .../Process/Utility/NativeRegisterContextRegisterInfo.h | 3 +++ .../Plugins/Process/Windows/Common/NativeThreadWindows.cpp | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux.h index 08f19d374e280..797eacc3bdd0f 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux.h @@ -37,9 +37,6 @@ class NativeRegisterContextLinux // Determine the architecture of the thread given by its ID. static llvm::Expected DetermineArchitecture(lldb::tid_t tid); - // Invalidates cached values in register context data structures - virtual void InvalidateAllRegisters(){} - struct SyscallData { /// The syscall instruction. If the architecture uses software /// single-stepping, the instruction should also be followed by a trap to diff --git a/lldb/source/Plugins/Process/Utility/NativeRegisterContextRegisterInfo.h b/lldb/source/Plugins/Process/Utility/NativeRegisterContextRegisterInfo.h index 0e96841fd9091..58a2a4843e4ec 100644 --- a/lldb/source/Plugins/Process/Utility/NativeRegisterContextRegisterInfo.h +++ b/lldb/source/Plugins/Process/Utility/NativeRegisterContextRegisterInfo.h @@ -33,6 +33,9 @@ class NativeRegisterContextRegisterInfo : public NativeRegisterContext { const RegisterInfoInterface &GetRegisterInfoInterface() const; + // Invalidate cached values in register context data structures. 
+ virtual void InvalidateAllRegisters() {} + protected: std::unique_ptr m_register_info_interface_up; }; diff --git a/lldb/source/Plugins/Process/Windows/Common/NativeThreadWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/NativeThreadWindows.cpp index 8ad59cd1ece88..d76e9a51fdace 100644 --- a/lldb/source/Plugins/Process/Windows/Common/NativeThreadWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/NativeThreadWindows.cpp @@ -38,6 +38,9 @@ Status NativeThreadWindows::DoStop() { if (previous_suspend_count == (DWORD)-1) return Status(::GetLastError(), eErrorTypeWin32); + // Invalidate cached register values on every stop. + GetRegisterContext().InvalidateAllRegisters(); + m_state = eStateStopped; } return Status(); From 2b2635558939edcbfbb19dac04cd299aeee18fb1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 10 May 2026 14:21:28 +0100 Subject: [PATCH 174/538] Revert "[VectorCombine] foldShuffleChainsToReduce - add support for partial vector reductions" (#196796) Reverts llvm/llvm-project#195119 while reported assertions are investigated. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 39 +++------------ .../X86/horizontal-reduce-smax.ll | 40 ++++++++++++--- .../X86/horizontal-reduce-smin.ll | 40 ++++++++++++--- .../X86/horizontal-reduce-umax.ll | 40 ++++++++++++--- .../X86/horizontal-reduce-umin.ll | 40 ++++++++++++--- .../fold-shuffle-chains-to-reduce.ll | 50 ------------------- 6 files changed, 134 insertions(+), 115 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 10ad3a71c73de..5ba344ea9a808 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3995,8 +3995,6 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { InstWorklist.push(VecOpEE); - bool IsPartialReduction = false; - while (!InstWorklist.empty()) { Value *CI = InstWorklist.front(); InstWorklist.pop(); @@ -4127,19 +4125,12 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { ShouldBeCallOrBinInst ^= 1; } else { - // Check if this is a partial reduction - the chain ended because - // the source vector is not a recognized op/shuffle. - if (ShouldBeCallOrBinInst && VisitedCnt >= 1 && CI == PrevVecV[0]) { - IsPartialReduction = true; - break; - } return false; } } - // Full reduction pattern should end with a shuffle op. - // Partial reduction ends when the source vector is reached. - if (ShouldBeCallOrBinInst && !IsPartialReduction) + // Pattern should end with a shuffle op. 
+ if (ShouldBeCallOrBinInst) return false; assert(VecSize != -1 && "Expected Match for Vector Size"); @@ -4156,32 +4147,14 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) { if (!ReducedOp) return false; - InstructionCost NewCost = 0; - FixedVectorType *ReduceVecTy = FinalVecVTy; - SmallVector ExtractMask; - - if (IsPartialReduction) { - unsigned SubVecSize = ShuffleMaskHalf; - ReduceVecTy = FixedVectorType::get(FVT->getElementType(), SubVecSize); - ExtractMask.resize(SubVecSize); - std::iota(ExtractMask.begin(), ExtractMask.end(), 0); - NewCost += - TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - ReduceVecTy, FinalVecVTy, ExtractMask, CostKind, 0); - } - - IntrinsicCostAttributes ICA(ReducedOp, ReduceVecTy, {ReduceVecTy}); - NewCost += TTI.getIntrinsicInstrCost(ICA, CostKind); + IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV}); + InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind); if (NewCost >= OrigCost) return false; - Value *ReduceInput = FinalVecV; - if (IsPartialReduction) - ReduceInput = Builder.CreateShuffleVector(FinalVecV, ExtractMask); - - auto *ReducedResult = Builder.CreateIntrinsic( - ReducedOp, {ReduceInput->getType()}, {ReduceInput}); + auto *ReducedResult = + Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV}); replaceValue(I, *ReducedResult); return true; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll index ec8cd82b96a37..85186dba0891f 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smax.ll @@ -314,8 +314,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = 
tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -333,8 +338,13 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, 
<32 x i32> @@ -352,8 +362,15 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -374,8 +391,15 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> +; 
CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll index 650947d240ace..80c2929b5d5cf 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-smin.ll @@ -314,8 +314,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) +; CHECK-NEXT: 
[[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -333,8 +338,13 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -352,8 +362,15 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail 
call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -374,8 +391,15 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git 
a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll index f7d5a99bc0da0..dbb448c4b96e5 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umax.ll @@ -314,8 +314,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -333,8 +338,13 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.umax.v32i16(<32 x 
i16> [[A0]], <32 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -352,8 +362,15 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -374,8 +391,15 @@ define i8 
@test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[TMP2]], <64 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll index e2fc523dd271c..bd2366d49a951 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horizontal-reduce-umin.ll @@ -314,8 +314,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v16i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: 
[[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[A0]], <16 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> @@ -333,8 +338,13 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; CHECK-LABEL: @test_reduce_v32i16_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[A0]], <32 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[TMP2]], <32 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[TMP4]], <32 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <32 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: ret i16 [[TMP10]] ; %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> @@ -352,8 +362,15 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> 
%a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v32i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[A0]], <32 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> @@ -374,8 +391,15 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; CHECK-LABEL: @test_reduce_v64i8_v16i8( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[A0]], <64 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[TMP2]], <64 x 
i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[TMP4]], <64 x i8> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[TMP6]], <64 x i8> [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[TMP8]], i64 0 ; CHECK-NEXT: ret i8 [[TMP13]] ; %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll index 71809534016d1..403ce33b5344e 100644 --- a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll +++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -193,53 +193,3 @@ define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) { %7 = extractelement <6 x i16> %6, i64 0 ret i16 %7 } - -; Partial reduction: reduce lower 8 elements of a 16-element vector using smax. 
-define i16 @test_partial_reduce_v16i16_v8i16_smax(<16 x i16> %a0) { -; CHECK-LABEL: define i16 @test_partial_reduce_v16i16_v8i16_smax( -; CHECK-SAME: <16 x i16> [[A0:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[TMP1]]) -; CHECK-NEXT: ret i16 [[TMP2]] -; - %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> - %2 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %a0, <16 x i16> %1) - %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> - %4 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %2, <16 x i16> %3) - %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> - %6 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %4, <16 x i16> %5) - %7 = extractelement <16 x i16> %6, i64 0 - ret i16 %7 -} - -; Partial reduction: reduce lower 4 elements of an 8-element vector using add. -define i32 @test_partial_reduce_v8i32_v4i32_add(<8 x i32> %a0) { -; CHECK-LABEL: define i32 @test_partial_reduce_v8i32_v4i32_add( -; CHECK-SAME: <8 x i32> [[A0:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) -; CHECK-NEXT: ret i32 [[TMP2]] -; - %1 = shufflevector <8 x i32> %a0, <8 x i32> poison, <8 x i32> - %2 = add <8 x i32> %a0, %1 - %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <8 x i32> - %4 = add <8 x i32> %2, %3 - %5 = extractelement <8 x i32> %4, i64 0 - ret i32 %5 -} - -; Partial reduction: reduce lower 4 elements of a 16-element vector using umin. 
-define i16 @test_partial_reduce_v16i16_v4i16_umin(<16 x i16> %a0) { -; CHECK-LABEL: define i16 @test_partial_reduce_v16i16_v4i16_umin( -; CHECK-SAME: <16 x i16> [[A0:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[TMP1]]) -; CHECK-NEXT: ret i16 [[TMP2]] -; - %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> - %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1) - %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> - %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3) - %5 = extractelement <16 x i16> %4, i64 0 - ret i16 %5 -} From 3de25f551c1df8a70131758b482dca4f4575343f Mon Sep 17 00:00:00 2001 From: NeKon69 Date: Sun, 10 May 2026 16:37:15 +0300 Subject: [PATCH 175/538] [LifetimeSafety] Warn on incorrectly placed `[[clang::lifetimebound]]` attributes (#196144) Adds new warning that is emitted when parameter is marked as `[[clang::lifetimebound]]` but is not returned in one way or another (tracked via `OriginEscapeFact`). Closes #182935 --- clang/docs/LifetimeSafety.rst | 1 + .../Analyses/LifetimeSafety/LifetimeSafety.h | 4 + clang/include/clang/Basic/DiagnosticGroups.td | 12 ++- .../clang/Basic/DiagnosticSemaKinds.td | 5 ++ clang/lib/Analysis/LifetimeSafety/Checker.cpp | 27 +++++- clang/lib/Sema/SemaLifetimeSafety.h | 39 +++++--- .../warn-lifetime-safety-lifetimebound.cpp | 89 +++++++++++++++++++ 7 files changed, 160 insertions(+), 17 deletions(-) create mode 100644 clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp diff --git a/clang/docs/LifetimeSafety.rst b/clang/docs/LifetimeSafety.rst index c71816dd75a82..db166db0637ca 100644 --- a/clang/docs/LifetimeSafety.rst +++ b/clang/docs/LifetimeSafety.rst @@ -467,6 +467,7 @@ enables only the high-confidence subset of these checks. 
* ``-Wlifetime-safety-validations``: Enables checks that validate existing lifetime annotations. * ``-Wlifetime-safety-noescape``: Warns when a parameter marked with ``[[clang::noescape]]`` escapes the function. + * ``-Wlifetime-safety-lifetimebound-violation``: Warns when the analysis cannot verify that the return value can be lifetime bound to a parameter marked with ``[[clang::lifetimebound]]``. Limitations =========== diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index d20ac87a7c8d9..7ccf30ba14987 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -108,6 +108,10 @@ class LifetimeSafetySemaHelper { virtual void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape, const VarDecl *EscapeGlobal) {} + // Reports misuse of [[clang::lifetimebound]] when parameter doesn't escape + // through return. + virtual void reportLifetimeboundViolation(const ParmVarDecl *VD) {} + // Suggests lifetime bound annotations for implicit this. virtual void suggestLifetimeboundToImplicitThis(SuggestionScope Scope, const CXXMethodDecl *MD, diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 2b3055d6d6bdd..03d423db9d21a 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -624,6 +624,13 @@ Warning to detect invalidation of references. }]; } +def LifetimeSafetyLifetimeboundViolation : DiagGroup<"lifetime-safety-lifetimebound-violation"> { + code Documentation = [{ +Detects parameters marked as [[clang::lifetimebound]] for which the analysis could not verify that the return value can be lifetime bound to the parameter. +This warning may produce false-positive diagnostics when it cannot fully model the code. 
+ }]; +} + def LifetimeSafetyPermissive : DiagGroup<"lifetime-safety-permissive", [LifetimeSafetyUseAfterScope, LifetimeSafetyReturnStackAddr, @@ -666,10 +673,11 @@ Detects misuse of [[clang::noescape]] annotation where the parameter escapes (fo } def LifetimeSafetyValidations : DiagGroup<"lifetime-safety-validations", - [LifetimeSafetyNoescape]> { + [LifetimeSafetyNoescape, + LifetimeSafetyLifetimeboundViolation]> { code Documentation = [{ Verify function implementations adhere to the annotated lifetime contracts through lifetime safety -like verifying [[clang::noescape]] and [[clang::lifetimebound]] (upcoming). +like verifying [[clang::noescape]] and [[clang::lifetimebound]]. }]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c638c23f24bb5..879812f3de0d3 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11007,6 +11007,11 @@ def warn_lifetime_safety_dangling_global_moved InGroup, DefaultIgnore; +def warn_lifetime_safety_param_lifetimebound_violation + : Warning<"could not verify that the return value can be lifetime bound to %select{an unnamed parameter|'%1'}0">, + InGroup, + DefaultIgnore; + def note_lifetime_safety_used_here : Note<"later used here">; def note_lifetime_safety_invalidated_here : Note<"invalidated here">; def note_lifetime_safety_destroyed_here : Note<"destroyed here">; diff --git a/clang/lib/Analysis/LifetimeSafety/Checker.cpp b/clang/lib/Analysis/LifetimeSafety/Checker.cpp index 4ae90cf751ec3..fc77ed3097602 100644 --- a/clang/lib/Analysis/LifetimeSafety/Checker.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Checker.cpp @@ -60,6 +60,7 @@ class LifetimeChecker { llvm::DenseMap FinalWarningsMap; llvm::DenseMap AnnotationWarningsMap; llvm::DenseMap NoescapeWarningsMap; + llvm::DenseSet VerifiedLiftimeboundEscapes; const LoanPropagationAnalysis &LoanPropagation; const MovedLoansAnalysis &MovedLoans; const 
LiveOriginsAnalysis &LiveOrigins; @@ -101,6 +102,7 @@ class LifetimeChecker { issuePendingWarnings(); suggestAnnotations(); reportNoescapeViolations(); + reportLifetimeboundViolations(); // Annotation inference is currently guarded by a frontend flag. In the // future, this might be replaced by a design that differentiates between // explicit and inferred findings with separate warning groups. @@ -129,9 +131,12 @@ class LifetimeChecker { // obscures the lifetime relationship (e.g., shared_ptr from unique_ptr). if (IsMoved) return; - // Otherwise, suggest lifetimebound for parameter escaping through return - // or a field in constructor. - if (!PVD->hasAttr()) { + if (PVD->hasAttr()) { + // Track that this lifetimebound parameter correctly escapes. + VerifiedLiftimeboundEscapes.insert(PVD); + } else { + // Otherwise, suggest lifetimebound for parameter escaping through + // return or a field in constructor. if (auto *ReturnEsc = dyn_cast(OEF)) AnnotationWarningsMap.try_emplace(PVD, ReturnEsc->getReturnExpr()); else if (auto *FieldEsc = dyn_cast(OEF); @@ -358,6 +363,22 @@ class LifetimeChecker { } } + void reportLifetimeboundViolations() { + if (!isa(FD)) + return; + for (const ParmVarDecl *PVD : cast(FD)->parameters()) { + if (!PVD->hasAttr()) + continue; + bool isImplicit = PVD->getAttr()->isImplicit(); + bool Escapes = VerifiedLiftimeboundEscapes.contains(PVD); + assert((!isImplicit || Escapes || isInStlNamespace(FD)) && + "Implicit lifetimebound parameters " + "should escape through return"); + if (!isImplicit && !Escapes) + SemaHelper->reportLifetimeboundViolation(PVD); + } + } + void inferAnnotations() { for (auto [Target, EscapeTarget] : AnnotationWarningsMap) { if (const auto *MD = Target.dyn_cast()) { diff --git a/clang/lib/Sema/SemaLifetimeSafety.h b/clang/lib/Sema/SemaLifetimeSafety.h index 92e7b5cf14ae5..5b1cf41445399 100644 --- a/clang/lib/Sema/SemaLifetimeSafety.h +++ b/clang/lib/Sema/SemaLifetimeSafety.h @@ -24,18 +24,24 @@ namespace clang::lifetimes { 
inline bool IsLifetimeSafetyDiagnosticEnabled(Sema &S, const Decl *D) { DiagnosticsEngine &Diags = S.getDiagnostics(); - return !Diags.isIgnored(diag::warn_lifetime_safety_use_after_scope, - D->getBeginLoc()) || - !Diags.isIgnored(diag::warn_lifetime_safety_use_after_scope_moved, - D->getBeginLoc()) || - !Diags.isIgnored(diag::warn_lifetime_safety_return_stack_addr, - D->getBeginLoc()) || - !Diags.isIgnored(diag::warn_lifetime_safety_return_stack_addr_moved, - D->getBeginLoc()) || - !Diags.isIgnored(diag::warn_lifetime_safety_invalidation, - D->getBeginLoc()) || - !Diags.isIgnored(diag::warn_lifetime_safety_noescape_escapes, - D->getBeginLoc()); + constexpr unsigned DiagIDs[] = { + diag::warn_lifetime_safety_use_after_scope, + diag::warn_lifetime_safety_use_after_scope_moved, + diag::warn_lifetime_safety_use_after_free, + diag::warn_lifetime_safety_return_stack_addr, + diag::warn_lifetime_safety_return_stack_addr_moved, + diag::warn_lifetime_safety_invalidation, + diag::warn_lifetime_safety_dangling_field, + diag::warn_lifetime_safety_dangling_field_moved, + diag::warn_lifetime_safety_dangling_global, + diag::warn_lifetime_safety_dangling_global_moved, + diag::warn_lifetime_safety_noescape_escapes, + diag::warn_lifetime_safety_param_lifetimebound_violation, + }; + for (unsigned DiagID : DiagIDs) + if (!Diags.isIgnored(DiagID, D->getBeginLoc())) + return true; + return false; } class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper { @@ -172,6 +178,15 @@ class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper { << EscapeField->getSourceRange(); } + void reportLifetimeboundViolation( + const ParmVarDecl *ParmWithLifetimebound) override { + StringRef ParamName = ParmWithLifetimebound->getName(); + bool HasName = ParamName.size() > 0; + S.Diag(ParmWithLifetimebound->getLocation(), + diag::warn_lifetime_safety_param_lifetimebound_violation) + << HasName << ParamName << ParmWithLifetimebound->getSourceRange(); + } + void 
suggestLifetimeboundToImplicitThis(SuggestionScope Scope, const CXXMethodDecl *MD, const Expr *EscapeExpr) override { diff --git a/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp b/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp new file mode 100644 index 0000000000000..941a3bb8ce1e3 --- /dev/null +++ b/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp @@ -0,0 +1,89 @@ +// RUN: %clang_cc1 -fsyntax-only -Wlifetime-safety-lifetimebound-violation -verify %s + +#include "Inputs/lifetime-analysis.h" + +struct [[gsl::Owner]] MyObj { + int id; + ~MyObj() {} // Non-trivial destructor +}; + +struct [[gsl::Pointer()]] View { + View(const MyObj &); // Borrows from MyObj + View(); + void use() const; +}; + +bool cond(); + +View not_lb(const MyObj &obj); + +View lb(const MyObj &obj [[clang::lifetimebound]]); + +View return_through_unannotated_passthrough( + const MyObj &obj [[clang::lifetimebound]]) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} + return not_lb(obj); +} + +View return_through_lifetimebound_passthrough( + const MyObj &obj [[clang::lifetimebound]]) { + return lb(obj); +} + +View lose_lb(const MyObj &obj [[clang::lifetimebound]]) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} + return not_lb(obj); +} + +View return_through_alias(const MyObj &obj [[clang::lifetimebound]]) { + const MyObj &alias = obj; + return alias; +} + +View return_alias_through_unannotated_passthrough( + const MyObj &obj [[clang::lifetimebound]]) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} + const MyObj &alias = obj; + return not_lb(alias); +} + +View not_lb_view(View v); + +View lb_view(View v [[clang::lifetimebound]]); + + +View return_through_two_lifetimebound_calls( + const MyObj &obj [[clang::lifetimebound]]) { + return lb_view(lb(obj)); +} + +View return_through_nested_broken_chain( + const MyObj &obj 
[[clang::lifetimebound]]) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} + return not_lb_view(lb_view(View(obj))); +} + +View return_constructed_view_through_unannotated_forwarder( + const MyObj &obj [[clang::lifetimebound]]) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} + return not_lb_view(View(obj)); +} + +View return_constructed_view_through_lifetimebound_forwarder( + const MyObj &obj [[clang::lifetimebound]]) { + return lb_view(View(obj)); +} + +View verify_each_annotated_param_independently( + const MyObj &a [[clang::lifetimebound]], + const MyObj &b [[clang::lifetimebound]], // expected-warning {{could not verify that the return value can be lifetime bound to 'b'}} + const MyObj &c [[clang::lifetimebound]]) { // expected-warning {{could not verify that the return value can be lifetime bound to 'c'}} + return cond() ? a : not_lb(b); +} + +View unnamed_lifetimebound_param( + [[clang::lifetimebound]] const MyObj &) { // expected-warning {{could not verify that the return value can be lifetime bound to an unnamed parameter}} + return View(); +} + +// FIXME: Should warn on declaration, not definition +View annotated_decl_but_not_def_not_returned(const MyObj &obj [[clang::lifetimebound]]); + +View annotated_decl_but_not_def_not_returned(const MyObj &obj) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} + return not_lb(obj); +} From e7ea2127e37425fe738075328ce586b8b2f7811c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 10 May 2026 10:19:13 -0400 Subject: [PATCH 176/538] [libc] Fix -Wshadow warnings in freetrie.h (#196529) --- libc/src/__support/freetrie.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/freetrie.h b/libc/src/__support/freetrie.h index 42363c2c9e2f4..c1a8306c6f8d2 100644 --- a/libc/src/__support/freetrie.h +++ b/libc/src/__support/freetrie.h @@ -87,9 +87,9 @@ 
class FreeTrie { /// Sets the range of possible block sizes. This can only be called when the /// trie is empty. - LIBC_INLINE void set_range(FreeTrie::SizeRange range) { + LIBC_INLINE void set_range(FreeTrie::SizeRange new_range) { LIBC_ASSERT(empty() && "cannot change the range of a preexisting trie"); - this->range = range; + range = new_range; } /// @returns Whether the trie contains any blocks. From b2f37f4b0edf928dadfaf8d756d94e3c028ee25a Mon Sep 17 00:00:00 2001 From: Eugene Shalygin Date: Sun, 10 May 2026 16:29:39 +0200 Subject: [PATCH 177/538] clang-format: ensure ternary operands are aligned (#196697) Set ParentState::AlignedTo for ternary operands. --- clang/lib/Format/ContinuationIndenter.cpp | 16 ++++++++++++++++ clang/unittests/Format/AlignmentTest.cpp | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index b2f799bb33b01..485fe382bda3a 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1276,6 +1276,22 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, CurrentState.BreakBeforeParameter = false; CurrentState.AlignedTo = &Current; } + if (Style.AlignOperands != FormatStyle::OAS_DontAlign && + Current.is(TT_ConditionalExpr)) { + switch (Style.AlignOperands) { + case FormatStyle::OAS_Align: + CurrentState.AlignedTo = Current.is(tok::question) + ? 
Current.getPrevious(tok::equal) + : Current.getPrevious(tok::question); + break; + case FormatStyle::OAS_AlignAfterOperator: + if (Current.is(tok::colon)) + CurrentState.AlignedTo = Current.getPrevious(tok::question); + break; + case FormatStyle::OAS_DontAlign: + break; + } + } if (!DryRun) { unsigned MaxEmptyLinesToKeep = Style.MaxEmptyLinesToKeep + 1; diff --git a/clang/unittests/Format/AlignmentTest.cpp b/clang/unittests/Format/AlignmentTest.cpp index 971ceeefef582..9421a4c933b9e 100644 --- a/clang/unittests/Format/AlignmentTest.cpp +++ b/clang/unittests/Format/AlignmentTest.cpp @@ -3599,6 +3599,16 @@ TEST_F(AlignmentTest, ContinuedAligned) { "\t},\n" "\tvariant);", Style); + + Style.ColumnLimit = 40; + Style.IndentWidth = Style.TabWidth = Style.ContinuationIndentWidth = 8; + + verifyFormat("void f() {\n" + "\tint aaaaaaaaaaaaaaaaaaaa =\n" + "\t\t000000000000000001 ? 2\n" + "\t\t : 3;\n" + "}", + Style); } } // namespace From 0b987506734cded0a99476eeb948a41490dd429c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 10 May 2026 10:31:57 -0400 Subject: [PATCH 178/538] [gn] Make ClangDependencyScanningTests depend on Testing/Support (#196809) Needed after ebb9a79cd370c. 
--- .../gn/secondary/clang/unittests/DependencyScanning/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn index 4a40696ed802e..a4f9a717cfba1 100644 --- a/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/DependencyScanning/BUILD.gn @@ -7,6 +7,7 @@ unittest("ClangDependencyScanningTests") { "//clang/lib/Frontend", "//llvm/lib/Option", "//llvm/lib/Support", + "//llvm/lib/Testing/Support", ] sources = [ "DependencyScanningFilesystemTest.cpp", From 7456636311f0cd7272b3021f07b55c3a769b44e6 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Sun, 10 May 2026 09:32:19 -0500 Subject: [PATCH 179/538] [flang][OpenMP] Consistent names for non-executable directives, NFC (#196803) Change OpenMPGroupprivate -> OmpGroupprivateDirective OpenMPThreadprivate -> OmpThreadprivateDirective OpenMPRequiresConstruct -> OmpRequiresDirective OpenMPUtilityConstruct -> OmpUtilityDirective --- flang/examples/FeatureList/FeatureList.cpp | 4 ++-- flang/include/flang/Parser/dump-parse-tree.h | 8 +++---- flang/include/flang/Parser/parse-tree.h | 24 +++++++++++--------- flang/lib/Lower/OpenMP/OpenMP.cpp | 12 +++++----- flang/lib/Parser/openmp-parsers.cpp | 20 ++++++++-------- flang/lib/Parser/unparse.cpp | 6 ++--- flang/lib/Semantics/canonicalize-omp.cpp | 8 +++---- flang/lib/Semantics/check-omp-structure.cpp | 14 ++++++------ flang/lib/Semantics/check-omp-structure.h | 12 +++++----- flang/lib/Semantics/resolve-directives.cpp | 20 ++++++++-------- flang/lib/Semantics/resolve-names.cpp | 8 ++++--- flang/lib/Semantics/rewrite-parse-tree.cpp | 2 +- flang/lib/Semantics/unparse-with-symbols.cpp | 13 +++++++---- flang/test/Lower/OpenMP/Todo/error.f90 | 2 +- flang/test/Parser/OpenMP/error-unparse.f90 | 6 ++--- flang/test/Parser/OpenMP/groupprivate.f90 | 4 ++-- 
flang/test/Parser/OpenMP/nothing.f90 | 8 +++---- flang/test/Parser/OpenMP/requires.f90 | 10 ++++---- flang/test/Parser/OpenMP/threadprivate.f90 | 2 +- flang/test/Semantics/OpenMP/simd-only.f90 | 2 +- 20 files changed, 97 insertions(+), 88 deletions(-) diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 80683f90654a2..c0f6a1776c2e5 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -556,12 +556,12 @@ struct NodeVisitor { READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) READ_FEATURE(OpenMPAllocatorsConstruct) - READ_FEATURE(OpenMPRequiresConstruct) + READ_FEATURE(OmpRequiresDirective) READ_FEATURE(OpenMPSimpleStandaloneConstruct) READ_FEATURE(OpenMPStandaloneConstruct) READ_FEATURE(OpenMPSectionConstruct) READ_FEATURE(OpenMPSectionsConstruct) - READ_FEATURE(OpenMPThreadprivate) + READ_FEATURE(OmpThreadprivateDirective) READ_FEATURE(OpenStmt) READ_FEATURE(Optional) READ_FEATURE(OptionalStmt) diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index da5a00b68ad38..1825db3a3424b 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -630,6 +630,7 @@ class ParseTreeDumper { NODE(OmpGrainsizeClause, Modifier) NODE(parser, OmpGraphIdClause) NODE(parser, OmpGraphResetClause) + NODE(parser, OmpGroupprivateDirective) NODE(parser, OmpHintClause) NODE(parser, OmpHoldsClause) NODE(parser, OmpIfClause) @@ -711,6 +712,7 @@ class ParseTreeDumper { NODE(parser, OmpRefModifier) NODE_ENUM(OmpRefModifier, Value) NODE(parser, OmpReplayableClause) + NODE(parser, OmpRequiresDirective) NODE(parser, OmpReverseOffloadClause) NODE(parser, OmpScheduleClause) NODE(OmpScheduleClause, Modifier) @@ -731,6 +733,7 @@ class ParseTreeDumper { NODE(OmpTaskReductionClause, Modifier) NODE(parser, OmpThreadLimitClause) NODE(OmpThreadLimitClause, Modifier) + 
NODE(parser, OmpThreadprivateDirective) NODE(parser, OmpThreadsetClause) NODE_ENUM(OmpThreadsetClause, ThreadsetPolicy) NODE(parser, OmpToClause) @@ -754,6 +757,7 @@ class ParseTreeDumper { NODE(parser, OmpUnifiedSharedMemoryClause) NODE(parser, OmpUpdateClause) NODE(parser, OmpUseClause) + NODE(parser, OmpUtilityDirective) NODE(parser, OmpVariableCategory) NODE_ENUM(OmpVariableCategory, Value) NODE(parser, OmpWhenClause) @@ -790,17 +794,13 @@ class ParseTreeDumper { NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) NODE(parser, OpenMPFlushConstruct) - NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPInvalidDirective) NODE(parser, OpenMPLoopConstruct) NODE(parser, OpenMPMisplacedEndDirective) - NODE(parser, OpenMPRequiresConstruct) NODE(parser, OpenMPSectionConstruct) NODE(parser, OpenMPSectionsConstruct) NODE(parser, OpenMPSimpleStandaloneConstruct) NODE(parser, OpenMPStandaloneConstruct) - NODE(parser, OpenMPThreadprivate) - NODE(parser, OpenMPUtilityConstruct) NODE(parser, OpenStmt) NODE(parser, Optional) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 628f2ed80abad..58aeaf2280c53 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5153,8 +5153,8 @@ struct OmpErrorDirective { WRAPPER_CLASS_BOILERPLATE(OmpErrorDirective, OmpDirectiveSpecification); }; -struct OpenMPUtilityConstruct { - UNION_CLASS_BOILERPLATE(OpenMPUtilityConstruct); +struct OmpUtilityDirective { + UNION_CLASS_BOILERPLATE(OmpUtilityDirective); CharBlock source; std::variant u; }; @@ -5271,20 +5271,22 @@ struct OmpDeclareSimdDirective { // // groupprivate-directive -> // GROUPPRIVATE (variable-list-item...) 
// since 6.0 -struct OpenMPGroupprivate { - WRAPPER_CLASS_BOILERPLATE(OpenMPGroupprivate, OmpDirectiveSpecification); +struct OmpGroupprivateDirective { + WRAPPER_CLASS_BOILERPLATE( + OmpGroupprivateDirective, OmpDirectiveSpecification); CharBlock source; }; // 2.4 requires -> REQUIRES requires-clause[ [ [,] requires-clause]...] -struct OpenMPRequiresConstruct { - WRAPPER_CLASS_BOILERPLATE(OpenMPRequiresConstruct, OmpDirectiveSpecification); +struct OmpRequiresDirective { + WRAPPER_CLASS_BOILERPLATE(OmpRequiresDirective, OmpDirectiveSpecification); CharBlock source; }; // 2.15.2 threadprivate -> THREADPRIVATE (variable-name-list) -struct OpenMPThreadprivate { - WRAPPER_CLASS_BOILERPLATE(OpenMPThreadprivate, OmpDirectiveSpecification); +struct OmpThreadprivateDirective { + WRAPPER_CLASS_BOILERPLATE( + OmpThreadprivateDirective, OmpDirectiveSpecification); CharBlock source; }; @@ -5326,8 +5328,8 @@ struct OpenMPDeclarativeConstruct { std::variant u; }; @@ -5473,7 +5475,7 @@ struct OpenMPConstruct { std::variant u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0ffc7bdae85b9..fb5014f3394be 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3926,7 +3926,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter, static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPUtilityConstruct &); + const parser::OmpUtilityDirective &); static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, @@ -4409,14 +4409,14 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPGroupprivate &directive) { + const 
parser::OmpGroupprivateDirective &directive) { TODO(converter.getCurrentLocation(), "GROUPPRIVATE"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPRequiresConstruct &requiresConstruct) { + const parser::OmpRequiresDirective &requiresConstruct) { // Requires directives are gathered and processed in semantics and // then combined in the lowering bridge before triggering codegen // just once. Hence, there is no need to lower each individual @@ -4426,7 +4426,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPThreadprivate &threadprivate) { + const parser::OmpThreadprivateDirective &threadprivate) { // The directive is lowered when instantiating the variable to // support the case of threadprivate variable declared in module. 
} @@ -4692,9 +4692,9 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPUtilityConstruct &) { + const parser::OmpUtilityDirective &) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct"); + TODO(converter.getCurrentLocation(), "OmpUtilityDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index d89bf13e137d8..e9bd97d729b89 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1916,8 +1916,8 @@ TYPE_PARSER(construct( OmpDirectiveSpecificationParser{})) TYPE_PARSER( // - sourced(construct(Parser{})) || - sourced(construct(Parser{}))) + sourced(construct(Parser{})) || + sourced(construct(Parser{}))) TYPE_PARSER(construct( predicated(Parser{}, @@ -2423,20 +2423,20 @@ TYPE_PARSER(sourced(construct( OmpDirectiveSpecificationParser{}))) TYPE_PARSER(sourced( // - construct( + construct( predicated(OmpDirectiveNameParser{}, IsDirective(llvm::omp::Directive::OMPD_groupprivate)) >= OmpDirectiveSpecificationParser{}))) // 2.4 Requires construct -TYPE_PARSER(sourced(construct( +TYPE_PARSER(sourced(construct( predicated(OmpDirectiveNameParser{}, IsDirective(llvm::omp::Directive::OMPD_requires)) >= OmpDirectiveSpecificationParser{}))) // 2.15.2 Threadprivate directive TYPE_PARSER(sourced( // - construct( + construct( predicated(OmpDirectiveNameParser{}, IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= OmpDirectiveSpecificationParser{}))) @@ -2465,13 +2465,13 @@ TYPE_PARSER( construct( sourced(OmpDeclarativeAllocateParser{})) || construct( - Parser{}) || + Parser{}) || construct( - Parser{}) || + Parser{}) || construct( - Parser{}) || + Parser{}) || 
construct( - Parser{}) || + Parser{}) || construct( Parser{})) / endOmpLine)) @@ -2566,7 +2566,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, // OpenMPStandaloneConstruct to resolve !$OMP ORDERED construct(Parser{}), construct(Parser{}), - construct(Parser{}), + construct(Parser{}), construct(Parser{}), construct(Parser{}), construct(Parser{}), diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 8a01f410bd861..355eadaf8e0d4 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2718,7 +2718,7 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } - void Unparse(const OpenMPGroupprivate &x) { + void Unparse(const OmpGroupprivateDirective &x) { BeginOpenMP(); Word("!$OMP "); Walk(x.v); @@ -2742,7 +2742,7 @@ class UnparseVisitor { void Unparse(const OpenMPMisplacedEndDirective &x) { Unparse(static_cast(x)); } - void Unparse(const OpenMPRequiresConstruct &x) { + void Unparse(const OmpRequiresDirective &x) { BeginOpenMP(); Word("!$OMP "); Walk(x.v); @@ -2772,7 +2772,7 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } - void Unparse(const OpenMPThreadprivate &x) { + void Unparse(const OmpThreadprivateDirective &x) { BeginOpenMP(); Word("!$OMP "); Walk(x.v); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index f67ac689add14..bcef673dcdb0b 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -278,7 +278,7 @@ class CanonicalizationOfOmp { // Got OpenMPDeclarativeConstruct. If it's not a utility construct // then stop. 
auto &odc = std::get(sc.u).value(); - if (!std::holds_alternative(odc.u)) { + if (!std::holds_alternative(odc.u)) { return rit; } } @@ -291,7 +291,7 @@ class CanonicalizationOfOmp { using OpenMPDeclarativeConstruct = common::Indirection; auto &oc = std::get(sc.u).value(); - auto &ut = std::get(oc.u); + auto &ut = std::get(oc.u); return parser::ExecutionPartConstruct(parser::ExecutableConstruct( common::Indirection(parser::OpenMPConstruct(std::move(ut))))); @@ -309,7 +309,7 @@ class CanonicalizationOfOmp { std::list::reverse_iterator rlast = [&]() { for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { OpenMPDeclarativeConstruct &dc = *rit; - if (!std::holds_alternative(dc.u)) { + if (!std::holds_alternative(dc.u)) { return rit; } } @@ -318,7 +318,7 @@ class CanonicalizationOfOmp { std::transform(omps.rbegin(), rlast, std::front_inserter(block), [](parser::OpenMPDeclarativeConstruct &dc) { - auto &ut = std::get(dc.u); + auto &ut = std::get(dc.u); return parser::ExecutionPartConstruct(parser::ExecutableConstruct( common::Indirection(parser::OpenMPConstruct(std::move(ut))))); }); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index e3b623e2e9f95..022003a8e1728 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -891,7 +891,7 @@ template struct DirectiveSpellingVisitor { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; } - bool Pre(const parser::OpenMPGroupprivate &x) { + bool Pre(const parser::OmpGroupprivateDirective &x) { checker_(x.v.DirName().source, Directive::OMPD_groupprivate); return false; } @@ -1574,7 +1574,7 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar( } } -void OmpStructureChecker::Enter(const parser::OpenMPGroupprivate &x) { +void OmpStructureChecker::Enter(const parser::OmpGroupprivateDirective &x) { PushContextAndClauseSets( x.v.DirName().source, 
llvm::omp::Directive::OMPD_groupprivate); @@ -1626,16 +1626,16 @@ void OmpStructureChecker::Enter(const parser::OpenMPGroupprivate &x) { } } -void OmpStructureChecker::Leave(const parser::OpenMPGroupprivate &x) { +void OmpStructureChecker::Leave(const parser::OmpGroupprivateDirective &x) { dirContext_.pop_back(); } -void OmpStructureChecker::Enter(const parser::OpenMPThreadprivate &x) { +void OmpStructureChecker::Enter(const parser::OmpThreadprivateDirective &x) { const parser::OmpDirectiveName &dirName{x.v.DirName()}; PushContextAndClauseSets(dirName.source, dirName.v); } -void OmpStructureChecker::Leave(const parser::OpenMPThreadprivate &x) { +void OmpStructureChecker::Leave(const parser::OmpThreadprivateDirective &x) { const parser::OmpDirectiveSpecification &dirSpec{x.v}; for (const parser::OmpArgument &arg : x.v.Arguments().v) { if (auto *object{GetArgumentObject(arg)}) { @@ -1881,7 +1881,7 @@ void OmpStructureChecker::Leave(const parser::OpenMPDepobjConstruct &x) { dirContext_.pop_back(); } -void OmpStructureChecker::Enter(const parser::OpenMPRequiresConstruct &x) { +void OmpStructureChecker::Enter(const parser::OmpRequiresDirective &x) { const auto &dirName{x.v.DirName()}; PushContextAndClauseSets(dirName.source, dirName.v); unsigned version{context_.langOptions().OpenMPVersion}; @@ -1924,7 +1924,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPRequiresConstruct &x) { } } -void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { +void OmpStructureChecker::Leave(const parser::OmpRequiresDirective &) { dirContext_.pop_back(); } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 397381e30cccd..f9852bbf77d65 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -145,12 +145,12 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpNothingDirective &); void Enter(const 
parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); - void Enter(const parser::OpenMPRequiresConstruct &); - void Leave(const parser::OpenMPRequiresConstruct &); - void Enter(const parser::OpenMPGroupprivate &); - void Leave(const parser::OpenMPGroupprivate &); - void Enter(const parser::OpenMPThreadprivate &); - void Leave(const parser::OpenMPThreadprivate &); + void Enter(const parser::OmpRequiresDirective &); + void Leave(const parser::OmpRequiresDirective &); + void Enter(const parser::OmpGroupprivateDirective &); + void Leave(const parser::OmpGroupprivateDirective &); + void Enter(const parser::OmpThreadprivateDirective &); + void Leave(const parser::OmpThreadprivateDirective &); void Enter(const parser::OpenMPSimpleStandaloneConstruct &); void Leave(const parser::OpenMPSimpleStandaloneConstruct &); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 52cd445027d37..7ae867e19f276 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -551,8 +551,8 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { GetContext().withinConstruct = true; } - bool Pre(const parser::OpenMPGroupprivate &); - void Post(const parser::OpenMPGroupprivate &) { PopContext(); } + bool Pre(const parser::OmpGroupprivateDirective &); + void Post(const parser::OmpGroupprivateDirective &) { PopContext(); } bool Pre(const parser::OpenMPStandaloneConstruct &x) { common::visit( @@ -636,7 +636,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { } void Post(const parser::OpenMPFlushConstruct &) { PopContext(); } - bool Pre(const parser::OpenMPRequiresConstruct &x) { + bool Pre(const parser::OmpRequiresDirective &x) { using RequiresClauses = WithOmpDeclarative::RequiresClauses; PushContext(x.source, llvm::omp::Directive::OMPD_requires); @@ -689,7 +689,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { 
AddOmpRequiresToScope(currScope(), &reqs, memOrder); return true; } - void Post(const parser::OpenMPRequiresConstruct &) { PopContext(); } + void Post(const parser::OmpRequiresDirective &) { PopContext(); } bool Pre(const parser::OmpDeclareTargetDirective &); void Post(const parser::OmpDeclareTargetDirective &) { PopContext(); } @@ -700,8 +700,8 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OmpDeclareReductionDirective &); void Post(const parser::OmpDeclareReductionDirective &) { PopContext(); } - bool Pre(const parser::OpenMPThreadprivate &); - void Post(const parser::OpenMPThreadprivate &) { PopContext(); } + bool Pre(const parser::OmpThreadprivateDirective &); + void Post(const parser::OmpThreadprivateDirective &) { PopContext(); } bool Pre(const parser::OmpAllocateDirective &); @@ -717,11 +717,11 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPAllocatorsConstruct &); void Post(const parser::OpenMPAllocatorsConstruct &); - bool Pre(const parser::OpenMPUtilityConstruct &x) { + bool Pre(const parser::OmpUtilityDirective &x) { PushContext(x.source, parser::omp::GetOmpDirectiveName(x).v); return true; } - void Post(const parser::OpenMPUtilityConstruct &) { PopContext(); } + void Post(const parser::OmpUtilityDirective &) { PopContext(); } bool Pre(const parser::OmpDeclareVariantDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_declare_variant); @@ -2142,7 +2142,7 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndex( } } -bool OmpAttributeVisitor::Pre(const parser::OpenMPGroupprivate &x) { +bool OmpAttributeVisitor::Pre(const parser::OmpGroupprivateDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_groupprivate); for (const parser::OmpArgument &arg : x.v.Arguments().v) { if (auto *object{parser::omp::GetArgumentObject(arg)}) { @@ -2211,7 +2211,7 @@ bool OmpAttributeVisitor::Pre(const parser::OmpDeclareReductionDirective &x) { return true; } -bool 
OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { +bool OmpAttributeVisitor::Pre(const parser::OmpThreadprivateDirective &x) { const parser::OmpDirectiveName &dirName{x.v.DirName()}; PushContext(dirName.source, dirName.v); diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f5478c354b40a..f81eaf11618c4 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1730,7 +1730,7 @@ class OmpVisitor : public virtual DeclarationVisitor { static bool NeedsScope(const parser::OmpBlockConstruct &); static bool NeedsScope(const parser::OmpClause &); - bool Pre(const parser::OpenMPRequiresConstruct &x) { + bool Pre(const parser::OmpRequiresDirective &x) { AddOmpSourceRange(x.source); return true; } @@ -1815,11 +1815,13 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OmpEndSectionsDirective &x) { Post(static_cast(x)); } - bool Pre(const parser::OpenMPThreadprivate &) { + bool Pre(const parser::OmpThreadprivateDirective &) { SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPThreadprivate &) { SkipImplicitTyping(false); } + void Post(const parser::OmpThreadprivateDirective &) { + SkipImplicitTyping(false); + } bool Pre(const parser::OmpDeclareTargetDirective &x) { auto addObjectName{[&](const parser::OmpObject &object) { common::visit( diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index dff5a1cd30a1b..f71a1be50172e 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -131,7 +131,7 @@ void RewriteMutator::OpenMPSimdOnly(parser::SpecificationPart &specPart) { if (auto *ompDecl{std::get_if< common::Indirection>( &specConstr->u)}) { - if (std::holds_alternative( + if (std::holds_alternative( ompDecl->value().u) || std::holds_alternative( ompDecl->value().u)) { diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp 
b/flang/lib/Semantics/unparse-with-symbols.cpp index e202c17835044..a3361f9072f69 100644 --- a/flang/lib/Semantics/unparse-with-symbols.cpp +++ b/flang/lib/Semantics/unparse-with-symbols.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Semantics/unparse-with-symbols.h" + #include "mod-file.h" #include "flang/Parser/parse-tree-visitor.h" #include "flang/Parser/parse-tree.h" @@ -49,16 +50,20 @@ class SymbolDumpVisitor { return true; } void Post(const parser::OmpClause &) { currStmt_ = std::nullopt; } - bool Pre(const parser::OpenMPGroupprivate &dir) { + bool Pre(const parser::OmpGroupprivateDirective &dir) { currStmt_ = dir.source; return true; } - void Post(const parser::OpenMPGroupprivate &) { currStmt_ = std::nullopt; } - bool Pre(const parser::OpenMPThreadprivate &dir) { + void Post(const parser::OmpGroupprivateDirective &) { + currStmt_ = std::nullopt; + } + bool Pre(const parser::OmpThreadprivateDirective &dir) { currStmt_ = dir.source; return true; } - void Post(const parser::OpenMPThreadprivate &) { currStmt_ = std::nullopt; } + void Post(const parser::OmpThreadprivateDirective &) { + currStmt_ = std::nullopt; + } bool Pre(const parser::OmpDeclareMapperDirective &x) { currStmt_ = x.source; diff --git a/flang/test/Lower/OpenMP/Todo/error.f90 b/flang/test/Lower/OpenMP/Todo/error.f90 index 6d3bd892da47d..d002f72760b1c 100644 --- a/flang/test/Lower/OpenMP/Todo/error.f90 +++ b/flang/test/Lower/OpenMP/Todo/error.f90 @@ -1,6 +1,6 @@ ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -! CHECK: not yet implemented: OpenMPUtilityConstruct +! 
CHECK: not yet implemented: OmpUtilityDirective program p integer, allocatable :: x !$omp error at(compilation) severity(warning) message("an error") diff --git a/flang/test/Parser/OpenMP/error-unparse.f90 b/flang/test/Parser/OpenMP/error-unparse.f90 index 60ce52b9b88a1..0902d109fd233 100644 --- a/flang/test/Parser/OpenMP/error-unparse.f90 +++ b/flang/test/Parser/OpenMP/error-unparse.f90 @@ -3,7 +3,7 @@ program main character(*), parameter :: message = "This is an error" !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(WARNING) MESSAGE("some message here") - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpUtilityDirective -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> SevLevel = Warning !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"some message here"' @@ -11,14 +11,14 @@ program main !PARSE-TREE: string = 'some message here' !$omp error at(compilation) severity(warning) message("some message here") !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE("This is an error") - !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpUtilityDirective -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> SevLevel = Fatal !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"' !PARSE-TREE: Designator -> DataRef -> Name = 'message' !$omp error at(compilation) severity(fatal) message(message) !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE("This is an error") - 
!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective + !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpUtilityDirective -> OmpErrorDirective !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Execution !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> SevLevel = Fatal !PARSE-TREE: OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"' diff --git a/flang/test/Parser/OpenMP/groupprivate.f90 b/flang/test/Parser/OpenMP/groupprivate.f90 index b069eb751b90d..ca4d974f6895c 100644 --- a/flang/test/Parser/OpenMP/groupprivate.f90 +++ b/flang/test/Parser/OpenMP/groupprivate.f90 @@ -17,13 +17,13 @@ module m !UNPARSE: !$OMP GROUPPRIVATE(z) !UNPARSE: END MODULE -!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPGroupprivate -> OmpDirectiveSpecification +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpGroupprivateDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = groupprivate !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'x' !PARSE-TREE: | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'y' !PARSE-TREE: | OmpClauseList -> OmpClause -> DeviceType -> OmpDeviceTypeClause -> DeviceTypeDescription = Nohost !PARSE-TREE: | Flags = {} -!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPGroupprivate -> OmpDirectiveSpecification +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpGroupprivateDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = groupprivate !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'z' 
!PARSE-TREE: | OmpClauseList -> diff --git a/flang/test/Parser/OpenMP/nothing.f90 b/flang/test/Parser/OpenMP/nothing.f90 index 22558c493c444..172e303c5bcb9 100644 --- a/flang/test/Parser/OpenMP/nothing.f90 +++ b/flang/test/Parser/OpenMP/nothing.f90 @@ -10,7 +10,7 @@ subroutine f00 !UNPARSE: END SUBROUTINE !PARSE-TREE: ExecutionPart -> Block -!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpUtilityDirective -> OmpNothingDirective subroutine f01 block @@ -39,7 +39,7 @@ subroutine f01 !PARSE-TREE: | | EntityDecl !PARSE-TREE: | | | Name = 'x' !PARSE-TREE: Block -!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpUtilityDirective -> OmpNothingDirective !PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4' !PARSE-TREE: | | Variable = 'x' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'x' @@ -68,7 +68,7 @@ subroutine f02 !PARSE-TREE: | | EntityDecl !PARSE-TREE: | | | Name = 'x' !PARSE-TREE: ExecutionPart -> Block -!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpUtilityDirective -> OmpNothingDirective subroutine f03 block @@ -92,7 +92,7 @@ subroutine f03 !PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> BlockConstruct !PARSE-TREE: | | BlockStmt -> !PARSE-TREE: | | BlockSpecificationPart -> SpecificationPart -!PARSE-TREE: | | | OpenMPDeclarativeConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective +!PARSE-TREE: | | | OpenMPDeclarativeConstruct -> OmpUtilityDirective -> OmpNothingDirective !PARSE-TREE: | | | 
ImportStmt !PARSE-TREE: | | | ImplicitPart -> !PARSE-TREE: | | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt diff --git a/flang/test/Parser/OpenMP/requires.f90 b/flang/test/Parser/OpenMP/requires.f90 index 49d78737f415f..81af7f447ae52 100644 --- a/flang/test/Parser/OpenMP/requires.f90 +++ b/flang/test/Parser/OpenMP/requires.f90 @@ -5,7 +5,7 @@ !UNPARSE: !$OMP REQUIRES ATOMIC_DEFAULT_MEM_ORDER(SEQ_CST) -!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPRequiresConstruct -> OmpDirectiveSpecification +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpRequiresDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = requires !PARSE-TREE: | OmpClauseList -> OmpClause -> AtomicDefaultMemOrder -> OmpAtomicDefaultMemOrderClause -> OmpMemoryOrderType = Seq_Cst !PARSE-TREE: | Flags = {} @@ -14,7 +14,7 @@ !UNPARSE: !$OMP REQUIRES UNIFIED_SHARED_MEMORY UNIFIED_ADDRESS -!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPRequiresConstruct -> OmpDirectiveSpecification +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpRequiresDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = requires !PARSE-TREE: | OmpClauseList -> OmpClause -> UnifiedSharedMemory !PARSE-TREE: | OmpClause -> UnifiedAddress @@ -24,7 +24,7 @@ !UNPARSE: !$OMP REQUIRES DYNAMIC_ALLOCATORS REVERSE_OFFLOAD -!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPRequiresConstruct -> OmpDirectiveSpecification +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpRequiresDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = requires !PARSE-TREE: | OmpClauseList -> OmpClause -> DynamicAllocators !PARSE-TREE: | OmpClause -> ReverseOffload @@ -34,7 +34,7 @@ !UNPARSE: !$OMP REQUIRES SELF_MAPS(.true._4) UNIFIED_ADDRESS(.false._4) -!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPRequiresConstruct -> OmpDirectiveSpecification +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpRequiresDirective -> 
OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = requires !PARSE-TREE: | OmpClauseList -> OmpClause -> SelfMaps -> OmpSelfMapsClause -> Scalar -> Logical -> Constant -> Expr = '.true._4' !PARSE-TREE: | | LiteralConstant -> LogicalLiteralConstant @@ -48,7 +48,7 @@ !UNPARSE: !$OMP REQUIRES DEVICE_SAFESYNC -!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPRequiresConstruct -> OmpDirectiveSpecification +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpRequiresDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = requires !PARSE-TREE: | OmpClauseList -> OmpClause -> DeviceSafesync !PARSE-TREE: | Flags = {} diff --git a/flang/test/Parser/OpenMP/threadprivate.f90 b/flang/test/Parser/OpenMP/threadprivate.f90 index b7dfd952bb4a7..e03bf6f7f94b9 100644 --- a/flang/test/Parser/OpenMP/threadprivate.f90 +++ b/flang/test/Parser/OpenMP/threadprivate.f90 @@ -17,7 +17,7 @@ module m !UNPARSE: !$OMP THREADPRIVATE(/blk/, b) !UNPARSE: END MODULE -!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPThreadprivate -> OmpDirectiveSpecification +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpThreadprivateDirective -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = threadprivate !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Name = 'blk' !PARSE-TREE: | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'b' diff --git a/flang/test/Semantics/OpenMP/simd-only.f90 b/flang/test/Semantics/OpenMP/simd-only.f90 index 8205b98ac6663..91c3eaf76d43d 100644 --- a/flang/test/Semantics/OpenMP/simd-only.f90 +++ b/flang/test/Semantics/OpenMP/simd-only.f90 @@ -253,7 +253,7 @@ module test_threadprivate_mod ! CHECK: Name = 'x' ! CHECK: Name = 'y' common /vars/ x, y - ! 
CHECK-NOT: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPThreadprivate + ! CHECK-NOT: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpThreadprivateDirective !$omp threadprivate(/vars/) end module From 6983aa76bb68bdfac6f212f2fc275f166e970367 Mon Sep 17 00:00:00 2001 From: Guy David Date: Sun, 10 May 2026 18:04:42 +0300 Subject: [PATCH 180/538] [AArch64] Improve post-inc stores of SIMD/FP values (#151372) Add patterns to match post-increment truncating stores from lane 0 of wide integer vectors (v4i32/v2i64) to narrower types (i8/i16/i32). This avoids transferring the value through a GPR when storing. Also remove the pre-legaliztion early-exit in `combineStoreValueFPToInt` as it prevented the optimization from applying in some cases. --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 + .../CodeGen/AArch64/store-float-conversion.ll | 260 ++++++++++++++++++ llvm/test/CodeGen/AArch64/tbl-loops.ll | 3 +- 4 files changed, 268 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b53605e917e2b..8dd94e1418400 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26041,9 +26041,6 @@ static SDValue combineStoreValueFPToInt(StoreSDNode *ST, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - // Limit to post-legalization in order to avoid peeling truncating stores. - if (DCI.isBeforeLegalize()) - return SDValue(); if (!Subtarget->isNeonAvailable()) return SDValue(); // Source operand is already a vector. 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c20efffb1944..45ac5bfc16a26 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -10116,6 +10116,13 @@ def : St1Lane128SubvecPat; def : St1Lane128SubvecPat; def : St1Lane128SubvecPat; +// Truncating post-inc stores from lane 0 of v4i32/v2i64. +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; + let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll index c46801fc16714..bccbf489601aa 100644 --- a/llvm/test/CodeGen/AArch64/store-float-conversion.ll +++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll @@ -27,6 +27,34 @@ entry: ret void } +define ptr @f32_to_s8_inc(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_s8_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: st1 { v0.b }[0], [x0], #1 +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + %trunc = trunc i32 %conv to i8 + %next = getelementptr i8, ptr %dst, i64 1 + store i8 %trunc, ptr %dst + ret ptr %next +} + +define ptr @f32_to_u8_inc(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_u8_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: st1 { v0.b }[0], [x0], #1 +; CHECK-NEXT: ret +entry: + %conv = fptoui float %f to i32 + %trunc = trunc i32 %conv to i8 + %next = getelementptr i8, ptr %dst, i64 1 + store i8 %trunc, ptr %dst + ret ptr %next +} + define void @f32_to_u16(float %f, ptr %dst) { ; CHECK-LABEL: f32_to_u16: ; CHECK: // %bb.0: // %entry @@ -53,6 +81,34 @@ entry: ret void } +define ptr @f32_to_s16_inc(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_s16_inc: +; 
CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: st1 { v0.h }[0], [x0], #2 +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + %trunc = trunc i32 %conv to i16 + %next = getelementptr i16, ptr %dst, i64 1 + store i16 %trunc, ptr %dst + ret ptr %next +} + +define ptr @f32_to_u16_inc(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_u16_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: st1 { v0.h }[0], [x0], #2 +; CHECK-NEXT: ret +entry: + %conv = fptoui float %f to i32 + %trunc = trunc i32 %conv to i16 + %next = getelementptr i16, ptr %dst, i64 1 + store i16 %trunc, ptr %dst + ret ptr %next +} + define void @f32_to_u32(float %f, ptr %dst) { ; CHECK-LABEL: f32_to_u32: ; CHECK: // %bb.0: // %entry @@ -77,6 +133,32 @@ entry: ret void } +define ptr @f32_to_s32_inc(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_s32_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: st1 { v0.s }[0], [x0], #4 +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + %next = getelementptr i32, ptr %dst, i64 1 + store i32 %conv, ptr %dst + ret ptr %next +} + +define ptr @f32_to_u32_inc(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_u32_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: st1 { v0.s }[0], [x0], #4 +; CHECK-NEXT: ret +entry: + %conv = fptoui float %f to i32 + %next = getelementptr i32, ptr %dst, i64 1 + store i32 %conv, ptr %dst + ret ptr %next +} + define void @f32_to_s64(float %f, ptr %dst) { ; CHECK-LABEL: f32_to_s64: ; CHECK: // %bb.0: // %entry @@ -115,6 +197,170 @@ entry: ret void } +define ptr @f64_to_s64_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s64_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: st1 { v0.d }[0], [x0], #8 +; CHECK-NEXT: ret +entry: + %conv = fptosi double %d to i64 + %next = getelementptr i64, ptr %dst, i64 1 + store i64 %conv, ptr %dst + ret ptr %next +} + +define ptr @f64_to_u64_inc(double %d, ptr %dst) { 
+; CHECK-LABEL: f64_to_u64_inc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: st1 { v0.d }[0], [x0], #8 +; CHECK-NEXT: ret +entry: + %conv = fptoui double %d to i64 + %next = getelementptr i64, ptr %dst, i64 1 + store i64 %conv, ptr %dst + ret ptr %next +} + +define void @f64_to_u8(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + %conv = fptoui double %d to i64 + %trunc = trunc i64 %conv to i8 + store i8 %trunc, ptr %dst + ret void +} + +define void @f64_to_s8(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + %conv = fptosi double %d to i64 + %trunc = trunc i64 %conv to i8 + store i8 %trunc, ptr %dst + ret void +} + +define ptr @f64_to_s8_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s8_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: st1 { v0.b }[0], [x0], #1 +; CHECK-NEXT: ret + %conv = fptosi double %d to i64 + %trunc = trunc i64 %conv to i8 + store i8 %trunc, ptr %dst + %next = getelementptr i8, ptr %dst, i64 1 + ret ptr %next +} + +define ptr @f64_to_u8_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_u8_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: st1 { v0.b }[0], [x0], #1 +; CHECK-NEXT: ret + %conv = fptoui double %d to i64 + %trunc = trunc i64 %conv to i8 + store i8 %trunc, ptr %dst + %next = getelementptr i8, ptr %dst, i64 1 + ret ptr %next +} + +define void @f64_to_u16(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret + %conv = fptoui double %d to i64 + %trunc = trunc i64 %conv to i16 + store i16 %trunc, ptr %dst + ret void +} + +define void @f64_to_s16(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: str h0, [x0] +; 
CHECK-NEXT: ret + %conv = fptosi double %d to i64 + %trunc = trunc i64 %conv to i16 + store i16 %trunc, ptr %dst + ret void +} + +define ptr @f64_to_s16_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s16_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: st1 { v0.h }[0], [x0], #2 +; CHECK-NEXT: ret + %conv = fptosi double %d to i64 + %trunc = trunc i64 %conv to i16 + %next = getelementptr i16, ptr %dst, i64 1 + store i16 %trunc, ptr %dst + ret ptr %next +} + +define ptr @f64_to_u16_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_u16_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: st1 { v0.h }[0], [x0], #2 +; CHECK-NEXT: ret + %conv = fptoui double %d to i64 + %trunc = trunc i64 %conv to i16 + %next = getelementptr i16, ptr %dst, i64 1 + store i16 %trunc, ptr %dst + ret ptr %next +} + +define void @f64_to_s32(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret + %conv = fptosi double %d to i64 + %trunc = trunc i64 %conv to i32 + store i32 %trunc, ptr %dst + ret void +} + +define ptr @f64_to_s32_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s32_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: st1 { v0.s }[0], [x0], #4 +; CHECK-NEXT: ret + %conv = fptosi double %d to i64 + %trunc = trunc i64 %conv to i32 + %next = getelementptr i32, ptr %dst, i64 1 + store i32 %trunc, ptr %dst + ret ptr %next +} + +define ptr @f64_to_u32_inc(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_u32_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: st1 { v0.s }[0], [x0], #4 +; CHECK-NEXT: ret + %conv = fptoui double %d to i64 + %trunc = trunc i64 %conv to i32 + %next = getelementptr i32, ptr %dst, i64 1 + store i32 %trunc, ptr %dst + ret ptr %next +} + define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) { ; CHECK-LABEL: f32_to_i32_multiple_uses: ; CHECK: // %bb.0: // %entry @@ -129,3 +375,17 @@ entry: store i8 
%trunc, ptr %dst ret i32 %conv } + +; Negative test: extracting from lane 1 must go through GPR. +define ptr @v4i32_lane1_to_i8_inc(<4 x i32> %v, ptr %dst) { +; CHECK-LABEL: v4i32_lane1_to_i8_inc: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: strb w8, [x0], #1 +; CHECK-NEXT: ret + %elt = extractelement <4 x i32> %v, i32 1 + %trunc = trunc i32 %elt to i8 + store i8 %trunc, ptr %dst + %next = getelementptr i8, ptr %dst, i64 1 + ret ptr %next +} diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 0f629971b5844..84af8596a0e99 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -64,8 +64,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: fcsel s2, s0, s3, mi ; CHECK-NEXT: subs w10, w10, #1 ; CHECK-NEXT: fcvtzs s2, s2 -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: strb w11, [x9], #1 +; CHECK-NEXT: st1 { v2.b }[0], [x9], #1 ; CHECK-NEXT: b.ne .LBB0_7 ; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup ; CHECK-NEXT: ret From 2162c169241387e16c0baef4cfc7e00364997bef Mon Sep 17 00:00:00 2001 From: raindelight <62292184+raindelight@users.noreply.github.com> Date: Sun, 10 May 2026 17:04:51 +0200 Subject: [PATCH 181/538] [clang-tidy] Rename hicpp-multiway-paths-covered to bugprone-unhandled-code-paths (#191625) Part of the work in https://github.com/llvm/llvm-project/issues/183462. Closes https://github.com/llvm/llvm-project/issues/183464. Splitting the check into two more focused checks was considered during discussion, but since clang-tidy does not support one-to-many aliases, a single name covering both behaviors was chosen instead that is more clear than `multiway-paths-covered`. 
--------- Co-authored-by: Zeyi Xu --- .../bugprone/BugproneTidyModule.cpp | 3 + .../clang-tidy/bugprone/CMakeLists.txt | 1 + .../UnhandledCodePathsCheck.cpp} | 19 ++-- .../UnhandledCodePathsCheck.h} | 16 ++-- .../clang-tidy/hicpp/CMakeLists.txt | 1 - .../clang-tidy/hicpp/HICPPTidyModule.cpp | 4 +- clang-tools-extra/docs/ReleaseNotes.rst | 6 ++ .../checks/bugprone/unhandled-code-paths.rst | 95 ++++++++++++++++++ .../checks/hicpp/multiway-paths-covered.rst | 96 +------------------ .../docs/clang-tidy/checks/list.rst | 3 +- .../unhandled-code-paths-else.cpp} | 4 +- .../unhandled-code-paths.cpp} | 2 +- 12 files changed, 132 insertions(+), 118 deletions(-) rename clang-tools-extra/clang-tidy/{hicpp/MultiwayPathsCoveredCheck.cpp => bugprone/UnhandledCodePathsCheck.cpp} (92%) rename clang-tools-extra/clang-tidy/{hicpp/MultiwayPathsCoveredCheck.h => bugprone/UnhandledCodePathsCheck.h} (73%) create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-code-paths.rst rename clang-tools-extra/test/clang-tidy/checkers/{hicpp/multiway-paths-covered-else.cpp => bugprone/unhandled-code-paths-else.cpp} (92%) rename clang-tools-extra/test/clang-tidy/checkers/{hicpp/multiway-paths-covered.cpp => bugprone/unhandled-code-paths.cpp} (99%) diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 146947dd6747d..e6a1e162d8f8e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -104,6 +104,7 @@ #include "UncheckedStringToNumberConversionCheck.h" #include "UndefinedMemoryManipulationCheck.h" #include "UndelegatedConstructorCheck.h" +#include "UnhandledCodePathsCheck.h" #include "UnhandledExceptionAtNewCheck.h" #include "UnhandledSelfAssignmentCheck.h" #include "UnintendedCharOstreamOutputCheck.h" @@ -307,6 +308,8 @@ class BugproneModule : public ClangTidyModule { 
"bugprone-undefined-memory-manipulation"); CheckFactories.registerCheck( "bugprone-undelegated-constructor"); + CheckFactories.registerCheck( + "bugprone-unhandled-code-paths"); CheckFactories.registerCheck( "bugprone-unhandled-self-assignment"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 3e55eae1bdc92..1841bf518997b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -107,6 +107,7 @@ add_clang_library(clangTidyBugproneModule STATIC UncheckedStringToNumberConversionCheck.cpp UndefinedMemoryManipulationCheck.cpp UndelegatedConstructorCheck.cpp + UnhandledCodePathsCheck.cpp UnhandledExceptionAtNewCheck.cpp UnhandledSelfAssignmentCheck.cpp UniquePtrArrayMismatchCheck.cpp diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledCodePathsCheck.cpp similarity index 92% rename from clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/UnhandledCodePathsCheck.cpp index 18d5aa44a6a95..1a7d907bac6ec 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledCodePathsCheck.cpp @@ -6,21 +6,20 @@ // //===----------------------------------------------------------------------===// -#include "MultiwayPathsCoveredCheck.h" +#include "UnhandledCodePathsCheck.h" #include "clang/AST/ASTContext.h" #include using namespace clang::ast_matchers; -namespace clang::tidy::hicpp { +namespace clang::tidy::bugprone { -void MultiwayPathsCoveredCheck::storeOptions( - ClangTidyOptions::OptionMap &Opts) { +void UnhandledCodePathsCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "WarnOnMissingElse", WarnOnMissingElse); } -void MultiwayPathsCoveredCheck::registerMatchers(MatchFinder *Finder) 
{ +void UnhandledCodePathsCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( switchStmt( hasCondition(expr( @@ -87,7 +86,7 @@ static std::size_t getNumberOfPossibleValues(QualType T, return 1; } -void MultiwayPathsCoveredCheck::check(const MatchFinder::MatchResult &Result) { +void UnhandledCodePathsCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *ElseIfWithoutElse = Result.Nodes.getNodeAs("else-if")) { diag(ElseIfWithoutElse->getBeginLoc(), @@ -123,8 +122,8 @@ void MultiwayPathsCoveredCheck::check(const MatchFinder::MatchResult &Result) { llvm_unreachable("matched a case, that was not explicitly handled"); } -void MultiwayPathsCoveredCheck::handleSwitchWithDefault( - const SwitchStmt *Switch, std::size_t CaseCount) { +void UnhandledCodePathsCheck::handleSwitchWithDefault(const SwitchStmt *Switch, + std::size_t CaseCount) { assert(CaseCount > 0 && "Switch statement with supposedly one default " "branch did not contain any case labels"); if (CaseCount == 1 || CaseCount == 2) @@ -134,7 +133,7 @@ void MultiwayPathsCoveredCheck::handleSwitchWithDefault( : "switch could be better written as an if/else statement"); } -void MultiwayPathsCoveredCheck::handleSwitchWithoutDefault( +void UnhandledCodePathsCheck::handleSwitchWithoutDefault( const SwitchStmt *Switch, std::size_t CaseCount, const MatchFinder::MatchResult &Result) { // The matcher only works because some nodes are explicitly matched and @@ -172,4 +171,4 @@ void MultiwayPathsCoveredCheck::handleSwitchWithoutDefault( CaseCount == 1 ? 
"switch with only one case; use an if statement" : "potential uncovered code path; add a default label"); } -} // namespace clang::tidy::hicpp +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnhandledCodePathsCheck.h similarity index 73% rename from clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h rename to clang-tools-extra/clang-tidy/bugprone/UnhandledCodePathsCheck.h index e22e31ac7b05a..051f1fd66dd63 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledCodePathsCheck.h @@ -6,22 +6,22 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNHANDLEDCODEPATHSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNHANDLEDCODEPATHSCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::hicpp { +namespace clang::tidy::bugprone { /// Find occasions where not all codepaths are explicitly covered in code. /// This includes 'switch' without a 'default'-branch and 'if'-'else if'-chains /// without a final 'else'-branch. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/hicpp/multiway-paths-covered.html -class MultiwayPathsCoveredCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unhandled-code-paths.html +class UnhandledCodePathsCheck : public ClangTidyCheck { public: - MultiwayPathsCoveredCheck(StringRef Name, ClangTidyContext *Context) + UnhandledCodePathsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), WarnOnMissingElse(Options.get("WarnOnMissingElse", false)) {} void storeOptions(ClangTidyOptions::OptionMap &Opts) override; @@ -39,6 +39,6 @@ class MultiwayPathsCoveredCheck : public ClangTidyCheck { const bool WarnOnMissingElse; }; -} // namespace clang::tidy::hicpp +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNHANDLEDCODEPATHSCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/CMakeLists.txt b/clang-tools-extra/clang-tidy/hicpp/CMakeLists.txt index b9b7e716d00a4..613b2e5962668 100644 --- a/clang-tools-extra/clang-tidy/hicpp/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/hicpp/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyHICPPModule STATIC HICPPTidyModule.cpp - MultiwayPathsCoveredCheck.cpp LINK_LIBS clangTidy diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp index e628c81db6955..3679b70ab2117 100644 --- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp @@ -10,6 +10,7 @@ #include "../ClangTidyModule.h" #include "../bugprone/SignedBitwiseCheck.h" #include "../bugprone/UndelegatedConstructorCheck.h" +#include "../bugprone/UnhandledCodePathsCheck.h" #include "../bugprone/UseAfterMoveCheck.h" #include "../cppcoreguidelines/NoMallocCheck.h" #include 
"../cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h" @@ -30,7 +31,6 @@ #include "../portability/NoAssemblerCheck.h" #include "../readability/NamedParameterCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "MultiwayPathsCoveredCheck.h" namespace clang::tidy { namespace hicpp { @@ -39,7 +39,7 @@ namespace { class HICPPModule : public ClangTidyModule { public: void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { - CheckFactories.registerCheck( + CheckFactories.registerCheck( "hicpp-multiway-paths-covered"); CheckFactories.registerCheck( "hicpp-signed-bitwise"); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 0d3adedeea0f8..365f3b40e8ca8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -247,6 +247,12 @@ New check aliases `. The `google-explicit-constructor` name is kept as an alias. +- Renamed :doc:`hicpp-multiway-paths-covered + ` + to :doc:`bugprone-unhandled-code-paths + `. + The `hicpp-multiway-paths-covered` name is kept as an alias. + - Renamed :doc:`hicpp-no-assembler ` to :doc:`portability-no-assembler `. The `hicpp-no-assembler` diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-code-paths.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-code-paths.rst new file mode 100644 index 0000000000000..0e3082afe38f7 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-code-paths.rst @@ -0,0 +1,95 @@ +.. title:: clang-tidy - bugprone-unhandled-code-paths + +bugprone-unhandled-code-paths +============================= + +This check discovers situations where code paths are not fully-covered. +It furthermore suggests using ``if`` instead of ``switch`` if the code +will be more clear. + +``if-else if`` chains that miss a final ``else`` branch might lead to +unexpected program execution and be the result of a logical error. 
+If the missing ``else`` branch is intended you can leave it empty with +a clarifying comment. +This warning can be noisy on some code bases, so it is disabled by default. + +.. code-block:: c++ + + void f1() { + int i = determineTheNumber(); + + if(i > 0) { + // Some Calculation + } else if (i < 0) { + // Precondition violated or something else. + } + // ... + } + +Similar arguments hold for ``switch`` statements which do not cover all +possible code paths. + +.. code-block:: c++ + + // The missing default branch might be a logical error. It can be kept empty + // if there is nothing to do, making it explicit. + void f2(int i) { + switch (i) { + case 0: // something + break; + case 1: // something else + break; + } + // All other numbers? + } + + // Violates this rule as well, but already emits a compiler warning (-Wswitch). + enum Color { Red, Green, Blue, Yellow }; + void f3(enum Color c) { + switch (c) { + case Red: // We can't drive for now. + break; + case Green: // We are allowed to drive. + break; + } + // Other cases missing + } + + +Every ``switch`` statement should have at least two ``case`` labels +other than a `default` label. +Otherwise, the ``switch`` could be better expressed with an ``if`` statement. +Degenerated ``switch`` statements without any labels are caught as well. + +.. code-block:: c++ + + // Degenerated switch that could be better written as `if` + int i = 42; + switch(i) { + case 1: // do something here + default: // do something else here + } + + // Should rather be the following: + if (i == 1) { + // do something here + } + else { + // do something here + } + + +.. code-block:: c++ + + // A completely degenerated switch will be diagnosed. + int i = 42; + switch(i) {} + + +Options +------- + +.. option:: WarnOnMissingElse + + Boolean flag that activates a warning for missing ``else`` branches. + Default is `false`. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst index 13f174778b6de..e6cef1d302059 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst @@ -3,96 +3,6 @@ hicpp-multiway-paths-covered ============================ -This check discovers situations where code paths are not fully-covered. -It furthermore suggests using ``if`` instead of ``switch`` if the code -will be more clear. -The `rule 6.1.2 `_ -and `rule 6.1.4 `_ -of the High Integrity C++ Coding Standard are enforced. - -``if-else if`` chains that miss a final ``else`` branch might lead to -unexpected program execution and be the result of a logical error. -If the missing ``else`` branch is intended you can leave it empty with -a clarifying comment. -This warning can be noisy on some code bases, so it is disabled by default. - -.. code-block:: c++ - - void f1() { - int i = determineTheNumber(); - - if(i > 0) { - // Some Calculation - } else if (i < 0) { - // Precondition violated or something else. - } - // ... - } - -Similar arguments hold for ``switch`` statements which do not cover all -possible code paths. - -.. code-block:: c++ - - // The missing default branch might be a logical error. It can be kept empty - // if there is nothing to do, making it explicit. - void f2(int i) { - switch (i) { - case 0: // something - break; - case 1: // something else - break; - } - // All other numbers? - } - - // Violates this rule as well, but already emits a compiler warning (-Wswitch). - enum Color { Red, Green, Blue, Yellow }; - void f3(enum Color c) { - switch (c) { - case Red: // We can't drive for now. - break; - case Green: // We are allowed to drive. 
- break; - } - // Other cases missing - } - - -The `rule 6.1.4 `_ -requires every ``switch`` statement to have at least two ``case`` labels other than a `default` label. -Otherwise, the ``switch`` could be better expressed with an ``if`` statement. -Degenerated ``switch`` statements without any labels are caught as well. - -.. code-block:: c++ - - // Degenerated switch that could be better written as `if` - int i = 42; - switch(i) { - case 1: // do something here - default: // do something else here - } - - // Should rather be the following: - if (i == 1) { - // do something here - } - else { - // do something here - } - - -.. code-block:: c++ - - // A completely degenerated switch will be diagnosed. - int i = 42; - switch(i) {} - - -Options -------- - -.. option:: WarnOnMissingElse - - Boolean flag that activates a warning for missing ``else`` branches. - Default is `false`. +The `hicpp-multiway-paths-covered` check is an alias, please see +`bugprone-unhandled-code-paths <../bugprone/unhandled-code-paths.html>`_ +for more information. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 6d91b0297ee5c..f193c0920ec1b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -174,6 +174,7 @@ Clang-Tidy Checks :doc:`bugprone-unchecked-string-to-number-conversion `, :doc:`bugprone-undefined-memory-manipulation `, :doc:`bugprone-undelegated-constructor `, + :doc:`bugprone-unhandled-code-paths `, :doc:`bugprone-unhandled-exception-at-new `, :doc:`bugprone-unhandled-self-assignment `, :doc:`bugprone-unintended-char-ostream-output `, "Yes" @@ -241,7 +242,6 @@ Clang-Tidy Checks :doc:`google-runtime-int `, :doc:`google-runtime-operator `, :doc:`google-upgrade-googletest-case `, "Yes" - :doc:`hicpp-multiway-paths-covered `, :doc:`linuxkernel-must-check-errs `, :doc:`llvm-header-guard `, :doc:`llvm-include-order `, "Yes" @@ -606,6 +606,7 @@ Check aliases :doc:`hicpp-invalid-access-moved `, :doc:`bugprone-use-after-move `, :doc:`hicpp-member-init `, :doc:`cppcoreguidelines-pro-type-member-init `, "Yes" :doc:`hicpp-move-const-arg `, :doc:`performance-move-const-arg `, "Yes" + :doc:`hicpp-multiway-paths-covered `, :doc:`bugprone-unhandled-code-paths `, :doc:`hicpp-named-parameter `, :doc:`readability-named-parameter `, "Yes" :doc:`hicpp-new-delete-operators `, :doc:`misc-new-delete-overloads `, :doc:`hicpp-no-array-decay `, :doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay `, diff --git a/clang-tools-extra/test/clang-tidy/checkers/hicpp/multiway-paths-covered-else.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-code-paths-else.cpp similarity index 92% rename from clang-tools-extra/test/clang-tidy/checkers/hicpp/multiway-paths-covered-else.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-code-paths-else.cpp index 5041b4e97d0c6..420bae681ca65 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/hicpp/multiway-paths-covered-else.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-code-paths-else.cpp @@ -1,6 +1,6 @@ -// RUN: %check_clang_tidy %s hicpp-multiway-paths-covered %t \ +// RUN: %check_clang_tidy %s bugprone-unhandled-code-paths %t \ // RUN: -config='{CheckOptions: \ -// RUN: {hicpp-multiway-paths-covered.WarnOnMissingElse: true}}'\ +// RUN: {bugprone-unhandled-code-paths.WarnOnMissingElse: true}}'\ // RUN: -- enum OS { Mac, diff --git a/clang-tools-extra/test/clang-tidy/checkers/hicpp/multiway-paths-covered.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-code-paths.cpp similarity index 99% rename from clang-tools-extra/test/clang-tidy/checkers/hicpp/multiway-paths-covered.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-code-paths.cpp index 15a3407dc1c27..0c3b459ef06fd 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/hicpp/multiway-paths-covered.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-code-paths.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s hicpp-multiway-paths-covered %t +// RUN: %check_clang_tidy %s bugprone-unhandled-code-paths %t enum OS { Mac, Windows, From c74ecc9e9283560ea5f5727dbc3a9a6fa18d642c Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sun, 10 May 2026 17:07:22 +0200 Subject: [PATCH 182/538] [IRBuilder] Split CreateAssumption to one with bundle and one with condition [NFC] (#196795) as it is not possible to combine bundles and conditions from https://github.com/llvm/llvm-project/pull/160460 reflect that in CreateAssumption --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- llvm/include/llvm/IR/IRBuilder.h | 13 +++++---- llvm/lib/IR/IRBuilder.cpp | 29 +++++++++---------- .../InstCombine/InstCombineCalls.cpp | 6 ++-- 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 67de2a34f44ea..1318641159212 100644 
--- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3650,7 +3650,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Value *Values[] = {Value0, Value1}; OperandBundleDefT OBD("separate_storage", Values); - Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD}); + Builder.CreateAssumption({OBD}); return RValue::get(nullptr); } case Builtin::BI__builtin_allow_runtime_check: { diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index ca085bb4aaa11..cb0fdeaecd1cc 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -896,11 +896,11 @@ class IRBuilderBase { /// Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. - /// - /// The optional argument \p OpBundles specifies operand bundles that are - /// added to the call instruction. - LLVM_ABI CallInst * - CreateAssumption(Value *Cond, ArrayRef OpBundles = {}); + LLVM_ABI CallInst *CreateAssumption(Value *Cond); + + /// Create an assume intrinsic call that allows the optimizer to + /// assume that the provided operand bundles hold. + LLVM_ABI CallInst *CreateAssumption(ArrayRef OpBundles); /// Create a llvm.experimental.noalias.scope.decl intrinsic call. LLVM_ABI Instruction *CreateNoAliasScopeDeclaration(Value *Scope); @@ -1025,7 +1025,8 @@ class IRBuilderBase { ArrayRef OverloadTypes, ArrayRef Args, FMFSource FMFSource = {}, - const Twine &Name = ""); + const Twine &Name = "", + ArrayRef OpBundles = {}); /// Create a call to intrinsic \p ID with \p RetTy and \p Args. 
If /// \p FMFSource is provided, copy fast-math-flags from that instruction to diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 706a977a5b6d5..93a2eba532ac2 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -545,16 +545,17 @@ CallInst *IRBuilderBase::CreateThreadLocalAddress(Value *Ptr) { return CI; } -CallInst * -IRBuilderBase::CreateAssumption(Value *Cond, - ArrayRef OpBundles) { +CallInst *IRBuilderBase::CreateAssumption(Value *Cond) { assert(Cond->getType() == getInt1Ty() && "an assumption condition must be of type i1"); + return CreateIntrinsic(Intrinsic::assume, /*OverloadTypes=*/{}, {Cond}); +} - Value *Ops[] = { Cond }; - Module *M = BB->getParent()->getParent(); - Function *FnAssume = Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); - return CreateCall(FnAssume, Ops, OpBundles); +CallInst * +IRBuilderBase::CreateAssumption(ArrayRef OpBundles) { + Value *Args[] = {ConstantInt::getTrue(getContext())}; + return CreateIntrinsic(Intrinsic::assume, /*OverloadTypes=*/{}, Args, + /*FMFSource=*/nullptr, /*Name=*/"", OpBundles); } Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) { @@ -930,11 +931,11 @@ Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID, ArrayRef OverloadTypes, ArrayRef Args, - FMFSource FMFSource, - const Twine &Name) { + FMFSource FMFSource, const Twine &Name, + ArrayRef OpBundles) { Module *M = BB->getModule(); Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, OverloadTypes); - return createCallHelper(Fn, Args, Name, FMFSource); + return createCallHelper(Fn, Args, Name, FMFSource, OpBundles); } CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID, @@ -1358,7 +1359,7 @@ CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL, if (OffsetValue) Vals.push_back(OffsetValue); OperandBundleDefT AlignOpB("align", Vals); - return 
CreateAssumption(ConstantInt::getTrue(getContext()), {AlignOpB}); + return CreateAssumption({AlignOpB}); } CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, @@ -1389,15 +1390,13 @@ CallInst *IRBuilderBase::CreateDereferenceableAssumption(Value *PtrValue, "trying to create a deferenceable assumption on a non-pointer?"); SmallVector Vals({PtrValue, SizeValue}); OperandBundleDefT DereferenceableOpB("dereferenceable", Vals); - return CreateAssumption(ConstantInt::getTrue(getContext()), - {DereferenceableOpB}); + return CreateAssumption({DereferenceableOpB}); } CallInst *IRBuilderBase::CreateNonnullAssumption(Value *PtrValue) { assert(isa(PtrValue->getType()) && "trying to create a nonnull assumption on a non-pointer?"); - return CreateAssumption(ConstantInt::getTrue(getContext()), - OperandBundleDef("nonnull", PtrValue)); + return CreateAssumption(OperandBundleDef("nonnull", PtrValue)); } IRBuilderDefaultInserter::~IRBuilderDefaultInserter() = default; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a622e0248fce8..2a8bf3ffecd6f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3604,21 +3604,19 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { break; case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); - SmallVector OpBundles; - II->getOperandBundlesAsDefs(OpBundles); // Canonicalize assume(a && b) -> assume(a); assume(b); // Note: New assumption intrinsics created here are registered by // the InstCombineIRInserter object. 
Value *A, *B; if (match(IIOperand, m_LogicalAnd(m_Value(A), m_Value(B)))) { - Builder.CreateAssumption(A, OpBundles); + Builder.CreateAssumption(A); Builder.CreateAssumption(B); return eraseInstFromFunction(*II); } // assume(!(a || b)) -> assume(!a); assume(!b); if (match(IIOperand, m_Not(m_LogicalOr(m_Value(A), m_Value(B))))) { - Builder.CreateAssumption(Builder.CreateNot(A), OpBundles); + Builder.CreateAssumption(Builder.CreateNot(A)); Builder.CreateAssumption(Builder.CreateNot(B)); return eraseInstFromFunction(*II); } From d0d40cf3c61785b15dda70a528e4d44d27bfab0e Mon Sep 17 00:00:00 2001 From: Zinovy Nis Date: Sun, 10 May 2026 18:16:30 +0300 Subject: [PATCH 183/538] [clang-tidy] Reland "An option for conditional skipping overloaded functions in modernize-use-string-view" (#196387) --- .../modernize/UseStringViewCheck.cpp | 6 +- .../clang-tidy/modernize/UseStringViewCheck.h | 5 +- .../checks/modernize/use-string-view.rst | 10 + .../modernize/use-string-view-overloaded.cpp | 185 ++++++++++++++++++ .../checkers/modernize/use-string-view.cpp | 91 --------- 5 files changed, 203 insertions(+), 94 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view-overloaded.cpp diff --git a/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.cpp index 29e5bdb65632e..9892870279b55 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.cpp @@ -80,6 +80,7 @@ static void fixReturns(const FunctionDecl *FuncDecl, DiagnosticBuilder &Diag, UseStringViewCheck::UseStringViewCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), + CheckOverloadedFunctions(Options.get("CheckOverloadedFunctions", false)), IgnoredFunctions(utils::options::parseStringList( Options.get("IgnoredFunctions", "toString$;ToString$;to_string$"))) { parseReplacementStringViewClass( @@ -87,6 +88,7 
@@ UseStringViewCheck::UseStringViewCheck(StringRef Name, } void UseStringViewCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "CheckOverloadedFunctions", CheckOverloadedFunctions); Options.store(Opts, "IgnoredFunctions", utils::options::serializeStringList(IgnoredFunctions)); Options.store(Opts, "ReplacementStringViewClass", @@ -109,11 +111,13 @@ void UseStringViewCheck::registerMatchers(MatchFinder *Finder) { hasFalseExpression(ignoringParenImpCasts(stringLiteral()))); const auto VirtualOrOperator = cxxMethodDecl(anyOf(cxxConversionDecl(), isVirtual())); + const auto CheckOverloaded = + CheckOverloadedFunctions ? unless(anything()) : isOverloaded(); Finder->addMatcher( functionDecl( isDefinition(), unless(anyOf(VirtualOrOperator, IgnoredFunctionsMatcher, - isOverloaded(), + CheckOverloaded, ast_matchers::isExplicitTemplateSpecialization())), returns(IsStdString), hasDescendant(returnStmt()), unless(hasDescendant(returnStmt(hasReturnValue(unless( diff --git a/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.h index f5f11edc54824..275ce904290ac 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStringViewCheck.h @@ -35,13 +35,14 @@ class UseStringViewCheck : public ClangTidyCheck { StringRef toStringViewTypeStr(StringRef Type) const; void parseReplacementStringViewClass(StringRef Options); + bool CheckOverloadedFunctions = false; + const std::vector IgnoredFunctions; + StringRef StringViewClass = "std::string_view"; StringRef WStringViewClass = "std::wstring_view"; StringRef U8StringViewClass = "std::u8string_view"; StringRef U16StringViewClass = "std::u16string_view"; StringRef U32StringViewClass = "std::u32string_view"; - - const std::vector IgnoredFunctions; }; } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-string-view.rst 
b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-string-view.rst index c72a0480c0eb8..f3d2e0b94a508 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-string-view.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-string-view.rst @@ -108,6 +108,16 @@ simply make an explicit conversion. Options ------- +.. option:: CheckOverloadedFunctions + + If `true`, the check will also consider overloaded functions for + ``string_view`` conversion suggestions. If `false`, overloaded + functions are skipped to avoid potential issues with ambiguous + conversions. + + Default is `false`. + + .. option:: IgnoredFunctions A semicolon-separated list of the names of functions or methods to be diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view-overloaded.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view-overloaded.cpp new file mode 100644 index 0000000000000..bdb30473fda2b --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view-overloaded.cpp @@ -0,0 +1,185 @@ +// RUN: %check_clang_tidy -check-suffix=DONTCHECK \ +// RUN: -std=c++17-or-later %s modernize-use-string-view %t -- \ +// RUN: --config="{CheckOptions: {modernize-use-string-view.CheckOverloadedFunctions: false}}" + +// RUN: %check_clang_tidy -check-suffix=CHECK \ +// RUN: -std=c++17-or-later %s modernize-use-string-view %t -- \ +// RUN: --config="{CheckOptions: {modernize-use-string-view.CheckOverloadedFunctions: true}}" + +// RUN: %check_clang_tidy -check-suffix=CHECK-CXX20 \ +// RUN: -std=c++20-or-later %s modernize-use-string-view %t -- \ +// RUN: --config="{CheckOptions: {modernize-use-string-view.CheckOverloadedFunctions: true}}" \ +// RUN: -- -DUSE_CXX20=1 + +#include +#include + +namespace overload_funcs_redeclared { +std::basic_string overload(int); +std::string overload(int); +std::string overload(int) { return "int"; } +// CHECK-MESSAGES-DONTCHECK:[[@LINE-1]]:1: warning: consider using 
'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK:[[@LINE-2]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-3]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-DONTCHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } +} + +namespace overload_non_func { +struct overload {}; +std::string overload(int) { return "int"; } +// CHECK-MESSAGES-DONTCHECK:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK:[[@LINE-2]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-3]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-DONTCHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } +} + +namespace overload_with_inline { + inline namespace inline_namespace { + std::string overload1(int) { return "int"; } +// CHECK-MESSAGES-DONTCHECK:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK:[[@LINE-2]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-3]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying 
and allocations [modernize-use-string-view] +// CHECK-FIXES-DONTCHECK: std::string_view overload1(int) { return "int"; } +// CHECK-FIXES-CHECK: std::string_view overload1(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload1(int) { return "int"; } + } + inline namespace regular_namespace { + std::string overload1(int) { return "int"; } +// CHECK-MESSAGES-DONTCHECK:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK:[[@LINE-2]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-3]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-DONTCHECK: std::string_view overload1(int) { return "int"; } +// CHECK-FIXES-CHECK: std::string_view overload1(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload1(int) { return "int"; } + } +} + +namespace overload_funcs { +std::string dbl2str(double f); +std::string overload(int) { return "int"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } +std::string overload(double f) { return "f=" + dbl2str(f); } +std::string overload(std::string) { return "string"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:1: warning: consider using 
'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(std::string) { return "string"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(std::string) { return "string"; } +} + +namespace overload_methods { +struct Foo { + // Skip overloaded methods + std::string overload(int) { return "int"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } + std::string overload(double f) { return "double"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(double f) { return "double"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(double f) { return "double"; } + std::string overload(std::string) { return "string"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(std::string) { return "string"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(std::string) { return "string"; } +}; +} + +namespace overload_methods_nested_classes { 
+struct Bar { + std::string overload(int) { return "int"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } + std::string overload(std::string) { return "string"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(std::string) { return "string"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(std::string) { return "string"; } + struct FooBar { + std::string overload(char*) { return "char*"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(char*) { return "char*"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(char*) { return "char*"; } + std::string overload(double f) { return "double"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// 
CHECK-FIXES-CHECK: std::string_view overload(double f) { return "double"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(double f) { return "double"; } + }; +}; +} + +namespace two_overloads_with_inline { + inline namespace inline_namespace { + std::string overload(int) { return "int"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } + std::string overload(double) { return "double"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(double) { return "double"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(double) { return "double"; } + } + std::string overload(int) { return "int"; } +// CHECK-MESSAGES-CHECK:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-2]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK: std::string_view overload(int) { return "int"; } +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } +} + +#if USE_CXX20 + +namespace overload_with_outer { +namespace overload_with_templates { + template + std::string overload(T) { return "T"; } +// 
CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] + std::string overload(std::string) { return "string"; } +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(std::string) { return "string"; } +} +using overload_with_templates::overload; +std::string overload(char*) { return "char*"; } +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(char*) { return "char*"; } +} + +namespace overload_methods_nested_namespaces { +namespace foo { + std::string overload(int) { return "int"; } +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int) { return "int"; } + std::string overload(std::string) { return "string"; } +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:3: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(std::string) { return "string"; } +} +using foo::overload; +std::string overload(char*) { return "char*"; } +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(char*) { return "char*"; } +} + +namespace overload_methods_templated { + template + std::string overload(T value) { return "T";} +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary 
copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(T value) { return "T";} + std::string overload(int value) { return "int"; } +// CHECK-MESSAGES-CHECK-CXX20:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] +// CHECK-FIXES-CHECK-CXX20: std::string_view overload(int value) { return "int"; } +} +#endif diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view.cpp index 07c7eaff4a4c3..26a72c1c242e1 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-string-view.cpp @@ -203,44 +203,6 @@ MyString aliasedWChar() { return L"aliasedWChar"; } -namespace overload_funcs_redeclared { -std::basic_string overload(int); -std::string overload(int); -std::string overload(int) { return "int"; } -// CHECK-MESSAGES:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] -// CHECK-FIXES: std::string_view overload(int) { return "int"; } -} - -namespace overload_non_func { -struct overload {}; -std::string overload(int) { return "int"; } -// CHECK-MESSAGES:[[@LINE-1]]:1: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] -// CHECK-FIXES: std::string_view overload(int) { return "int"; } -} - -namespace overload_with_inline { - inline namespace inline_namespace { - std::string overload(int) { return "int"; } -// CHECK-MESSAGES:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] -// CHECK-FIXES: std::string_view overload(int) { return "int"; } - } - inline namespace regular_namespace { - std::string overload(int) { return "int"; } -// 
CHECK-MESSAGES:[[@LINE-1]]:5: warning: consider using 'std::string_view' to avoid unnecessary copying and allocations [modernize-use-string-view] -// CHECK-FIXES: std::string_view overload(int) { return "int"; } - } -} - -namespace overload_with_outer { -namespace overload_with_templates { - template - std::string overload(T) { return "T"; } - std::string overload(std::string) { return "string"; } -} -using overload_with_templates::overload; -std::string overload(char*) { return "char*"; } -} - // ========================================================== // Negative tests // ========================================================== @@ -395,59 +357,6 @@ std::string lambda() { }(); } -namespace overload_funcs { -std::string dbl2str(double f); -// Skip overloaded functions -std::string overload(int) { return "int"; } -// Because of this overload (non-literal return) the fix should not be applied -std::string overload(double f) { return "f=" + dbl2str(f); } -std::string overload(std::string) { return "string"; } -} - -namespace overload_methods { -struct Foo { - // Skip overloaded methods - std::string overload(int) { return "int"; } - std::string overload(double f) { return "double"; } - std::string overload(std::string) { return "string"; } -}; -} - -namespace overload_methods_nested_classes { -struct Bar { - std::string overload(int) { return "int"; } - std::string overload(std::string) { return "string"; } - - struct FooBar { - std::string overload(char*) { return "char*"; } - std::string overload(double f) { return "double"; } - }; -}; -} - -namespace overload_methods_nested_namespaces { -namespace foo { - std::string overload(int) { return "int"; } - std::string overload(std::string) { return "string"; } -} -using foo::overload; -std::string overload(char*) { return "char*"; } -} - -namespace overload_methods_templated { - template - std::string overload(T value) { return "T";} - std::string overload(int value) { return "int"; } -} - -namespace 
two_overloads_with_inline { - inline namespace inline_namespace { - std::string overload(int) { return "int"; } - std::string overload(double) { return "double"; } - } - std::string overload(int) { return "int"; } -} - struct TemplateString { static constexpr char* val = "TEMPLATE"; template From 9c95f3765930b964d2a42c3f08dae2fd281cc49b Mon Sep 17 00:00:00 2001 From: Jiahao Guo Date: Sun, 10 May 2026 23:56:18 +0800 Subject: [PATCH 184/538] [CIR][AArch64] Lower NEON vuzp intrinsics (#195591) ### Summary Part of: https://github.com/llvm/llvm-project/issues/185382 Lower `vuzp` intrinsics in: https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#unzip-elements This is a follow-up to: https://github.com/llvm/llvm-project/pull/195527 Lower `NEON::BI__builtin_neon_vuzp_v` and `NEON::BI__builtin_neon_vuzpq_v` in CIRGenBuiltinAArch64.cpp by porting the existing incubator logic (clangir/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp): two bitcasts on the input vectors, two rounds of cir.vec.shuffle generating the deinterleave (even/odd) shuffle patterns with indices 2*i+vi, each stored via ptr_stride on the sret base pointer. 
--- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 22 +- .../fp8-intrinsics/acle_neon_fp8_untyped.c | 36 -- clang/test/CodeGen/AArch64/neon-perm.c | 376 ----------------- clang/test/CodeGen/AArch64/neon/perm.c | 382 ++++++++++++++++++ 4 files changed, 402 insertions(+), 414 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 16018d9b080ad..26c64c6b26dca 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2871,12 +2871,30 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, case NEON::BI__builtin_neon_vst4q_lane_v: case NEON::BI__builtin_neon_vtrn_v: case NEON::BI__builtin_neon_vtrnq_v: - case NEON::BI__builtin_neon_vuzp_v: - case NEON::BI__builtin_neon_vuzpq_v: cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented AArch64 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return mlir::Value{}; + case NEON::BI__builtin_neon_vuzp_v: + case NEON::BI__builtin_neon_vuzpq_v: { + ops[1] = builder.createBitcast(ops[1], ty); + ops[2] = builder.createBitcast(ops[2], ty); + // Adding a bitcast here as Ops[0] might be a void pointer. 
+ mlir::Value baseAddr = + builder.createBitcast(ops[0], builder.getPointerTo(ty)); + mlir::Value sv; + for (unsigned vi = 0; vi != 2; ++vi) { + llvm::SmallVector indices; + for (unsigned i = 0, e = ty.getSize(); i != e; ++i) { + indices.push_back(2 * i + vi); + } + cir::ConstantOp idx = builder.getConstInt(loc, builder.getSInt32Ty(), vi); + mlir::Value addr = builder.createPtrStride(loc, baseAddr, idx); + sv = builder.createVecShuffle(loc, ops[1], ops[2], indices); + (void)builder.CIRBaseBuilderTy::createStore(loc, sv, addr); + } + return sv; + } case NEON::BI__builtin_neon_vzip_v: case NEON::BI__builtin_neon_vzipq_v: { ops[1] = builder.createBitcast(ops[1], ty); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c index bdb2cc1984963..0a9a88fc249b0 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -519,42 +519,6 @@ mfloat8x16x2_t test_vtrnq_mf8(mfloat8x16_t a, mfloat8x16_t b) { return vtrnq_mf8(a, b); } -// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vuzp_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 -// CHECK-NEXT: 
[[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -mfloat8x8x2_t test_vuzp_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vuzp_mf8(a, b); -} - -// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vuzpq_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] -// -mfloat8x16x2_t test_vuzpq_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vuzpq_mf8(a, b); -} - // CHECK-LABEL: define dso_local void @test_vcopy_lane_mf8( // CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c index 
edabc8a002953..7e3745b952f50 100644 --- a/clang/test/CodeGen/AArch64/neon-perm.c +++ b/clang/test/CodeGen/AArch64/neon-perm.c @@ -846,382 +846,6 @@ poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { return vtrn2q_p16(a, b); } -// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vuzp_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { - return vuzp_s8(a, b); -} - -// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vuzp_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VUZP_I:%.*]] = 
shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, <4 x i16> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT]] -// -int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { - return vuzp_s16(a, b); -} - -// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vuzp_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, <2 x i32> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VUZP1_I]], 0, 1 
-// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT]] -// -int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { - return vuzp_s32(a, b); -} - -// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vuzp_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { - return vuzp_u8(a, b); -} - -// CHECK-LABEL: define dso_local 
%struct.uint16x4x2_t @test_vuzp_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, <4 x i16> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT]] -// -uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { - return vuzp_u16(a, b); -} - -// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vuzp_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x 
i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, <2 x i32> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT]] -// -uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { - return vuzp_u32(a, b); -} - -// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vuzp_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> -// CHECK-NEXT: 
[[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT3]], <2 x float> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT4]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, <2 x float> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT]] -// -float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { - return vuzp_f32(a, b); -} - -// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vuzp_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue 
[[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { - return vuzp_p8(a, b); -} - -// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vuzp_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, <4 x i16> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT]] -// -poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { - return vuzp_p16(a, b); -} - -// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vuzpq_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// 
CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT]] -// -int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { - return vuzpq_s8(a, b); -} - -// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vuzpq_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, <8 x i16> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] 
[[DOTFCA_0_0_INSERT1]], <8 x i16> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { - return vuzpq_s16(a, b); -} - -// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vuzpq_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, <4 x i32> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, <4 x 
i32> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT]] -// -int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { - return vuzpq_s32(a, b); -} - -// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vuzpq_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT]] -// -uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { - return vuzpq_u8(a, b); -} - -// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vuzpq_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, <8 x i16> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { - return vuzpq_u16(a, b); -} - -// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vuzpq_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] 
poison, <4 x i32> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT]] -// -uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { - return vuzpq_u32(a, b); -} - -// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vuzpq_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x float> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP6:%.*]] = extractvalue 
[[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, <4 x float> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT]] -// -float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { - return vuzpq_f32(a, b); -} - -// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vuzpq_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT]] -// -poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { - return vuzpq_p8(a, b); -} - -// CHECK-LABEL: define 
dso_local %struct.poly16x8x2_t @test_vuzpq_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, <8 x i16> [[VUZP_I]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VUZP1_I]], 0, 1 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 -// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0 -// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1 -// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 -// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 -// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT]] -// -poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { - return vuzpq_p16(a, b); -} - // CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vtrn_s8( // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon/perm.c b/clang/test/CodeGen/AArch64/neon/perm.c index 6173ce43d18c7..6c3a9a44a7084 100644 --- a/clang/test/CodeGen/AArch64/neon/perm.c +++ b/clang/test/CodeGen/AArch64/neon/perm.c @@ -921,3 
+921,385 @@ mfloat8x16x2_t test_vzipq_mf8(mfloat8x16_t a, mfloat8x16_t b) { // LLVM: ret %struct.mfloat8x16x2_t return vzipq_mf8(a, b); } + +//===------------------------------------------------------===// +// 2.1.9.11. Unzip elements +// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#unzip-elements +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vuzp_s8( +// CIR-LABEL: @vuzp_s8( +int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { +// CIR: [[LO:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !s8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<8 x !s8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !s8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !s8i>, !cir.ptr> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: insertvalue %struct.int8x8x2_t poison, <8 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.int8x8x2_t {{.*}}, <8 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.int8x8x2_t +return vuzp_s8(a, b); +} + +// LLVM-LABEL: @test_vuzp_s16( +// CIR-LABEL: @vuzp_s16( +int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i> +// CIR: [[LO:%.*]] = 
cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !s16i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !s16i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<4 x !s16i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !s16i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i] : !cir.vector<4 x !s16i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<4 x !s16i>, !cir.ptr> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: insertvalue %struct.int16x4x2_t poison, <4 x i16> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.int16x4x2_t {{.*}}, <4 x i16> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.int16x4x2_t +return vuzp_s16(a, b); +} + +// LLVM-LABEL: @test_vuzp_u8( +// CIR-LABEL: @vuzp_u8( +uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<8 x !u8i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<8 x !u8i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !u8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !u8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<8 x !u8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !u8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !u8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !u8i>, 
!cir.ptr> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: insertvalue %struct.uint8x8x2_t poison, <8 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.uint8x8x2_t {{.*}}, <8 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.uint8x8x2_t +return vuzp_u8(a, b); +} + +// LLVM-LABEL: @test_vuzp_u16( +// CIR-LABEL: @vuzp_u16( +uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !u16i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !u16i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !u16i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !u16i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<4 x !u16i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !u16i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i] : !cir.vector<4 x !u16i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<4 x !u16i>, !cir.ptr> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: insertvalue %struct.uint16x4x2_t poison, <4 x i16> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.uint16x4x2_t {{.*}}, <4 x i16> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.uint16x4x2_t +return vuzp_u16(a, b); +} + +// LLVM-LABEL: @test_vuzp_p8( +// CIR-LABEL: @vuzp_p8( +poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { +// CIR: [[LO:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) 
[#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !s8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<8 x !s8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !s8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !s8i>, !cir.ptr> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: insertvalue %struct.poly8x8x2_t poison, <8 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.poly8x8x2_t {{.*}}, <8 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.poly8x8x2_t +return vuzp_p8(a, b); +} + +// LLVM-LABEL: @test_vuzp_p16( +// CIR-LABEL: @vuzp_p16( +poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !s16i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !s16i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<4 x !s16i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !s16i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i] : !cir.vector<4 x !s16i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<4 x !s16i>, !cir.ptr> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} 
[[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: insertvalue %struct.poly16x4x2_t poison, <4 x i16> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.poly16x4x2_t {{.*}}, <4 x i16> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.poly16x4x2_t +return vuzp_p16(a, b); +} + +// LLVM-LABEL: @test_vuzp_mf8( +// CIR-LABEL: @vuzp_mf8( +mfloat8x8x2_t test_vuzp_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<8 x !u8i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<8 x !u8i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !u8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !u8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<8 x !u8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !u8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !u8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !u8i>, !cir.ptr> + +// LLVM-SAME: <8 x i8> {{.*}}[[A:%.*]], <8 x i8> {{.*}}[[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: insertvalue %struct.mfloat8x8x2_t poison, <8 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.mfloat8x8x2_t {{.*}}, <8 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.mfloat8x8x2_t +return vuzp_mf8(a, b); +} + +// LLVM-LABEL: @test_vuzp_s32( +// CIR-LABEL: @vuzp_s32( +int32x2x2_t 
test_vuzp_s32(int32x2_t a, int32x2_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<2 x !s32i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x !s32i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<2 x !s32i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<2 x !s32i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !s32i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<2 x !s32i>, !cir.ptr> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: insertvalue %struct.int32x2x2_t poison, <2 x i32> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.int32x2x2_t {{.*}}, <2 x i32> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.int32x2x2_t +return vuzp_s32(a, b); +} + +// LLVM-LABEL: @test_vuzp_f32( +// CIR-LABEL: @vuzp_f32( +float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !cir.float> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !cir.float> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<2 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x !cir.float> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<2 x !cir.float>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<2 x !cir.float>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !cir.float> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<2 x !cir.float>, !cir.ptr> + +// LLVM-SAME: 
<2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: insertvalue %struct.float32x2x2_t poison, <2 x float> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.float32x2x2_t {{.*}}, <2 x float> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.float32x2x2_t +return vuzp_f32(a, b); +} + +// LLVM-LABEL: @test_vuzp_u32( +// CIR-LABEL: @vuzp_u32( +uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !u32i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !u32i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<2 x !u32i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x !u32i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<2 x !u32i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<2 x !u32i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !u32i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<2 x !u32i>, !cir.ptr> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: insertvalue %struct.uint32x2x2_t poison, <2 x i32> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.uint32x2x2_t {{.*}}, <2 x i32> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.uint32x2x2_t +return vuzp_u32(a, b); +} + +// LLVM-LABEL: @test_vuzpq_s8( +// CIR-LABEL: @vuzpq_s8( +int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { +// CIR: [[LO:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : 
!s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i, #cir.int<16> : !s32i, #cir.int<18> : !s32i, #cir.int<20> : !s32i, #cir.int<22> : !s32i, #cir.int<24> : !s32i, #cir.int<26> : !s32i, #cir.int<28> : !s32i, #cir.int<30> : !s32i] : !cir.vector<16 x !s8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<16 x !s8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, #cir.int<19> : !s32i, #cir.int<21> : !s32i, #cir.int<23> : !s32i, #cir.int<25> : !s32i, #cir.int<27> : !s32i, #cir.int<29> : !s32i, #cir.int<31> : !s32i] : !cir.vector<16 x !s8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<16 x !s8i>, !cir.ptr> + +// LLVM-SAME: <16 x i8> {{.*}} [[A:%.*]], <16 x i8> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: insertvalue %struct.int8x16x2_t poison, <16 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.int8x16x2_t {{.*}}, <16 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.int8x16x2_t +return vuzpq_s8(a, b); +} + +// LLVM-LABEL: @test_vuzpq_s16( +// CIR-LABEL: @vuzpq_s16( +int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !s16i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !s16i> +// CIR: cir.store [[LO]], 
%{{.*}} : !cir.vector<8 x !s16i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !s16i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !s16i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !s16i>, !cir.ptr> + +// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]], <8 x i16> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: insertvalue %struct.int16x8x2_t poison, <8 x i16> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.int16x8x2_t {{.*}}, <8 x i16> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.int16x8x2_t +return vuzpq_s16(a, b); +} + +// LLVM-LABEL: @test_vuzpq_s32( +// CIR-LABEL: @vuzpq_s32( +int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !s32i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !s32i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !s32i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i] : !cir.vector<4 x !s32i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<4 x !s32i>, !cir.ptr> + +// LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]], <4 x i32> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: 
insertvalue %struct.int32x4x2_t poison, <4 x i32> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.int32x4x2_t {{.*}}, <4 x i32> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.int32x4x2_t +return vuzpq_s32(a, b); +} + +// LLVM-LABEL: @test_vuzpq_f32( +// CIR-LABEL: @vuzpq_f32( +float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !cir.float> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !cir.float> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !cir.float> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<4 x !cir.float>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !cir.float>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i] : !cir.vector<4 x !cir.float> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<4 x !cir.float>, !cir.ptr> + +// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: insertvalue %struct.float32x4x2_t poison, <4 x float> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.float32x4x2_t {{.*}}, <4 x float> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.float32x4x2_t +return vuzpq_f32(a, b); +} + +// LLVM-LABEL: @test_vuzpq_u8( +// CIR-LABEL: @vuzpq_u8( +uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<16 x !u8i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<16 x !u8i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : 
!cir.vector<16 x !u8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i, #cir.int<16> : !s32i, #cir.int<18> : !s32i, #cir.int<20> : !s32i, #cir.int<22> : !s32i, #cir.int<24> : !s32i, #cir.int<26> : !s32i, #cir.int<28> : !s32i, #cir.int<30> : !s32i] : !cir.vector<16 x !u8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<16 x !u8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<16 x !u8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, #cir.int<19> : !s32i, #cir.int<21> : !s32i, #cir.int<23> : !s32i, #cir.int<25> : !s32i, #cir.int<27> : !s32i, #cir.int<29> : !s32i, #cir.int<31> : !s32i] : !cir.vector<16 x !u8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<16 x !u8i>, !cir.ptr> + +// LLVM-SAME: <16 x i8> {{.*}} [[A:%.*]], <16 x i8> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: insertvalue %struct.uint8x16x2_t poison, <16 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.uint8x16x2_t {{.*}}, <16 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.uint8x16x2_t +return vuzpq_u8(a, b); +} + +// LLVM-LABEL: @test_vuzpq_u32( +// CIR-LABEL: @vuzpq_u32( +uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !u32i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !u32i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !u32i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !u32i> 
+// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<4 x !u32i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<4 x !u32i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i] : !cir.vector<4 x !u32i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<4 x !u32i>, !cir.ptr> + +// LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]], <4 x i32> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: insertvalue %struct.uint32x4x2_t poison, <4 x i32> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.uint32x4x2_t {{.*}}, <4 x i32> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.uint32x4x2_t +return vuzpq_u32(a, b); +} + +// LLVM-LABEL: @test_vuzpq_p8( +// CIR-LABEL: @vuzpq_p8( +poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { +// CIR: [[LO:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i, #cir.int<16> : !s32i, #cir.int<18> : !s32i, #cir.int<20> : !s32i, #cir.int<22> : !s32i, #cir.int<24> : !s32i, #cir.int<26> : !s32i, #cir.int<28> : !s32i, #cir.int<30> : !s32i] : !cir.vector<16 x !s8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<16 x !s8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, #cir.int<19> : !s32i, #cir.int<21> : !s32i, #cir.int<23> : !s32i, #cir.int<25> : !s32i, #cir.int<27> : !s32i, #cir.int<29> : !s32i, #cir.int<31> : !s32i] : !cir.vector<16 x !s8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<16 x !s8i>, !cir.ptr> + +// LLVM-SAME: 
<16 x i8> {{.*}} [[A:%.*]], <16 x i8> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: insertvalue %struct.poly8x16x2_t poison, <16 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.poly8x16x2_t {{.*}}, <16 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.poly8x16x2_t +return vuzpq_p8(a, b); +} + +// LLVM-LABEL: @test_vuzpq_p16( +// CIR-LABEL: @vuzpq_p16( +poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !s16i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !s16i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<8 x !s16i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !s16i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !s16i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !s16i>, !cir.ptr> + +// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]], <8 x i16> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: insertvalue %struct.poly16x8x2_t poison, <8 x i16> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.poly16x8x2_t {{.*}}, <8 x i16> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.poly16x8x2_t +return vuzpq_p16(a, b); +} + +// LLVM-LABEL: 
@test_vuzpq_mf8( +// CIR-LABEL: @vuzpq_mf8( +mfloat8x16x2_t test_vuzpq_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<16 x !u8i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<16 x !u8i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<16 x !u8i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i, #cir.int<16> : !s32i, #cir.int<18> : !s32i, #cir.int<20> : !s32i, #cir.int<22> : !s32i, #cir.int<24> : !s32i, #cir.int<26> : !s32i, #cir.int<28> : !s32i, #cir.int<30> : !s32i] : !cir.vector<16 x !u8i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<16 x !u8i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<16 x !u8i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, #cir.int<19> : !s32i, #cir.int<21> : !s32i, #cir.int<23> : !s32i, #cir.int<25> : !s32i, #cir.int<27> : !s32i, #cir.int<29> : !s32i, #cir.int<31> : !s32i] : !cir.vector<16 x !u8i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<16 x !u8i>, !cir.ptr> + +// LLVM-SAME: <16 x i8> {{.*}}[[A:%.*]], <16 x i8> {{.*}}[[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: insertvalue %struct.mfloat8x16x2_t poison, <16 x i8> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.mfloat8x16x2_t {{.*}}, <16 x i8> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.mfloat8x16x2_t +return vuzpq_mf8(a, b); +} + +//===------------------------------------------------------===// +// 2.1.9.14. 
Unzip elements` +// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#unzip-elements-1 +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vuzpq_u16( +// CIR-LABEL: @vuzpq_u16( +uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { +// CIR: [[A_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !u16i> +// CIR: [[B_CAST:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !u16i> +// CIR: [[LO:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !u16i>) [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i, #cir.int<10> : !s32i, #cir.int<12> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !u16i> +// CIR: cir.store [[LO]], %{{.*}} : !cir.vector<8 x !u16i>, !cir.ptr> +// CIR: [[HI:%.*]] = cir.vec.shuffle([[A_CAST]], [[B_CAST]] : !cir.vector<8 x !u16i>) [#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<15> : !s32i] : !cir.vector<8 x !u16i> +// CIR: cir.store [[HI]], %{{.*}} : !cir.vector<8 x !u16i>, !cir.ptr> + +// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]], <8 x i16> {{.*}} [[B:%.*]]) {{.*}} { +// LLVM: [[VUZP_LO:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: [[VUZP_HI:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: insertvalue %struct.uint16x8x2_t poison, <8 x i16> [[VUZP_LO]], 0, 0 +// LLVM: insertvalue %struct.uint16x8x2_t {{.*}}, <8 x i16> [[VUZP_HI]], 0, 1 +// LLVM: ret %struct.uint16x8x2_t +return vuzpq_u16(a, b); +} From 8730fb782b815bfec78f2731798258e105024cf7 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Mon, 11 May 2026 00:58:19 +0900 Subject: [PATCH 185/538] [llvm][RISCV] Optimize fneg for fixed vectors (#194555) vfneg is not available on zvfhmin or zvfbfmin, it's expected to expand to integer operations instead of unrolling 
to scalar operations. General expandFNEG already handles that in most of cases except for fixed vector types that are not promotable, we need to find a better heuristic to gate this. --- .../SelectionDAG/LegalizeVectorOps.cpp | 24 ++- llvm/test/CodeGen/NVPTX/f16-instructions.ll | 3 +- .../RISCV/rvv/fixed-vectors-vfneg-sdnode.ll | 195 +++++++++++++++--- llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll | 47 ++--- 4 files changed, 196 insertions(+), 73 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 0a9c3dda7f330..21f9bce565efe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2066,10 +2066,26 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { if (!TLI.isOperationLegalOrCustom(ISD::XOR, IntVT)) return SDValue(); - // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. - if (!TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) && - !VT.isScalableVector()) - return SDValue(); + // Heuristic check to determine whether vector should be expanded to integer + // operations or unrolled to scalar operations. + // 1. Scalable vector is never unrolled. + // 2. Fixed vector is unrolled if one of followings is true: + // a. Vector only has 1 element and target knows how to handle scalar + // FNEG (either legal or custom expand or promote). + // b. Vector has more than 1 element and target supports scalar + // FNEG natively and vector length <= 2(1 XOR + 1 CONST). + // FIXME: Scalar construction instruction count varies in every architecture, + // here we assume 1 instruction for now. 
+ if (VT.isFixedLengthVector()) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + if ((NumElts == 1 && + TLI.isOperationLegalOrCustomOrPromote(ISD::FNEG, EltVT)) || + (NumElts < 3 && TLI.isOperationLegal(ISD::FNEG, EltVT) && + TLI.isExtractVecEltCheap(VT, 0) && + (NumElts == 1 || TLI.isExtractVecEltCheap(VT, 1)))) + return SDValue(); + } SDLoc DL(Node); SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index 53288b35d55a4..38c5b567b1378 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -1176,8 +1176,7 @@ define half @test_neg_f16(half noundef %arg) #0 { ; CHECK-LABEL: test_neg_f16x2( ; CHECK-F16-NOFTZ: neg.f16x2 ; CHECK-F16-FTZ: neg.ftz.f16x2 -; CHECK-NOF16: xor.b16 %rs{{.*}}, %rs{{.*}}, -32768 -; CHECK-NOF16: xor.b16 %rs{{.*}}, %rs{{.*}}, -32768 +; CHECK-NOF16: xor.b32 %r{{.*}}, %r{{.*}}, -2147450880 define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 { %res = fneg <2 x half> %arg ret <2 x half> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll index b3b9a62600f46..cf16fde1354b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll @@ -1,66 +1,193 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa,+v \ -; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa,+v \ -; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+experimental-zvfbfa \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; 
RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+experimental-zvfbfa \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFMIN +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFMIN define <1 x bfloat> @v1bf16(<1 x bfloat> %va) { -; CHECK-LABEL: v1bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma -; CHECK-NEXT: vfneg.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v1bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %vb = fneg <1 x bfloat> %va ret <1 x bfloat> %vb } define <2 x bfloat> @v2bf16(<2 x bfloat> %va) { -; CHECK-LABEL: v2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma -; CHECK-NEXT: vfneg.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v2bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %vb = fneg <2 x bfloat> %va ret <2 x bfloat> %vb } define <4 x bfloat> @v4bf16(<4 x bfloat> %va) { -; CHECK-LABEL: v4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma -; CHECK-NEXT: vfneg.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: 
vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v4bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %vb = fneg <4 x bfloat> %va ret <4 x bfloat> %vb } define <8 x bfloat> @v8bf16(<8 x bfloat> %va) { -; CHECK-LABEL: v8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma -; CHECK-NEXT: vfneg.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v8bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %vb = fneg <8 x bfloat> %va ret <8 x bfloat> %vb } define <16 x bfloat> @v16bf16(<16 x bfloat> %va) { -; CHECK-LABEL: v16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma -; CHECK-NEXT: vfneg.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v16bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %vb = fneg <16 x bfloat> %va ret <16 x bfloat> %vb } define <32 x bfloat> @v32bf16(<32 x bfloat> %va) { -; CHECK-LABEL: v32bf16: +; ZVFBFA-LABEL: v32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: li a0, 32 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v32bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: li a0, 32 +; ZVFBFMIN-NEXT: lui a1, 8 +; ZVFBFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFBFMIN-NEXT: ret + %vb = fneg <32 x bfloat> %va + ret <32 x bfloat> %vb +} + +define <64 
x bfloat> @v64bf16(<64 x bfloat> %va) { +; ZVFBFA-LABEL: v64bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: li a0, 64 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v64bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: li a0, 64 +; ZVFBFMIN-NEXT: lui a1, 8 +; ZVFBFMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFBFMIN-NEXT: ret + %vb = fneg <64 x bfloat> %va + ret <64 x bfloat> %vb +} + +define <1 x half> @v1f16(<1 x half> %va) { +; CHECK-LABEL: v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a0 +; CHECK-NEXT: ret + %vb = fneg <1 x half> %va + ret <1 x half> %vb +} + +define <2 x half> @v2f16(<2 x half> %va) { +; CHECK-LABEL: v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a0 +; CHECK-NEXT: ret + %vb = fneg <2 x half> %va + ret <2 x half> %vb +} + +define <8 x half> @v8f16(<8 x half> %va) { +; CHECK-LABEL: v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a0 +; CHECK-NEXT: ret + %vb = fneg <8 x half> %va + ret <8 x half> %vb +} + +define <32 x half> @v32f16(<32 x half> %va) { +; CHECK-LABEL: v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma -; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a1 ; CHECK-NEXT: ret - %vb = fneg <32 x bfloat> %va - ret <32 x bfloat> %vb + %vb = fneg <32 x half> %va + ret <32 x half> %vb +} + +define <64 x half> @v64f16(<64 x half> %va) { +; CHECK-LABEL: v64f16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: ret + %vb = fneg <64 x half> 
%va + ret <64 x half> %vb } diff --git a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll index 0e993f35ce85d..8b8b897a32eda 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll @@ -5,22 +5,8 @@ define arm_aapcs_vfpcc <8 x half> @fneg_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fneg_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s4, s0 -; CHECK-MVE-NEXT: vneg.f16 s0, s0 -; CHECK-MVE-NEXT: vneg.f16 s4, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s1 -; CHECK-MVE-NEXT: vneg.f16 s4, s4 -; CHECK-MVE-NEXT: vneg.f16 s1, s1 -; CHECK-MVE-NEXT: vins.f16 s1, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s2 -; CHECK-MVE-NEXT: vneg.f16 s4, s4 -; CHECK-MVE-NEXT: vneg.f16 s2, s2 -; CHECK-MVE-NEXT: vins.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s3 -; CHECK-MVE-NEXT: vneg.f16 s4, s4 -; CHECK-MVE-NEXT: vneg.f16 s3, s3 -; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vmov.i16 q1, #0x8000 +; CHECK-MVE-NEXT: veor q0, q0, q1 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fneg_float16_t: @@ -35,10 +21,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @fneg_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fneg_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vneg.f32 s3, s3 -; CHECK-MVE-NEXT: vneg.f32 s2, s2 -; CHECK-MVE-NEXT: vneg.f32 s1, s1 -; CHECK-MVE-NEXT: vneg.f32 s0, s0 +; CHECK-MVE-NEXT: vmov.i32 q1, #0x80000000 +; CHECK-MVE-NEXT: veor q0, q0, q1 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fneg_float32_t: @@ -53,20 +37,17 @@ entry: define arm_aapcs_vfpcc <2 x double> @fneg_float64_t(<2 x double> %src) { ; CHECK-LABEL: fneg_float64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vstr d1, [sp] -; CHECK-NEXT: ldrb.w r0, [sp, #7] -; CHECK-NEXT: vstr d0, [sp, #8] -; CHECK-NEXT: ldrb.w r1, [sp, #15] -; CHECK-NEXT: eor r0, r0, #128 -; CHECK-NEXT: strb.w r0, [sp, #7] -; CHECK-NEXT: vldr d1, 
[sp] -; CHECK-NEXT: eor r0, r1, #128 -; CHECK-NEXT: strb.w r0, [sp, #15] -; CHECK-NEXT: vldr d0, [sp, #8] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: adr r0, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: veor q0, q0, q1 ; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 entry: %0 = fsub nnan ninf nsz <2 x double> , %src ret <2 x double> %0 From 7cc1f32481015e50f29c9caf92524201bbdb0a50 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Mon, 11 May 2026 00:58:48 +0900 Subject: [PATCH 186/538] [llvm][RISCV] Optimize fabs for fixed vectors (#194554) vfabs is not available on zvfhmin or zvfbfmin, it's expected to expand to integer operations instead of unrolling to scalar operations. General expandFABS already handles that in most of cases except for fixed vector types that are not promotable, we need to find a better heuristic to gate this. --- .../SelectionDAG/LegalizeVectorOps.cpp | 24 +- llvm/test/CodeGen/ARM/vfloatintrinsics.ll | 4 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 36 +-- .../RISCV/rvv/fixed-vectors-vfabs-sdnode.ll | 209 +++++++++++++++--- llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll | 42 +--- llvm/test/CodeGen/Thumb2/mve-vabd.ll | 66 +++--- 6 files changed, 241 insertions(+), 140 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 21f9bce565efe..c014494679130 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2102,10 +2102,26 @@ SDValue VectorLegalizer::ExpandFABS(SDNode *Node) { if (!TLI.isOperationLegalOrCustom(ISD::AND, IntVT)) return SDValue(); - // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. 
- if (!TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) && - !VT.isScalableVector()) - return SDValue(); + // Heuristic check to determine whether vector should be expanded to integer + // operations or unrolled to scalar operations. + // 1. Scalable vector is never unrolled. + // 2. Fixed vector is unrolled if one of followings is true: + // a. Vector only has 1 element and target knows how to handle scalar + // FABS(either legal or custom expand or promote). + // b. Vector has more than 1 element and target supports scalar + // FABS natively and vector length <= 2(1 AND + 1 CONST). + // FIXME: Scalar construction instruction count varies in every architecture, + // here we assume 1 instruction for now. + if (VT.isFixedLengthVector()) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + if ((NumElts == 1 && + TLI.isOperationLegalOrCustomOrPromote(ISD::FABS, EltVT)) || + (NumElts < 3 && TLI.isOperationLegal(ISD::FABS, EltVT) && + TLI.isExtractVecEltCheap(VT, 0) && + (NumElts == 1 || TLI.isExtractVecEltCheap(VT, 1)))) + return SDValue(); + } SDLoc DL(Node); SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); diff --git a/llvm/test/CodeGen/ARM/vfloatintrinsics.ll b/llvm/test/CodeGen/ARM/vfloatintrinsics.ll index 74782d44c7423..60e772961236a 100644 --- a/llvm/test/CodeGen/ARM/vfloatintrinsics.ll +++ b/llvm/test/CodeGen/ARM/vfloatintrinsics.ll @@ -341,8 +341,8 @@ define %v2f64 @test_v2f64.fma(%v2f64 %a, %v2f64 %b, %v2f64 %c) { } ; CHECK-LABEL: test_v2f64.fabs:{{.*}} define %v2f64 @test_v2f64.fabs(%v2f64 %a) { - ; CHECK: bfc {{r[1,3]}}, #31, #1 - ; CHECK: bfc {{r[1,3]}}, #31, #1 + ; CHECK: vld1.64 + ; CHECK: vand %1 = call %v2f64 @llvm.fabs.v2f64(%v2f64 %a) ret %v2f64 %1 } diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 3ebaf68d4a15f..9ba15737ff641 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ 
b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1794,33 +1794,15 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { } define <2 x half> @test_fabs(<2 x half> %a) #0 { -; CHECK-F16-LABEL: test_fabs( -; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b32 %r<3>; -; CHECK-F16-EMPTY: -; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; -; CHECK-F16-NEXT: and.b32 %r2, %r1, 2147450879; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-F16-NEXT: ret; -; -; CHECK-NOF16-LABEL: test_fabs( -; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; -; CHECK-NOF16-NEXT: .reg .b32 %r<7>; -; CHECK-NOF16-EMPTY: -; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; -; CHECK-NOF16-NEXT: abs.f32 %r3, %r2; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs1; -; CHECK-NOF16-NEXT: abs.f32 %r5, %r4; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs4, %r5; -; CHECK-NOF16-NEXT: mov.b32 %r6, {%rs4, %rs3}; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6; -; CHECK-NOF16-NEXT: ret; +; CHECK-LABEL: test_fabs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; +; CHECK-NEXT: and.b32 %r2, %r1, 2147450879; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a) ret <2 x half> %r } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll index 27c00de3c3487..5b76750e441aa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll @@ -1,66 +1,205 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 
-mattr=+experimental-zvfbfa,+v \ -; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa,+v \ -; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+experimental-zvfbfa \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+experimental-zvfbfa \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFMIN +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFMIN define <1 x bfloat> @v1bf16(<1 x bfloat> %v) { -; CHECK-LABEL: v1bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma -; CHECK-NEXT: vfabs.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v1bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %r = call <1 x bfloat> @llvm.fabs.v1bf16(<1 x bfloat> %v) ret <1 x bfloat> %r } define <2 x bfloat> @v2bf16(<2 x bfloat> %v) { -; CHECK-LABEL: v2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma -; CHECK-NEXT: vfabs.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v2bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 
8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %r = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %v) ret <2 x bfloat> %r } define <4 x bfloat> @v4bf16(<4 x bfloat> %v) { -; CHECK-LABEL: v4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma -; CHECK-NEXT: vfabs.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v4bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %r = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %v) ret <4 x bfloat> %r } define <8 x bfloat> @v8bf16(<8 x bfloat> %v) { -; CHECK-LABEL: v8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma -; CHECK-NEXT: vfabs.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v8bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %r = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %v) ret <8 x bfloat> %r } define <16 x bfloat> @v16bf16(<16 x bfloat> %v) { -; CHECK-LABEL: v16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma -; CHECK-NEXT: vfabs.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v16bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; 
ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %r = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> %v) ret <16 x bfloat> %r } define <32 x bfloat> @v32bf16(<32 x bfloat> %v) { -; CHECK-LABEL: v32bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma -; CHECK-NEXT: vfabs.v v8, v8 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: v32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: li a0, 32 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v32bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: li a1, 32 +; ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret %r = call <32 x bfloat> @llvm.fabs.v32bf16(<32 x bfloat> %v) ret <32 x bfloat> %r } + +define <64 x bfloat> @v64bf16(<64 x bfloat> %v) { +; ZVFBFA-LABEL: v64bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: li a0, 64 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: v64bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: li a1, 64 +; ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: ret + %r = call <64 x bfloat> @llvm.fabs.v64bf16(<64 x bfloat> %v) + ret <64 x bfloat> %r +} + +define <1 x half> @v1f16(<1 x half> %v) { +; CHECK-LABEL: v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: ret + %r = call <1 x half> @llvm.fabs.v1f16(<1 x half> %v) + ret <1 x half> %r +} + +define <2 x half> @v2f16(<2 x half> %v) { +; CHECK-LABEL: v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 2, 
e16, mf4, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: ret + %r = call <2 x half> @llvm.fabs.v2f16(<2 x half> %v) + ret <2 x half> %r +} + +define <8 x half> @v8f16(<8 x half> %v) { +; CHECK-LABEL: v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: ret + %r = call <8 x half> @llvm.fabs.v8f16(<8 x half> %v) + ret <8 x half> %r +} + +define <32 x half> @v32f16(<32 x half> %v) { +; CHECK-LABEL: v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: ret + %r = call <32 x half> @llvm.fabs.v32f16(<32 x half> %v) + ret <32 x half> %r +} + +define <64 x half> @v64f16(<64 x half> %v) { +; CHECK-LABEL: v64f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: ret + %r = call <64 x half> @llvm.fabs.v64f16(<64 x half> %v) + ret <64 x half> %r +} diff --git a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll index 8b8b897a32eda..c2cf7f9701e6d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll @@ -56,22 +56,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @fabs_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fabs_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s4, s0 -; CHECK-MVE-NEXT: vabs.f16 s0, s0 -; CHECK-MVE-NEXT: vabs.f16 s4, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s1 -; CHECK-MVE-NEXT: vabs.f16 s4, s4 -; CHECK-MVE-NEXT: vabs.f16 s1, s1 -; CHECK-MVE-NEXT: vins.f16 s1, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s2 -; CHECK-MVE-NEXT: vabs.f16 s4, s4 -; CHECK-MVE-NEXT: vabs.f16 s2, s2 -; CHECK-MVE-NEXT: vins.f16 
s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s3 -; CHECK-MVE-NEXT: vabs.f16 s4, s4 -; CHECK-MVE-NEXT: vabs.f16 s3, s3 -; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vbic.i16 q0, #0x8000 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fabs_float16_t: @@ -86,10 +71,7 @@ entry: define arm_aapcs_vfpcc <4 x float> @fabs_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fabs_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vabs.f32 s3, s3 -; CHECK-MVE-NEXT: vabs.f32 s2, s2 -; CHECK-MVE-NEXT: vabs.f32 s1, s1 -; CHECK-MVE-NEXT: vabs.f32 s0, s0 +; CHECK-MVE-NEXT: vbic.i32 q0, #0x80000000 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fabs_float32_t: @@ -104,21 +86,17 @@ entry: define arm_aapcs_vfpcc <2 x double> @fabs_float64_t(<2 x double> %src) { ; CHECK-LABEL: fabs_float64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr d2, .LCPI5_0 -; CHECK-NEXT: vmov r12, r3, d0 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov r0, r2, d1 -; CHECK-NEXT: lsrs r1, r1, #31 -; CHECK-NEXT: bfi r2, r1, #31, #1 -; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: vmov d1, r0, r2 -; CHECK-NEXT: vmov d0, r12, r3 +; CHECK-NEXT: adr r0, .LCPI5_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 0 @ double 0 -; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff entry: %0 = call nnan ninf nsz <2 x double> @llvm.fabs.v2f64(<2 x double> %src) ret <2 x double> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll index 3c35a29c0a84c..7b05414373e69 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll @@ -14,30 +14,24 @@ define arm_aapcs_vfpcc void @vabd_v4f32(<4 x float> %x, <4 x float> %y, ptr %z) ; CHECK-MVE-NEXT: 
vmov q4, q1 ; CHECK-MVE-NEXT: vmov q5, q0 ; CHECK-MVE-NEXT: mov r8, r0 -; CHECK-MVE-NEXT: vmov r0, r6, d10 -; CHECK-MVE-NEXT: vmov r1, r7, d8 +; CHECK-MVE-NEXT: vmov r0, r9, d11 +; CHECK-MVE-NEXT: vmov r1, r6, d9 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r9, r0 -; CHECK-MVE-NEXT: mov r0, r6 -; CHECK-MVE-NEXT: mov r1, r7 +; CHECK-MVE-NEXT: mov r7, r0 +; CHECK-MVE-NEXT: vmov r0, r4, d10 +; CHECK-MVE-NEXT: vmov r1, r5, d8 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r6, r0 -; CHECK-MVE-NEXT: vmov r0, r7, d11 -; CHECK-MVE-NEXT: vmov r1, r4, d9 +; CHECK-MVE-NEXT: vmov q4[2], q4[0], r0, r7 +; CHECK-MVE-NEXT: mov r0, r9 +; CHECK-MVE-NEXT: mov r1, r6 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: mov r0, r7 -; CHECK-MVE-NEXT: mov r1, r4 +; CHECK-MVE-NEXT: mov r6, r0 +; CHECK-MVE-NEXT: mov r0, r4 +; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: vmov s3, r0 -; CHECK-MVE-NEXT: bic r0, r5, #-2147483648 -; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: bic r0, r6, #-2147483648 -; CHECK-MVE-NEXT: vmov s1, r0 -; CHECK-MVE-NEXT: bic r0, r9, #-2147483648 -; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vstrw.32 q0, [r8] +; CHECK-MVE-NEXT: vmov q4[3], q4[1], r0, r6 +; CHECK-MVE-NEXT: vbic.i32 q4, #0x80000000 +; CHECK-MVE-NEXT: vstrw.32 q4, [r8] ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} ; CHECK-MVE-NEXT: add sp, #4 ; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} @@ -58,35 +52,32 @@ entry: define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-LABEL: vabd_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .save {r4, r5, r6, lr} -; CHECK-MVE-NEXT: push {r4, r5, r6, lr} +; CHECK-MVE-NEXT: .save {r4, r5, r7, lr} +; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: mov r4, r0 -; 
CHECK-MVE-NEXT: vmov.u16 r0, q1[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] ; CHECK-MVE-NEXT: vmov q5, q1 ; CHECK-MVE-NEXT: vmov q4, q0 ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q5[0] +; CHECK-MVE-NEXT: vmov.16 q6[0], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[1] ; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: mov r6, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] +; CHECK-MVE-NEXT: mov r5, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] ; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: mov r1, r6 +; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 -; CHECK-MVE-NEXT: bfc r5, #15, #17 -; CHECK-MVE-NEXT: vmov.16 q6[0], r0 +; CHECK-MVE-NEXT: vmov.16 q6[1], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[2] -; CHECK-MVE-NEXT: vmov.16 q6[1], r5 ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r5, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q4[2] @@ -94,7 +85,6 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[2], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[3] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -104,7 +94,6 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[3], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[4] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -114,7 +103,6 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; 
CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[4], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[5] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -124,7 +112,6 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[5], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[6] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -134,7 +121,6 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[6], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[7] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -144,11 +130,11 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[7], r0 +; CHECK-MVE-NEXT: vbic.i16 q6, #0x8000 ; CHECK-MVE-NEXT: vstrw.32 q6, [r4] ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-MVE-NEXT: pop {r4, r5, r6, pc} +; CHECK-MVE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-MVEFP-LABEL: vabd_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry From ea4e329808feac0290f76ae4d9b922e74916fb9f Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Mon, 11 May 2026 00:59:20 +0900 Subject: [PATCH 187/538] [llvm][RISCV] Optimize fcopysign for fixed vectors (#193802) vfsgnj is not available on zvfhmin or zvfbfmin, it's expected to expand to integer operations instead of unrolling to scalar operations. General expandFCOPYSIGN already handles that in most of cases except for fixed vector types that are not promotable, we need to find a better heuristic to gate this. 
--- .../SelectionDAG/LegalizeVectorOps.cpp | 24 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 205 ++++++------------ .../rvv/fixed-vectors-vcopysign-sdnode.ll | 197 ++++++++++++++--- llvm/test/CodeGen/Thumb2/mve-fmath.ll | 166 ++++---------- 4 files changed, 287 insertions(+), 305 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index c014494679130..e5484bf3676db 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -2140,10 +2140,26 @@ SDValue VectorLegalizer::ExpandFCOPYSIGN(SDNode *Node) { !TLI.isOperationLegalOrCustom(ISD::OR, IntVT)) return SDValue(); - // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. - if (!TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) && - !VT.isScalableVector()) - return SDValue(); + // Heuristic check to determine whether vector should be expanded to integer + // operations or unrolled to scalar operations. + // 1. Scalable vector is never unrolled. + // 2. Fixed vector is unrolled if one of followings is true: + // a. Vector only has 1 element and target knows how to handle scalar + // FCOPYSIGN(either legal or custom expand or promote). + // b. Vector has more than 1 element and target supports scalar + // FCOPYSIGN natively and vector length <= 5(2 AND + 1 OR + 2 CONST). + // FIXME: Scalar construction instruction count varies in every architecture, + // here we assume 1 instruction for now. 
+ if (VT.isFixedLengthVector()) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + if ((NumElts == 1 && + TLI.isOperationLegalOrCustomOrPromote(ISD::FCOPYSIGN, EltVT)) || + (NumElts < 6 && TLI.isOperationLegal(ISD::FCOPYSIGN, EltVT) && + TLI.isExtractVecEltCheap(VT, 0) && + (NumElts == 1 || TLI.isExtractVecEltCheap(VT, 1)))) + return SDValue(); + } SDLoc DL(Node); SDValue Mag = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 9ba15737ff641..04af537e2147d 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1860,167 +1860,84 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { } define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { -; CHECK-F16-LABEL: test_copysign( -; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b32 %r<6>; -; CHECK-F16-EMPTY: -; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-F16-NEXT: ret; -; -; CHECK-NOF16-LABEL: test_copysign( -; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<11>; -; CHECK-NOF16-NEXT: .reg .b32 %r<3>; -; CHECK-NOF16-EMPTY: -; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs5, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs1, -32768; -; CHECK-NOF16-NEXT: 
and.b16 %rs9, %rs4, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs7}; -; CHECK-NOF16-NEXT: ret; +; CHECK-LABEL: test_copysign( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r } define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { -; CHECK-F16-LABEL: test_copysign_f32( -; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b16 %rs<3>; -; CHECK-F16-NEXT: .reg .b32 %r<8>; -; CHECK-F16-EMPTY: -; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; -; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; -; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; -; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; -; CHECK-F16-NEXT: and.b32 %r5, %r4, -2147450880; -; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-F16-NEXT: ret; -; -; CHECK-NOF16-LABEL: test_copysign_f32( -; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; -; CHECK-NOF16-NEXT: .reg .b32 %r<6>; -; CHECK-NOF16-EMPTY: -; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; -; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; } -; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767; -; 
CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1; -; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; } -; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5}; -; CHECK-NOF16-NEXT: ret; +; CHECK-LABEL: test_copysign_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2; +; CHECK-NEXT: mov.b32 %r4, {%rs2, %rs1}; +; CHECK-NEXT: and.b32 %r5, %r4, -2147450880; +; CHECK-NEXT: and.b32 %r6, %r1, 2147450879; +; CHECK-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; %tb = fptrunc <2 x float> %b to <2 x half> %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) ret <2 x half> %r } define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { -; CHECK-F16-LABEL: test_copysign_f64( -; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b16 %rs<3>; -; CHECK-F16-NEXT: .reg .b32 %r<6>; -; CHECK-F16-NEXT: .reg .b64 %rd<3>; -; CHECK-F16-EMPTY: -; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; -; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs1, %rd2; -; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs2, %rd1; -; CHECK-F16-NEXT: mov.b32 %r2, {%rs2, %rs1}; -; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; -; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-F16-NEXT: ret; -; -; CHECK-NOF16-LABEL: test_copysign_f64( -; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; -; CHECK-NOF16-NEXT: 
.reg .b32 %r<2>; -; CHECK-NOF16-NEXT: .reg .b64 %rd<7>; -; CHECK-NOF16-EMPTY: -; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; -; CHECK-NOF16-NEXT: and.b64 %rd3, %rd2, -9223372036854775808; -; CHECK-NOF16-NEXT: shr.u64 %rd4, %rd3, 48; -; CHECK-NOF16-NEXT: cvt.u16.u64 %rs4, %rd4; -; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; -; CHECK-NOF16-NEXT: and.b64 %rd5, %rd1, -9223372036854775808; -; CHECK-NOF16-NEXT: shr.u64 %rd6, %rd5, 48; -; CHECK-NOF16-NEXT: cvt.u16.u64 %rs7, %rd6; -; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5}; -; CHECK-NOF16-NEXT: ret; +; CHECK-LABEL: test_copysign_f64( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; +; CHECK-NEXT: cvt.rn.f16.f64 %rs1, %rd2; +; CHECK-NEXT: cvt.rn.f16.f64 %rs2, %rd1; +; CHECK-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x half> %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) ret <2 x half> %r } define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { -; CHECK-F16-LABEL: test_copysign_extended( -; CHECK-F16: { -; CHECK-F16-NEXT: .reg .b16 %rs<3>; -; CHECK-F16-NEXT: .reg .b32 %r<8>; -; CHECK-F16-EMPTY: -; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.b32 %r2, 
[test_copysign_extended_param_1]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; -; CHECK-F16-NEXT: and.b32 %r3, %r2, -2147450880; -; CHECK-F16-NEXT: and.b32 %r4, %r1, 2147450879; -; CHECK-F16-NEXT: or.b32 %r5, %r4, %r3; -; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5; -; CHECK-F16-NEXT: cvt.f32.f16 %r6, %rs2; -; CHECK-F16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r7, %r6}; -; CHECK-F16-NEXT: ret; -; -; CHECK-NOF16-LABEL: test_copysign_extended( -; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .b16 %rs<11>; -; CHECK-NOF16-NEXT: .reg .b32 %r<5>; -; CHECK-NOF16-EMPTY: -; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs1, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs4, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs2, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs5, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7; -; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NOF16-NEXT: ret; +; CHECK-LABEL: test_copysign_extended( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; +; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r5; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r7, %r6}; +; CHECK-NEXT: ret; %r = call <2 
x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) %xr = fpext <2 x half> %r to <2 x float> ret <2 x float> %xr diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll index 9cfed6a659c64..0df4eabe89917 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll @@ -1,56 +1,195 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \ -; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \ -; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+experimental-zvfbfa \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+experimental-zvfbfa \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFMIN +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFBFMIN define <2 x bfloat> @copysign_v2bf16(<2 x bfloat> %vm, <2 x bfloat> %vs) { -; CHECK-LABEL: copysign_v2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v9 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: copysign_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: copysign_v2bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: 
vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vand.vx v9, v9, a0 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: vor.vv v8, v8, v9 +; ZVFBFMIN-NEXT: ret %r = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %vm, <2 x bfloat> %vs) ret <2 x bfloat> %r } define <4 x bfloat> @copysign_v4bf16(<4 x bfloat> %vm, <4 x bfloat> %vs) { -; CHECK-LABEL: copysign_v4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v9 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: copysign_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: copysign_v4bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFMIN-NEXT: vand.vx v9, v9, a0 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: vor.vv v8, v8, v9 +; ZVFBFMIN-NEXT: ret %r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %vm, <4 x bfloat> %vs) ret <4 x bfloat> %r } define <8 x bfloat> @copysign_v8bf16(<8 x bfloat> %vm, <8 x bfloat> %vs) { -; CHECK-LABEL: copysign_v8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v9 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: copysign_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: copysign_v8bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vand.vx v9, v9, a0 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: vor.vv v8, v8, v9 +; ZVFBFMIN-NEXT: ret %r = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %vm, <8 x bfloat> %vs) ret <8 x bfloat> %r } define <16 x bfloat> @copysign_v16bf16(<16 x bfloat> %vm, <16 x bfloat> %vs) 
{ -; CHECK-LABEL: copysign_v16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v10 -; CHECK-NEXT: ret +; ZVFBFA-LABEL: copysign_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: copysign_v16bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 8 +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vand.vx v10, v10, a0 +; ZVFBFMIN-NEXT: addi a0, a0, -1 +; ZVFBFMIN-NEXT: vand.vx v8, v8, a0 +; ZVFBFMIN-NEXT: vor.vv v8, v8, v10 +; ZVFBFMIN-NEXT: ret %r = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> %vm, <16 x bfloat> %vs) ret <16 x bfloat> %r } define <32 x bfloat> @copysign_v32bf32(<32 x bfloat> %vm, <32 x bfloat> %vs) { -; CHECK-LABEL: copysign_v32bf32: +; ZVFBFA-LABEL: copysign_v32bf32: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: li a0, 32 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: copysign_v32bf32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: li a0, 32 +; ZVFBFMIN-NEXT: lui a1, 8 +; ZVFBFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFMIN-NEXT: vand.vx v12, v12, a1 +; ZVFBFMIN-NEXT: addi a1, a1, -1 +; ZVFBFMIN-NEXT: vand.vx v8, v8, a1 +; ZVFBFMIN-NEXT: vor.vv v8, v8, v12 +; ZVFBFMIN-NEXT: ret + %r = call <32 x bfloat> @llvm.copysign.v32bf32(<32 x bfloat> %vm, <32 x bfloat> %vs) + ret <32 x bfloat> %r +} + +define <64 x bfloat> @copysign_v64bf16(<64 x bfloat> %vm, <64 x bfloat> %vs) { +; ZVFBFA-LABEL: copysign_v64bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: li a0, 64 +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16 +; ZVFBFA-NEXT: ret +; +; ZVFBFMIN-LABEL: copysign_v64bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: li a0, 64 +; ZVFBFMIN-NEXT: lui a1, 8 +; ZVFBFMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFMIN-NEXT: vand.vx v16, v16, a1 +; 
ZVFBFMIN-NEXT: addi a1, a1, -1 +; ZVFBFMIN-NEXT: vand.vx v8, v8, a1 +; ZVFBFMIN-NEXT: vor.vv v8, v8, v16 +; ZVFBFMIN-NEXT: ret + %r = call <64 x bfloat> @llvm.copysign.v64bf16(<64 x bfloat> %vm, <64 x bfloat> %vs) + ret <64 x bfloat> %r +} + +define <2 x half> @copysign_v2f16(<2 x half> %vm, <2 x half> %vs) { +; CHECK-LABEL: copysign_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <2 x half> @llvm.copysign.v2f16(<2 x half> %vm, <2 x half> %vs) + ret <2 x half> %r +} + +define <8 x half> @copysign_v8f16(<8 x half> %vm, <8 x half> %vs) { +; CHECK-LABEL: copysign_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %vm, <8 x half> %vs) + ret <8 x half> %r +} + +define <32 x half> @copysign_v32f16(<32 x half> %vm, <32 x half> %vs) { +; CHECK-LABEL: copysign_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v12 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vand.vx v12, v12, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: ret - %r = call <32 x bfloat> @llvm.copysign.v32bf32(<32 x bfloat> %vm, <32 x bfloat> %vs) - ret <32 x bfloat> %r + %r = call <32 x half> @llvm.copysign.v32f16(<32 x half> %vm, <32 x half> %vs) + ret <32 x half> %r +} + +define <64 x half> @copysign_v64f16(<64 x half> %vm, <64 x half> %vs) { +; CHECK-LABEL: copysign_v64f16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: lui a1, 8 +; 
CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vand.vx v16, v16, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: ret + %r = call <64 x half> @llvm.copysign.v64f16(<64 x half> %vm, <64 x half> %vs) + ret <64 x half> %r } diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index ad8921d2f7b02..7da8b746c0ce3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,FULLFP16 -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,MVEFP -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,FULLFP16 -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,MVEFP +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: @@ -1091,123 +1091,26 @@ entry: } define 
arm_aapcs_vfpcc <4 x float> @copysign_float32_t(<4 x float> %src1, <4 x float> %src2) { -; FULLFP16-LABEL: copysign_float32_t: -; FULLFP16: @ %bb.0: @ %entry -; FULLFP16-NEXT: .save {r4, r5, r7, lr} -; FULLFP16-NEXT: push {r4, r5, r7, lr} -; FULLFP16-NEXT: vmov r12, r1, d2 -; FULLFP16-NEXT: vmov r2, lr, d3 -; FULLFP16-NEXT: vmov r3, r0, d0 -; FULLFP16-NEXT: vmov r4, r5, d1 -; FULLFP16-NEXT: lsrs r1, r1, #31 -; FULLFP16-NEXT: bfi r0, r1, #31, #1 -; FULLFP16-NEXT: lsrs r1, r2, #31 -; FULLFP16-NEXT: bfi r4, r1, #31, #1 -; FULLFP16-NEXT: lsr.w r1, lr, #31 -; FULLFP16-NEXT: bfi r5, r1, #31, #1 -; FULLFP16-NEXT: lsr.w r1, r12, #31 -; FULLFP16-NEXT: bfi r3, r1, #31, #1 -; FULLFP16-NEXT: vmov s2, r4 -; FULLFP16-NEXT: vmov s3, r5 -; FULLFP16-NEXT: vmov s1, r0 -; FULLFP16-NEXT: vmov s0, r3 -; FULLFP16-NEXT: pop {r4, r5, r7, pc} -; -; MVEFP-LABEL: copysign_float32_t: -; MVEFP: @ %bb.0: @ %entry -; MVEFP-NEXT: vmov.i32 q2, #0x80000000 -; MVEFP-NEXT: vbic.i32 q0, #0x80000000 -; MVEFP-NEXT: vand q1, q1, q2 -; MVEFP-NEXT: vorr q0, q0, q1 -; MVEFP-NEXT: bx lr +; CHECK-LABEL: copysign_float32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x80000000 +; CHECK-NEXT: vbic.i32 q0, #0x80000000 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr entry: %0 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %src1, <4 x float> %src2) ret <4 x float> %0 } define arm_aapcs_vfpcc <8 x half> @copysign_float16_t(<8 x half> %src1, <8 x half> %src2) { -; FULLFP16-LABEL: copysign_float16_t: -; FULLFP16: @ %bb.0: @ %entry -; FULLFP16-NEXT: .pad #32 -; FULLFP16-NEXT: sub sp, #32 -; FULLFP16-NEXT: vmovx.f16 s8, s4 -; FULLFP16-NEXT: vstr.16 s8, [sp, #24] -; FULLFP16-NEXT: vstr.16 s4, [sp, #28] -; FULLFP16-NEXT: vmovx.f16 s4, s5 -; FULLFP16-NEXT: vstr.16 s4, [sp, #16] -; FULLFP16-NEXT: vmovx.f16 s4, s6 -; FULLFP16-NEXT: vstr.16 s5, [sp, #20] -; FULLFP16-NEXT: vstr.16 s4, [sp, #8] -; FULLFP16-NEXT: vmovx.f16 s4, s7 -; FULLFP16-NEXT: vstr.16 s6, [sp, #12] 
-; FULLFP16-NEXT: vstr.16 s4, [sp] -; FULLFP16-NEXT: vstr.16 s7, [sp, #4] -; FULLFP16-NEXT: ldrb.w r0, [sp, #25] -; FULLFP16-NEXT: vmovx.f16 s4, s0 -; FULLFP16-NEXT: vabs.f16 s4, s4 -; FULLFP16-NEXT: vneg.f16 s6, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s6, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #29] -; FULLFP16-NEXT: vabs.f16 s4, s0 -; FULLFP16-NEXT: vneg.f16 s0, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s0, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #17] -; FULLFP16-NEXT: vmovx.f16 s4, s1 -; FULLFP16-NEXT: vabs.f16 s4, s4 -; FULLFP16-NEXT: vins.f16 s0, s6 -; FULLFP16-NEXT: vneg.f16 s6, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s6, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #21] -; FULLFP16-NEXT: vabs.f16 s4, s1 -; FULLFP16-NEXT: vneg.f16 s1, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s1, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #9] -; FULLFP16-NEXT: vmovx.f16 s4, s2 -; FULLFP16-NEXT: vabs.f16 s4, s4 -; FULLFP16-NEXT: vins.f16 s1, s6 -; FULLFP16-NEXT: vneg.f16 s6, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s6, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #13] -; FULLFP16-NEXT: vabs.f16 s4, s2 -; FULLFP16-NEXT: vneg.f16 s2, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s2, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #1] -; FULLFP16-NEXT: vmovx.f16 s4, s3 -; FULLFP16-NEXT: vabs.f16 s4, s4 -; FULLFP16-NEXT: vins.f16 s2, s6 -; FULLFP16-NEXT: vneg.f16 s6, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s6, s4 -; FULLFP16-NEXT: ldrb.w r0, [sp, #5] -; FULLFP16-NEXT: vabs.f16 s4, s3 -; FULLFP16-NEXT: vneg.f16 s3, s4 -; FULLFP16-NEXT: lsls r0, r0, #24 -; FULLFP16-NEXT: it pl -; FULLFP16-NEXT: vmovpl.f32 s3, s4 -; FULLFP16-NEXT: vins.f16 s3, s6 -; FULLFP16-NEXT: add sp, #32 -; 
FULLFP16-NEXT: bx lr -; -; MVEFP-LABEL: copysign_float16_t: -; MVEFP: @ %bb.0: @ %entry -; MVEFP-NEXT: vmov.i16 q2, #0x8000 -; MVEFP-NEXT: vbic.i16 q0, #0x8000 -; MVEFP-NEXT: vand q1, q1, q2 -; MVEFP-NEXT: vorr q0, q0, q1 -; MVEFP-NEXT: bx lr +; CHECK-LABEL: copysign_float16_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q2, #0x8000 +; CHECK-NEXT: vbic.i16 q0, #0x8000 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr entry: %0 = call fast <8 x half> @llvm.copysign.v8f16(<8 x half> %src1, <8 x half> %src2) ret <8 x half> %0 @@ -1216,19 +1119,26 @@ entry: define arm_aapcs_vfpcc <2 x double> @copysign_float64_t(<2 x double> %src1, <2 x double> %src2) { ; CHECK-LABEL: copysign_float64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r0, lr, d2 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: vmov r12, r2, d0 -; CHECK-NEXT: lsrs r1, r1, #31 -; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: lsr.w r1, lr, #31 -; CHECK-NEXT: bfi r2, r1, #31, #1 -; CHECK-NEXT: vmov d1, r0, r3 -; CHECK-NEXT: vmov d0, r12, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adr r1, .LCPI32_1 +; CHECK-NEXT: adr r0, .LCPI32_0 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI32_0: +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 4294967295 @ 0xffffffff +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .LCPI32_1: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483648 @ 0x80000000 entry: %0 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %src1, <2 x double> %src2) ret <2 x double> %0 From 
dc2f9fe95f03c3c927eed3eecd33c376d43cec8d Mon Sep 17 00:00:00 2001 From: Pedro Lobo Date: Sun, 10 May 2026 17:19:59 +0100 Subject: [PATCH 188/538] [InstCombine] Fold constant byte stores to integer stores (#196740) Byte constants are equivalent to integer constants when stored to memory. Replacing them in store instructions reduces IR differences and enables existing optimizations over integer constants. --- .../InstCombineLoadStoreAlloca.cpp | 7 ++ llvm/test/Transforms/InstCombine/store.ll | 111 ++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0fe44d615acea..0e917fb55aa1e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1578,6 +1578,13 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { if (isa(Val)) return eraseInstFromFunction(SI); + // Replace byte constants with integer constants in stores. 
+ Constant *C; + if (Val->getType()->isByteOrByteVectorTy() && match(Val, m_ImmConstant(C))) + return replaceOperand( + SI, 0, + ConstantExpr::getBitCast(C, Type::getIntFromByteType(C->getType()))); + if (!NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace())) if (Value *V = simplifyNonNullOperand(Ptr, /*HasDereferenceable=*/true)) return replaceOperand(SI, 1, V); diff --git a/llvm/test/Transforms/InstCombine/store.ll b/llvm/test/Transforms/InstCombine/store.ll index 45f16d874c04e..e2fc0fdcd456a 100644 --- a/llvm/test/Transforms/InstCombine/store.ll +++ b/llvm/test/Transforms/InstCombine/store.ll @@ -399,6 +399,117 @@ define void @store_select_with_null_gep(i1 %cond, ptr %p, i64 %off) { ret void } +define void @store_const_b8(ptr %p) { +; CHECK-LABEL: @store_const_b8( +; CHECK-NEXT: store i8 1, ptr [[P:%.*]], align 1 +; CHECK-NEXT: ret void +; + store b8 1, ptr %p + ret void +} + +define void @store_const_v4b8_zero(ptr %p) { +; CHECK-LABEL: @store_const_v4b8_zero( +; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store <4 x b8> zeroinitializer, ptr %p + ret void +} + +define void @store_const_v4b8_splat(ptr %p) { +; CHECK-LABEL: @store_const_v4b8_splat( +; CHECK-NEXT: store <4 x i8> splat (i8 5), ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store <4 x b8> splat (b8 5), ptr %p + ret void +} + +define void @store_const_v4b8_mixed(ptr %p) { +; CHECK-LABEL: @store_const_v4b8_mixed( +; CHECK-NEXT: store <4 x i8> , ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store <4 x b8> , ptr %p + ret void +} + +define void @store_const_v4b8_data(ptr %p) { +; CHECK-LABEL: @store_const_v4b8_data( +; CHECK-NEXT: store <4 x i8> , ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store <4 x b8> , ptr %p + ret void +} + +define void @store_const_nxv4b8_splat(ptr %p) { +; CHECK-LABEL: @store_const_nxv4b8_splat( +; CHECK-NEXT: store splat (i8 5), ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store splat (b8 
5), ptr %p + ret void +} + +define void @store_const_nxv4b8_zero(ptr %p) { +; CHECK-LABEL: @store_const_nxv4b8_zero( +; CHECK-NEXT: store zeroinitializer, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store zeroinitializer, ptr %p + ret void +} + +define void @store_const_b8_atomic(ptr %p) { +; CHECK-LABEL: @store_const_b8_atomic( +; CHECK-NEXT: store atomic i8 42, ptr [[P:%.*]] unordered, align 1 +; CHECK-NEXT: ret void +; + store atomic b8 42, ptr %p unordered, align 1 + ret void +} + +define void @store_const_v4b8_atomic(ptr %p) { +; CHECK-LABEL: @store_const_v4b8_atomic( +; CHECK-NEXT: store atomic <4 x i8> splat (i8 5), ptr [[P:%.*]] unordered, align 4 +; CHECK-NEXT: ret void +; + store atomic <4 x b8> splat (b8 5), ptr %p unordered, align 4 + ret void +} + +define void @store_const_b5(ptr %p) { +; CHECK-LABEL: @store_const_b5( +; CHECK-NEXT: store i5 5, ptr [[P:%.*]], align 1 +; CHECK-NEXT: ret void +; + store b5 5, ptr %p, align 1 + ret void +} + +; Byte constants cannot represent pointer provenance. +; Make sure we do not fold the bitcast and change the store type. +define void @store_const_b64_from_ptr(ptr %p) { +; CHECK-LABEL: @store_const_b64_from_ptr( +; CHECK-NEXT: store b64 bitcast (ptr @Unknown to b64), ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; + store b64 bitcast (ptr @Unknown to b64), ptr %p + ret void +} + +; Only constants should be folded. 
+define void @store_nonconst_b8(b8 %v, ptr %p) { +; CHECK-LABEL: @store_nonconst_b8( +; CHECK-NEXT: store b8 [[V:%.*]], ptr [[P:%.*]], align 1 +; CHECK-NEXT: ret void +; + store b8 %v, ptr %p + ret void +} + !0 = !{!4, !4, i64 0} !1 = !{!"omnipotent char", !2} !2 = !{!"Simple C/C++ TBAA"} From 285ac8c1a33cae80357bdc3419e9633e700c1d06 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 10 May 2026 09:45:46 -0700 Subject: [PATCH 189/538] [libcxx] Switch to check-runtimes for generic-llvm-libc (#196780) --- libcxx/utils/ci/run-buildbot | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 59e4bfe5928b4..6376bc1d3dd5a 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -466,19 +466,7 @@ generic-llvm-libc) # Ensure we have the builtins archive built as we pass it in explicitly in # the test config. ninja -vC "${BUILD_DIR}" libclang_rt.builtins-x86_64.a - - # Manually run only libcxx/libcxxabi tests as we currently cannot build - # libunwind due to a missing dl_iterate_phdr implementation. - # TODO(boomanaiden154): Remove this once we can build libunwind and pass - # the tests. - step "Building libc++ test dependencies" - ninja -vC "${BUILD_DIR}" cxx-test-depends - - step "Running the libc++ tests" - ninja -vC "${BUILD_DIR}" check-cxx - - step "Running the libc++abi tests" - ninja -vC "${BUILD_DIR}" check-cxxabi + check-runtimes ;; # # Module builds From 2f4c387147f1617e242dd500960410728e5ec35e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 10 May 2026 10:26:34 -0700 Subject: [PATCH 190/538] Move KCFI type ID hash helpers out of LLVMSupport (#196784) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #167254 inappropriately introduced llvm/Support/Hash.{h,cpp} for the KCFI helpers. 
The name is misleading — it has nothing to do with the generic hashing facility in llvm/ADT/Hashing.h — and KCFI is a CodeGen/IR feature that does not belong in the foundational Support layer. Move the files to llvm/lib/Transforms/Utils/KCFIHash.cpp, alongside setKCFIType, which is the only existing KCFI helper in TransformUtils. Also relocate the deprecated pre-xxh3 xxHash64 implementation into KCFIHash.cpp, the sole user. clang/test/CodeGen/kcfi-generalize.c and kcfi-normalize.c are end-to-end regression tests for the xxHash64 output --- clang/include/clang/Basic/CodeGenOptions.h | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- llvm/include/llvm/Support/xxhash.h | 6 - .../Hash.h => Transforms/Utils/KCFIHash.h} | 10 +- llvm/lib/Support/CMakeLists.txt | 1 - llvm/lib/Support/Hash.cpp | 54 ------- llvm/lib/Support/xxhash.cpp | 75 --------- llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + llvm/lib/Transforms/Utils/KCFIHash.cpp | 142 ++++++++++++++++++ llvm/lib/Transforms/Utils/ModuleUtils.cpp | 2 +- llvm/unittests/Support/xxhashTest.cpp | 12 -- 11 files changed, 151 insertions(+), 156 deletions(-) rename llvm/include/llvm/{Support/Hash.h => Transforms/Utils/KCFIHash.h} (78%) delete mode 100644 llvm/lib/Support/Hash.cpp create mode 100644 llvm/lib/Transforms/Utils/KCFIHash.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index 3cbbf1a3074ac..e43112b4bb98b 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -22,10 +22,10 @@ #include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CodeGen.h" -#include "llvm/Support/Hash.h" #include "llvm/Support/Regex.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerOptions.h" +#include "llvm/Transforms/Utils/KCFIHash.h" #include #include #include diff --git a/clang/lib/CodeGen/CodeGenModule.cpp 
b/clang/lib/CodeGen/CodeGenModule.cpp index 106f1e63cd904..2d91b7eaa52dc 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -70,7 +70,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Hash.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/RISCVISAInfo.h" @@ -78,6 +77,7 @@ #include "llvm/TargetParser/X86TargetParser.h" #include "llvm/Transforms/Instrumentation/KCFI.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/KCFIHash.h" #include #include diff --git a/llvm/include/llvm/Support/xxhash.h b/llvm/include/llvm/Support/xxhash.h index 15c4f1bfd4563..b03aaa1e907b4 100644 --- a/llvm/include/llvm/Support/xxhash.h +++ b/llvm/include/llvm/Support/xxhash.h @@ -32,9 +32,6 @@ - xxHash source repository : https://github.com/Cyan4973/xxHash */ -/* based on revision d2df04efcbef7d7f6886d345861e5dfda4edacc1 Removed - * everything but a simple interface for computing XXh64. */ - #ifndef LLVM_SUPPORT_XXHASH_H #define LLVM_SUPPORT_XXHASH_H @@ -44,9 +41,6 @@ namespace llvm { -// Deprecated pre-xxh3 64-bit hash. -LLVM_ABI uint64_t xxHash64(const uint8_t *data, size_t len); - /// XXH3's 64-bit variant. Inline ArrayRef and StringRef overloads live in /// llvm/ADT/ArrayRef.h and llvm/ADT/StringRef.h. 
LLVM_ABI uint64_t xxh3_64bits(const uint8_t *data, size_t len); diff --git a/llvm/include/llvm/Support/Hash.h b/llvm/include/llvm/Transforms/Utils/KCFIHash.h similarity index 78% rename from llvm/include/llvm/Support/Hash.h rename to llvm/include/llvm/Transforms/Utils/KCFIHash.h index 7a2cfb8774ae8..553c24d1289aa 100644 --- a/llvm/include/llvm/Support/Hash.h +++ b/llvm/include/llvm/Transforms/Utils/KCFIHash.h @@ -1,4 +1,4 @@ -//===- llvm/Support/Hash.h - Hash functions --------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// // -// This file provides hash functions. +// Helpers for computing the 32-bit KCFI type ID from a mangled type name. // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_HASH_H -#define LLVM_SUPPORT_HASH_H +#ifndef LLVM_TRANSFORMS_UTILS_KCFIHASH_H +#define LLVM_TRANSFORMS_UTILS_KCFIHASH_H #include "llvm/ADT/StringRef.h" #include @@ -34,4 +34,4 @@ LLVM_ABI uint32_t getKCFITypeID(StringRef MangledTypeName, } // end namespace llvm -#endif // LLVM_SUPPORT_HASH_H +#endif // LLVM_TRANSFORMS_UTILS_KCFIHASH_H diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index e8d505f218b69..100cfb567c348 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -204,7 +204,6 @@ add_llvm_component_library(LLVMSupport FormatVariadic.cpp GlobPattern.cpp GraphWriter.cpp - Hash.cpp HexagonAttributeParser.cpp HexagonAttributes.cpp InitLLVM.cpp diff --git a/llvm/lib/Support/Hash.cpp b/llvm/lib/Support/Hash.cpp deleted file mode 100644 index 6b5d000ee27c9..0000000000000 --- a/llvm/lib/Support/Hash.cpp +++ /dev/null @@ -1,54 +0,0 @@ -//===- Hash.cpp - Hash 
functions ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements hash functions. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Hash.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/xxhash.h" - -using namespace llvm; - -KCFIHashAlgorithm llvm::parseKCFIHashAlgorithm(StringRef Name) { - if (Name == "FNV-1a") - return KCFIHashAlgorithm::FNV1a; - // Default to xxHash64 for backward compatibility - return KCFIHashAlgorithm::xxHash64; -} - -StringRef llvm::stringifyKCFIHashAlgorithm(KCFIHashAlgorithm Algorithm) { - switch (Algorithm) { - case KCFIHashAlgorithm::xxHash64: - return "xxHash64"; - case KCFIHashAlgorithm::FNV1a: - return "FNV-1a"; - } - llvm_unreachable("Unknown KCFI hash algorithm"); -} - -uint32_t llvm::getKCFITypeID(StringRef MangledTypeName, - KCFIHashAlgorithm Algorithm) { - switch (Algorithm) { - case KCFIHashAlgorithm::xxHash64: - // Use lower 32 bits of xxHash64 - return static_cast( - xxHash64(reinterpret_cast(MangledTypeName.data()), - MangledTypeName.size())); - case KCFIHashAlgorithm::FNV1a: - // FNV-1a hash (32-bit) - uint32_t Hash = 2166136261u; // FNV offset basis - for (unsigned char C : MangledTypeName) { - Hash ^= C; - Hash *= 16777619u; // FNV prime - } - return Hash; - } - llvm_unreachable("Unknown KCFI hash algorithm"); -} diff --git a/llvm/lib/Support/xxhash.cpp b/llvm/lib/Support/xxhash.cpp index 6997fed7e8336..a1aee00b82637 100644 --- a/llvm/lib/Support/xxhash.cpp +++ b/llvm/lib/Support/xxhash.cpp @@ -32,9 +32,6 @@ * - xxHash source repository : https://github.com/Cyan4973/xxHash */ -// xxhash64 is based on commit 
d2df04efcbef7d7f6886d345861e5dfda4edacc1. Removed -// everything but a simple interface for computing xxh64. - // xxh3_64bits is based on commit d5891596637d21366b9b1dcf2c0007a3edb26a9e (July // 2023). @@ -77,20 +74,6 @@ static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; -static uint64_t round(uint64_t Acc, uint64_t Input) { - Acc += Input * PRIME64_2; - Acc = rotl64(Acc, 31); - Acc *= PRIME64_1; - return Acc; -} - -static uint64_t mergeRound(uint64_t Acc, uint64_t Val) { - Val = round(0, Val); - Acc ^= Val; - Acc = Acc * PRIME64_1 + PRIME64_4; - return Acc; -} - static uint64_t XXH64_avalanche(uint64_t hash) { hash ^= hash >> 33; hash *= PRIME64_2; @@ -100,64 +83,6 @@ static uint64_t XXH64_avalanche(uint64_t hash) { return hash; } -uint64_t llvm::xxHash64(const uint8_t *P, size_t Len) { - uint64_t Seed = 0; - const uint8_t *const BEnd = P + Len; - uint64_t H64; - - if (Len >= 32) { - const unsigned char *const Limit = BEnd - 32; - uint64_t V1 = Seed + PRIME64_1 + PRIME64_2; - uint64_t V2 = Seed + PRIME64_2; - uint64_t V3 = Seed + 0; - uint64_t V4 = Seed - PRIME64_1; - - do { - V1 = round(V1, endian::read64le(P)); - P += 8; - V2 = round(V2, endian::read64le(P)); - P += 8; - V3 = round(V3, endian::read64le(P)); - P += 8; - V4 = round(V4, endian::read64le(P)); - P += 8; - } while (P <= Limit); - - H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18); - H64 = mergeRound(H64, V1); - H64 = mergeRound(H64, V2); - H64 = mergeRound(H64, V3); - H64 = mergeRound(H64, V4); - - } else { - H64 = Seed + PRIME64_5; - } - - H64 += (uint64_t)Len; - - while (reinterpret_cast(P) + 8 <= - reinterpret_cast(BEnd)) { - uint64_t const K1 = round(0, endian::read64le(P)); - H64 ^= K1; - H64 = rotl64(H64, 27) * PRIME64_1 + PRIME64_4; - P += 8; - } - - if (reinterpret_cast(P) + 4 <= reinterpret_cast(BEnd)) { - H64 ^= (uint64_t)(endian::read32le(P)) * 
PRIME64_1; - H64 = rotl64(H64, 23) * PRIME64_2 + PRIME64_3; - P += 4; - } - - while (P < BEnd) { - H64 ^= (*P) * PRIME64_5; - H64 = rotl64(H64, 11) * PRIME64_1; - P++; - } - - return XXH64_avalanche(H64); -} - constexpr size_t XXH3_SECRETSIZE_MIN = 136; constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192; diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 82e9edf674866..933e204081ad2 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -40,6 +40,7 @@ add_llvm_component_library(LLVMTransformUtils Instrumentation.cpp IntegerDivision.cpp IRNormalizer.cpp + KCFIHash.cpp LCSSA.cpp LibCallsShrinkWrap.cpp Local.cpp diff --git a/llvm/lib/Transforms/Utils/KCFIHash.cpp b/llvm/lib/Transforms/Utils/KCFIHash.cpp new file mode 100644 index 0000000000000..df65ed53b66b6 --- /dev/null +++ b/llvm/lib/Transforms/Utils/KCFIHash.cpp @@ -0,0 +1,142 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/KCFIHash.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; +using namespace support; + +// xxHash64 is a deprecated pre-xxh3 hash, retained here only as the default +// KCFI type-ID hash for ABI compatibility. 
+ +static uint64_t rotl64(uint64_t X, size_t R) { + return (X << R) | (X >> (64 - R)); +} + +constexpr uint64_t PRIME64_1 = 11400714785074694791ULL; +constexpr uint64_t PRIME64_2 = 14029467366897019727ULL; +constexpr uint64_t PRIME64_3 = 1609587929392839161ULL; +constexpr uint64_t PRIME64_4 = 9650029242287828579ULL; +constexpr uint64_t PRIME64_5 = 2870177450012600261ULL; + +static uint64_t round(uint64_t Acc, uint64_t Input) { + Acc += Input * PRIME64_2; + Acc = rotl64(Acc, 31); + Acc *= PRIME64_1; + return Acc; +} + +static uint64_t mergeRound(uint64_t Acc, uint64_t Val) { + Val = round(0, Val); + Acc ^= Val; + Acc = Acc * PRIME64_1 + PRIME64_4; + return Acc; +} + +static uint64_t avalanche(uint64_t H) { + H ^= H >> 33; + H *= PRIME64_2; + H ^= H >> 29; + H *= PRIME64_3; + H ^= H >> 32; + return H; +} + +static uint64_t xxHash64(const uint8_t *P, size_t Len) { + const uint8_t *const BEnd = P + Len; + uint64_t H64; + + if (Len >= 32) { + const uint8_t *const Limit = BEnd - 32; + uint64_t V1 = PRIME64_1 + PRIME64_2; + uint64_t V2 = PRIME64_2; + uint64_t V3 = 0; + uint64_t V4 = -PRIME64_1; + + do { + V1 = round(V1, endian::read64le(P)); + P += 8; + V2 = round(V2, endian::read64le(P)); + P += 8; + V3 = round(V3, endian::read64le(P)); + P += 8; + V4 = round(V4, endian::read64le(P)); + P += 8; + } while (P <= Limit); + + H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18); + H64 = mergeRound(H64, V1); + H64 = mergeRound(H64, V2); + H64 = mergeRound(H64, V3); + H64 = mergeRound(H64, V4); + } else { + H64 = PRIME64_5; + } + + H64 += (uint64_t)Len; + + while (reinterpret_cast(P) + 8 <= + reinterpret_cast(BEnd)) { + H64 ^= round(0, endian::read64le(P)); + H64 = rotl64(H64, 27) * PRIME64_1 + PRIME64_4; + P += 8; + } + + if (reinterpret_cast(P) + 4 <= reinterpret_cast(BEnd)) { + H64 ^= (uint64_t)endian::read32le(P) * PRIME64_1; + H64 = rotl64(H64, 23) * PRIME64_2 + PRIME64_3; + P += 4; + } + + while (P < BEnd) { + H64 ^= (*P) * PRIME64_5; + H64 = rotl64(H64, 
11) * PRIME64_1; + ++P; + } + + return avalanche(H64); +} + +KCFIHashAlgorithm llvm::parseKCFIHashAlgorithm(StringRef Name) { + if (Name == "FNV-1a") + return KCFIHashAlgorithm::FNV1a; + // Default to xxHash64 for backward compatibility + return KCFIHashAlgorithm::xxHash64; +} + +StringRef llvm::stringifyKCFIHashAlgorithm(KCFIHashAlgorithm Algorithm) { + switch (Algorithm) { + case KCFIHashAlgorithm::xxHash64: + return "xxHash64"; + case KCFIHashAlgorithm::FNV1a: + return "FNV-1a"; + } + llvm_unreachable("Unknown KCFI hash algorithm"); +} + +uint32_t llvm::getKCFITypeID(StringRef MangledTypeName, + KCFIHashAlgorithm Algorithm) { + switch (Algorithm) { + case KCFIHashAlgorithm::xxHash64: + // Use lower 32 bits of xxHash64 + return static_cast( + xxHash64(reinterpret_cast(MangledTypeName.data()), + MangledTypeName.size())); + case KCFIHashAlgorithm::FNV1a: + // FNV-1a hash (32-bit) + uint32_t Hash = 2166136261u; // FNV offset basis + for (unsigned char C : MangledTypeName) { + Hash ^= C; + Hash *= 16777619u; // FNV prime + } + return Hash; + } + llvm_unreachable("Unknown KCFI hash algorithm"); +} diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index 63a234960a0ad..2976ebf46c9b7 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -19,9 +19,9 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Hash.h" #include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/KCFIHash.h" using namespace llvm; diff --git a/llvm/unittests/Support/xxhashTest.cpp b/llvm/unittests/Support/xxhashTest.cpp index 9f91fc79e1f62..6097f0525e2b2 100644 --- a/llvm/unittests/Support/xxhashTest.cpp +++ b/llvm/unittests/Support/xxhashTest.cpp @@ -32,18 +32,6 @@ static void fillTestBuffer(uint8_t *buffer, size_t len) { } } -TEST(xxhashTest, Basic) { - EXPECT_EQ(0xef46db3751d8e999U, 
xxHash64(nullptr, 0)); - EXPECT_EQ(0x33bf00a859c4ba3fU, - xxHash64(reinterpret_cast("foo"), 3)); - EXPECT_EQ(0x48a37c90ad27a659U, - xxHash64(reinterpret_cast("bar"), 3)); - EXPECT_EQ(0x69196c1b3af0bff9U, - xxHash64(reinterpret_cast( - "0123456789abcdefghijklmnopqrstuvwxyz"), - 36)); -} - TEST(xxhashTest, xxh3) { constexpr size_t size = 2243; uint8_t a[size]; From 10f941727ffc81c9237499d259e1246dd70263ab Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 10 May 2026 10:38:05 -0700 Subject: [PATCH 191/538] [Coverage] Fix assertion failure when a -isystem header invokes a user macro (#195427) ``` // a.cc static void foo(int x) { switch (x) { #define GENERIC(n) case n: #include "types.def" // -isystem header invokes a user macro break; } } // sys/types.def #define MID(name) GENERIC(name) MID(0) MID(1) MID(2) ``` ``` $ clang -fprofile-instr-generate -fcoverage-mapping -isystem sys -c a.cc Assertion `SystemHeadersCoverage || !SM.isInSystemHeader(SM.getSpellingLoc(Loc))' failed. ``` Commit 702a2b627ff4 ("[Coverage] Rework !SystemHeadersCoverage") replaced the system-header skip in gatherFileIDs with this assertion, which trips as `SM.isInSystemHeader(SM.getSpellingLoc(Loc))` is false. This patch adds back the pre-#91446 condition but folds it with the macro-token remap `if` statement. Fixes #179316/#195422. Clang Opus 4.7 identified clang/lib/Parse/ParseExpr.cpp, created a minimal reproduce with cvise, and wrote the initial version of this CodeGen patch. (An earlier session papered over the bug by patching llvm-cov instead, which I abandoned). 
--- clang/lib/CodeGen/CoverageMappingGen.cpp | 27 +++++++----- .../CoverageMapping/system_macro_switch.cpp | 42 +++++++++++++++++++ 2 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 clang/test/CoverageMapping/system_macro_switch.cpp diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 803037d1874b3..eadb6e3bb25a8 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -386,24 +386,29 @@ class CoverageMappingBuilder { Region.setEndLoc(EndLoc.value()); } - // Replace Loc with FileLoc if it is expanded with system headers. - if (!SystemHeadersCoverage && SM.isInSystemMacro(Loc)) { - auto BeginLoc = SM.getSpellingLoc(Loc); - auto EndLoc = SM.getSpellingLoc(Region.getEndLoc()); - if (SM.isWrittenInSameFile(BeginLoc, EndLoc)) { - Loc = SM.getFileLoc(Loc); - Region.setStartLoc(Loc); - Region.setEndLoc(SM.getFileLoc(Region.getEndLoc())); + // For regions whose spelling is in a system header, remap macro + // tokens to their user-code call site so coverage is attributed to + // the user expression. Drop anything still in a system header + // (e.g. a plain FileID into a -isystem .def file). 
+ if (!SystemHeadersCoverage && + SM.isInSystemHeader(SM.getSpellingLoc(Loc))) { + if (Loc.isMacroID()) { + auto BeginLoc = SM.getSpellingLoc(Loc); + auto EndLoc = SM.getSpellingLoc(Region.getEndLoc()); + if (SM.isWrittenInSameFile(BeginLoc, EndLoc)) { + Loc = SM.getFileLoc(Loc); + Region.setStartLoc(Loc); + Region.setEndLoc(SM.getFileLoc(Region.getEndLoc())); + } } + if (SM.isInSystemHeader(SM.getSpellingLoc(Loc))) + continue; } FileID File = SM.getFileID(Loc); if (!Visited.insert(File).second) continue; - assert(SystemHeadersCoverage || - !SM.isInSystemHeader(SM.getSpellingLoc(Loc))); - unsigned Depth = 0; for (SourceLocation Parent = getIncludeOrExpansionLoc(Loc); Parent.isValid(); Parent = getIncludeOrExpansionLoc(Parent)) diff --git a/clang/test/CoverageMapping/system_macro_switch.cpp b/clang/test/CoverageMapping/system_macro_switch.cpp new file mode 100644 index 0000000000000..71f8ebfc2a777 --- /dev/null +++ b/clang/test/CoverageMapping/system_macro_switch.cpp @@ -0,0 +1,42 @@ +/// `case` labels generated by a user macro invoked from a -isystem header +/// must not crash the coverage mapping builder. 
+ +// RUN: rm -rf %t && split-file %s %t && cd %t +// RUN: %clang_cc1 -triple %itanium_abi_triple -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -isystem sys a.cpp | FileCheck %s + +// CHECK-LABEL: main: +// CHECK-NEXT: File 0, 9:29 -> 9:43 = #0 +// CHECK-LABEL: _ZL3fooi: +// CHECK-NEXT: File 0, 1:24 -> 7:2 = #0 +// CHECK-NEXT: Branch,File 0, 2:11 -> 2:12 = ((#2 + #3) + #4), (((#0 - #2) - #3) - #4) +// CHECK-NEXT: Gap,File 0, 2:14 -> 5:10 = 0 +// CHECK-NEXT: File 0, 4:21 -> 5:10 = ((#2 + #3) + #4) +// CHECK-NEXT: File 0, 4:21 -> 5:10 = (#2 + #3) +// CHECK-NEXT: File 0, 4:21 -> 5:10 = #2 +// CHECK-NEXT: File 1, 3:20 -> 3:27 = #2 +// CHECK-NEXT: Branch,File 1, 3:20 -> 3:26 = #2, (#0 - #2) +// CHECK-NEXT: File 2, 3:20 -> 3:27 = (#2 + #3) +// CHECK-NEXT: Branch,File 2, 3:20 -> 3:26 = #3, (#0 - #3) +// CHECK-NEXT: File 3, 3:20 -> 3:27 = ((#2 + #3) + #4) +// CHECK-NEXT: Branch,File 3, 3:20 -> 3:26 = #4, (#0 - #4) + +/// types.def must not leak into the file mapping. +// CHECK-NOT: Expansion, +// CHECK-NOT: File + +//--- a.cpp +static void foo(int x) { + switch (x) { +#define GENERIC(n) case n: +#include "types.def" + break; + } +} + +int main(int argc, char **) { foo(argc); } + +//--- sys/types.def +#define MID(name) GENERIC(name) +MID(0) +MID(1) +MID(2) From dd23e411f75396a037f5442b3f6a78f0fa392ca4 Mon Sep 17 00:00:00 2001 From: David Stone Date: Sun, 10 May 2026 12:16:24 -0600 Subject: [PATCH 192/538] [clang-tidy][NFC] Move `ClassifiedToken` to cpp file (#196820) `ClassifiedToken` is used in only the implementation of `UseTrailingReturnTypeCheck`. Move it into the unnamed namespace of the cpp file instead of it being in the header. 
--- .../modernize/UseTrailingReturnTypeCheck.cpp | 10 ++++++++++ .../clang-tidy/modernize/UseTrailingReturnTypeCheck.h | 6 ------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index ea43cd6c42222..8bc06afaeb33c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -205,6 +205,16 @@ static bool isSpecifier(Token T) { tok::kw_static, tok::kw_friend, tok::kw_virtual); } +namespace { + +struct ClassifiedToken { + Token T; + bool IsQualifier; + bool IsSpecifier; +}; + +} // namespace + static std::optional classifyToken(const FunctionDecl &F, Preprocessor &PP, Token Tok) { ClassifiedToken CT; diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h index 4aa1adaf30db5..05a8952f03700 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h @@ -14,12 +14,6 @@ namespace clang::tidy::modernize { -struct ClassifiedToken { - Token T; - bool IsQualifier; - bool IsSpecifier; -}; - /// Rewrites function signatures to use a trailing return type. /// /// For the user-facing documentation see: From 865465d6fd49a197c35dec5c78830abddc71a715 Mon Sep 17 00:00:00 2001 From: "forking-google-bazel-bot[bot]" <265904573+forking-google-bazel-bot[bot]@users.noreply.github.com> Date: Sun, 10 May 2026 11:20:28 -0700 Subject: [PATCH 193/538] [Bazel] Fixes 2f4c387 (#196822) This fixes 2f4c387147f1617e242dd500960410728e5ec35e. 
Co-authored-by: Google Bazel Bot --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 51404bce7aa5d..7b9d98531c151 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -778,6 +778,7 @@ cc_library( "//llvm:Support", "//llvm:Target", "//llvm:TargetParser", + "//llvm:TransformUtils", "//llvm:config", ], ) @@ -2334,6 +2335,7 @@ cc_library( "//llvm:Support", "//llvm:Target", "//llvm:TargetParser", + "//llvm:TransformUtils", "//llvm:config", ], ) From 93c1336ab0795f71ad5bb0f77ed78df7874f43f6 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 10 May 2026 14:34:20 -0400 Subject: [PATCH 194/538] [libc] Move a few -Wshadow warnings in __support/File (#196810) No behavior change. --- libc/src/__support/File/file.cpp | 10 +++++----- libc/src/__support/File/linux/file.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp index 31e6b7dab95a2..1499ab56578a9 100644 --- a/libc/src/__support/File/file.cpp +++ b/libc/src/__support/File/file.cpp @@ -170,7 +170,7 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) { FileIOResult result = platform_write(this, remainder.data(), remainder.size()); - size_t bytes_written = result.value; + bytes_written = result.value; // If less bytes were written than expected, then an error occurred. Return // the number of bytes that have been written from |data|. 
@@ -658,8 +658,8 @@ wint_t File::ungetwc_unlocked(wint_t wc) { break; } - char buf[4]; - auto result = internal::wcrtomb(buf, static_cast(wc), &mbstate); + char mb_buf[4]; + auto result = internal::wcrtomb(mb_buf, static_cast(wc), &mbstate); if (!result.has_value()) return WEOF; @@ -670,7 +670,7 @@ wint_t File::ungetwc_unlocked(wint_t wc) { return WEOF; for (size_t i = 0; i < n; ++i) - this->buf[i] = static_cast(buf[i]); + buf[i] = static_cast(mb_buf[i]); read_limit = n; pos = 0; @@ -679,7 +679,7 @@ wint_t File::ungetwc_unlocked(wint_t wc) { return WEOF; pos -= n; for (size_t i = 0; i < n; ++i) - this->buf[pos + i] = static_cast(buf[i]); + buf[pos + i] = static_cast(mb_buf[i]); } eof = false; err = false; diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp index 10a6f2a97dc41..2bef96a102a0c 100644 --- a/libc/src/__support/File/linux/file.cpp +++ b/libc/src/__support/File/linux/file.cpp @@ -169,7 +169,7 @@ ErrorOr create_file_from_fd(int fd, const char *mode) { } File::add_file(file); if (do_seek) { - auto result = file->seek(0, SEEK_END); + result = file->seek(0, SEEK_END); if (!result.has_value()) { File::remove_file(file); delete file; From 2893aa5d1f31d62a8f1c50e202d7924004792f2b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 10 May 2026 14:38:53 -0400 Subject: [PATCH 195/538] [libc][math] Fix -Wshadow warnings in cos.h (#196342) cos() does `using namespace range_reduction_double_internal;` and range_reduction_double_internal after 51e9430a0c767 contains using LIBC_NAMESPACE::fputil::DoubleDouble; using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>; So the local using statements for DoubleDouble and Float128 shadowed these. Just remove the local using statements. No behavior change. 
--- libc/src/__support/math/cos.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/libc/src/__support/math/cos.h b/libc/src/__support/math/cos.h index 1a7833978304a..e6a37c132cd80 100644 --- a/libc/src/__support/math/cos.h +++ b/libc/src/__support/math/cos.h @@ -32,7 +32,6 @@ namespace math { LIBC_INLINE constexpr double cos(double x) { using namespace range_reduction_double_internal; - using DoubleDouble = fputil::DoubleDouble; using FPBits = typename fputil::FPBits; FPBits xbits(x); @@ -122,7 +121,6 @@ LIBC_INLINE constexpr double cos(double x) { #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS return rr.hi + rr.lo; #else - using Float128 = typename fputil::DyadicFloat<128>; double rlp = rr.lo + err; double rlm = rr.lo - err; From 899663966c7e27dcd211b002d9eed6725b6d52ff Mon Sep 17 00:00:00 2001 From: Ahmad Yasin Date: Sun, 10 May 2026 23:23:45 +0300 Subject: [PATCH 196/538] [AArch64] New pass for code layout optimizations. (#184434) This pass is intended to optimize code layout prior to AsmPrinter. The initial version handles two known cases: I. FCMP-FCSEL II. CMP/CMN-CSEL, 32-bit only Using existing directives, the pass induces function-alignment (of 64-bytes by default) when a pair is detected, and possibly induces block-alignment of up to 4-bytes on top of that if the pair would straddle cache-lines. Beyond performance improvement, this pass reduces noise due to code layout thus stabilizes measured performance over-time. For example, knock-out effects on a "sensitive function" won't be triggered by codegen changes outside it. 
Enabled by default on processors with the new `FeatureAlignCmpCSelPairs` subtarget feature (gated per sub-case by `FeatureFuseCmpCSel` / `FeatureFuseFCmpFCSel`); each case can also be forced through the `-aarch64-code-layout-opt` enumerated bit-mask --------- Co-authored-by: Jon Roelofs rdar://171283264 --- llvm/lib/Target/AArch64/AArch64.h | 2 + .../Target/AArch64/AArch64CodeLayoutOpt.cpp | 262 +++++++++++++++++ llvm/lib/Target/AArch64/AArch64Features.td | 4 + llvm/lib/Target/AArch64/AArch64Processors.td | 6 + .../Target/AArch64/AArch64TargetMachine.cpp | 6 + llvm/lib/Target/AArch64/CMakeLists.txt | 1 + llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + llvm/test/CodeGen/AArch64/code-layout-opt.ll | 264 ++++++++++++++++++ 8 files changed, 546 insertions(+) create mode 100644 llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp create mode 100644 llvm/test/CodeGen/AArch64/code-layout-opt.ll diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index e56da6f9da5f5..a8fc28944af34 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -64,6 +64,7 @@ FunctionPass *createFalkorHWPFFixPass(); FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64PointerAuthPass(); FunctionPass *createAArch64BranchTargetsPass(); +FunctionPass *createAArch64CodeLayoutOptPass(); FunctionPass *createAArch64MIPeepholeOptLegacyPass(); FunctionPass *createAArch64PostCoalescerPass(); @@ -155,6 +156,7 @@ void initializeAArch64DeadRegisterDefinitionsLegacyPass(PassRegistry &); void initializeAArch64ExpandPseudoLegacyPass(PassRegistry &); void initializeAArch64LoadStoreOptLegacyPass(PassRegistry &); void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); +void initializeAArch64CodeLayoutOptPass(PassRegistry &); void initializeAArch64MIPeepholeOptLegacyPass(PassRegistry &); void initializeAArch64O0PreLegalizerCombinerLegacyPass(PassRegistry &); void initializeAArch64PostCoalescerLegacyPass(PassRegistry 
&); diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp new file mode 100644 index 0000000000000..92ebd8592f854 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp @@ -0,0 +1,262 @@ +//===-- AArch64CodeLayoutOpt.cpp - Code Layout Optimizations --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass runs after instruction scheduling and employs code layout +// optimizations for certain patterns. +// +// Option -aarch64-code-layout-opt-enable selects instruction pairs to optimize: +// cmp-csel: Enable CMP/CMN-CSEL code layout optimization +// fcmp-fcsel: Enable FCMP-FCSEL code layout optimization +// +// The initial implementation induces function alignment when a supported +// pattern is detected, and possibly instruction-alignment when a pair would +// straddle cache-lines. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-code-layout-opt" +#define DBG(...) 
LLVM_DEBUG(dbgs() << DEBUG_TYPE ": " << __VA_ARGS__) +#define AARCH64_CODE_LAYOUT_OPT_NAME "AArch64 Code Layout Optimization" + +enum CodeLayoutOpt { + CmpCsel, // Align CMP/CMN-CSEL pairs + FcmpFcsel, // Align FCMP-FCSEL pairs +}; + +static cl::bits EnableCodeAlignment( + "aarch64-code-layout-opt-enable", cl::Hidden, cl::CommaSeparated, + cl::desc("Enable code alignment optimization for instruction pairs"), + cl::values( + clEnumValN(CmpCsel, "cmp-csel", "CMP/CMN-CSEL pair alignment (32-bit)"), + clEnumValN(FcmpFcsel, "fcmp-fcsel", "FCMP-FCSEL pair alignment"))); + +static cl::opt FunctionAlignBytes( + "aarch64-code-layout-opt-align-functions", cl::Hidden, + cl::desc("Function alignment in bytes for code layout optimization " + "(must be a power of 2)"), + cl::init(64), cl::callback([](const unsigned &Val) { + if (!isPowerOf2_32(Val)) + report_fatal_error( + "aarch64-code-layout-opt-align must be a power of 2"); + })); + +STATISTIC(NumFunctionsAligned, + "Number of functions with aligned (to 64-bytes by default)"); +STATISTIC(NumCmpCselPairsDetected, + "Number of CMP/CMN-CSEL pairs detected for alignment"); +STATISTIC(NumFcmpFcselPairsDetected, + "Number of FCMP-FCSEL pairs detected for alignment"); + +namespace { + +class AArch64CodeLayoutOpt : public MachineFunctionPass { +public: + static char ID; + AArch64CodeLayoutOpt() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return AARCH64_CODE_LAYOUT_OPT_NAME; + } + +private: + const AArch64InstrInfo *TII = nullptr; + + /// Align each fusible CMP/CMN-CSEL or FCMP-FCSEL pair in \p MBB by emitting + /// .p2align before the lead instruction (splitting the block if needed). + /// \returns true iff at least one pair was found and aligned. + bool alignLayoutSensitivePatterns(MachineBasicBlock *MBB); + + /// Emit .p2align before MI. Splits the block if MI is not at its start. 
+ void emitP2Align(MachineInstr &MI, Align DesiredAlign, + unsigned MaxSkipBytes = 4); + + bool optimizeForCodeLayout(MachineFunction &MF); +}; + +} // end anonymous namespace + +char AArch64CodeLayoutOpt::ID = 0; + +INITIALIZE_PASS(AArch64CodeLayoutOpt, "aarch64-code-layout-opt", + AARCH64_CODE_LAYOUT_OPT_NAME, false, false) + +void AArch64CodeLayoutOpt::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +FunctionPass *llvm::createAArch64CodeLayoutOptPass() { + return new AArch64CodeLayoutOpt(); +} + +/// \returns true iff Opc is a floating-point comparison (FCMP/FCMPE). +static bool isFloatingPointCompare(unsigned Opc) { + switch (Opc) { + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: + case AArch64::FCMPHrr: + case AArch64::FCMPEHrr: + return true; + default: + return false; + } +} + +/// \returns true iff Opc is a floating-point conditional select (FCSEL). +static bool isFloatingPointConditionalSelect(unsigned Opc) { + switch (Opc) { + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: + case AArch64::FCSELHrrr: + return true; + default: + return false; + } +} + +/// \returns true if MI is a qualifying 32-bit CMP or CMN instruction. +/// CMP is encoded as SUBS with WZR destination, CMN as ADDS with WZR. +/// Only simple variants (no shifted/extended reg) qualify, and immediate +/// variants require no LSL shift and small immediates (<=15). 
+static bool isQualifyingIntCompare(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::SUBSWrr: + case AArch64::ADDSWrr: + return MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr); + case AArch64::SUBSWri: + case AArch64::ADDSWri: + return MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) && + MI.getOperand(3).getImm() == 0 && MI.getOperand(2).getImm() <= 15; + case AArch64::SUBSWrs: + case AArch64::ADDSWrs: + return MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) && + !AArch64InstrInfo::hasShiftedReg(MI); + case AArch64::SUBSWrx: + return MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) && + !AArch64InstrInfo::hasExtendedReg(MI); + default: + return false; + } +} + +bool AArch64CodeLayoutOpt::runOnMachineFunction(MachineFunction &MF) { + const Function &F = MF.getFunction(); + // hasOptSize() returns true for both -Os and -Oz. + if (F.hasOptSize()) + return false; + + const auto *Subtarget = &MF.getSubtarget(); + TII = Subtarget->getInstrInfo(); + + // Default: enable when the subtarget opts in via FeatureAlignCmpCSelPairs. 
+ if (!EnableCodeAlignment.getBits() && Subtarget->hasAlignCmpCSelPairs()) { + if (Subtarget->hasFuseCmpCSel()) + EnableCodeAlignment.addValue(CmpCsel); + if (Subtarget->hasFuseFCmpFCSel()) + EnableCodeAlignment.addValue(FcmpFcsel); + } + + if (!(EnableCodeAlignment.isSet(CmpCsel) && Subtarget->hasFuseCmpCSel()) && + !(EnableCodeAlignment.isSet(FcmpFcsel) && Subtarget->hasFuseFCmpFCSel())) + return false; + + return optimizeForCodeLayout(MF); +} + +void AArch64CodeLayoutOpt::emitP2Align(MachineInstr &MI, Align DesiredAlign, + unsigned MaxSkipBytes) { + MachineBasicBlock *MBB = MI.getParent(); + + auto FirstReal = + skipDebugInstructionsForward(MBB->instr_begin(), MBB->instr_end()); + if (&*FirstReal != &MI) { + auto PrevIt = prev_nodbg(MI.getIterator(), MBB->instr_begin()); + MBB = MBB->splitAt(*PrevIt, /*UpdateLiveIns=*/true); + } + + MBB->setAlignment(DesiredAlign); + MBB->setMaxBytesForAlignment(MaxSkipBytes); +} + +// Align each fusible CMP/CMN-CSEL or FCMP-FCSEL pair in MBB by emitting +// .p2align before the lead instruction (splitting the block if needed). +// A pair is: a qualifying lead instruction immediately followed by its +// consumer (CMP/CMN→CSEL or FCMP→FCSEL), with no intervening instructions. +// Returns true iff at least one pair was found and aligned. 
+bool AArch64CodeLayoutOpt::alignLayoutSensitivePatterns( + MachineBasicBlock *MBB) { + auto End = MBB->instr_end(); + SmallVector, 4> Pairs; + + for (auto &MI : instructionsWithoutDebug(MBB->begin(), MBB->end())) { + auto NextIt = + skipDebugInstructionsForward(std::next(MI.getIterator()), End); + if (NextIt == End) + break; + + // --- CMP/CMN-CSEL detection --- + if (EnableCodeAlignment.isSet(CmpCsel) && isQualifyingIntCompare(MI) && + NextIt->getOpcode() == AArch64::CSELWr) { + Pairs.push_back({&MI, true}); + continue; + } + + // --- FCMP-FCSEL detection --- + if (EnableCodeAlignment.isSet(FcmpFcsel) && + isFloatingPointCompare(MI.getOpcode()) && + isFloatingPointConditionalSelect(NextIt->getOpcode())) { + Pairs.push_back({&MI, false}); + continue; + } + } + + for (auto &[MI, IsCmpCsel] : Pairs) { + emitP2Align(*MI, Align(64)); + DBG(".p2align 6, , 4 before " << *MI); + ++(IsCmpCsel ? NumCmpCselPairsDetected : NumFcmpFcselPairsDetected); + } + + return !Pairs.empty(); +} + +bool AArch64CodeLayoutOpt::optimizeForCodeLayout(MachineFunction &MF) { + DBG("optimizeForCodeLayout: " << MF.getName() << "\n"); + + bool Changed = false; + for (auto &MBB : MF) + Changed |= alignLayoutSensitivePatterns(&MBB); + + if (!Changed) + return false; + + if (MF.getAlignment() < Align(FunctionAlignBytes)) { + MF.setAlignment(Align(FunctionAlignBytes)); + ++NumFunctionsAligned; + DBG("Set " << FunctionAlignBytes << "-byte alignment for function " + << MF.getName() << "\n"); + } else { + DBG("Function " << MF.getName() << " already has sufficient alignment\n"); + } + return true; +} diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 8cd2278a05747..4f1a1810cffe7 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -793,6 +793,10 @@ def FeatureFuseFCmpFCSel : SubtargetFeature< "fuse-fcsel", "HasFuseFCmpFCSel", "true", "CPU can fuse FCMP and FCSEL operations">; +def 
FeatureAlignCmpCSelPairs : SubtargetFeature< + "align-cmp-csel-pairs", "HasAlignCmpCSelPairs", "true", + "Align certain CMP/FCMP and CSEL/FCSEL instruction pairs">; + def FeatureFuseCryptoEOR : SubtargetFeature< "fuse-crypto-eor", "HasFuseCryptoEOR", "true", "CPU fuses AES/PMULL and EOR operations">; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 06ccdb5cbee0b..b33ffdafbf2cc 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -472,6 +472,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", "Apple A14", [ FeatureAggressiveFMA, + FeatureAlignCmpCSelPairs, FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, @@ -495,6 +496,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", "Apple A15", [ FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, @@ -518,6 +520,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", "Apple A16", [ FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, @@ -542,6 +545,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", "Apple A17", [ FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, @@ -566,6 +570,7 @@ def TuneAppleA17 : 
SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", "Apple M4", [ FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, @@ -589,6 +594,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5", "Apple M5", [ FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 1ef404b320c31..67a5ab0d6c030 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -241,6 +241,7 @@ LLVMInitializeAArch64Target() { initializeGlobalISel(PR); initializeAArch64A53Fix835769LegacyPass(PR); initializeAArch64A57FPLoadBalancingLegacyPass(PR); + initializeAArch64CodeLayoutOptPass(PR); initializeAArch64AdvSIMDScalarLegacyPass(PR); initializeAArch64AsmPrinterPass(PR); initializeAArch64BranchTargetsLegacyPass(PR); @@ -921,6 +922,11 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() != CodeGenOptLevel::None && EnableCollectLOH && TM->getTargetTriple().isOSBinFormatMachO()) addPass(createAArch64CollectLOHPass()); + + // Apply code layout optimizations. Run late so detection reflects the + // final MI stream. 
+ if (getOptLevel() != CodeGenOptLevel::None) + addPass(createAArch64CodeLayoutOptPass()); } void AArch64PassConfig::addPostBBSections() { diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 80848845c2c24..7be0ca38ef4f3 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -42,6 +42,7 @@ add_llvm_target(AArch64CodeGen GISel/AArch64PostSelectOptimize.cpp GISel/AArch64RegisterBankInfo.cpp AArch64A57FPLoadBalancing.cpp + AArch64CodeLayoutOpt.cpp AArch64AdvSIMDScalarPass.cpp AArch64Arm64ECCallLowering.cpp AArch64AsmPrinter.cpp diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 451b79bfa42eb..1a0ffe234a236 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -237,6 +237,7 @@ ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: AArch64 Redundant Conditional Branch Elimination ; CHECK-NEXT: Workaround A53 erratum 835769 pass +; CHECK-NEXT: AArch64 Code Layout Optimization ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses ; CHECK-NEXT: StackMap Liveness Analysis diff --git a/llvm/test/CodeGen/AArch64/code-layout-opt.ll b/llvm/test/CodeGen/AArch64/code-layout-opt.ll new file mode 100644 index 0000000000000..adf5b05bd5305 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/code-layout-opt.ll @@ -0,0 +1,264 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; NOTE: Test cases for FCMP-FCSEL and CMP/CMN-CSEL code layout optimization +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-apple-darwin -mcpu=apple-m4 -aarch64-code-layout-opt-enable=fcmp-fcsel,cmp-csel | FileCheck %s +; Default for -mcpu=apple-m4 enables both fcmp-fcsel and cmp-csel; expect identical output. 
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-apple-darwin -mcpu=apple-m4 | FileCheck %s + +; Test coverage for optimizeForCodeLayout function: +; * Basic FCMP-FCSEL instruction pair detection and function alignment (single/double precision) +; * Multiple FCMP-FCSEL pairs in same function (also tests different predicates) +; * Mixed single and double precision in same function +; * FCMP with immediate operand (#0.0) is excluded from optimization +; * Instruction pairs with function calls +; * Negative tests (no false positives) +; * Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment +; * CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded + +; * Basic single-precision FCMP-FCSEL instruction pair +; CHECK: .globl _test_basic_fcmp_fcsel_single +; CHECK-NEXT: .p2align 6 +define float @test_basic_fcmp_fcsel_single(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: test_basic_fcmp_fcsel_single: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, eq +; CHECK-NEXT: ret +entry: + %cmp = fcmp oeq float %a, %b + %sel = select i1 %cmp, float %c, float %d + ret float %sel +} + +; * Basic double-precision FCMP-FCSEL instruction pair +; CHECK: .globl _test_basic_fcmp_fcsel_double +; CHECK-NEXT: .p2align 6 +define double @test_basic_fcmp_fcsel_double(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: test_basic_fcmp_fcsel_double: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: fcsel d0, d2, d3, eq +; CHECK-NEXT: ret +entry: + %cmp = fcmp oeq double %a, %b + %sel = select i1 %cmp, double %c, double %d + ret double %sel +} + +; * Multiple FCMP-FCSEL instruction pairs in same function +; CHECK: .globl _test_multiple_patterns +; CHECK-NEXT: .p2align 6 +define float @test_multiple_patterns(float %a, float %b, float %c, float %d, float %e, float %f) { +; CHECK-LABEL: test_multiple_patterns: +; CHECK: 
.p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, eq +; CHECK-NEXT: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: fcmp s0, s4 +; CHECK-NEXT: fcsel s0, s0, s5, gt +; CHECK-NEXT: ret +entry: + %cmp1 = fcmp oeq float %a, %b + %sel1 = select i1 %cmp1, float %c, float %d + %cmp2 = fcmp ogt float %sel1, %e + %sel2 = select i1 %cmp2, float %sel1, float %f + ret float %sel2 +} + +; * FCMP with comparison to zero (immediate) - excluded from optimization +; FCMP #0.0 uses the ri-form opcode which is not in the detection list +; CHECK: .globl _test_fcmp_immediate +; CHECK-NEXT: .p2align 2 +define float @test_fcmp_immediate(float %a, float %b) { +; CHECK-LABEL: test_fcmp_immediate: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcsel s0, s0, s1, eq +; CHECK-NEXT: ret +entry: + %cmp = fcmp oeq float %a, 0.0 + %sel = select i1 %cmp, float %a, float %b + ret float %sel +} + +; * Mixed single and double precision in same function +; CHECK: .globl _test_mixed_precision +; CHECK-NEXT: .p2align 6 +define float @test_mixed_precision(float %a, float %b, double %c, double %d) { +; CHECK-LABEL: test_mixed_precision: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s0, s1, gt +; CHECK-NEXT: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: fcmp d2, d3 +; CHECK-NEXT: fcsel d1, d2, d3, mi +; CHECK-NEXT: fcvt s1, d1 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret +entry: + %cmp_single = fcmp ogt float %a, %b + %sel_single = select i1 %cmp_single, float %a, float %b + %cmp_double = fcmp olt double %c, %d + %sel_double = select i1 %cmp_double, double %c, double %d + %trunc = fptrunc double %sel_double to float + %final = fadd float %sel_single, %trunc + ret float %final +} + +; * FCMP-FCSEL instruction pair with a function call present +; CHECK: .globl _test_with_function_calls +; CHECK-NEXT: .p2align 6 +declare 
float @external_func(float) +define float @test_with_function_calls(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: test_with_function_calls: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, gt +; CHECK-NEXT: bl _external_func +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %cmp = fcmp ogt float %a, %b + %sel = select i1 %cmp, float %c, float %d + %result = call float @external_func(float %sel) + ret float %result +} + +; * Verify no false positives - FCMP without FCSEL +; CHECK: .globl _test_fcmp_without_fcsel +; CHECK-NEXT: .p2align 2 +define i32 @test_fcmp_without_fcsel(float %a, float %b) { +; CHECK-LABEL: test_fcmp_without_fcsel: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, gt +; CHECK-NEXT: ret +entry: + %cmp = fcmp ogt float %a, %b + %result = zext i1 %cmp to i32 + ret i32 %result +} + +; * Verify no false positives - FCSEL without preceding FCMP +; CHECK: .globl _test_fcsel_without_fcmp +; CHECK-NEXT: .p2align 2 +define float @test_fcsel_without_fcmp(i1 %cond, float %a, float %b) { +; CHECK-LABEL: test_fcsel_without_fcmp: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: ret +entry: + %result = select i1 %cond, float %a, float %b + ret float %result +} + +;------------------------------------------------------------------------------ +; CMP/CMN-CSEL tests (cmp-csel flag of -aarch64-code-layout-opt-enable) +;------------------------------------------------------------------------------ + +; * Basic CMP-CSEL instruction pair (integer register comparison) +; CHECK: .globl _test_basic_cmp_csel +; CHECK-NEXT: .p2align 6 +define i32 
@test_basic_cmp_csel(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test_basic_cmp_csel: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: csel w0, w2, w3, eq +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %a, %b + %sel = select i1 %cmp, i32 %c, i32 %d + ret i32 %sel +} + +; * CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization) +; CHECK: .globl _test_cmp_small_imm_csel +; CHECK-NEXT: .p2align 6 +define i32 @test_cmp_small_imm_csel(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_cmp_small_imm_csel: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: cmp w0, #7 +; CHECK-NEXT: csel w0, w1, w2, eq +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %a, 7 + %sel = select i1 %cmp, i32 %b, i32 %c + ret i32 %sel +} + +; * CMP-CSEL with immediate > 15 - excluded from optimization +; CHECK: .globl _test_cmp_large_imm_csel +; CHECK-NEXT: .p2align 2 +define i32 @test_cmp_large_imm_csel(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_cmp_large_imm_csel: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: cmp w0, #100 +; CHECK-NEXT: csel w0, w1, w2, eq +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %a, 100 + %sel = select i1 %cmp, i32 %b, i32 %c + ret i32 %sel +} + +; * Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination) +; CHECK: .globl _test_basic_cmn_csel +; CHECK-NEXT: .p2align 6 +define i32 @test_basic_cmn_csel(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test_basic_cmn_csel: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: cmn w0, w1 +; CHECK-NEXT: csel w0, w2, w3, eq +; CHECK-NEXT: ret +entry: + %sum = add i32 %a, %b + %cmp = icmp eq i32 %sum, 0 + %sel = select i1 %cmp, i32 %c, i32 %d + ret i32 %sel +} + +; * CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies) +; CHECK: .globl _test_cmn_small_imm_csel +; CHECK-NEXT: .p2align 6 +define i32 @test_cmn_small_imm_csel(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: 
test_cmn_small_imm_csel: +; CHECK: .p2align 6, , 4 +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: cmn w0, #7 +; CHECK-NEXT: csel w0, w1, w2, eq +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %a, -7 + %sel = select i1 %cmp, i32 %b, i32 %c + ret i32 %sel +} + +; * CMP without CSEL - no false positive +; CHECK: .globl _test_cmp_without_csel +; CHECK-NEXT: .p2align 2 +define i32 @test_cmp_without_csel(i32 %a, i32 %b) { +; CHECK-LABEL: test_cmp_without_csel: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %a, %b + %result = zext i1 %cmp to i32 + ret i32 %result +} From a8868e6515acc2e1f1088bbd28c801c540da2f12 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Sun, 10 May 2026 21:24:24 +0100 Subject: [PATCH 197/538] [mlir][spirv] Remove stale NV CooperativeMatrix attributes (#196639) Since the support for NV CooperativeMatrix has been removed a while back, those attributes can be safely removed. --- .../mlir/Dialect/SPIRV/IR/SPIRVAttributes.td | 22 ----------- .../test/Dialect/SPIRV/IR/target-and-abi.mlir | 39 ------------------- 2 files changed, 61 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td index 1bc3c63646fdd..5f5c273b12dec 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td @@ -79,24 +79,6 @@ def SPIRV_CooperativeMatrixPropertiesKHRArrayAttr : TypedArrayAttrBase; -// Description of cooperative matrix operations supported on the -// target. Represents `VkCooperativeMatrixPropertiesNV`. 
See -// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkCooperativeMatrixPropertiesNV.html -def SPIRV_CooperativeMatrixPropertiesNVAttr : - SPIRV_Attr<"CooperativeMatrixPropertiesNV", "coop_matrix_props_nv"> { - let parameters = (ins - "int":$m_size, - "int":$n_size, - "int":$k_size, - "mlir::Type":$a_type, - "mlir::Type":$b_type, - "mlir::Type":$c_type, - "mlir::Type":$result_type, - "mlir::spirv::ScopeAttr":$scope - ); - let assemblyFormat = "`<` struct(params) `>`"; -} - def SPIRV_CacheControlLoadINTELAttr : SPIRV_Attr<"CacheControlLoadINTEL", "cache_control_load_intel"> { let parameters = (ins "unsigned":$cache_level, @@ -111,10 +93,6 @@ def SPIRV_CacheControlStoreINTELAttr : let assemblyFormat = "`<` struct(params) `>`"; } -def SPIRV_CooperativeMatrixPropertiesNVArrayAttr : - TypedArrayAttrBase; - // This attribute specifies the limits for various resources on the target // architecture. // diff --git a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir index e634186d3b9a8..3409458ac7a0e 100644 --- a/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir +++ b/mlir/test/Dialect/SPIRV/IR/target-and-abi.mlir @@ -257,45 +257,6 @@ func.func @target_env_cooperative_matrix_khr() attributes{ // ----- -func.func @target_env_cooperative_matrix_nv() attributes{ - // CHECK: spirv.target_env = #spirv.target_env< - // CHECK-SAME: SPV_NV_cooperative_matrix - // CHECK-SAME: #spirv.coop_matrix_props_nv< - // CHECK-SAME: m_size = 8, n_size = 8, k_size = 32, - // CHECK-SAME: a_type = i8, b_type = i8, c_type = i32, - // CHECK-SAME: result_type = i32, scope = > - // CHECK-SAME: #spirv.coop_matrix_props_nv< - // CHECK-SAME: m_size = 8, n_size = 8, k_size = 16, - // CHECK-SAME: a_type = f16, b_type = f16, c_type = f16, - // CHECK-SAME: result_type = f16, scope = > - spirv.target_env = #spirv.target_env< - #spirv.vce, - #spirv.resource_limits< - cooperative_matrix_properties_nv = [#spirv.coop_matrix_props_nv< - m_size = 8, 
- n_size = 8, - k_size = 32, - a_type = i8, - b_type = i8, - c_type = i32, - result_type = i32, - scope = #spirv.scope - >, #spirv.coop_matrix_props_nv< - m_size = 8, - n_size = 8, - k_size = 16, - a_type = f16, - b_type = f16, - c_type = f16, - result_type = f16, - scope = #spirv.scope - >] - >> -} { return } - -// ----- - //===----------------------------------------------------------------------===// // spirv.vce //===----------------------------------------------------------------------===// From a85e494648b43ecacc5fae44d170c70b941bf781 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Sun, 10 May 2026 21:25:00 +0100 Subject: [PATCH 198/538] [mlir][spirv] Enforce execution scope for group operations in ODS (#196644) This adds a new class `SPIRV_ExecutionScopeAttrIs` shared between group and non-uniform group operations. Assisted-by: Codex --- .../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 11 ++ .../mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td | 53 +++++++-- .../Dialect/SPIRV/IR/SPIRVNonUniformOps.td | 95 ++++++++++------ mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp | 106 ------------------ mlir/test/Dialect/SPIRV/IR/group-ops.mlir | 3 +- .../Dialect/SPIRV/IR/non-uniform-ops.mlir | 34 +++--- 6 files changed, 132 insertions(+), 170 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 3bae0fc5a1acc..742f08137f3be 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -5055,4 +5055,15 @@ def SPIRV_TosaExtRoundingModeAttr : SPIRV_I32EnumAttr< I32EnumAttrCase<"DoubleRound", 3>, ]>; +//===----------------------------------------------------------------------===// +// SPIR-V Common Constraints. 
+//===----------------------------------------------------------------------===// + +class SPIRV_ExecutionScopeAttrIs values> : PredOpTrait< + operand # " must be '" # !interleave(values, "' or '") # "'", + CPred<"::llvm::is_contained({::mlir::spirv::Scope::" # !interleave(values, ", ::mlir::spirv::Scope::") # + "}, ::llvm::cast<::mlir::spirv::ScopeAttr>(getProperties()." # operand # + ").getValue())"> +>; + #endif // MLIR_DIALECT_SPIRV_IR_BASE diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td index 400e37432f388..047686f781bcb 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td @@ -18,7 +18,8 @@ // ----- def SPIRV_GroupFMulKHROp : SPIRV_KhrVendorOp<"GroupFMul", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ A floating-point multiplication group operation specified for all values of 'X' specified by invocations in the group. @@ -67,13 +68,16 @@ def SPIRV_GroupFMulKHROp : SPIRV_KhrVendorOp<"GroupFMul", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupBroadcastOp : SPIRV_Op<"GroupBroadcast", [Pure, - AllTypesMatch<["value", "result"]>]> { + AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ Broadcast the Value of the invocation identified by the local id LocalId to the result of all invocations in the group. 
@@ -135,7 +139,8 @@ def SPIRV_GroupBroadcastOp : SPIRV_Op<"GroupBroadcast", // ----- def SPIRV_GroupFAddOp : SPIRV_Op<"GroupFAdd", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ A floating-point add group operation specified for all values of X specified by invocations in the group. @@ -183,12 +188,15 @@ def SPIRV_GroupFAddOp : SPIRV_Op<"GroupFAdd", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupFMaxOp : SPIRV_Op<"GroupFMax", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ A floating-point maximum group operation specified for all values of X specified by invocations in the group. @@ -236,12 +244,15 @@ def SPIRV_GroupFMaxOp : SPIRV_Op<"GroupFMax", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupFMinOp : SPIRV_Op<"GroupFMin", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ A floating-point minimum group operation specified for all values of X specified by invocations in the group. 
@@ -289,12 +300,15 @@ def SPIRV_GroupFMinOp : SPIRV_Op<"GroupFMin", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupIAddOp : SPIRV_Op<"GroupIAdd", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ An integer add group operation specified for all values of X specified by invocations in the group. @@ -342,12 +356,15 @@ def SPIRV_GroupIAddOp : SPIRV_Op<"GroupIAdd", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupIMulKHROp : SPIRV_KhrVendorOp<"GroupIMul", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ An integer multiplication group operation specified for all values of 'X' specified by invocations in the group. @@ -395,12 +412,15 @@ def SPIRV_GroupIMulKHROp : SPIRV_KhrVendorOp<"GroupIMul", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupSMaxOp : SPIRV_Op<"GroupSMax", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ A signed integer maximum group operation specified for all values of X specified by invocations in the group. 
@@ -449,12 +469,15 @@ def SPIRV_GroupSMaxOp : SPIRV_Op<"GroupSMax", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupSMinOp : SPIRV_Op<"GroupSMin", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ A signed integer minimum group operation specified for all values of X specified by invocations in the group. @@ -503,12 +526,15 @@ def SPIRV_GroupSMinOp : SPIRV_Op<"GroupSMin", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupUMaxOp : SPIRV_Op<"GroupUMax", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ An unsigned integer maximum group operation specified for all values of X specified by invocations in the group. @@ -556,12 +582,15 @@ def SPIRV_GroupUMaxOp : SPIRV_Op<"GroupUMax", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- def SPIRV_GroupUMinOp : SPIRV_Op<"GroupUMin", [Pure, - AllTypesMatch<["x", "result"]>]> { + AllTypesMatch<["x", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { let summary = [{ An unsigned integer minimum group operation specified for all values of X specified by invocations in the group. 
@@ -610,6 +639,8 @@ def SPIRV_GroupUMinOp : SPIRV_Op<"GroupUMin", [Pure, let assemblyFormat = [{ $execution_scope $group_operation operands attr-dict `:` type($x) }]; + + let hasVerifier = 0; } // ----- diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td index 7ede319f85a5b..1a0ab0ff98a8a 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td @@ -14,17 +14,12 @@ #ifndef MLIR_DIALECT_SPIRV_IR_NON_UNIFORM_OPS #define MLIR_DIALECT_SPIRV_IR_NON_UNIFORM_OPS -class SPIRV_AttrIs : PredOpTrait< - operand # " must be " # type # " of value " # value, - CPred<"::llvm::cast<::mlir::spirv::" # type # "Attr>(getProperties()." # operand # ").getValue() == ::mlir::spirv::" # type # "::" # value> - >; - -class SPIRV_ExecutionScopeAttrIs : SPIRV_AttrIs; - // ----- class SPIRV_GroupNonUniformArithmeticOp traits = []> : SPIRV_Op { + list traits = []> : SPIRV_Op + ], traits)> { let arguments = (ins SPIRV_ScopeAttr:$execution_scope, @@ -46,7 +41,9 @@ class SPIRV_GroupNonUniformArithmeticOp { +def SPIRV_GroupNonUniformBallotOp : SPIRV_Op<"GroupNonUniformBallot",[ + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is a bitfield value combining the Predicate value from all invocations in the group that execute the same dynamic instance of this @@ -94,11 +91,15 @@ def SPIRV_GroupNonUniformBallotOp : SPIRV_Op<"GroupNonUniformBallot", []> { let assemblyFormat = [{ $execution_scope $predicate attr-dict `:` type($result) }]; + + let hasVerifier = 0; } // ----- -def SPIRV_GroupNonUniformBallotFindLSBOp : SPIRV_Op<"GroupNonUniformBallotFindLSB", []> { +def SPIRV_GroupNonUniformBallotFindLSBOp : SPIRV_Op<"GroupNonUniformBallotFindLSB", [ + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Find the least significant bit set to 1 in Value, 
considering only the bits in Value required to represent all bits of the group's invocations. @@ -150,11 +151,15 @@ def SPIRV_GroupNonUniformBallotFindLSBOp : SPIRV_Op<"GroupNonUniformBallotFindLS let assemblyFormat = [{ $execution_scope $value attr-dict `:` type($value) `,` type($result) }]; + + let hasVerifier = 0; } // ----- -def SPIRV_GroupNonUniformBallotFindMSBOp : SPIRV_Op<"GroupNonUniformBallotFindMSB", []> { +def SPIRV_GroupNonUniformBallotFindMSBOp : SPIRV_Op<"GroupNonUniformBallotFindMSB", [ + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Find the most significant bit set to 1 in Value, considering only the bits in Value required to represent all bits of the group's invocations. @@ -206,12 +211,16 @@ def SPIRV_GroupNonUniformBallotFindMSBOp : SPIRV_Op<"GroupNonUniformBallotFindMS let assemblyFormat = [{ $execution_scope $value attr-dict `:` type($value) `,` type($result) }]; + + let hasVerifier = 0; } // ----- -def SPIRV_GroupNonUniformBroadcastOp : SPIRV_Op<"GroupNonUniformBroadcast", - [Pure, AllTypesMatch<["value", "result"]>]> { +def SPIRV_GroupNonUniformBroadcastOp : SPIRV_Op<"GroupNonUniformBroadcast", [ + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is the Value of the invocation identified by the id Id to all active invocations in the group. 
@@ -269,8 +278,10 @@ def SPIRV_GroupNonUniformBroadcastOp : SPIRV_Op<"GroupNonUniformBroadcast", // ----- -def SPIRV_GroupNonUniformBroadcastFirstOp : SPIRV_Op<"GroupNonUniformBroadcastFirst", - [Pure, SPIRV_ExecutionScopeAttrIs<"execution_scope", "Subgroup">, AllTypesMatch<["value", "result"]>]> { +def SPIRV_GroupNonUniformBroadcastFirstOp : SPIRV_Op<"GroupNonUniformBroadcastFirst", [ + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Subgroup"]>]> { + let summary = [{ Broadcast the value from the active invocation with the lowest id in the subgroup. @@ -323,7 +334,9 @@ def SPIRV_GroupNonUniformBroadcastFirstOp : SPIRV_Op<"GroupNonUniformBroadcastFi // ----- -def SPIRV_GroupNonUniformElectOp : SPIRV_Op<"GroupNonUniformElect", []> { +def SPIRV_GroupNonUniformElectOp : SPIRV_Op<"GroupNonUniformElect", [ + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is true only in the active invocation with the lowest id in the group, otherwise result is false. @@ -357,6 +370,8 @@ def SPIRV_GroupNonUniformElectOp : SPIRV_Op<"GroupNonUniformElect", []> { ); let assemblyFormat = "$execution_scope attr-dict `:` type($result)"; + + let hasVerifier = 0; } // ----- @@ -739,8 +754,10 @@ def SPIRV_GroupNonUniformSMinOp : SPIRV_GroupNonUniformArithmeticOp<"GroupNonUni // ----- -def SPIRV_GroupNonUniformShuffleOp : SPIRV_Op<"GroupNonUniformShuffle", - [Pure, AllTypesMatch<["value", "result"]>]> { +def SPIRV_GroupNonUniformShuffleOp : SPIRV_Op<"GroupNonUniformShuffle", [ + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is the Value of the invocation identified by the id Id. 
}]; @@ -791,8 +808,10 @@ def SPIRV_GroupNonUniformShuffleOp : SPIRV_Op<"GroupNonUniformShuffle", // ----- -def SPIRV_GroupNonUniformShuffleDownOp : SPIRV_Op<"GroupNonUniformShuffleDown", - [Pure, AllTypesMatch<["value", "result"]>]> { +def SPIRV_GroupNonUniformShuffleDownOp : SPIRV_Op<"GroupNonUniformShuffleDown", [ + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is the Value of the invocation identified by the current invocation’s id within the group + Delta. @@ -846,8 +865,10 @@ def SPIRV_GroupNonUniformShuffleDownOp : SPIRV_Op<"GroupNonUniformShuffleDown", // ----- -def SPIRV_GroupNonUniformShuffleUpOp : SPIRV_Op<"GroupNonUniformShuffleUp", - [Pure, AllTypesMatch<["value", "result"]>]> { +def SPIRV_GroupNonUniformShuffleUpOp : SPIRV_Op<"GroupNonUniformShuffleUp", [ + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is the Value of the invocation identified by the current invocation’s id within the group - Delta. @@ -900,8 +921,10 @@ def SPIRV_GroupNonUniformShuffleUpOp : SPIRV_Op<"GroupNonUniformShuffleUp", // ----- -def SPIRV_GroupNonUniformShuffleXorOp : SPIRV_Op<"GroupNonUniformShuffleXor", - [Pure, AllTypesMatch<["value", "result"]>]> { +def SPIRV_GroupNonUniformShuffleXorOp : SPIRV_Op<"GroupNonUniformShuffleXor", [ + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Result is the Value of the invocation identified by the current invocation’s id within the group xor’ed with Mask. 
@@ -1351,8 +1374,8 @@ def SPIRV_GroupNonUniformLogicalXorOp : // ----- def SPIRV_GroupNonUniformBallotBitCountOp : SPIRV_Op<"GroupNonUniformBallotBitCount", [ - SPIRV_ExecutionScopeAttrIs<"execution_scope", "Subgroup">, -]> { + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Subgroup"]>]> { + let summary = [{ Result is the number of bits that are set to 1 in Value, considering only the bits in Value required to represent all bits of the scope @@ -1416,7 +1439,9 @@ def SPIRV_GroupNonUniformBallotBitCountOp : SPIRV_Op<"GroupNonUniformBallotBitCo // ----- def SPIRV_GroupNonUniformRotateKHROp : SPIRV_Op<"GroupNonUniformRotateKHR", [ - Pure, AllTypesMatch<["value", "result"]>]> { + Pure, AllTypesMatch<["value", "result"]>, + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Workgroup", "Subgroup"]>]> { + let summary = [{ Rotate values across invocations within a subgroup. }]; @@ -1490,8 +1515,8 @@ def SPIRV_GroupNonUniformRotateKHROp : SPIRV_Op<"GroupNonUniformRotateKHR", [ // ----- def SPIRV_GroupNonUniformAllOp : SPIRV_Op<"GroupNonUniformAll", [ - SPIRV_ExecutionScopeAttrIs<"execution_scope", "Subgroup"> -]> { + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Subgroup"]>]> { + let summary = [{ Evaluates a predicate for all tangled invocations within the Execution scope, resulting in true if predicate evaluates to true for all tangled @@ -1546,8 +1571,8 @@ def SPIRV_GroupNonUniformAllOp : SPIRV_Op<"GroupNonUniformAll", [ // ----- def SPIRV_GroupNonUniformAnyOp : SPIRV_Op<"GroupNonUniformAny", [ - SPIRV_ExecutionScopeAttrIs<"execution_scope", "Subgroup"> -]> { + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Subgroup"]>]> { + let summary = [{ Evaluates a predicate for all tangled invocations within the Execution scope, resulting in true if predicate evaluates to true for any tangled @@ -1602,8 +1627,8 @@ def SPIRV_GroupNonUniformAnyOp : SPIRV_Op<"GroupNonUniformAny", [ // ----- def SPIRV_GroupNonUniformAllEqualOp : SPIRV_Op<"GroupNonUniformAllEqual", [ - 
SPIRV_ExecutionScopeAttrIs<"execution_scope", "Subgroup"> -]> { + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Subgroup"]>]> { + let summary = [{ Evaluates a value for all tangled invocations within the Execution scope. The result is true if Value is equal for all tangled invocations @@ -1663,8 +1688,8 @@ def SPIRV_GroupNonUniformAllEqualOp : SPIRV_Op<"GroupNonUniformAllEqual", [ // ----- def SPIRV_GroupNonUniformQuadSwapOp : SPIRV_Op<"GroupNonUniformQuadSwap", [ - SPIRV_ExecutionScopeAttrIs<"execution_scope", "Subgroup">, AllTypesMatch<["value", "result"]> -]> { + SPIRV_ExecutionScopeAttrIs<"execution_scope", ["Subgroup"]>, AllTypesMatch<["value", "result"]>]> { + let summary = [{ Swap the Value of the invocation within the quad with another invocation in the quad using Direction. diff --git a/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp b/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp index a1bb7f89e9183..fe6f00e9e5bca 100644 --- a/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp @@ -22,15 +22,6 @@ namespace mlir::spirv { template static LogicalResult verifyGroupNonUniformArithmeticOp(Operation *groupOp) { - spirv::Scope scope = - groupOp - ->getAttrOfType( - OpTy::getExecutionScopeAttrName(groupOp->getName())) - .getValue(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return groupOp->emitOpError( - "execution scope must be 'Workgroup' or 'Subgroup'"); - GroupOperation operation = groupOp ->getAttrOfType( @@ -61,10 +52,6 @@ static LogicalResult verifyGroupNonUniformArithmeticOp(Operation *groupOp) { //===----------------------------------------------------------------------===// LogicalResult GroupBroadcastOp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - if (auto localIdTy = dyn_cast(getLocalid().getType())) if (localIdTy.getNumElements() != 2 && 
localIdTy.getNumElements() != 3) return emitOpError("localid is a vector and can be with only " @@ -74,51 +61,11 @@ LogicalResult GroupBroadcastOp::verify() { return success(); } -//===----------------------------------------------------------------------===// -// spirv.GroupNonUniformBallotOp -//===----------------------------------------------------------------------===// - -LogicalResult GroupNonUniformBallotOp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - - return success(); -} - -//===----------------------------------------------------------------------===// -// spirv.GroupNonUniformBallotFindLSBOp -//===----------------------------------------------------------------------===// - -LogicalResult GroupNonUniformBallotFindLSBOp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - - return success(); -} - -//===----------------------------------------------------------------------===// -// spirv.GroupNonUniformBallotFindLSBOp -//===----------------------------------------------------------------------===// - -LogicalResult GroupNonUniformBallotFindMSBOp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - - return success(); -} - //===----------------------------------------------------------------------===// // spirv.GroupNonUniformBroadcast //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformBroadcastOp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return 
emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - // SPIR-V spec: "Before version 1.5, Id must come from a // constant instruction. auto targetEnv = spirv::getDefaultTargetEnv(getContext()); @@ -141,10 +88,6 @@ LogicalResult GroupNonUniformBroadcastOp::verify() { template static LogicalResult verifyGroupNonUniformShuffleOp(OpTy op) { - spirv::Scope scope = op.getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return op.emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - if (op.getOperands().back().getType().isSignedInteger()) return op.emitOpError("second operand must be a singless/unsigned integer"); @@ -164,18 +107,6 @@ LogicalResult GroupNonUniformShuffleXorOp::verify() { return verifyGroupNonUniformShuffleOp(*this); } -//===----------------------------------------------------------------------===// -// spirv.GroupNonUniformElectOp -//===----------------------------------------------------------------------===// - -LogicalResult GroupNonUniformElectOp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - - return success(); -} - //===----------------------------------------------------------------------===// // spirv.GroupNonUniformFAddOp //===----------------------------------------------------------------------===// @@ -309,10 +240,6 @@ LogicalResult GroupNonUniformLogicalXorOp::verify() { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformRotateKHROp::verify() { - spirv::Scope scope = getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - if (Value clusterSizeVal = getClusterSize()) { mlir::Operation *defOp = clusterSizeVal.getDefiningOp(); int32_t 
clusterSize = 0; @@ -327,37 +254,4 @@ LogicalResult GroupNonUniformRotateKHROp::verify() { return success(); } -//===----------------------------------------------------------------------===// -// Group op verification -//===----------------------------------------------------------------------===// - -template -static LogicalResult verifyGroupOp(Op op) { - spirv::Scope scope = op.getExecutionScope(); - if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) - return op.emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); - - return success(); -} - -LogicalResult GroupIAddOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupFAddOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupFMinOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupUMinOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupSMinOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupFMaxOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupUMaxOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupSMaxOp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupIMulKHROp::verify() { return verifyGroupOp(*this); } - -LogicalResult GroupFMulKHROp::verify() { return verifyGroupOp(*this); } - } // namespace mlir::spirv diff --git a/mlir/test/Dialect/SPIRV/IR/group-ops.mlir b/mlir/test/Dialect/SPIRV/IR/group-ops.mlir index e69a07ff885f0..1034b9f02ef52 100644 --- a/mlir/test/Dialect/SPIRV/IR/group-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/group-ops.mlir @@ -41,7 +41,7 @@ func.func @group_broadcast_vector(%value: vector<4xf32>, %localid: vector<3xi32> // ----- func.func @group_broadcast_negative_scope(%value: f32, %localid: vector<3xi32> ) -> f32 { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupBroadcast %value, %localid : f32, vector<3xi32> return 
%0: f32 } @@ -196,3 +196,4 @@ func.func @group_fmul(%value: f32) -> f32 { %0 = spirv.KHR.GroupFMul %value : f32 return %0: f32 } + diff --git a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir index fb18d69f58241..abc6964026646 100644 --- a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir @@ -13,7 +13,7 @@ func.func @group_non_uniform_ballot(%predicate: i1) -> vector<4xi32> { // ----- func.func @group_non_uniform_ballot(%predicate: i1) -> vector<4xi32> { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformBallot %predicate : vector<4xi32> return %0: vector<4xi32> } @@ -41,7 +41,7 @@ func.func @group_non_uniform_ballot_find_lsb(%value : vector<4xi32>) -> i32 { // ----- func.func @group_non_uniform_ballot_find_lsb(%value : vector<4xi32>) -> i32 { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformBallotFindLSB %value : vector<4xi32>, i32 return %0: i32 } @@ -69,7 +69,7 @@ func.func @group_non_uniform_ballot_find_msb(%value : vector<4xi32>) -> i32 { // ----- func.func @group_non_uniform_ballot_find_msb(%value : vector<4xi32>) -> i32 { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformBallotFindMSB %value : vector<4xi32>, i32 return %0: i32 } @@ -108,7 +108,7 @@ func.func @group_non_uniform_broadcast_vector(%value: vector<4xf32>) -> vector<4 func.func @group_non_uniform_broadcast_negative_scope(%value: f32, %localid: i32 ) -> f32 { %one = spirv.Constant 1 : i32 - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must 
be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformBroadcast %value, %one : f32, i32 return %0: f32 } @@ -146,7 +146,7 @@ func.func @group_non_uniform_broadcast_first_vector(%value: vector<4xf32>) -> ve // ----- func.func @group_non_uniform_broadcast_first_negative_scope(%value: f32) -> f32 { - // expected-error @+1 {{execution_scope must be Scope of value Subgroup}} + // expected-error @+1 {{execution_scope must be 'Subgroup'}} %0 = spirv.GroupNonUniformBroadcastFirst %value : f32 return %0 : f32 } @@ -184,7 +184,7 @@ func.func @group_non_uniform_elect() -> i1 { // ----- func.func @group_non_uniform_elect() -> i1 { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformElect : i1 return %0: i1 } @@ -295,7 +295,7 @@ func.func @group_non_uniform_iadd_clustered_reduce(%val: vector<2xi32>) -> vecto // ----- func.func @group_non_uniform_iadd_reduce(%val: i32) -> i32 { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformIAdd %val : i32 -> i32 return %0: i32 } @@ -395,7 +395,7 @@ func.func @group_non_uniform_shuffle2(%val: vector<2xf32>, %id: i32) -> vector<2 // ----- func.func @group_non_uniform_shuffle(%val: vector<2xf32>, %id: i32) -> vector<2xf32> { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformShuffle %val, %id : vector<2xf32>, i32 return %0: vector<2xf32> } @@ -431,7 +431,7 @@ func.func @group_non_uniform_shuffle2(%val: vector<2xf32>, %id: i32) -> vector<2 // ----- func.func @group_non_uniform_shuffle(%val: vector<2xf32>, %id: i32) -> vector<2xf32> { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must 
be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformShuffleXor %val, %id : vector<2xf32>, i32 return %0: vector<2xf32> } @@ -467,7 +467,7 @@ func.func @group_non_uniform_shuffle2(%val: vector<2xf32>, %id: i32) -> vector<2 // ----- func.func @group_non_uniform_shuffle(%val: vector<2xf32>, %id: i32) -> vector<2xf32> { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformShuffleUp %val, %id : vector<2xf32>, i32 return %0: vector<2xf32> } @@ -503,7 +503,7 @@ func.func @group_non_uniform_shuffle2(%val: vector<2xf32>, %id: i32) -> vector<2 // ----- func.func @group_non_uniform_shuffle(%val: vector<2xf32>, %id: i32) -> vector<2xf32> { - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformShuffleDown %val, %id : vector<2xf32>, i32 return %0: vector<2xf32> } @@ -695,7 +695,7 @@ func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { %four = spirv.Constant 4 : i32 - // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + // expected-error @+1 {{execution_scope must be 'Workgroup' or 'Subgroup'}} %0 = spirv.GroupNonUniformRotateKHR %val, %delta, cluster_size(%four) : f32, i32, i32 -> f32 return %0: f32 } @@ -751,7 +751,7 @@ func.func @group_non_uniform_all(%predicate: i1) -> i1 { // ----- func.func @group_non_uniform_all(%predicate: i1) -> i1 { - // expected-error @+1 {{execution_scope must be Scope of value Subgroup}} + // expected-error @+1 {{execution_scope must be 'Subgroup'}} %0 = spirv.GroupNonUniformAll %predicate : i1 return %0: i1 } @@ -772,7 +772,7 @@ func.func @group_non_uniform_any(%predicate: i1) -> i1 { // ----- func.func @group_non_uniform_any(%predicate: i1) -> i1 { - // expected-error @+1 
{{execution_scope must be Scope of value Subgroup}} + // expected-error @+1 {{execution_scope must be 'Subgroup'}} %0 = spirv.GroupNonUniformAny %predicate : i1 return %0: i1 } @@ -803,7 +803,7 @@ func.func @group_non_uniform_all_equal(%value: vector<4xi32>) -> i1 { // ----- func.func @group_non_uniform_all_equal(%value: f32) -> i1 { - // expected-error @+1 {{execution_scope must be Scope of value Subgroup}} + // expected-error @+1 {{execution_scope must be 'Subgroup'}} %0 = spirv.GroupNonUniformAllEqual %value : f32, i1 return %0: i1 } @@ -837,7 +837,7 @@ func.func @group_non_uniform_quad_swap(%value: vector<4xf32>) -> vector<4xf32> { // ----- func.func @group_non_uniform_quad_swap(%value: vector<4xf32>) -> vector<4xf32> { - // expected-error @+1 {{execution_scope must be Scope of value Subgroup}} + // expected-error @+1 {{execution_scope must be 'Subgroup'}} %0 = spirv.GroupNonUniformQuadSwap %value : vector<4xf32> return %0: vector<4xf32> } @@ -874,7 +874,7 @@ func.func @group_non_uniform_ballot_bit_count(%value: vector<4xi32>) -> i32 { // ----- func.func @group_non_uniform_ballot_bit_count_wrong_scope(%value: vector<4xi32>) -> i32 { - // expected-error @+1 {{execution_scope must be Scope of value Subgroup}} + // expected-error @+1 {{execution_scope must be 'Subgroup'}} %0 = spirv.GroupNonUniformBallotBitCount %value : vector<4xi32> -> i32 return %0: i32 } From 5a1635c6bb90e72c285527c285ed1d6596857085 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 10 May 2026 22:57:04 +0200 Subject: [PATCH 199/538] [LV] Add tests for load/store scalarization and ptrcasts (NFC) (#196839) Add missing test coverage for range of pointer casts and load/store scalarization. 
--- .../scalarize-wide-load-for-address-use.ll | 227 ++++++++++++++++++ ...reserve-inbounds-gep-with-pointer-casts.ll | 154 ++++++++++++ .../LoopVectorize/version-mem-access.ll | 48 ++++ 3 files changed, 429 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/VPlan/X86/scalarize-wide-load-for-address-use.ll create mode 100644 llvm/test/Transforms/LoopVectorize/preserve-inbounds-gep-with-pointer-casts.ll diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/X86/scalarize-wide-load-for-address-use.ll b/llvm/test/Transforms/LoopVectorize/VPlan/X86/scalarize-wide-load-for-address-use.ll new file mode 100644 index 0000000000000..02efbf0ac738b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/VPlan/X86/scalarize-wide-load-for-address-use.ll @@ -0,0 +1,227 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter-out-after "middle.block:" --version 6 +; RUN: opt -passes=loop-vectorize -disable-output \ +; RUN: -vplan-print-after=makeMemOpWideningDecisions \ +; RUN: -force-vector-width=4 -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell \ +; RUN: %s 2>&1 | FileCheck %s + +@tbl.a = external global [1024 x double] +@tbl.b = external global [1024 x double] + +; Reverse-consecutive load feeding a store address. 
+define void @reverse_unmasked_load_feeds_address(ptr noalias %src, i64 %n) { +; CHECK-LABEL: VPlan for loop in 'reverse_unmasked_load_feeds_address' +; CHECK: VPlan ' for UF>=1' { +; CHECK-NEXT: Live-in vp<[[VP0:%[0-9]+]]> = VF +; CHECK-NEXT: Live-in vp<[[VP1:%[0-9]+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %n.minus.1 = sub i64 %n, 1 +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vp<[[VP3:%[0-9]+]]> = CANONICAL-IV +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<%n.minus.1>, ir<-1>, vp<[[VP0]]> +; CHECK-NEXT: EMIT ir<%gep> = getelementptr ir<%src>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%val> = load ir<%gep> +; CHECK-NEXT: EMIT ir<%cmp> = fcmp oeq ir<%val>, ir<0.000000e+00> +; CHECK-NEXT: EMIT ir<%ptr.sel> = select ir<%cmp>, ir<@tbl.a>, ir<@tbl.b> +; CHECK-NEXT: REPLICATE store ir<1.000000e+00>, ir<%ptr.sel> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; +entry: + %n.minus.1 = sub i64 %n, 1 + br label %loop + +loop: + %iv = phi i64 [ %n.minus.1, %entry ], [ %iv.next, %loop ] + %gep = getelementptr double, ptr %src, i64 %iv + %val = load double, ptr %gep, align 8 + %cmp = fcmp oeq double %val, 0.0 + %ptr.sel = select i1 %cmp, ptr @tbl.a, ptr @tbl.b + store double 1.0, ptr %ptr.sel, align 8 + %iv.next = add i64 %iv, -1 + %ec = icmp eq i64 %iv, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Widened load whose value is used both as an address (via select) and as +; a vector arithmetic operand (fmul). 
+define void @mixed_address_and_vector_uses(ptr noalias %src, ptr noalias %dst, i64 %n) { +; CHECK-LABEL: VPlan for loop in 'mixed_address_and_vector_uses' +; CHECK: VPlan ' for UF>=1' { +; CHECK-NEXT: Live-in vp<[[VP0:%[0-9]+]]> = VF +; CHECK-NEXT: Live-in vp<[[VP1:%[0-9]+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vp<[[VP3:%[0-9]+]]> = CANONICAL-IV +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]> +; CHECK-NEXT: EMIT ir<%gep.src> = getelementptr ir<%src>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%val> = load ir<%gep.src> +; CHECK-NEXT: EMIT ir<%cmp> = fcmp oeq ir<%val>, ir<0.000000e+00> +; CHECK-NEXT: EMIT ir<%ptr.sel> = select ir<%cmp>, ir<@tbl.a>, ir<@tbl.b> +; CHECK-NEXT: REPLICATE store ir<1.000000e+00>, ir<%ptr.sel> +; CHECK-NEXT: EMIT ir<%doubled> = fmul ir<%val>, ir<2.000000e+00> +; CHECK-NEXT: EMIT ir<%gep.dst> = getelementptr ir<%dst>, ir<%iv> +; CHECK-NEXT: vp<[[VP4:%[0-9]+]]> = vector-pointer ir<%gep.dst> +; CHECK-NEXT: WIDEN store vp<[[VP4]]>, ir<%doubled> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr double, ptr %src, i64 %iv + %val = load double, ptr %gep.src, align 8 + %cmp = fcmp oeq double %val, 0.0 + %ptr.sel = select i1 %cmp, ptr @tbl.a, ptr @tbl.b + store double 1.0, ptr %ptr.sel, align 8 + %doubled = fmul double %val, 2.0 + %gep.dst = 
getelementptr double, ptr %dst, i64 %iv + store double %doubled, ptr %gep.dst, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Interleave-group candidate where one member's value feeds a store address. +%struct.pair = type { ptr, double } +define void @interleave_member_feeds_address(ptr noalias %arr, i64 %n) { +; CHECK-LABEL: VPlan for loop in 'interleave_member_feeds_address' +; CHECK: VPlan ' for UF>=1' { +; CHECK-NEXT: Live-in vp<[[VP0:%[0-9]+]]> = VF +; CHECK-NEXT: Live-in vp<[[VP1:%[0-9]+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vp<[[VP3:%[0-9]+]]> = CANONICAL-IV +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]> +; CHECK-NEXT: EMIT ir<%gep.p> = getelementptr ir<%arr>, ir<%iv>, ir<0> +; CHECK-NEXT: REPLICATE ir<%p> = load ir<%gep.p> +; CHECK-NEXT: EMIT ir<%gep.v> = getelementptr ir<%arr>, ir<%iv>, ir<1> +; CHECK-NEXT: REPLICATE ir<%v> = load ir<%gep.v> +; CHECK-NEXT: REPLICATE store ir<%v>, ir<%p> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.p = getelementptr %struct.pair, ptr %arr, i64 %iv, i32 0 + %p = load ptr, ptr %gep.p, align 8 + %gep.v = getelementptr %struct.pair, ptr %arr, i64 %iv, i32 1 + %v = load double, ptr %gep.v, align 8 + store double %v, ptr %p, align 8 + %iv.next = add 
i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Load with symbolic stride that PSE versions to 1 at runtime. +define void @symbolic_stride_versioned_to_one(ptr noalias %src, ptr noalias %dst, i64 %n, i64 %stride) { +; CHECK-LABEL: VPlan for loop in 'symbolic_stride_versioned_to_one' +; CHECK: VPlan ' for UF>=1' { +; CHECK-NEXT: Live-in vp<[[VP0:%[0-9]+]]> = VF +; CHECK-NEXT: Live-in vp<[[VP1:%[0-9]+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vp<[[VP3:%[0-9]+]]> = CANONICAL-IV +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]> +; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride> +; CHECK-NEXT: EMIT ir<%gep.src> = getelementptr ir<%src>, ir<%idx> +; CHECK-NEXT: vp<[[VP4:%[0-9]+]]> = vector-pointer ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%val> = load vp<[[VP4]]> +; CHECK-NEXT: EMIT ir<%doubled> = fmul ir<%val>, ir<2.000000e+00> +; CHECK-NEXT: EMIT ir<%gep.dst> = getelementptr ir<%dst>, ir<%iv> +; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = vector-pointer ir<%gep.dst> +; CHECK-NEXT: WIDEN store vp<[[VP5]]>, ir<%doubled> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx = mul i64 %iv, %stride + %gep.src = getelementptr double, ptr %src, i64 %idx + %val = load double, ptr %gep.src, align 8 + %doubled = fmul double %val, 2.0 + %gep.dst = 
getelementptr double, ptr %dst, i64 %iv + store double %doubled, ptr %gep.dst, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/preserve-inbounds-gep-with-pointer-casts.ll b/llvm/test/Transforms/LoopVectorize/preserve-inbounds-gep-with-pointer-casts.ll new file mode 100644 index 0000000000000..a9bb1c8f7f7be --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/preserve-inbounds-gep-with-pointer-casts.ll @@ -0,0 +1,154 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s + +define void @preserve_inbounds_through_noop_gep(ptr %p) { +; CHECK-LABEL: define void @preserve_inbounds_through_noop_gep( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[VECTOR_BODY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY1:.*]] +; CHECK: [[VECTOR_BODY1]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP1]], align 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, 
%loop ] + %gep = getelementptr inbounds i32, ptr %p, i32 %iv + %noop = getelementptr i8, ptr %gep, i64 0 + store i32 0, ptr %noop + %iv.next = add nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @bitcast_in_ptr_chain(ptr %src, ptr noalias %dst) { +; CHECK-LABEL: define void @bitcast_in_ptr_chain( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast ptr [[TMP0]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 4 +; CHECK-NEXT: store <4 x double> [[WIDE_LOAD]], ptr [[TMP3]], align 8 +; CHECK-NEXT: store <4 x double> [[WIDE_LOAD1]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds double, ptr %src, i64 %iv + %bc = bitcast ptr %gep.src to ptr + %val = load double, ptr %bc, align 8 + %gep.dst = getelementptr inbounds 
double, ptr %dst, i64 %iv + store double %val, ptr %gep.dst, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @addrspacecast_in_ptr_chain(ptr addrspace(1) %src, ptr noalias %dst) { +; CHECK-LABEL: define void @addrspacecast_in_ptr_chain( +; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr addrspace(1) [[SRC]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr addrspace(1) [[SRC]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast <4 x ptr addrspace(1)> [[TMP0]] to <4 x ptr> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x ptr> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = addrspacecast <4 x ptr addrspace(1)> [[TMP1]] to <4 x ptr> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x ptr> [[TMP7]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = 
load double, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x double> poison, double [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP18]], double [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x double> poison, double [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP22]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP23]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP28]], i64 4 +; CHECK-NEXT: store <4 x double> [[TMP19]], ptr [[TMP28]], align 8 +; CHECK-NEXT: store <4 x double> [[TMP27]], ptr [[TMP29]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP30]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds double, ptr addrspace(1) %src, i64 %iv + %ac = 
addrspacecast ptr addrspace(1) %gep.src to ptr + %val = load double, ptr %ac, align 8 + %gep.dst = getelementptr inbounds double, ptr %dst, i64 %iv + store double %val, ptr %gep.dst, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll index c573ebaa51e9f..3d0272bef7dfa 100644 --- a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll +++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll @@ -168,3 +168,51 @@ loop: exit: ret void } + +define void @load_symbolic_stride_versioned_to_one(ptr noalias %src, ptr noalias %dst, +; CHECK-LABEL: define void @load_symbolic_stride_versioned_to_one( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]], i64 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1 +; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 2.000000e+00) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP2]], align 8 +; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; + i64 %n, i64 %stride) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx = mul i64 %iv, %stride + %gep.src = getelementptr double, ptr %src, i64 %idx + %val = load double, ptr %gep.src, align 8 + %doubled = fmul double %val, 2.0 + %gep.dst = getelementptr double, ptr %dst, i64 %iv + store double %doubled, ptr %gep.dst, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} From d565d5185be40cf17b07d734c7ee21a5d59e764b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 10 May 2026 23:14:27 +0200 Subject: [PATCH 200/538] [LV] Add missing cost tests for various unary and binary ops (NFC) (#196841) --- .../LoopVectorize/AArch64/binop-costs.ll | 35 +- .../LoopVectorize/AArch64/reduction-cost.ll | 111 ++++++ .../X86/CostModel/vpinstruction-cost.ll | 341 ++++++++++++++++-- 3 files changed, 462 insertions(+), 25 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/binop-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/binop-costs.ll index 343276bc9fe30..e42090d556f3c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/binop-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/binop-costs.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "[Cc]ost.*udiv" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "[Cc]ost.*(udiv|fneg|fmul)" ; RUN: opt -passes=loop-vectorize 
-debug-only=loop-vectorize %s -S -o - 2>&1 | FileCheck %s ; REQUIRES: asserts @@ -36,4 +36,37 @@ exit: ret void } +define void @fneg_used_by_fmul_scalar_cost_is_zero(ptr %dst) #0 { +; CHECK-LABEL: 'fneg_used_by_fmul_scalar_cost_is_zero' +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %neg = fneg double %conv +; CHECK: LV: Found an estimated cost of 2 for VF 1 For instruction: %mul = fmul double %neg, 2.500000e-04 +; CHECK: Cost of 1 for VF 2: WIDEN ir<%neg> = fneg ir<%conv> +; CHECK: Cost of 2 for VF 2: WIDEN ir<%mul> = fmul ir<%neg>, ir<2.500000e-04> +; CHECK: Cost of 0 for VF 2: IR %neg = fneg double %conv +; CHECK: Cost of 0 for VF 2: IR %mul = fmul double %neg, 2.500000e-04 +; CHECK: Cost of Invalid for VF vscale x 1: WIDEN ir<%neg> = fneg ir<%conv> +; CHECK: Cost of Invalid for VF vscale x 1: WIDEN ir<%mul> = fmul ir<%neg>, ir<2.500000e-04> +; CHECK: Cost of 1 for VF vscale x 2: WIDEN ir<%neg> = fneg ir<%conv> +; CHECK: Cost of 2 for VF vscale x 2: WIDEN ir<%mul> = fmul ir<%neg>, ir<2.500000e-04> +; CHECK: Cost of 0 for VF vscale x 2: IR %neg = fneg double %conv +; CHECK: Cost of 0 for VF vscale x 2: IR %mul = fmul double %neg, 2.500000e-04 +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %conv = uitofp i64 %iv to double + %neg = fneg double %conv + %mul = fmul double %neg, 2.500000e-04 + %gep.dst = getelementptr double, ptr %dst, i64 %iv + store double %mul, ptr %gep.dst, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + attributes #0 = { "target-cpu"="neoverse-v2" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll index e5886d83c0182..92e75f26fffcb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll @@ -59,3 +59,114 @@ 
exit: %res = add i64 %ext, %mul ret i64 %res } + +; The scalar cost of this loop must include the freeze's cost, otherwise VF=2 +; is incorrectly rejected as unprofitable. +define i32 @or_reduction_with_freeze(ptr %dst, ptr %src) { +; CHECK-LABEL: define i32 @or_reduction_with_freeze( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC7:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST6:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST6]], [[SRC7]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 18 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[DST1]] to i3 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[SRC2]] to i3 +; CHECK-NEXT: [[TMP5:%.*]] = sub i3 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = zext i3 [[TMP5]] to i64 +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[DST3]], [[SRC4]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP7]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP10]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: 
[[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[DST]], align 8, !alias.scope [[META4:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i64 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[NEXT_GEP]], align 8, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP15]], align 8, !alias.scope [[META7]], !noalias [[META4]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[NEXT_GEP]], align 8, !alias.scope [[META7]], !noalias [[META4]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[TMP15]], align 8, !alias.scope [[META7]], !noalias [[META4]] +; CHECK-NEXT: [[TMP16:%.*]] = or <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = or <2 x i64> [[WIDE_LOAD9]], 
[[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP18:%.*]] = freeze <2 x i64> [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = freeze <2 x i64> [[TMP17]] +; CHECK-NEXT: [[TMP20]] = or <2 x i64> [[TMP18]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP21]] = or <2 x i64> [[TMP19]], [[VEC_PHI8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_SCEVCHECK]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[MASK:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[OR:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L_SRC:%.*]] = load i64, ptr [[PTR_IV]], align 8 +; CHECK-NEXT: store i64 0, ptr [[PTR_IV]], align 8 +; CHECK-NEXT: [[L_DST:%.*]] = load i64, ptr [[DST]], align 8 +; CHECK-NEXT: [[OR_1:%.*]] = or i64 [[L_SRC]], [[L_DST]] +; CHECK-NEXT: [[FR:%.*]] = freeze i64 [[OR_1]] +; CHECK-NEXT: [[OR]] = or i64 [[FR]], [[MASK]] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr i8, ptr [[PTR_IV]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR_IV]], [[DST]] +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop 
[[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i64 [ [[OR]], %[[LOOP]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[CAST:%.*]] = trunc i64 [[OR_LCSSA]] to i32 +; CHECK-NEXT: ret i32 [[CAST]] +; +entry: + br label %loop + +loop: + %mask = phi i64 [ 0, %entry ], [ %or, %loop ] + %ptr.iv = phi ptr [ %src, %entry ], [ %incdec.ptr, %loop ] + %l.src = load i64, ptr %ptr.iv, align 8 + store i64 0, ptr %ptr.iv, align 8 + %l.dst = load i64, ptr %dst, align 8 + %or.1 = or i64 %l.src, %l.dst + %fr = freeze i64 %or.1 + %or = or i64 %fr, %mask + %incdec.ptr = getelementptr i8, ptr %ptr.iv, i64 8 + %cmp = icmp eq ptr %ptr.iv, %dst + br i1 %cmp, label %exit, label %loop + +exit: + %cast = trunc i64 %or to i32 + ret i32 %cast +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index 77f3f0da07c16..12d32872e1453 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction" --filter "Cost of" ; RUN: opt -S -passes=loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -debug -disable-output -S %s 2>&1 | FileCheck %s ; REQUIRES: asserts @@ -7,57 +7,70 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80: define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) { ; CHECK-LABEL: 'wide_or_replaced_with_add_vpinstruction' +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 
For instruction: %g.src = getelementptr inbounds i64, ptr %src, i64 %iv +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %l = load i64, ptr %g.src, align 8 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.4 = add nuw nsw i64 %iv, 4 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %c = icmp ule i64 %l, 128 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %c, label %loop.then, label %loop.latch +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %or = or disjoint i64 %iv.4, 1 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.dst = getelementptr inbounds i64, ptr %dst, i64 %or +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %iv.4, ptr %g.dst, align 4 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br label %loop.latch +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond = icmp eq i64 %iv.next, 32 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond, label %exit, label %loop.header ; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] ; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 -; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> -; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer inbounds ir<%g.src> -; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<%5> +; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VP0:%[0-9]+]]> +; 
CHECK: Cost of 0 for VF 2: vp<[[VP4:%[0-9]+]]> = SCALAR-STEPS vp<[[VP3:%[0-9]+]]>, ir<1>, vp<[[VP0]]> +; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%g.src> +; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<[[VP5]]> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%iv.4> = add ir<%iv>, ir<4> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%c> = icmp ule ir<%l>, ir<128> ; CHECK: Cost of 1 for VF 2: EMIT ir<%or> = add ir<%iv.4>, ir<1> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.dst> = getelementptr ir<%dst>, ir<%or> -; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%g.dst> -; CHECK: Cost of 1 for VF 2: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> -; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 0 for VF 2: vp<[[VP6:%[0-9]+]]> = vector-pointer ir<%g.dst> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP6]]>, ir<%iv.4>, ir<%c> +; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]> ; CHECK: Cost of 0 for VF 2: vector loop backedge -; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP2]]>, middle.block ], [ ir<0>, ir-bb ] ; CHECK: Cost of 0 for VF 2: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph) ; CHECK: Cost of 0 for VF 2: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv ; CHECK: Cost of 0 for VF 2: IR %l = load i64, ptr %g.src, align 8 ; CHECK: Cost of 0 for VF 2: IR %iv.4 = add nuw nsw i64 %iv, 4 ; CHECK: Cost of 0 for VF 2: IR %c = icmp ule i64 %l, 128 -; CHECK: Cost of 0 for VF 2: EMIT vp<%cmp.n> = icmp eq ir<32>, 
vp<%2> +; CHECK: Cost of 0 for VF 2: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<[[VP2]]> ; CHECK: Cost of 0 for VF 2: EMIT branch-on-cond vp<%cmp.n> ; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] ; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 -; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> -; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer inbounds ir<%g.src> -; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<%5> +; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VP0]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP4]]> = SCALAR-STEPS vp<[[VP3]]>, ir<1>, vp<[[VP0]]> +; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP5]]> = vector-pointer inbounds ir<%g.src> +; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<[[VP5]]> ; CHECK: Cost of 1 for VF 4: WIDEN ir<%iv.4> = add ir<%iv>, ir<4> ; CHECK: Cost of 1 for VF 4: WIDEN ir<%c> = icmp ule ir<%l>, ir<128> ; CHECK: Cost of 1 for VF 4: EMIT ir<%or> = add ir<%iv.4>, ir<1> ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.dst> = getelementptr ir<%dst>, ir<%or> -; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer ir<%g.dst> -; CHECK: Cost of 1 for VF 4: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> -; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 0 for VF 4: vp<[[VP6]]> = vector-pointer ir<%g.dst> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP6]]>, ir<%iv.4>, ir<%c> +; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw 
vp<[[VP3]]>, vp<[[VP1]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> ; CHECK: Cost of 0 for VF 4: vector loop backedge -; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP2]]>, middle.block ], [ ir<0>, ir-bb ] ; CHECK: Cost of 0 for VF 4: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph) ; CHECK: Cost of 0 for VF 4: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv ; CHECK: Cost of 0 for VF 4: IR %l = load i64, ptr %g.src, align 8 ; CHECK: Cost of 0 for VF 4: IR %iv.4 = add nuw nsw i64 %iv, 4 ; CHECK: Cost of 0 for VF 4: IR %c = icmp ule i64 %l, 128 -; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<%2> +; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<[[VP2]]> ; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> -; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<%2> +; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<[[VP2]]> ; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> ; entry: @@ -85,3 +98,283 @@ loop.latch: exit: ret void } + +define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { +; CHECK-LABEL: 'test_vpinstruction_freeze_cost' +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.src = getelementptr inbounds i64, ptr %src, i64 %iv +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %l = load i64, ptr %g.src, align 8 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %fr = freeze i64 %l +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 1 for VF 
1 For instruction: store i64 %fr, ptr %g.dst, align 8 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 32 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: Cost of 1 for VF 2: exit condition instruction %ec = icmp eq i64 %iv.next, 32 +; CHECK: Cost of 0 for VF 2: vp<[[VP4:%[0-9]+]]> = SCALAR-STEPS vp<[[VP3:%[0-9]+]]>, ir<1>, vp<[[VP0:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%g.src> +; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<[[VP5]]> +; CHECK: Cost of 2 for VF 2: WIDEN ir<%fr> = freeze ir<%l> +; CHECK: Cost of 0 for VF 2: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%g.dst> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP6]]>, ir<%fr> +; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: vector loop backedge +; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP2]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 2: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 2: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv +; CHECK: Cost of 0 for VF 2: IR %l = load i64, ptr %g.src, align 8 +; CHECK: Cost of 0 for VF 2: IR %fr = freeze i64 %l +; CHECK: 
Cost of 0 for VF 2: IR %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv +; CHECK: Cost of 0 for VF 2: IR store i64 %fr, ptr %g.dst, align 8 +; CHECK: Cost of 0 for VF 2: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 2: IR %ec = icmp eq i64 %iv.next, 32 +; CHECK: Cost of 0 for VF 2: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-cond vp<%cmp.n> +; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: Cost of 1 for VF 4: exit condition instruction %ec = icmp eq i64 %iv.next, 32 +; CHECK: Cost of 0 for VF 4: vp<[[VP4]]> = SCALAR-STEPS vp<[[VP3]]>, ir<1>, vp<[[VP0]]> +; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP5]]> = vector-pointer inbounds ir<%g.src> +; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<[[VP5]]> +; CHECK: Cost of 2 for VF 4: WIDEN ir<%fr> = freeze ir<%l> +; CHECK: Cost of 0 for VF 4: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP6]]> = vector-pointer inbounds ir<%g.dst> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP6]]>, ir<%fr> +; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: vector loop backedge +; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP2]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 4: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 4: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv +; CHECK: Cost of 0 for VF 4: IR %l = load i64, ptr %g.src, align 8 +; CHECK: Cost of 0 for VF 4: IR %fr = freeze i64 %l +; CHECK: Cost 
of 0 for VF 4: IR %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv +; CHECK: Cost of 0 for VF 4: IR store i64 %fr, ptr %g.dst, align 8 +; CHECK: Cost of 0 for VF 4: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 4: IR %ec = icmp eq i64 %iv.next, 32 +; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> +; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %g.src = getelementptr inbounds i64, ptr %src, i64 %iv + %l = load i64, ptr %g.src, align 8 + %fr = freeze i64 %l + %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv + store i64 %fr, ptr %g.dst, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 32 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_vpinstruction_switch_cost(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_vpinstruction_switch_cost' +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %l = load i64, ptr %ptr.iv, align 1 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: switch i64 %l, label %default [ +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 1, ptr %ptr.iv, align 1 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br label %loop.latch +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 0, ptr %ptr.iv, align 1 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br label %loop.latch +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 42, ptr %ptr.iv, align 1 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br 
label %loop.latch +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 2, ptr %ptr.iv, align 1 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br label %loop.latch +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq ptr %ptr.iv.next, %end +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop.header +; CHECK: Cost of 0 for VF 2: induction instruction %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1 +; CHECK: Cost of 0 for VF 2: induction instruction %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] +; CHECK: Cost of 1 for VF 2: exit condition instruction %ec = icmp eq ptr %ptr.iv.next, %end +; CHECK: Cost of 0 for VF 2: vp<[[VP6:%[0-9]+]]> = DERIVED-IV ir<0> + vp<[[VP5:%[0-9]+]]> * ir<8> +; CHECK: Cost of 0 for VF 2: vp<[[VP7:%[0-9]+]]> = SCALAR-STEPS vp<[[VP6]]>, ir<8>, vp<[[VP0:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<[[VP8]]> +; CHECK: Cost of 1 for VF 2: EMIT vp<[[VP9:%[0-9]+]]> = icmp eq ir<%l>, ir<-12> +; CHECK: Cost of 1 for VF 2: EMIT vp<[[VP10:%[0-9]+]]> = icmp eq ir<%l>, ir<13> +; CHECK: Cost of 1 for VF 2: EMIT vp<[[VP11:%[0-9]+]]> = icmp eq ir<%l>, ir<0> +; CHECK: Cost of 0 for VF 2: EMIT vp<[[VP12:%[0-9]+]]> = or vp<[[VP9]]>, vp<[[VP10]]> +; CHECK: Cost of 0 for VF 2: EMIT vp<[[VP13:%[0-9]+]]> = or vp<[[VP12]]>, vp<[[VP11]]> +; CHECK: Cost of 0 for VF 2: EMIT vp<[[VP14:%[0-9]+]]> = not vp<[[VP13]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP15:%[0-9]+]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP15]]>, ir<1>, vp<[[VP11]]> +; CHECK: Cost of 0 for VF 2: 
vp<[[VP16:%[0-9]+]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP16]]>, ir<0>, vp<[[VP10]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP17:%[0-9]+]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP17]]>, ir<42>, vp<[[VP9]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP18:%[0-9]+]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP18]]>, ir<2>, vp<[[VP14]]> +; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: vector loop backedge +; CHECK: Cost of 0 for VF 2: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEV (1 + ((-8 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 8)) +; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP4:%[0-9]+]]>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK: Cost of 0 for VF 2: IR %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 2: IR %l = load i64, ptr %ptr.iv, align 1 +; CHECK: Cost of 0 for VF 2: vp<[[VP4]]> = DERIVED-IV ir<%start> + vp<[[VP2]]> * ir<8> +; CHECK: Cost of 0 for VF 2: EMIT vp<%cmp.n> = icmp eq vp<[[VP3]]>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-cond vp<%cmp.n> +; CHECK: Cost of 0 for VF 4: induction instruction %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1 +; CHECK: Cost of 0 for VF 4: induction instruction %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] +; CHECK: Cost of 1 for VF 4: exit condition instruction %ec = icmp eq ptr %ptr.iv.next, %end +; CHECK: Cost of 0 for VF 4: vp<[[VP6]]> = DERIVED-IV ir<0> + vp<[[VP5]]> * ir<8> +; CHECK: Cost of 0 for VF 4: vp<[[VP7]]> = SCALAR-STEPS vp<[[VP6]]>, ir<8>, vp<[[VP0]]> +; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]> +; 
CHECK: Cost of 0 for VF 4: vp<[[VP8]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<[[VP8]]> +; CHECK: Cost of 1 for VF 4: EMIT vp<[[VP9]]> = icmp eq ir<%l>, ir<-12> +; CHECK: Cost of 1 for VF 4: EMIT vp<[[VP10]]> = icmp eq ir<%l>, ir<13> +; CHECK: Cost of 1 for VF 4: EMIT vp<[[VP11]]> = icmp eq ir<%l>, ir<0> +; CHECK: Cost of 0 for VF 4: EMIT vp<[[VP12]]> = or vp<[[VP9]]>, vp<[[VP10]]> +; CHECK: Cost of 0 for VF 4: EMIT vp<[[VP13]]> = or vp<[[VP12]]>, vp<[[VP11]]> +; CHECK: Cost of 0 for VF 4: EMIT vp<[[VP14]]> = not vp<[[VP13]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP15]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP15]]>, ir<1>, vp<[[VP11]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP16]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP16]]>, ir<0>, vp<[[VP10]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP17]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP17]]>, ir<42>, vp<[[VP9]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP18]]> = vector-pointer vp<%next.gep> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP18]]>, ir<2>, vp<[[VP14]]> +; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: vector loop backedge +; CHECK: Cost of 0 for VF 4: EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((-8 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 8)) +; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP4]]>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK: Cost of 0 for VF 4: IR %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 4: IR %l = load i64, ptr %ptr.iv, align 1 +; CHECK: Cost of 0 for VF 4: vp<[[VP4]]> = DERIVED-IV ir<%start> + vp<[[VP2]]> * ir<8> +; CHECK: Cost of 0 for VF 4: 
EMIT vp<%cmp.n> = icmp eq vp<[[VP3]]>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> +; +entry: + br label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] + %l = load i64, ptr %ptr.iv, align 1 + switch i64 %l, label %default [ + i64 -12, label %case1 + i64 13, label %case2 + i64 0, label %case3 + ] + +case1: + store i64 42, ptr %ptr.iv, align 1 + br label %loop.latch + +case2: + store i64 0, ptr %ptr.iv, align 1 + br label %loop.latch + +case3: + store i64 1, ptr %ptr.iv, align 1 + br label %loop.latch + +default: + store i64 2, ptr %ptr.iv, align 1 + br label %loop.latch + +loop.latch: + %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv.next, %end + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @test_vpinstruction_extractvalue_cost(ptr noalias %dst, {i64, i64} %sv) { +; CHECK-LABEL: 'test_vpinstruction_extractvalue_cost' +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %a = extractvalue { i64, i64 } %sv, 0 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %b = extractvalue { i64, i64 } %sv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add i64 %a, %b +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %add, ptr %g.dst, align 8 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 1000 +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop +; CHECK: Cost of 1 for VF 2: induction 
instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: Cost of 1 for VF 2: exit condition instruction %ec = icmp eq i64 %iv.next, 1000 +; CHECK: Cost of 0 for VF 2: vp<[[VP4:%[0-9]+]]> = SCALAR-STEPS vp<[[VP3:%[0-9]+]]>, ir<1>, vp<[[VP0:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 2: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%g.dst> +; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP5]]>, ir<%add> +; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]> +; CHECK: Cost of 0 for VF 2: vector loop backedge +; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP2]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 2: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 2: IR %a = extractvalue { i64, i64 } %sv, 0 +; CHECK: Cost of 0 for VF 2: IR %b = extractvalue { i64, i64 } %sv, 1 +; CHECK: Cost of 0 for VF 2: IR %add = add i64 %a, %b +; CHECK: Cost of 0 for VF 2: IR %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv +; CHECK: Cost of 0 for VF 2: IR store i64 %add, ptr %g.dst, align 8 +; CHECK: Cost of 0 for VF 2: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 2: IR %ec = icmp eq i64 %iv.next, 1000 +; CHECK: Cost of 0 for VF 2: CLONE ir<%a> = extractvalue ir<%sv> +; CHECK: Cost of 0 for VF 2: CLONE ir<%b> = extractvalue ir<%sv> +; CHECK: Cost of 1 for VF 2: CLONE ir<%add> = add ir<%a>, ir<%b> +; CHECK: Cost of 0 for VF 2: EMIT vp<%cmp.n> = icmp eq ir<1000>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-cond vp<%cmp.n> +; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: 
Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: Cost of 1 for VF 4: exit condition instruction %ec = icmp eq i64 %iv.next, 1000 +; CHECK: Cost of 0 for VF 4: vp<[[VP4]]> = SCALAR-STEPS vp<[[VP3]]>, ir<1>, vp<[[VP0]]> +; CHECK: Cost of 0 for VF 4: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> +; CHECK: Cost of 0 for VF 4: vp<[[VP5]]> = vector-pointer inbounds ir<%g.dst> +; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP5]]>, ir<%add> +; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: vector loop backedge +; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP2]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 4: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 4: IR %a = extractvalue { i64, i64 } %sv, 0 +; CHECK: Cost of 0 for VF 4: IR %b = extractvalue { i64, i64 } %sv, 1 +; CHECK: Cost of 0 for VF 4: IR %add = add i64 %a, %b +; CHECK: Cost of 0 for VF 4: IR %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv +; CHECK: Cost of 0 for VF 4: IR store i64 %add, ptr %g.dst, align 8 +; CHECK: Cost of 0 for VF 4: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK: Cost of 0 for VF 4: IR %ec = icmp eq i64 %iv.next, 1000 +; CHECK: Cost of 0 for VF 4: CLONE ir<%a> = extractvalue ir<%sv> +; CHECK: Cost of 0 for VF 4: CLONE ir<%b> = extractvalue ir<%sv> +; CHECK: Cost of 1 for VF 4: CLONE ir<%add> = add ir<%a>, ir<%b> +; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<1000>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> +; CHECK: Cost of 0 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<1000>, vp<[[VP2]]> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry 
], [ %iv.next, %loop ] + %a = extractvalue {i64, i64} %sv, 0 + %b = extractvalue {i64, i64} %sv, 1 + %add = add i64 %a, %b + %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv + store i64 %add, ptr %g.dst, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} From fb534780379c1ef735ed8b46f2bb11d813965b97 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 10 May 2026 14:54:25 -0700 Subject: [PATCH 201/538] Add missing direct includes for bit.h/SwapByteOrder.h. NFC (#196843) These translation units use llvm::endianness, llvm::byteswap, llvm::has_single_bit, or sys::IsLittleEndianHost without explicitly including the header that declares them. They currently compile only because llvm/ADT/Hashing.h transitively pulls in llvm/Support/SwapByteOrder.h (which includes llvm/ADT/bit.h). --- llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h | 1 + llvm/include/llvm/ObjectYAML/DWARFEmitter.h | 1 + llvm/include/llvm/Support/ELFAttributeParser.h | 1 + llvm/include/llvm/Support/VersionTuple.h | 1 + llvm/lib/Support/APInt.cpp | 1 + llvm/lib/Support/ConvertUTFWrapper.cpp | 1 + llvm/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp | 1 + llvm/unittests/Object/SymbolicFileTest.cpp | 1 + 8 files changed, 8 insertions(+) diff --git a/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h index 727cd9abf9272..4ad8c7b69576b 100644 --- a/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h +++ b/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h @@ -15,6 +15,7 @@ #ifndef LLVM_CODEGEN_SCOREBOARDHAZARDRECOGNIZER_H #define LLVM_CODEGEN_SCOREBOARDHAZARDRECOGNIZER_H +#include "llvm/ADT/bit.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/MC/MCInstrItineraries.h" #include diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h index 050ff60bcd408..5f62f32b8ddfa 100644 --- 
a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h +++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h @@ -17,6 +17,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Host.h" #include diff --git a/llvm/include/llvm/Support/ELFAttributeParser.h b/llvm/include/llvm/Support/ELFAttributeParser.h index c2ad812b5d632..8aac1f7692749 100644 --- a/llvm/include/llvm/Support/ELFAttributeParser.h +++ b/llvm/include/llvm/Support/ELFAttributeParser.h @@ -11,6 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/bit.h" #include "llvm/Support/Error.h" namespace llvm { diff --git a/llvm/include/llvm/Support/VersionTuple.h b/llvm/include/llvm/Support/VersionTuple.h index e4500a714d12b..a27241550836b 100644 --- a/llvm/include/llvm/Support/VersionTuple.h +++ b/llvm/include/llvm/Support/VersionTuple.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/bit.h" #include "llvm/Support/Compiler.h" #include #include diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 417db25af8c37..eb3762e396258 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp index 76ead00c977bd..21dcf779338f7 100644 --- a/llvm/lib/Support/ConvertUTFWrapper.cpp +++ b/llvm/lib/Support/ConvertUTFWrapper.cpp @@ -8,6 +8,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/bit.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/ErrorHandling.h" #include diff --git 
a/llvm/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp index 29d4db44e7cf9..22856b4ba23c9 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp @@ -14,6 +14,7 @@ #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Host.h" #include "gtest/gtest.h" #include diff --git a/llvm/unittests/Object/SymbolicFileTest.cpp b/llvm/unittests/Object/SymbolicFileTest.cpp index c3813b12b4476..3ae541d4fc78b 100644 --- a/llvm/unittests/Object/SymbolicFileTest.cpp +++ b/llvm/unittests/Object/SymbolicFileTest.cpp @@ -8,6 +8,7 @@ #include "llvm/Object/SymbolicFile.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Host.h" #include "gmock/gmock.h" From b66d798c4ab3ca9b9937483562701c76ffaa6821 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 10 May 2026 19:01:24 -0400 Subject: [PATCH 202/538] [libc] Fix a copyright comment typo (#196846) No behavior change. --- libc/src/__support/high_precision_decimal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/__support/high_precision_decimal.h b/libc/src/__support/high_precision_decimal.h index 75f2a7607b425..de22172fd8d3e 100644 --- a/libc/src/__support/high_precision_decimal.h +++ b/libc/src/__support/high_precision_decimal.h @@ -1,7 +1,7 @@ //===-- High Precision Decimal ----------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See httpss//llvm.org/LICENSE.txt for license information. +// See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// From dff356d47cfc4413f78c858dd8339cb1c9fca255 Mon Sep 17 00:00:00 2001 From: Daniil Dudkin Date: Mon, 11 May 2026 02:05:37 +0300 Subject: [PATCH 203/538] [clang-tidy] comment braced and parenthesized init arguments (#180408) Handle arguments like `{}`, `Type{}` and `Type()` in `bugprone-argument-comment` and add coverage for `initializer_list` and designated initializers. Fixes: https://github.com/llvm/llvm-project/issues/171842 --- .../bugprone/ArgumentCommentCheck.cpp | 136 ++++++++-- .../bugprone/ArgumentCommentCheck.h | 23 +- clang-tools-extra/docs/ReleaseNotes.rst | 10 +- .../checks/bugprone/argument-comment.rst | 154 ++++++++--- .../argument-comment-init-list-cxx20.cpp | 69 +++++ .../bugprone/argument-comment-init-list.cpp | 239 ++++++++++++++++++ 6 files changed, 560 insertions(+), 71 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list-cxx20.cpp create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp index e3139f96cfb09..8b78b5a50f0c8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp @@ -36,26 +36,35 @@ ArgumentCommentCheck::ArgumentCommentCheck(StringRef Name, : ClangTidyCheck(Name, Context), StrictMode(Options.get("StrictMode", false)), IgnoreSingleArgument(Options.get("IgnoreSingleArgument", false)), + CommentAnonymousInitLists( + Options.get("CommentAnonymousInitLists", false)), CommentBoolLiterals(Options.get("CommentBoolLiterals", false)), - CommentIntegerLiterals(Options.get("CommentIntegerLiterals", false)), + CommentCharacterLiterals(Options.get("CommentCharacterLiterals", 
false)), CommentFloatLiterals(Options.get("CommentFloatLiterals", false)), + CommentIntegerLiterals(Options.get("CommentIntegerLiterals", false)), + CommentNullPtrs(Options.get("CommentNullPtrs", false)), + CommentParenthesizedTemporaries( + Options.get("CommentParenthesizedTemporaries", false)), CommentStringLiterals(Options.get("CommentStringLiterals", false)), + CommentTypedInitLists(Options.get("CommentTypedInitLists", false)), CommentUserDefinedLiterals( Options.get("CommentUserDefinedLiterals", false)), - CommentCharacterLiterals(Options.get("CommentCharacterLiterals", false)), - CommentNullPtrs(Options.get("CommentNullPtrs", false)), IdentRE("^(/\\* *)([_A-Za-z][_A-Za-z0-9]*)( *= *\\*/)$") {} void ArgumentCommentCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "StrictMode", StrictMode); Options.store(Opts, "IgnoreSingleArgument", IgnoreSingleArgument); + Options.store(Opts, "CommentAnonymousInitLists", CommentAnonymousInitLists); Options.store(Opts, "CommentBoolLiterals", CommentBoolLiterals); - Options.store(Opts, "CommentIntegerLiterals", CommentIntegerLiterals); + Options.store(Opts, "CommentCharacterLiterals", CommentCharacterLiterals); Options.store(Opts, "CommentFloatLiterals", CommentFloatLiterals); + Options.store(Opts, "CommentIntegerLiterals", CommentIntegerLiterals); + Options.store(Opts, "CommentNullPtrs", CommentNullPtrs); + Options.store(Opts, "CommentParenthesizedTemporaries", + CommentParenthesizedTemporaries); Options.store(Opts, "CommentStringLiterals", CommentStringLiterals); + Options.store(Opts, "CommentTypedInitLists", CommentTypedInitLists); Options.store(Opts, "CommentUserDefinedLiterals", CommentUserDefinedLiterals); - Options.store(Opts, "CommentCharacterLiterals", CommentCharacterLiterals); - Options.store(Opts, "CommentNullPtrs", CommentNullPtrs); } void ArgumentCommentCheck::registerMatchers(MatchFinder *Finder) { @@ -199,21 +208,91 @@ static const FunctionDecl *resolveMocks(const FunctionDecl *Func) { 
return Func; } -// Given the argument type and the options determine if we should -// be adding an argument comment. -bool ArgumentCommentCheck::shouldAddComment(const Expr *Arg) const { - Arg = Arg->IgnoreImpCasts(); - if (isa(Arg)) - Arg = cast(Arg)->getSubExpr(); +namespace { + +enum class InitListKind { + None, + Anonymous, + Typed, +}; + +} // namespace + +static InitListKind getInitListKind(const Expr *Arg) { + Arg = Arg->IgnoreUnlessSpelledInSource(); + + if (const auto *StdInit = dyn_cast(Arg)) + Arg = StdInit->getSubExpr()->IgnoreUnlessSpelledInSource(); + + if (isa(Arg)) + return InitListKind::Anonymous; + + if (const auto *Ctor = dyn_cast(Arg)) { + if (!Ctor->isListInitialization()) + return InitListKind::None; + // CXXTemporaryObjectExpr corresponds to explicit Type{...} syntax. + if (isa(Ctor)) + return InitListKind::Typed; + // Other list-initialized constructions (for example '{}') have no + // explicit type at the call site. + return InitListKind::Anonymous; + } + + // std::initializer_list{...} is represented as a functional cast whose + // subexpression carries the list-initialization spelling. + if (const auto *FuncCast = dyn_cast(Arg)) { + const Expr *SubExpr = FuncCast->getSubExpr()->IgnoreImplicit(); + if (FuncCast->isListInitialization() || + isa(SubExpr)) + return InitListKind::Typed; + } + + return InitListKind::None; +} + +static bool isParenthesizedTemporary(const Expr *Arg) { + Arg = Arg->IgnoreUnlessSpelledInSource(); + if (const auto *TempObject = dyn_cast(Arg)) + return !TempObject->isListInitialization(); + // CXXFunctionalCastExpr with CXXParenListInitExpr corresponds to explicit + // Type(...) aggregate temporary initialization syntax. + const auto *FuncCast = dyn_cast(Arg); + return FuncCast && + isa(FuncCast->getSubExpr()->IgnoreImplicit()); +} + +// Given the argument type and the options determine if we should be adding an +// argument comment and which diagnostic wording to use. 
+ArgumentCommentCheck::CommentKind +ArgumentCommentCheck::shouldAddComment(const Expr *Arg) const { + const InitListKind Kind = getInitListKind(Arg); + const bool IsParenthesizedTemporary = isParenthesizedTemporary(Arg); + + // Strip implicit wrappers so brace-init arguments bound to references still + // look like list-initialization at this point. + Arg = Arg->IgnoreImplicit(); + if (const auto *UO = dyn_cast(Arg)) + Arg = UO->getSubExpr()->IgnoreImplicit(); if (Arg->getExprLoc().isMacroID()) - return false; - return (CommentBoolLiterals && isa(Arg)) || - (CommentIntegerLiterals && isa(Arg)) || - (CommentFloatLiterals && isa(Arg)) || - (CommentUserDefinedLiterals && isa(Arg)) || - (CommentCharacterLiterals && isa(Arg)) || - (CommentStringLiterals && isa(Arg)) || - (CommentNullPtrs && isa(Arg)); + return CommentKind::None; + + if ((CommentAnonymousInitLists && Kind == InitListKind::Anonymous) || + (CommentTypedInitLists && Kind == InitListKind::Typed) || + (CommentParenthesizedTemporaries && IsParenthesizedTemporary)) { + return CommentKind::NonLiteral; + } + + if ((CommentBoolLiterals && isa(Arg)) || + (CommentIntegerLiterals && isa(Arg)) || + (CommentFloatLiterals && isa(Arg)) || + (CommentUserDefinedLiterals && isa(Arg)) || + (CommentCharacterLiterals && isa(Arg)) || + (CommentStringLiterals && isa(Arg)) || + (CommentNullPtrs && isa(Arg))) { + return CommentKind::Literal; + } + + return CommentKind::None; } void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, @@ -283,7 +362,8 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, << Matches[2] << II; if (isLikelyTypo(Callee->parameters(), Matches[2], II->getName())) { Diag << FixItHint::CreateReplacement( - Comment.Loc, (Matches[1] + II->getName() + Matches[3]).str()); + Comment.Loc, + llvm::Twine(Matches[1] + II->getName() + Matches[3]).str()); } } diag(PVD->getLocation(), "%0 declared here", DiagnosticIDs::Note) << II; @@ -295,14 +375,18 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, 
} } - // If the argument comments are missing for literals add them. - if (Comments.empty() && shouldAddComment(Args[I])) { + // If the argument comments are missing for configured argument kinds, add + // them. + const CommentKind Kind = shouldAddComment(Args[I]); + if (Comments.empty() && Kind != CommentKind::None) { SmallString<32> ArgComment; - (llvm::Twine("/*") + II->getName() + "=*/").toStringRef(ArgComment); + llvm::Twine(llvm::Twine("/*") + II->getName() + "=*/") + .toStringRef(ArgComment); const DiagnosticBuilder Diag = diag(Args[I]->getBeginLoc(), - "argument comment missing for literal argument %0") - << II + "argument comment missing for %select{literal argument|" + "argument}0 %1") + << (Kind == CommentKind::Literal ? 0 : 1) << II << FixItHint::CreateInsertion(Args[I]->getBeginLoc(), ArgComment); } } diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h index 30fa32fad72e7..3bed1ad4247e5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h @@ -14,7 +14,8 @@ namespace clang::tidy::bugprone { -/// Checks that argument comments match parameter names. +/// Checks that argument comments match parameter names and can optionally add +/// missing comments for literals, init-lists, and constructed temporaries. /// /// The check understands argument comments in the form `/*parameter_name=*/` /// that are placed right before the argument. @@ -28,7 +29,8 @@ namespace clang::tidy::bugprone { /// 'foo' /// \endcode /// -/// The check tries to detect typos and suggest automated fixes for them. +/// The check tries to detect typos and suggest automated fixes for them. It can +/// also insert missing comments for configured argument kinds. 
class ArgumentCommentCheck : public ClangTidyCheck { public: ArgumentCommentCheck(StringRef Name, ClangTidyContext *Context); @@ -38,22 +40,31 @@ class ArgumentCommentCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override; private: + enum class CommentKind { + None, + Literal, + NonLiteral, + }; + const unsigned StrictMode : 1; const unsigned IgnoreSingleArgument : 1; + const unsigned CommentAnonymousInitLists : 1; const unsigned CommentBoolLiterals : 1; - const unsigned CommentIntegerLiterals : 1; + const unsigned CommentCharacterLiterals : 1; const unsigned CommentFloatLiterals : 1; + const unsigned CommentIntegerLiterals : 1; + const unsigned CommentNullPtrs : 1; + const unsigned CommentParenthesizedTemporaries : 1; const unsigned CommentStringLiterals : 1; + const unsigned CommentTypedInitLists : 1; const unsigned CommentUserDefinedLiterals : 1; - const unsigned CommentCharacterLiterals : 1; - const unsigned CommentNullPtrs : 1; llvm::Regex IdentRE; void checkCallArgs(ASTContext *Ctx, const FunctionDecl *Callee, SourceLocation ArgBeginLoc, llvm::ArrayRef Args); - bool shouldAddComment(const Expr *Arg) const; + CommentKind shouldAddComment(const Expr *Arg) const; }; } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 365f3b40e8ca8..c5a6857c7077f 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -273,8 +273,14 @@ Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Improved :doc:`bugprone-argument-comment - ` to also check for C++11 - inherited constructors. + `: + + - Checks for C++11 inherited constructors. + + - Adds `CommentAnonymousInitLists`, `CommentTypedInitLists`, and + `CommentParenthesizedTemporaries` options to comment braced-init list + arguments and explicit temporary constructions (for example, ``{}``, + ``Type{}``, and ``Type()``). 
- Improved :doc:`bugprone-bad-signal-to-kill-thread ` check by fixing false diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/argument-comment.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/argument-comment.rst index 8770d7224137a..a5863ab32c41f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/argument-comment.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/argument-comment.rst @@ -3,7 +3,8 @@ bugprone-argument-comment ========================= -Checks that argument comments match parameter names. +Checks that argument comments match parameter names and can optionally add +missing comments for literals, init-lists, and constructed temporaries. The check understands argument comments in the form ``/*parameter_name=*/`` that are placed right before the argument. @@ -17,7 +18,8 @@ that are placed right before the argument. f(/*bar=*/true); // warning: argument name 'bar' in comment does not match parameter name 'foo' -The check tries to detect typos and suggest automated fixes for them. +The check tries to detect typos and suggest automated fixes for them. It can +also insert missing comments for configured argument kinds. Options ------- @@ -32,6 +34,28 @@ Options When `true`, the check will ignore the single argument. Default is `false`. +.. option:: CommentAnonymousInitLists + + When `true`, the check will add argument comments in the format + ``/*ParameterName=*/`` right before anonymous braced-init list arguments + such as ``{}`` and ``{1, 2, 3}``. Default is `false`. + +Before: + +.. code-block:: c++ + + void foo(const std::vector &Dims); + + foo({}); + +After: + +.. code-block:: c++ + + void foo(const std::vector &Dims); + + foo(/*Dims=*/{}); + .. option:: CommentBoolLiterals When `true`, the check will add argument comments in the format @@ -54,27 +78,27 @@ After: foo(/*TurnKey=*/true, /*PressButton=*/false); -.. option:: CommentIntegerLiterals +.. 
option:: CommentCharacterLiterals When `true`, the check will add argument comments in the format - ``/*ParameterName=*/`` right before the integer literal argument. + ``/*ParameterName=*/`` right before the character literal argument. Default is `false`. Before: .. code-block:: c++ - void foo(int MeaningOfLife); + void foo(char *Character); - foo(42); + foo('A'); After: .. code-block:: c++ - void foo(int MeaningOfLife); + void foo(char *Character); - foo(/*MeaningOfLife=*/42); + foo(/*Character=*/'A'); .. option:: CommentFloatLiterals @@ -98,6 +122,84 @@ After: foo(/*Pi=*/3.14159); +.. option:: CommentIntegerLiterals + + When `true`, the check will add argument comments in the format + ``/*ParameterName=*/`` right before the integer literal argument. + Default is `false`. + +Before: + +.. code-block:: c++ + + void foo(int MeaningOfLife); + + foo(42); + +After: + +.. code-block:: c++ + + void foo(int MeaningOfLife); + + foo(/*MeaningOfLife=*/42); + +.. option:: CommentNullPtrs + + When `true`, the check will add argument comments in the format + ``/*ParameterName=*/`` right before the nullptr literal argument. + Default is `false`. + +Before: + +.. code-block:: c++ + + void foo(A* Value); + + foo(nullptr); + +After: + +.. code-block:: c++ + + void foo(A* Value); + + foo(/*Value=*/nullptr); + +.. option:: CommentParenthesizedTemporaries + + When `true`, the check will add argument comments in the format + ``/*ParameterName=*/`` right before explicit temporary constructions such as + ``Type()`` and ``Type(1, 2, 3)``. Default is `false`. + +Before: + +.. code-block:: c++ + + struct Dims { + Dims(); + Dims(int, int, int); + }; + + void foo(const Dims &DimsValue); + + foo(Dims()); + foo(Dims(1, 2, 3)); + +After: + +.. code-block:: c++ + + struct Dims { + Dims(); + Dims(int, int, int); + }; + + void foo(const Dims &DimsValue); + + foo(/*DimsValue=*/Dims()); + foo(/*DimsValue=*/Dims(1, 2, 3)); + .. 
option:: CommentStringLiterals When `true`, the check will add argument comments in the format @@ -124,27 +226,27 @@ After: foo(/*String=*/"Hello World"); foo(/*WideString=*/L"Hello World"); -.. option:: CommentCharacterLiterals +.. option:: CommentTypedInitLists When `true`, the check will add argument comments in the format - ``/*ParameterName=*/`` right before the character literal argument. - Default is `false`. + ``/*ParameterName=*/`` right before typed braced-init list arguments such + as ``Type{}``. Default is `false`. Before: .. code-block:: c++ - void foo(char *Character); + void foo(const std::vector &Dims); - foo('A'); + foo(std::vector{}); After: .. code-block:: c++ - void foo(char *Character); + void foo(const std::vector &Dims); - foo(/*Character=*/'A'); + foo(/*Dims=*/std::vector{}); .. option:: CommentUserDefinedLiterals @@ -171,25 +273,3 @@ After: double operator"" _km(long double); foo(/*Distance=*/402.0_km); - -.. option:: CommentNullPtrs - - When `true`, the check will add argument comments in the format - ``/*ParameterName=*/`` right before the nullptr literal argument. - Default is `false`. - -Before: - -.. code-block:: c++ - - void foo(A* Value); - - foo(nullptr); - -After: - -.. 
code-block:: c++ - - void foo(A* Value); - - foo(/*Value=*/nullptr); diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list-cxx20.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list-cxx20.cpp new file mode 100644 index 0000000000000..f143976ed6b0c --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list-cxx20.cpp @@ -0,0 +1,69 @@ +// RUN: %check_clang_tidy -check-suffix=OFF -std=c++20-or-later %s bugprone-argument-comment %t +// RUN: %check_clang_tidy -check-suffix=ANON -std=c++20-or-later %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentAnonymousInitLists: true}}" -- +// RUN: %check_clang_tidy -check-suffix=TYPED -std=c++20-or-later %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentTypedInitLists: true}}" -- +// RUN: %check_clang_tidy -check-suffix=TEMP -std=c++20-or-later %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentParenthesizedTemporaries: true}}" -- +// RUN: %check_clang_tidy -check-suffix=BOTH-INIT -std=c++20-or-later %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentAnonymousInitLists: true, \ +// RUN: bugprone-argument-comment.CommentTypedInitLists: true}}" -- +// RUN: %check_clang_tidy -check-suffix=ALL -std=c++20-or-later %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentAnonymousInitLists: true, \ +// RUN: bugprone-argument-comment.CommentTypedInitLists: true, \ +// RUN: bugprone-argument-comment.CommentParenthesizedTemporaries: true}}" -- + +struct T { + int value; +}; + +struct Agg { + int x; + int y; +}; + +void foo_designated(T some_arg, const Agg &dims); + +void test_designated_init() { + T some_arg{0}; + + 
foo_designated(some_arg, /*dim=*/Agg{.x = 1}); + // CHECK-MESSAGES-OFF: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-OFF: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-ANON: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ANON: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-TYPED: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TYPED: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-TEMP: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TEMP: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-BOTH-INIT: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-BOTH-INIT: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-ALL: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ALL: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + + foo_designated(some_arg, Agg{.x = 1}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED: [[@LINE-3]]:28: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TYPED: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:28: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:28: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: 
foo_designated(some_arg, /*dims=*/Agg{.x = 1}); + + foo_designated(some_arg, Agg(1, 2)); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-3]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP: [[@LINE-4]]:28: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TEMP: foo_designated(some_arg, /*dims=*/Agg(1, 2)); + // CHECK-MESSAGES-BOTH-INIT-NOT: :[[@LINE-6]]:28: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ALL: [[@LINE-7]]:28: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_designated(some_arg, /*dims=*/Agg(1, 2)); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list.cpp new file mode 100644 index 0000000000000..a29f426b93711 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-init-list.cpp @@ -0,0 +1,239 @@ +// RUN: %check_clang_tidy -check-suffix=OFF %s bugprone-argument-comment %t +// RUN: %check_clang_tidy -check-suffix=ANON %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentAnonymousInitLists: true}}" -- +// RUN: %check_clang_tidy -check-suffix=TYPED %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentTypedInitLists: true}}" -- +// RUN: %check_clang_tidy -check-suffix=TEMP %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentParenthesizedTemporaries: true}}" -- +// RUN: %check_clang_tidy -check-suffix=BOTH-INIT %s bugprone-argument-comment %t -- \ +// RUN: 
-config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentAnonymousInitLists: true, \ +// RUN: bugprone-argument-comment.CommentTypedInitLists: true}}" -- +// RUN: %check_clang_tidy -check-suffix=ALL %s bugprone-argument-comment %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-argument-comment.CommentAnonymousInitLists: true, \ +// RUN: bugprone-argument-comment.CommentTypedInitLists: true, \ +// RUN: bugprone-argument-comment.CommentParenthesizedTemporaries: true}}" -- + +#include +#include + +struct T { + int value; +}; + +struct Dims { + Dims(); + Dims(int, int, int); +}; + +void foo(T some_arg, const std::vector &dims); +void foo_dims(T some_arg, const Dims &dims); +void foo_init_list(T some_arg, std::initializer_list dims); +void foo_nested_init_list(T some_arg, + std::initializer_list> dims); +void foo_int(T some_arg, int value); +template +void foo_template(T some_arg, const std::vector &dims); +template +void foo_template_typed(T some_arg, const DimsTy &dims); + +void test_braced_init_list() { + T some_arg{0}; + + // Mismatched explicit argument comments are validated independently of the + // missing-comment options. 
+ foo(some_arg, /*dim=*/{}); + // CHECK-MESSAGES-OFF: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-OFF: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-ANON: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ANON: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-TYPED: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TYPED: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-TEMP: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TEMP: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-BOTH-INIT: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-BOTH-INIT: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-ALL: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ALL: foo(some_arg, /*dims=*/{}); + + foo(some_arg, /*dim=*/std::vector{}); + // CHECK-MESSAGES-OFF: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-OFF: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-ANON: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ANON: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-TYPED: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TYPED: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-TEMP: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TEMP: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-BOTH-INIT: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-BOTH-INIT: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-ALL: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ALL: foo(some_arg, 
/*dims=*/std::vector{}); + + foo(some_arg, {}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:17: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON: [[@LINE-2]]:17: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ANON: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-4]]:17: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:17: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:17: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:17: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo(some_arg, /*dims=*/{}); + + foo(some_arg, std::vector{}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:17: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:17: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED: [[@LINE-3]]:17: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TYPED: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:17: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:17: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:17: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo(some_arg, /*dims=*/std::vector{}); +} + +void test_initializer_list() { + T some_arg{0}; + + foo_init_list(some_arg, {1, 2, 3}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:27: warning: argument comment missing for argument 'dims' + // 
CHECK-MESSAGES-ANON: [[@LINE-2]]:27: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ANON: foo_init_list(some_arg, /*dims=*/{1, 2, 3}); + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-4]]:27: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:27: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:27: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_init_list(some_arg, /*dims=*/{1, 2, 3}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:27: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_init_list(some_arg, /*dims=*/{1, 2, 3}); + + foo_init_list(some_arg, std::initializer_list{1, 2, 3}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:27: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:27: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED: [[@LINE-3]]:27: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TYPED: foo_init_list(some_arg, /*dims=*/std::initializer_list{1, 2, 3}); + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:27: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:27: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_init_list(some_arg, /*dims=*/std::initializer_list{1, 2, 3}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:27: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_init_list(some_arg, /*dims=*/std::initializer_list{1, 2, 3}); +} + +void test_nested_initializer_list() { + T some_arg{0}; + + foo_nested_init_list(some_arg, {{1, 2}, {3, 4}}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:34: warning: argument comment missing for 
argument 'dims' + // CHECK-MESSAGES-ANON: [[@LINE-2]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ANON: foo_nested_init_list(some_arg, /*dims=*/{{.*}}); + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-4]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_nested_init_list(some_arg, /*dims=*/{{.*}}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_nested_init_list(some_arg, /*dims=*/{{.*}}); +} + +void test_parenthesized_temporary() { + T some_arg{0}; + + foo_dims(some_arg, /*dim=*/Dims()); + // CHECK-MESSAGES-OFF: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-OFF: foo_dims(some_arg, /*dims=*/Dims()); + // CHECK-MESSAGES-ANON: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ANON: foo_dims(some_arg, /*dims=*/Dims()); + // CHECK-MESSAGES-TYPED: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TYPED: foo_dims(some_arg, /*dims=*/Dims()); + // CHECK-MESSAGES-TEMP: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-TEMP: foo_dims(some_arg, /*dims=*/Dims()); + // CHECK-MESSAGES-BOTH-INIT: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-BOTH-INIT: foo_dims(some_arg, /*dims=*/Dims()); + // CHECK-MESSAGES-ALL: warning: argument name 'dim' in comment does not match parameter name 'dims' + // CHECK-FIXES-ALL: foo_dims(some_arg, /*dims=*/Dims()); + + foo_dims(some_arg, Dims{}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:22: warning: argument 
comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED: [[@LINE-3]]:22: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TYPED: foo_dims(some_arg, /*dims=*/Dims{}); + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:22: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_dims(some_arg, /*dims=*/Dims{}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:22: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_dims(some_arg, /*dims=*/Dims{}); + + foo_dims(some_arg, Dims()); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-3]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP: [[@LINE-4]]:22: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TEMP: foo_dims(some_arg, /*dims=*/Dims()); + // CHECK-MESSAGES-BOTH-INIT-NOT: :[[@LINE-6]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ALL: [[@LINE-7]]:22: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_dims(some_arg, /*dims=*/Dims()); + + foo_dims(some_arg, Dims(1, 2, 3)); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-3]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP: [[@LINE-4]]:22: warning: 
argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TEMP: foo_dims(some_arg, /*dims=*/Dims(1, 2, 3)); + // CHECK-MESSAGES-BOTH-INIT-NOT: :[[@LINE-6]]:22: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ALL: [[@LINE-7]]:22: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_dims(some_arg, /*dims=*/Dims(1, 2, 3)); + + foo_int(some_arg, int(1)); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:21: warning: argument comment missing for argument 'value' + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-2]]:21: warning: argument comment missing for argument 'value' + // CHECK-MESSAGES-ALL-NOT: :[[@LINE-3]]:21: warning: argument comment missing for argument 'value' +} + +template +void test_template_dependent_init_list() { + T some_arg{0}; + + foo_template(some_arg, {}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON: [[@LINE-2]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ANON: foo_template(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-TYPED-NOT: :[[@LINE-4]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_template(some_arg, /*dims=*/{}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_template(some_arg, /*dims=*/{}); + + foo_template(some_arg, std::vector{}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:34: warning: argument comment missing for argument 'dims' + // 
CHECK-MESSAGES-TYPED: [[@LINE-3]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TYPED: foo_template(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:34: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_template(some_arg, /*dims=*/std::vector{}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:34: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_template(some_arg, /*dims=*/std::vector{}); +} + +template +void test_template_dependent_typed_init_list() { + T some_arg{0}; + + foo_template_typed(some_arg, DimsTy{1, 2, 3}); + // CHECK-MESSAGES-OFF-NOT: :[[@LINE-1]]:40: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-ANON-NOT: :[[@LINE-2]]:40: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-TYPED: [[@LINE-3]]:40: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-TYPED: foo_template_typed(some_arg, /*dims=*/DimsTy{1, 2, 3}); + // CHECK-MESSAGES-TEMP-NOT: :[[@LINE-5]]:40: warning: argument comment missing for argument 'dims' + // CHECK-MESSAGES-BOTH-INIT: [[@LINE-6]]:40: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-BOTH-INIT: foo_template_typed(some_arg, /*dims=*/DimsTy{1, 2, 3}); + // CHECK-MESSAGES-ALL: [[@LINE-8]]:40: warning: argument comment missing for argument 'dims' [bugprone-argument-comment] + // CHECK-FIXES-ALL: foo_template_typed(some_arg, /*dims=*/DimsTy{1, 2, 3}); +} + +template void test_template_dependent_init_list(); +template void test_template_dependent_typed_init_list>(); From 578ba5f3039e7a1e4e6f3a82891b3838823e37c2 Mon Sep 17 00:00:00 2001 From: Mingjie Xu Date: Mon, 11 May 2026 09:12:43 +0800 Subject: 
[PATCH 204/538] [ADT] Avoid map storage for small SmallMapVector (#196473) SmallMapVector previously used SmallDenseMap for its index, which still initializes and maintains map storage even when the number of entries is tiny. Teach MapVector to support a vector-only small mode. While the entry count stays within the configured small size, operations use the underlying vector directly. When the size grows past the threshold, the map index is built and subsequent operations use the regular MapVector path. This mirrors the small-size strategy used by SmallSetVector. --- llvm/include/llvm/ADT/MapVector.h | 88 +++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/ADT/MapVector.h b/llvm/include/llvm/ADT/MapVector.h index 2b2f098dd3abf..8a7f5edf2f710 100644 --- a/llvm/include/llvm/ADT/MapVector.h +++ b/llvm/include/llvm/ADT/MapVector.h @@ -18,6 +18,7 @@ #define LLVM_ADT_MAPVECTOR_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include #include @@ -32,7 +33,8 @@ namespace llvm { /// mapping is done with DenseMap from Keys to indexes in that vector. template , - typename VectorType = SmallVector, 0>> + typename VectorType = SmallVector, 0>, + unsigned N = 0> class MapVector { public: using key_type = KeyT; @@ -108,8 +110,8 @@ class MapVector { [[nodiscard]] ValueT lookup(const KeyT &Key) const { static_assert(std::is_copy_constructible_v, "Cannot call lookup() if ValueT is not copyable."); - typename MapType::const_iterator Pos = Map.find(Key); - return Pos == Map.end()? ValueT() : Vector[Pos->second].second; + auto I = find(Key); + return I == end() ? 
ValueT() : I->second; } template @@ -144,7 +146,7 @@ class MapVector { } [[nodiscard]] bool contains(const KeyT &Key) const { - return Map.find(Key) != Map.end(); + return find(Key) != end(); } [[nodiscard]] size_type count(const KeyT &Key) const { @@ -152,11 +154,19 @@ class MapVector { } [[nodiscard]] iterator find(const KeyT &Key) { + if constexpr (canBeSmall()) + if (isSmall()) + return findInVector(Vector, Key); + typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end() ? Vector.end() : (Vector.begin() + Pos->second); } [[nodiscard]] const_iterator find(const KeyT &Key) const { + if constexpr (canBeSmall()) + if (isSmall()) + return findInVector(Vector, Key); + typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end() ? Vector.end() : (Vector.begin() + Pos->second); } @@ -164,21 +174,27 @@ class MapVector { /// at - Return the entry for the specified key, or abort if no such /// entry exists. [[nodiscard]] ValueT &at(const KeyT &Key) { - auto Pos = Map.find(Key); - assert(Pos != Map.end() && "MapVector::at failed due to a missing key"); - return Vector[Pos->second].second; + auto I = find(Key); + assert(I != end() && "MapVector::at failed due to a missing key"); + return I->second; } /// at - Return the entry for the specified key, or abort if no such /// entry exists. [[nodiscard]] const ValueT &at(const KeyT &Key) const { - auto Pos = Map.find(Key); - assert(Pos != Map.end() && "MapVector::at failed due to a missing key"); - return Vector[Pos->second].second; + auto I = find(Key); + assert(I != end() && "MapVector::at failed due to a missing key"); + return I->second; } /// Remove the last element from the vector. void pop_back() { + if constexpr (canBeSmall()) + if (isSmall()) { + Vector.pop_back(); + return; + } + typename MapType::iterator Pos = Map.find(Vector.back().first); Map.erase(Pos); Vector.pop_back(); @@ -192,6 +208,10 @@ class MapVector { /// \note This is a deceivingly expensive operation (linear time). 
It's /// usually better to use \a remove_if() if possible. typename VectorType::iterator erase(typename VectorType::iterator Iterator) { + if constexpr (canBeSmall()) + if (isSmall()) + return Vector.erase(Iterator); + Map.erase(Iterator->first); auto Next = Vector.erase(Iterator); if (Next == Vector.end()) @@ -225,15 +245,47 @@ class MapVector { template void remove_if(Predicate Pred); private: + template + [[nodiscard]] static auto findInVector(VectorT &Vec, const LookupKeyT &Key) { + return find_if(Vec, [&Key](const auto &P) { return P.first == Key; }); + } + + [[nodiscard]] static constexpr bool canBeSmall() { return N != 0; } + + [[nodiscard]] bool isSmall() const { return Map.empty(); } + + void makeBig() { + if constexpr (canBeSmall()) { + unsigned Index = 0; + for (const auto &entry : Vector) + Map[entry.first] = Index++; + } + } + MapType Map; VectorType Vector; + static_assert(N <= 32, "Small size should be less than or equal to 32!"); + static_assert( std::is_integral_v, "The mapped_type of the specified Map must be an integral type"); template std::pair try_emplace_impl(KeyArgT &&Key, Ts &&...Args) { + if constexpr (canBeSmall()) + if (isSmall()) { + auto I = findInVector(Vector, Key); + if (I != Vector.end()) + return {I, false}; + Vector.emplace_back(std::piecewise_construct, + std::forward_as_tuple(std::forward(Key)), + std::forward_as_tuple(std::forward(Args)...)); + if (Vector.size() > N) + makeBig(); + return {std::prev(end()), true}; + } + auto [It, Inserted] = Map.try_emplace(Key); if (Inserted) { It->second = Vector.size(); @@ -246,9 +298,16 @@ class MapVector { } }; -template +template template -void MapVector::remove_if(Function Pred) { +void MapVector::remove_if(Function Pred) { + if constexpr (canBeSmall()) + if (isSmall()) { + Vector.erase(llvm::remove_if(Vector, Pred), Vector.end()); + return; + } + auto O = Vector.begin(); for (auto I = O, E = Vector.end(); I != E; ++I) { if (Pred(*I)) { @@ -271,9 +330,8 @@ void 
MapVector::remove_if(Function Pred) { /// A MapVector that performs no allocations if smaller than a certain /// size. template -struct SmallMapVector - : MapVector, - SmallVector, N>> { +struct SmallMapVector : MapVector, + SmallVector, N>, N> { }; } // end namespace llvm From ed202103309fb2c1bf846bd2794dfc0bc6448031 Mon Sep 17 00:00:00 2001 From: Cheng Lingfei <53817093+clingfei@users.noreply.github.com> Date: Mon, 11 May 2026 10:20:05 +0800 Subject: [PATCH 205/538] [clangd][Parser][Sema] Fix TemplateIdAnnotation UAF with template-id declarator and lambda default argument (#196788) I think this is another case of template annotations lifetime bug, similar to the one fixed by https://github.com/llvm/llvm-project/pull/89494. Closes https://github.com/llvm/llvm-project/issues/196725. --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Parse/ParseDecl.cpp | 8 ++++++++ clang/test/Parser/cxx-default-args.cpp | 6 ++++++ 3 files changed, 15 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c17143e3c0398..40b97b4c8bf4b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -593,6 +593,7 @@ Bug Fixes to C++ Support - Fixed a crash in Itanium C++ name mangling for a lambda in a local class field initializer inside a constructor/destructor. (#GH176395) - Fixed crashes in Itanium C++ name mangling for lambdas with trailing requires-clauses involving requires-expressions. (#GH100774) (#GH123854) - Fixed an invalid rejection and assertion failure while generating ``operator=`` for fields with the ``__restrict`` qualifier. (#GH37979) +- Fixed a use-after-free bug when parsing default arguments containing lambdas in declarations with template-id declarators. 
(#GH196725) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 55ea562faacaa..1a04ca7f43647 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -7748,6 +7748,14 @@ void Parser::ParseParameterDeclarationClause( // Consume the '='. ConsumeToken(); + // The default argument may contain a lambda whose body triggers + // MaybeDestroyTemplateIds at the end of the inner statements; avoid + // destroying parsed template-ids that may still be referenced by + // the enclosing declarator (e.g. a template-id in the function + // name or other parameters). + DelayTemplateIdDestructionRAII DontDestructTemplateIds( + *this, /*DelayTemplateIdDestruction=*/true); + // The argument isn't actually potentially evaluated unless it is // used. EnterExpressionEvaluationContext Eval( diff --git a/clang/test/Parser/cxx-default-args.cpp b/clang/test/Parser/cxx-default-args.cpp index 5b7d22a56bb91..0a4dbe19d8d54 100644 --- a/clang/test/Parser/cxx-default-args.cpp +++ b/clang/test/Parser/cxx-default-args.cpp @@ -40,3 +40,9 @@ struct U { void i(int x = ) {} // expected-error{{expected expression}} typedef int *fp(int x = ); // expected-error{{default arguments can only be specified for parameters in a function declaration}} }; + +namespace GH196725 { +template void f(); +template <> void f(int = []{ ; return 0; }()) {} // expected-error{{no function template matches function template specialization 'f'}} \ + // expected-note@-1{{candidate template ignored}} +} From 3a7c0eba9bf8f118b161c5634ce45da50433fa06 Mon Sep 17 00:00:00 2001 From: GkvJwa Date: Mon, 11 May 2026 10:46:39 +0800 Subject: [PATCH 206/538] [clang] Add arm64_neon.h wrapper on windows (#196014) Add an MSVC-compatible resource header that forwards to Clang's generated . 
This lets ARM64 Windows code using the MSVC header name lower NEON intrinsics through Clang builtins instead of leaving external neon_* calls such as neon_ld1m4_q32 Fixes #195683 --- clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/arm64_neon.h | 21 +++++++++++++++++++++ clang/lib/Headers/module.modulemap | 6 ++++++ clang/test/CodeGen/arm64-neon-header.c | 14 ++++++++++++++ 4 files changed, 42 insertions(+) create mode 100644 clang/lib/Headers/arm64_neon.h create mode 100644 clang/test/CodeGen/arm64-neon-header.c diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d60ae2b5961e0..ce34f8b9410a7 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -55,6 +55,7 @@ set(arm_only_files ) set(aarch64_only_files + arm64_neon.h arm64intr.h arm_neon_sve_bridge.h ) diff --git a/clang/lib/Headers/arm64_neon.h b/clang/lib/Headers/arm64_neon.h new file mode 100644 index 0000000000000..29803a8e455ae --- /dev/null +++ b/clang/lib/Headers/arm64_neon.h @@ -0,0 +1,21 @@ +/*===---- arm64_neon.h - ARM64 NEON intrinsics -----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Only include this if we're compiling for the windows platform. 
*/ +#ifndef _MSC_VER +#include_next +#else + +#ifndef __ARM64_NEON_H +#define __ARM64_NEON_H + +#include + +#endif /* __ARM64_NEON_H */ +#endif /* _MSC_VER */ diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 3fcaa55f1110e..c8f96df1672c1 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -50,6 +50,12 @@ module _Builtin_intrinsics [system] [extern_c] { header "arm64intr.h" export * + + explicit module neon { + requires neon + header "arm64_neon.h" + export * + } } explicit module intel { diff --git a/clang/test/CodeGen/arm64-neon-header.c b/clang/test/CodeGen/arm64-neon-header.c new file mode 100644 index 0000000000000..3560bfeaad814 --- /dev/null +++ b/clang/test/CodeGen/arm64-neon-header.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -triple arm64-pc-windows-msvc -target-feature +neon \ +// RUN: -fms-compatibility -fms-compatibility-version=19.00 \ +// RUN: -emit-llvm -o - %s | FileCheck %s + +#include + +// CHECK-LABEL: define{{.*}} @test_vld1q_s32_x4( +// CHECK-NOT: neon_ld1m4_q32 +// CHECK: call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr {{.*}}) +// CHECK-NOT: neon_ld1m4_q32 +// CHECK: ret +int32x4x4_t test_vld1q_s32_x4(int32_t const *a) { + return vld1q_s32_x4(a); +} From 084a5acf5a076aa32c04cbcdca25c27fc75d8e6d Mon Sep 17 00:00:00 2001 From: GkvJwa Date: Mon, 11 May 2026 11:28:00 +0800 Subject: [PATCH 207/538] [clang][test] Add AArch64 requirement to arm64_neon.h test (#196867) Only run test when the AArch64 target is built --- clang/test/CodeGen/arm64-neon-header.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/test/CodeGen/arm64-neon-header.c b/clang/test/CodeGen/arm64-neon-header.c index 3560bfeaad814..96208ea481b8d 100644 --- a/clang/test/CodeGen/arm64-neon-header.c +++ b/clang/test/CodeGen/arm64-neon-header.c @@ -2,6 +2,8 @@ // RUN: -fms-compatibility -fms-compatibility-version=19.00 \ // RUN: -emit-llvm -o - %s | FileCheck %s 
+// REQUIRES: aarch64-registered-target + #include // CHECK-LABEL: define{{.*}} @test_vld1q_s32_x4( From dccd300e58f2537a124334fae6b946d04c988dfe Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Mon, 11 May 2026 09:46:54 +0530 Subject: [PATCH 208/538] [LV][NFC] Reshape pointer_iv_non_uniform_0 test to use distinct loads (#196494) The followup [patch](https://github.com/llvm/llvm-project/pull/196080) is folding some of the idempotent binary ops This test has `sub x - x` operation which is affected by the followup patch. This patch is making the test immune to the fold. --- .../LoopVectorize/consecutive-ptr-uniforms.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index bc5410b9ce93d..6c6bb1f9a1cda 100644 --- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -977,7 +977,7 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i64 1 ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i64 2 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i64 3 -; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP40]], [[TMP40]] +; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP12]], [[TMP40]] ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i64 0 ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i64 1 ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i64 2 @@ -1019,7 +1019,7 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; CHECK-NEXT: [[UNNAMEDTMP05:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 5 ; CHECK-NEXT: [[UNNAMEDTMP06:%.*]] = load i32, ptr [[UNNAMEDTMP05]], align 8 ; CHECK-NEXT: [[UNNAMEDTMP07:%.*]] = sub i32 [[UNNAMEDTMP4]], [[UNNAMEDTMP00]] -; CHECK-NEXT: 
[[UNNAMEDTMP08:%.*]] = sub i32 [[UNNAMEDTMP04]], [[UNNAMEDTMP04]] +; CHECK-NEXT: [[UNNAMEDTMP08:%.*]] = sub i32 [[UNNAMEDTMP00]], [[UNNAMEDTMP04]] ; CHECK-NEXT: [[UNNAMEDTMP09:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 2 ; CHECK-NEXT: store i32 [[UNNAMEDTMP07]], ptr [[UNNAMEDTMP09]], align 8 ; CHECK-NEXT: [[UNNAMEDTMP10:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 3 @@ -1071,7 +1071,7 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; INTER-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 ; INTER-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2 ; INTER-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3 -; INTER-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC4]] +; INTER-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]] ; INTER-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0 ; INTER-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1 ; INTER-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2 @@ -1112,7 +1112,7 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; INTER-NEXT: [[UNNAMEDTMP05:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 5 ; INTER-NEXT: [[UNNAMEDTMP06:%.*]] = load i32, ptr [[UNNAMEDTMP05]], align 8 ; INTER-NEXT: [[UNNAMEDTMP07:%.*]] = sub i32 [[UNNAMEDTMP4]], [[UNNAMEDTMP00]] -; INTER-NEXT: [[UNNAMEDTMP08:%.*]] = sub i32 [[UNNAMEDTMP04]], [[UNNAMEDTMP04]] +; INTER-NEXT: [[UNNAMEDTMP08:%.*]] = sub i32 [[UNNAMEDTMP00]], [[UNNAMEDTMP04]] ; INTER-NEXT: [[UNNAMEDTMP09:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 2 ; INTER-NEXT: store i32 [[UNNAMEDTMP07]], ptr [[UNNAMEDTMP09]], align 8 ; INTER-NEXT: [[UNNAMEDTMP10:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 3 @@ -1137,7 +1137,7 @@ for.body: %tmp05 = getelementptr inbounds i32, ptr %p, i32 5 %tmp06 = load i32, ptr %tmp05, align 8 %tmp07 = sub i32 %tmp04, %tmp00 - %tmp08 = sub i32 %tmp02, %tmp02 + %tmp08 = sub i32 %tmp00, %tmp02 %tmp09 = 
getelementptr inbounds i32, ptr %p, i32 2 store i32 %tmp07, ptr %tmp09, align 8 %tmp10 = getelementptr inbounds i32, ptr %p, i32 3 From 21c75f0b2e5472ed368ea8fbee81d05af089e5e7 Mon Sep 17 00:00:00 2001 From: XinlongZHANG-Bob <47516944+XinlongZHANG-Bob@users.noreply.github.com> Date: Mon, 11 May 2026 12:39:01 +0800 Subject: [PATCH 209/538] [InstCombine][NFC] Change the order of checks in SliceUpIllegalIntegerPHI for faster compile time. (#183726) SliceUpIllegalIntegerPHI searches for PHIs that have illegal type and are only used by trunc or trunc(lshr) operations. It bails out if it encounters invoke or EH pad instructions. It first checks whether it encounters invoke or EH pad, which is time consuming as it checks every instruction. Then it checks whether it is used by trunc or trunc(lshr). The former check is generally loose, while the latter one is stricter. Switching the order of the checks speeds up compilation. Signed-off-by: XinlongZHANG-Bob --- .../Transforms/InstCombine/InstCombinePHI.cpp | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 99fc7fcdb80e4..c4996e2583f2c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1090,32 +1090,6 @@ Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) { PHINode *PN = PHIsToSlice[PHIId]; - // Scan the input list of the PHI. If any input is an invoke, and if the - // input is defined in the predecessor, then we won't be split the critical - // edge which is required to insert a truncate. Because of this, we have to - // bail out. 
- for (auto Incoming : zip(PN->blocks(), PN->incoming_values())) { - BasicBlock *BB = std::get<0>(Incoming); - Value *V = std::get<1>(Incoming); - InvokeInst *II = dyn_cast(V); - if (!II) - continue; - if (II->getParent() != BB) - continue; - - // If we have a phi, and if it's directly in the predecessor, then we have - // a critical edge where we need to put the truncate. Since we can't - // split the edge in instcombine, we have to bail out. - return nullptr; - } - - // If the incoming value is a PHI node before a catchswitch, we cannot - // extract the value within that BB because we cannot insert any non-PHI - // instructions in the BB. - for (auto *Pred : PN->blocks()) - if (!Pred->hasInsertionPt()) - return nullptr; - for (User *U : PN->users()) { Instruction *UserI = cast(U); @@ -1148,6 +1122,34 @@ Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } } + for (const auto &PN : PHIsToSlice) { + // Scan the input list of the PHI. If any input is an invoke, and if the + // input is defined in the predecessor, then we won't be split the critical + // edge which is required to insert a truncate. Because of this, we have to + // bail out. + for (auto Incoming : zip(PN->blocks(), PN->incoming_values())) { + BasicBlock *BB = std::get<0>(Incoming); + Value *V = std::get<1>(Incoming); + InvokeInst *II = dyn_cast(V); + if (!II) + continue; + if (II->getParent() != BB) + continue; + + // If we have a phi, and if it's directly in the predecessor, then we have + // a critical edge where we need to put the truncate. Since we can't + // split the edge in instcombine, we have to bail out. + return nullptr; + } + + // If the incoming value is a PHI node before a catchswitch, we cannot + // extract the value within that BB because we cannot insert any non-PHI + // instructions in the BB. + for (auto *Pred : PN->blocks()) + if (!Pred->hasInsertionPt()) + return nullptr; + } + // If we have no users, they must be all self uses, just nuke the PHI. 
if (PHIUsers.empty()) return replaceInstUsesWith(FirstPhi, PoisonValue::get(FirstPhi.getType())); From 1bcdc4bf0b174ea5744bb6d27d4cf2d04aaf9266 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Mon, 11 May 2026 12:41:50 +0800 Subject: [PATCH 210/538] [NFC] Fix C++23 build failures caused by incomplete types (#196814) --- clang/lib/Interpreter/IncrementalParser.h | 5 +---- clang/tools/libclang/CIndexDiagnostic.cpp | 3 +++ clang/tools/libclang/CIndexDiagnostic.h | 3 +-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6 ++++++ llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 6 ++++++ llvm/lib/Target/BPF/BPFAsmPrinter.h | 4 ++-- .../TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp | 3 +++ .../utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h | 3 ++- 8 files changed, 24 insertions(+), 9 deletions(-) diff --git a/clang/lib/Interpreter/IncrementalParser.h b/clang/lib/Interpreter/IncrementalParser.h index 9b042bc494efb..a2b654d2ac0f4 100644 --- a/clang/lib/Interpreter/IncrementalParser.h +++ b/clang/lib/Interpreter/IncrementalParser.h @@ -14,15 +14,12 @@ #define LLVM_CLANG_LIB_INTERPRETER_INCREMENTALPARSER_H #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Error.h" #include #include -namespace llvm { -class Module; -} - namespace clang { class ASTConsumer; class CompilerInstance; diff --git a/clang/tools/libclang/CIndexDiagnostic.cpp b/clang/tools/libclang/CIndexDiagnostic.cpp index d37597e747a84..22fc8f5f9c163 100644 --- a/clang/tools/libclang/CIndexDiagnostic.cpp +++ b/clang/tools/libclang/CIndexDiagnostic.cpp @@ -27,6 +27,9 @@ using namespace clang::cxloc; using namespace clang::cxdiag; using namespace llvm; +CXDiagnosticSetImpl::CXDiagnosticSetImpl(bool isManaged) + : IsExternallyManaged(isManaged) {} + CXDiagnosticSetImpl::~CXDiagnosticSetImpl() {} void diff --git a/clang/tools/libclang/CIndexDiagnostic.h b/clang/tools/libclang/CIndexDiagnostic.h index 25589bb57474a..6824f3c170424 100644 --- 
a/clang/tools/libclang/CIndexDiagnostic.h +++ b/clang/tools/libclang/CIndexDiagnostic.h @@ -28,8 +28,7 @@ class CXDiagnosticSetImpl { std::vector> Diagnostics; const bool IsExternallyManaged; public: - CXDiagnosticSetImpl(bool isManaged = false) - : IsExternallyManaged(isManaged) {} + CXDiagnosticSetImpl(bool isManaged = false); virtual ~CXDiagnosticSetImpl(); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index aaa01ee2e549a..e3d7b7a5a519e 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -528,6 +528,10 @@ class SIInsertWaitcnts { struct BlockInfo { std::unique_ptr Incoming; bool Dirty = true; + BlockInfo() = default; + BlockInfo(BlockInfo &&) = default; + BlockInfo &operator=(BlockInfo &&) = default; + ~BlockInfo(); }; MapVector BlockInfos; @@ -1051,6 +1055,8 @@ class WaitcntBrackets { CounterValueArray AsyncScore{}; }; +SIInsertWaitcnts::BlockInfo::~BlockInfo() = default; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 18c92af077ac7..2e72cb840c577 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -42,6 +42,12 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" +BPFAsmPrinter::BPFAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer) + : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr), TM(TM) {} + +BPFAsmPrinter::~BPFAsmPrinter() = default; + bool BPFAsmPrinter::doInitialization(Module &M) { AsmPrinter::doInitialization(M); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index d1db65d878463..c9707e03fcabf 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -18,8 +18,8 @@ namespace llvm { class BPFAsmPrinter : public AsmPrinter { public: explicit BPFAsmPrinter(TargetMachine 
&TM, - std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr), TM(TM) {} + std::unique_ptr Streamer); + ~BPFAsmPrinter() override; StringRef getPassName() const override { return "BPF Assembly Printer"; } bool doInitialization(Module &M) override; diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index 4ce1565b4c387..ab20eb6e2f790 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -628,6 +628,9 @@ void GroupMatcher::optimize() { //===- SwitchMatcher ------------------------------------------------------===// +SwitchMatcher::SwitchMatcher() : Matcher(MK_Switch) {} +SwitchMatcher::~SwitchMatcher() = default; + bool SwitchMatcher::recordsOperand() const { assert(!isa_and_present(Condition.get()) && "Switch conditions should not record named operands"); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 5fb3ef1a87c88..ad02b1adb58b8 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -445,7 +445,8 @@ class SwitchMatcher : public Matcher { std::vector> MatcherStorage; public: - SwitchMatcher() : Matcher(MK_Switch) {} + SwitchMatcher(); + ~SwitchMatcher(); static bool classof(const Matcher *M) { return M->getKind() == MK_Switch; } From 5a03b842815318ff355573e667828f3f1dada298 Mon Sep 17 00:00:00 2001 From: Yashwant Singh Date: Mon, 11 May 2026 10:48:42 +0530 Subject: [PATCH 211/538] [AArch64][CostModel] Model sve costs for ctpop (#192428) Targets supporting sve prefer sve for ctpop with fixed length vectors. Update cost model to reflect the same. 
--- .../AArch64/AArch64TargetTransformInfo.cpp | 17 +- llvm/test/Analysis/CostModel/AArch64/ctpop.ll | 187 ++++++++++++++---- .../Analysis/CostModel/AArch64/sve-ctpop.ll | 38 ++++ 3 files changed, 198 insertions(+), 44 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/AArch64/sve-ctpop.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 3685177328f12..2e3e7b73ba390 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -807,14 +807,25 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, {ISD::CTPOP, MVT::v4i32, 3}, {ISD::CTPOP, MVT::v8i16, 2}, {ISD::CTPOP, MVT::v16i8, 1}, - {ISD::CTPOP, MVT::i64, 4}, + {ISD::CTPOP, MVT::i64, 4}, {ISD::CTPOP, MVT::v2i32, 3}, {ISD::CTPOP, MVT::v4i16, 2}, - {ISD::CTPOP, MVT::v8i8, 1}, - {ISD::CTPOP, MVT::i32, 5}, + {ISD::CTPOP, MVT::v8i8, 1}, + {ISD::CTPOP, MVT::i32, 5}, + // SVE types (For targets that override NEON for fixed length vectors) + {ISD::CTPOP, MVT::nxv2i64, 1}, + {ISD::CTPOP, MVT::nxv4i32, 1}, + {ISD::CTPOP, MVT::nxv8i16, 1}, + {ISD::CTPOP, MVT::nxv16i8, 1}, }; auto LT = getTypeLegalizationCost(RetTy); MVT MTy = LT.second; + + // When SVE is available CNT will be used for fixed and scalable vectors. + if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector()) + MTy = MVT::getScalableVectorVT(MTy.getVectorElementType(), + 128 / MTy.getScalarSizeInBits()); + if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { // Extra cost of +1 when illegal vector types are legalized by promoting // the integer type. 
diff --git a/llvm/test/Analysis/CostModel/AArch64/ctpop.ll b/llvm/test/Analysis/CostModel/AArch64/ctpop.ll index 013432991f5ae..514d12f3b2d47 100644 --- a/llvm/test/Analysis/CostModel/AArch64/ctpop.ll +++ b/llvm/test/Analysis/CostModel/AArch64/ctpop.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=aarch64 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s - +; RUN: opt < %s -mtriple=aarch64 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,BASE +; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,SVE-256 ; Verify the cost of scalar ctpop instructions. target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -49,72 +50,136 @@ declare i8 @llvm.ctpop.i8(i8) ; Verify the cost of vector ctpop instructions. 
define <2 x i64> @test_ctpop_v2i64(<2 x i64> %a) { -; CHECK-LABEL: 'test_ctpop_v2i64' -; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop +; BASE-LABEL: 'test_ctpop_v2i64' +; BASE-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop +; +; SVE-LABEL: 'test_ctpop_v2i64' +; SVE-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v2i64' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop ; %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %ctpop } define <2 x i32> @test_ctpop_v2i32(<2 x i32> %a) { -; CHECK-LABEL: 'test_ctpop_v2i32' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %ctpop +; BASE-LABEL: 'test_ctpop_v2i32' +; BASE-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %ctpop +; +; SVE-LABEL: 'test_ctpop_v2i32' +; SVE-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v2i32' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <2 x i32> 
@llvm.ctpop.v2i32(<2 x i32> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %ctpop ; %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) ret <2 x i32> %ctpop } define <4 x i32> @test_ctpop_v4i32(<4 x i32> %a) { -; CHECK-LABEL: 'test_ctpop_v4i32' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %ctpop +; BASE-LABEL: 'test_ctpop_v4i32' +; BASE-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %ctpop +; +; SVE-LABEL: 'test_ctpop_v4i32' +; SVE-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v4i32' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %ctpop ; %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) ret <4 x i32> %ctpop } define <2 x i16> @test_ctpop_v2i16(<2 x i16> %a) { -; CHECK-LABEL: 'test_ctpop_v2i16' -; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %ctpop +; BASE-LABEL: 'test_ctpop_v2i16' +; BASE-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %ctpop +; +; SVE-LABEL: 'test_ctpop_v2i16' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a) 
+; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v2i16' +; SVE-256-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %ctpop ; %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a) ret <2 x i16> %ctpop } define <4 x i16> @test_ctpop_v4i16(<4 x i16> %a) { -; CHECK-LABEL: 'test_ctpop_v4i16' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %ctpop +; BASE-LABEL: 'test_ctpop_v4i16' +; BASE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %ctpop +; +; SVE-LABEL: 'test_ctpop_v4i16' +; SVE-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v4i16' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %ctpop ; %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a) ret <4 x i16> %ctpop } define <8 x i16> @test_ctpop_v8i16(<8 x i16> %a) { -; CHECK-LABEL: 'test_ctpop_v8i16' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %ctpop +; BASE-LABEL: 'test_ctpop_v8i16' +; BASE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) +; BASE-NEXT: Cost Model: 
Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %ctpop +; +; SVE-LABEL: 'test_ctpop_v8i16' +; SVE-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v8i16' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %ctpop ; %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) ret <8 x i16> %ctpop } define <2 x i8> @test_ctpop_v2i8(<2 x i8> %a) { -; CHECK-LABEL: 'test_ctpop_v2i8' -; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %ctpop +; BASE-LABEL: 'test_ctpop_v2i8' +; BASE-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %ctpop +; +; SVE-LABEL: 'test_ctpop_v2i8' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v2i8' +; SVE-256-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %ctpop ; %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a) ret <2 x i8> %ctpop } define <4 x i8> @test_ctpop_v4i8(<4 x i8> %a) { -; CHECK-LABEL: 'test_ctpop_v4i8' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret 
<4 x i8> %ctpop +; BASE-LABEL: 'test_ctpop_v4i8' +; BASE-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %ctpop +; +; SVE-LABEL: 'test_ctpop_v4i8' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v4i8' +; SVE-256-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %ctpop ; %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a) ret <4 x i8> %ctpop @@ -139,36 +204,68 @@ define <16 x i8> @test_ctpop_v16i8(<16 x i8> %a) { } define <4 x i64> @test_ctpop_v4i64(<4 x i64> %a) { -; CHECK-LABEL: 'test_ctpop_v4i64' -; CHECK-NEXT: Cost Model: Found costs of 8 for: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %ctpop +; BASE-LABEL: 'test_ctpop_v4i64' +; BASE-NEXT: Cost Model: Found costs of 8 for: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %ctpop +; +; SVE-LABEL: 'test_ctpop_v4i64' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v4i64' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %ctpop ; %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a) ret <4 x i64> %ctpop } define <8 
x i32> @test_ctpop_v8i32(<8 x i32> %a) { -; CHECK-LABEL: 'test_ctpop_v8i32' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %ctpop +; BASE-LABEL: 'test_ctpop_v8i32' +; BASE-NEXT: Cost Model: Found costs of 6 for: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %ctpop +; +; SVE-LABEL: 'test_ctpop_v8i32' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v8i32' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %ctpop ; %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a) ret <8 x i32> %ctpop } define <16 x i16> @test_ctpop_v16i16(<16 x i16> %a) { -; CHECK-LABEL: 'test_ctpop_v16i16' -; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %ctpop +; BASE-LABEL: 'test_ctpop_v16i16' +; BASE-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %ctpop +; +; SVE-LABEL: 'test_ctpop_v16i16' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v16i16' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <16 
x i16> @llvm.ctpop.v16i16(<16 x i16> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %ctpop ; %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a) ret <16 x i16> %ctpop } define <32 x i8> @test_ctpop_v32i8(<32 x i8> %a) { -; CHECK-LABEL: 'test_ctpop_v32i8' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %ctpop +; BASE-LABEL: 'test_ctpop_v32i8' +; BASE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %ctpop +; +; SVE-LABEL: 'test_ctpop_v32i8' +; SVE-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_v32i8' +; SVE-256-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %ctpop ; %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a) ret <32 x i8> %ctpop @@ -184,9 +281,17 @@ define i64 @test_ctpop_noneon_i64(i64 %a) "target-features"="-fp-armv8,-neon" { } define <2 x i64> @test_ctpop_noneon_v2i64(<2 x i64> %a) "target-features"="-fp-armv8,-neon" { -; CHECK-LABEL: 'test_ctpop_noneon_v2i64' -; CHECK-NEXT: Cost Model: Found costs of 24 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop +; BASE-LABEL: 'test_ctpop_noneon_v2i64' +; BASE-NEXT: Cost Model: Found costs of 24 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) +; BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 
SizeLat:1 for: ret <2 x i64> %ctpop +; +; SVE-LABEL: 'test_ctpop_noneon_v2i64' +; SVE-NEXT: Cost Model: Found costs of 24 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) +; SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop +; +; SVE-256-LABEL: 'test_ctpop_noneon_v2i64' +; SVE-256-NEXT: Cost Model: Found costs of 12 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) +; SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop ; %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %ctpop diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ctpop.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ctpop.ll new file mode 100644 index 0000000000000..5c4395630c2c2 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-ctpop.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +define void @ctpop_scalable() { +; CHECK-LABEL: 'ctpop_scalable' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> poison) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv32i8 = call <vscale x 32 x i8> @llvm.ctpop.nxv32i8(<vscale x 32 x i8> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.ctpop.nxv2i16(<vscale x 2 x i16> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> poison) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv16i16 = call <vscale x 16 x i16>
@llvm.ctpop.nxv16i16(<vscale x 16 x i16> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> poison) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv8i32 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64> poison) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %nxv4i8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8 (<vscale x 4 x i8> poison) + %nxv8i8 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8 (<vscale x 8 x i8> poison) + %nxv16i8 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8 (<vscale x 16 x i8> poison) + %nxv32i8 = call <vscale x 32 x i8> @llvm.ctpop.nxv32i8 (<vscale x 32 x i8> poison) + %nxv2i16 = call <vscale x 2 x i16> @llvm.ctpop.nxv2i16 (<vscale x 2 x i16> poison) + %nxv4i16 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16 (<vscale x 4 x i16> poison) + %nxv8i16 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16 (<vscale x 8 x i16> poison) + %nxv16i16 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> poison) + %nxv2i32 = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32 (<vscale x 2 x i32> poison) + %nxv4i32 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32 (<vscale x 4 x i32> poison) + %nxv8i32 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32 (<vscale x 8 x i32> poison) + %nxv2i64 = call <vscale x 2 x i64> @llvm.ctpop.nxv2i64 (<vscale x 2 x i64> poison) + %nxv4i64 = call <vscale x 4 x i64> @llvm.ctpop.nxv4i64 (<vscale x 4 x i64> poison) + ret void +} + From ce6605a4931a294bd17b5e56658b701b18d2bcf9 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Mon, 11 May 2026 10:56:52 +0530 Subject: [PATCH 212/538] [MLIR][NVVM][NFC] Restructure NVVM dialect (#195811) Moves the declarations of the NVVM dialect and some widely used enums (`FPRoundingModeAttr` and `SaturationModeAttr`) to separate files to make them easier to maintain and also use in the NVGPU dialect. 
--- .../mlir/Dialect/LLVMIR/NVVMDialect.td | 94 ++++++++++++ mlir/include/mlir/Dialect/LLVMIR/NVVMEnums.td | 72 +++++++++ mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 143 +----------------- 3 files changed, 173 insertions(+), 136 deletions(-) create mode 100644 mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.td create mode 100644 mlir/include/mlir/Dialect/LLVMIR/NVVMEnums.td diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.td new file mode 100644 index 0000000000000..025e093ebd8b6 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.td @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the NVVM IR dialect. +/// +//===----------------------------------------------------------------------===// + +#ifndef NVVMIR_DIALECT +#define NVVMIR_DIALECT + +include "mlir/IR/DialectBase.td" + +def NVVM_Dialect : Dialect { + let name = "nvvm"; + let cppNamespace = "::mlir::NVVM"; + let dependentDialects = ["LLVM::LLVMDialect"]; + let hasOperationAttrVerify = 1; + + let extraClassDeclaration = [{ + /// Get the name of the attribute used to annotate external kernel + /// functions. + static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; } + /// Get the name of the attribute used to annotate max threads required + /// per CTA for kernel functions. 
+ static StringRef getMaxntidAttrName() { return "nvvm.maxntid"; } + /// Get the name of the metadata names for each dimension + static StringRef getMaxntidXName() { return "maxntidx"; } + static StringRef getMaxntidYName() { return "maxntidy"; } + static StringRef getMaxntidZName() { return "maxntidz"; } + + /// Get the name of the attribute used to annotate exact threads required + /// per CTA for kernel functions. + static StringRef getReqntidAttrName() { return "nvvm.reqntid"; } + /// Get the name of the metadata names for each dimension + static StringRef getReqntidXName() { return "reqntidx"; } + static StringRef getReqntidYName() { return "reqntidy"; } + static StringRef getReqntidZName() { return "reqntidz"; } + + /// Get the name of the attribute used to annotate exact CTAs required + /// per cluster for kernel functions. + static StringRef getClusterDimAttrName() { return "nvvm.cluster_dim"; } + /// Get the name of the metadata names for each dimension + static StringRef getClusterDimXName() { return "cluster_dim_x"; } + static StringRef getClusterDimYName() { return "cluster_dim_y"; } + static StringRef getClusterDimZName() { return "cluster_dim_z"; } + + /// Get the name of the attribute used to annotate maximum number of + /// CTAs per cluster for kernel functions. + static StringRef getClusterMaxBlocksAttrName() { return "nvvm.cluster_max_blocks"; } + + /// Get the name of the attribute used to annotate min CTA required + /// per SM for kernel functions. + static StringRef getMinctasmAttrName() { return "nvvm.minctasm"; } + + /// Get the name of the attribute used to annotate max number of + /// registers that can be allocated per thread. + static StringRef getMaxnregAttrName() { return "nvvm.maxnreg"; } + + /// Get the name of the attribute used to annotate kernel arguments that + /// are grid constants. 
+ static StringRef getGridConstantAttrName() { return "nvvm.grid_constant"; } + + /// Get the name of the attribute used to annotate the `.blocksareclusters` + /// PTX directive for kernel functions. + /// This attribute implies that the grid launch configuration for the + /// corresponding kernel function is specifying the number of clusters + /// instead of the number of thread blocks. This attribute is only + /// allowed for kernel functions and requires nvvm.reqntid and + /// nvvm.cluster_dim attributes. + static StringRef getBlocksAreClustersAttrName() { return "nvvm.blocksareclusters"; } + + /// Get the name of the attribute used to annotate managed global variables. + static StringRef getManagedAttrName() { return "nvvm.managed"; } + + /// Verify an attribute from this dialect on the argument at 'argIndex' for + /// the region at 'regionIndex' on the given operation. Returns failure if + /// the verification failed, success otherwise. This hook may optionally be + /// invoked from any operation containing a region. + LogicalResult verifyRegionArgAttribute(Operation *op, + unsigned regionIndex, + unsigned argIndex, + NamedAttribute argAttr) override; + }]; + + let useDefaultAttributePrinterParser = 1; +} + +#endif // NVVMIR_DIALECT diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMEnums.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMEnums.td new file mode 100644 index 0000000000000..42d196c5662d1 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMEnums.td @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the NVVM IR enum attributes. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef NVVMIR_ENUMS +#define NVVMIR_ENUMS + +include "mlir/Dialect/LLVMIR/NVVMDialect.td" +include "mlir/IR/EnumAttr.td" + +// Attributes for the floating point rounding modes supported by PTX +def FPRoundingModeNone : I32EnumAttrCase<"NONE", 0, "none">; +def FPRoundingModeRN : I32EnumAttrCase<"RN", 1, "rn">; +def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; +def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; +def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; +def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; +def FPRoundingModeRS : I32EnumAttrCase<"RS", 6, "rs">; + +def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", + [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, + FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA, FPRoundingModeRS]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def FPRoundingModeAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def SaturationModeNone : I32EnumAttrCase<"NONE", 0, "none">; +def SaturationModeFinite : I32EnumAttrCase<"SATFINITE", 1, "satfinite">; +def SaturationModeSat : I32EnumAttrCase<"SAT", 2, "sat">; + +def SaturationMode : I32EnumAttr<"SaturationMode", "NVVM SaturationMode kind", + [SaturationModeNone, SaturationModeFinite, SaturationModeSat]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def SaturationModeAttr : EnumAttr { + let summary = "Describes the saturation mode"; + let description = [{ + A `nvvm.sat_mode` attribute specifies the saturation mode for instructions + involving floating points or integers. It can be one of the following + values: + - `none`: No saturation is applied. 
+ - `satfinite`: If the absolute value of input (ignoring sign) is greater + than the `MAX_NORM` of the specified destination format, then the result + is the sign-preserved `MAX_NORM` of the destination format and a positive + `MAX_NORM` in unsigned datatypes for which the destination sign is not + supported. If the input is `NaN`, then the result can be `NaN` or the + `MAX_NORM` of the destination format, depending on the format. + - `sat`: For integer destination types, this limits the value to `MININT.. + MAXINT` and applies to both signed and unsigned integer datatypes. For + floating point destination types (applies to only `F16`, `F32`, and `F64` + types), this limits the value to the range `[0.0, 1.0]` and flushes NaN + results to positive zero. + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt) +}]; + let assemblyFormat = "`<` $value `>`"; +} + +#endif // NVVMIR_ENUMS diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 51396947fad4e..4e0bce985fdf7 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -13,17 +13,19 @@ #ifndef NVVMIR_OPS #define NVVMIR_OPS -include "mlir/IR/EnumAttr.td" include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" +include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" +include "mlir/Dialect/LLVMIR/LLVMTypes.td" +include "mlir/Dialect/LLVMIR/NVVMDialect.td" +include "mlir/Dialect/LLVMIR/NVVMEnums.td" include "mlir/Dialect/LLVMIR/NVVMRequiresSMTraits.td" include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.td" -include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" +include "mlir/IR/CommonAttrConstraints.td" +include "mlir/IR/EnumAttr.td" include "mlir/Interfaces/InferIntRangeInterface.td" include 
"mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Dialect/LLVMIR/LLVMTypes.td" -include "mlir/IR/CommonAttrConstraints.td" +include "mlir/Interfaces/SideEffectInterfaces.td" def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; @@ -33,85 +35,6 @@ def LLVM_PointerLocal : LLVM_PointerInAddressSpace<5>; def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>; def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>; -//===----------------------------------------------------------------------===// -// NVVM dialect definitions -//===----------------------------------------------------------------------===// - -def NVVM_Dialect : Dialect { - let name = "nvvm"; - let cppNamespace = "::mlir::NVVM"; - let dependentDialects = ["LLVM::LLVMDialect"]; - let hasOperationAttrVerify = 1; - - let extraClassDeclaration = [{ - /// Get the name of the attribute used to annotate external kernel - /// functions. - static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; } - /// Get the name of the attribute used to annotate max threads required - /// per CTA for kernel functions. - static StringRef getMaxntidAttrName() { return "nvvm.maxntid"; } - /// Get the name of the metadata names for each dimension - static StringRef getMaxntidXName() { return "maxntidx"; } - static StringRef getMaxntidYName() { return "maxntidy"; } - static StringRef getMaxntidZName() { return "maxntidz"; } - - /// Get the name of the attribute used to annotate exact threads required - /// per CTA for kernel functions. - static StringRef getReqntidAttrName() { return "nvvm.reqntid"; } - /// Get the name of the metadata names for each dimension - static StringRef getReqntidXName() { return "reqntidx"; } - static StringRef getReqntidYName() { return "reqntidy"; } - static StringRef getReqntidZName() { return "reqntidz"; } - - /// Get the name of the attribute used to annotate exact CTAs required - /// per cluster for kernel functions. 
- static StringRef getClusterDimAttrName() { return "nvvm.cluster_dim"; } - /// Get the name of the metadata names for each dimension - static StringRef getClusterDimXName() { return "cluster_dim_x"; } - static StringRef getClusterDimYName() { return "cluster_dim_y"; } - static StringRef getClusterDimZName() { return "cluster_dim_z"; } - - /// Get the name of the attribute used to annotate maximum number of - /// CTAs per cluster for kernel functions. - static StringRef getClusterMaxBlocksAttrName() { return "nvvm.cluster_max_blocks"; } - - /// Get the name of the attribute used to annotate min CTA required - /// per SM for kernel functions. - static StringRef getMinctasmAttrName() { return "nvvm.minctasm"; } - - /// Get the name of the attribute used to annotate max number of - /// registers that can be allocated per thread. - static StringRef getMaxnregAttrName() { return "nvvm.maxnreg"; } - - /// Get the name of the attribute used to annotate kernel arguments that - /// are grid constants. - static StringRef getGridConstantAttrName() { return "nvvm.grid_constant"; } - - /// Get the name of the attribute used to annotate the `.blocksareclusters` - /// PTX directive for kernel functions. - /// This attribute implies that the grid launch configuration for the - /// corresponding kernel function is specifying the number of clusters - /// instead of the number of thread blocks. This attribute is only - /// allowed for kernel functions and requires nvvm.reqntid and - /// nvvm.cluster_dim attributes. - static StringRef getBlocksAreClustersAttrName() { return "nvvm.blocksareclusters"; } - - /// Get the name of the attribute used to annotate managed global variables. - static StringRef getManagedAttrName() { return "nvvm.managed"; } - - /// Verify an attribute from this dialect on the argument at 'argIndex' for - /// the region at 'regionIndex' on the given operation. Returns failure if - /// the verification failed, success otherwise. 
This hook may optionally be - /// invoked from any operation containing a region. - LogicalResult verifyRegionArgAttribute(Operation *op, - unsigned regionIndex, - unsigned argIndex, - NamedAttribute argAttr) override; - }]; - - let useDefaultAttributePrinterParser = 1; -} - //===----------------------------------------------------------------------===// // NVVM op definitions //===----------------------------------------------------------------------===// @@ -1899,58 +1822,6 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_VoidIntrinsicOp<"cp.async.mbarrier.arriv // NVVM Conversion Ops (for "cvt.*" family of PTX instructions) //===----------------------------------------------------------------------===// -// Attributes for the floating point rounding modes supported by PTX -def FPRoundingModeNone : I32EnumAttrCase<"NONE", 0, "none">; -def FPRoundingModeRN : I32EnumAttrCase<"RN", 1, "rn">; -def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; -def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; -def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; -def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; -def FPRoundingModeRS : I32EnumAttrCase<"RS", 6, "rs">; - -def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", - [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, - FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA, FPRoundingModeRS]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::NVVM"; -} -def FPRoundingModeAttr : EnumAttr { - let assemblyFormat = "`<` $value `>`"; -} - -def SaturationModeNone : I32EnumAttrCase<"NONE", 0, "none">; -def SaturationModeFinite : I32EnumAttrCase<"SATFINITE", 1, "satfinite">; -def SaturationModeSat : I32EnumAttrCase<"SAT", 2, "sat">; - -def SaturationMode : I32EnumAttr<"SaturationMode", "NVVM SaturationMode kind", - [SaturationModeNone, SaturationModeFinite, SaturationModeSat]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::NVVM"; -} -def SaturationModeAttr : 
EnumAttr { - let summary = "Describes the saturation mode"; - let description = [{ - A `nvvm.sat_mode` attribute specifies the saturation mode for instructions - involving floating points or integers. It can be one of the following - values: - - `none`: No saturation is applied. - - `satfinite`: If the absolute value of input (ignoring sign) is greater - than the `MAX_NORM` of the specified destination format, then the result - is the sign-preserved `MAX_NORM` of the destination format and a positive - `MAX_NORM` in unsigned datatypes for which the destination sign is not - supported. If the input is `NaN`, then the result can be `NaN` or th - `MAX_NORM` of the destination format, depending on the format. - - `sat`: For integer destination types, this limits the value to `MININT.. - MAXINT` and applies to both signed and unsigned integer datatypes. For - floating point destination types (applies to only `F16`, `F32`, and `F64` - types), this limits the value to the range `[0.0, 1.0]` and flushes NaN - results to positive zero. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt) -}]; - let assemblyFormat = "`<` $value `>`"; -} - def NVVM_ConvertFloatToTF32Op : NVVM_Op<"convert.float.to.tf32"> { let summary = "Convert the given float input to TF32"; let description = [{ From cbc6e861874cbd1ee10818f7ff3bb90959c95fa4 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 11 May 2026 07:33:20 +0200 Subject: [PATCH 213/538] [clang][bytecode] Allow const mutation in all variable initializers (#195794) So the attached test case works even though it's just an `InitListExpr`. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 6 ++++++ clang/lib/AST/ByteCode/Interp.h | 11 +++++++++++ clang/lib/AST/ByteCode/Opcodes.td | 4 ++++ clang/test/AST/ByteCode/cxx14.cpp | 12 ++++++++++++ 4 files changed, 33 insertions(+) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index bcdbd68731ee5..9a8842bf258f6 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -5297,9 +5297,15 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD, if (!this->emitGetPtrGlobal(*GlobalIndex, Init)) return false; + if (!this->emitStartInit(Init)) + return false; + if (!visitInitializer(Init)) return false; + if (!this->emitEndInit(Init)) + return false; + return this->emitFinishInitGlobal(Init); } // Local variables. diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 4a550fdd63bfb..fe2d99901d367 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -3564,6 +3564,17 @@ inline bool StartSpeculation(InterpState &S, CodePtr OpPC) { return true; } +inline bool StartInit(InterpState &S, CodePtr OpPC) { + const Pointer &Ptr = S.Stk.peek<Pointer>(); + S.InitializingBlocks.push_back(Ptr.block()); + return true; +} + +inline bool EndInit(InterpState &S, CodePtr OpPC) { + S.InitializingBlocks.pop_back(); + return true; +} + // This is special-cased in the tablegen opcode emitter. // Its dispatch function will NOT call InterpNext // and instead simply return true. 
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 3fb25a5fa0884..57ed71fb6f16b 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -181,6 +181,10 @@ def Jf : JumpOpcode; def PushIgnoreDiags : Opcode; def PopIgnoreDiags : Opcode; + +def StartInit : Opcode; +def EndInit : Opcode; + def StartSpeculation : Opcode; def EndSpeculation : Opcode; def BCP : Opcode { diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp index 00a9cf5b00098..170bd09504993 100644 --- a/clang/test/AST/ByteCode/cxx14.cpp +++ b/clang/test/AST/ByteCode/cxx14.cpp @@ -35,3 +35,15 @@ constexpr bool f() { // both-error {{constexpr function never produces a constan // both-warning {{array index 3 is past the end of the array}} return true; } + +namespace InitListModify { + struct Aggregate { + int x = 0; + int y = ++x; + }; + constexpr Aggregate aggr1; + static_assert(aggr1.x == 1 && aggr1.y == 1, ""); + // FIXME: This is not specified by the standard, but sanity requires it. + constexpr Aggregate aggr2 = {}; + static_assert(aggr2.x == 1 && aggr2.y == 1, ""); +} From 5c7854c8ed1998650af6f01761df385712f5295a Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Mon, 11 May 2026 05:57:00 +0000 Subject: [PATCH 214/538] [libc][stdlib] Add setenv (#163018) Add the POSIX setenv() function, with EnvironmentManager::set() handling environment array management and ownership tracking. Registered for x86_64, aarch64, and riscv architectures. Integration tests cover overwrite/no-overwrite semantics, empty/invalid names, empty values, and repeated replacement. Assisted-by: Automated tooling, human reviewed. 
--------- Co-authored-by: Michael Jones --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/include/stdlib.yaml | 8 + libc/src/stdlib/CMakeLists.txt | 8 + libc/src/stdlib/environ_internal.cpp | 50 +++++ libc/src/stdlib/environ_internal.h | 7 + libc/src/stdlib/linux/CMakeLists.txt | 15 ++ libc/src/stdlib/linux/setenv.cpp | 48 +++++ libc/src/stdlib/setenv.h | 25 +++ .../integration/src/stdlib/CMakeLists.txt | 12 ++ .../integration/src/stdlib/setenv_test.cpp | 175 ++++++++++++++++++ 12 files changed, 351 insertions(+) create mode 100644 libc/src/stdlib/linux/setenv.cpp create mode 100644 libc/src/stdlib/setenv.h create mode 100644 libc/test/integration/src/stdlib/setenv_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 643bba2aae694..9994a9294173d 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -1148,6 +1148,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.atexit libc.src.stdlib.exit libc.src.stdlib.getenv + libc.src.stdlib.setenv libc.src.stdlib.quick_exit # signal.h entrypoints diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index bbb7aca7f39b6..2748b2b8e6a5d 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1277,6 +1277,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.atexit libc.src.stdlib.exit libc.src.stdlib.getenv + libc.src.stdlib.setenv libc.src.stdlib.quick_exit # signal.h entrypoints diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 2a0e43744ec0d..4b551ced82138 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1341,6 +1341,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.atexit libc.src.stdlib.exit libc.src.stdlib.getenv + libc.src.stdlib.setenv 
libc.src.stdlib.mbstowcs libc.src.stdlib.mbtowc libc.src.stdlib.quick_exit diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml index a751f8306be24..4c958cd9d28ad 100644 --- a/libc/include/stdlib.yaml +++ b/libc/include/stdlib.yaml @@ -209,6 +209,14 @@ functions: return_type: void arguments: - type: unsigned int + - name: setenv + standards: + - posix + return_type: int + arguments: + - type: const char * + - type: const char * + - type: int - name: strfromd standards: - stdc diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 7e587235447b0..e79353eb2a581 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -91,6 +91,7 @@ add_object_library( libc.src.__support.CPP.string_view libc.src.__support.macros.attributes libc.src.__support.macros.config + libc.src.string.memory_utils.inline_memcpy ) add_entrypoint_object( @@ -635,6 +636,13 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) endif() +add_entrypoint_object( + setenv + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.setenv +) + if(LIBC_TARGET_OS_IS_BAREMETAL OR LIBC_TARGET_OS_IS_GPU) add_entrypoint_object( malloc diff --git a/libc/src/stdlib/environ_internal.cpp b/libc/src/stdlib/environ_internal.cpp index a6b630921f9f1..e9da17d028575 100644 --- a/libc/src/stdlib/environ_internal.cpp +++ b/libc/src/stdlib/environ_internal.cpp @@ -17,6 +17,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/alloc-checker.h" #include "src/__support/macros/config.h" +#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -171,5 +172,54 @@ bool EnvironmentManager::ensure_capacity(size_t needed) { return true; } +int EnvironmentManager::set(cpp::string_view name, cpp::string_view value, + bool overwrite) { + cpp::optional idx = find_var(name); + + // If the variable exists and we're not overwriting, do nothing. 
+ if (idx && !overwrite) + return 0; + + // Ensure we have capacity. If the variable doesn't exist, we need one + // more slot. + size_t needed = idx ? count : count + 1; + if (!ensure_capacity(needed)) + return -1; + + // Build the "name=value" string. + size_t name_len = name.size(); + size_t value_len = value.size(); + size_t total_len = name_len + 1 + value_len + 1; // name + '=' + value + '\0' + + AllocChecker ac; + char *new_string = new (ac) char[total_len]; + if (!ac) + return -1; + + inline_memcpy(new_string, name.data(), name_len); + new_string[name_len] = '='; + inline_memcpy(new_string + name_len + 1, value.data(), value_len); + new_string[name_len + 1 + value_len] = '\0'; + + char **env_array = get_array(); + + if (idx) { + // Replace existing variable. Free old string if we own it. + if (ownership[*idx].can_free()) + delete[] env_array[*idx]; + + env_array[*idx] = new_string; + ownership[*idx].allocated_by_us = true; + } else { + // Add new variable at the end. + env_array[count] = new_string; + ownership[count].allocated_by_us = true; + count++; + env_array[count] = nullptr; // Maintain null terminator. + } + + return 0; +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/environ_internal.h b/libc/src/stdlib/environ_internal.h index 0ab75e200b968..d902354f421cf 100644 --- a/libc/src/stdlib/environ_internal.h +++ b/libc/src/stdlib/environ_internal.h @@ -122,6 +122,13 @@ class EnvironmentManager { // Look up a variable by name. Returns a pointer to the value string // (after the '='), or nullptr if not found. char *get(cpp::string_view name); + + // Set or update an environment variable. Builds a "name=value" string, + // manages ownership, and updates the environ array. If `overwrite` is + // false and the variable already exists, does nothing and returns 0. + // Returns 0 on success, -1 on allocation failure (caller should set + // errno to ENOMEM). 
+ int set(cpp::string_view name, cpp::string_view value, bool overwrite); }; } // namespace internal diff --git a/libc/src/stdlib/linux/CMakeLists.txt b/libc/src/stdlib/linux/CMakeLists.txt index 8a4b2bab1c53d..aedcd3f11bf38 100644 --- a/libc/src/stdlib/linux/CMakeLists.txt +++ b/libc/src/stdlib/linux/CMakeLists.txt @@ -8,3 +8,18 @@ add_header_library( libc.src.signal.linux.__restore libc.src.signal.linux.signal_utils ) + +add_entrypoint_object( + setenv + SRCS + setenv.cpp + HDRS + ../setenv.h + DEPENDS + libc.src.__support.CPP.string_view + libc.src.__support.common + libc.src.__support.libc_errno + libc.src.__support.macros.config + libc.src.__support.macros.null_check + libc.src.stdlib.environ_internal +) diff --git a/libc/src/stdlib/linux/setenv.cpp b/libc/src/stdlib/linux/setenv.cpp new file mode 100644 index 0000000000000..40a3b985bf251 --- /dev/null +++ b/libc/src/stdlib/linux/setenv.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation of the POSIX setenv function. +/// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/setenv.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" +#include "src/stdlib/environ_internal.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, setenv, + (const char *name, const char *value, int overwrite)) { + // Passing nullptr for either argument is undefined behavior per POSIX, + // so crash rather than returning an error. 
+ LIBC_CRASH_ON_NULLPTR(name); + LIBC_CRASH_ON_NULLPTR(value); + + cpp::string_view name_view(name); + + // POSIX: name must not be empty or contain '='. + if (name_view.empty() || + name_view.find_first_of('=') != cpp::string_view::npos) { + libc_errno = EINVAL; + return -1; + } + + int result = internal::EnvironmentManager::get_instance().set( + name_view, value, overwrite != 0); + if (result != 0) + libc_errno = ENOMEM; + + return result; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/setenv.h b/libc/src/stdlib/setenv.h new file mode 100644 index 0000000000000..365d768cdd668 --- /dev/null +++ b/libc/src/stdlib/setenv.h @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Declaration of the POSIX setenv function. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_SETENV_H +#define LLVM_LIBC_SRC_STDLIB_SETENV_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int setenv(const char *name, const char *value, int overwrite); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_SETENV_H diff --git a/libc/test/integration/src/stdlib/CMakeLists.txt b/libc/test/integration/src/stdlib/CMakeLists.txt index cf39b7ceb2f87..2e0389f9a60d3 100644 --- a/libc/test/integration/src/stdlib/CMakeLists.txt +++ b/libc/test/integration/src/stdlib/CMakeLists.txt @@ -18,6 +18,18 @@ add_integration_test( ) if(${LIBC_TARGET_OS} STREQUAL "linux") + add_integration_test( + setenv_test + SUITE + stdlib-integration-tests + SRCS + setenv_test.cpp + DEPENDS + libc.src.stdlib.getenv + libc.src.stdlib.setenv + libc.src.string.strcmp + ) + add_integration_test( abort_test SUITE diff --git a/libc/test/integration/src/stdlib/setenv_test.cpp b/libc/test/integration/src/stdlib/setenv_test.cpp new file mode 100644 index 0000000000000..e4ff827f0a807 --- /dev/null +++ b/libc/test/integration/src/stdlib/setenv_test.cpp @@ -0,0 +1,175 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Integration tests for the POSIX setenv function. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/getenv.h" +#include "src/stdlib/setenv.h" +#include "src/string/strcmp.h" + +#include "test/IntegrationTest/test.h" + +#include + +TEST_MAIN([[maybe_unused]] int argc, [[maybe_unused]] char **argv, + [[maybe_unused]] char **envp) { + // Test: Basic + { + // Set a simple environment variable + ASSERT_EQ(LIBC_NAMESPACE::setenv("SETENV_TEST_VAR", "test_value", 1), 0); + ASSERT_ERRNO_SUCCESS(); + + // Verify it was set + char *value = LIBC_NAMESPACE::getenv("SETENV_TEST_VAR"); + ASSERT_NE(value, nullptr); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(value, "test_value"), 0); + } + + // Test: OverwriteExisting + { + // Set initial value + ASSERT_EQ(LIBC_NAMESPACE::setenv("OVERWRITE_VAR", "original", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("OVERWRITE_VAR"), + "original"), + 0); + + // Overwrite with new value (overwrite = 1) + ASSERT_EQ(LIBC_NAMESPACE::setenv("OVERWRITE_VAR", "replaced", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("OVERWRITE_VAR"), + "replaced"), + 0); + } + + // Test: NoOverwriteFlag + { + // Set initial value + ASSERT_EQ(LIBC_NAMESPACE::setenv("NO_OVERWRITE_VAR", "original", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("NO_OVERWRITE_VAR"), + "original"), + 0); + + // Try to set with overwrite = 0 (should not change) + ASSERT_EQ(LIBC_NAMESPACE::setenv("NO_OVERWRITE_VAR", "ignored", 0), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("NO_OVERWRITE_VAR"), + "original"), + 0); + + // Verify it still works with overwrite = 1 + ASSERT_EQ(LIBC_NAMESPACE::setenv("NO_OVERWRITE_VAR", "changed", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("NO_OVERWRITE_VAR"), + "changed"), + 0); + } + + // Note: passing nullptr for name or value is 
undefined behavior per POSIX. + // The implementation uses LIBC_CRASH_ON_NULLPTR for both, so there is no + // test for that case here. + + // Test: EmptyName + { + errno = 0; + ASSERT_EQ(LIBC_NAMESPACE::setenv("", "value", 1), -1); + ASSERT_ERRNO_EQ(EINVAL); + } + + // Test: NameWithEquals + { + errno = 0; + ASSERT_EQ(LIBC_NAMESPACE::setenv("BAD=NAME", "value", 1), -1); + ASSERT_ERRNO_EQ(EINVAL); + } + + // Test: EmptyValue + { + // Empty value is valid - just means variable is set to empty string + errno = 0; + ASSERT_EQ(LIBC_NAMESPACE::setenv("EMPTY_VALUE_VAR", "", 1), 0); + ASSERT_ERRNO_SUCCESS(); + + char *value = LIBC_NAMESPACE::getenv("EMPTY_VALUE_VAR"); + ASSERT_NE(value, nullptr); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(value, ""), 0); + } + + // Test: MultipleVariables + { + // Set multiple different variables + errno = 0; + ASSERT_EQ(LIBC_NAMESPACE::setenv("VAR1", "value1", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::setenv("VAR2", "value2", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::setenv("VAR3", "value3", 1), 0); + ASSERT_ERRNO_SUCCESS(); + + // Verify all are set correctly + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("VAR1"), "value1"), + 0); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("VAR2"), "value2"), + 0); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("VAR3"), "value3"), + 0); + } + + // Test: LongValues + { + // Test with longer strings + const char *long_name = "LONG_VAR_NAME_FOR_TESTING"; + const char *long_value = "This is a fairly long value string to test that " + "setenv handles longer strings correctly without " + "any memory issues or truncation problems"; + + ASSERT_EQ(LIBC_NAMESPACE::setenv(long_name, long_value, 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ( + LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv(long_name), long_value), + 0); + } + + // Test: SpecialCharacters + { + // Test with special characters in value (but not in name) + 
ASSERT_EQ(LIBC_NAMESPACE::setenv("SPECIAL_CHARS", "!@#$%^&*()", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("SPECIAL_CHARS"), + "!@#$%^&*()"), + 0); + } + + // Test: ReplaceMultipleTimes + { + // Replace the same variable multiple times + ASSERT_EQ(LIBC_NAMESPACE::setenv("MULTI_REPLACE", "value1", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("MULTI_REPLACE"), + "value1"), + 0); + + ASSERT_EQ(LIBC_NAMESPACE::setenv("MULTI_REPLACE", "value2", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("MULTI_REPLACE"), + "value2"), + 0); + + ASSERT_EQ(LIBC_NAMESPACE::setenv("MULTI_REPLACE", "value3", 1), 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(LIBC_NAMESPACE::strcmp(LIBC_NAMESPACE::getenv("MULTI_REPLACE"), + "value3"), + 0); + } + + return 0; +} From a7e4e2521e77f638e6740059109725c9a4d89453 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 11 May 2026 07:00:09 +0100 Subject: [PATCH 215/538] [GlobalISel] Delay match table builder initialization (#196506) MachineIRBuilder::setInstrAndDebugLoc is expensive, delay until needed. CTMark -0.10% geomean improvement on aarch64-O0-g. 
https://llvm-compile-time-tracker.com/compare.php?from=71fef6d5a306d1adf8bf7d30d2fe9e286380fecf&to=8a87845dfde9de9d141b42d2fce92fcf3be02276&stat=instructions%3Au Assisted-by: codex --- .../CodeGen/GlobalISel/GIMatchTableExecutorImpl.h | 12 ++++++++++++ .../GlobalISelCombinerEmitter/match-table.td | 1 - llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp | 1 - 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h index 4576dff67db0c..b47615c775b5d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h @@ -59,6 +59,15 @@ bool GIMatchTableExecutor::executeMatchTable( bool NoFPException = !State.MIs[0]->getDesc().mayRaiseFPException(); const uint32_t Flags = State.MIs[0]->getFlags(); + bool BuilderInitialized = false; + const auto initializeBuilder = [&]() { + if (BuilderInitialized) + return; + // Delay setting the insertion point and debug location until a successful + // action needs the builder. + Builder.setInstrAndDebugLoc(*State.MIs[0]); + BuilderInitialized = true; + }; enum RejectAction { RejectAndGiveUp, RejectAndResume }; auto handleReject = [&]() -> RejectAction { @@ -126,6 +135,7 @@ bool GIMatchTableExecutor::executeMatchTable( }; const auto eraseImpl = [&](MachineInstr *MI) { + initializeBuilder(); // If we're erasing the insertion point, ensure we don't leave a dangling // pointer in the builder. 
if (Builder.getInsertPt() == MI) @@ -1089,6 +1099,7 @@ bool GIMatchTableExecutor::executeMatchTable( if (NewInsnID >= OutMIs.size()) OutMIs.resize(NewInsnID + 1); + initializeBuilder(); OutMIs[NewInsnID] = Builder.buildInstr(Opcode); DEBUG_WITH_TYPE(TgtExecutor::getName(), dbgs() << CurrentIdx << ": GIR_BuildMI(OutMIs[" @@ -1099,6 +1110,7 @@ bool GIMatchTableExecutor::executeMatchTable( case GIR_BuildConstant: { uint64_t TempRegID = readULEB(); uint64_t Imm = readU64(); + initializeBuilder(); Builder.buildConstant(State.TempRegisters[TempRegID], Imm); DEBUG_WITH_TYPE(TgtExecutor::getName(), dbgs() << CurrentIdx << ": GIR_BuildConstant(TempReg[" diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 8907cfe811abd..4278b6b614138 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -86,7 +86,6 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: bool GenMyCombiner::tryCombineAll(MachineInstr &I) const { // CHECK-NEXT: const TargetSubtargetInfo &ST = MF.getSubtarget(); // CHECK-NEXT: const PredicateBitset AvailableFeatures = getAvailableFeatures(); -// CHECK-NEXT: B.setInstrAndDebugLoc(I); // CHECK-NEXT: State.MIs.clear(); // CHECK-NEXT: State.MIs.push_back(&I); // CHECK-NEXT: if (executeMatchTable(*this, State, ExecInfo, B, getMatchTable(), *ST.getInstrInfo(), MRI, *MRI.getTargetRegisterInfo(), *ST.getRegBankInfo(), AvailableFeatures, /*CoverageInfo*/ nullptr)) diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 56b39fcd1cc9e..61c4c9eb29b71 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -2563,7 +2563,6 @@ void GICombinerEmitter::emitAdditionalImpl(raw_ostream &OS) { << " const TargetSubtargetInfo &ST = MF.getSubtarget();\n" << " const 
PredicateBitset AvailableFeatures = " "getAvailableFeatures();\n" - << " B.setInstrAndDebugLoc(I);\n" << " State.MIs.clear();\n" << " State.MIs.push_back(&I);\n" << " if (executeMatchTable(*this, State, ExecInfo, B" From f16e1b34e9317b8bc0a41fa6d1c4cb68162ac56d Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 11 May 2026 07:00:48 +0100 Subject: [PATCH 216/538] [GlobalISel] Avoid repeated target info queries in combiners (#196530) tryCombineAllImpl queries target info for every instruction. Cache TargetInstrInfo/TargetRegisterInfo/RegisterBankInfo in CombinerHelper and pass to executeMatchTable instead. This avoids repeated virtual calls on the combiner executeMatchTable path. CTMark -0.08% geomean improvement on aarch64-O0-g. https://llvm-compile-time-tracker.com/compare.php?from=71fef6d5a306d1adf8bf7d30d2fe9e286380fecf&to=13bc49510657450402c066098e3a4b7d1af9d0e6&stat=instructions%3Au Assisted-by: codex --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 8 ++++++++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 + .../TableGen/GlobalISelCombinerEmitter/match-table.td | 3 +-- llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp | 5 ++--- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 3687adbe60f1a..c5f541bb23ea0 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -44,6 +44,7 @@ class LegalizerInfo; struct LegalityQuery; class RegisterBank; class RegisterBankInfo; +class TargetInstrInfo; class TargetLowering; class TargetRegisterInfo; @@ -120,6 +121,7 @@ class CombinerHelper { MachineDominatorTree *MDT; bool IsPreLegalize; const LegalizerInfo *LI; + const TargetInstrInfo *TII; const RegisterBankInfo *RBI; const TargetRegisterInfo *TRI; @@ -135,6 +137,12 @@ class CombinerHelper { return Builder; } + const TargetInstrInfo &getTII() const { return 
*TII; } + + const TargetRegisterInfo &getTRI() const { return *TRI; } + + const RegisterBankInfo &getRBI() const { return *RBI; } + const TargetLowering &getTargetLowering() const; const MachineFunction &getMachineFunction() const; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 0f2617b9b5b1c..ddf8085b4e249 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -62,6 +62,7 @@ CombinerHelper::CombinerHelper(GISelChangeObserver &Observer, const LegalizerInfo *LI) : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), VT(VT), MDT(MDT), IsPreLegalize(IsPreLegalize), LI(LI), + TII(Builder.getMF().getSubtarget().getInstrInfo()), RBI(Builder.getMF().getSubtarget().getRegBankInfo()), TRI(Builder.getMF().getSubtarget().getRegisterInfo()) { (void)this->VT; diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 4278b6b614138..a3f29015ffb0f 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -84,11 +84,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // Verify we reset MatchData on each tryCombineAll // CHECK: bool GenMyCombiner::tryCombineAll(MachineInstr &I) const { -// CHECK-NEXT: const TargetSubtargetInfo &ST = MF.getSubtarget(); // CHECK-NEXT: const PredicateBitset AvailableFeatures = getAvailableFeatures(); // CHECK-NEXT: State.MIs.clear(); // CHECK-NEXT: State.MIs.push_back(&I); -// CHECK-NEXT: if (executeMatchTable(*this, State, ExecInfo, B, getMatchTable(), *ST.getInstrInfo(), MRI, *MRI.getTargetRegisterInfo(), *ST.getRegBankInfo(), AvailableFeatures, /*CoverageInfo*/ nullptr)) +// CHECK-NEXT: if (executeMatchTable(*this, State, ExecInfo, B, getMatchTable(), Helper.getTII(), MRI, Helper.getTRI(), Helper.getRBI(), AvailableFeatures, /*CoverageInfo*/ 
nullptr)) // CHECK-NEXT: return true; // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 61c4c9eb29b71..cffc8ccb813ca 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -2560,14 +2560,13 @@ void GICombinerEmitter::emitRuleConfigImpl(raw_ostream &OS) { void GICombinerEmitter::emitAdditionalImpl(raw_ostream &OS) { OS << "bool " << getClassName() << "::" << getCombineAllMethodName() << "(MachineInstr &I) const {\n" - << " const TargetSubtargetInfo &ST = MF.getSubtarget();\n" << " const PredicateBitset AvailableFeatures = " "getAvailableFeatures();\n" << " State.MIs.clear();\n" << " State.MIs.push_back(&I);\n" << " if (executeMatchTable(*this, State, ExecInfo, B" - << ", getMatchTable(), *ST.getInstrInfo(), MRI, " - "*MRI.getTargetRegisterInfo(), *ST.getRegBankInfo(), AvailableFeatures" + << ", getMatchTable(), Helper.getTII(), MRI, Helper.getTRI(), " + "Helper.getRBI(), AvailableFeatures" << ", /*CoverageInfo*/ nullptr)) {\n" << " return true;\n" << " }\n\n" From ed50ea52004259af958bb3e5636268342c49ee62 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 11 May 2026 07:01:59 +0100 Subject: [PATCH 217/538] [DebugInfo] Pack DILocation hash inputs (#196556) Pack DILocation fields before hashing. Now that column is 16-bits Line/Column/ImplicitCode fit in one 64-bit value (32 + 16 + 1 = 49 bits) and AtomGroup and AtomRank also fit cleanly in one 64-bit value (61 + 3 = 64 bits). Fewer hash_combine inputs on the hot DILocation path is a small compile-time improvement. 
CTMark geomean: - stage1-ReleaseLTO-g: -0.10% - stage1-O0-g: -0.23% - stage1-aarch64-O0-g: -0.19% - stage2-O0-g: -0.07% https://llvm-compile-time-tracker.com/compare.php?from=71fef6d5a306d1adf8bf7d30d2fe9e286380fecf&to=1d80b5f5aa98561d2ba09adc3f20c3eacd24cb88&stat=instructions%3Au Assisted-by: codex --- llvm/lib/IR/LLVMContextImpl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index b7645a25989e8..25133f0bbb775 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -339,6 +339,8 @@ template <> struct MDNodeKeyImpl { } unsigned getHashValue() const { + uint64_t LineColumnAndImplicitCode = + Line | (uint64_t(Column) << 32) | (uint64_t(ImplicitCode) << 48); // Hashing AtomGroup and AtomRank substantially impacts performance whether // Key Instructions is enabled or not. We can't detect whether it's enabled // here cheaply; avoiding hashing zero values is a good approximation. This @@ -347,9 +349,9 @@ template <> struct MDNodeKeyImpl { // outweighed by the overall compile time savings by performing this check. // * (hash_combine(x) != hash_combine(x, 0)) if (AtomGroup || AtomRank) - return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode, - AtomGroup, (uint8_t)AtomRank); - return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode); + return hash_combine(LineColumnAndImplicitCode, Scope, InlinedAt, + AtomGroup | (uint64_t(AtomRank) << 61)); + return hash_combine(LineColumnAndImplicitCode, Scope, InlinedAt); } }; From 422678d1eeb23979eddc8be607d411bba6c9ad27 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Mon, 11 May 2026 11:43:12 +0530 Subject: [PATCH 218/538] [LoopFusion] Remove SCEV-based dependence analysis path (#195864) Loop Fusion has used Dependence Analysis (DA) as the default dependence check since the option default was flipped in #187309. 
The SCEV-based strategy and the combined "all" mode were retained only for fallback and experimentation, with a comment noting that the SCEV code would be removed in a follow-up. This patch removes the SCEV-based dependence path and the now-unused selector machinery. Fixes #194821. Assisted by Cursor. --- llvm/lib/Transforms/Scalar/LoopFuse.cpp | 254 +++++------------- .../Transforms/LoopFusion/loop_invariant.ll | 9 +- 2 files changed, 61 insertions(+), 202 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 6de3fc0174d7d..54ab6de2f9ead 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -53,7 +53,6 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Verifier.h" @@ -102,23 +101,6 @@ STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); STATISTIC(NumDA, "DA checks passed"); -enum FusionDependenceAnalysisChoice { - FUSION_DEPENDENCE_ANALYSIS_SCEV, - FUSION_DEPENDENCE_ANALYSIS_DA, - FUSION_DEPENDENCE_ANALYSIS_ALL, -}; - -static cl::opt FusionDependenceAnalysis( - "loop-fusion-dependence-analysis", - cl::desc("Which dependence analysis should loop fusion use?"), - cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", - "Use the scalar evolution interface"), - clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", - "Use the dependence analysis interface"), - clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", - "Use all available analyses")), - cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_DA)); - static cl::opt FusionPeelMaxCount( "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, cl::desc("Max number of iterations to be peeled 
from a loop, such that " @@ -1123,190 +1105,82 @@ struct LoopFuser { return true; } - /// Rewrite all additive recurrences in a SCEV to use a new loop. - class AddRecLoopReplacer : public SCEVRewriteVisitor { - public: - AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, - bool UseMax = true) - : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), - NewL(NewL) {} - - const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { - const Loop *ExprL = Expr->getLoop(); - SmallVector Operands; - if (ExprL == &OldL) { - append_range(Operands, Expr->operands()); - return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); - } - - if (OldL.contains(ExprL)) { - bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); - if (!UseMax || !Pos || !Expr->isAffine()) { - Valid = false; - return Expr; - } - return visit(Expr->getStart()); - } - - for (SCEVUse Op : Expr->operands()) - Operands.push_back(visit(Op)); - return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); - } - - bool wasValidSCEV() const { return Valid; } - - private: - bool Valid, UseMax; - const Loop &OldL, &NewL; - }; - - /// Return false if the access functions of \p I0 and \p I1 could cause - /// a negative dependence. 
- bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, - Instruction &I1, bool EqualIsInvalid) { - Value *Ptr0 = getLoadStorePointerOperand(&I0); - Value *Ptr1 = getLoadStorePointerOperand(&I1); - if (!Ptr0 || !Ptr1) - return false; - - const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); - const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); -#ifndef NDEBUG - if (VerboseFusionDebugging) - LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " - << *SCEVPtr1 << "\n"); -#endif - AddRecLoopReplacer Rewriter(SE, L0, L1); - SCEVPtr0 = Rewriter.visit(SCEVPtr0); -#ifndef NDEBUG - if (VerboseFusionDebugging) - LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 - << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); -#endif - if (!Rewriter.wasValidSCEV()) - return false; - - // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by - // L0) and the other is not. We could check if it is monotone and test - // the beginning and end value instead. - - BasicBlock *L0Header = L0.getHeader(); - auto HasNonLinearDominanceRelation = [&](const SCEV *S) { - const SCEVAddRecExpr *AddRec = dyn_cast(S); - if (!AddRec) - return false; - return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && - !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); - }; - if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) - return false; - - ICmpInst::Predicate Pred = - EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; - bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); -#ifndef NDEBUG - if (VerboseFusionDebugging) - LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 - << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 - << "\n"); -#endif - return IsAlwaysGE; - } - /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in - /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses - /// specified by @p DepChoice are used to determine this. 
+ /// @p L1) allow loop fusion of @p L0 and @p L1. bool dependencesAllowFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, Instruction &I0, - Instruction &I1, bool AnyDep, - FusionDependenceAnalysisChoice DepChoice) { + Instruction &I1) { #ifndef NDEBUG if (VerboseFusionDebugging) { - LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " - << DepChoice << "\n"); + LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << "\n"); } #endif - switch (DepChoice) { - case FUSION_DEPENDENCE_ANALYSIS_SCEV: - return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); - case FUSION_DEPENDENCE_ANALYSIS_DA: { - auto DepResult = DI.depends(&I0, &I1); - if (!DepResult) - return true; + auto DepResult = DI.depends(&I0, &I1); + if (!DepResult) + return true; #ifndef NDEBUG - if (VerboseFusionDebugging) { - LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); - dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " - << (DepResult->isOrdered() ? "true" : "false") - << "]\n"); - LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() - << "\n"); - } + if (VerboseFusionDebugging) { + LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); + dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " + << (DepResult->isOrdered() ? "true" : "false") + << "]\n"); + LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() + << "\n"); + } #endif - unsigned Levels = DepResult->getLevels(); - unsigned SameSDLevels = DepResult->getSameSDLevels(); - unsigned CurLoopLevel = FC0.L->getLoopDepth(); - - // Check if DA is missing info regarding the current loop level - if (CurLoopLevel > Levels + SameSDLevels) - return false; - - // Iterating over the outer levels. - for (unsigned Level = 1; Level <= std::min(CurLoopLevel - 1, Levels); - ++Level) { - unsigned Direction = DepResult->getDirection(Level, false); - - // Check if the direction vector does not include equality. 
If an outer - // loop has a non-equal direction, outer indicies are different and it - // is safe to fuse. - if (!(Direction & Dependence::DVEntry::EQ)) { - LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal acceses in the " - "outer loops\n"); - NumDA++; - return true; - } - } + unsigned Levels = DepResult->getLevels(); + unsigned SameSDLevels = DepResult->getSameSDLevels(); + unsigned CurLoopLevel = FC0.L->getLoopDepth(); - assert(CurLoopLevel > Levels && "Fusion candidates are not separated"); + // Check if DA is missing info regarding the current loop level + if (CurLoopLevel > Levels + SameSDLevels) + return false; - if (DepResult->isScalar(CurLoopLevel, true) && !DepResult->isAnti()) { - LLVM_DEBUG(dbgs() << "Safe to fuse due to a loop-invariant non-anti " - "dependency\n"); + // Iterating over the outer levels. + for (unsigned Level = 1; Level <= std::min(CurLoopLevel - 1, Levels); + ++Level) { + unsigned Direction = DepResult->getDirection(Level, false); + + // Check if the direction vector does not include equality. If an outer + // loop has a non-equal direction, outer indicies are different and it + // is safe to fuse. + if (!(Direction & Dependence::DVEntry::EQ)) { + LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal acceses in the " + "outer loops\n"); NumDA++; return true; } + } - unsigned CurDir = DepResult->getDirection(CurLoopLevel, true); - - // Check if the direction vector does not include greater direction. In - // that case, the dependency is not a backward loop-carried and is legal - // to fuse. For example here we have a forward dependency - // for (int i = 0; i < n; i++) - // A[i] = ...; - // for (int i = 0; i < n; i++) - // ... 
= A[i-1]; - if (!(CurDir & Dependence::DVEntry::GT)) { - LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " - "dependency\n"); - NumDA++; - return true; - } + assert(CurLoopLevel > Levels && "Fusion candidates are not separated"); - if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) - LLVM_DEBUG( - dbgs() << "TODO: Implement pred/succ dependence handling!\n"); - - return false; + if (DepResult->isScalar(CurLoopLevel, true) && !DepResult->isAnti()) { + LLVM_DEBUG(dbgs() << "Safe to fuse due to a loop-invariant non-anti " + "dependency\n"); + NumDA++; + return true; } - case FUSION_DEPENDENCE_ANALYSIS_ALL: - return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, - FUSION_DEPENDENCE_ANALYSIS_SCEV) || - dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, - FUSION_DEPENDENCE_ANALYSIS_DA); + unsigned CurDir = DepResult->getDirection(CurLoopLevel, true); + + // Check if the direction vector does not include greater direction. In + // that case, the dependency is not a backward loop-carried and is legal + // to fuse. For example here we have a forward dependency + // for (int i = 0; i < n; i++) + // A[i] = ...; + // for (int i = 0; i < n; i++) + // ... = A[i-1]; + if (!(CurDir & Dependence::DVEntry::GT)) { + LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " + "dependency\n"); + NumDA++; + return true; } - llvm_unreachable("Unknown fusion dependence analysis choice!"); + if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) + LLVM_DEBUG(dbgs() << "TODO: Implement pred/succ dependence handling!\n"); + + return false; } /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 
@@ -1319,30 +1193,22 @@ struct LoopFuser { for (Instruction *WriteL0 : FC0.MemWrites) { for (Instruction *WriteL1 : FC1.MemWrites) - if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, - /* AnyDep */ false, - FusionDependenceAnalysis)) { + if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1)) { return false; } for (Instruction *ReadL1 : FC1.MemReads) - if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, - /* AnyDep */ false, - FusionDependenceAnalysis)) { + if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1)) { return false; } } for (Instruction *WriteL1 : FC1.MemWrites) { for (Instruction *WriteL0 : FC0.MemWrites) - if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, - /* AnyDep */ false, - FusionDependenceAnalysis)) { + if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1)) { return false; } for (Instruction *ReadL0 : FC0.MemReads) - if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, - /* AnyDep */ false, - FusionDependenceAnalysis)) { + if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1)) { return false; } } diff --git a/llvm/test/Transforms/LoopFusion/loop_invariant.ll b/llvm/test/Transforms/LoopFusion/loop_invariant.ll index 87e2162b3c6f5..7de4e7122464a 100644 --- a/llvm/test/Transforms/LoopFusion/loop_invariant.ll +++ b/llvm/test/Transforms/LoopFusion/loop_invariant.ll @@ -1,13 +1,10 @@ ; REQUIRES: asserts -; RUN: opt -S -passes=loop-fusion -loop-fusion-dependence-analysis=da -debug-only=loop-fusion -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DA -; RUN: opt -S -passes=loop-fusion -loop-fusion-dependence-analysis=scev -debug-only=loop-fusion -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCEV +; RUN: opt -S -passes=loop-fusion -debug-only=loop-fusion -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DA define void @loop_invariant(i32 %N) { ; CHECK-DA: Performing Loop Fusion on function loop_invariant ; CHECK-DA: Safe to fuse due to a loop-invariant non-anti dependency -; 
CHECK-SCEV: Performing Loop Fusion on function loop_invariant -; CHECK-SCEV: Fusion done ; pre1: %ptr = alloca i32, align 4 @@ -34,13 +31,9 @@ exit: ret void } -; TODO: improve SCEV check to detect the loop-invariant anti dependence with -; scalar access and prevent fusion. define void @anti_loop_invariant(i32 %N) { ; CHECK-DA: Performing Loop Fusion on function anti_loop_invariant ; CHECK-DA: Memory dependencies do not allow fusion! -; CHECK-SCEV: Performing Loop Fusion on function anti_loop_invariant -; XFAIL-CHECK-SCEV: Memory dependencies do not allow fusion! ; pre1: %ptr = alloca i32, align 4 From 1a683f68b065c9cfec76cf6dd051c4f177fc0086 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Mon, 11 May 2026 09:13:59 +0300 Subject: [PATCH 219/538] [clang-tidy][NFC] Fix tests on 32bit ARM (#196873) Should fix https://github.com/llvm/llvm-project/pull/191386#issuecomment-4408294981. --- .../test/clang-tidy/checkers/readability/redundant-casting.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp index 13be4192f49d1..6ce2b026fbcce 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp @@ -196,7 +196,7 @@ void testBinaryOperatorRedundantCasting() { const auto diff_types_operands4 { static_cast(static_cast(3) + 2) }; - // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: redundant explicit casting to the same type 'size_t' (aka 'unsigned long{{( long)?}}') as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: redundant explicit casting to the same type 'size_t' (aka 'unsigned {{(int|long( long)?)}}') as the sub-expression, remove this casting [readability-redundant-casting] // CHECK-FIXES: (static_cast(3) + 2) // 
CHECK-FIXES-IMPLICIT: static_cast(static_cast(3) + 2) From 4ef4f900f0f42678f24bbcba79549217ace966ca Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Mon, 11 May 2026 06:34:30 +0000 Subject: [PATCH 220/538] [libc] Fix partial multi-byte write detection in File (#196402) File::write_unlocked(const wchar_t*, size_t) checked 'write_res.value < 1' after writing a converted UTF-8 sequence. For multi-byte characters, a short platform write (e.g. 2 of 3 bytes for a 3-byte character) passed this check and was counted as a successful write. The output stream would then contain an incomplete UTF-8 sequence with no error reported to the caller. Changed the check to 'write_res.value < char_size' and set the error indicator on the stream when it triggers. Added a regression test using a mock File subclass that limits platform_write to 2 bytes per call, simulating short writes on pipes and sockets. Assisted-by: Automated tooling, human reviewed. --------- Co-authored-by: Michael Jones --- libc/src/__support/File/file.cpp | 6 +- libc/test/src/__support/File/file_test.cpp | 86 ++++++++++++++++++++++ 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp index 1499ab56578a9..7c1bc103ca7aa 100644 --- a/libc/src/__support/File/file.cpp +++ b/libc/src/__support/File/file.cpp @@ -590,8 +590,10 @@ FileIOResult File::write_unlocked(const wchar_t *ws, size_t len) { auto write_res = write_unlocked_impl(buffer, char_size); if (write_res.has_error()) return {written, write_res.error}; - if (write_res.value < 1) - return {written, 0}; + if (write_res.value < char_size) { + err = true; + return {written, EIO}; + } ++written; } return {written, 0}; diff --git a/libc/test/src/__support/File/file_test.cpp b/libc/test/src/__support/File/file_test.cpp index f3eb8634ce0be..6c7855225a52a 100644 --- a/libc/test/src/__support/File/file_test.cpp +++ b/libc/test/src/__support/File/file_test.cpp @@ -772,3 +772,89 @@ 
TEST(LlvmLibcFileTest, UngetwcWEOF) { ASSERT_EQ(f->close(), 0); } + +// A File subclass with a platform_write that simulates short writes. +// This models the behavior of write(2) on pipes, sockets, or FIFOs where +// the kernel may write fewer bytes than requested. +class ShortWriteFile : public File { + static constexpr size_t SIZE = 512; + size_t pos; + char str[SIZE] = {0}; + size_t max_write; + + static FileIOResult short_write(LIBC_NAMESPACE::File *f, const void *data, + size_t len) { + ShortWriteFile *sf = static_cast(f); + // Simulate a short write: write at most max_write bytes per call. + size_t to_write = len < sf->max_write ? len : sf->max_write; + for (size_t i = 0; i < to_write && sf->pos < SIZE; ++i, ++sf->pos) + sf->str[sf->pos] = reinterpret_cast(data)[i]; + return to_write; + } + + static FileIOResult short_read(LIBC_NAMESPACE::File *f, void *data, + size_t len) { + ShortWriteFile *sf = static_cast(f); + size_t i = 0; + for (i = 0; i < len && sf->pos < SIZE; ++i) + reinterpret_cast(data)[i] = sf->str[sf->pos + i]; + sf->pos += i; + return i; + } + + static ErrorOr short_seek(LIBC_NAMESPACE::File *f, off_t offset, + int whence) { + ShortWriteFile *sf = static_cast(f); + if (whence == SEEK_SET) + sf->pos = offset; + if (whence == SEEK_CUR) + sf->pos += offset; + if (whence == SEEK_END) + sf->pos = SIZE + offset; + return sf->pos; + } + + static int short_close(LIBC_NAMESPACE::File *f) { + delete reinterpret_cast(f); + return 0; + } + +public: + explicit ShortWriteFile(char *buffer, size_t buflen, int bufmode, bool owned, + ModeFlags modeflags, size_t max_write_bytes) + : LIBC_NAMESPACE::File(&short_write, &short_read, &short_seek, + &short_close, reinterpret_cast(buffer), + buflen, bufmode, owned, modeflags), + pos(0), max_write(max_write_bytes) {} + + void reset() { pos = 0; } + size_t get_pos() const { return pos; } + char *get_str() { return str; } +}; + +// Verify that a short platform_write of a multi-byte UTF-8 character is +// detected and 
reported as a failure. POSIX write(2) may perform short +// writes on pipes, sockets, and FIFOs, so a 3-byte character could have +// only 2 bytes accepted by the kernel. +TEST(LlvmLibcFileTest, PartialWideCharWriteDetected) { + LIBC_NAMESPACE::AllocChecker ac; + // Unbuffered so writes go directly to platform_write, limited to 2 bytes. + ShortWriteFile *f = new (ac) ShortWriteFile( + nullptr, 0, _IONBF, true, LIBC_NAMESPACE::File::mode_flags("w"), + /*max_write_bytes=*/2); + ASSERT_FALSE(f == nullptr); + + // € (U+20AC) encodes to 3 UTF-8 bytes: 0xE2 0x82 0xAC. + // With max_write=2, only 2 of the 3 bytes will be accepted. + const wchar_t euro = L'€'; + auto result = f->write(&euro, 1); + + // The incomplete character must not be counted as written. + EXPECT_TRUE(result.has_error()); + EXPECT_EQ(result.value, size_t(0)); + + // The error indicator on the stream should be set. + EXPECT_TRUE(f->error()); + + ASSERT_EQ(f->close(), 0); +} From 8a230212a560a60bef18e576ad62b0554158b3b3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 11 May 2026 08:44:03 +0200 Subject: [PATCH 221/538] [AA] No synchronization effects for never-escaping identified local (#193939) Fences and other synchronizing operations (such as atomic accesses stronger than monotonic) are modelled as reading and writing all memory, in order to enforce their implied ordering constraints. Currently, this happens even for identified function locals that do not escape. This patch excludes those objects. Notably, we can *not* reason based on captures-before here, because the synchronizing operation still has an effect even if the object only escapes *later*. The hope here is that with this restriction in place, it may be viable to respect potential synchronization inside non-nosync function calls. 
--- llvm/lib/Analysis/AliasAnalysis.cpp | 39 +++++++++++----- llvm/test/Analysis/BasicAA/atomics.ll | 46 ++++++++++--------- .../test/Analysis/MemorySSA/atomic-clobber.ll | 2 +- .../Transforms/DeadStoreElimination/fence.ll | 8 ---- llvm/test/Transforms/GVN/fence.ll | 4 +- .../GVN/simplify-icf-cache-invalidation.ll | 2 - llvm/test/Transforms/LICM/atomics.ll | 6 +-- 7 files changed, 58 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 1449a54d1de2b..ffb2dc0f4e041 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -458,12 +458,34 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) { // Helper method implementation //===----------------------------------------------------------------------===// +/// Get ModRefInfo for a synchronizing operation, such as a fence or stronger +/// than monotonic atomic load/store. +static ModRefInfo getSyncEffects(AAResults *AA, const MemoryLocation &Loc, + AAQueryInfo &AAQI) { + if (!Loc.Ptr) + return ModRefInfo::ModRef; + + // If the location is *never* captured, it cannot be affected by + // synchronizing operations. However, we cannot ignore locations that are + // only captured after the operation, as the synchronization may still have + // an effect if the object is only captured *later*. As such, set I to null + // and ReturnCaptures to true here. + const Value *Obj = getUnderlyingObject(Loc.Ptr); + if (capturesNothing(AAQI.CA->getCapturesBefore( + Obj, /*I=*/nullptr, /*OrAt=*/true, /*ReturnCaptures=*/true))) + return ModRefInfo::NoModRef; + + // If Loc is a constant memory location, the synchronization operation + // definitely could not modify it. + return AA->getModRefInfoMask(Loc); +} + ModRefInfo AAResults::getModRefInfo(const LoadInst *L, const MemoryLocation &Loc, AAQueryInfo &AAQI) { // Be conservative in the face of atomic. 
if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered)) - return ModRefInfo::ModRef; + return getSyncEffects(this, Loc, AAQI); // If the load address doesn't alias the given address, it doesn't read // or write the specified memory. @@ -481,7 +503,7 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S, AAQueryInfo &AAQI) { // Be conservative in the face of atomic. if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered)) - return ModRefInfo::ModRef; + return getSyncEffects(this, Loc, AAQI); if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(S), Loc, AAQI, S); @@ -515,14 +537,9 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *F, return ModRefInfo::NoModRef; } - // Apply the ModRef mask. This ensures that if Loc is a constant memory - // location, we take into account the fact that the fence definitely could - // not modify the memory location. - if (!isNoModRef(Result)) - Result &= getModRefInfoMask(Loc); - - return Result; + return Result & getSyncEffects(this, Loc, AAQI); } + return ModRefInfo::ModRef; } @@ -576,7 +593,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, AAQueryInfo &AAQI) { // Acquire/Release cmpxchg has properties that matter for arbitrary addresses. if (isStrongerThanMonotonic(CX->getSuccessOrdering())) - return ModRefInfo::ModRef; + return getSyncEffects(this, Loc, AAQI); if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(CX), Loc, AAQI, CX); @@ -594,7 +611,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, AAQueryInfo &AAQI) { // Acquire/Release atomicrmw has properties that matter for arbitrary addresses. 
if (isStrongerThanMonotonic(RMW->getOrdering())) - return ModRefInfo::ModRef; + return getSyncEffects(this, Loc, AAQI); if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(RMW), Loc, AAQI, RMW); diff --git a/llvm/test/Analysis/BasicAA/atomics.ll b/llvm/test/Analysis/BasicAA/atomics.ll index db0417c758e92..1101466fe7055 100644 --- a/llvm/test/Analysis/BasicAA/atomics.ll +++ b/llvm/test/Analysis/BasicAA/atomics.ll @@ -8,29 +8,29 @@ declare noalias ptr @malloc(i64) ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 monotonic, align 4 ; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> fence release +; CHECK: NoModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: Both ModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 
4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %x acquire, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %6 = load atomic i32, ptr %x acquire, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> fence seq_cst +; CHECK: NoModRef: Ptr: i32* %a <-> fence seq_cst ; CHECK: Both ModRef: Ptr: i32* %x <-> fence seq_cst -; CHECK: Both ModRef: Ptr: i32* %a <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %9 = load atomic i32, ptr %x seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %9 = load atomic i32, ptr %x seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %9 = load atomic i32, ptr %x seq_cst, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x seq_cst, align 4 define void @alloca_no_escape(ptr %x) { %a = alloca i32 @@ -83,15 +83,15 @@ define void @alloca_escape_after(ptr %x) { } ; CHECK-LABEL: Function: noalias_no_escape: -; CHECK: Both ModRef: Ptr: i32* %a <-> fence release +; CHECK: NoModRef: Ptr: i32* %a <-> fence release ; 
CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x acquire, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 define void @noalias_no_escape(ptr noalias %a, ptr %x) { store i32 0, ptr %a @@ -125,21 +125,23 @@ define void @noalias_escape_after(ptr noalias %a, ptr %x) { load atomic i32, ptr %x acquire, align 4 store atomic i32 0, ptr %x release, align 4 + call void @escape(ptr %a) + ret void } ; CHECK-LABEL: Function: malloc_no_escape: ; CHECK: Both ModRef: Ptr: i32* %a <-> %a = call ptr @malloc(i64 4) ; CHECK: Both ModRef: Ptr: i32* %x <-> %a = call ptr @malloc(i64 4) -; CHECK: Both ModRef: Ptr: i32* %a <-> fence release +; CHECK: NoModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, 
align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x acquire, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 define void @malloc_no_escape(ptr %x) { %a = call ptr @malloc(i64 4) diff --git a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll index 326ec8b15283d..86708ecbc58fd 100644 --- a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll +++ b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll @@ -106,7 +106,7 @@ define void @seq_cst_clobber(ptr noalias %a, ptr noalias %b) { ; If AA gets more aggressive, we can find another way. ; ; CHECK-LABEL: define void @check_aa_is_sane -define void @check_aa_is_sane(ptr noalias %a, ptr noalias %b) { +define void @check_aa_is_sane(ptr noalias %a, ptr %b) { ; CHECK: 1 = MemoryDef(liveOnEntry) ; CHECK-NEXT: cmpxchg ptr %a, i32 0, i32 1 acquire acquire cmpxchg ptr %a, i32 0, i32 1 acquire acquire diff --git a/llvm/test/Transforms/DeadStoreElimination/fence.ll b/llvm/test/Transforms/DeadStoreElimination/fence.ll index b619b0035ce03..3c02d715eb94d 100644 --- a/llvm/test/Transforms/DeadStoreElimination/fence.ll +++ b/llvm/test/Transforms/DeadStoreElimination/fence.ll @@ -54,7 +54,6 @@ define void @test2(ptr %addr.i) { ret void } -; TODO: ; We DSE stack alloc'ed and byval locations, in the presence of fences. 
; Fence does not make an otherwise thread local store visible. ; Right now the DSE in presence of fence is only done in end blocks (with no successors), @@ -63,7 +62,6 @@ define void @test2(ptr %addr.i) { define void @test3(ptr byval(i32) %addr.i) { ; CHECK-LABEL: define void @test3( ; CHECK-SAME: ptr byval(i32) [[ADDR_I:%.*]]) { -; CHECK-NEXT: store i32 5, ptr [[ADDR_I]], align 4 ; CHECK-NEXT: fence release ; CHECK-NEXT: ret void ; @@ -76,13 +74,11 @@ declare void @foo(ptr nocapture %p) declare noalias ptr @malloc(i32) -; TODO: ; DSE of stores in locations allocated through library calls. define void @test_nocapture() { ; CHECK-LABEL: define void @test_nocapture() { ; CHECK-NEXT: [[M:%.*]] = call ptr @malloc(i32 24) ; CHECK-NEXT: call void @foo(ptr [[M]]) -; CHECK-NEXT: store i8 4, ptr [[M]], align 1 ; CHECK-NEXT: fence release ; CHECK-NEXT: ret void ; @@ -93,14 +89,10 @@ define void @test_nocapture() { ret void } - -; TODO: ; This is a full fence, but it does not make a thread local store visible. ; We can DSE the store in presence of the fence. define void @fence_seq_cst() { ; CHECK-LABEL: define void @fence_seq_cst() { -; CHECK-NEXT: [[P1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 0, ptr [[P1]], align 4 ; CHECK-NEXT: fence seq_cst ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/GVN/fence.ll b/llvm/test/Transforms/GVN/fence.ll index f2b1538843681..16c6b5143703d 100644 --- a/llvm/test/Transforms/GVN/fence.ll +++ b/llvm/test/Transforms/GVN/fence.ll @@ -37,9 +37,9 @@ define i32 @test2(ptr %addr.i) { ; ordering property (though it is that too), but a liveness ; property. We expect to eventually see the value of store by ; another thread when spinning on that location. 
-define i32 @test3(ptr noalias %addr.i, ptr noalias %otheraddr) { +define i32 @test3(ptr %addr.i) { ; CHECK-LABEL: define i32 @test3 -; CHECK-SAME: (ptr noalias [[ADDR_I:%.*]], ptr noalias [[OTHERADDR:%.*]]) { +; CHECK-SAME: (ptr [[ADDR_I:%.*]]) { ; CHECK-NEXT: fence acquire ; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[ADDR_I]], align 4 ; CHECK-NEXT: fence acquire diff --git a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll index f4a4155e94f80..fc85048ebdacf 100644 --- a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll +++ b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll @@ -28,8 +28,6 @@ define hidden void @eggs(ptr %arg, i1 %arg2, ptr %arg3, i32 %arg4, ptr %arg5) un ; CHECK-NEXT: br label %[[BB9]] ; CHECK: [[BB9]]: ; CHECK-NEXT: tail call void @quux(ptr [[ARG]], i1 [[ARG2]]) -; CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq ptr [[TMP17]], null ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/LICM/atomics.ll b/llvm/test/Transforms/LICM/atomics.ll index 2b3435ba2e7ef..1d29b038c9a53 100644 --- a/llvm/test/Transforms/LICM/atomics.ll +++ b/llvm/test/Transforms/LICM/atomics.ll @@ -239,6 +239,7 @@ define i32 @test7b(ptr nocapture noalias %x, ptr nocapture %y, ptr noalias nocap ; CHECK-LABEL: define i32 @test7b( ; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]], ptr noalias captures(none) [[Z:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 5, ptr [[X]], align 4 ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[VALA:%.*]] = load atomic i32, ptr [[Y]] monotonic, align 4 @@ -247,7 +248,6 @@ define i32 @test7b(ptr nocapture noalias %x, ptr nocapture %y, ptr noalias nocap ; CHECK: [[END]]: ; CHECK-NEXT: [[VALA_LCSSA1:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ] ; CHECK-NEXT: [[VALA_LCSSA:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ] -; CHECK-NEXT: store 
i32 5, ptr [[X]], align 4 ; CHECK-NEXT: store atomic i32 [[VALA_LCSSA1]], ptr [[Z]] unordered, align 4 ; CHECK-NEXT: ret i32 [[VALA_LCSSA]] ; @@ -266,9 +266,9 @@ end: } -define i32 @test8(ptr nocapture noalias %x, ptr nocapture %y) { +define i32 @test8(ptr nocapture %x, ptr nocapture noalias %y) { ; CHECK-LABEL: define i32 @test8( -; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) { +; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr noalias captures(none) [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: From 58936f7103663d04d6be4f0c9bf61be2ec6077ae Mon Sep 17 00:00:00 2001 From: "forking-google-bazel-bot[bot]" <265904573+forking-google-bazel-bot[bot]@users.noreply.github.com> Date: Mon, 11 May 2026 08:45:26 +0200 Subject: [PATCH 222/538] [Bazel] Fixes ce6605a (#196880) This fixes ce6605a4931a294bd17b5e56658b701b18d2bcf9. Co-authored-by: Google Bazel Bot --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 77da1838c6f24..63cc42179055a 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6401,7 +6401,11 @@ td_library( td_library( name = "NVVMOpsTdFiles", - srcs = ["include/mlir/Dialect/LLVMIR/NVVMOps.td"], + srcs = [ + "include/mlir/Dialect/LLVMIR/NVVMDialect.td", + "include/mlir/Dialect/LLVMIR/NVVMEnums.td", + "include/mlir/Dialect/LLVMIR/NVVMOps.td", + ], includes = ["include"], deps = [ ":BasicPtxBuilderIntTdFiles", From 23b7a131c27a37da8ce465c82e599b75c10dfba6 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 11 May 2026 09:06:57 +0200 Subject: [PATCH 223/538] [clang][NFC] Remove alignment checks from test/CodeGen/c-strings.c (#196501) and re-enable it on more targets. I don't think this test was intended to check for alignment. 
Those expectations were added as part of FileCheck-izing the test in e29dadb6403c8b0d3658f9bbbe2f5fbde5431fdb and we've been working around them or xfailing the test since. --- clang/test/CodeGen/c-strings.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/clang/test/CodeGen/c-strings.c b/clang/test/CodeGen/c-strings.c index 31c438fd8ff2e..60d434f37a20d 100644 --- a/clang/test/CodeGen/c-strings.c +++ b/clang/test/CodeGen/c-strings.c @@ -4,37 +4,15 @@ // Should be 3 hello strings, two global (of different sizes), the rest are // shared. -// CHECK: @align = {{(dso_local )?}}global i8 [[ALIGN:[0-9]+]] // ITANIUM: @.str = private unnamed_addr constant [6 x i8] c"hello\00" -// MSABI: @"??_C@_05CJBACGMB@hello?$AA@" = linkonce_odr dso_local unnamed_addr constant [6 x i8] c"hello\00", comdat, align 1 +// MSABI: @"??_C@_05CJBACGMB@hello?$AA@" = linkonce_odr dso_local unnamed_addr constant [6 x i8] c"hello\00", comdat // ITANIUM: @f1.x = internal global ptr @.str // MSABI: @f1.x = internal global ptr @"??_C@_05CJBACGMB@hello?$AA@" -// CHECK: @f2.x = internal global [6 x i8] c"hello\00", align [[ALIGN]] -// CHECK: @f3.x = internal global [8 x i8] c"hello\00\00\00", align [[ALIGN]] +// CHECK: @f2.x = internal global [6 x i8] c"hello\00" +// CHECK: @f3.x = internal global [8 x i8] c"hello\00\00\00" // ITANIUM: @f4.x = internal global %struct.s { ptr @.str } // MSABI: @f4.x = internal global %struct.s { ptr @"??_C@_05CJBACGMB@hello?$AA@" } -// CHECK: @x = {{(dso_local )?}}global [3 x i8] c"ola", align [[ALIGN]] - -// XFAIL: target=aarch64-{{.*}}-windows-msvc, target=arm64ec-{{.*}}-windows-msvc -// Arm64 in MSVC mode aligns arrays to either 32-bit or 64-bit boundaries, which fails -// various checks above, since ALIGN is derived from the alignment of a single -// i8, which is still 1. - -// XFAIL: target=hexagon-{{.*}} -// Hexagon aligns arrays of size 8+ bytes to a 64-bit boundary, which -// fails the check for "@f3.x = ... 
align [ALIGN]", since ALIGN is derived -// from the alignment of a single i8, which is still 1. - -// XFAIL: target=csky{{.*}} -// CSKY aligns arrays of size 4+ bytes to a 32-bit boundary, which -// fails the check for "@f2.x = ... align [ALIGN]", since ALIGN is derived -// from the alignment of a single i8, which is still 1. - -#if defined(__s390x__) -unsigned char align = 2; -#else -unsigned char align = 1; -#endif +// CHECK: @x = {{(dso_local )?}}global [3 x i8] c"ola" void bar(const char *); From 52b6343c5380ec5e4649f18542dfcdb35c58f98e Mon Sep 17 00:00:00 2001 From: Chaitanya Date: Mon, 11 May 2026 12:43:52 +0530 Subject: [PATCH 224/538] [CIR][AMDGPU] Add lowering for amdgcn ds swizzle builtin. (#196011) Upstreaming clangIR PR: https://github.com/llvm/clangir/pull/2052 This PR adds support for lowering of _builtin_amdgcn_ds_swizzle* amdgpu builtin to clangIR. --- clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp | 8 +++++++- clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp index 929cdf8e88789..04ab1c29b0d63 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp @@ -161,7 +161,13 @@ CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId, .getResult(); return result; } - case AMDGPU::BI__builtin_amdgcn_ds_swizzle: + case AMDGPU::BI__builtin_amdgcn_ds_swizzle: { + mlir::Value src0 = emitScalarExpr(expr->getArg(0)); + mlir::Value src1 = emitScalarExpr(expr->getArg(1)); + return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()), + "amdgcn.ds.swizzle", src0.getType(), + mlir::ValueRange{src0, src1}); + } case AMDGPU::BI__builtin_amdgcn_mov_dpp8: case AMDGPU::BI__builtin_amdgcn_mov_dpp: case AMDGPU::BI__builtin_amdgcn_update_dpp: { diff --git a/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip b/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip index 
d374479e6182e..4a61fde7aa90c 100644 --- a/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip +++ b/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip @@ -63,3 +63,11 @@ __device__ void test_div_fmas_f32(double* out, float a, float b, float c, int d) __device__ void test_div_fmas_f64(double* out, double a, double b, double c, int d) { *out = __builtin_amdgcn_div_fmas(a, b, c, d); } + +// CIR-LABEL: @_Z15test_ds_swizzlePii +// CIR: cir.call_llvm_intrinsic "amdgcn.ds.swizzle" {{.*}} : (!s32i, !s32i) -> !s32i +// LLVM: define{{.*}} void @_Z15test_ds_swizzlePii +// LLVM: call{{.*}} i32 @llvm.amdgcn.ds.swizzle(i32 %{{.*}}, i32 32) +__device__ void test_ds_swizzle(int* out, int a) { + *out = __builtin_amdgcn_ds_swizzle(a, 32); +} From 3eab15a63038b41e89dee38543fa125365a3b18a Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Mon, 11 May 2026 08:15:12 +0100 Subject: [PATCH 225/538] [lldb] Fix TestDelayedBreakpoint on ARM Thumb (#196888) The original address used for the "fake breakpoint" is not valid in Thumb mode. To be safe, change it to have 0's in the LSBs. --- .../breakpoint/delayed_breakpoints/TestDelayedBreakpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/breakpoint/delayed_breakpoints/TestDelayedBreakpoint.py b/lldb/test/API/functionalities/breakpoint/delayed_breakpoints/TestDelayedBreakpoint.py index 0d55e4e03f2e9..e6a6390f8f1b6 100644 --- a/lldb/test/API/functionalities/breakpoint/delayed_breakpoints/TestDelayedBreakpoint.py +++ b/lldb/test/API/functionalities/breakpoint/delayed_breakpoints/TestDelayedBreakpoint.py @@ -55,7 +55,7 @@ def test_eager_breakpoints(self): bp1 = target.BreakpointCreateByLocation("main.c", 1) self.runCmd("proc plugin packet send BEGIN_EAGER", check=False) # Create an address breakpoint to trigger eager breakpoints. 
- fake_address = 0x1234567 + fake_address = 0x1234560 target.BreakpointCreateByAddress(fake_address) self.runCmd("proc plugin packet send END_EAGER", check=False) From 47495f4934094fab8f991303ec5d94d76b9254fb Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 11 May 2026 09:18:11 +0200 Subject: [PATCH 226/538] [clang][bytecode] Visit `tryEvaluateObjectSize` expr as lvalue (#196010) Just like we do with the first parameter of a regular `__builtin_object_size` call. This still doesn't fix the bigger bos test cases since e.g. ```c++ int NoViableOverloadObjectSize3(void *const p PS(3)) __attribute__((overloadable)) { return __builtin_object_size(p, 3); } void test4(struct Foo *t) { gi = NoViableOverloadObjectSize3(&t[1].t[1]); } ``` is still broken because we don't have special handling for the `&t[1].t[1]` handling here and we can't usually access a one-past-end pointer. --- clang/lib/AST/ByteCode/ByteCodeEmitter.h | 1 + clang/lib/AST/ByteCode/Compiler.cpp | 9 +++++++++ clang/lib/AST/ByteCode/Compiler.h | 1 + clang/lib/AST/ByteCode/Context.cpp | 2 +- clang/lib/AST/ByteCode/EvalEmitter.cpp | 17 +++++++++++++++-- clang/lib/AST/ByteCode/EvalEmitter.h | 2 ++ .../AST/ByteCode/builtin-object-size-codegen.c | 6 ++++++ 7 files changed, 35 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.h b/clang/lib/AST/ByteCode/ByteCodeEmitter.h index 102ce939c6717..e3aa3c940de47 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.h +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.h @@ -46,6 +46,7 @@ class ByteCodeEmitter { /// Methods implemented by the compiler. 
virtual bool visitFunc(const FunctionDecl *E) = 0; virtual bool visitExpr(const Expr *E, bool DestroyToplevelScope) = 0; + virtual bool visitLValueExpr(const Expr *E, bool DestroyToplevelScope) = 0; virtual bool visitDeclAndReturn(const VarDecl *VD, const Expr *Init, bool ConstantContext) = 0; virtual bool visit(const Expr *E) = 0; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 9a8842bf258f6..faad6e0b4a230 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -5155,6 +5155,15 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { return maybeDestroyLocals() && false; } +template +bool Compiler::visitLValueExpr(const Expr *E, + bool DestroyToplevelScope) { + OptionScope Scope(this, /*NewDiscardResult=*/false, + /*NewInitializing=*/false, /*ToLValue=*/true); + + return this->visitExpr(E, DestroyToplevelScope); +} + template VarCreationState Compiler::visitDecl(const VarDecl *VD) { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index de6ea524897a0..ff5d0c05fe14b 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -253,6 +253,7 @@ class Compiler : public ConstStmtVisitor, bool>, protected: bool visitStmt(const Stmt *S); bool visitExpr(const Expr *E, bool DestroyToplevelScope) override; + bool visitLValueExpr(const Expr *E, bool DestroyToplevelScope) override; bool visitFunc(const FunctionDecl *F) override; bool visitDeclAndReturn(const VarDecl *VD, const Expr *Init, diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 3a8a50f2aeb53..35959715946c3 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -345,7 +345,7 @@ Context::tryEvaluateObjectSize(State &Parent, const Expr *E, unsigned Kind) { std::optional Result; - auto PtrRes = C.interpretAsPointer(E, [&](const Pointer &Ptr) { + auto PtrRes = 
C.interpretAsLValuePointer(E, [&](const Pointer &Ptr) { const Descriptor *DeclDesc = Ptr.getDeclDesc(); if (!DeclDesc) return false; diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 319ef7edd57b9..d3acaa406af51 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -75,14 +75,13 @@ EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD, const Expr *Init, EvaluationResult EvalEmitter::interpretAsPointer(const Expr *E, PtrCallback PtrCB) { - S.setEvalLocation(E->getExprLoc()); this->ConvertResultToRValue = false; this->CheckFullyInitialized = false; this->PtrCB = PtrCB; EvalResult.setSource(E); - if (!this->visitExpr(E, /*DestroyToplevelScope=*/true)) { + if (!this->visitExpr(E, true)) { // EvalResult may already have a result set, but something failed // after that (e.g. evaluating destructors). EvalResult.setInvalid(); @@ -91,6 +90,20 @@ EvaluationResult EvalEmitter::interpretAsPointer(const Expr *E, return std::move(this->EvalResult); } +EvaluationResult EvalEmitter::interpretAsLValuePointer(const Expr *E, + PtrCallback PtrCB) { + S.setEvalLocation(E->getExprLoc()); + this->ConvertResultToRValue = false; + this->CheckFullyInitialized = false; + this->PtrCB = PtrCB; + EvalResult.setSource(E); + + if (!this->visitLValueExpr(E, true)) + EvalResult.setInvalid(); + + return std::move(this->EvalResult); +} + bool EvalEmitter::interpretCall(const FunctionDecl *FD, const Expr *E) { // Add parameters to the parameter map. The values in the ParamOffset don't // matter in this case as reading from them can't ever work. diff --git a/clang/lib/AST/ByteCode/EvalEmitter.h b/clang/lib/AST/ByteCode/EvalEmitter.h index 8f6da7aef422a..ce5825eef3607 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.h +++ b/clang/lib/AST/ByteCode/EvalEmitter.h @@ -42,6 +42,7 @@ class EvalEmitter : public SourceMapper { bool CheckFullyInitialized); /// Interpret the given Expr to a Pointer. 
EvaluationResult interpretAsPointer(const Expr *E, PtrCallback PtrCB); + EvaluationResult interpretAsLValuePointer(const Expr *E, PtrCallback PtrCB); /// Interpret the given expression as if it was in the body of the given /// function, i.e. the parameters of the function are available for use. bool interpretCall(const FunctionDecl *FD, const Expr *E); @@ -61,6 +62,7 @@ class EvalEmitter : public SourceMapper { /// Methods implemented by the compiler. virtual bool visitExpr(const Expr *E, bool DestroyToplevelScope) = 0; + virtual bool visitLValueExpr(const Expr *E, bool DestroyToplevelScope) = 0; virtual bool visitDeclAndReturn(const VarDecl *VD, const Expr *Init, bool ConstantContext) = 0; virtual bool visitFunc(const FunctionDecl *F) = 0; diff --git a/clang/test/AST/ByteCode/builtin-object-size-codegen.c b/clang/test/AST/ByteCode/builtin-object-size-codegen.c index 6aa0485bd65ad..1b2561a89ebba 100644 --- a/clang/test/AST/ByteCode/builtin-object-size-codegen.c +++ b/clang/test/AST/ByteCode/builtin-object-size-codegen.c @@ -35,6 +35,12 @@ // gi = ObjectSize2(&t[1].t[1]); } +void foo2(struct Foo *t) { + // CHECK: call i32 @ObjectSize3(ptr noundef %{{.*}}, i64 noundef 36) + ObjectSize3(&t->t[1]); +} + + /// Used to crash due to the void-typed ArraySubscriptExpr. void foo(void *p) { int i = __builtin_object_size(&p[2], 3); From 0a181a1c0a0f1da569c20b84284154f9c3836d64 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 11 May 2026 00:20:21 -0700 Subject: [PATCH 227/538] Use auto for DenseMap/SmallDenseMap iterator variables. NFC (#196883) To match the prevailing style. 
--- llvm/include/llvm/ADT/SCCIterator.h | 3 +-- llvm/include/llvm/ADT/ScopedHashTable.h | 3 +-- .../llvm/Analysis/IRSimilarityIdentifier.h | 8 +++---- llvm/include/llvm/Analysis/LoopIterator.h | 4 ++-- .../llvm/CodeGen/FunctionLoweringInfo.h | 2 +- .../Transforms/Utils/InstructionWorklist.h | 2 +- llvm/lib/Analysis/CallGraphSCCPass.cpp | 3 +-- llvm/lib/Analysis/ScalarEvolution.cpp | 2 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 2 +- .../CodeGen/AsmPrinter/DebugHandlerBase.cpp | 6 ++--- llvm/lib/CodeGen/CodeGenPrepare.cpp | 3 +-- llvm/lib/CodeGen/MachineCSE.cpp | 2 +- llvm/lib/CodeGen/MachineCombiner.cpp | 3 +-- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 2 +- llvm/lib/CodeGen/RegAllocFast.cpp | 2 +- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 6 ++--- .../SelectionDAG/LegalizeVectorOps.cpp | 2 +- .../CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2 +- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 3 +-- .../SelectionDAG/SelectionDAGBuilder.cpp | 10 ++++---- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 7 +++--- llvm/lib/CodeGen/TailDuplicator.cpp | 9 +++---- .../lib/CodeGen/TwoAddressInstructionPass.cpp | 12 +++++----- llvm/lib/IR/LegacyPassManager.cpp | 14 +++++------ llvm/lib/IR/Value.cpp | 3 +-- llvm/lib/MC/MCRegisterInfo.cpp | 4 ++-- llvm/lib/Target/AArch64/AArch64FastISel.cpp | 3 +-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 12 ++++------ .../AMDGPU/R600OptimizeVectorRegisters.cpp | 3 +-- llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 2 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 2 +- llvm/lib/Target/ARM/ARMFastISel.cpp | 6 ++--- llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 2 +- llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 2 +- llvm/lib/Target/Mips/MipsFastISel.cpp | 6 ++--- llvm/lib/Target/PowerPC/PPCFastISel.cpp | 6 ++--- .../WebAssembly/WebAssemblyFastISel.cpp | 6 ++--- llvm/lib/Target/X86/X86FastISel.cpp | 5 ++-- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/lib/Transforms/IPO/IROutliner.cpp | 24 
+++++++------------ .../Instrumentation/DataFlowSanitizer.cpp | 3 +-- llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 7 +++--- llvm/lib/Transforms/Scalar/GVN.cpp | 4 ++-- llvm/lib/Transforms/Scalar/SROA.cpp | 3 +-- .../Utils/PromoteMemoryToRegister.cpp | 6 ++--- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 3 +-- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 2 +- llvm/tools/llvm-sim/llvm-sim.cpp | 2 +- 49 files changed, 95 insertions(+), 137 deletions(-) diff --git a/llvm/include/llvm/ADT/SCCIterator.h b/llvm/include/llvm/ADT/SCCIterator.h index 205fa669a12de..5e0ca2eb949a0 100644 --- a/llvm/include/llvm/ADT/SCCIterator.h +++ b/llvm/include/llvm/ADT/SCCIterator.h @@ -165,8 +165,7 @@ void scc_iterator::DFSVisitChildren() { while (VisitStack.back().NextChild != GT::child_end(VisitStack.back().Node)) { // TOS has at least one more child so continue DFS NodeRef childN = *VisitStack.back().NextChild++; - typename DenseMap::iterator Visited = - nodeVisitNumbers.find(childN); + auto Visited = nodeVisitNumbers.find(childN); if (Visited == nodeVisitNumbers.end()) { // this node has never been seen. 
DFSVisitOne(childN); diff --git a/llvm/include/llvm/ADT/ScopedHashTable.h b/llvm/include/llvm/ADT/ScopedHashTable.h index cc977012b2394..8fb4aa5da8cf5 100644 --- a/llvm/include/llvm/ADT/ScopedHashTable.h +++ b/llvm/include/llvm/ADT/ScopedHashTable.h @@ -219,8 +219,7 @@ class ScopedHashTable : detail::AllocatorHolder { iterator end() { return iterator(nullptr); } iterator begin(const K &Key) { - typename DenseMap::iterator I = - TopLevelMap.find(Key); + auto I = TopLevelMap.find(Key); if (I == TopLevelMap.end()) return end(); return iterator(I->second); } diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index 4e0f23f9cc8fb..d8e58ad2a9219 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -960,7 +960,7 @@ class IRSimilarityCandidate { /// \returns std::nullopt if not present. std::optional getGVN(Value *V) { assert(V != nullptr && "Value is a nullptr?"); - DenseMap::iterator VNIt = ValueToNumber.find(V); + auto VNIt = ValueToNumber.find(V); if (VNIt == ValueToNumber.end()) return std::nullopt; return VNIt->second; @@ -971,7 +971,7 @@ class IRSimilarityCandidate { /// \returns The Value associated with the number. /// \returns std::nullopt if not present. std::optional fromGVN(unsigned Num) { - DenseMap::iterator VNIt = NumberToValue.find(Num); + auto VNIt = NumberToValue.find(Num); if (VNIt == NumberToValue.end()) return std::nullopt; assert(VNIt->second != nullptr && "Found value is a nullptr!"); @@ -985,7 +985,7 @@ class IRSimilarityCandidate { /// \returns An optional containing the value, and std::nullopt if it could /// not be found. 
std::optional getCanonicalNum(unsigned N) { - DenseMap::iterator NCIt = NumberToCanonNum.find(N); + auto NCIt = NumberToCanonNum.find(N); if (NCIt == NumberToCanonNum.end()) return std::nullopt; return NCIt->second; @@ -998,7 +998,7 @@ class IRSimilarityCandidate { /// \returns An optional containing the value, and std::nullopt if it could /// not be found. std::optional fromCanonicalNum(unsigned N) { - DenseMap::iterator CNIt = CanonNumToNumber.find(N); + auto CNIt = CanonNumToNumber.find(N); if (CNIt == CanonNumToNumber.end()) return std::nullopt; return CNIt->second; diff --git a/llvm/include/llvm/Analysis/LoopIterator.h b/llvm/include/llvm/Analysis/LoopIterator.h index 6d25b2fd8923c..f403d658e2a5c 100644 --- a/llvm/include/llvm/Analysis/LoopIterator.h +++ b/llvm/include/llvm/Analysis/LoopIterator.h @@ -144,13 +144,13 @@ class LoopBlocksDFS { /// Return true if this block has a postorder number. bool hasPostorder(BasicBlock *BB) const { - DenseMap::const_iterator I = PostNumbers.find(BB); + auto I = PostNumbers.find(BB); return I != PostNumbers.end() && I->second; } /// Get a block's postorder number. unsigned getPostorder(BasicBlock *BB) const { - DenseMap::const_iterator I = PostNumbers.find(BB); + auto I = PostNumbers.find(BB); assert(I != PostNumbers.end() && "block not visited by DFS"); assert(I->second && "block not finished by DFS"); return I->second; diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index fc76751a2342f..c1031b55f7fd8 100644 --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -265,7 +265,7 @@ class FunctionLoweringInfo { /// called when a block is visited before all of its predecessors. void InvalidatePHILiveOutRegInfo(const PHINode *PN) { // PHIs with no uses have no ValueMap entry. 
- DenseMap::const_iterator It = ValueMap.find(PN); + auto It = ValueMap.find(PN); if (It == ValueMap.end()) return; diff --git a/llvm/include/llvm/Transforms/Utils/InstructionWorklist.h b/llvm/include/llvm/Transforms/Utils/InstructionWorklist.h index c8f20636965e8..5a46fff651336 100644 --- a/llvm/include/llvm/Transforms/Utils/InstructionWorklist.h +++ b/llvm/include/llvm/Transforms/Utils/InstructionWorklist.h @@ -83,7 +83,7 @@ class InstructionWorklist { /// Remove I from the worklist if it exists. void remove(Instruction *I) { - DenseMap::iterator It = WorklistMap.find(I); + auto It = WorklistMap.find(I); if (It != WorklistMap.end()) { // Don't bother moving everything down, just null out the slot. Worklist[It->second] = nullptr; diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp index 1228d5b4b78be..9213ea8ea2383 100644 --- a/llvm/lib/Analysis/CallGraphSCCPass.cpp +++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp @@ -318,8 +318,7 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG, // If this call site already existed in the callgraph, just verify it // matches up to expectations and remove it from Calls. - DenseMap::iterator ExistingIt = - Calls.find(Call); + auto ExistingIt = Calls.find(Call); if (ExistingIt != Calls.end()) { CallGraphNode *ExistingNode = ExistingIt->second; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 94fdd36dbcb22..3d17c2aadefd5 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6838,7 +6838,7 @@ const ConstantRange &ScalarEvolution::getRangeRef( : ConstantRange::Signed; // See if we've computed this range already. 
- DenseMap::iterator I = Cache.find(S); + auto I = Cache.find(S); if (I != Cache.end()) return I->second; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 83babe1c62541..3e863f4786e1a 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7058,7 +7058,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { if (!F || !F->isMaterializable()) return Error::success(); - DenseMap::iterator DFII = DeferredFunctionInfo.find(F); + auto DFII = DeferredFunctionInfo.find(F); assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!"); // If its position is recorded as 0, its body is somewhere in the stream // but we haven't seen it yet. diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 0155f15b1ab17..a11e5609d5294 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -2598,7 +2598,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( Instruction *Inst = InstructionList[Record[0]]; for (unsigned i = 1; i != RecordLength; i = i + 2) { unsigned Kind = Record[i]; - DenseMap::iterator I = MDKindMap.find(Kind); + auto I = MDKindMap.find(Kind); if (I == MDKindMap.end()) return error("Invalid ID"); if (I->second == LLVMContext::MD_tbaa && StripTBAA) diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index dc38f5a6887c2..8787c12bf3a5f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -369,8 +369,7 @@ void DebugHandlerBase::beginInstruction(const MachineInstr *MI) { CurMI = MI; // Insert labels where requested. - DenseMap::iterator I = - LabelsBeforeInsn.find(MI); + auto I = LabelsBeforeInsn.find(MI); // No label needed. 
if (I == LabelsBeforeInsn.end()) @@ -399,8 +398,7 @@ void DebugHandlerBase::endInstruction() { PrevInstBB = CurMI->getParent(); } - DenseMap::iterator I = - LabelsAfterInsn.find(CurMI); + auto I = LabelsAfterInsn.find(CurMI); // No label needed or label already assigned. if (I == LabelsAfterInsn.end() || I->second) { diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 22d7d221c2670..74a0502d8cb7c 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7263,8 +7263,7 @@ bool CodeGenPrepare::performAddressTypePromotion( bool AllSeenFirst = true; for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); - DenseMap::iterator AlreadySeen = - SeenChainsForSExt.find(HeadOfChain); + auto AlreadySeen = SeenChainsForSExt.find(HeadOfChain); // If there is an unhandled SExt which has the same header, try to promote // it as well. if (AlreadySeen != SeenChainsForSExt.end()) { diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 67ecb80bc3ee0..01ccd742d72b3 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -517,7 +517,7 @@ void MachineCSEImpl::EnterScope(MachineBasicBlock *MBB) { void MachineCSEImpl::ExitScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n'); - DenseMap::iterator SI = ScopeMap.find(MBB); + auto SI = ScopeMap.find(MBB); assert(SI != ScopeMap.end()); delete SI->second; ScopeMap.erase(SI); diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index b86e70f265786..0161e70fa2c8a 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -213,8 +213,7 @@ MachineCombiner::getDepth(SmallVectorImpl &InsInstrs, continue; unsigned DepthOp = 0; unsigned LatencyOp = 0; - DenseMap::iterator II = - InstrIdxForVirtReg.find(MO.getReg()); + auto II = InstrIdxForVirtReg.find(MO.getReg()); if (II != 
InstrIdxForVirtReg.end()) { // Operand is new virtual register not in trace assert(II->second < InstrDepth.size() && "Bad Index"); diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 1acca1fb2659f..1cd2ec5c1cded 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1427,7 +1427,7 @@ bool PeepholeOptimizer::foldImmediate( continue; if (ImmDefRegs.count(Reg) == 0) continue; - DenseMap::iterator II = ImmDefMIs.find(Reg); + auto II = ImmDefMIs.find(Reg); assert(II != ImmDefMIs.end() && "couldn't find immediate definition"); if (TII->foldImmediate(MI, *II->second, Reg, MRI)) { ++NumImmFold; diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 7998b87a59c96..6241ae94130dc 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -1776,7 +1776,7 @@ void RegAllocFastImpl::handleBundle(MachineInstr &MI) { if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; - DenseMap::iterator DI = BundleVirtRegsMap.find(Reg); + auto DI = BundleVirtRegsMap.find(Reg); assert(DI != BundleVirtRegsMap.end() && "Unassigned virtual register"); setPhysReg(MI, MO, DI->second); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 9774f2c721d1f..1b6d7b57c2c58 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -148,7 +148,7 @@ bool FastISel::lowerArguments() { for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(), E = FuncInfo.Fn->arg_end(); I != E; ++I) { - DenseMap::iterator VI = LocalValueMap.find(&*I); + auto VI = LocalValueMap.find(&*I); assert(VI != LocalValueMap.end() && "Missed an argument?"); FuncInfo.ValueMap[&*I] = VI->second; } @@ -354,7 +354,7 @@ Register FastISel::lookUpRegForValue(const Value *V) { // cache values defined by Instructions across blocks, and other values // only locally. 
This is because Instructions already have the SSA // def-dominates-use requirement enforced. - DenseMap::iterator I = FuncInfo.ValueMap.find(V); + auto I = FuncInfo.ValueMap.find(V); if (I != FuncInfo.ValueMap.end()) return I->second; return LocalValueMap[V]; @@ -1717,7 +1717,7 @@ bool FastISel::selectExtractValue(const User *U) { // Get the base result register. Register ResultReg; - DenseMap::iterator I = FuncInfo.ValueMap.find(Op0); + auto I = FuncInfo.ValueMap.find(Op0); if (I != FuncInfo.ValueMap.end()) ResultReg = I->second; else if (isa(Op0)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index e5484bf3676db..eb89a0f129df4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -272,7 +272,7 @@ VectorLegalizer::RecursivelyLegalizeResults(SDValue Op, SDValue VectorLegalizer::LegalizeOp(SDValue Op) { // Note that LegalizeOp may be reentered even from single-use nodes, which // means that we always must cache transformed nodes. - DenseMap::iterator I = LegalizedNodes.find(Op); + auto I = LegalizedNodes.find(Op); if (I != LegalizedNodes.end()) return I->second; // Legalize the operands diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 5af3df161b6c5..9b76ebdb0f8fa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -693,7 +693,7 @@ void ScheduleDAGLinearize::ScheduleNode(SDNode *N) { // Glue operand is already scheduled. continue; - DenseMap::iterator DI = GluedMap.find(OpN); + auto DI = GluedMap.find(OpN); if (DI != GluedMap.end() && DI->second != N) // Users of glues are counted against the glued users. 
OpN = DI->second; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index c51e2de30934b..1b28e3a9dc3a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -813,8 +813,7 @@ EmitPhysRegCopy(SUnit *SU, SmallDenseMap &VRBaseMap, continue; // ignore chain preds if (Pred.getSUnit()->CopyDstRC) { // Copy to physical register. - DenseMap::iterator VRI = - VRBaseMap.find(Pred.getSUnit()); + auto VRI = VRBaseMap.find(Pred.getSUnit()); assert(VRI != VRBaseMap.end() && "Node emitted out of order - late"); // Find the destination physical register. Register Reg; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e88a07901e289..68ae86d8d561f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1776,7 +1776,7 @@ void SelectionDAGBuilder::resolveOrClearDbgInfo() { /// getCopyFromRegs - If there was virtual register allocated for the value V /// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise. SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) { - DenseMap::iterator It = FuncInfo.ValueMap.find(V); + auto It = FuncInfo.ValueMap.find(V); SDValue Result; if (It != FuncInfo.ValueMap.end()) { @@ -2010,8 +2010,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { // If this is a static alloca, generate it as the frameindex instead of // computation. 
if (const AllocaInst *AI = dyn_cast(V)) { - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) return DAG.getFrameIndex( SI->second, TLI.getValueType(DAG.getDataLayout(), AI->getType())); @@ -2362,7 +2361,7 @@ void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) { if (V->getType()->isEmptyTy()) return; - DenseMap::iterator VMI = FuncInfo.ValueMap.find(V); + auto VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) { assert((!V->use_empty() || isa(V)) && "Unused value assigned virtual registers!"); @@ -12346,8 +12345,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { } Reg = RegOut; } else { - DenseMap::iterator I = - FuncInfo.ValueMap.find(PHIOp); + auto I = FuncInfo.ValueMap.find(PHIOp); if (I != FuncInfo.ValueMap.end()) Reg = I->second; else { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 7aec7f41c15f2..3dc599b556ddd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -669,15 +669,14 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // registers. If we don't apply the reg fixups before, some registers may // appear as unused and will be skipped, resulting in bad MI. MachineRegisterInfo &MRI = MF->getRegInfo(); - for (DenseMap::iterator I = FuncInfo->RegFixups.begin(), - E = FuncInfo->RegFixups.end(); + for (auto I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end(); I != E; ++I) { Register From = I->first; Register To = I->second; // If To is also scheduled to be replaced, find what its ultimate // replacement is. 
while (true) { - DenseMap::iterator J = FuncInfo->RegFixups.find(To); + auto J = FuncInfo->RegFixups.find(To); if (J == E) break; To = J->second; @@ -752,7 +751,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // If Reg is live-in then update debug info to track its copy in a vreg. if (!Reg.isPhysical()) continue; - DenseMap::iterator LDI = LiveInMap.find(Reg); + auto LDI = LiveInMap.find(Reg); if (LDI != LiveInMap.end()) { assert(!hasFI && "There's no handling of frame pointer updating here yet " "- add if needed"); diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index b0888c1c44d57..4a972b360998b 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -212,8 +212,7 @@ bool TailDuplicator::tailDuplicateAndUpdate( } // Add the new vregs as available values. - DenseMap::iterator LI = - SSAUpdateVals.find(VReg); + auto LI = SSAUpdateVals.find(VReg); for (std::pair &J : LI->second) { MachineBasicBlock *SrcBB = J.first; Register SrcReg = J.second; @@ -338,8 +337,7 @@ static void getRegsUsedByPHIs(const MachineBasicBlock &BB, /// Add a definition and source virtual registers pair for SSA update. void TailDuplicator::addSSAUpdateEntry(Register OrigReg, Register NewReg, MachineBasicBlock *BB) { - DenseMap::iterator LI = - SSAUpdateVals.find(OrigReg); + auto LI = SSAUpdateVals.find(OrigReg); if (LI != SSAUpdateVals.end()) LI->second.push_back(std::make_pair(BB, NewReg)); else { @@ -522,8 +520,7 @@ void TailDuplicator::updateSuccessorsPHIs( // If Idx is set, the operands at Idx and Idx+1 must be removed. // We reuse the location to avoid expensive removeOperand calls. - DenseMap::iterator LI = - SSAUpdateVals.find(Reg); + auto LI = SSAUpdateVals.find(Reg); if (LI != SSAUpdateVals.end()) { // This register is defined in the tail block. 
for (const std::pair &J : LI->second) { diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 351aa6179d86a..fb3014d87f40a 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -361,7 +361,7 @@ bool TwoAddressInstructionImpl::noUseAfterLastDef(Register Reg, unsigned Dist, MachineInstr *MI = MO.getParent(); if (MI->getParent() != MBB || MI->isDebugValue()) continue; - DenseMap::iterator DI = DistanceMap.find(MI); + auto DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) continue; if (MO.isUse() && DI->second < LastUse) @@ -550,7 +550,7 @@ MachineInstr *TwoAddressInstructionImpl::findOnlyInterestingUse( static MCRegister getMappedReg(Register Reg, DenseMap &RegMap) { while (Reg.isVirtual()) { - DenseMap::iterator SI = RegMap.find(Reg); + auto SI = RegMap.find(Reg); if (SI == RegMap.end()) return 0; Reg = SI->second; @@ -863,7 +863,7 @@ void TwoAddressInstructionImpl::scanUses(Register DstReg) { if (IsCopy && !Processed.insert(UseMI).second) break; - DenseMap::iterator DI = DistanceMap.find(UseMI); + auto DI = DistanceMap.find(UseMI); if (DI != DistanceMap.end()) // Earlier in the same MBB.Reached via a back edge. break; @@ -939,7 +939,7 @@ bool TwoAddressInstructionImpl::rescheduleMIBelowKill( return false; MachineInstr *MI = &*mi; - DenseMap::iterator DI = DistanceMap.find(MI); + auto DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. 
return false; @@ -1104,7 +1104,7 @@ bool TwoAddressInstructionImpl::isDefTooClose(Register Reg, unsigned Dist, continue; if (&DefMI == MI) return true; // MI is defining something KillMI uses - DenseMap::iterator DDI = DistanceMap.find(&DefMI); + auto DDI = DistanceMap.find(&DefMI); if (DDI == DistanceMap.end()) return true; // Below MI unsigned DefDist = DDI->second; @@ -1127,7 +1127,7 @@ bool TwoAddressInstructionImpl::rescheduleKillAboveMI( return false; MachineInstr *MI = &*mi; - DenseMap::iterator DI = DistanceMap.find(MI); + auto DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. return false; diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 47a828842b481..7b9ad89038dc6 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -903,9 +903,9 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) { return; const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet(); - for (DenseMap::iterator I = AvailableAnalysis.begin(), - E = AvailableAnalysis.end(); I != E; ) { - DenseMap::iterator Info = I++; + for (auto I = AvailableAnalysis.begin(), E = AvailableAnalysis.end(); + I != E;) { + auto Info = I++; if (Info->second->getAsImmutablePass() == nullptr && !is_contained(PreservedSet, Info->first)) { // Remove this analysis @@ -924,10 +924,8 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) { if (!IA) continue; - for (DenseMap::iterator I = IA->begin(), - E = IA->end(); - I != E;) { - DenseMap::iterator Info = I++; + for (auto I = IA->begin(), E = IA->end(); I != E;) { + auto Info = I++; if (Info->second->getAsImmutablePass() == nullptr && !is_contained(PreservedSet, Info->first)) { // Remove this analysis @@ -1098,7 +1096,7 @@ void PMDataManager::initializeAnalysisImpl(Pass *P) { Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) { // Check if AvailableAnalysis map has one 
entry. - DenseMap::const_iterator I = AvailableAnalysis.find(AID); + auto I = AvailableAnalysis.find(AID); if (I != AvailableAnalysis.end()) return I->second; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 360bf0f8fc47f..7246adf7ec651 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -1195,8 +1195,7 @@ void ValueHandleBase::AddToUseList() { } // Okay, reallocation did happen. Fix the Prev Pointers. - for (DenseMap::iterator I = Handles.begin(), - E = Handles.end(); I != E; ++I) { + for (auto I = Handles.begin(), E = Handles.end(); I != E; ++I) { assert(I->second && I->first == I->second->getValPtr() && "List invariant broken!"); I->second->setPrevPtr(&I->second); diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index 77fb7332619cd..8ac752ab91a07 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -192,7 +192,7 @@ int64_t MCRegisterInfo::getDwarfRegNumFromDwarfEHRegNum(uint64_t RegNum) const { } int MCRegisterInfo::getSEHRegNum(MCRegister Reg) const { - const DenseMap::const_iterator I = L2SEHRegs.find(Reg); + const auto I = L2SEHRegs.find(Reg); if (I == L2SEHRegs.end()) return (int)Reg.id(); return I->second; @@ -201,7 +201,7 @@ int MCRegisterInfo::getSEHRegNum(MCRegister Reg) const { int MCRegisterInfo::getCodeViewRegNum(MCRegister Reg) const { if (L2CVRegs.empty()) report_fatal_error("target does not implement codeview register mapping"); - const DenseMap::const_iterator I = L2CVRegs.find(Reg); + const auto I = L2CVRegs.find(Reg); if (I == L2CVRegs.end()) report_fatal_error("unknown codeview register " + (Reg.id() < getNumRegs() ? 
getName(Reg) diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index b9c317a2071ca..6c218de2c43c3 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -676,8 +676,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } case Instruction::Alloca: { const AllocaInst *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { Addr.setKind(Address::FrameIndexBase); Addr.setFI(SI->second); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 5d20d1e10a0da..727ae68e88bfb 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -2557,7 +2557,7 @@ bool RewriteMFMAFormStage::rewrite( } if (!Src2DefsReplace.empty()) { - DenseMap::iterator RI = RedefMap.find(Src2Reg); + auto RI = RedefMap.find(Src2Reg); if (RI != RedefMap.end()) { MappedReg = RI->second; } else { @@ -2641,7 +2641,7 @@ bool RewriteMFMAFormStage::rewrite( } if (!DstUseDefsReplace.empty()) { - DenseMap::iterator RI = RedefMap.find(DstReg); + auto RI = RedefMap.find(DstReg); if (RI != RedefMap.end()) { MappedReg = RI->second; } else { @@ -2668,8 +2668,7 @@ bool RewriteMFMAFormStage::rewrite( // If this reaching def was the last MI in the region, update the // region boundaries. - DenseMap::iterator LMI = - LastMIToRegion.find(RD); + auto LMI = LastMIToRegion.find(RD); if (LMI != LastMIToRegion.end()) { unsigned UpdateRegion = LMI->second; DAG.Regions[UpdateRegion].second = VGPRCopy; @@ -2747,8 +2746,7 @@ bool RewriteMFMAFormStage::rewrite( // If this UseInst was the first MI in the region, update the region // boundaries. 
- DenseMap::iterator FI = - FirstMIToRegion.find(UseInst); + auto FI = FirstMIToRegion.find(UseInst); if (FI != FirstMIToRegion.end()) { unsigned UpdateRegion = FI->second; DAG.Regions[UpdateRegion].first = VGPRCopy; @@ -2782,7 +2780,7 @@ bool RewriteMFMAFormStage::rewrite( Register RegToRewrite = RewriteReg; // Be sure to update the replacement register and not the original. - DenseMap::iterator RI = RedefMap.find(RewriteReg); + auto RI = RedefMap.find(RewriteReg); if (RI != RedefMap.end()) RegToRewrite = RI->second; diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 9e1a97e95dc23..1ad357228df3f 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -150,8 +150,7 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, const { unsigned CurrentUndexIdx = 0; for (auto &It : ToMerge->RegToChan) { - DenseMap::const_iterator PosInUntouched = - Untouched->RegToChan.find(It.first); + auto PosInUntouched = Untouched->RegToChan.find(It.first); if (PosInUntouched != Untouched->RegToChan.end()) { Remap.emplace_back(It.second, (*PosInUntouched).second); continue; diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index 301cb21a808f8..c240fae120c33 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -129,7 +129,7 @@ class R600PacketizerList : public VLIWPacketizerList { if (OperandIdx < 0) continue; Register Src = MI.getOperand(OperandIdx).getReg(); - const DenseMap::const_iterator It = PVs.find(Src); + const auto It = PVs.find(Src); if (It != PVs.end()) MI.getOperand(OperandIdx).setReg(It->second); } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index eb8ff794ad8a6..3b1b8673e56a0 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ 
b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4825,7 +4825,7 @@ bool ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, unsigned &AddSubOpc, bool &NegAcc, bool &HasLane) const { - DenseMap::const_iterator I = MLxEntryMap.find(Opcode); + auto I = MLxEntryMap.find(Opcode); if (I == MLxEntryMap.end()) return false; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 0d416152b3a36..2c5d286e11c4f 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -683,8 +683,7 @@ Register ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) { if (!isLoadTypeLegal(AI->getType(), VT)) return Register(); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); // This will get lowered later into the correct offsets and registers // via rewriteXFrameIndex. @@ -817,8 +816,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { } case Instruction::Alloca: { const AllocaInst *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { Addr.setKind(Address::FrameIndexBase); Addr.setFI(SI->second); diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index b6897608a952c..59d328783a37b 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -252,7 +252,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { } unsigned getOriginalCPIdx(unsigned CloneIdx) const { - DenseMap::const_iterator I = CPEClones.find(CloneIdx); + auto I = CPEClones.find(CloneIdx); if (I != CPEClones.end()) return I->second; else diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index e497845742ef5..a7edf8d105c02 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ 
b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -1010,7 +1010,7 @@ bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, bool LiveCPSR, bool IsSelfLoop, bool SkipPrologueEpilogue) { unsigned Opcode = MI->getOpcode(); - DenseMap::iterator OPI = ReduceOpcodeMap.find(Opcode); + auto OPI = ReduceOpcodeMap.find(Opcode); if (OPI == ReduceOpcodeMap.end()) return false; if (SkipPrologueEpilogue && (MI->getFlag(MachineInstr::FrameSetup) || diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index 4c0f817234b92..9645fb5293609 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -336,8 +336,7 @@ Register MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) { assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 && "Alloca should always return a pointer."); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { Register ResultReg = createResultReg(&Mips::GPR32RegClass); @@ -527,8 +526,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) { } case Instruction::Alloca: { const AllocaInst *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { Addr.setKind(Address::FrameIndexBase); Addr.setFI(SI->second); diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index c02b10300b304..dec5dbdfc2258 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -375,8 +375,7 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { } case Instruction::Alloca: { const AllocaInst *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { 
Addr.BaseType = Address::FrameIndexBase; Addr.Base.FI = SI->second; @@ -2270,8 +2269,7 @@ Register PPCFastISel::fastMaterializeConstant(const Constant *C) { // Materialize the address created by an alloca into a register, and // return the register number (or zero if we failed to handle it). Register PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) { - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); // Don't handle dynamic allocas. if (SI == FuncInfo.StaticAllocaMap.end()) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 99305f371551f..b107886a1f16e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -324,8 +324,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { } case Instruction::Alloca: { const auto *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { if (Addr.isSet()) { return false; @@ -674,8 +673,7 @@ unsigned WebAssemblyFastISel::copyValue(unsigned Reg) { } Register WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) { - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); + auto SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { Register ResultReg = diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index f91ef4abbdf27..4daff4517e940 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -761,7 +761,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Ok, we need to do a load from a stub. If we've already loaded from // this stub, reuse the loaded pointer, otherwise emit the load now. 
- DenseMap::iterator I = LocalValueMap.find(V); + auto I = LocalValueMap.find(V); Register LoadReg; if (I != LocalValueMap.end() && I->second) { LoadReg = I->second; @@ -874,8 +874,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { case Instruction::Alloca: { // Do static allocas. const AllocaInst *A = cast(V); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(A); + auto SI = FuncInfo.StaticAllocaMap.find(A); if (SI != FuncInfo.StaticAllocaMap.end()) { AM.BaseType = X86AddressMode::FrameIndexBase; AM.Base.FrameIndex = SI->second; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4b193c8db3303..f4529ddf4983d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23759,7 +23759,7 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, return false; SDValue Src = I->getOperand(0); - DenseMap::iterator M = SrcOpMap.find(Src); + auto M = SrcOpMap.find(Src); if (M == SrcOpMap.end()) { VT = Src.getValueType(); // Quit if not the same type. diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index c1640f3d0e2a7..0e780d41c93ed 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -513,8 +513,7 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) { /// not. 
static Value *findOutputMapping(const DenseMap OutputMappings, Value *Input) { - DenseMap::const_iterator OutputMapping = - OutputMappings.find(Input); + auto OutputMapping = OutputMappings.find(Input); if (OutputMapping != OutputMappings.end()) return OutputMapping->second; return Input; @@ -929,8 +928,7 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, assert(InputOpt && "Global value number not found?"); Value *Input = *InputOpt; - DenseMap::iterator AggArgIt = - Group.CanonicalNumberToAggArg.find(CanonicalNumber); + auto AggArgIt = Group.CanonicalNumberToAggArg.find(CanonicalNumber); if (!Group.InputTypesSet) { Group.ArgumentTypes.push_back(Input->getType()); @@ -1731,8 +1729,7 @@ findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region, IncomingVal = findOutputMapping(OutputMappings, IncomingVal); Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal); assert(Val && "Value is nullptr?"); - DenseMap::iterator RemappedIt = - FirstRegion->RemappedArguments.find(Val); + auto RemappedIt = FirstRegion->RemappedArguments.find(Val); if (RemappedIt != FirstRegion->RemappedArguments.end()) Val = RemappedIt->second; NewPN->setIncomingValue(Idx, Val); @@ -1925,8 +1922,7 @@ std::optional findDuplicateOutputBlock( for (DenseMap &CompBBs : OutputStoreBBs) { Mismatch = false; for (std::pair &VToB : CompBBs) { - DenseMap::iterator OutputBBIt = - OutputBBs.find(VToB.first); + auto OutputBBIt = OutputBBs.find(VToB.first); if (OutputBBIt == OutputBBs.end()) { Mismatch = true; break; @@ -2051,8 +2047,7 @@ static void alignOutputBlockWithAggFunc( for (std::pair &VtoBB : OutputBBs) { RetValueForBB = VtoBB.first; NewBB = VtoBB.second; - DenseMap::iterator VBBIt = - EndBBs.find(RetValueForBB); + auto VBBIt = EndBBs.find(RetValueForBB); LLVM_DEBUG(dbgs() << "Create output block for region in" << Region.ExtractedFunction << " to " << *NewBB); @@ -2130,8 +2125,7 @@ void createSwitchStatement( unsigned Idx = 0; for (DenseMap &OutputStoreBB : 
OutputStoreBBs) { - DenseMap::iterator OSBBIt = - OutputStoreBB.find(OutputBlock.first); + auto OSBBIt = OutputStoreBB.find(OutputBlock.first); if (OSBBIt == OutputStoreBB.end()) continue; @@ -2160,8 +2154,7 @@ void createSwitchStatement( << *OG.OutlinedFunction << "\n"); DenseMap OutputBlocks = OutputStoreBBs[0]; for (std::pair &VBPair : OutputBlocks) { - DenseMap::iterator EndBBIt = - EndBBs.find(VBPair.first); + auto EndBBIt = EndBBs.find(VBPair.first); assert(EndBBIt != EndBBs.end() && "Could not find end block"); BasicBlock *EndBB = EndBBIt->second; BasicBlock *OutputBB = VBPair.second; @@ -2215,8 +2208,7 @@ void IROutliner::fillOverallFunction( if (!analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS)) { OutputStoreBBs.push_back(DenseMap()); for (std::pair &VToBB : NewBBs) { - DenseMap::iterator VBBIt = - CurrentGroup.EndBBs.find(VToBB.first); + auto VBBIt = CurrentGroup.EndBBs.find(VToBB.first); BasicBlock *EndBB = VBBIt->second; UncondBrInst::Create(EndBB, VToBB.second); OutputStoreBBs.back().insert(VToBB); diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 648df1f545f01..c349fe33dd237 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -3381,8 +3381,7 @@ void DFSanVisitor::visitCallBase(CallBase &CB) { } } - DenseMap::iterator UnwrappedFnIt = - DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand()); + auto UnwrappedFnIt = DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand()); if (UnwrappedFnIt != DFSF.DFS.UnwrappedFnMap.end()) if (visitWrappedCallBase(*UnwrappedFnIt->second, CB)) return; diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index f796266f0cf3c..a32f824299b09 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -1262,8 +1262,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock 
*BB, for (const BasicBlock *Succ : successors(BB)) { // If VisitBottomUp has pointer information for this successor, take // what we know about it. - const DenseMap::iterator BBI = - BBStates.find(Succ); + const auto BBI = BBStates.find(Succ); assert(BBI != BBStates.end()); const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); const Sequence SuccSSeq = SuccS.GetSeq(); @@ -1405,7 +1404,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, SE(MyStates.succ_end()); if (SI != SE) { const BasicBlock *Succ = *SI; - DenseMap::iterator I = BBStates.find(Succ); + auto I = BBStates.find(Succ); assert(I != BBStates.end()); MyStates.InitFromSucc(I->second); ++SI; @@ -1589,7 +1588,7 @@ bool ObjCARCOpt::VisitTopDown( PE(MyStates.pred_end()); if (PI != PE) { const BasicBlock *Pred = *PI; - DenseMap::iterator I = BBStates.find(Pred); + auto I = BBStates.find(Pred); assert(I != BBStates.end()); MyStates.InitFromPred(I->second); ++PI; diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 2794fb27cae69..0965c2ab361c0 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -648,7 +648,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(MemoryAccess *MA) { /// lookupOrAdd - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { - DenseMap::iterator VI = ValueNumbering.find(V); + auto VI = ValueNumbering.find(V); if (VI != ValueNumbering.end()) return VI->second; @@ -733,7 +733,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. 
uint32_t GVNPass::ValueTable::lookup(Value *V, bool Verify) const { - DenseMap::const_iterator VI = ValueNumbering.find(V); + auto VI = ValueNumbering.find(V); if (Verify) { assert(VI != ValueNumbering.end() && "Value not numbered?"); return VI->second; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 7e798ef29c1b8..83e40edb64541 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1207,8 +1207,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { // FIXME: Yet another place we really should bypass this when // instrumenting for ASan. if (Offset.uge(AllocSize)) { - SmallDenseMap::iterator MTPI = - MemTransferSliceMap.find(&II); + auto MTPI = MemTransferSliceMap.find(&II); if (MTPI != MemTransferSliceMap.end()) AS.Slices[MTPI->second].kill(); return markAsDead(II); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index ef668e575f3ea..b635d805bf13d 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -349,7 +349,7 @@ class LargeBlockInfo { "Not a load/store to/from an alloca?"); // If we already have this instruction number, return it. 
- DenseMap::iterator It = InstNumbers.find(I); + auto It = InstNumbers.find(I); if (It != InstNumbers.end()) return It->second; @@ -1187,7 +1187,7 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred) { if (!Src) continue; - DenseMap::iterator AI = AllocaLookup.find(Src); + auto AI = AllocaLookup.find(Src); if (AI == AllocaLookup.end()) continue; @@ -1204,7 +1204,7 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred) { if (!Dest) continue; - DenseMap::iterator ai = AllocaLookup.find(Dest); + auto ai = AllocaLookup.find(Dest); if (ai == AllocaLookup.end()) continue; diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 9ba1002533997..4cae040f96fc6 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -938,8 +938,7 @@ class SCCPInstVisitor : public InstVisitor { const ValueLatticeElement &getLatticeValueFor(Value *V) const { assert(!V->getType()->isStructTy() && "Should use getStructLatticeValueFor"); - DenseMap::const_iterator I = - ValueState.find(V); + auto I = ValueState.find(V); assert(I != ValueState.end() && "V not found in ValueState nor Paramstate map!"); return I->second; diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 49d0d9584347e..c2a234bb1928d 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -499,7 +499,7 @@ void LoadAndStorePromoter::run(const SmallVectorImpl &Insts) { // Propagate down to the ultimate replacee. The intermediately loads // could theoretically already have been deleted, so we don't want to // dereference the Value*'s. 
- DenseMap::iterator RLI = ReplacedLoads.find(NewVal); + auto RLI = ReplacedLoads.find(NewVal); while (RLI != ReplacedLoads.end()) { NewVal = RLI->second; RLI = ReplacedLoads.find(NewVal); diff --git a/llvm/tools/llvm-sim/llvm-sim.cpp b/llvm/tools/llvm-sim/llvm-sim.cpp index bdd2779ffb1c4..6e86714ed4acd 100644 --- a/llvm/tools/llvm-sim/llvm-sim.cpp +++ b/llvm/tools/llvm-sim/llvm-sim.cpp @@ -45,7 +45,7 @@ std::optional getPositionInModule(const Instruction *I, const DenseMap &LLVMInstNum) { assert(I && "Instruction is nullptr!"); - DenseMap::const_iterator It = LLVMInstNum.find(I); + auto It = LLVMInstNum.find(I); if (It == LLVMInstNum.end()) return std::nullopt; return It->second; From 9f5813571f2d8f1ff2fa9f4680059ac98dfaa855 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 11 May 2026 08:31:59 +0100 Subject: [PATCH 228/538] [AArch64] Use dup (lane mov) over ext for high-half extract (#195010) This changes the instruction we use to extract the high half of a vector register from a `ext v0, v1, v1, 8` to a `dup d0, v1.d[1]`. This is apparently slightly quicker on certain cpus and is generally a simpler instruction. This matches the instruction that gisel produced. Some of the old patterns for extract_subvector with index of 1 seem incorrect but were never used as we do not reach selection with such instructions. They have been repurposed to emit the new DUPi64 instructions. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 26 +- .../AArch64/aarch64-matrix-umull-smull.ll | 26 +- llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll | 24 +- llvm/test/CodeGen/AArch64/aarch64-mulv.ll | 32 +- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 30 +- .../AArch64/aarch64_be-shuffle-vector.ll | 4 +- llvm/test/CodeGen/AArch64/abd-combine.ll | 2 +- llvm/test/CodeGen/AArch64/add-extract.ll | 4 +- llvm/test/CodeGen/AArch64/arm64-ext.ll | 8 +- .../AArch64/arm64-extract_subvector.ll | 57 +-- .../CodeGen/AArch64/arm64-neon-2velem-high.ll | 72 +-- .../test/CodeGen/AArch64/arm64-neon-2velem.ll | 130 +++--- .../test/CodeGen/AArch64/arm64-neon-3vdiff.ll | 2 +- .../AArch64/arm64-neon-add-pairwise.ll | 6 +- llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 14 +- .../CodeGen/AArch64/arm64-neon-mul-div.ll | 8 +- .../CodeGen/AArch64/arm64-neon-simd-vget.ll | 42 +- llvm/test/CodeGen/AArch64/arm64-vabs.ll | 8 +- llvm/test/CodeGen/AArch64/arm64-vadd.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-vmul.ll | 44 +- llvm/test/CodeGen/AArch64/arm64-vshift.ll | 12 +- llvm/test/CodeGen/AArch64/bf16-shuffle.ll | 3 +- .../CodeGen/AArch64/bf16-v8-instructions.ll | 4 +- .../CodeGen/AArch64/bf16-vector-shuffle.ll | 3 +- llvm/test/CodeGen/AArch64/clmul-fixed.ll | 12 +- .../CodeGen/AArch64/combine-storetomstore.ll | 4 +- .../complex-deinterleaving-mixed-cases.ll | 28 +- .../complex-deinterleaving-multiuses.ll | 20 +- .../complex-deinterleaving-uniform-cases.ll | 8 +- llvm/test/CodeGen/AArch64/concat-vector.ll | 4 +- llvm/test/CodeGen/AArch64/ctlz.ll | 3 +- llvm/test/CodeGen/AArch64/ctpop.ll | 3 +- llvm/test/CodeGen/AArch64/cttz.ll | 3 +- llvm/test/CodeGen/AArch64/double_reduct.ll | 92 ++-- llvm/test/CodeGen/AArch64/ext-narrow-index.ll | 34 +- .../CodeGen/AArch64/extract-subvec-combine.ll | 20 +- .../CodeGen/AArch64/extract-vector-elt.ll | 2 +- .../AArch64/f16f32dot-fixed-length-fdot.ll | 2 +- llvm/test/CodeGen/AArch64/fabs.ll | 3 +- llvm/test/CodeGen/AArch64/faddsub.ll | 6 +- 
llvm/test/CodeGen/AArch64/fcmp.ll | 8 +- llvm/test/CodeGen/AArch64/fcopysign.ll | 3 +- llvm/test/CodeGen/AArch64/fcvt.ll | 21 +- llvm/test/CodeGen/AArch64/fdiv.ll | 3 +- .../AArch64/fixed-vector-deinterleave.ll | 18 +- .../AArch64/fixed-vector-interleave.ll | 9 +- llvm/test/CodeGen/AArch64/fminimummaximum.ll | 6 +- llvm/test/CodeGen/AArch64/fminmax.ll | 6 +- llvm/test/CodeGen/AArch64/fmla.ll | 9 +- llvm/test/CodeGen/AArch64/fmul.ll | 3 +- llvm/test/CodeGen/AArch64/fneg.ll | 3 +- .../AArch64/fp-maximumnum-minimumnum.ll | 13 +- .../CodeGen/AArch64/fp16-vector-shuffle.ll | 3 +- llvm/test/CodeGen/AArch64/fpext.ll | 34 +- llvm/test/CodeGen/AArch64/fptoi.ll | 166 ++----- .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 34 +- .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 18 +- llvm/test/CodeGen/AArch64/freeze.ll | 6 +- llvm/test/CodeGen/AArch64/fsqrt.ll | 3 +- .../CodeGen/AArch64/highextractbitcast.ll | 50 +- llvm/test/CodeGen/AArch64/icmp.ll | 3 +- llvm/test/CodeGen/AArch64/insert-subvector.ll | 6 +- llvm/test/CodeGen/AArch64/insertextract.ll | 12 +- llvm/test/CodeGen/AArch64/itofp.ll | 432 ++++-------------- llvm/test/CodeGen/AArch64/neon-abd.ll | 4 +- .../neon-partial-reduce-dot-product.ll | 70 +-- llvm/test/CodeGen/AArch64/neon-scalar-copy.ll | 121 +++-- .../CodeGen/AArch64/neon-shift-left-long.ll | 12 +- llvm/test/CodeGen/AArch64/nontemporal-load.ll | 12 +- llvm/test/CodeGen/AArch64/ptradd.ll | 8 +- llvm/test/CodeGen/AArch64/reduce-and.ll | 28 +- llvm/test/CodeGen/AArch64/reduce-or.ll | 28 +- llvm/test/CodeGen/AArch64/reduce-xor.ll | 28 +- llvm/test/CodeGen/AArch64/rem.ll | 20 +- llvm/test/CodeGen/AArch64/sext.ll | 12 +- llvm/test/CodeGen/AArch64/st1-lane.ll | 56 +-- .../sve-fixed-length-extract-subvector.ll | 21 +- .../AArch64/sve-fixed-length-fp-to-int.ll | 4 +- .../AArch64/sve-fixed-length-int-extends.ll | 12 +- .../AArch64/sve-fixed-length-int-to-fp.ll | 4 +- .../sve-fixed-length-masked-expandloads.ll | 48 +- .../AArch64/sve-fixed-length-masked-gather.ll | 4 +- 
.../AArch64/sve-fixed-length-masked-loads.ll | 24 +- .../sve-fixed-length-masked-scatter.ll | 8 +- .../AArch64/sve-fixed-vector-llrint.ll | 214 ++++----- .../CodeGen/AArch64/sve-fixed-vector-lrint.ll | 214 ++++----- .../AArch64/vec-combine-compare-to-bitmask.ll | 2 +- llvm/test/CodeGen/AArch64/vec_uaddo.ll | 2 +- llvm/test/CodeGen/AArch64/vec_umulo.ll | 2 +- .../AArch64/vecreduce-and-legalization.ll | 8 +- llvm/test/CodeGen/AArch64/vecreduce-bitext.ll | 72 +-- llvm/test/CodeGen/AArch64/vecreduce-fmul.ll | 78 ++-- .../AArch64/vecreduce-umax-legalization.ll | 2 +- llvm/test/CodeGen/AArch64/vector-fcvt.ll | 12 +- llvm/test/CodeGen/AArch64/vector-llrint.ll | 100 ++-- llvm/test/CodeGen/AArch64/vector-lrint.ll | 114 ++--- llvm/test/CodeGen/AArch64/vselect-ext.ll | 8 +- llvm/test/CodeGen/AArch64/zext-shuffle.ll | 12 +- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 32 +- llvm/test/CodeGen/AArch64/zext.ll | 14 +- 100 files changed, 1305 insertions(+), 1781 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 45ac5bfc16a26..4a8f11cef2713 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7577,10 +7577,6 @@ multiclass ExtPat { (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; def : Pat<(VT128 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; - // We use EXT to handle extract_subvector to copy the upper 64-bits of a - // 128-bit vector. - def : Pat<(VT64 (extract_subvector V128:$Rn, (i64 N))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; // A 64-bit EXT of two halves of the same 128-bit register can be done as a // single 128-bit EXT. 
def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 0)), @@ -10976,14 +10972,22 @@ def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; -def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 8))), + (DUPi64 FPR128:$Rn, 1)>; +def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 4))), + (DUPi64 FPR128:$Rn, 1)>; +def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 2))), + (DUPi64 FPR128:$Rn, 1)>; def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; + (DUPi64 FPR128:$Rn, 1)>; +def : Pat<(v4bf16 (extract_subvector (v8bf16 FPR128:$Rn), (i64 4))), + (DUPi64 FPR128:$Rn, 1)>; +def : Pat<(v4f16 (extract_subvector (v8f16 FPR128:$Rn), (i64 4))), + (DUPi64 FPR128:$Rn, 1)>; +def : Pat<(v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 2))), + (DUPi64 FPR128:$Rn, 1)>; +def : Pat<(v1f64 (extract_subvector (v2f64 FPR128:$Rn), (i64 1))), + (DUPi64 FPR128:$Rn, 1)>; // A 64-bit subvector insert to the first 128-bit vector position // is a subregister copy that needs no instruction. 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index b6e9d248a2d26..ea55c198a70f1 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -321,8 +321,8 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: mov x14, x11 ; CHECK-GI-NEXT: subs x13, x13, #16 ; CHECK-GI-NEXT: add x12, x12, #32 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-GI-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mov d4, v2.d[1] ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: smull v3.4s, v0.4h, v3.4h @@ -481,8 +481,8 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: mov x13, x10 ; CHECK-GI-NEXT: subs x12, x12, #16 ; CHECK-GI-NEXT: add x11, x11, #32 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-GI-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mov d4, v2.d[1] ; CHECK-GI-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h @@ -909,10 +909,10 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0 ; CHECK-GI-NEXT: ushll v20.4s, v17.4h, #0 ; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 -; CHECK-GI-NEXT: ext v21.16b, v19.16b, v19.16b, #8 -; CHECK-GI-NEXT: ext v22.16b, v18.16b, v18.16b, #8 -; CHECK-GI-NEXT: ext v23.16b, v20.16b, v20.16b, #8 -; CHECK-GI-NEXT: ext v24.16b, v17.16b, v17.16b, #8 +; CHECK-GI-NEXT: mov d21, v19.d[1] +; CHECK-GI-NEXT: mov d22, v18.d[1] +; CHECK-GI-NEXT: mov d23, v20.d[1] +; CHECK-GI-NEXT: mov d24, v17.d[1] ; CHECK-GI-NEXT: smlal v0.2d, v16.2s, v19.2s ; CHECK-GI-NEXT: smlal v2.2d, v16.2s, v18.2s ; CHECK-GI-NEXT: smlal v4.2d, v16.2s, v20.2s @@ 
-1176,7 +1176,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-GI-NEXT: ldr q1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 ; CHECK-GI-NEXT: add x8, x8, #8 -; CHECK-GI-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v1.d[1] ; CHECK-GI-NEXT: smull v1.2d, v1.2s, v0.2s ; CHECK-GI-NEXT: smull v2.2d, v2.2s, v0.2s ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15 @@ -1302,7 +1302,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-GI-NEXT: ldr q1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 ; CHECK-GI-NEXT: add x8, x8, #8 -; CHECK-GI-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v1.d[1] ; CHECK-GI-NEXT: smull v1.8h, v1.8b, v0.8b ; CHECK-GI-NEXT: smull v2.8h, v2.8b, v0.8b ; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0 @@ -1460,8 +1460,8 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-GI-NEXT: ldur q2, [x9, #8] ; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-GI-NEXT: add w0, w0, #16 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-GI-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mov d4, v2.d[1] ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h @@ -1731,7 +1731,7 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc ; CHECK-GI-NEXT: sqneg v2.8h, v1.8h ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] ; CHECK-GI-NEXT: dup v3.8h, w0 ; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.4h ; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll index 2f0ec2b75bfc6..c5a8bba502e22 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -604,7 +604,7 
@@ entry: define i64 @sminv_v2i64(<2 x i64> %a) { ; CHECK-SD-NOSVE-LABEL: sminv_v2i64: ; CHECK-SD-NOSVE: // %bb.0: // %entry -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmgt d2, d1, d0 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -643,7 +643,7 @@ define i64 @sminv_v3i64(<3 x i64> %a) { ; CHECK-SD-NOSVE-NEXT: mov v2.d[1], x8 ; CHECK-SD-NOSVE-NEXT: cmgt v1.2d, v2.2d, v0.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v2.16b, v1.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmgt d2, d1, d0 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -690,7 +690,7 @@ define i64 @sminv_v4i64(<4 x i64> %a) { ; CHECK-SD-NOSVE: // %bb.0: // %entry ; CHECK-SD-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmgt d2, d1, d0 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -996,7 +996,7 @@ entry: define i64 @smaxv_v2i64(<2 x i64> %a) { ; CHECK-SD-NOSVE-LABEL: smaxv_v2i64: ; CHECK-SD-NOSVE: // %bb.0: // %entry -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmgt d2, d0, d1 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1035,7 +1035,7 @@ define i64 @smaxv_v3i64(<3 x i64> %a) { ; CHECK-SD-NOSVE-NEXT: mov v2.d[1], x8 ; CHECK-SD-NOSVE-NEXT: cmgt v1.2d, v0.2d, v2.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v2.16b, v1.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmgt d2, d0, d1 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1082,7 +1082,7 @@ define i64 @smaxv_v4i64(<4 x i64> %a) 
{ ; CHECK-SD-NOSVE: // %bb.0: // %entry ; CHECK-SD-NOSVE-NEXT: cmgt v2.2d, v0.2d, v1.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmgt d2, d0, d1 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1386,7 +1386,7 @@ entry: define i64 @uminv_v2i64(<2 x i64> %a) { ; CHECK-SD-NOSVE-LABEL: uminv_v2i64: ; CHECK-SD-NOSVE: // %bb.0: // %entry -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmhi d2, d1, d0 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1425,7 +1425,7 @@ define i64 @uminv_v3i64(<3 x i64> %a) { ; CHECK-SD-NOSVE-NEXT: mov v2.d[1], x8 ; CHECK-SD-NOSVE-NEXT: cmhi v1.2d, v2.2d, v0.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v2.16b, v1.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmhi d2, d1, d0 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1472,7 +1472,7 @@ define i64 @uminv_v4i64(<4 x i64> %a) { ; CHECK-SD-NOSVE: // %bb.0: // %entry ; CHECK-SD-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmhi d2, d1, d0 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1765,7 +1765,7 @@ entry: define i64 @umaxv_v2i64(<2 x i64> %a) { ; CHECK-SD-NOSVE-LABEL: umaxv_v2i64: ; CHECK-SD-NOSVE: // %bb.0: // %entry -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmhi d2, d0, d1 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 @@ -1803,7 +1803,7 @@ define i64 @umaxv_v3i64(<3 x i64> %a) { ; CHECK-SD-NOSVE-NEXT: mov 
v0.d[1], v1.d[0] ; CHECK-SD-NOSVE-NEXT: mov v3.d[1], xzr ; CHECK-SD-NOSVE-NEXT: cmhi v3.2d, v0.2d, v3.2d -; CHECK-SD-NOSVE-NEXT: ext v4.16b, v3.16b, v3.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d4, v3.d[1] ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v2.16b, v3.16b ; CHECK-SD-NOSVE-NEXT: and v1.8b, v1.8b, v4.8b ; CHECK-SD-NOSVE-NEXT: cmhi d2, d0, d1 @@ -1850,7 +1850,7 @@ define i64 @umaxv_v4i64(<4 x i64> %a) { ; CHECK-SD-NOSVE: // %bb.0: // %entry ; CHECK-SD-NOSVE-NEXT: cmhi v2.2d, v0.2d, v1.2d ; CHECK-SD-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOSVE-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOSVE-NEXT: cmhi d2, d0, d1 ; CHECK-SD-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NOSVE-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll index ce62dba4400d1..f8d7cca916159 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll @@ -132,7 +132,7 @@ entry: define i8 @mulv_v16i8(<16 x i8> %a) { ; CHECK-SD-LABEL: mulv_v16i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: umov w8, v0.b[1] ; CHECK-SD-NEXT: umov w9, v0.b[0] @@ -153,7 +153,7 @@ define i8 @mulv_v16i8(<16 x i8> %a) { ; ; CHECK-GI-LABEL: mulv_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mul v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -180,7 +180,7 @@ define i8 @mulv_v32i8(<32 x i8> %a) { ; CHECK-SD-LABEL: mulv_v32i8: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mul v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: umov w8, v0.b[1] ; CHECK-SD-NEXT: umov w9, v0.b[0] @@ -201,8 +201,8 @@ 
define i8 @mulv_v32i8(<32 x i8> %a) { ; ; CHECK-GI-LABEL: mulv_v32i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: mul v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: mul v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mul v0.8b, v0.8b, v1.8b @@ -304,7 +304,7 @@ entry: define i16 @mulv_v8i16(<8 x i16> %a) { ; CHECK-SD-LABEL: mulv_v8i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: umov w8, v0.h[1] ; CHECK-SD-NEXT: umov w9, v0.h[0] @@ -317,7 +317,7 @@ define i16 @mulv_v8i16(<8 x i16> %a) { ; ; CHECK-GI-LABEL: mulv_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] @@ -336,7 +336,7 @@ define i16 @mulv_v16i16(<16 x i16> %a) { ; CHECK-SD-LABEL: mulv_v16i16: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mul v0.8h, v0.8h, v1.8h -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: umov w8, v0.h[1] ; CHECK-SD-NEXT: umov w9, v0.h[0] @@ -349,8 +349,8 @@ define i16 @mulv_v16i16(<16 x i16> %a) { ; ; CHECK-GI-LABEL: mulv_v16i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h ; CHECK-GI-NEXT: mul v1.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h @@ -393,7 +393,7 @@ define i32 @mulv_v3i32(<3 x i32> %a) { ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: mov v1.s[3], w8 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 
+; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mul v0.2s, v0.2s, v0.s[1] ; CHECK-NEXT: fmov w0, s0 @@ -406,7 +406,7 @@ entry: define i32 @mulv_v4i32(<4 x i32> %a) { ; CHECK-SD-LABEL: mulv_v4i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: fmov w0, s0 @@ -414,7 +414,7 @@ define i32 @mulv_v4i32(<4 x i32> %a) { ; ; CHECK-GI-LABEL: mulv_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -429,7 +429,7 @@ define i32 @mulv_v8i32(<8 x i32> %a) { ; CHECK-SD-LABEL: mulv_v8i32: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: fmov w0, s0 @@ -437,8 +437,8 @@ define i32 @mulv_v8i32(<8 x i32> %a) { ; ; CHECK-GI-LABEL: mulv_v8i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index ee5b2e00c6a69..5896b408be49c 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -1726,7 +1726,7 @@ define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) { ; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: and v2.16b, v2.16b, 
v3.16b -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: xtn v1.4h, v1.4s ; CHECK-GI-NEXT: xtn v2.4h, v2.4s ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h @@ -1759,7 +1759,7 @@ define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) { ; CHECK-GI-LABEL: umull_and_v8i32_dup: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] ; CHECK-GI-NEXT: dup v1.4s, w8 ; CHECK-GI-NEXT: xtn v1.4h, v1.4s ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h @@ -1832,7 +1832,7 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { ; CHECK-GI-NEXT: movi v3.2d, #0x000000000000ff ; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: xtn v1.2s, v1.2d ; CHECK-GI-NEXT: xtn v2.2s, v2.2d ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s @@ -1865,7 +1865,7 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { ; CHECK-GI-LABEL: umull_and_v4i64_dup: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and x8, x0, #0xff -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] ; CHECK-GI-NEXT: dup v1.2d, x8 ; CHECK-GI-NEXT: xtn v1.2s, v1.2d ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s @@ -1902,7 +1902,7 @@ define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { ; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q2, [x1, #16] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: xtn v2.8b, v2.8h ; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h @@ -1938,7 +1938,7 @@ define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { ; CHECK-GI-LABEL: smlsl2_v8i16_uzp1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q2, [x1, #16] -; 
CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: xtn v2.8b, v2.8h ; CHECK-GI-NEXT: smlsl v1.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: str q1, [x0] @@ -1973,7 +1973,7 @@ define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { ; CHECK-GI-LABEL: umlsl2_v8i16_uzp1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q2, [x1, #16] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: xtn v2.8b, v2.8h ; CHECK-GI-NEXT: umlsl v1.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: str q1, [x0] @@ -2008,7 +2008,7 @@ define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { ; CHECK-GI-LABEL: smlsl2_v4i32_uzp1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q2, [x1, #16] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: xtn v2.4h, v2.4s ; CHECK-GI-NEXT: smlsl v1.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: str q1, [x0] @@ -2043,7 +2043,7 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { ; CHECK-GI-LABEL: umlsl2_v4i32_uzp1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q2, [x1, #16] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: xtn v2.4h, v2.4s ; CHECK-GI-NEXT: umlsl v1.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: str q1, [x0] @@ -2084,7 +2084,7 @@ define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, ; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldp q2, q3, [x1] -; CHECK-GI-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d4, v0.d[1] ; CHECK-GI-NEXT: xtn v2.8b, v2.8h ; CHECK-GI-NEXT: xtn v3.8b, v3.8h ; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b @@ -2131,7 +2131,7 @@ define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, ; CHECK-GI-LABEL: smlsl_smlsl2_v8i16_uzp1: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldp q2, q3, [x1] -; CHECK-GI-NEXT: ext v4.16b, v0.16b, v0.16b, 
#8 +; CHECK-GI-NEXT: mov d4, v0.d[1] ; CHECK-GI-NEXT: xtn v2.8b, v2.8h ; CHECK-GI-NEXT: xtn v3.8b, v3.8h ; CHECK-GI-NEXT: smlsl v1.8h, v0.8b, v2.8b @@ -2176,7 +2176,7 @@ define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, ; CHECK-GI-LABEL: umlsl_umlsl2_v8i16_uzp1: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldp q2, q3, [x1] -; CHECK-GI-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d4, v0.d[1] ; CHECK-GI-NEXT: xtn v2.8b, v2.8h ; CHECK-GI-NEXT: xtn v3.8b, v3.8h ; CHECK-GI-NEXT: umlsl v1.8h, v0.8b, v2.8b @@ -2221,7 +2221,7 @@ define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, ; CHECK-GI-LABEL: smlsl_smlsl2_v4i32_uzp1: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldp q2, q3, [x1] -; CHECK-GI-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d4, v0.d[1] ; CHECK-GI-NEXT: xtn v2.4h, v2.4s ; CHECK-GI-NEXT: xtn v3.4h, v3.4s ; CHECK-GI-NEXT: smlsl v1.4s, v0.4h, v2.4h @@ -2266,7 +2266,7 @@ define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, ; CHECK-GI-LABEL: umlsl_umlsl2_v4i32_uzp1: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldp q2, q3, [x1] -; CHECK-GI-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d4, v0.d[1] ; CHECK-GI-NEXT: xtn v2.4h, v2.4s ; CHECK-GI-NEXT: xtn v3.4h, v3.4s ; CHECK-GI-NEXT: umlsl v1.4s, v0.4h, v2.4h @@ -2309,7 +2309,7 @@ define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) { ; CHECK-GI-LABEL: do_stuff: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: xtn v0.2s, v0.2d -; CHECK-GI-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v1.d[1] ; CHECK-GI-NEXT: smull v0.2d, v2.2s, v0.2s ; CHECK-GI-NEXT: xtn v0.2s, v0.2d ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll index 4e60d99dbed36..f88d88cf1661c 100644 --- a/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll +++ 
b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll @@ -6,7 +6,7 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECKLE-LABEL: test_reconstructshuffle: ; CHECKLE: // %bb.0: ; CHECKLE-NEXT: mov b2, v0.b[3] -; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECKLE-NEXT: mov d1, v1.d[1] ; CHECKLE-NEXT: mov v2.b[2], v0.b[2] ; CHECKLE-NEXT: mov v2.b[4], v0.b[1] ; CHECKLE-NEXT: mov v2.b[6], v0.b[0] @@ -22,7 +22,7 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECKBE-NEXT: mov b2, v0.b[3] -; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECKBE-NEXT: mov d1, v1.d[1] ; CHECKBE-NEXT: mov v2.b[2], v0.b[2] ; CHECKBE-NEXT: mov v2.b[4], v0.b[1] ; CHECKBE-NEXT: mov v2.b[6], v0.b[0] diff --git a/llvm/test/CodeGen/AArch64/abd-combine.ll b/llvm/test/CodeGen/AArch64/abd-combine.ll index cdb40ceb46b1e..d632648bbcd11 100644 --- a/llvm/test/CodeGen/AArch64/abd-combine.ll +++ b/llvm/test/CodeGen/AArch64/abd-combine.ll @@ -340,7 +340,7 @@ define <8 x i16> @abds_const_lhs(<8 x i16> %src1) { define <8 x i16> @abds_const_zero(<8 x i16> %src1) { ; CHECK-LABEL: abds_const_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: abs v0.4h, v0.4h ; CHECK-NEXT: abs v1.4h, v1.4h ; CHECK-NEXT: mov v0.d[1], v1.d[0] diff --git a/llvm/test/CodeGen/AArch64/add-extract.ll b/llvm/test/CodeGen/AArch64/add-extract.ll index 923bf089f47ba..002b8e52f4199 100644 --- a/llvm/test/CodeGen/AArch64/add-extract.ll +++ b/llvm/test/CodeGen/AArch64/add-extract.ll @@ -120,7 +120,7 @@ define i32 @add_i32_ext_load(<1 x i32> %A, ptr %B) nounwind { define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind { ; CHECK-SD-LABEL: add_i64_ext_ext_test1: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d2, v1.d[1] ; CHECK-SD-NEXT: add d0, 
d0, d1 ; CHECK-SD-NEXT: add d0, d0, d2 ; CHECK-SD-NEXT: fmov x0, d0 @@ -145,7 +145,7 @@ define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind { define i64 @sub_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind { ; CHECK-SD-LABEL: sub_i64_ext_ext_test1: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d2, v1.d[1] ; CHECK-SD-NEXT: sub d0, d0, d1 ; CHECK-SD-NEXT: sub d0, d0, d2 ; CHECK-SD-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/arm64-ext.ll b/llvm/test/CodeGen/AArch64/arm64-ext.ll index 85653874c18cb..3194fe1d9a9bc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ext.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ext.ll @@ -101,7 +101,7 @@ define <8 x i16> @test_vextRq_undef2(<8 x i16> %tmp1) nounwind { define <4 x i16> @test_undef(<8 x i16> %tmp1, <8 x i16> %tmp2) { ; CHECK-SD-LABEL: test_undef: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: ret ; @@ -146,7 +146,7 @@ define <16 x i8> @reverse_vector_s8x16b(<16 x i8> noundef %x) { ; CHECK-GI-LABEL: reverse_vector_s8x16b: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: rev64 v1.16b, v0.16b -; CHECK-GI-NEXT: ext v0.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d0, v1.d[1] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: @@ -167,7 +167,7 @@ define <8 x i16> @reverse_vector_s16x8b(<8 x i16> noundef %x) { ; CHECK-GI-LABEL: reverse_vector_s16x8b: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: rev64 v1.8h, v0.8h -; CHECK-GI-NEXT: ext v0.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d0, v1.d[1] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: @@ -188,7 +188,7 @@ define <4 x i32> @reverse_vector_s32x4b(<4 x i32> noundef %x) { ; CHECK-GI-LABEL: reverse_vector_s32x4b: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: rev64 v1.4s, v0.4s -; CHECK-GI-NEXT: ext v0.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov 
d0, v1.d[1] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll b/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll index 1a4c7d8499b43..d5a059380fb3c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll @@ -7,8 +7,7 @@ define <8 x i8> @v8i8(<16 x i8> %a) nounwind { ; CHECK-LABEL: v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0[1] ; CHECK-NEXT: ret %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> ret <8 x i8> %ret @@ -17,8 +16,7 @@ define <8 x i8> @v8i8(<16 x i8> %a) nounwind { define <4 x i16> @v4i16(<8 x i16> %a) nounwind { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0[1] ; CHECK-NEXT: ret %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> ret <4 x i16> %ret @@ -27,39 +25,26 @@ define <4 x i16> @v4i16(<8 x i16> %a) nounwind { define <2 x i32> @v2i32(<4 x i32> %a) nounwind { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0[1] ; CHECK-NEXT: ret %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> ret <2 x i32> %ret } define <1 x i64> @v1i64(<2 x i64> %a) nounwind { -; CHECK-SD-LABEL: v1i64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v1i64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov d0, v0[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov d0, v0[1] +; CHECK-NEXT: ret %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> ret <1 x i64> %ret } define <1 x ptr> @v1p0(<2 x ptr> %a) nounwind { -; CHECK-SD-LABEL: v1p0: -; 
CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v1p0: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov d0, v0[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v1p0: +; CHECK: // %bb.0: +; CHECK-NEXT: mov d0, v0[1] +; CHECK-NEXT: ret %ret = shufflevector <2 x ptr> %a, <2 x ptr> %a, <1 x i32> ret <1 x ptr> %ret } @@ -67,25 +52,21 @@ define <1 x ptr> @v1p0(<2 x ptr> %a) nounwind { define <2 x float> @v2f32(<4 x float> %a) nounwind { ; CHECK-LABEL: v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0[1] ; CHECK-NEXT: ret %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> ret <2 x float> %ret } define <1 x double> @v1f64(<2 x double> %a) nounwind { -; CHECK-SD-LABEL: v1f64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext.16b v0, v0, v0, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v1f64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov d0, v0[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov d0, v0[1] +; CHECK-NEXT: ret %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> ret <1 x double> %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll index daf70859abfd6..7581671a51f60 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll @@ -14,7 +14,7 @@ define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { ; ; CHECK-GI-LABEL: test_vmull_high_n_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.4h, w0 ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret @@ -38,7 +38,7 @@ define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 { ; CHECK-GI-LABEL: test_vmull_high_n_s16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.4h, #29 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret entry: @@ -56,7 +56,7 @@ define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { ; ; CHECK-GI-LABEL: test_vmull_high_n_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.2s, w0 ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret @@ -78,7 +78,7 @@ define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 { ; CHECK-GI-LABEL: test_vmull_high_n_s32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.2s, #1, msl #8 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret entry: @@ -96,7 +96,7 @@ define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 { ; ; CHECK-GI-LABEL: test_vmull_high_n_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.4h, w0 ; 
CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret @@ -120,7 +120,7 @@ define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 { ; CHECK-GI-LABEL: test_vmull_high_n_u16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.4h, #17, lsl #8 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret entry: @@ -138,7 +138,7 @@ define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 { ; ; CHECK-GI-LABEL: test_vmull_high_n_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.2s, w0 ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret @@ -159,7 +159,7 @@ define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 { ; ; CHECK-GI-LABEL: test_vmull_high_n_u32_imm: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: mvni v1.2s, #1, msl #8 ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret @@ -178,7 +178,7 @@ define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { ; ; CHECK-GI-LABEL: test_vqdmull_high_n_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.4h, w0 ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret @@ -201,7 +201,7 @@ define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 { ; ; CHECK-GI-LABEL: test_vqdmull_high_n_s16_imm: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: mvni v1.4h, #17, lsl #8 ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret @@ -220,7 +220,7 @@ define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { ; ; CHECK-GI-LABEL: test_vqdmull_high_n_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext 
v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.2s, w0 ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret @@ -242,7 +242,7 @@ define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 { ; CHECK-GI-LABEL: test_vqdmull_high_n_s32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.2s, #29 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret entry: @@ -260,7 +260,7 @@ define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlal_high_n_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.4h, w0 ; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret @@ -285,7 +285,7 @@ define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-GI-LABEL: test_vmlal_high_n_s16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.4h, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -304,7 +304,7 @@ define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlal_high_n_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.2s, w0 ; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret @@ -327,7 +327,7 @@ define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-GI-LABEL: test_vmlal_high_n_s32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2s, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: @@ -346,7 +346,7 @@ define <4 x i32> 
@test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlal_high_n_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.4h, w0 ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret @@ -371,7 +371,7 @@ define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-GI-LABEL: test_vmlal_high_n_u16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.4h, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -390,7 +390,7 @@ define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlal_high_n_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.2s, w0 ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret @@ -413,7 +413,7 @@ define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-GI-LABEL: test_vmlal_high_n_u32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2s, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: @@ -432,7 +432,7 @@ define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 ; ; CHECK-GI-LABEL: test_vqdmlal_high_n_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.4h, w0 ; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret @@ -457,7 +457,7 @@ define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-GI-LABEL: test_vqdmlal_high_n_s16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.4h, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, 
v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -476,7 +476,7 @@ define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 ; ; CHECK-GI-LABEL: test_vqdmlal_high_n_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.2s, w0 ; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret @@ -499,7 +499,7 @@ define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-GI-LABEL: test_vqdmlal_high_n_s32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2s, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: @@ -518,7 +518,7 @@ define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlsl_high_n_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.4h, w0 ; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret @@ -543,7 +543,7 @@ define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-GI-LABEL: test_vmlsl_high_n_s16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.4h, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -562,7 +562,7 @@ define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlsl_high_n_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.2s, w0 ; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret @@ -585,7 +585,7 @@ define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> 
%b) #0 { ; CHECK-GI-LABEL: test_vmlsl_high_n_s32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2s, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: @@ -604,7 +604,7 @@ define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlsl_high_n_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.4h, w0 ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret @@ -629,7 +629,7 @@ define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-GI-LABEL: test_vmlsl_high_n_u16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.4h, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -648,7 +648,7 @@ define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; ; CHECK-GI-LABEL: test_vmlsl_high_n_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.2s, w0 ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret @@ -671,7 +671,7 @@ define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-GI-LABEL: test_vmlsl_high_n_u32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2s, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: @@ -690,7 +690,7 @@ define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 ; ; CHECK-GI-LABEL: test_vqdmlsl_high_n_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.4h, w0 ; 
CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret @@ -715,7 +715,7 @@ define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-GI-LABEL: test_vqdmlsl_high_n_s16_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.4h, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -734,7 +734,7 @@ define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 ; ; CHECK-GI-LABEL: test_vqdmlsl_high_n_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v2.2s, w0 ; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret @@ -757,7 +757,7 @@ define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-GI-LABEL: test_vqdmlsl_high_n_s32_imm: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v2.2s, #29 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll index 49ab7ae56d106..85d8b7c3e2866 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -812,7 +812,7 @@ define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> ; ; CHECK-GI-LABEL: test_vmlal_high_lane_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[3] ; CHECK-GI-NEXT: ret @@ -833,7 +833,7 @@ define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> ; ; CHECK-GI-LABEL: test_vmlal_high_lane_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext 
v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret @@ -853,7 +853,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: @@ -872,7 +872,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: @@ -942,7 +942,7 @@ define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[3] ; CHECK-GI-NEXT: ret @@ -963,7 +963,7 @@ define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret @@ -983,7 +983,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: @@ -1002,7 +1002,7 @@ define <2 x i64> 
@test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: @@ -1072,7 +1072,7 @@ define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> ; ; CHECK-GI-LABEL: test_vmlal_high_lane_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[3] ; CHECK-GI-NEXT: ret @@ -1093,7 +1093,7 @@ define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> ; ; CHECK-GI-LABEL: test_vmlal_high_lane_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret @@ -1113,7 +1113,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: @@ -1132,7 +1132,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: @@ -1202,7 +1202,7 @@ define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, 
v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[3] ; CHECK-GI-NEXT: ret @@ -1223,7 +1223,7 @@ define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret @@ -1243,7 +1243,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: @@ -1262,7 +1262,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: @@ -1330,7 +1330,7 @@ define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[3] ; CHECK-GI-NEXT: ret @@ -1350,7 +1350,7 @@ define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[1] ; CHECK-GI-NEXT: ret @@ -1370,7 +1370,7 @@ define <4 x i32> 
@test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[3] ; CHECK-GI-NEXT: ret @@ -1390,7 +1390,7 @@ define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[1] ; CHECK-GI-NEXT: ret @@ -1453,7 +1453,7 @@ define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[7] ; CHECK-GI-NEXT: ret entry: @@ -1471,7 +1471,7 @@ define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[3] ; CHECK-GI-NEXT: ret entry: @@ -1489,7 +1489,7 @@ define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_u16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[7] ; CHECK-GI-NEXT: ret entry: @@ -1507,7 +1507,7 @@ define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_u32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: umull v0.2d, v0.2s, 
v1.s[3] ; CHECK-GI-NEXT: ret entry: @@ -1552,7 +1552,7 @@ define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i1 ; ; CHECK-GI-LABEL: test_vqdmlal_high_lane_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.h[3] ; CHECK-GI-NEXT: ret @@ -1573,7 +1573,7 @@ define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i3 ; ; CHECK-GI-LABEL: test_vqdmlal_high_lane_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret @@ -1620,7 +1620,7 @@ define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i1 ; ; CHECK-GI-LABEL: test_vqdmlsl_high_lane_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[3] ; CHECK-GI-NEXT: ret @@ -1641,7 +1641,7 @@ define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i3 ; ; CHECK-GI-LABEL: test_vqdmlsl_high_lane_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret @@ -1708,7 +1708,7 @@ define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_lane_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[3] ; CHECK-GI-NEXT: ret @@ -1728,7 +1728,7 @@ define <2 x 
i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_lane_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[1] ; CHECK-GI-NEXT: ret @@ -1747,7 +1747,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[7] ; CHECK-GI-NEXT: ret entry: @@ -1765,7 +1765,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[3] ; CHECK-GI-NEXT: ret entry: @@ -2268,7 +2268,7 @@ define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x ; ; CHECK-GI-LABEL: test_vadd_lane5_i16_bitcast_bigger_aligned: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: dup v1.4h, v1.h[1] ; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: ret @@ -3002,7 +3002,7 @@ define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 ; ; CHECK-GI-LABEL: test_vmlal_high_lane_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret @@ -3023,7 +3023,7 @@ define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 ; ; CHECK-GI-LABEL: test_vmlal_high_lane_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, 
v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret @@ -3043,7 +3043,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3062,7 +3062,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: @@ -3132,7 +3132,7 @@ define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret @@ -3153,7 +3153,7 @@ define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret @@ -3173,7 +3173,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3192,7 +3192,7 @@ define <2 x i64> 
@test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: @@ -3262,7 +3262,7 @@ define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 ; ; CHECK-GI-LABEL: test_vmlal_high_lane_u16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret @@ -3283,7 +3283,7 @@ define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 ; ; CHECK-GI-LABEL: test_vmlal_high_lane_u32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret @@ -3303,7 +3303,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3322,7 +3322,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: @@ -3392,7 +3392,7 @@ define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_u16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; 
CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret @@ -3413,7 +3413,7 @@ define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 ; ; CHECK-GI-LABEL: test_vmlsl_high_lane_u32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret @@ -3433,7 +3433,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3452,7 +3452,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: @@ -3520,7 +3520,7 @@ define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret @@ -3540,7 +3540,7 @@ define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret @@ -3560,7 +3560,7 @@ 
define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_u16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret @@ -3580,7 +3580,7 @@ define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_lane_u32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret @@ -3643,7 +3643,7 @@ define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3661,7 +3661,7 @@ define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret entry: @@ -3679,7 +3679,7 @@ define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_u16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3697,7 +3697,7 @@ define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; ; CHECK-GI-LABEL: test_vmull_high_laneq_u32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, 
v0.d[1] ; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret entry: @@ -3742,7 +3742,7 @@ define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x ; ; CHECK-GI-LABEL: test_vqdmlal_high_lane_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret @@ -3763,7 +3763,7 @@ define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x ; ; CHECK-GI-LABEL: test_vqdmlal_high_lane_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret @@ -3810,7 +3810,7 @@ define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x ; ; CHECK-GI-LABEL: test_vqdmlsl_high_lane_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret @@ -3831,7 +3831,7 @@ define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x ; ; CHECK-GI-LABEL: test_vqdmlsl_high_lane_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret @@ -3898,7 +3898,7 @@ define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_lane_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[0] ; 
CHECK-GI-NEXT: ret @@ -3918,7 +3918,7 @@ define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_lane_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret @@ -3937,7 +3937,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret entry: @@ -3955,7 +3955,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index 9aec2f1925b79..79822682c2178 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -2768,7 +2768,7 @@ define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coer ; CHECK-GI-NEXT: dup v2.8h, v1.h[0] ; CHECK-GI-NEXT: sqneg v1.8h, v2.8h ; CHECK-GI-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] ; CHECK-GI-NEXT: fmov d3, x0 ; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.h[0] ; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll index aba73cf745c22..0ede4bc7a4d6c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll 
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll @@ -137,7 +137,7 @@ define i32 @addp_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-GI-LABEL: addp_v4i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: addp v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: dup v1.2s, v0.s[1] ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s @@ -164,7 +164,7 @@ define <4 x i16> @addp_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-GI-LABEL: addp_v8i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: addp v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: ret %1 = add <8 x i16> %a, %b @@ -185,7 +185,7 @@ define <8 x i8> @addp_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-LABEL: addp_v16i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: addp v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: ret %1 = add <16 x i8> %a, %b diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 4f3666c58d317..18d745939e478 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -284,16 +284,10 @@ define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { } define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { -; CHECK-SD-LABEL: ins2f1: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ins2f1: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ins2f1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: ret %tmp3 = extractelement <2 x double> %tmp1, i32 1 %tmp4 = insertelement <1 x double> %tmp2, 
double %tmp3, i32 0 ret <1 x double> %tmp4 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll index dcd34af7c93cb..cd101afe57c40 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll @@ -2014,7 +2014,7 @@ define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) { ; CHECK-GI-NEXT: ushll v0.4s, v2.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d5, v1.d[1] ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: mov w10, v0.s[1] @@ -2173,8 +2173,8 @@ define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) { ; CHECK-GI-NEXT: ushll v5.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll2 v4.4s, v4.8h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v1.4h, #0 -; CHECK-GI-NEXT: ext v18.16b, v3.16b, v3.16b, #8 -; CHECK-GI-NEXT: ext v19.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d18, v3.d[1] +; CHECK-GI-NEXT: mov d19, v1.d[1] ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov w12, v2.s[3] @@ -2390,7 +2390,7 @@ define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) { ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d5, v1.d[1] ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: mov w10, v2.s[1] diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-vget.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-vget.ll index 96d5b5ef85415..dbf19b1d6fa57 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-vget.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-vget.ll @@ -4,8 +4,7 @@ define <8 x i8> @test_vget_high_s8(<16 x i8> %a) { ; CHECK-LABEL: test_vget_high_s8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> @@ -15,8 +14,7 @@ entry: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) { ; CHECK-LABEL: test_vget_high_s16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -26,8 +24,7 @@ entry: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) { ; CHECK-LABEL: test_vget_high_s32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -37,8 +34,7 @@ entry: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) { ; CHECK-LABEL: test_vget_high_s64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> @@ -48,8 +44,7 @@ entry: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) { ; CHECK-LABEL: test_vget_high_u8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> @@ -59,8 +54,7 @@ entry: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) { ; CHECK-LABEL: test_vget_high_u16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -70,8 +64,7 @@ entry: define <2 x 
i32> @test_vget_high_u32(<4 x i32> %a) { ; CHECK-LABEL: test_vget_high_u32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -81,8 +74,7 @@ entry: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) { ; CHECK-LABEL: test_vget_high_u64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> @@ -92,8 +84,7 @@ entry: define <1 x i64> @test_vget_high_p64(<2 x i64> %a) { ; CHECK-LABEL: test_vget_high_p64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> @@ -103,8 +94,7 @@ entry: define <4 x i16> @test_vget_high_f16(<8 x i16> %a) { ; CHECK-LABEL: test_vget_high_f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -114,8 +104,7 @@ entry: define <2 x float> @test_vget_high_f32(<4 x float> %a) { ; CHECK-LABEL: test_vget_high_f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> @@ -125,8 +114,7 @@ entry: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) { ; CHECK-LABEL: test_vget_high_p8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; 
CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> @@ -136,8 +124,7 @@ entry: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) { ; CHECK-LABEL: test_vget_high_p16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -147,8 +134,7 @@ entry: define <1 x double> @test_vget_high_f64(<2 x double> %a) { ; CHECK-LABEL: test_vget_high_f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index 0a03c81f9ecd8..d175247cca3d9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -1663,7 +1663,7 @@ define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; CHECK-GI-LABEL: uabdl2_from_extract_dup: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup.2s v1, w0 -; CHECK-GI-NEXT: ext.16b v0, v0, v0, #8 +; CHECK-GI-NEXT: mov d0, v0[1] ; CHECK-GI-NEXT: uabdl.2d v0, v0, v1 ; CHECK-GI-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 @@ -1698,7 +1698,7 @@ define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; CHECK-GI-LABEL: sabdl2_from_extract_dup: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup.2s v1, w0 -; CHECK-GI-NEXT: ext.16b v0, v0, v0, #8 +; CHECK-GI-NEXT: mov d0, v0[1] ; CHECK-GI-NEXT: sabdl.2d v0, v0, v1 ; CHECK-GI-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 @@ -2033,7 +2033,7 @@ define <16 x i16> @uabd16b_i16_const_select(<16 x i8> %a) { ; 
CHECK-GI-LABEL: uabd16b_i16_const_select: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI106_1 -; CHECK-GI-NEXT: ext.16b v3, v0, v0, #8 +; CHECK-GI-NEXT: mov d3, v0[1] ; CHECK-GI-NEXT: ushll.8h v4, v0, #0 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI106_1] ; CHECK-GI-NEXT: adrp x8, .LCPI106_0 @@ -2075,7 +2075,7 @@ define <16 x i16> @sabd16b_i16_const_select(<16 x i8> %a) { ; CHECK-GI-LABEL: sabd16b_i16_const_select: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: adrp x8, .LCPI107_1 -; CHECK-GI-NEXT: ext.16b v3, v0, v0, #8 +; CHECK-GI-NEXT: mov d3, v0[1] ; CHECK-GI-NEXT: sshll.8h v4, v0, #0 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI107_1] ; CHECK-GI-NEXT: adrp x8, .LCPI107_0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll index 9a2f7768dded5..3cf01150712c9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll @@ -1041,7 +1041,7 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) { ; CHECK-GI-LABEL: usubl2_duprhs: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup v1.2s, w0 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: usubl v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 611079ef6ad89..fbf6df2b1fda4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1542,7 +1542,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind { ; ; CHECK-GI-LABEL: sqdmull2_lane_4s: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[1] ; CHECK-GI-NEXT: ret %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> @@ -1559,7 +1559,7 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind { ; ; CHECK-GI-LABEL: 
sqdmull2_lane_2d: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[1] ; CHECK-GI-NEXT: ret %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> @@ -1673,7 +1673,7 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nou ; ; CHECK-GI-LABEL: sqdmlal2_lane_4s: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: sqdmlal v0.4s, v3.4h, v1.h[1] ; CHECK-GI-NEXT: ret @@ -1693,7 +1693,7 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou ; ; CHECK-GI-LABEL: sqdmlal2_lane_2d: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: sqdmlal v0.2d, v3.2s, v1.s[1] ; CHECK-GI-NEXT: ret @@ -1978,7 +1978,7 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nou ; ; CHECK-GI-LABEL: sqdmlsl2_lane_4s: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: sqdmlsl v0.4s, v3.4h, v1.h[1] ; CHECK-GI-NEXT: ret @@ -1998,7 +1998,7 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou ; ; CHECK-GI-LABEL: sqdmlsl2_lane_2d: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: mov v0.16b, v2.16b ; CHECK-GI-NEXT: sqdmlsl v0.2d, v3.2s, v1.s[1] ; CHECK-GI-NEXT: ret @@ -2908,7 +2908,7 @@ define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) { ; CHECK-GI-LABEL: mull_from_extract_dup_high: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup v1.2s, w0 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: ret 
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 @@ -2945,7 +2945,7 @@ define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { ; CHECK-GI-LABEL: pmull_from_extract_dup_high: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: dup v1.8b, w0 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 @@ -2982,7 +2982,7 @@ define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) ; CHECK-GI-LABEL: pmull_from_extract_duplane_high: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: dup v1.8b, v1.b[0] ; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ret @@ -3013,7 +3013,7 @@ define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %r ; ; CHECK-GI-LABEL: sqdmull_from_extract_duplane_high: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> @@ -3044,7 +3044,7 @@ define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> ; ; CHECK-GI-LABEL: sqdmlal_from_extract_duplane_high: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> @@ -3076,7 +3076,7 @@ define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %l ; ; CHECK-GI-LABEL: umlal_from_extract_duplane_high: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret 
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> @@ -3495,7 +3495,7 @@ define <4 x i32> @sqdmlal2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> ; ; CHECK-GI-LABEL: sqdmlal2_lane_4s_lib: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlal v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> @@ -3513,7 +3513,7 @@ define <2 x i64> @sqdmlal2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> ; ; CHECK-GI-LABEL: sqdmlal2_lane_2d_lib: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> @@ -3599,7 +3599,7 @@ define <4 x i32> @sqdmlsl2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> ; ; CHECK-GI-LABEL: sqdmlsl2_lane_4s_lib: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> @@ -3617,7 +3617,7 @@ define <2 x i64> @sqdmlsl2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> ; ; CHECK-GI-LABEL: sqdmlsl2_lane_2d_lib: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1] ; CHECK-GI-NEXT: ret %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> @@ -3688,7 +3688,7 @@ define <16 x i16> @or_sext_idx_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1, <16 x i16 ; CHECK-GI-LABEL: or_sext_idx_v16i8_i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: dup v0.8b, v0.b[3] -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: smlal v2.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: smlal v3.8h, v0.8b, v4.8b ; CHECK-GI-NEXT: mov 
v0.16b, v2.16b @@ -3716,7 +3716,7 @@ define <16 x i16> @or_zext_idx_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1, <16 x i16 ; CHECK-GI-LABEL: or_zext_idx_v16i8_i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: dup v0.8b, v0.b[3] -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: umlal v2.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: umlal v3.8h, v0.8b, v4.8b ; CHECK-GI-NEXT: mov v0.16b, v2.16b @@ -3793,7 +3793,7 @@ define <8 x i32> @or_sext_idx_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1, <8 x i32> ; ; CHECK-GI-LABEL: or_sext_idx_v8i16_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: smlal v2.4s, v1.4h, v0.h[3] ; CHECK-GI-NEXT: smlal v3.4s, v4.4h, v0.h[3] ; CHECK-GI-NEXT: mov v0.16b, v2.16b @@ -3819,7 +3819,7 @@ define <8 x i32> @or_zext_idx_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1, <8 x i32> ; ; CHECK-GI-LABEL: or_zext_idx_v8i16_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: umlal v2.4s, v1.4h, v0.h[3] ; CHECK-GI-NEXT: umlal v3.4s, v4.4h, v0.h[3] ; CHECK-GI-NEXT: mov v0.16b, v2.16b @@ -3893,7 +3893,7 @@ define <4 x i64> @or_sext_idx_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1, <4 x i64> ; ; CHECK-GI-LABEL: or_sext_idx_v4i32_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v0.s[3] ; CHECK-GI-NEXT: smlal v3.2d, v4.2s, v0.s[3] ; CHECK-GI-NEXT: mov v0.16b, v2.16b @@ -3919,7 +3919,7 @@ define <4 x i64> @or_zext_idx_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1, <4 x i64> ; ; CHECK-GI-LABEL: or_zext_idx_v4i32_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: umlal v2.2d, v1.2s, v0.s[3] ; CHECK-GI-NEXT: umlal v3.2d, v4.2s, v0.s[3] ; CHECK-GI-NEXT: mov v0.16b, v2.16b diff --git 
a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll index aaf00dbb8dc7a..8d17836a2b761 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll @@ -2252,7 +2252,7 @@ define <8 x i16> @ushll2_8h(ptr %A) nounwind { ; CHECK-GI-LABEL: ushll2_8h: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #1 ; CHECK-GI-NEXT: ret %load1 = load <16 x i8>, ptr %A @@ -2272,7 +2272,7 @@ define <4 x i32> @ushll2_4s(ptr %A) nounwind { ; CHECK-GI-LABEL: ushll2_4s: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #1 ; CHECK-GI-NEXT: ret %load1 = load <8 x i16>, ptr %A @@ -2292,7 +2292,7 @@ define <2 x i64> @ushll2_2d(ptr %A) nounwind { ; CHECK-GI-LABEL: ushll2_2d: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1 ; CHECK-GI-NEXT: ret %load1 = load <4 x i32>, ptr %A @@ -2892,7 +2892,7 @@ define <8 x i16> @sshll2_8h(ptr %A) nounwind { ; CHECK-GI-LABEL: sshll2_8h: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #1 ; CHECK-GI-NEXT: ret %load1 = load <16 x i8>, ptr %A @@ -2912,7 +2912,7 @@ define <4 x i32> @sshll2_4s(ptr %A) nounwind { ; CHECK-GI-LABEL: sshll2_4s: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr q0, [x0] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #1 ; CHECK-GI-NEXT: ret %load1 = load <8 x i16>, ptr %A @@ -2932,7 +2932,7 @@ define <2 x i64> @sshll2_2d(ptr %A) nounwind { ; CHECK-GI-LABEL: sshll2_2d: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ldr 
q0, [x0] -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #1 ; CHECK-GI-NEXT: ret %load1 = load <4 x i32>, ptr %A diff --git a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll index d59de3c56f4ee..26a2f9d5299db 100644 --- a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll @@ -201,8 +201,7 @@ entry: define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) { ; CHECK-LABEL: test_vext_aligned_bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index b18b44b46d11a..9453eaa891ca2 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -2000,7 +2000,7 @@ define <8 x i32> @test_fptosi_i32(<8 x bfloat> %a) #0 { define <8 x i64> @test_fptosi_i64(<8 x bfloat> %a) #0 { ; CHECK-LABEL: test_fptosi_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: mov h4, v0.h[2] ; CHECK-NEXT: mov h3, v0.h[1] ; CHECK-NEXT: mov h7, v0.h[3] @@ -2098,7 +2098,7 @@ define <8 x i32> @test_fptoui_i32(<8 x bfloat> %a) #0 { define <8 x i64> @test_fptoui_i64(<8 x bfloat> %a) #0 { ; CHECK-LABEL: test_fptoui_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: mov h4, v0.h[2] ; CHECK-NEXT: mov h3, v0.h[1] ; CHECK-NEXT: mov h7, v0.h[3] diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll index 222d7435ff742..9e13201692db6 100644 --- a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll 
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll @@ -101,8 +101,7 @@ entry: define <4 x bfloat> @test_vget_high_bf16(<8 x bfloat> %a) nounwind { ; CHECK-LABEL: test_vget_high_bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll index 70ddde74aafbc..c0d1116230067 100644 --- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll +++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll @@ -2811,8 +2811,8 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) { define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: clmul_v16i16_neon_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: rbit v4.8b, v1.8b ; CHECK-NEXT: rbit v5.8b, v0.8b ; CHECK-NEXT: pmul v0.8b, v0.8b, v1.8b @@ -2928,8 +2928,8 @@ define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) { ; CHECK-NEXT: .cfi_offset b10, -24 ; CHECK-NEXT: .cfi_offset b11, -32 ; CHECK-NEXT: .cfi_offset b12, -48 -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: rev16 v5.8b, v1.8b ; CHECK-NEXT: rev16 v6.8b, v0.8b ; CHECK-NEXT: movi v4.2d, #0000000000000000 @@ -5456,8 +5456,8 @@ define <2 x i128> @clmul_v2i128_neon_zext(<2 x i64> %x, <2 x i64> %y) { ; CHECK-AES-NEXT: fmov x11, d1 ; CHECK-AES-NEXT: mov x8, v0.d[1] ; CHECK-AES-NEXT: mov x9, v1.d[1] -; CHECK-AES-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-AES-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-AES-NEXT: mov d2, v1.d[1] +; CHECK-AES-NEXT: mov d3, v0.d[1] ; CHECK-AES-NEXT: 
pmull v0.1q, v0.1d, v1.1d ; CHECK-AES-NEXT: rbit x11, x11 ; CHECK-AES-NEXT: rbit x10, x10 diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll index d4244fedf7504..b0026f79403d6 100644 --- a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll +++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll @@ -347,7 +347,7 @@ define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { ; SVE-LABEL: test_masked_store_success_v16i32: ; SVE: // %bb.0: -; SVE-NEXT: ext v5.16b, v4.16b, v4.16b, #8 +; SVE-NEXT: mov d5, v4.d[1] ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b ; SVE-NEXT: mov x8, #4 // =0x4 @@ -751,7 +751,7 @@ define void @test_masked_store_success_invert_mask_v8i32(<8 x i32> %x, ptr %ptr, define void @test_masked_store_success_invert_mask_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { ; SVE-LABEL: test_masked_store_success_invert_mask_v16i32: ; SVE: // %bb.0: -; SVE-NEXT: ext v5.16b, v4.16b, v4.16b, #8 +; SVE-NEXT: mov d5, v4.d[1] ; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 ; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b ; SVE-NEXT: mov x8, #4 // =0x4 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll index 1ed9cf2db24f7..7e0b46107ceea 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -44,9 +44,9 @@ define <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fsub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: fsub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d3, v2.d[1] +; CHECK-NEXT: mov d4, 
v0.d[1] +; CHECK-NEXT: mov d5, v1.d[1] ; CHECK-NEXT: zip2 v0.2s, v0.2s, v4.2s ; CHECK-NEXT: zip2 v4.2s, v2.2s, v3.2s ; CHECK-NEXT: zip1 v1.2s, v1.2s, v5.2s @@ -79,13 +79,13 @@ entry: define <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_mul270_mul: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d3, v2.d[1] +; CHECK-NEXT: mov d4, v1.d[1] ; CHECK-NEXT: zip1 v5.2s, v2.2s, v3.2s ; CHECK-NEXT: zip1 v6.2s, v1.2s, v4.2s ; CHECK-NEXT: zip2 v2.2s, v2.2s, v3.2s ; CHECK-NEXT: zip2 v1.2s, v1.2s, v4.2s -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d3, v0.d[1] ; CHECK-NEXT: fmul v7.2s, v6.2s, v5.2s ; CHECK-NEXT: fneg v4.2s, v7.2s ; CHECK-NEXT: zip2 v7.2s, v0.2s, v3.2s @@ -258,12 +258,12 @@ entry: define <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_triangle_addmul: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d3, v0.d[1] +; CHECK-NEXT: mov d4, v1.d[1] ; CHECK-NEXT: zip1 v5.2s, v0.2s, v3.2s ; CHECK-NEXT: zip1 v6.2s, v1.2s, v4.2s ; CHECK-NEXT: zip2 v1.2s, v1.2s, v4.2s -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d4, v2.d[1] ; CHECK-NEXT: zip2 v0.2s, v0.2s, v3.2s ; CHECK-NEXT: fmul v7.2s, v6.2s, v5.2s ; CHECK-NEXT: fmul v5.2s, v1.2s, v5.2s @@ -311,8 +311,8 @@ entry: define <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) { ; CHECK-LABEL: mul_triangle_multiuses: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s ; CHECK-NEXT: zip1 v5.2s, v1.2s, v3.2s ; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s @@ -442,13 +442,13 @@ entry: define <4 x float> @mul_divequal(<4 x float> 
%a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_divequal: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d3, v0.d[1] +; CHECK-NEXT: mov d4, v1.d[1] ; CHECK-NEXT: zip2 v5.2s, v0.2s, v3.2s ; CHECK-NEXT: zip2 v6.2s, v1.2s, v4.2s ; CHECK-NEXT: zip1 v0.2s, v0.2s, v3.2s ; CHECK-NEXT: zip1 v1.2s, v1.2s, v4.2s -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: fmul v7.2s, v5.2s, v6.2s ; CHECK-NEXT: fneg v4.2s, v7.2s ; CHECK-NEXT: zip1 v7.2s, v2.2s, v3.2s diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll index 039025dafa0d6..8cb1ac4ae548a 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll @@ -46,8 +46,8 @@ entry: define <4 x float> @mul_triangle_external_use(<4 x float> %a, <4 x float> %b, ptr %p) { ; CHECK-LABEL: mul_triangle_external_use: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: zip2 v4.2s, v0.2s, v2.2s ; CHECK-NEXT: zip1 v5.2s, v1.2s, v3.2s ; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s @@ -94,9 +94,9 @@ entry: define <4 x float> @multiple_muls_shuffle_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) { ; CHECK-LABEL: multiple_muls_shuffle_external: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v6.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d5, v0.d[1] +; CHECK-NEXT: mov d6, v1.d[1] +; CHECK-NEXT: mov d4, v2.d[1] ; CHECK-NEXT: zip2 v7.2s, v0.2s, v5.2s ; CHECK-NEXT: zip1 v16.2s, v1.2s, v6.2s ; CHECK-NEXT: zip2 v1.2s, v1.2s, v6.2s @@ -168,7 +168,7 @@ define <4 x float> 
@multiple_muls_shuffle_external_with_loads(ptr %ptr_a, ptr %p ; CHECK-NEXT: fmla v4.2s, v0.2s, v2.2s ; CHECK-NEXT: str d4, [x4] ; CHECK-NEXT: ldr q5, [x2] -; CHECK-NEXT: ext v7.16b, v5.16b, v5.16b, #8 +; CHECK-NEXT: mov d7, v5.d[1] ; CHECK-NEXT: zip1 v0.2s, v5.2s, v7.2s ; CHECK-NEXT: zip2 v1.2s, v5.2s, v7.2s ; CHECK-NEXT: fmul v3.2s, v0.2s, v6.2s @@ -227,10 +227,10 @@ entry: define <4 x float> @multiple_muls_mul_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) { ; CHECK-LABEL: multiple_muls_mul_external: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v16.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: mov d4, v0.d[1] +; CHECK-NEXT: mov d5, v1.d[1] +; CHECK-NEXT: mov d16, v2.d[1] +; CHECK-NEXT: mov d17, v3.d[1] ; CHECK-NEXT: zip2 v6.2s, v0.2s, v4.2s ; CHECK-NEXT: zip2 v7.2s, v1.2s, v5.2s ; CHECK-NEXT: zip1 v19.2s, v2.2s, v16.2s diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 13434fabefa78..09c2e481b0433 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -31,8 +31,8 @@ entry: define <4 x float> @simple_mul_no_contract(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_mul_no_contract: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s ; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s @@ -149,8 +149,8 @@ entry: define <4 x float> @add_external_use(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: add_external_use: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; 
CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s ; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index bbeea72d9055f..385ec6710185b 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -338,7 +338,7 @@ define <8 x i16> @concat_high_low_v8i16(<8 x i16> %a_vec, <8 x i16> %b_vec) { ; ; CHECK-GI-LABEL: concat_high_low_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: @@ -351,7 +351,7 @@ entry: define <8 x i16> @concat_low_high_v8i16(<8 x i16> %a_vec, <8 x i16> %b_vec) { ; CHECK-LABEL: concat_low_high_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 5740eaa977875..a1b6f1b931808 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -343,10 +343,9 @@ define <3 x i64> @v3i64(<3 x i64> %d) { ; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b ; CHECK-SD-NEXT: uaddlp v0.2d, v0.4s ; CHECK-SD-NEXT: uaddlp v2.4s, v1.8h -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: uaddlp v2.2d, v2.4s ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index be62efca6cb6a..61a9d84ad2622 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ 
-311,9 +311,8 @@ define <3 x i64> @v3i64(<3 x i64> %d) { ; CHECK-SD-NEXT: uaddlp v0.4s, v0.8h ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: uaddlp v0.2d, v0.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i64: diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index a72ebf8a9064a..3199796ecd341 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -485,9 +485,8 @@ define <3 x i64> @v3i64(<3 x i64> %d) { ; CHECK-SD-NEXT: uaddlp v0.2d, v0.4s ; CHECK-SD-NEXT: uaddlp v2.2d, v2.4s ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i64: diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll index ac23d7d360fe5..6b681b20733b5 100644 --- a/llvm/test/CodeGen/AArch64/double_reduct.ll +++ b/llvm/test/CodeGen/AArch64/double_reduct.ll @@ -53,7 +53,7 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -62,8 +62,8 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-GI-LABEL: fmul_f32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: 
mov d3, v2.d[1] +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: mov s2, v0.s[1] @@ -82,7 +82,7 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) { ; CHECK-SD-LABEL: fmul_f32_same: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -90,8 +90,8 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) { ; ; CHECK-GI-LABEL: fmul_f32_same: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov s2, v0.s[1] @@ -378,7 +378,7 @@ define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-SD-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: fmov w0, s0 @@ -386,11 +386,11 @@ define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: mul_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v3.2s ; CHECK-GI-NEXT: mul v1.2s, v1.2s, v4.2s -; CHECK-GI-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-GI-NEXT: mov d3, v2.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mul v1.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -411,7 +411,7 @@ define i32 @mul_i32_same(<4 x i32> %a, <4 x i32> 
%b) { ; CHECK-SD-LABEL: mul_i32_same: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: fmov w0, s0 @@ -419,8 +419,8 @@ define i32 @mul_i32_same(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: mul_i32_same: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -442,7 +442,7 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -452,8 +452,8 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-GI-LABEL: and_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v2.16b, v2.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: and v1.8b, v2.8b, v1.8b ; CHECK-GI-NEXT: and v0.8b, v0.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v1.s[1] @@ -474,7 +474,7 @@ define i32 @and_i32_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-SD-LABEL: and_i32_same: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -483,8 +483,8 @@ define i32 @and_i32_same(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: and_i32_same: ; CHECK-GI: 
// %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -506,7 +506,7 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -516,8 +516,8 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-GI-LABEL: or_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v2.16b, v2.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: orr v1.8b, v2.8b, v1.8b ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v1.s[1] @@ -538,7 +538,7 @@ define i32 @or_i32_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-SD-LABEL: or_i32_same: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -547,8 +547,8 @@ define i32 @or_i32_same(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: or_i32_same: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -570,7 +570,7 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, 
v1.16b ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -580,8 +580,8 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-GI-LABEL: xor_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v2.16b, v2.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: eor v1.8b, v2.8b, v1.8b ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v1.s[1] @@ -602,7 +602,7 @@ define i32 @xor_i32_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-SD-LABEL: xor_i32_same: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -611,8 +611,8 @@ define i32 @xor_i32_same(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: xor_i32_same: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -905,7 +905,7 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-SD-NEXT: fmul s2, s2, s3 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: fmul s0, s2, s0 @@ -913,8 +913,8 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) ; ; CHECK-GI-LABEL: nested_mul_f32: ; 
CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d4, v0.d[1] +; CHECK-GI-NEXT: mov d5, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s ; CHECK-GI-NEXT: mov s4, v0.s[1] @@ -1056,7 +1056,7 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-SD-NEXT: mul w8, w0, w1 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: fmov w9, s0 @@ -1065,8 +1065,8 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; ; CHECK-GI-LABEL: nested_mul_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -1092,7 +1092,7 @@ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: and w8, w0, w1 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x9, d0 ; CHECK-SD-NEXT: lsr x10, x9, #32 @@ -1102,8 +1102,8 @@ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; ; CHECK-GI-LABEL: nested_and_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -1129,7 +1129,7 @@ define i32 @nested_or_i32(<4 x i32> %a, 
<4 x i32> %b, i32 %c, i32 %d) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: orr w8, w0, w1 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x9, d0 ; CHECK-SD-NEXT: lsr x10, x9, #32 @@ -1139,8 +1139,8 @@ define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; ; CHECK-GI-LABEL: nested_or_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] @@ -1166,7 +1166,7 @@ define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: eor w8, w0, w1 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x9, d0 ; CHECK-SD-NEXT: lsr x10, x9, #32 @@ -1176,8 +1176,8 @@ define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { ; ; CHECK-GI-LABEL: nested_xor_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 268d2144662fe..e275bd1fb9241 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -30,8 +30,7 @@ entry: define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) { ; CHECK-LABEL: i8_off8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, 
#8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> @@ -166,16 +165,10 @@ entry: } define <1 x i64> @i64_off1(<2 x i64> %arg1, <2 x i64> %arg2) { -; CHECK-SD-LABEL: i64_off1: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GISEL-LABEL: i64_off1: -; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: mov d0, v0.d[1] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: i64_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> ret <1 x i64> %shuffle @@ -217,8 +210,7 @@ entry: define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) { ; CHECK-LABEL: i8_zero_off8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> @@ -354,16 +346,10 @@ entry: } define <1 x i64> @i64_zero_off1(<2 x i64> %arg1) { -; CHECK-SD-LABEL: i64_zero_off1: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GISEL-LABEL: i64_zero_off1: -; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: mov d0, v0.d[1] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: i64_zero_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> ret <1 x i64> %shuffle diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll index 368103bf2f2fe..2779f535adc4b 100644 --- 
a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll +++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll @@ -44,15 +44,14 @@ define <2 x i32> @and_extract_zext_idx2(<4 x i16> %vec) nounwind { ; CHECK-SD-LABEL: and_extract_zext_idx2: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: and_extract_zext_idx2: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: movi d1, #0x00ffff0000ffff -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> @@ -65,15 +64,14 @@ define <4 x i16> @and_extract_sext_idx4(<8 x i8> %vec) nounwind { ; CHECK-SD-LABEL: and_extract_sext_idx4: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: and_extract_sext_idx4: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: ret %sext = sext <8 x i8> %vec to <8 x i16> @@ -148,14 +146,13 @@ define <2 x i32> @sext_extract_zext_idx2(<4 x i16> %vec) nounwind { ; CHECK-SD-LABEL: sext_extract_zext_idx2: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_extract_zext_idx2: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; 
CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-GI-NEXT: ret @@ -170,14 +167,13 @@ define <4 x i16> @sext_extract_sext_idx4(<8 x i8> %vec) nounwind { ; CHECK-SD-LABEL: sext_extract_sext_idx4: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_extract_sext_idx4: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 23bc6e0fb04ea..f05b37435f83c 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -765,7 +765,7 @@ define i32 @extract_v4i32_vector_insert(<4 x i32> %a, <2 x i32> %b, i32 %c) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 diff --git a/llvm/test/CodeGen/AArch64/f16f32dot-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/f16f32dot-fixed-length-fdot.ll index ab72173755a61..c7cfccb0e7446 100644 --- a/llvm/test/CodeGen/AArch64/f16f32dot-fixed-length-fdot.ll +++ b/llvm/test/CodeGen/AArch64/f16f32dot-fixed-length-fdot.ll @@ -52,7 +52,7 @@ define <2 x float> @fixed_fdot(<2 x float> %acc, <4 x half> %a, <4 x half> %b) { ; CHECK-EXPAND-NEXT: fcvtl v2.4s, v2.4h ; CHECK-EXPAND-NEXT: fmul v1.4s, v1.4s, v2.4s ; CHECK-EXPAND-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-EXPAND-NEXT: ext v1.16b, v1.16b, 
v1.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v1.d[1] ; CHECK-EXPAND-NEXT: fadd v0.2s, v1.2s, v0.2s ; CHECK-EXPAND-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll index 43e9007073634..ece7bdea5bc06 100644 --- a/llvm/test/CodeGen/AArch64/fabs.ll +++ b/llvm/test/CodeGen/AArch64/fabs.ll @@ -77,9 +77,8 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: fabs v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fabs v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fabs_v3f64: diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll index b15579199a059..51d5ff7e1b5de 100644 --- a/llvm/test/CodeGen/AArch64/faddsub.ll +++ b/llvm/test/CodeGen/AArch64/faddsub.ll @@ -79,9 +79,8 @@ define <3 x double> @fadd_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fadd v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fadd v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fadd_v3f64: @@ -420,9 +419,8 @@ define <3 x double> @fsub_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fsub v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fsub v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fsub_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll 
b/llvm/test/CodeGen/AArch64/fcmp.ll index 3e443a1cf9111..23cd9ff63fca3 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -711,8 +711,8 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> ; CHECK-SD-NEXT: str q1, [sp, #64] // 16-byte Spill ; CHECK-SD-NEXT: ldp q0, q1, [sp, #112] // 32-byte Folded Reload ; CHECK-SD-NEXT: bl __lttf2 -; CHECK-SD-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload ; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload ; CHECK-SD-NEXT: ldp q2, q4, [sp, #64] // 32-byte Folded Reload ; CHECK-SD-NEXT: cset w8, mi ; CHECK-SD-NEXT: sbfx x8, x8, #0, #1 @@ -721,10 +721,9 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> ; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-SD-NEXT: fmov d2, x8 ; CHECK-SD-NEXT: bsl v2.16b, v4.16b, v3.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: add sp, sp, #160 ; CHECK-SD-NEXT: ret ; @@ -829,9 +828,8 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double> ; CHECK-SD-NEXT: bsl v2.16b, v17.16b, v16.16b ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: bsl v0.16b, v6.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3f64_double: diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index 1c4b1395c7822..e7c60e0231766 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -91,9 +91,8 @@ define <3 x double> @copysign_v3f64(<3 x 
double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: bif v0.16b, v3.16b, v1.16b ; CHECK-SD-NEXT: bif v2.16b, v5.16b, v1.16b ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: copysign_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll index b408e9c1bd4e6..2532a7b6a9406 100644 --- a/llvm/test/CodeGen/AArch64/fcvt.ll +++ b/llvm/test/CodeGen/AArch64/fcvt.ll @@ -73,9 +73,8 @@ define <3 x double> @ceil_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frintp v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frintp v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: ceil_v3f64: @@ -372,9 +371,8 @@ define <3 x double> @floor_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frintm v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frintm v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: floor_v3f64: @@ -671,9 +669,8 @@ define <3 x double> @nearbyint_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frinti v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frinti v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: nearbyint_v3f64: @@ 
-970,9 +967,8 @@ define <3 x double> @roundeven_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frintn v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frintn v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: roundeven_v3f64: @@ -1269,9 +1265,8 @@ define <3 x double> @rint_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frintx v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frintx v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: rint_v3f64: @@ -1568,9 +1563,8 @@ define <3 x double> @round_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frinta v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frinta v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: round_v3f64: @@ -1867,9 +1861,8 @@ define <3 x double> @trunc_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: frintz v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: frintz v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: trunc_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index 5bdccccc62b99..cad3a5618a14d 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ 
b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -79,9 +79,8 @@ define <3 x double> @fdiv_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fdiv v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fdiv v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fdiv_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll index 7b8a080349998..8c48cc609c75a 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -48,7 +48,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { ; CHECK-SD-LABEL: vector_deinterleave_v2f32_v4f32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: zip1 v2.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: zip2 v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmov d0, d2 @@ -236,12 +236,10 @@ define {<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>} @vector_deinterleave4_v1i64_ ; CHECK-LABEL: vector_deinterleave4_v1i64_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov v2.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q3 ; CHECK-NEXT: ret %retval = call {<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>} @llvm.vector.deinterleave4.v4i64(<4 x i64> %vec) ret {<1 x i64>, <1 x i64>, <1 x i64>, <1 x 
i64>} %retval @@ -298,8 +296,8 @@ define {<8 x half>, <8 x half>, <8 x half>, <8 x half>} @vector_deinterleave4_v8 define {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @vector_deinterleave4_v2f32_v8f32(<8 x float> %vec) { ; CHECK-LABEL: vector_deinterleave4_v2f32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: mov d3, v0.d[1] ; CHECK-NEXT: uzp1 v4.2s, v1.2s, v2.2s ; CHECK-NEXT: uzp1 v5.2s, v0.2s, v3.2s ; CHECK-NEXT: uzp2 v6.2s, v1.2s, v2.2s @@ -333,12 +331,10 @@ define {<1 x double>, <1 x double>, <1 x double>, <1 x double>} @vector_deinterl ; CHECK-LABEL: vector_deinterleave4_v1f64_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov v2.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q3 ; CHECK-NEXT: ret %retval = call {<1 x double>, <1 x double>, <1 x double>, <1 x double>} @llvm.vector.deinterleave4.v4f64(<4 x double> %vec) ret {<1 x double>, <1 x double>, <1 x double>, <1 x double>} %retval diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index a24ec47ace71c..4ac0276aabfec 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -460,15 +460,12 @@ define <6 x double> @interleave3_v6f64(<2 x double> %vec0, <2 x double> %vec1, < ; CHECK-NEXT: st3 { v0.2d, v1.2d, v2.2d }, [x8] ; CHECK-NEXT: ldp q0, q2, [sp] ; CHECK-NEXT: ldr q4, [sp, #32] -; CHECK-NEXT: ext v5.16b, v4.16b, v4.16b, #8 +; CHECK-NEXT: mov d5, v4.d[1] ; CHECK-NEXT: // kill: def $d4 killed $d4 killed $q4 -; CHECK-NEXT: ext 
v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NEXT: // kill: def $d5 killed $d5 killed $q5 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q3 ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %retval = call <6 x double> @llvm.vector.interleave3.v6f64(<2 x double> %vec0, <2 x double> %vec1, <2 x double> %vec2) diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll index fb12f8acf1745..f11f5f350770e 100644 --- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll +++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll @@ -140,9 +140,8 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fmin v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fmin v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: min_v3f64: @@ -177,9 +176,8 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fmax v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fmax v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: max_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll index 64f0da8b4cd0f..6450ea312dad0 100644 --- a/llvm/test/CodeGen/AArch64/fminmax.ll +++ b/llvm/test/CodeGen/AArch64/fminmax.ll @@ -140,9 
+140,8 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fminnm v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fminnm v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: min_v3f64: @@ -177,9 +176,8 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fmaxnm v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fmaxnm v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: max_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index 807527ac13ad0..987e5675c17a5 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -85,9 +85,8 @@ define <3 x double> @fma_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> %c ; CHECK-SD-NEXT: fmla v6.2d, v3.2d, v0.2d ; CHECK-SD-NEXT: ldr d3, [sp] ; CHECK-SD-NEXT: fmla v3.2d, v5.2d, v2.2d +; CHECK-SD-NEXT: mov d1, v6.d[1] ; CHECK-SD-NEXT: fmov d0, d6 -; CHECK-SD-NEXT: ext v1.16b, v6.16b, v6.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: fmov d2, d3 ; CHECK-SD-NEXT: ret ; @@ -736,9 +735,8 @@ define <3 x double> @fmuladd_v3f64(<3 x double> %a, <3 x double> %b, <3 x double ; CHECK-SD-NEXT: fmla v6.2d, v3.2d, v0.2d ; CHECK-SD-NEXT: ldr d3, [sp] ; CHECK-SD-NEXT: fmla v3.2d, v5.2d, v2.2d +; CHECK-SD-NEXT: mov d1, v6.d[1] ; CHECK-SD-NEXT: fmov d0, d6 -; CHECK-SD-NEXT: ext v1.16b, v6.16b, v6.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: fmov d2, d3 
; CHECK-SD-NEXT: ret ; @@ -1181,9 +1179,8 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b, <3 x double> % ; CHECK-SD-NEXT: fmla v6.2d, v3.2d, v0.2d ; CHECK-SD-NEXT: ldr d3, [sp] ; CHECK-SD-NEXT: fmla v3.2d, v5.2d, v2.2d +; CHECK-SD-NEXT: mov d1, v6.d[1] ; CHECK-SD-NEXT: fmov d0, d6 -; CHECK-SD-NEXT: ext v1.16b, v6.16b, v6.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: fmov d2, d3 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll index bd3d1353e643e..f5eea377a0707 100644 --- a/llvm/test/CodeGen/AArch64/fmul.ll +++ b/llvm/test/CodeGen/AArch64/fmul.ll @@ -79,9 +79,8 @@ define <3 x double> @fmul_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-SD-NEXT: fmul v2.2d, v2.2d, v5.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fmul_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll index de2671afe60ab..39a83e52a44c3 100644 --- a/llvm/test/CodeGen/AArch64/fneg.ll +++ b/llvm/test/CodeGen/AArch64/fneg.ll @@ -77,9 +77,8 @@ define <3 x double> @fabs_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: fneg v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fneg v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fabs_v3f64: diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index 5353920ed5667..a3a09839c54c4 100644 --- 
a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -137,6 +137,7 @@ ; CHECK-FP16-GI-NEXT: warning: Instruction selection used fallback path for min_v8f16 ; CHECK-FP16-GI-NEXT: warning: Instruction selection used fallback path for min_v9f16 ; CHECK-FP16-GI-NEXT: warning: Instruction selection used fallback path for min_v16f16 +; ;;;;;;;;;;;;;;;; max_f64 define double @max_nnan_f64(double %a, double %b) { @@ -173,9 +174,8 @@ define <3 x double> @max_nnan_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-NEXT: fmaxnm v2.2d, v2.2d, v5.2d ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-NEXT: fmaxnm v0.2d, v0.2d, v3.2d -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: ret entry: %c = call nnan <3 x double> @llvm.maximumnum.v3f64(<3 x double> %a, <3 x double> %b) @@ -710,9 +710,8 @@ define <3 x double> @min_nnan_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-NEXT: fminnm v2.2d, v2.2d, v5.2d ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-NEXT: fminnm v0.2d, v0.2d, v3.2d -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: ret entry: %c = call nnan <3 x double> @llvm.minimumnum.v3f64(<3 x double> %a, <3 x double> %b) @@ -1316,9 +1315,8 @@ define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-NEXT: fmaxnm v0.2d, v0.2d, v1.2d ; CHECK-NEXT: fminnm v1.2d, v5.2d, v5.2d ; CHECK-NEXT: fmaxnm v2.2d, v2.2d, v1.2d -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-NEXT: ret entry: @@ 
-1960,9 +1958,8 @@ define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) { ; CHECK-NEXT: fminnm v0.2d, v0.2d, v1.2d ; CHECK-NEXT: fminnm v1.2d, v5.2d, v5.2d ; CHECK-NEXT: fminnm v2.2d, v2.2d, v1.2d -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll index 05422d3cc6051..cbcdb8596da6e 100644 --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -287,8 +287,7 @@ entry: define <4 x half> @get_high(<8 x half> %a) #0 { ; CHECK-LABEL: get_high: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll index 7085bfdc52f40..c0b883c806331 100644 --- a/llvm/test/CodeGen/AArch64/fpext.ll +++ b/llvm/test/CodeGen/AArch64/fpext.ll @@ -75,9 +75,8 @@ define <3 x double> @fpext_v3f32_v3f64(<3 x float> %a) { ; CHECK-SD-NEXT: fcvtl v3.2d, v0.2s ; CHECK-SD-NEXT: fcvtl2 v2.2d, v0.4s ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-SD-NEXT: mov d1, v3.d[1] ; CHECK-SD-NEXT: fmov d0, d3 -; CHECK-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fpext_v3f32_v3f64: @@ -174,7 +173,7 @@ define <4 x fp128> @fpext_v4f32_v4f128(<4 x float> %a) { ; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 ; CHECK-SD-NEXT: .cfi_offset w30, -16 ; CHECK-SD-NEXT: str q0, [sp, #48] // 16-byte Spill -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; 
CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Spill ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: bl __extendsftf2 @@ -332,26 +331,15 @@ entry: } define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) { -; CHECK-SD-LABEL: fpext_v3f16_v3f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtl v1.4s, v0.4h -; CHECK-SD-NEXT: fcvtl v0.2d, v1.2s -; CHECK-SD-NEXT: fcvtl2 v2.2d, v1.4s -; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fpext_v3f16_v3f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s -; CHECK-GI-NEXT: fcvtl2 v2.2d, v1.4s -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fpext_v3f16_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v1.4s, v0.4h +; CHECK-NEXT: fcvtl v0.2d, v1.2s +; CHECK-NEXT: fcvtl2 v2.2d, v1.4s +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = fpext <3 x half> %a to <3 x double> ret <3 x double> %c diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index caffdd548e710..c570f20268825 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -1363,118 +1363,36 @@ entry: } define <3 x i64> @fptos_v3f64_v3i64(<3 x double> %a) { -; CHECK-NOFP16-SD-LABEL: fptos_v3f64_v3i64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 def 
$q2 -; CHECK-NOFP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-SD-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: fptos_v3f64_v3i64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-SD-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: fptos_v3f64_v3i64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-GI-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: fptos_v3f64_v3i64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; 
CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: fptos_v3f64_v3i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fcvtzs v2.2d, v2.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = fptosi <3 x double> %a to <3 x i64> ret <3 x i64> %c } define <3 x i64> @fptou_v3f64_v3i64(<3 x double> %a) { -; CHECK-NOFP16-SD-LABEL: fptou_v3f64_v3i64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-SD-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: fptou_v3f64_v3i64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-SD-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: fcvtzu 
v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: fptou_v3f64_v3i64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-GI-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: fptou_v3f64_v3i64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: fptou_v3f64_v3i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fcvtzu v2.2d, v2.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = fptoui <3 x double> %a to <3 x i64> ret <3 x i64> %c @@ -3270,9 +3188,8 @@ define <3 x i64> 
@fptos_v3f32_v3i64(<3 x float> %a) { ; CHECK-NOFP16-SD-NEXT: fcvtzs v3.2d, v1.2d ; CHECK-NOFP16-SD-NEXT: fcvtzs v2.2d, v0.2d ; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NOFP16-SD-NEXT: mov d1, v3.d[1] ; CHECK-NOFP16-SD-NEXT: fmov d0, d3 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NOFP16-SD-NEXT: ret ; ; CHECK-FP16-SD-LABEL: fptos_v3f32_v3i64: @@ -3282,9 +3199,8 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) { ; CHECK-FP16-SD-NEXT: fcvtzs v3.2d, v1.2d ; CHECK-FP16-SD-NEXT: fcvtzs v2.2d, v0.2d ; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-FP16-SD-NEXT: mov d1, v3.d[1] ; CHECK-FP16-SD-NEXT: fmov d0, d3 -; CHECK-FP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: fptos_v3f32_v3i64: @@ -3323,9 +3239,8 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) { ; CHECK-NOFP16-SD-NEXT: fcvtzu v3.2d, v1.2d ; CHECK-NOFP16-SD-NEXT: fcvtzu v2.2d, v0.2d ; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NOFP16-SD-NEXT: mov d1, v3.d[1] ; CHECK-NOFP16-SD-NEXT: fmov d0, d3 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NOFP16-SD-NEXT: ret ; ; CHECK-FP16-SD-LABEL: fptou_v3f32_v3i64: @@ -3335,9 +3250,8 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) { ; CHECK-FP16-SD-NEXT: fcvtzu v3.2d, v1.2d ; CHECK-FP16-SD-NEXT: fcvtzu v2.2d, v0.2d ; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-FP16-SD-NEXT: mov d1, v3.d[1] ; CHECK-FP16-SD-NEXT: fmov d0, d3 -; CHECK-FP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: fptou_v3f32_v3i64: @@ -4897,7 +4811,7 @@ entry: define <8 x i64> @fptos_v8f16_v8i64(<8 x half> %a) { 
; CHECK-NOFP16-SD-LABEL: fptos_v8f16_v8i64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: mov h4, v0.h[2] ; CHECK-NOFP16-SD-NEXT: mov h3, v0.h[1] ; CHECK-NOFP16-SD-NEXT: mov h7, v0.h[3] @@ -4932,7 +4846,7 @@ define <8 x i64> @fptos_v8f16_v8i64(<8 x half> %a) { ; ; CHECK-FP16-SD-LABEL: fptos_v8f16_v8i64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: mov h4, v0.h[2] ; CHECK-FP16-SD-NEXT: mov h3, v0.h[1] ; CHECK-FP16-SD-NEXT: mov h7, v0.h[3] @@ -4992,7 +4906,7 @@ entry: define <8 x i64> @fptou_v8f16_v8i64(<8 x half> %a) { ; CHECK-NOFP16-SD-LABEL: fptou_v8f16_v8i64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: mov h4, v0.h[2] ; CHECK-NOFP16-SD-NEXT: mov h3, v0.h[1] ; CHECK-NOFP16-SD-NEXT: mov h7, v0.h[3] @@ -5027,7 +4941,7 @@ define <8 x i64> @fptou_v8f16_v8i64(<8 x half> %a) { ; ; CHECK-FP16-SD-LABEL: fptou_v8f16_v8i64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: mov h4, v0.h[2] ; CHECK-FP16-SD-NEXT: mov h3, v0.h[1] ; CHECK-FP16-SD-NEXT: mov h7, v0.h[3] @@ -5087,8 +5001,8 @@ entry: define <16 x i64> @fptos_v16f16_v16i64(<16 x half> %a) { ; CHECK-NOFP16-SD-LABEL: fptos_v16f16_v16i64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-NOFP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-NOFP16-SD-NEXT: mov h4, v0.h[1] ; CHECK-NOFP16-SD-NEXT: fcvt s5, h0 ; CHECK-NOFP16-SD-NEXT: mov h18, v0.h[2] @@ -5153,8 +5067,8 @@ define <16 x i64> @fptos_v16f16_v16i64(<16 x half> %a) { ; ; CHECK-FP16-SD-LABEL: 
fptos_v16f16_v16i64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-FP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-FP16-SD-NEXT: mov h4, v0.h[1] ; CHECK-FP16-SD-NEXT: mov h5, v0.h[2] ; CHECK-FP16-SD-NEXT: fcvtzs x8, h0 @@ -5256,8 +5170,8 @@ entry: define <16 x i64> @fptou_v16f16_v16i64(<16 x half> %a) { ; CHECK-NOFP16-SD-LABEL: fptou_v16f16_v16i64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-NOFP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-NOFP16-SD-NEXT: mov h4, v0.h[1] ; CHECK-NOFP16-SD-NEXT: fcvt s5, h0 ; CHECK-NOFP16-SD-NEXT: mov h18, v0.h[2] @@ -5322,8 +5236,8 @@ define <16 x i64> @fptou_v16f16_v16i64(<16 x half> %a) { ; ; CHECK-FP16-SD-LABEL: fptou_v16f16_v16i64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-FP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-FP16-SD-NEXT: mov h4, v0.h[1] ; CHECK-FP16-SD-NEXT: mov h5, v0.h[2] ; CHECK-FP16-SD-NEXT: fcvtzu x8, h0 diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 6a06d99689df9..145aef9123a4e 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -1888,7 +1888,7 @@ define <4 x i32> @test_signed_v4f32_v4i32_duplicate(<4 x float> %f) { define <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i50: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff ; CHECK-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 ; CHECK-NEXT: fcvtzs x12, s0 @@ 
-1985,7 +1985,7 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-SD-NEXT: bl __fixsfti ; CHECK-SD-NEXT: fcmp s8, s9 ; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Reload -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: csel x8, xzr, x0, lt ; CHECK-SD-NEXT: csel x9, x25, x1, lt ; CHECK-SD-NEXT: fcmp s8, s10 @@ -2182,7 +2182,7 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) { ; CHECK-SD-NEXT: bl __fixsfti ; CHECK-SD-NEXT: fcmp s8, s9 ; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Reload -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: csel x8, xzr, x0, lt ; CHECK-SD-NEXT: csel x9, x25, x1, lt ; CHECK-SD-NEXT: fcmp s8, s10 @@ -3636,7 +3636,7 @@ define <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %f) { define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-CVT-LABEL: test_signed_v8f16_v8i50: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-CVT-NEXT: mov d1, v0.d[1] ; CHECK-CVT-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff ; CHECK-CVT-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 ; CHECK-CVT-NEXT: mov h2, v1.h[1] @@ -3697,7 +3697,7 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; ; CHECK-FP16-LABEL: test_signed_v8f16_v8i50: ; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-NEXT: mov d1, v0.d[1] ; CHECK-FP16-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff ; CHECK-FP16-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 ; CHECK-FP16-NEXT: mov h2, v1.h[1] @@ -3754,7 +3754,7 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { define <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-SD-CVT-LABEL: test_signed_v8f16_v8i64: ; CHECK-SD-CVT: // %bb.0: -; CHECK-SD-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-CVT-NEXT: mov d1, v0.d[1] ; CHECK-SD-CVT-NEXT: mov h4, v0.h[2] ; CHECK-SD-CVT-NEXT: mov h3, 
v0.h[1] ; CHECK-SD-CVT-NEXT: mov h7, v0.h[3] @@ -3789,7 +3789,7 @@ define <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; ; CHECK-SD-FP16-LABEL: test_signed_v8f16_v8i64: ; CHECK-SD-FP16: // %bb.0: -; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-FP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-FP16-NEXT: mov h4, v0.h[2] ; CHECK-SD-FP16-NEXT: mov h3, v0.h[1] ; CHECK-SD-FP16-NEXT: mov h7, v0.h[3] @@ -3860,7 +3860,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: .cfi_offset b9, -112 ; CHECK-NEXT: .cfi_offset b10, -128 ; CHECK-NEXT: str q0, [sp, #48] // 16-byte Spill -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: mov x19, x8 ; CHECK-NEXT: str q0, [sp, #32] // 16-byte Spill ; CHECK-NEXT: mov h0, v0.h[1] @@ -3942,8 +3942,8 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x20, xzr, x8, vs -; CHECK-NEXT: csel x21, xzr, x9, vs +; CHECK-NEXT: csel x21, xzr, x8, vs +; CHECK-NEXT: csel x28, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 @@ -3955,7 +3955,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x28, xzr, x8, vs +; CHECK-NEXT: csel x20, xzr, x8, vs ; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti @@ -3974,12 +3974,12 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr x9, [sp] // 8-byte Reload -; CHECK-NEXT: extr x8, x24, x28, #28 +; CHECK-NEXT: extr x8, x24, x20, #28 ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: bfi x25, x21, #36, #28 -; CHECK-NEXT: lsr x11, x20, #28 +; CHECK-NEXT: bfi x25, x28, #36, #28 +; CHECK-NEXT: lsr x11, x21, #28 ; CHECK-NEXT: stur x9, [x19, #75] -; 
CHECK-NEXT: extr x9, x20, x21, #28 +; CHECK-NEXT: extr x9, x21, x28, #28 ; CHECK-NEXT: stur x8, [x19, #41] ; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: str x9, [x19, #16] @@ -3998,7 +3998,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: ldr x13, [sp, #24] // 8-byte Reload ; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: bfi x8, x28, #36, #28 +; CHECK-NEXT: bfi x8, x20, #36, #28 ; CHECK-NEXT: extr x10, x14, x12, #28 ; CHECK-NEXT: bfi x27, x12, #36, #28 ; CHECK-NEXT: ldr x12, [sp, #72] // 8-byte Reload @@ -4057,7 +4057,7 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-SD-NEXT: .cfi_offset b9, -112 ; CHECK-SD-NEXT: .cfi_offset b10, -128 ; CHECK-SD-NEXT: str q0, [sp, #48] // 16-byte Spill -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: mov x19, x8 ; CHECK-SD-NEXT: fcvt s8, h0 ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Spill diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index d4feab2ea5d9b..083a84292029d 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -1550,7 +1550,7 @@ define <4 x i32> @test_unsigned_v4f32_v4i32_duplicate(<4 x float> %f) { define <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) { ; CHECK-LABEL: test_unsigned_v4f32_v4i50: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: mov s3, v0.s[1] ; CHECK-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-NEXT: fcvtzu x11, s0 @@ -1629,7 +1629,7 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-SD-NEXT: bl __fixunssfti ; CHECK-SD-NEXT: fcmp s8, #0.0 ; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Reload -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: csel x8, xzr, x0, lt ; CHECK-SD-NEXT: csel x9, 
xzr, x1, lt ; CHECK-SD-NEXT: fcmp s8, s9 @@ -1784,7 +1784,7 @@ define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) { ; CHECK-SD-NEXT: bl __fixunssfti ; CHECK-SD-NEXT: fcmp s8, #0.0 ; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Reload -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: csel x8, xzr, x1, lt ; CHECK-SD-NEXT: csel x9, xzr, x0, lt ; CHECK-SD-NEXT: fcmp s8, s9 @@ -2939,7 +2939,7 @@ define <8 x i32> @test_unsigned_v8f16_v8i32_duplicate(<8 x half> %f) { define <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i50: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-CVT-NEXT: mov d1, v0.d[1] ; CHECK-CVT-NEXT: mov h5, v0.h[1] ; CHECK-CVT-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-CVT-NEXT: mov h6, v0.h[2] @@ -2983,7 +2983,7 @@ define <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { ; ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i50: ; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-NEXT: mov d1, v0.d[1] ; CHECK-FP16-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-FP16-NEXT: fcvtzu x13, h0 ; CHECK-FP16-NEXT: mov h2, v1.h[1] @@ -3023,7 +3023,7 @@ define <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) { define <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-SD-CVT-LABEL: test_unsigned_v8f16_v8i64: ; CHECK-SD-CVT: // %bb.0: -; CHECK-SD-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-CVT-NEXT: mov d1, v0.d[1] ; CHECK-SD-CVT-NEXT: mov h4, v0.h[2] ; CHECK-SD-CVT-NEXT: mov h3, v0.h[1] ; CHECK-SD-CVT-NEXT: mov h7, v0.h[3] @@ -3058,7 +3058,7 @@ define <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; ; CHECK-SD-FP16-LABEL: test_unsigned_v8f16_v8i64: ; CHECK-SD-FP16: // %bb.0: -; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-FP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-FP16-NEXT: mov h4, v0.h[2] ; CHECK-SD-FP16-NEXT: mov h3, v0.h[1] ; 
CHECK-SD-FP16-NEXT: mov h7, v0.h[3] @@ -3127,7 +3127,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: .cfi_offset b8, -104 ; CHECK-NEXT: .cfi_offset b9, -112 ; CHECK-NEXT: str q0, [sp, #48] // 16-byte Spill -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: mov x19, x8 ; CHECK-NEXT: str q0, [sp, #32] // 16-byte Spill ; CHECK-NEXT: mov h0, v0.h[1] @@ -3291,7 +3291,7 @@ define <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) { ; CHECK-SD-NEXT: .cfi_offset b8, -104 ; CHECK-SD-NEXT: .cfi_offset b9, -112 ; CHECK-SD-NEXT: str q0, [sp, #48] // 16-byte Spill -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d0, v0.d[1] ; CHECK-SD-NEXT: mov x19, x8 ; CHECK-SD-NEXT: fcvt s8, h0 ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Spill diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll index 136ac8b0a2aa1..3476dc0c81573 100644 --- a/llvm/test/CodeGen/AArch64/freeze.ll +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -242,9 +242,8 @@ define <3 x i64> @freeze_v3i64() { ; CHECK-SD-LABEL: freeze_v3i64: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmov d2, d0 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: freeze_v3i64: @@ -296,10 +295,9 @@ define <3 x ptr> @freeze_v3p0() { ; CHECK-SD-NEXT: mov w8, #4 // =0x4 ; CHECK-SD-NEXT: dup v2.2d, x8 ; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: add d2, d0, d2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: freeze_v3p0: diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll index 6c5fd8e52b017..7ba763727a3c8 100644 --- 
a/llvm/test/CodeGen/AArch64/fsqrt.ll +++ b/llvm/test/CodeGen/AArch64/fsqrt.ll @@ -73,9 +73,8 @@ define <3 x double> @sqrt_v3f64(<3 x double> %a) { ; CHECK-SD-NEXT: fsqrt v2.2d, v2.2d ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: fsqrt v0.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sqrt_v3f64: diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll index f3dd4975336b5..df5801b9af938 100644 --- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll +++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll @@ -94,9 +94,9 @@ define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 { ; CHECK-BE-NEXT: rev64 v1.8h, v1.8h ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: mov d0, v0.d[1] ; CHECK-BE-NEXT: rev64 v0.4h, v0.4h -; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: mov d1, v1.d[1] ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -121,8 +121,8 @@ define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: mov d1, v1.d[1] +; CHECK-BE-NEXT: mov d0, v0.d[1] ; CHECK-BE-NEXT: rev16 v1.8b, v1.8b ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s @@ -140,8 +140,8 @@ entry: define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i16> %b) #0 { ; CHECK-LE-LABEL: 
test_smull_high_s16_bitcasta1_wrongindex: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-LE-NEXT: mov d2, v0.d[1] +; CHECK-LE-NEXT: mov d1, v1.d[1] ; CHECK-LE-NEXT: ext v0.8b, v0.8b, v2.8b, #4 ; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-LE-NEXT: ret @@ -153,7 +153,7 @@ define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: mov d1, v1.d[1] ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -162,7 +162,7 @@ define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i ; CHECK-GI-LABEL: test_smull_high_s16_bitcasta1_wrongindex: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret entry: @@ -176,8 +176,8 @@ entry: define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i8> %bb) #0 { ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1_wrongindex: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-LE-NEXT: mov d2, v1.d[1] +; CHECK-LE-NEXT: mov d0, v0.d[1] ; CHECK-LE-NEXT: ext v1.8b, v1.8b, v2.8b, #6 ; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-LE-NEXT: ret @@ -188,7 +188,7 @@ define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i ; CHECK-BE-NEXT: rev64 v1.8h, v1.8h ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: mov d0, 
v0.d[1] ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #6 ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s @@ -197,7 +197,7 @@ define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i ; ; CHECK-GI-LABEL: test_smull_high_s16_bitcastb1_wrongindex: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #6 ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret @@ -213,7 +213,7 @@ define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i1 ; CHECK-LABEL: test_smull_high_s16_bitcasta2_wrongindex: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret ; @@ -224,7 +224,7 @@ define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i1 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: mov d1, v1.d[1] ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s @@ -241,7 +241,7 @@ entry: define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2_wrongindex: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-LE-NEXT: mov d0, v0.d[1] ; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #4 ; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-LE-NEXT: ret @@ -253,7 +253,7 @@ define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #4 -; 
CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: mov d0, v0.d[1] ; CHECK-BE-NEXT: rev16 v1.8b, v1.8b ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s @@ -262,7 +262,7 @@ define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i ; ; CHECK-GI-LABEL: test_smull_high_s16_bitcastb2_wrongindex: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #4 ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: ret @@ -294,7 +294,7 @@ define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 { ; ; CHECK-GI-LABEL: test_smull_high_s16_splata1: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: smull v0.4s, v1.4h, v0.h[3] ; CHECK-GI-NEXT: ret entry: @@ -324,7 +324,7 @@ define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 { ; ; CHECK-GI-LABEL: test_smull_high_s16_splatb1: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[3] ; CHECK-GI-NEXT: ret entry: @@ -339,7 +339,7 @@ define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_smull_high_s16_splata2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: dup v0.2s, v0.s[3] -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret ; @@ -350,7 +350,7 @@ define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-BE-NEXT: dup v0.2s, v0.s[3] -; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: mov d1, v1.d[1] ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; 
CHECK-BE-NEXT: rev64 v0.4s, v0.4s @@ -367,7 +367,7 @@ entry: define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-LABEL: test_smull_high_s16_splatb2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: dup v1.8b, v1.b[3] ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: ret @@ -379,7 +379,7 @@ define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: dup v1.8b, v1.b[3] -; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: mov d0, v0.d[1] ; CHECK-BE-NEXT: rev16 v1.8b, v1.8b ; CHECK-BE-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s @@ -611,7 +611,7 @@ define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) { ; ; CHECK-GI-LABEL: hadd32_zext_asr: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1 ; CHECK-GI-NEXT: ret %src1 = bitcast <16 x i8> %src1a to <4 x i32> @@ -639,7 +639,7 @@ define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 { ; ; CHECK-GI-LABEL: test_umull_high_s16_splata1: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d1, v1.d[1] ; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.s[1] ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 3aef1e52b2a2a..3c49374d81f40 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1142,9 +1142,8 @@ define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> ; CHECK-SD-NEXT: bsl v0.16b, v6.16b, v1.16b ; CHECK-SD-NEXT: cmgt v1.2d, v5.2d, v2.2d ; CHECK-SD-NEXT: mov v2.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: 
// kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: bsl v2.16b, v17.16b, v16.16b ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll index 88b6ea4f0cb19..1aa8f861964cd 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll @@ -340,7 +340,7 @@ define <8 x i8> @load_v8i8_4_2(float %tmp, <8 x i8> %b, ptr %a) { define <16 x i8> @load_v16i8_8_1(float %tmp, <16 x i8> %b, ptr %a) { ; CHECK-LABEL: load_v16i8_8_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret @@ -557,7 +557,7 @@ define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) { define <8 x i16> @load_v8i16_4_1(float %tmp, <8 x i16> %b, ptr %a) { ; CHECK-LABEL: load_v8i16_4_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret @@ -585,7 +585,7 @@ define <8 x i16> @load_v8i16_4_2(float %tmp, <8 x i16> %b, ptr %a) { define <4 x i32> @load_v4i32_2_1(float %tmp, <4 x i32> %b, ptr %a) { ; CHECK-LABEL: load_v4i32_2_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index facee0829611f..8b65000bd1a1e 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -60,9 +60,8 @@ define <3 x double> @insert_v3f64_0(<3 x double> %a, double %b, i32 %c) { ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 ; 
CHECK-SD-NEXT: mov v0.d[0], v3.d[0] -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: insert_v3f64_0: @@ -99,9 +98,8 @@ define <3 x double> @insert_v3f64_c(<3 x double> %a, double %b, i32 %c) { ; CHECK-SD-NEXT: str d3, [x8, x9, lsl #3] ; CHECK-SD-NEXT: ldr q0, [sp] ; CHECK-SD-NEXT: ldr d2, [sp, #16] -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: add sp, sp, #32 ; CHECK-SD-NEXT: ret ; @@ -1182,9 +1180,8 @@ define <3 x i64> @insert_v3i64_0(<3 x i64> %a, i64 %b, i32 %c) { ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: mov v0.d[0], x0 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: insert_v3i64_0: @@ -1221,9 +1218,8 @@ define <3 x i64> @insert_v3i64_c(<3 x i64> %a, i64 %b, i32 %c) { ; CHECK-SD-NEXT: str x0, [x8, x9, lsl #3] ; CHECK-SD-NEXT: ldr q0, [sp] ; CHECK-SD-NEXT: ldr d2, [sp, #16] -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: add sp, sp, #32 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 7a963cceb8119..c61b38d73c29d 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -1191,118 +1191,36 @@ entry: } define <3 x double> @stofp_v3i64_v3f64(<3 x i64> %a) { -; CHECK-NOFP16-SD-LABEL: stofp_v3i64_v3f64: -; 
CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-SD-NEXT: scvtf v2.2d, v2.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: stofp_v3i64_v3f64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-SD-NEXT: scvtf v2.2d, v2.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: stofp_v3i64_v3f64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-GI-NEXT: scvtf v2.2d, v2.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: stofp_v3i64_v3f64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // 
kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: scvtf v2.2d, v2.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: stofp_v3i64_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: scvtf v2.2d, v2.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = sitofp <3 x i64> %a to <3 x double> ret <3 x double> %c } define <3 x double> @utofp_v3i64_v3f64(<3 x i64> %a) { -; CHECK-NOFP16-SD-LABEL: utofp_v3i64_v3f64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-SD-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: utofp_v3i64_v3f64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-SD-NEXT: // kill: def $d2 
killed $d2 def $q2 -; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-SD-NEXT: ucvtf v2.2d, v2.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: utofp_v3i64_v3f64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOFP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NOFP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NOFP16-GI-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: utofp_v3i64_v3f64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: ucvtf v2.2d, v2.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: utofp_v3i64_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: mov d1, 
v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = uitofp <3 x i64> %a to <3 x double> ret <3 x double> %c @@ -1353,102 +1271,32 @@ entry: } define <3 x double> @stofp_v3i32_v3f64(<3 x i32> %a) { -; CHECK-NOFP16-SD-LABEL: stofp_v3i32_v3f64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NOFP16-SD-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-NOFP16-SD-NEXT: scvtf v3.2d, v1.2d -; CHECK-NOFP16-SD-NEXT: scvtf v2.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: fmov d0, d3 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: stofp_v3i32_v3f64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-FP16-SD-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-FP16-SD-NEXT: scvtf v3.2d, v1.2d -; CHECK-FP16-SD-NEXT: scvtf v2.2d, v0.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: fmov d0, d3 -; CHECK-FP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: stofp_v3i32_v3f64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NOFP16-GI-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-NOFP16-GI-NEXT: scvtf v3.2d, v1.2d -; CHECK-NOFP16-GI-NEXT: scvtf v2.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: mov d1, v3.d[1] -; CHECK-NOFP16-GI-NEXT: fmov d0, d3 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: stofp_v3i32_v3f64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-FP16-GI-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-FP16-GI-NEXT: scvtf v3.2d, v1.2d -; CHECK-FP16-GI-NEXT: scvtf v2.2d, v0.2d -; CHECK-FP16-GI-NEXT: // kill: def 
$d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: mov d1, v3.d[1] -; CHECK-FP16-GI-NEXT: fmov d0, d3 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: stofp_v3i32_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: scvtf v3.2d, v1.2d +; CHECK-NEXT: scvtf v2.2d, v0.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fmov d0, d3 +; CHECK-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x double> ret <3 x double> %c } define <3 x double> @utofp_v3i32_v3f64(<3 x i32> %a) { -; CHECK-NOFP16-SD-LABEL: utofp_v3i32_v3f64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NOFP16-SD-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NOFP16-SD-NEXT: ucvtf v3.2d, v1.2d -; CHECK-NOFP16-SD-NEXT: ucvtf v2.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: fmov d0, d3 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: utofp_v3i32_v3f64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-FP16-SD-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-FP16-SD-NEXT: ucvtf v3.2d, v1.2d -; CHECK-FP16-SD-NEXT: ucvtf v2.2d, v0.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: fmov d0, d3 -; CHECK-FP16-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: utofp_v3i32_v3f64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NOFP16-GI-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NOFP16-GI-NEXT: ucvtf v3.2d, v1.2d -; CHECK-NOFP16-GI-NEXT: ucvtf v2.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: mov d1, v3.d[1] -; 
CHECK-NOFP16-GI-NEXT: fmov d0, d3 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: utofp_v3i32_v3f64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-FP16-GI-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-FP16-GI-NEXT: ucvtf v3.2d, v1.2d -; CHECK-FP16-GI-NEXT: ucvtf v2.2d, v0.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: mov d1, v3.d[1] -; CHECK-FP16-GI-NEXT: fmov d0, d3 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: utofp_v3i32_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ucvtf v3.2d, v1.2d +; CHECK-NEXT: ucvtf v2.2d, v0.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fmov d0, d3 +; CHECK-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x double> ret <3 x double> %c @@ -1709,110 +1557,34 @@ entry: } define <3 x double> @stofp_v3i16_v3f64(<3 x i16> %a) { -; CHECK-NOFP16-SD-LABEL: stofp_v3i16_v3f64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-NOFP16-SD-NEXT: sshll v0.2d, v1.2s, #0 -; CHECK-NOFP16-SD-NEXT: sshll2 v1.2d, v1.4s, #0 -; CHECK-NOFP16-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: scvtf v2.2d, v1.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: stofp_v3i16_v3f64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-FP16-SD-NEXT: sshll v0.2d, v1.2s, #0 -; CHECK-FP16-SD-NEXT: sshll2 v1.2d, v1.4s, #0 -; CHECK-FP16-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: scvtf v2.2d, v1.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, 
#8 -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: stofp_v3i16_v3f64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-NOFP16-GI-NEXT: sshll v0.2d, v1.2s, #0 -; CHECK-NOFP16-GI-NEXT: sshll2 v1.2d, v1.4s, #0 -; CHECK-NOFP16-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: scvtf v2.2d, v1.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: stofp_v3i16_v3f64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-FP16-GI-NEXT: sshll v0.2d, v1.2s, #0 -; CHECK-FP16-GI-NEXT: sshll2 v1.2d, v1.4s, #0 -; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: scvtf v2.2d, v1.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: stofp_v3i16_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: sshll v0.2d, v1.2s, #0 +; CHECK-NEXT: sshll2 v1.2d, v1.4s, #0 +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v2.2d, v1.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = sitofp <3 x i16> %a to <3 x double> ret <3 x double> %c } define <3 x double> @utofp_v3i16_v3f64(<3 x i16> %a) { -; CHECK-NOFP16-SD-LABEL: utofp_v3i16_v3f64: -; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-NOFP16-SD-NEXT: ushll v0.2d, v1.2s, #0 -; CHECK-NOFP16-SD-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NOFP16-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: 
ucvtf v2.2d, v1.2d -; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NOFP16-SD-NEXT: ret -; -; CHECK-FP16-SD-LABEL: utofp_v3i16_v3f64: -; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-FP16-SD-NEXT: ushll v0.2d, v1.2s, #0 -; CHECK-FP16-SD-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-FP16-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: ucvtf v2.2d, v1.2d -; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-FP16-SD-NEXT: ret -; -; CHECK-NOFP16-GI-LABEL: utofp_v3i16_v3f64: -; CHECK-NOFP16-GI: // %bb.0: // %entry -; CHECK-NOFP16-GI-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-NOFP16-GI-NEXT: ushll v0.2d, v1.2s, #0 -; CHECK-NOFP16-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NOFP16-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NOFP16-GI-NEXT: ucvtf v2.2d, v1.2d -; CHECK-NOFP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-NOFP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-GI-NEXT: ret -; -; CHECK-FP16-GI-LABEL: utofp_v3i16_v3f64: -; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-FP16-GI-NEXT: ushll v0.2d, v1.2s, #0 -; CHECK-FP16-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: ucvtf v2.2d, v1.2d -; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-GI-NEXT: ret +; CHECK-LABEL: utofp_v3i16_v3f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.4s, v0.4h, #0 +; 
CHECK-NEXT: ushll v0.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v1.2d +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = uitofp <3 x i16> %a to <3 x double> ret <3 x double> %c @@ -2313,9 +2085,8 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-NOFP16-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-NOFP16-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NOFP16-SD-NEXT: ret ; ; CHECK-FP16-SD-LABEL: stofp_v3i8_v3f64: @@ -2332,9 +2103,8 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-FP16-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-FP16-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: stofp_v3i8_v3f64: @@ -2399,9 +2169,8 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) { ; CHECK-NOFP16-SD-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NOFP16-SD-NEXT: ucvtf v2.2d, v1.2d ; CHECK-NOFP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NOFP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-NOFP16-SD-NEXT: ret ; ; CHECK-FP16-SD-LABEL: utofp_v3i8_v3f64: @@ -2417,9 +2186,8 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) { ; 
CHECK-FP16-SD-NEXT: ucvtf v0.2d, v0.2d ; CHECK-FP16-SD-NEXT: ucvtf v2.2d, v1.2d ; CHECK-FP16-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-FP16-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: utofp_v3i8_v3f64: @@ -2474,7 +2242,7 @@ define <4 x double> @stofp_v4i8_v4f64(<4 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: stofp_v4i8_v4f64: ; CHECK-NOFP16-SD: // %bb.0: // %entry ; CHECK-NOFP16-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NOFP16-SD-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NOFP16-SD-NEXT: shl v1.2s, v1.2s, #24 @@ -2488,7 +2256,7 @@ define <4 x double> @stofp_v4i8_v4f64(<4 x i8> %a) { ; CHECK-FP16-SD-LABEL: stofp_v4i8_v4f64: ; CHECK-FP16-SD: // %bb.0: // %entry ; CHECK-FP16-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-FP16-SD-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-FP16-SD-NEXT: shl v1.2s, v1.2s, #24 @@ -2534,7 +2302,7 @@ define <4 x double> @utofp_v4i8_v4f64(<4 x i8> %a) { ; CHECK-NOFP16-SD: // %bb.0: // %entry ; CHECK-NOFP16-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NOFP16-SD-NEXT: movi d1, #0x0000ff000000ff -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d2, v0.d[1] ; CHECK-NOFP16-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-SD-NEXT: and v1.8b, v2.8b, v1.8b ; CHECK-NOFP16-SD-NEXT: ushll v0.2d, v0.2s, #0 @@ -2547,7 +2315,7 @@ define <4 x double> @utofp_v4i8_v4f64(<4 x i8> %a) { ; CHECK-FP16-SD: // %bb.0: // %entry ; CHECK-FP16-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-FP16-SD-NEXT: movi d1, #0x0000ff000000ff -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, 
v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] ; CHECK-FP16-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-FP16-SD-NEXT: and v1.8b, v2.8b, v1.8b ; CHECK-FP16-SD-NEXT: ushll v0.2d, v0.2s, #0 @@ -2767,7 +2535,7 @@ entry: define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: stofp_v16i8_v16f64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: mov b2, v0.b[0] ; CHECK-NOFP16-SD-NEXT: mov b3, v0.b[2] ; CHECK-NOFP16-SD-NEXT: mov b4, v0.b[4] @@ -2820,7 +2588,7 @@ define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: stofp_v16i8_v16f64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: mov b2, v0.b[0] ; CHECK-FP16-SD-NEXT: mov b3, v0.b[2] ; CHECK-FP16-SD-NEXT: mov b4, v0.b[4] @@ -2930,7 +2698,7 @@ entry: define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: utofp_v16i8_v16f64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d2, v0.d[1] ; CHECK-NOFP16-SD-NEXT: mov b3, v0.b[0] ; CHECK-NOFP16-SD-NEXT: mov b4, v0.b[2] ; CHECK-NOFP16-SD-NEXT: mov b5, v0.b[4] @@ -2976,7 +2744,7 @@ define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: utofp_v16i8_v16f64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] ; CHECK-FP16-SD-NEXT: mov b3, v0.b[0] ; CHECK-FP16-SD-NEXT: mov b4, v0.b[2] ; CHECK-FP16-SD-NEXT: mov b5, v0.b[4] @@ -3079,7 +2847,7 @@ entry: define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: stofp_v32i8_v32f64: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d3, v0.d[1] ; CHECK-NOFP16-SD-NEXT: mov b5, 
v1.b[6] ; CHECK-NOFP16-SD-NEXT: mov b17, v1.b[4] ; CHECK-NOFP16-SD-NEXT: mov b20, v1.b[2] @@ -3087,7 +2855,7 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-NOFP16-SD-NEXT: mov b18, v0.b[0] ; CHECK-NOFP16-SD-NEXT: mov b19, v0.b[6] ; CHECK-NOFP16-SD-NEXT: mov b22, v0.b[4] -; CHECK-NOFP16-SD-NEXT: ext v16.16b, v1.16b, v1.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d16, v1.d[1] ; CHECK-NOFP16-SD-NEXT: mov b2, v3.b[0] ; CHECK-NOFP16-SD-NEXT: mov b4, v3.b[2] ; CHECK-NOFP16-SD-NEXT: mov b6, v3.b[4] @@ -3189,7 +2957,7 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: stofp_v32i8_v32f64: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d3, v0.d[1] ; CHECK-FP16-SD-NEXT: mov b5, v1.b[6] ; CHECK-FP16-SD-NEXT: mov b17, v1.b[4] ; CHECK-FP16-SD-NEXT: mov b20, v1.b[2] @@ -3197,7 +2965,7 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-FP16-SD-NEXT: mov b18, v0.b[0] ; CHECK-FP16-SD-NEXT: mov b19, v0.b[6] ; CHECK-FP16-SD-NEXT: mov b22, v0.b[4] -; CHECK-FP16-SD-NEXT: ext v16.16b, v1.16b, v1.16b, #8 +; CHECK-FP16-SD-NEXT: mov d16, v1.d[1] ; CHECK-FP16-SD-NEXT: mov b2, v3.b[0] ; CHECK-FP16-SD-NEXT: mov b4, v3.b[2] ; CHECK-FP16-SD-NEXT: mov b6, v3.b[4] @@ -3418,7 +3186,7 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-NOFP16-SD: // %bb.0: // %entry ; CHECK-NOFP16-SD-NEXT: mov b6, v1.b[6] ; CHECK-NOFP16-SD-NEXT: mov b7, v1.b[4] -; CHECK-NOFP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-NOFP16-SD-NEXT: mov b16, v1.b[2] ; CHECK-NOFP16-SD-NEXT: mov b17, v1.b[0] ; CHECK-NOFP16-SD-NEXT: mov b19, v0.b[6] @@ -3426,7 +3194,7 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-NOFP16-SD-NEXT: movi d5, #0x0000ff000000ff ; CHECK-NOFP16-SD-NEXT: mov b24, v0.b[2] ; CHECK-NOFP16-SD-NEXT: mov b25, v0.b[0] -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: 
mov d2, v0.d[1] ; CHECK-NOFP16-SD-NEXT: mov v6.b[4], v1.b[7] ; CHECK-NOFP16-SD-NEXT: mov v7.b[4], v1.b[5] ; CHECK-NOFP16-SD-NEXT: mov b18, v3.b[0] @@ -3513,7 +3281,7 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-FP16-SD: // %bb.0: // %entry ; CHECK-FP16-SD-NEXT: mov b6, v1.b[6] ; CHECK-FP16-SD-NEXT: mov b7, v1.b[4] -; CHECK-FP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-FP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-FP16-SD-NEXT: mov b16, v1.b[2] ; CHECK-FP16-SD-NEXT: mov b17, v1.b[0] ; CHECK-FP16-SD-NEXT: mov b19, v0.b[6] @@ -3521,7 +3289,7 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-FP16-SD-NEXT: movi d5, #0x0000ff000000ff ; CHECK-FP16-SD-NEXT: mov b24, v0.b[2] ; CHECK-FP16-SD-NEXT: mov b25, v0.b[0] -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] ; CHECK-FP16-SD-NEXT: mov v6.b[4], v1.b[7] ; CHECK-FP16-SD-NEXT: mov v7.b[4], v1.b[5] ; CHECK-FP16-SD-NEXT: mov b18, v3.b[0] @@ -5093,7 +4861,7 @@ entry: define <16 x float> @stofp_v16i8_v16f32(<16 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: stofp_v16i8_v16f32: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: zip1 v3.8b, v1.8b, v0.8b @@ -5118,7 +4886,7 @@ define <16 x float> @stofp_v16i8_v16f32(<16 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: stofp_v16i8_v16f32: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: zip1 v3.8b, v1.8b, v0.8b @@ -5176,7 +4944,7 @@ entry: define <16 x float> @utofp_v16i8_v16f32(<16 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: utofp_v16i8_v16f32: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v1.16b, v0.16b, 
v0.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-NOFP16-SD-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: zip1 v3.8b, v1.8b, v0.8b @@ -5197,7 +4965,7 @@ define <16 x float> @utofp_v16i8_v16f32(<16 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: utofp_v16i8_v16f32: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-FP16-SD-NEXT: mov d1, v0.d[1] ; CHECK-FP16-SD-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: zip1 v3.8b, v1.8b, v0.8b @@ -5251,8 +5019,8 @@ entry: define <32 x float> @stofp_v32i8_v32f32(<32 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: stofp_v32i8_v32f32: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-NOFP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-NOFP16-SD-NEXT: zip1 v4.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: shl v4.4h, v4.4h, #8 @@ -5287,9 +5055,9 @@ define <32 x float> @stofp_v32i8_v32f32(<32 x i8> %a) { ; CHECK-NOFP16-SD-NEXT: sshll v7.4s, v7.4h, #0 ; CHECK-NOFP16-SD-NEXT: sshll v19.4s, v3.4h, #0 ; CHECK-NOFP16-SD-NEXT: scvtf v1.4s, v16.4s -; CHECK-NOFP16-SD-NEXT: scvtf v4.4s, v5.4s ; CHECK-NOFP16-SD-NEXT: scvtf v2.4s, v6.4s ; CHECK-NOFP16-SD-NEXT: scvtf v3.4s, v17.4s +; CHECK-NOFP16-SD-NEXT: scvtf v4.4s, v5.4s ; CHECK-NOFP16-SD-NEXT: scvtf v5.4s, v18.4s ; CHECK-NOFP16-SD-NEXT: scvtf v6.4s, v7.4s ; CHECK-NOFP16-SD-NEXT: scvtf v7.4s, v19.4s @@ -5297,8 +5065,8 @@ define <32 x float> @stofp_v32i8_v32f32(<32 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: stofp_v32i8_v32f32: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-FP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-FP16-SD-NEXT: zip1 
v4.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: shl v4.4h, v4.4h, #8 @@ -5333,9 +5101,9 @@ define <32 x float> @stofp_v32i8_v32f32(<32 x i8> %a) { ; CHECK-FP16-SD-NEXT: sshll v7.4s, v7.4h, #0 ; CHECK-FP16-SD-NEXT: sshll v19.4s, v3.4h, #0 ; CHECK-FP16-SD-NEXT: scvtf v1.4s, v16.4s -; CHECK-FP16-SD-NEXT: scvtf v4.4s, v5.4s ; CHECK-FP16-SD-NEXT: scvtf v2.4s, v6.4s ; CHECK-FP16-SD-NEXT: scvtf v3.4s, v17.4s +; CHECK-FP16-SD-NEXT: scvtf v4.4s, v5.4s ; CHECK-FP16-SD-NEXT: scvtf v5.4s, v18.4s ; CHECK-FP16-SD-NEXT: scvtf v6.4s, v7.4s ; CHECK-FP16-SD-NEXT: scvtf v7.4s, v19.4s @@ -5396,8 +5164,8 @@ entry: define <32 x float> @utofp_v32i8_v32f32(<32 x i8> %a) { ; CHECK-NOFP16-SD-LABEL: utofp_v32i8_v32f32: ; CHECK-NOFP16-SD: // %bb.0: // %entry -; CHECK-NOFP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NOFP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NOFP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-NOFP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-NOFP16-SD-NEXT: zip1 v4.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-NOFP16-SD-NEXT: bic v4.4h, #255, lsl #8 @@ -5434,8 +5202,8 @@ define <32 x float> @utofp_v32i8_v32f32(<32 x i8> %a) { ; ; CHECK-FP16-SD-LABEL: utofp_v32i8_v32f32: ; CHECK-FP16-SD: // %bb.0: // %entry -; CHECK-FP16-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-FP16-SD-NEXT: mov d2, v0.d[1] +; CHECK-FP16-SD-NEXT: mov d3, v1.d[1] ; CHECK-FP16-SD-NEXT: zip1 v4.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-FP16-SD-NEXT: bic v4.4h, #255, lsl #8 diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll index 8c3dec85af8b3..e6c931c6d1834 100644 --- a/llvm/test/CodeGen/AArch64/neon-abd.ll +++ b/llvm/test/CodeGen/AArch64/neon-abd.ll @@ -597,7 +597,7 @@ define <8 x i32> @sabd_8h_bv_imm(<8 x i16> %a) { ; CHECK-LABEL: sabd_8h_bv_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x9, #549747425280 // 
=0x7fff800000 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: adrp x8, .LCPI45_0 ; CHECK-NEXT: movk x9, #69, lsl #48 ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI45_0] @@ -619,7 +619,7 @@ define <8 x i32> @sabd_8h_bv_imm(<8 x i16> %a) { define <8 x i32> @uabd_8h_bv_imm(<8 x i16> %a) { ; CHECK-LABEL: uabd_8h_bv_imm: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: adrp x8, .LCPI46_0 ; CHECK-NEXT: adrp x9, .LCPI46_1 ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI46_0] diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index dfff35d9eb1b2..709d3d387d3a1 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -113,10 +113,10 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d1, v1.d[1] +; CHECK-NODOT-NEXT: mov d2, v2.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s -; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v3.d[1] ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret @@ -172,10 +172,10 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d1, v1.d[1] +; CHECK-NODOT-NEXT: mov d2, v2.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s -; CHECK-NODOT-NEXT: 
ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v3.d[1] ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret @@ -324,11 +324,11 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d4, v1.d[1] +; CHECK-NODOT-NEXT: mov d5, v2.d[1] ; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: mov d3, v3.d[1] +; CHECK-NODOT-NEXT: mov d1, v1.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s @@ -341,11 +341,11 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-DOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-DOT-NEXT: mov d4, v1.d[1] +; CHECK-DOT-NEXT: mov d5, v2.d[1] ; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-DOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-DOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-DOT-NEXT: mov d3, v3.d[1] +; CHECK-DOT-NEXT: mov d1, v1.d[1] ; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h ; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s @@ -490,11 +490,11 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-NODOT-NEXT: ext v4.16b, 
v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d4, v1.d[1] +; CHECK-NODOT-NEXT: mov d5, v2.d[1] ; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: mov d3, v3.d[1] +; CHECK-NODOT-NEXT: mov d1, v1.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s @@ -507,11 +507,11 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-DOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-DOT-NEXT: mov d4, v1.d[1] +; CHECK-DOT-NEXT: mov d5, v2.d[1] ; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-DOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-DOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-DOT-NEXT: mov d3, v3.d[1] +; CHECK-DOT-NEXT: mov d1, v1.d[1] ; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s ; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h ; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s @@ -876,10 +876,10 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d1, v1.d[1] +; CHECK-NODOT-NEXT: mov d2, v2.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s -; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v3.d[1] ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret @@ -908,10 +908,10 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ 
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d1, v1.d[1] +; CHECK-NODOT-NEXT: mov d2, v2.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s -; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v3.d[1] ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret @@ -1036,7 +1036,7 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) { ; CHECK-COMMON-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-COMMON-NEXT: umull v3.4s, v2.4h, v1.4h ; CHECK-COMMON-NEXT: umlal v0.4s, v2.4h, v1.4h -; CHECK-COMMON-NEXT: ext v1.16b, v3.16b, v3.16b, #8 +; CHECK-COMMON-NEXT: mov d1, v3.d[1] ; CHECK-COMMON-NEXT: add v0.2s, v1.2s, v0.2s ; CHECK-COMMON-NEXT: ret %u.wide = zext <4 x i8> %u to <4 x i32> @@ -1461,19 +1461,19 @@ define <2 x i32> @udot_v16i8tov2i32(<2 x i32> %acc, <16 x i8> %input) { ; CHECK-NODOT-NEXT: ushll v3.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h ; CHECK-NODOT-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v2.d[1] +; CHECK-NODOT-NEXT: mov d3, v3.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: ext v3.16b, v4.16b, v4.16b, #8 +; CHECK-NODOT-NEXT: mov d3, v4.d[1] ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v2.d[1] ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: mov d1, v1.d[1] ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s -; 
CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: mov d2, v3.d[1] ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll b/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll index 4a45484d17a95..387f3c250a701 100644 --- a/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll +++ b/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll @@ -1,142 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -asm-verbose=false < %s | FileCheck %s define float @test_dup_sv2S(<2 x float> %v) #0 { - ; CHECK-LABEL: test_dup_sv2S: - ; CHECK-NEXT: mov s{{[0-9]+}}, {{v[0-9]+}}.s[1] - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_sv2S: +; CHECK: mov s0, v0.s[1] +; CHECK-NEXT: ret %tmp1 = extractelement <2 x float> %v, i32 1 ret float %tmp1 } define float @test_dup_sv2S_0(<2 x float> %v) #0 { - ; CHECK-LABEL: test_dup_sv2S_0: - ; CHECK-NOT: dup {{[vsd][0-9]+}} - ; CHECK-NOT: ins {{[vsd][0-9]+}} - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_sv2S_0: +; CHECK: ret %tmp1 = extractelement <2 x float> %v, i32 0 ret float %tmp1 } define float @test_dup_sv4S(<4 x float> %v) #0 { - ; CHECK-LABEL: test_dup_sv4S: - ; CHECK-NEXT: mov s{{[0-9]+}}, {{v[0-9]+}}.s[1] - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_sv4S: +; CHECK: mov s0, v0.s[1] +; CHECK-NEXT: ret %tmp1 = extractelement <4 x float> %v, i32 1 ret float %tmp1 } define float @test_dup_sv4S_0(<4 x float> %v) #0 { - ; CHECK-LABEL: test_dup_sv4S_0: - ; CHECK-NOT: dup {{[vsd][0-9]+}} - ; CHECK-NOT: ins {{[vsd][0-9]+}} - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_sv4S_0: +; CHECK: ret %tmp1 = extractelement <4 x float> %v, i32 0 ret float %tmp1 } define double @test_dup_dvD(<1 x double> %v) #0 { - ; CHECK-LABEL: test_dup_dvD: - ; CHECK-NOT: dup {{[vsd][0-9]+}} - ; CHECK-NOT: ins {{[vsd][0-9]+}} - ; CHECK-NEXT: ret +; 
CHECK-LABEL: test_dup_dvD: +; CHECK: ret %tmp1 = extractelement <1 x double> %v, i32 0 ret double %tmp1 } define double @test_dup_dv2D(<2 x double> %v) #0 { - ; CHECK-LABEL: test_dup_dv2D: - ; CHECK-NEXT: mov d{{[0-9]+}}, {{v[0-9]+}}.d[1] - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_dv2D: +; CHECK: mov d0, v0.d[1] +; CHECK-NEXT: ret %tmp1 = extractelement <2 x double> %v, i32 1 ret double %tmp1 } define double @test_dup_dv2D_0(<2 x double> %v) #0 { - ; CHECK-LABEL: test_dup_dv2D_0: - ; CHECK-NOT: dup {{[vsd][0-9]+}} - ; CHECK-NOT: ins {{[vsd][0-9]+}} - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_dv2D_0: +; CHECK: ret %tmp1 = extractelement <2 x double> %v, i32 0 ret double %tmp1 } define half @test_dup_hv8H(<8 x half> %v) #0 { - ; CHECK-LABEL: test_dup_hv8H: - ; CHECK-NEXT: mov h{{[0-9]+}}, {{v[0-9]+}}.h[1] - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_hv8H: +; CHECK: mov h0, v0.h[1] +; CHECK-NEXT: ret %tmp1 = extractelement <8 x half> %v, i32 1 ret half %tmp1 } define half @test_dup_hv8H_0(<8 x half> %v) #0 { - ; CHECK-LABEL: test_dup_hv8H_0: - ; CHECK-NOT: dup {{[vsdh][0-9]+}} - ; CHECK-NOT: ins {{[vsdh][0-9]+}} - ; CHECK-NEXT: ret +; CHECK-LABEL: test_dup_hv8H_0: +; CHECK: ret %tmp1 = extractelement <8 x half> %v, i32 0 ret half %tmp1 } define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_bv16B: - ; CHECK-NEXT: dup v0.8b, v0.b[14] - ; CHECK-NEXT: ret - %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> +; CHECK-LABEL: test_vector_dup_bv16B: +; CHECK: dup v0.8b, v0.b[14] +; CHECK-NEXT: ret + %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> ret <1 x i8> %shuffle.i } define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_bv8B: - ; CHECK-NEXT: dup v0.8b, v0.b[7] - ; CHECK-NEXT: ret - %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> +; CHECK-LABEL: test_vector_dup_bv8B: +; CHECK: dup v0.8b, v0.b[7] +; CHECK-NEXT: ret + %shuffle.i = 
shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> ret <1 x i8> %shuffle.i } define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_hv8H: - ; CHECK-NEXT: dup v0.4h, v0.h[7] - ; CHECK-NEXT: ret - %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> +; CHECK-LABEL: test_vector_dup_hv8H: +; CHECK: dup v0.4h, v0.h[7] +; CHECK-NEXT: ret + %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> ret <1 x i16> %shuffle.i } define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_hv4H: - ; CHECK-NEXT: dup v0.4h, v0.h[3] - ; CHECK-NEXT: ret - %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> +; CHECK-LABEL: test_vector_dup_hv4H: +; CHECK: dup v0.4h, v0.h[3] +; CHECK-NEXT: ret + %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> ret <1 x i16> %shuffle.i } define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_sv4S: - ; CHECK-NEXT: dup v0.2s, v0.s[3] - ; CHECK-NEXT: ret - %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> +; CHECK-LABEL: test_vector_dup_sv4S: +; CHECK: dup v0.2s, v0.s[3] +; CHECK-NEXT: ret + %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> ret <1 x i32> %shuffle } define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_sv2S: - ; CHECK-NEXT: dup v0.2s, v0.s[1] - ; CHECK-NEXT: ret - %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> +; CHECK-LABEL: test_vector_dup_sv2S: +; CHECK: dup v0.2s, v0.s[1] +; CHECK-NEXT: ret + %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> ret <1 x i32> %shuffle } define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) #0 { - ; CHECK-LABEL: test_vector_dup_dv2D: - ; CHECK-NEXT: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8 - ; CHECK-NEXT: ret - %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> +; CHECK-LABEL: 
test_vector_dup_dv2D: +; CHECK: mov d0, v0.d[1] +; CHECK-NEXT: ret + %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> ret <1 x i64> %shuffle.i } define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) #0 { - ; CHECK-LABEL: test_vector_copy_dup_dv2D: - ; CHECK-NEXT: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8 - ; CHECK-NEXT: ret +; CHECK-LABEL: test_vector_copy_dup_dv2D: +; CHECK: mov d0, v1.d[1] +; CHECK-NEXT: ret %vget_lane = extractelement <2 x i64> %c, i32 1 %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0 ret <1 x i64> %vset_lane @@ -146,7 +137,7 @@ define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) #0 { ; as long as we don't crash (since it could be dynamically unreachable). define i32 @test_out_of_range_extract(<4 x i32> %vec) { ; CHECK-LABEL: test_out_of_range_extract: -; CHECK: ret +; CHECK: ret %elt = extractelement <4 x i32> %vec, i32 4 ret i32 %elt } @@ -155,7 +146,7 @@ define i32 @test_out_of_range_extract(<4 x i32> %vec) { ; as long as we don't crash (since it could be dynamically unreachable). 
define void @test_out_of_range_insert(<4 x i32> %vec, i32 %elt) { ; CHECK-LABEL: test_out_of_range_insert: -; CHECK: ret +; CHECK: ret insertelement <4 x i32> %vec, i32 %elt, i32 4 ret void } diff --git a/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll b/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll index 50a022f632577..a8c55b476b810 100644 --- a/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll +++ b/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll @@ -154,7 +154,7 @@ define <8 x i16> @test_sshll2_v16i8(<16 x i8> %a) { ; ; CHECK-GI-LABEL: test_sshll2_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #3 ; CHECK-GI-NEXT: ret %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> @@ -189,7 +189,7 @@ define <4 x i32> @test_sshll2_v8i16(<8 x i16> %a) { ; ; CHECK-GI-LABEL: test_sshll2_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #9 ; CHECK-GI-NEXT: ret %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -224,7 +224,7 @@ define <2 x i64> @test_sshll2_v4i32(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_sshll2_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #19 ; CHECK-GI-NEXT: ret %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -259,7 +259,7 @@ define <8 x i16> @test_ushll2_v16i8(<16 x i8> %a) { ; ; CHECK-GI-LABEL: test_ushll2_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #3 ; CHECK-GI-NEXT: ret %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> @@ -288,7 +288,7 @@ define <4 x i32> @test_ushll2_v8i16(<8 x i16> %a) { ; ; CHECK-GI-LABEL: test_ushll2_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; 
CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #9 ; CHECK-GI-NEXT: ret %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -317,7 +317,7 @@ define <2 x i64> @test_ushll2_v4i32(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_ushll2_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #19 ; CHECK-GI-NEXT: ret %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll index 62b3e5651423f..662a95e4477b7 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -543,12 +543,10 @@ define <5 x double> @test_ldnp_v5f64(ptr %A) { ; CHECK-LE: ; %bb.0: ; CHECK-LE-NEXT: ldnp q0, q2, [x0] ; CHECK-LE-NEXT: ldr d4, [x0, #32] -; CHECK-LE-NEXT: ext.16b v1, v0, v0, #8 -; CHECK-LE-NEXT: ext.16b v3, v2, v2, #8 +; CHECK-LE-NEXT: mov d1, v0[1] +; CHECK-LE-NEXT: mov d3, v2[1] ; CHECK-LE-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ; kill: def $d2 killed $d2 killed $q2 -; CHECK-LE-NEXT: ; kill: def $d1 killed $d1 killed $q1 -; CHECK-LE-NEXT: ; kill: def $d3 killed $d3 killed $q3 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v5f64: @@ -558,12 +556,10 @@ define <5 x double> @test_ldnp_v5f64(ptr %A) { ; CHECK-BE-NEXT: ldr d4, [x0, #32] ; CHECK-BE-NEXT: ld1 { v2.2d }, [x8] ; CHECK-BE-NEXT: // kill: def $d4 killed $d4 killed $q4 -; CHECK-BE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: mov d1, v0.d[1] ; CHECK-BE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-BE-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-BE-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-BE-NEXT: mov d3, v2.d[1] ; CHECK-BE-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-BE-NEXT: // kill: def $d3 killed $d3 killed $q3 ; CHECK-BE-NEXT: ret %lv = load<5 x double>, ptr %A, align 8, !nontemporal !0 ret 
<5 x double> %lv diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll index fc364357436e2..7f7d79db65ed3 100644 --- a/llvm/test/CodeGen/AArch64/ptradd.ll +++ b/llvm/test/CodeGen/AArch64/ptradd.ll @@ -76,15 +76,14 @@ define <3 x ptr> @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-SD-NEXT: ext v4.16b, v3.16b, v3.16b, #8 +; CHECK-SD-NEXT: mov d4, v3.d[1] ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: saddw v2.2d, v2.2d, v4.2s ; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v3.2s ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v3i32: @@ -250,9 +249,8 @@ define <3 x ptr> @vector_gep_v3i64_base(ptr %b, <3 x i64> %off) { ; CHECK-SD-NEXT: dup v1.2d, x0 ; CHECK-SD-NEXT: add d2, d3, d2 ; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v3i64_base: diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll index cea7028b322a4..c6229007bec3c 100644 --- a/llvm/test/CodeGen/AArch64/reduce-and.ll +++ b/llvm/test/CodeGen/AArch64/reduce-and.ll @@ -283,7 +283,7 @@ define i8 @test_redand_v8i8(<8 x i8> %a) { define i8 @test_redand_v16i8(<16 x i8> %a) { ; CHECK-SD-LABEL: test_redand_v16i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; 
CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: and x8, x8, x8, lsr #32 @@ -294,7 +294,7 @@ define i8 @test_redand_v16i8(<16 x i8> %a) { ; ; CHECK-GI-LABEL: test_redand_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -320,7 +320,7 @@ define i8 @test_redand_v32i8(<32 x i8> %a) { ; CHECK-SD-LABEL: test_redand_v32i8: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: and x8, x8, x8, lsr #32 @@ -332,7 +332,7 @@ define i8 @test_redand_v32i8(<32 x i8> %a) { ; CHECK-GI-LABEL: test_redand_v32i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -381,7 +381,7 @@ define i16 @test_redand_v4i16(<4 x i16> %a) { define i16 @test_redand_v8i16(<8 x i16> %a) { ; CHECK-SD-LABEL: test_redand_v8i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: and x8, x8, x8, lsr #32 @@ -391,7 +391,7 @@ define i16 @test_redand_v8i16(<8 x i16> %a) { ; ; CHECK-GI-LABEL: test_redand_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] @@ -409,7 +409,7 @@ define i16 @test_redand_v16i16(<16 x i16> %a) { ; CHECK-SD-LABEL: test_redand_v16i16: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, 
v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: and x8, x8, x8, lsr #32 @@ -420,7 +420,7 @@ define i16 @test_redand_v16i16(<16 x i16> %a) { ; CHECK-GI-LABEL: test_redand_v16i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] @@ -456,7 +456,7 @@ define i32 @test_redand_v2i32(<2 x i32> %a) { define i32 @test_redand_v4i32(<4 x i32> %a) { ; CHECK-SD-LABEL: test_redand_v4i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -465,7 +465,7 @@ define i32 @test_redand_v4i32(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_redand_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -479,7 +479,7 @@ define i32 @test_redand_v8i32(<8 x i32> %a) { ; CHECK-SD-LABEL: test_redand_v8i32: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -489,7 +489,7 @@ define i32 @test_redand_v8i32(<8 x i32> %a) { ; CHECK-GI-LABEL: test_redand_v8i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -502,7 +502,7 @@ define i32 @test_redand_v8i32(<8 x i32> %a) { define i64 
@test_redand_v2i64(<2 x i64> %a) { ; CHECK-SD-LABEL: test_redand_v2i64: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret @@ -521,7 +521,7 @@ define i64 @test_redand_v4i64(<4 x i64> %a) { ; CHECK-SD-LABEL: test_redand_v4i64: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll index 71f21f316d9e1..42dea98051e0d 100644 --- a/llvm/test/CodeGen/AArch64/reduce-or.ll +++ b/llvm/test/CodeGen/AArch64/reduce-or.ll @@ -284,7 +284,7 @@ define i8 @test_redor_v8i8(<8 x i8> %a) { define i8 @test_redor_v16i8(<16 x i8> %a) { ; CHECK-SD-LABEL: test_redor_v16i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: orr x8, x8, x8, lsr #32 @@ -295,7 +295,7 @@ define i8 @test_redor_v16i8(<16 x i8> %a) { ; ; CHECK-GI-LABEL: test_redor_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -321,7 +321,7 @@ define i8 @test_redor_v32i8(<32 x i8> %a) { ; CHECK-SD-LABEL: test_redor_v32i8: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: orr x8, x8, x8, lsr #32 @@ -333,7 +333,7 @@ define i8 @test_redor_v32i8(<32 x i8> %a) { ; CHECK-GI-LABEL: test_redor_v32i8: ; CHECK-GI: // %bb.0: ; 
CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -382,7 +382,7 @@ define i16 @test_redor_v4i16(<4 x i16> %a) { define i16 @test_redor_v8i16(<8 x i16> %a) { ; CHECK-SD-LABEL: test_redor_v8i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: orr x8, x8, x8, lsr #32 @@ -392,7 +392,7 @@ define i16 @test_redor_v8i16(<8 x i16> %a) { ; ; CHECK-GI-LABEL: test_redor_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] @@ -410,7 +410,7 @@ define i16 @test_redor_v16i16(<16 x i16> %a) { ; CHECK-SD-LABEL: test_redor_v16i16: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: orr x8, x8, x8, lsr #32 @@ -421,7 +421,7 @@ define i16 @test_redor_v16i16(<16 x i16> %a) { ; CHECK-GI-LABEL: test_redor_v16i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] @@ -457,7 +457,7 @@ define i32 @test_redor_v2i32(<2 x i32> %a) { define i32 @test_redor_v4i32(<4 x i32> %a) { ; CHECK-SD-LABEL: test_redor_v4i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ 
-466,7 +466,7 @@ define i32 @test_redor_v4i32(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_redor_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -480,7 +480,7 @@ define i32 @test_redor_v8i32(<8 x i32> %a) { ; CHECK-SD-LABEL: test_redor_v8i32: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -490,7 +490,7 @@ define i32 @test_redor_v8i32(<8 x i32> %a) { ; CHECK-GI-LABEL: test_redor_v8i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -503,7 +503,7 @@ define i32 @test_redor_v8i32(<8 x i32> %a) { define i64 @test_redor_v2i64(<2 x i64> %a) { ; CHECK-SD-LABEL: test_redor_v2i64: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret @@ -522,7 +522,7 @@ define i64 @test_redor_v4i64(<4 x i64> %a) { ; CHECK-SD-LABEL: test_redor_v4i64: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll index 39e0d28c81eea..df7cb5b31dfb3 100644 --- a/llvm/test/CodeGen/AArch64/reduce-xor.ll +++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll @@ -273,7 +273,7 @@ define i8 @test_redxor_v8i8(<8 x i8> %a) { define i8 
@test_redxor_v16i8(<16 x i8> %a) { ; CHECK-SD-LABEL: test_redxor_v16i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: eor x8, x8, x8, lsr #32 @@ -284,7 +284,7 @@ define i8 @test_redxor_v16i8(<16 x i8> %a) { ; ; CHECK-GI-LABEL: test_redxor_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -310,7 +310,7 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) { ; CHECK-SD-LABEL: test_redxor_v32i8: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: eor x8, x8, x8, lsr #32 @@ -322,7 +322,7 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) { ; CHECK-GI-LABEL: test_redxor_v32i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.b[0] ; CHECK-GI-NEXT: umov w9, v0.b[1] @@ -371,7 +371,7 @@ define i16 @test_redxor_v4i16(<4 x i16> %a) { define i16 @test_redxor_v8i16(<8 x i16> %a) { ; CHECK-SD-LABEL: test_redxor_v8i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: eor x8, x8, x8, lsr #32 @@ -381,7 +381,7 @@ define i16 @test_redxor_v8i16(<8 x i16> %a) { ; ; CHECK-GI-LABEL: test_redxor_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov 
w9, v0.h[1] @@ -399,7 +399,7 @@ define i16 @test_redxor_v16i16(<16 x i16> %a) { ; CHECK-SD-LABEL: test_redxor_v16i16: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: eor x8, x8, x8, lsr #32 @@ -410,7 +410,7 @@ define i16 @test_redxor_v16i16(<16 x i16> %a) { ; CHECK-GI-LABEL: test_redxor_v16i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] @@ -446,7 +446,7 @@ define i32 @test_redxor_v2i32(<2 x i32> %a) { define i32 @test_redxor_v4i32(<4 x i32> %a) { ; CHECK-SD-LABEL: test_redxor_v4i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -455,7 +455,7 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) { ; ; CHECK-GI-LABEL: test_redxor_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -469,7 +469,7 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) { ; CHECK-SD-LABEL: test_redxor_v8i32: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: lsr x9, x8, #32 @@ -479,7 +479,7 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) { ; CHECK-GI-LABEL: test_redxor_v8i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; 
CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: fmov w9, s0 @@ -492,7 +492,7 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) { define i64 @test_redxor_v2i64(<2 x i64> %a) { ; CHECK-SD-LABEL: test_redxor_v2i64: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret @@ -511,7 +511,7 @@ define i64 @test_redxor_v4i64(<4 x i64> %a) { ; CHECK-SD-LABEL: test_redxor_v4i64: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll index 32bcd5a697d17..5dc5aa52b9155 100644 --- a/llvm/test/CodeGen/AArch64/rem.ll +++ b/llvm/test/CodeGen/AArch64/rem.ll @@ -1276,7 +1276,7 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-NEXT: ushll v0.4s, v2.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d5, v1.d[1] ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: mov w10, v0.s[1] @@ -1436,8 +1436,8 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { ; CHECK-GI-NEXT: ushll v5.4s, v3.4h, #0 ; CHECK-GI-NEXT: ushll2 v4.4s, v4.8h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v1.4h, #0 -; CHECK-GI-NEXT: ext v18.16b, v3.16b, v3.16b, #8 -; CHECK-GI-NEXT: ext v19.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d18, v3.d[1] +; CHECK-GI-NEXT: mov d19, v1.d[1] ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov w12, v2.s[3] @@ -1860,8 +1860,8 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mov v20.h[1], w15 ; CHECK-GI-NEXT: ushll v2.8h, 
v2.8b, #0 ; CHECK-GI-NEXT: ushll v7.4s, v4.4h, #0 -; CHECK-GI-NEXT: ext v29.16b, v4.16b, v4.16b, #8 -; CHECK-GI-NEXT: ext v28.16b, v2.16b, v2.16b, #8 +; CHECK-GI-NEXT: mov d29, v4.d[1] +; CHECK-GI-NEXT: mov d28, v2.d[1] ; CHECK-GI-NEXT: umlsl v26.4s, v19.4h, v2.4h ; CHECK-GI-NEXT: fmov w17, s7 ; CHECK-GI-NEXT: udiv w12, w12, w16 @@ -1937,8 +1937,8 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { ; CHECK-GI-NEXT: mov v23.h[1], w23 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll v18.4s, v7.4h, #0 -; CHECK-GI-NEXT: ext v31.16b, v7.16b, v7.16b, #8 -; CHECK-GI-NEXT: ext v30.16b, v3.16b, v3.16b, #8 +; CHECK-GI-NEXT: mov d31, v7.d[1] +; CHECK-GI-NEXT: mov d30, v3.d[1] ; CHECK-GI-NEXT: umlsl v27.4s, v22.4h, v3.4h ; CHECK-GI-NEXT: fmov w25, s18 ; CHECK-GI-NEXT: mov w28, v18.s[3] @@ -2668,7 +2668,7 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) { ; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d5, v1.d[1] ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: mov w10, v2.s[1] @@ -2851,8 +2851,8 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v2.4h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v3.4h, #0 -; CHECK-GI-NEXT: ext v18.16b, v2.16b, v2.16b, #8 -; CHECK-GI-NEXT: ext v19.16b, v3.16b, v3.16b, #8 +; CHECK-GI-NEXT: mov d18, v2.d[1] +; CHECK-GI-NEXT: mov d19, v3.d[1] ; CHECK-GI-NEXT: fmov w8, s4 ; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov w12, v5.s[3] diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index e6af0256fe6e2..278731ad0663a 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -273,9 +273,8 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) { ; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 
killed $q2 ; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #56 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v3i8_v3i64: @@ -324,9 +323,8 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-SD-NEXT: sshll v0.2d, v2.2s, #0 ; CHECK-SD-NEXT: sshll2 v2.2d, v2.4s, #0 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v3i16_v3i64: @@ -350,9 +348,8 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) { ; CHECK-SD-NEXT: sshll v3.2d, v0.2s, #0 ; CHECK-SD-NEXT: sshll2 v2.2d, v0.4s, #0 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-SD-NEXT: mov d1, v3.d[1] ; CHECK-SD-NEXT: fmov d0, d3 -; CHECK-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v3i32_v3i64: @@ -435,9 +432,8 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) { ; CHECK-SD-NEXT: shl v0.2d, v0.2d, #54 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #54 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v3i10_v3i64: diff --git a/llvm/test/CodeGen/AArch64/st1-lane.ll b/llvm/test/CodeGen/AArch64/st1-lane.ll index 06128df3b934a..a212376afe000 100644 --- a/llvm/test/CodeGen/AArch64/st1-lane.ll +++ b/llvm/test/CodeGen/AArch64/st1-lane.ll @@ -108,21 +108,13 @@ define void @v8bf16(<8 x bfloat> %a, ptr %p1, ptr %p2) { define ptr @post_v2i64(<2 x i64> 
%a, ptr %p1, ptr %p2) { -; CHECK-SD-LABEL: post_v2i64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: str d0, [x0] -; CHECK-SD-NEXT: str d1, [x1], #8 -; CHECK-SD-NEXT: mov x0, x1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: post_v2i64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: str d0, [x0] -; CHECK-GI-NEXT: str d1, [x1], #8 -; CHECK-GI-NEXT: mov x0, x1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: post_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: str d1, [x1], #8 +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret %s1 = shufflevector <2 x i64> %a, <2 x i64> poison, <1 x i32> store <1 x i64> %s1, ptr %p1, align 8 %s2 = shufflevector <2 x i64> %a, <2 x i64> poison, <1 x i32> @@ -134,7 +126,7 @@ define ptr @post_v2i64(<2 x i64> %a, ptr %p1, ptr %p2) { define ptr @post_v4i32(<4 x i32> %a, ptr %p1, ptr %p2) { ; CHECK-LABEL: post_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: str d1, [x1], #8 ; CHECK-NEXT: mov x0, x1 @@ -150,7 +142,7 @@ define ptr @post_v4i32(<4 x i32> %a, ptr %p1, ptr %p2) { define ptr @post_v8i16(<8 x i16> %a, ptr %p1, ptr %p2) { ; CHECK-LABEL: post_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: str d1, [x1], #8 ; CHECK-NEXT: mov x0, x1 @@ -166,7 +158,7 @@ define ptr @post_v8i16(<8 x i16> %a, ptr %p1, ptr %p2) { define ptr @post_v16i8(<16 x i8> %a, ptr %p1, ptr %p2) { ; CHECK-LABEL: post_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: str d1, [x1], #8 ; CHECK-NEXT: mov x0, x1 @@ -180,21 +172,13 @@ define ptr @post_v16i8(<16 x i8> %a, ptr %p1, ptr %p2) { } define ptr @post_v2f64(<2 x double> %a, ptr %p1, ptr %p2) { -; CHECK-SD-LABEL: post_v2f64: -; 
CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: str d0, [x0] -; CHECK-SD-NEXT: str d1, [x1], #8 -; CHECK-SD-NEXT: mov x0, x1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: post_v2f64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: str d0, [x0] -; CHECK-GI-NEXT: str d1, [x1], #8 -; CHECK-GI-NEXT: mov x0, x1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: post_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: str d1, [x1], #8 +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret %s1 = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> store <1 x double> %s1, ptr %p1, align 8 %s2 = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> @@ -206,7 +190,7 @@ define ptr @post_v2f64(<2 x double> %a, ptr %p1, ptr %p2) { define ptr @post_v4f32(<4 x float> %a, ptr %p1, ptr %p2) { ; CHECK-LABEL: post_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: str d1, [x1], #8 ; CHECK-NEXT: mov x0, x1 @@ -222,7 +206,7 @@ define ptr @post_v4f32(<4 x float> %a, ptr %p1, ptr %p2) { define ptr @post_v8f16(<8 x half> %a, ptr %p1, ptr %p2) { ; CHECK-LABEL: post_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: str d1, [x1], #8 ; CHECK-NEXT: mov x0, x1 @@ -238,7 +222,7 @@ define ptr @post_v8f16(<8 x half> %a, ptr %p1, ptr %p2) { define ptr @post_v8bf16(<8 x bfloat> %a, ptr %p1, ptr %p2) { ; CHECK-LABEL: post_v8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: str d1, [x1], #8 ; CHECK-NEXT: mov x0, x1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll index 6b082b1762cc8..da00e6003df12 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -27,8 +27,7 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8) ret <8 x i8> %ret @@ -185,8 +184,7 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4) ret <4 x i16> %ret @@ -300,8 +298,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 { define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2) ret <2 x i32> %ret @@ -404,8 +401,7 @@ define void @extract_subvector_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <1 x i64> 
@llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1) ret <1 x i64> %ret @@ -531,8 +527,7 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4) ret <4 x half> %ret @@ -646,8 +641,7 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) # define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2) ret <2 x float> %ret @@ -750,8 +744,7 @@ define void @extract_subvector_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1) ret <1 x double> %ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index 1690c3c5a75f7..e59b508fb1f13 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -253,7 +253,7 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ldr q0, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; 
VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -1140,7 +1140,7 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ldr q0, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index d2fa65599b973..83d29b08a67c3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -157,7 +157,7 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 { ; VBITS_GE_256-LABEL: sext_v16i8_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b @@ -245,7 +245,7 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 { ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -407,7 +407,7 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 { ; VBITS_GE_256-LABEL: sext_v8i16_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_256-NEXT: ext 
v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -656,7 +656,7 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 { ; VBITS_GE_256-LABEL: zext_v16i8_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b @@ -744,7 +744,7 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 { ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -906,7 +906,7 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 { ; VBITS_GE_256-LABEL: zext_v8i16_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 5753e5972f9c8..6f4f25890d46a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -255,7 +255,7 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ldr q0, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; 
VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -1176,7 +1176,7 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ldr q0, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-expandloads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-expandloads.ll index 8789167e611e4..91d0a9ad37dc9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-expandloads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-expandloads.ll @@ -7673,7 +7673,7 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB14_18: // %cond.load57 ; VBITS_GE_256-NEXT: ld1 { v0.b }[15], [x0] ; VBITS_GE_256-NEXT: .LBB14_19: // %else58 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -7834,7 +7834,7 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-EXPAND-NEXT: ptrue p0.s, vl8 ; CHECK-EXPAND-NEXT: expand z0.b, p1, z0.b -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: sunpklo z0.h, z0.b ; CHECK-EXPAND-NEXT: sunpklo z1.h, z1.b ; CHECK-EXPAND-NEXT: sunpklo z0.s, z0.h @@ -7889,7 +7889,7 @@ define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov 
d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -7984,7 +7984,7 @@ define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ptrue p0.d, vl4 ; CHECK-EXPAND-NEXT: expand z0.b, p1, z0.b ; CHECK-EXPAND-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: sunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: sunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: sunpklo z0.d, z0.s @@ -8500,7 +8500,7 @@ define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB17_10: // %cond.load25 ; VBITS_GE_256-NEXT: ld1 { v0.h }[7], [x0] ; VBITS_GE_256-NEXT: .LBB17_11: // %else26 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 @@ -8595,7 +8595,7 @@ define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-EXPAND-NEXT: ptrue p0.d, vl4 ; CHECK-EXPAND-NEXT: expand z0.h, p1, z0.h -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: sunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: sunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: sunpklo z0.d, z0.s @@ -9879,7 +9879,7 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB20_18: // %cond.load57 ; VBITS_GE_256-NEXT: ld1 { v0.b }[15], [x0] ; VBITS_GE_256-NEXT: .LBB20_19: // %else58 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -10040,7 +10040,7 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ld1b { z0.b }, p0/z, [x0] ; 
CHECK-EXPAND-NEXT: ptrue p0.s, vl8 ; CHECK-EXPAND-NEXT: expand z0.b, p1, z0.b -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: uunpklo z0.h, z0.b ; CHECK-EXPAND-NEXT: uunpklo z1.h, z1.b ; CHECK-EXPAND-NEXT: uunpklo z0.s, z0.h @@ -10095,7 +10095,7 @@ define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -10190,7 +10190,7 @@ define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ptrue p0.d, vl4 ; CHECK-EXPAND-NEXT: expand z0.b, p1, z0.b ; CHECK-EXPAND-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: uunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: uunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: uunpklo z0.d, z0.s @@ -10706,7 +10706,7 @@ define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB23_10: // %cond.load25 ; VBITS_GE_256-NEXT: ld1 { v0.h }[7], [x0] ; VBITS_GE_256-NEXT: .LBB23_11: // %else26 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 @@ -10801,7 +10801,7 @@ define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-EXPAND-NEXT: ptrue p0.d, vl4 ; CHECK-EXPAND-NEXT: expand z0.h, p1, z0.h -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: uunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: uunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: uunpklo 
z0.d, z0.s @@ -12077,7 +12077,7 @@ define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB26_18: // %cond.load57 ; VBITS_GE_256-NEXT: ld1 { v0.b }[15], [x0] ; VBITS_GE_256-NEXT: .LBB26_19: // %else58 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b @@ -12288,7 +12288,7 @@ define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: whilelo p1.b, xzr, x9 ; CHECK-EXPAND-NEXT: ld1b { z0.b }, p1/z, [x0] ; CHECK-EXPAND-NEXT: expand z0.b, p2, z0.b -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: sunpklo z0.h, z0.b ; CHECK-EXPAND-NEXT: sunpklo z1.h, z1.b ; CHECK-EXPAND-NEXT: sunpklo z0.s, z0.h @@ -12368,7 +12368,7 @@ define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB27_11: // %else26 ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -12494,7 +12494,7 @@ define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ld1b { z0.b }, p1/z, [x0] ; CHECK-EXPAND-NEXT: expand z0.b, p2, z0.b ; CHECK-EXPAND-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: sunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: sunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: sunpklo z0.d, z0.s @@ -13023,7 +13023,7 @@ define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB29_10: // %cond.load25 ; VBITS_GE_256-NEXT: ld1 { v0.h }[7], [x0] ; VBITS_GE_256-NEXT: .LBB29_11: // 
%else26 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h @@ -13147,7 +13147,7 @@ define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: whilelo p1.h, xzr, x9 ; CHECK-EXPAND-NEXT: ld1h { z0.h }, p1/z, [x0] ; CHECK-EXPAND-NEXT: expand z0.h, p2, z0.h -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: sunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: sunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: sunpklo z0.d, z0.s @@ -14441,7 +14441,7 @@ define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB32_18: // %cond.load57 ; VBITS_GE_256-NEXT: ld1 { v0.b }[15], [x0] ; VBITS_GE_256-NEXT: .LBB32_19: // %else58 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b @@ -14652,7 +14652,7 @@ define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: whilelo p1.b, xzr, x9 ; CHECK-EXPAND-NEXT: ld1b { z0.b }, p1/z, [x0] ; CHECK-EXPAND-NEXT: expand z0.b, p2, z0.b -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: uunpklo z0.h, z0.b ; CHECK-EXPAND-NEXT: uunpklo z1.h, z1.b ; CHECK-EXPAND-NEXT: uunpklo z0.s, z0.h @@ -14732,7 +14732,7 @@ define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB33_11: // %else26 ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -14858,7 
+14858,7 @@ define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: ld1b { z0.b }, p1/z, [x0] ; CHECK-EXPAND-NEXT: expand z0.b, p2, z0.b ; CHECK-EXPAND-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: uunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: uunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: uunpklo z0.d, z0.s @@ -15387,7 +15387,7 @@ define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: .LBB35_10: // %cond.load25 ; VBITS_GE_256-NEXT: ld1 { v0.h }[7], [x0] ; VBITS_GE_256-NEXT: .LBB35_11: // %else26 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h @@ -15511,7 +15511,7 @@ define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; CHECK-EXPAND-NEXT: whilelo p1.h, xzr, x9 ; CHECK-EXPAND-NEXT: ld1h { z0.h }, p1/z, [x0] ; CHECK-EXPAND-NEXT: expand z0.h, p2, z0.h -; CHECK-EXPAND-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-EXPAND-NEXT: mov d1, v0.d[1] ; CHECK-EXPAND-NEXT: uunpklo z0.s, z0.h ; CHECK-EXPAND-NEXT: uunpklo z1.s, z1.h ; CHECK-EXPAND-NEXT: uunpklo z0.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index d49e0498aa908..06039a4eb7af0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -215,7 +215,7 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d0, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; 
VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 @@ -622,7 +622,7 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d0, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 2a3a8d00641ac..a898151a12ace 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -408,7 +408,7 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -443,7 +443,7 @@ define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -510,7 +510,7 @@ define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; 
VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -610,7 +610,7 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -645,7 +645,7 @@ define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -712,7 +712,7 @@ define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -831,7 +831,7 @@ define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] ; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p2/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -877,7 +877,7 @@ define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p2/z, 
[x0] ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -965,7 +965,7 @@ define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h ; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p2/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -1093,7 +1093,7 @@ define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] ; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p2/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -1139,7 +1139,7 @@ define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p2/z, [x0] ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -1227,7 +1227,7 @@ define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h ; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p2/z, [x0] -; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v0.d[1] ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; 
VBITS_GE_256-NEXT: uunpklo z0.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index 48d9d587164cf..b551cdf2110f5 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -204,10 +204,10 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d0, v0.d[1] ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h -; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v1.d[1] ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s @@ -580,10 +580,10 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: mov d0, v0.d[1] ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h -; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: mov d1, v1.d[1] ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll index 9372f2a82a795..88c0b34366809 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll @@ -54,7 +54,7 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half>) define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) nounwind { ; CHECK-LABEL: 
llrint_v8i64_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: frintx v0.4h, v0.4h ; CHECK-NEXT: frintx v1.4h, v1.4h ; CHECK-NEXT: mov h3, v0.h[2] @@ -88,56 +88,56 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half>) define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) nounwind { ; CHECK-LABEL: llrint_v16i64_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: frintx v0.4h, v0.4h +; CHECK-NEXT: mov d2, v1.d[1] ; CHECK-NEXT: frintx v1.4h, v1.4h +; CHECK-NEXT: mov d3, v0.d[1] +; CHECK-NEXT: frintx v0.4h, v0.4h ; CHECK-NEXT: frintx v2.4h, v2.4h +; CHECK-NEXT: mov h5, v1.h[1] ; CHECK-NEXT: frintx v3.4h, v3.4h -; CHECK-NEXT: mov h5, v0.h[2] ; CHECK-NEXT: mov h4, v1.h[2] -; CHECK-NEXT: mov h6, v0.h[1] +; CHECK-NEXT: mov h6, v0.h[2] ; CHECK-NEXT: fcvtzs x8, h1 -; CHECK-NEXT: mov h16, v0.h[3] +; CHECK-NEXT: mov h7, v0.h[3] +; CHECK-NEXT: mov h16, v0.h[1] ; CHECK-NEXT: fcvtzs x9, h0 -; CHECK-NEXT: mov h7, v1.h[1] ; CHECK-NEXT: mov h1, v1.h[3] ; CHECK-NEXT: mov h0, v2.h[3] ; CHECK-NEXT: mov h17, v2.h[2] ; CHECK-NEXT: fcvtzs x12, h5 ; CHECK-NEXT: mov h5, v3.h[2] +; CHECK-NEXT: fcvtzs x10, h4 ; CHECK-NEXT: fcvtzs x11, h2 ; CHECK-NEXT: mov h18, v3.h[3] ; CHECK-NEXT: fcvtzs x14, h3 ; CHECK-NEXT: mov h3, v3.h[1] -; CHECK-NEXT: mov h19, v2.h[1] -; CHECK-NEXT: fcvtzs x10, h4 -; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: mov h2, v2.h[1] ; CHECK-NEXT: fcvtzs x13, h6 -; CHECK-NEXT: fcvtzs x15, h0 -; CHECK-NEXT: fcvtzs x8, h17 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: fcvtzs x9, h5 -; CHECK-NEXT: fcvtzs x16, h7 -; CHECK-NEXT: fcvtzs x17, h16 +; CHECK-NEXT: fcvtzs x9, h17 +; CHECK-NEXT: fcvtzs x16, h5 +; CHECK-NEXT: fcvtzs x15, h16 +; CHECK-NEXT: fcvtzs x17, h7 ; CHECK-NEXT: fmov d6, x11 ; CHECK-NEXT: fcvtzs x11, h18 ; CHECK-NEXT: fcvtzs x18, h3 -; CHECK-NEXT: fmov d2, x14 -; 
CHECK-NEXT: fcvtzs x14, h19 -; CHECK-NEXT: fcvtzs x0, h1 ; CHECK-NEXT: fmov d5, x10 -; CHECK-NEXT: fmov d1, x12 -; CHECK-NEXT: fmov d7, x8 -; CHECK-NEXT: fmov d3, x9 -; CHECK-NEXT: mov v0.d[1], x13 -; CHECK-NEXT: mov v4.d[1], x16 -; CHECK-NEXT: mov v2.d[1], x18 +; CHECK-NEXT: fcvtzs x10, h2 +; CHECK-NEXT: fcvtzs x0, h1 +; CHECK-NEXT: fmov d2, x14 +; CHECK-NEXT: fmov d1, x13 +; CHECK-NEXT: fmov d7, x9 +; CHECK-NEXT: fmov d3, x16 +; CHECK-NEXT: mov v0.d[1], x15 +; CHECK-NEXT: mov v4.d[1], x12 ; CHECK-NEXT: mov v1.d[1], x17 +; CHECK-NEXT: mov v2.d[1], x18 ; CHECK-NEXT: mov v5.d[1], x0 -; CHECK-NEXT: mov v6.d[1], x14 ; CHECK-NEXT: mov v3.d[1], x11 -; CHECK-NEXT: mov v7.d[1], x15 +; CHECK-NEXT: mov v6.d[1], x10 +; CHECK-NEXT: mov v7.d[1], x8 ; CHECK-NEXT: ret %a = call <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half> %x) ret <16 x i64> %a @@ -151,118 +151,118 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) nounwind { ; CHECK-NEXT: sub x9, sp, #272 ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK-NEXT: frintx v5.4h, v0.4h -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v17.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: frintx v1.4h, v1.4h -; CHECK-NEXT: frintx v2.4h, v2.4h -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: mov h6, v5.h[3] +; CHECK-NEXT: mov d5, v0.d[1] ; CHECK-NEXT: frintx v0.4h, v0.4h -; CHECK-NEXT: mov h7, v5.h[2] -; CHECK-NEXT: mov h16, v5.h[1] -; CHECK-NEXT: frintx v4.4h, v4.4h -; CHECK-NEXT: fcvtzs x12, h5 -; CHECK-NEXT: ext v5.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: frintx v17.4h, v17.4h -; CHECK-NEXT: frintx v3.4h, v3.4h -; CHECK-NEXT: fcvtzs x9, h6 +; CHECK-NEXT: frintx v4.4h, v1.4h +; CHECK-NEXT: mov d1, v1.d[1] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: frintx v5.4h, v5.4h ; CHECK-NEXT: mov h6, v0.h[3] -; CHECK-NEXT: fcvtzs x10, h7 ; CHECK-NEXT: mov h7, v0.h[2] -; CHECK-NEXT: fcvtzs x11, h16 ; CHECK-NEXT: mov h16, v0.h[1] -; CHECK-NEXT: fcvtzs x13, 
h6 -; CHECK-NEXT: mov h6, v4.h[3] +; CHECK-NEXT: fcvtzs x12, h0 +; CHECK-NEXT: frintx v1.4h, v1.4h +; CHECK-NEXT: frintx v0.4h, v2.4h +; CHECK-NEXT: mov d2, v2.d[1] +; CHECK-NEXT: mov h17, v5.h[3] +; CHECK-NEXT: fcvtzs x9, h6 +; CHECK-NEXT: mov h6, v5.h[2] +; CHECK-NEXT: fcvtzs x10, h7 +; CHECK-NEXT: fcvtzs x11, h16 +; CHECK-NEXT: mov h7, v5.h[1] +; CHECK-NEXT: mov h16, v4.h[3] +; CHECK-NEXT: frintx v2.4h, v2.4h +; CHECK-NEXT: fcvtzs x13, h17 ; CHECK-NEXT: stp x10, x9, [sp, #48] -; CHECK-NEXT: fcvtzs x9, h7 -; CHECK-NEXT: mov h7, v4.h[2] -; CHECK-NEXT: fcvtzs x10, h16 -; CHECK-NEXT: mov h16, v4.h[1] +; CHECK-NEXT: fcvtzs x9, h6 +; CHECK-NEXT: mov h6, v4.h[2] +; CHECK-NEXT: fcvtzs x10, h7 +; CHECK-NEXT: mov h7, v4.h[1] ; CHECK-NEXT: stp x12, x11, [sp, #32] -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: frintx v0.4h, v5.4h -; CHECK-NEXT: mov h5, v17.h[3] -; CHECK-NEXT: fcvtzs x12, h6 -; CHECK-NEXT: mov h6, v17.h[2] +; CHECK-NEXT: fcvtzs x11, h5 +; CHECK-NEXT: fcvtzs x12, h16 +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: stp x9, x13, [sp, #16] -; CHECK-NEXT: fcvtzs x13, h7 -; CHECK-NEXT: mov h7, v17.h[1] -; CHECK-NEXT: fcvtzs x9, h16 +; CHECK-NEXT: fcvtzs x13, h6 +; CHECK-NEXT: mov h6, v1.h[2] +; CHECK-NEXT: fcvtzs x9, h7 +; CHECK-NEXT: mov h7, v0.h[3] ; CHECK-NEXT: stp x11, x10, [sp] ; CHECK-NEXT: fcvtzs x10, h4 +; CHECK-NEXT: mov h4, v1.h[1] ; CHECK-NEXT: fcvtzs x11, h5 -; CHECK-NEXT: mov h4, v0.h[3] ; CHECK-NEXT: mov h5, v0.h[2] -; CHECK-NEXT: stp x13, x12, [sp, #80] +; CHECK-NEXT: stp x13, x12, [sp, #112] ; CHECK-NEXT: fcvtzs x12, h6 +; CHECK-NEXT: frintx v6.4h, v3.4h ; CHECK-NEXT: fcvtzs x13, h7 -; CHECK-NEXT: mov h6, v0.h[1] -; CHECK-NEXT: stp x10, x9, [sp, #64] -; CHECK-NEXT: fcvtzs x9, h17 -; CHECK-NEXT: mov h7, v1.h[3] -; CHECK-NEXT: fcvtzs x10, h4 -; CHECK-NEXT: mov h4, v1.h[2] -; CHECK-NEXT: stp x12, x11, [sp, #144] +; CHECK-NEXT: mov d3, v3.d[1] +; CHECK-NEXT: stp x10, x9, [sp, #96] +; CHECK-NEXT: fcvtzs x9, h4 +; CHECK-NEXT: mov h4, v0.h[1] +; CHECK-NEXT: 
fcvtzs x10, h1 +; CHECK-NEXT: mov h1, v2.h[3] +; CHECK-NEXT: stp x12, x11, [sp, #80] ; CHECK-NEXT: fcvtzs x11, h5 -; CHECK-NEXT: mov h5, v1.h[1] -; CHECK-NEXT: fcvtzs x12, h6 -; CHECK-NEXT: stp x9, x13, [sp, #128] +; CHECK-NEXT: mov h5, v2.h[2] +; CHECK-NEXT: fcvtzs x12, h4 +; CHECK-NEXT: mov h4, v2.h[1] +; CHECK-NEXT: stp x10, x9, [sp, #64] ; CHECK-NEXT: fcvtzs x9, h0 -; CHECK-NEXT: fcvtzs x13, h7 -; CHECK-NEXT: mov h0, v2.h[3] -; CHECK-NEXT: stp x11, x10, [sp, #208] -; CHECK-NEXT: fcvtzs x10, h4 -; CHECK-NEXT: mov h4, v2.h[2] +; CHECK-NEXT: fcvtzs x10, h1 +; CHECK-NEXT: mov h0, v6.h[3] +; CHECK-NEXT: frintx v1.4h, v3.4h +; CHECK-NEXT: stp x11, x13, [sp, #176] ; CHECK-NEXT: fcvtzs x11, h5 -; CHECK-NEXT: mov h5, v2.h[1] -; CHECK-NEXT: stp x9, x12, [sp, #192] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: fcvtzs x12, h0 -; CHECK-NEXT: mov h0, v3.h[3] -; CHECK-NEXT: mov h1, v3.h[2] -; CHECK-NEXT: stp x10, x13, [sp, #112] -; CHECK-NEXT: fcvtzs x10, h4 -; CHECK-NEXT: mov h4, v3.h[1] -; CHECK-NEXT: fcvtzs x13, h5 -; CHECK-NEXT: stp x9, x11, [sp, #96] +; CHECK-NEXT: mov h3, v6.h[2] +; CHECK-NEXT: fcvtzs x13, h4 +; CHECK-NEXT: mov h4, v6.h[1] +; CHECK-NEXT: stp x9, x12, [sp, #160] ; CHECK-NEXT: fcvtzs x9, h2 -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: stp x10, x12, [sp, #176] -; CHECK-NEXT: fcvtzs x10, h1 -; CHECK-NEXT: fcvtzs x12, h4 -; CHECK-NEXT: stp x9, x13, [sp, #160] -; CHECK-NEXT: fcvtzs x9, h3 -; CHECK-NEXT: stp x10, x11, [sp, #240] -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: stp x9, x12, [sp, #224] +; CHECK-NEXT: fcvtzs x12, h0 +; CHECK-NEXT: mov h0, v1.h[3] +; CHECK-NEXT: mov h2, v1.h[2] +; CHECK-NEXT: stp x11, x10, [sp, #144] +; CHECK-NEXT: fcvtzs x10, h3 +; CHECK-NEXT: mov h3, v1.h[1] +; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: stp x9, x13, [sp, #128] +; CHECK-NEXT: fcvtzs x9, h6 +; CHECK-NEXT: fcvtzs x13, h0 +; CHECK-NEXT: stp x10, x12, [sp, #240] +; CHECK-NEXT: fcvtzs x10, h2 +; CHECK-NEXT: fcvtzs x12, h3 +; CHECK-NEXT: stp x9, x11, [sp, #224] +; 
CHECK-NEXT: fcvtzs x9, h1 +; CHECK-NEXT: stp x10, x13, [sp, #208] +; CHECK-NEXT: add x10, sp, #96 +; CHECK-NEXT: stp x9, x12, [sp, #192] ; CHECK-NEXT: add x9, sp, #32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9] ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] -; CHECK-NEXT: add x9, sp, #224 -; CHECK-NEXT: add x10, sp, #128 +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9] -; CHECK-NEXT: add x9, sp, #160 +; CHECK-NEXT: add x9, sp, #224 ; CHECK-NEXT: ld1d { z4.d }, p0/z, [x10] -; CHECK-NEXT: add x10, sp, #96 +; CHECK-NEXT: add x10, sp, #128 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x9] -; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: add x9, sp, #160 ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x10] -; CHECK-NEXT: mov x10, #24 // =0x18 +; CHECK-NEXT: mov x10, #28 // =0x1c ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x9] -; CHECK-NEXT: mov x9, #16 // =0x10 +; CHECK-NEXT: mov x9, #24 // =0x18 ; CHECK-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3] ; CHECK-NEXT: st1d { z5.d }, p0, [x8, x9, lsl #3] -; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x9, #20 // =0x14 ; CHECK-NEXT: st1d { z6.d }, p0, [x8, x9, lsl #3] -; CHECK-NEXT: mov x9, #28 // =0x1c +; CHECK-NEXT: mov x9, #16 // =0x10 ; CHECK-NEXT: st1d { z7.d }, p0, [x8, x9, lsl #3] -; CHECK-NEXT: mov x9, #20 // =0x14 -; CHECK-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] ; CHECK-NEXT: mov x9, #12 // =0xc +; CHECK-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] +; CHECK-NEXT: mov x9, #8 // =0x8 ; CHECK-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] ; CHECK-NEXT: mov x9, #4 // =0x4 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll index 2d7fe4a22ad0a..c9553388a5d4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll @@ -112,7 +112,7 @@ define <8 x iXLen> @lrint_v8f16(<8 x 
half> %x) nounwind { ; ; CHECK-i64-LABEL: lrint_v8f16: ; CHECK-i64: // %bb.0: -; CHECK-i64-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-i64-NEXT: mov d1, v0.d[1] ; CHECK-i64-NEXT: frintx v0.4h, v0.4h ; CHECK-i64-NEXT: frintx v1.4h, v1.4h ; CHECK-i64-NEXT: mov h3, v0.h[2] @@ -198,56 +198,56 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) nounwind { ; ; CHECK-i64-LABEL: lrint_v16f16: ; CHECK-i64: // %bb.0: -; CHECK-i64-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-i64-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-i64-NEXT: frintx v0.4h, v0.4h +; CHECK-i64-NEXT: mov d2, v1.d[1] ; CHECK-i64-NEXT: frintx v1.4h, v1.4h +; CHECK-i64-NEXT: mov d3, v0.d[1] +; CHECK-i64-NEXT: frintx v0.4h, v0.4h ; CHECK-i64-NEXT: frintx v2.4h, v2.4h +; CHECK-i64-NEXT: mov h5, v1.h[1] ; CHECK-i64-NEXT: frintx v3.4h, v3.4h -; CHECK-i64-NEXT: mov h5, v0.h[2] ; CHECK-i64-NEXT: mov h4, v1.h[2] -; CHECK-i64-NEXT: mov h6, v0.h[1] +; CHECK-i64-NEXT: mov h6, v0.h[2] ; CHECK-i64-NEXT: fcvtzs x8, h1 -; CHECK-i64-NEXT: mov h16, v0.h[3] +; CHECK-i64-NEXT: mov h7, v0.h[3] +; CHECK-i64-NEXT: mov h16, v0.h[1] ; CHECK-i64-NEXT: fcvtzs x9, h0 -; CHECK-i64-NEXT: mov h7, v1.h[1] ; CHECK-i64-NEXT: mov h1, v1.h[3] ; CHECK-i64-NEXT: mov h0, v2.h[3] ; CHECK-i64-NEXT: mov h17, v2.h[2] ; CHECK-i64-NEXT: fcvtzs x12, h5 ; CHECK-i64-NEXT: mov h5, v3.h[2] +; CHECK-i64-NEXT: fcvtzs x10, h4 ; CHECK-i64-NEXT: fcvtzs x11, h2 ; CHECK-i64-NEXT: mov h18, v3.h[3] ; CHECK-i64-NEXT: fcvtzs x14, h3 ; CHECK-i64-NEXT: mov h3, v3.h[1] -; CHECK-i64-NEXT: mov h19, v2.h[1] -; CHECK-i64-NEXT: fcvtzs x10, h4 -; CHECK-i64-NEXT: fmov d4, x8 +; CHECK-i64-NEXT: mov h2, v2.h[1] ; CHECK-i64-NEXT: fcvtzs x13, h6 -; CHECK-i64-NEXT: fcvtzs x15, h0 -; CHECK-i64-NEXT: fcvtzs x8, h17 +; CHECK-i64-NEXT: fmov d4, x8 +; CHECK-i64-NEXT: fcvtzs x8, h0 ; CHECK-i64-NEXT: fmov d0, x9 -; CHECK-i64-NEXT: fcvtzs x9, h5 -; CHECK-i64-NEXT: fcvtzs x16, h7 -; CHECK-i64-NEXT: fcvtzs x17, h16 +; CHECK-i64-NEXT: fcvtzs x9, h17 +; CHECK-i64-NEXT: fcvtzs x16, h5 +; 
CHECK-i64-NEXT: fcvtzs x15, h16 +; CHECK-i64-NEXT: fcvtzs x17, h7 ; CHECK-i64-NEXT: fmov d6, x11 ; CHECK-i64-NEXT: fcvtzs x11, h18 ; CHECK-i64-NEXT: fcvtzs x18, h3 -; CHECK-i64-NEXT: fmov d2, x14 -; CHECK-i64-NEXT: fcvtzs x14, h19 -; CHECK-i64-NEXT: fcvtzs x0, h1 ; CHECK-i64-NEXT: fmov d5, x10 -; CHECK-i64-NEXT: fmov d1, x12 -; CHECK-i64-NEXT: fmov d7, x8 -; CHECK-i64-NEXT: fmov d3, x9 -; CHECK-i64-NEXT: mov v0.d[1], x13 -; CHECK-i64-NEXT: mov v4.d[1], x16 -; CHECK-i64-NEXT: mov v2.d[1], x18 +; CHECK-i64-NEXT: fcvtzs x10, h2 +; CHECK-i64-NEXT: fcvtzs x0, h1 +; CHECK-i64-NEXT: fmov d2, x14 +; CHECK-i64-NEXT: fmov d1, x13 +; CHECK-i64-NEXT: fmov d7, x9 +; CHECK-i64-NEXT: fmov d3, x16 +; CHECK-i64-NEXT: mov v0.d[1], x15 +; CHECK-i64-NEXT: mov v4.d[1], x12 ; CHECK-i64-NEXT: mov v1.d[1], x17 +; CHECK-i64-NEXT: mov v2.d[1], x18 ; CHECK-i64-NEXT: mov v5.d[1], x0 -; CHECK-i64-NEXT: mov v6.d[1], x14 ; CHECK-i64-NEXT: mov v3.d[1], x11 -; CHECK-i64-NEXT: mov v7.d[1], x15 +; CHECK-i64-NEXT: mov v6.d[1], x10 +; CHECK-i64-NEXT: mov v7.d[1], x8 ; CHECK-i64-NEXT: ret %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half> %x) ret <16 x iXLen> %a @@ -363,118 +363,118 @@ define <32 x iXLen> @lrint_v32f16(<32 x half> %x) nounwind { ; CHECK-i64-NEXT: sub x9, sp, #272 ; CHECK-i64-NEXT: mov x29, sp ; CHECK-i64-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK-i64-NEXT: frintx v5.4h, v0.4h -; CHECK-i64-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-i64-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-i64-NEXT: ext v17.16b, v2.16b, v2.16b, #8 -; CHECK-i64-NEXT: frintx v1.4h, v1.4h -; CHECK-i64-NEXT: frintx v2.4h, v2.4h -; CHECK-i64-NEXT: ptrue p0.d, vl4 -; CHECK-i64-NEXT: mov h6, v5.h[3] +; CHECK-i64-NEXT: mov d5, v0.d[1] ; CHECK-i64-NEXT: frintx v0.4h, v0.4h -; CHECK-i64-NEXT: mov h7, v5.h[2] -; CHECK-i64-NEXT: mov h16, v5.h[1] -; CHECK-i64-NEXT: frintx v4.4h, v4.4h -; CHECK-i64-NEXT: fcvtzs x12, h5 -; CHECK-i64-NEXT: ext v5.16b, v3.16b, v3.16b, #8 -; CHECK-i64-NEXT: frintx v17.4h, 
v17.4h -; CHECK-i64-NEXT: frintx v3.4h, v3.4h -; CHECK-i64-NEXT: fcvtzs x9, h6 +; CHECK-i64-NEXT: frintx v4.4h, v1.4h +; CHECK-i64-NEXT: mov d1, v1.d[1] +; CHECK-i64-NEXT: ptrue p0.d, vl4 +; CHECK-i64-NEXT: frintx v5.4h, v5.4h ; CHECK-i64-NEXT: mov h6, v0.h[3] -; CHECK-i64-NEXT: fcvtzs x10, h7 ; CHECK-i64-NEXT: mov h7, v0.h[2] -; CHECK-i64-NEXT: fcvtzs x11, h16 ; CHECK-i64-NEXT: mov h16, v0.h[1] -; CHECK-i64-NEXT: fcvtzs x13, h6 -; CHECK-i64-NEXT: mov h6, v4.h[3] +; CHECK-i64-NEXT: fcvtzs x12, h0 +; CHECK-i64-NEXT: frintx v1.4h, v1.4h +; CHECK-i64-NEXT: frintx v0.4h, v2.4h +; CHECK-i64-NEXT: mov d2, v2.d[1] +; CHECK-i64-NEXT: mov h17, v5.h[3] +; CHECK-i64-NEXT: fcvtzs x9, h6 +; CHECK-i64-NEXT: mov h6, v5.h[2] +; CHECK-i64-NEXT: fcvtzs x10, h7 +; CHECK-i64-NEXT: fcvtzs x11, h16 +; CHECK-i64-NEXT: mov h7, v5.h[1] +; CHECK-i64-NEXT: mov h16, v4.h[3] +; CHECK-i64-NEXT: frintx v2.4h, v2.4h +; CHECK-i64-NEXT: fcvtzs x13, h17 ; CHECK-i64-NEXT: stp x10, x9, [sp, #48] -; CHECK-i64-NEXT: fcvtzs x9, h7 -; CHECK-i64-NEXT: mov h7, v4.h[2] -; CHECK-i64-NEXT: fcvtzs x10, h16 -; CHECK-i64-NEXT: mov h16, v4.h[1] +; CHECK-i64-NEXT: fcvtzs x9, h6 +; CHECK-i64-NEXT: mov h6, v4.h[2] +; CHECK-i64-NEXT: fcvtzs x10, h7 +; CHECK-i64-NEXT: mov h7, v4.h[1] ; CHECK-i64-NEXT: stp x12, x11, [sp, #32] -; CHECK-i64-NEXT: fcvtzs x11, h0 -; CHECK-i64-NEXT: frintx v0.4h, v5.4h -; CHECK-i64-NEXT: mov h5, v17.h[3] -; CHECK-i64-NEXT: fcvtzs x12, h6 -; CHECK-i64-NEXT: mov h6, v17.h[2] +; CHECK-i64-NEXT: fcvtzs x11, h5 +; CHECK-i64-NEXT: fcvtzs x12, h16 +; CHECK-i64-NEXT: mov h5, v1.h[3] ; CHECK-i64-NEXT: stp x9, x13, [sp, #16] -; CHECK-i64-NEXT: fcvtzs x13, h7 -; CHECK-i64-NEXT: mov h7, v17.h[1] -; CHECK-i64-NEXT: fcvtzs x9, h16 +; CHECK-i64-NEXT: fcvtzs x13, h6 +; CHECK-i64-NEXT: mov h6, v1.h[2] +; CHECK-i64-NEXT: fcvtzs x9, h7 +; CHECK-i64-NEXT: mov h7, v0.h[3] ; CHECK-i64-NEXT: stp x11, x10, [sp] ; CHECK-i64-NEXT: fcvtzs x10, h4 +; CHECK-i64-NEXT: mov h4, v1.h[1] ; CHECK-i64-NEXT: fcvtzs x11, h5 -; 
CHECK-i64-NEXT: mov h4, v0.h[3] ; CHECK-i64-NEXT: mov h5, v0.h[2] -; CHECK-i64-NEXT: stp x13, x12, [sp, #80] +; CHECK-i64-NEXT: stp x13, x12, [sp, #112] ; CHECK-i64-NEXT: fcvtzs x12, h6 +; CHECK-i64-NEXT: frintx v6.4h, v3.4h ; CHECK-i64-NEXT: fcvtzs x13, h7 -; CHECK-i64-NEXT: mov h6, v0.h[1] -; CHECK-i64-NEXT: stp x10, x9, [sp, #64] -; CHECK-i64-NEXT: fcvtzs x9, h17 -; CHECK-i64-NEXT: mov h7, v1.h[3] -; CHECK-i64-NEXT: fcvtzs x10, h4 -; CHECK-i64-NEXT: mov h4, v1.h[2] -; CHECK-i64-NEXT: stp x12, x11, [sp, #144] +; CHECK-i64-NEXT: mov d3, v3.d[1] +; CHECK-i64-NEXT: stp x10, x9, [sp, #96] +; CHECK-i64-NEXT: fcvtzs x9, h4 +; CHECK-i64-NEXT: mov h4, v0.h[1] +; CHECK-i64-NEXT: fcvtzs x10, h1 +; CHECK-i64-NEXT: mov h1, v2.h[3] +; CHECK-i64-NEXT: stp x12, x11, [sp, #80] ; CHECK-i64-NEXT: fcvtzs x11, h5 -; CHECK-i64-NEXT: mov h5, v1.h[1] -; CHECK-i64-NEXT: fcvtzs x12, h6 -; CHECK-i64-NEXT: stp x9, x13, [sp, #128] +; CHECK-i64-NEXT: mov h5, v2.h[2] +; CHECK-i64-NEXT: fcvtzs x12, h4 +; CHECK-i64-NEXT: mov h4, v2.h[1] +; CHECK-i64-NEXT: stp x10, x9, [sp, #64] ; CHECK-i64-NEXT: fcvtzs x9, h0 -; CHECK-i64-NEXT: fcvtzs x13, h7 -; CHECK-i64-NEXT: mov h0, v2.h[3] -; CHECK-i64-NEXT: stp x11, x10, [sp, #208] -; CHECK-i64-NEXT: fcvtzs x10, h4 -; CHECK-i64-NEXT: mov h4, v2.h[2] +; CHECK-i64-NEXT: fcvtzs x10, h1 +; CHECK-i64-NEXT: mov h0, v6.h[3] +; CHECK-i64-NEXT: frintx v1.4h, v3.4h +; CHECK-i64-NEXT: stp x11, x13, [sp, #176] ; CHECK-i64-NEXT: fcvtzs x11, h5 -; CHECK-i64-NEXT: mov h5, v2.h[1] -; CHECK-i64-NEXT: stp x9, x12, [sp, #192] -; CHECK-i64-NEXT: fcvtzs x9, h1 -; CHECK-i64-NEXT: fcvtzs x12, h0 -; CHECK-i64-NEXT: mov h0, v3.h[3] -; CHECK-i64-NEXT: mov h1, v3.h[2] -; CHECK-i64-NEXT: stp x10, x13, [sp, #112] -; CHECK-i64-NEXT: fcvtzs x10, h4 -; CHECK-i64-NEXT: mov h4, v3.h[1] -; CHECK-i64-NEXT: fcvtzs x13, h5 -; CHECK-i64-NEXT: stp x9, x11, [sp, #96] +; CHECK-i64-NEXT: mov h3, v6.h[2] +; CHECK-i64-NEXT: fcvtzs x13, h4 +; CHECK-i64-NEXT: mov h4, v6.h[1] +; CHECK-i64-NEXT: stp x9, 
x12, [sp, #160] ; CHECK-i64-NEXT: fcvtzs x9, h2 -; CHECK-i64-NEXT: fcvtzs x11, h0 -; CHECK-i64-NEXT: stp x10, x12, [sp, #176] -; CHECK-i64-NEXT: fcvtzs x10, h1 -; CHECK-i64-NEXT: fcvtzs x12, h4 -; CHECK-i64-NEXT: stp x9, x13, [sp, #160] -; CHECK-i64-NEXT: fcvtzs x9, h3 -; CHECK-i64-NEXT: stp x10, x11, [sp, #240] -; CHECK-i64-NEXT: add x10, sp, #64 -; CHECK-i64-NEXT: stp x9, x12, [sp, #224] +; CHECK-i64-NEXT: fcvtzs x12, h0 +; CHECK-i64-NEXT: mov h0, v1.h[3] +; CHECK-i64-NEXT: mov h2, v1.h[2] +; CHECK-i64-NEXT: stp x11, x10, [sp, #144] +; CHECK-i64-NEXT: fcvtzs x10, h3 +; CHECK-i64-NEXT: mov h3, v1.h[1] +; CHECK-i64-NEXT: fcvtzs x11, h4 +; CHECK-i64-NEXT: stp x9, x13, [sp, #128] +; CHECK-i64-NEXT: fcvtzs x9, h6 +; CHECK-i64-NEXT: fcvtzs x13, h0 +; CHECK-i64-NEXT: stp x10, x12, [sp, #240] +; CHECK-i64-NEXT: fcvtzs x10, h2 +; CHECK-i64-NEXT: fcvtzs x12, h3 +; CHECK-i64-NEXT: stp x9, x11, [sp, #224] +; CHECK-i64-NEXT: fcvtzs x9, h1 +; CHECK-i64-NEXT: stp x10, x13, [sp, #208] +; CHECK-i64-NEXT: add x10, sp, #96 +; CHECK-i64-NEXT: stp x9, x12, [sp, #192] ; CHECK-i64-NEXT: add x9, sp, #32 ; CHECK-i64-NEXT: ld1d { z0.d }, p0/z, [x9] ; CHECK-i64-NEXT: mov x9, sp ; CHECK-i64-NEXT: ld1d { z2.d }, p0/z, [x10] ; CHECK-i64-NEXT: ld1d { z1.d }, p0/z, [x9] -; CHECK-i64-NEXT: add x9, sp, #224 -; CHECK-i64-NEXT: add x10, sp, #128 +; CHECK-i64-NEXT: add x9, sp, #192 +; CHECK-i64-NEXT: add x10, sp, #64 ; CHECK-i64-NEXT: ld1d { z3.d }, p0/z, [x9] -; CHECK-i64-NEXT: add x9, sp, #160 +; CHECK-i64-NEXT: add x9, sp, #224 ; CHECK-i64-NEXT: ld1d { z4.d }, p0/z, [x10] -; CHECK-i64-NEXT: add x10, sp, #96 +; CHECK-i64-NEXT: add x10, sp, #128 ; CHECK-i64-NEXT: ld1d { z5.d }, p0/z, [x9] -; CHECK-i64-NEXT: add x9, sp, #192 +; CHECK-i64-NEXT: add x9, sp, #160 ; CHECK-i64-NEXT: ld1d { z6.d }, p0/z, [x10] -; CHECK-i64-NEXT: mov x10, #24 // =0x18 +; CHECK-i64-NEXT: mov x10, #28 // =0x1c ; CHECK-i64-NEXT: ld1d { z7.d }, p0/z, [x9] -; CHECK-i64-NEXT: mov x9, #16 // =0x10 +; CHECK-i64-NEXT: mov x9, #24 
// =0x18 ; CHECK-i64-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3] ; CHECK-i64-NEXT: st1d { z5.d }, p0, [x8, x9, lsl #3] -; CHECK-i64-NEXT: mov x9, #8 // =0x8 +; CHECK-i64-NEXT: mov x9, #20 // =0x14 ; CHECK-i64-NEXT: st1d { z6.d }, p0, [x8, x9, lsl #3] -; CHECK-i64-NEXT: mov x9, #28 // =0x1c +; CHECK-i64-NEXT: mov x9, #16 // =0x10 ; CHECK-i64-NEXT: st1d { z7.d }, p0, [x8, x9, lsl #3] -; CHECK-i64-NEXT: mov x9, #20 // =0x14 -; CHECK-i64-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] ; CHECK-i64-NEXT: mov x9, #12 // =0xc +; CHECK-i64-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] +; CHECK-i64-NEXT: mov x9, #8 // =0x8 ; CHECK-i64-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] ; CHECK-i64-NEXT: mov x9, #4 // =0x4 ; CHECK-i64-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 9bd4714667b12..0866701c34921 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -1005,7 +1005,7 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: umov.b w8, v0[1] -; CHECK-GI-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-GI-NEXT: mov d1, v0[1] ; CHECK-GI-NEXT: umov.b w10, v0[1] ; CHECK-GI-NEXT: umov.b w9, v0[0] ; CHECK-GI-NEXT: umov.b w13, v0[0] diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index 2f51208e49351..df78c2394a942 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -141,7 +141,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; CHECK-NEXT: add v4.16b, v0.16b, v1.16b ; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b ; CHECK-NEXT: str q4, [x0] -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-NEXT: zip2 v0.8b, 
v0.8b, v0.8b ; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll index 935f4272218af..0a3d94d8ba952 100644 --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -169,7 +169,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; CHECK-NEXT: uzp2 v2.16b, v3.16b, v2.16b ; CHECK-NEXT: str q6, [x0] ; CHECK-NEXT: cmtst v2.16b, v2.16b, v2.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: zip1 v4.8b, v2.8b, v0.8b ; CHECK-NEXT: zip2 v2.8b, v2.8b, v0.8b ; CHECK-NEXT: zip1 v5.8b, v3.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll index ac54dd41b0962..7fbe6e55cc968 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -103,7 +103,7 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0xffffffffffffff00 ; CHECK-NEXT: orr v1.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -121,7 +121,7 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind { ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-NEXT: mov v1.s[3], w8 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -147,7 +147,7 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind { define i24 @test_v4i24(<4 x i24> %a) nounwind { ; CHECK-LABEL: test_v4i24: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 
@@ -173,7 +173,7 @@ define i32 @test_v16i32(<16 x i32> %a) nounwind { ; CHECK-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bitext.ll b/llvm/test/CodeGen/AArch64/vecreduce-bitext.ll index cc65f17b71864..01606f7ea07b6 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-bitext.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-bitext.ll @@ -37,7 +37,7 @@ entry: define zeroext i16 @and_sext_v16i8_i16(<16 x i8> %x) { ; CHECK-LABEL: and_sext_v16i8_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -56,7 +56,7 @@ entry: define zeroext i16 @and_zext_v16i8_i16(<16 x i8> %x) { ; CHECK-LABEL: and_zext_v16i8_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -106,7 +106,7 @@ entry: define i32 @and_sext_v16i8_i32(<16 x i8> %x) { ; CHECK-LABEL: and_sext_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -124,7 +124,7 @@ entry: define i32 @and_zext_v16i8_i32(<16 x i8> %x) { ; CHECK-LABEL: and_zext_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -174,7 +174,7 @@ entry: define i64 @and_sext_v16i8_i64(<16 x i8> %x) { ; CHECK-LABEL: 
and_sext_v16i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -192,7 +192,7 @@ entry: define i64 @and_zext_v16i8_i64(<16 x i8> %x) { ; CHECK-LABEL: and_zext_v16i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -239,7 +239,7 @@ entry: define i32 @and_sext_v8i16_i32(<8 x i16> %x) { ; CHECK-LABEL: and_sext_v8i16_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -256,7 +256,7 @@ entry: define i32 @and_zext_v8i16_i32(<8 x i16> %x) { ; CHECK-LABEL: and_zext_v8i16_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -301,7 +301,7 @@ entry: define i64 @and_sext_v8i16_i64(<8 x i16> %x) { ; CHECK-LABEL: and_sext_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -318,7 +318,7 @@ entry: define i64 @and_zext_v8i16_i64(<8 x i16> %x) { ; CHECK-LABEL: and_zext_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: and x8, x8, x8, lsr #32 @@ -361,7 +361,7 @@ entry: define i64 @and_sext_v4i32_i64(<4 x i32> %x) { ; CHECK-LABEL: and_sext_v4i32_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov 
d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -377,7 +377,7 @@ entry: define i64 @and_zext_v4i32_i64(<4 x i32> %x) { ; CHECK-LABEL: and_zext_v4i32_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -425,7 +425,7 @@ entry: define zeroext i16 @or_sext_v16i8_i16(<16 x i8> %x) { ; CHECK-LABEL: or_sext_v16i8_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: orr x8, x8, x8, lsr #32 @@ -444,7 +444,7 @@ entry: define zeroext i16 @or_zext_v16i8_i16(<16 x i8> %x) { ; CHECK-LABEL: or_zext_v16i8_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: orr x8, x8, x8, lsr #32 @@ -494,7 +494,7 @@ entry: define i32 @or_sext_v16i8_i32(<16 x i8> %x) { ; CHECK-LABEL: or_sext_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: orr x8, x8, x8, lsr #32 @@ -512,7 +512,7 @@ entry: define i32 @or_zext_v16i8_i32(<16 x i8> %x) { ; CHECK-LABEL: or_zext_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: orr x8, x8, x8, lsr #32 @@ -562,7 +562,7 @@ entry: define i64 @or_sext_v16i8_i64(<16 x i8> %x) { ; CHECK-LABEL: or_sext_v16i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: orr x8, x8, x8, 
lsr #32 @@ -580,7 +580,7 @@ entry: define i64 @or_zext_v16i8_i64(<16 x i8> %x) { ; CHECK-LABEL: or_zext_v16i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: orr x8, x8, x8, lsr #32 @@ -628,7 +628,7 @@ entry: define i32 @or_sext_v8i16_i32(<8 x i16> %x) { ; CHECK-LABEL: or_sext_v8i16_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -645,7 +645,7 @@ entry: define i32 @or_zext_v8i16_i32(<8 x i16> %x) { ; CHECK-LABEL: or_zext_v8i16_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -692,7 +692,7 @@ entry: define i64 @or_sext_v8i16_i64(<8 x i16> %x) { ; CHECK-LABEL: or_sext_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -709,7 +709,7 @@ entry: define i64 @or_zext_v8i16_i64(<8 x i16> %x) { ; CHECK-LABEL: or_zext_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -753,7 +753,7 @@ entry: define i64 @or_sext_v4i32_i64(<4 x i32> %x) { ; CHECK-LABEL: or_sext_v4i32_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -769,7 +769,7 @@ entry: define i64 @or_zext_v4i32_i64(<4 x i32> %x) { ; CHECK-LABEL: or_zext_v4i32_i64: ; CHECK: // %bb.0: // %entry -; 
CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -817,7 +817,7 @@ entry: define zeroext i16 @xor_sext_v16i8_i16(<16 x i8> %x) { ; CHECK-LABEL: xor_sext_v16i8_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: eor x8, x8, x8, lsr #32 @@ -836,7 +836,7 @@ entry: define zeroext i16 @xor_zext_v16i8_i16(<16 x i8> %x) { ; CHECK-LABEL: xor_zext_v16i8_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: eor x8, x8, x8, lsr #32 @@ -886,7 +886,7 @@ entry: define i32 @xor_sext_v16i8_i32(<16 x i8> %x) { ; CHECK-LABEL: xor_sext_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: eor x8, x8, x8, lsr #32 @@ -904,7 +904,7 @@ entry: define i32 @xor_zext_v16i8_i32(<16 x i8> %x) { ; CHECK-LABEL: xor_zext_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: eor x8, x8, x8, lsr #32 @@ -954,7 +954,7 @@ entry: define i64 @xor_sext_v16i8_i64(<16 x i8> %x) { ; CHECK-LABEL: xor_sext_v16i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: eor x8, x8, x8, lsr #32 @@ -972,7 +972,7 @@ entry: define i64 @xor_zext_v16i8_i64(<16 x i8> %x) { ; CHECK-LABEL: xor_zext_v16i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: 
eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: eor x8, x8, x8, lsr #32 @@ -1020,7 +1020,7 @@ entry: define i32 @xor_sext_v8i16_i32(<8 x i16> %x) { ; CHECK-LABEL: xor_sext_v8i16_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -1037,7 +1037,7 @@ entry: define i32 @xor_zext_v8i16_i32(<8 x i16> %x) { ; CHECK-LABEL: xor_zext_v8i16_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -1084,7 +1084,7 @@ entry: define i64 @xor_sext_v8i16_i64(<8 x i16> %x) { ; CHECK-LABEL: xor_sext_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -1101,7 +1101,7 @@ entry: define i64 @xor_zext_v8i16_i64(<8 x i16> %x) { ; CHECK-LABEL: xor_zext_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -1145,7 +1145,7 @@ entry: define i64 @xor_sext_v4i32_i64(<4 x i32> %x) { ; CHECK-LABEL: xor_sext_v4i32_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 @@ -1161,7 +1161,7 @@ entry: define i64 @xor_zext_v4i32_i64(<4 x i32> %x) { ; CHECK-LABEL: xor_zext_v4i32_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: lsr x9, x8, #32 diff --git 
a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll index 8a68c90cd2660..8e7316c0d4bb9 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll @@ -50,7 +50,7 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; CHECK-GI-NOFP16-LABEL: mul_HalfH: ; CHECK-GI-NOFP16: // %bb.0: ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h -; CHECK-GI-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 @@ -78,7 +78,7 @@ define half @mul_H(<8 x half> %bin.rdx) { ; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v0.8h ; CHECK-SD-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-SD-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 @@ -86,7 +86,7 @@ define half @mul_H(<8 x half> %bin.rdx) { ; ; CHECK-SD-FP16-LABEL: mul_H: ; CHECK-SD-FP16: // %bb.0: -; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-FP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h ; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1] ; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h @@ -98,7 +98,7 @@ define half @mul_H(<8 x half> %bin.rdx) { ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 @@ -107,7 +107,7 @@ define half @mul_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-FP16-LABEL: mul_H: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: ext v1.16b, v0.16b, 
v0.16b, #8 +; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] @@ -123,7 +123,7 @@ define half @mul_H(<8 x half> %bin.rdx) { define float @mul_S(<4 x float> %bin.rdx) { ; CHECK-SD-LABEL: mul_S: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -131,7 +131,7 @@ define float @mul_S(<4 x float> %bin.rdx) { ; ; CHECK-GI-LABEL: mul_S: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: fmul s0, s0, s1 @@ -163,7 +163,7 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-SD-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-SD-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-SD-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 @@ -172,7 +172,7 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-SD-FP16-LABEL: mul_2H: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h -; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-FP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h ; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1] ; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h @@ -188,7 +188,7 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v2.4s, v0.4s ; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v1.4s ; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] ; 
CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] ; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 @@ -198,7 +198,7 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-GI-FP16-LABEL: mul_2H: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h -; CHECK-GI-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] @@ -215,7 +215,7 @@ define float @mul_2S(<8 x float> %bin.rdx) { ; CHECK-SD-LABEL: mul_2S: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -224,7 +224,7 @@ define float @mul_2S(<8 x float> %bin.rdx) { ; CHECK-GI-LABEL: mul_2S: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: fmul s0, s0, s1 @@ -247,7 +247,7 @@ define double @mul_2D(<4 x double> %bin.rdx) { define float @mul_S_init_42(<4 x float> %bin.rdx) { ; CHECK-SD-LABEL: mul_S_init_42: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000 ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmov s1, w8 @@ -257,7 +257,7 @@ define float @mul_S_init_42(<4 x float> %bin.rdx) { ; ; CHECK-GI-LABEL: mul_S_init_42: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000 ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] @@ -279,8 +279,8 @@ define half 
@fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-SD-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v2.4s ; CHECK-SD-NOFP16-NEXT: fmul v1.4s, v1.4s, v3.4s -; CHECK-SD-NOFP16-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NOFP16-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NOFP16-NEXT: mov d2, v0.d[1] +; CHECK-SD-NOFP16-NEXT: mov d3, v1.d[1] ; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-SD-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-SD-NOFP16-NEXT: fmul v0.2s, v0.2s, v0.s[1] @@ -296,7 +296,7 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-SD-FP16-LABEL: fmul_reduct_reassoc_v8f16: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h -; CHECK-SD-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-FP16-NEXT: mov d1, v0.d[1] ; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h ; CHECK-SD-FP16-NEXT: dup v1.2s, v0.s[1] ; CHECK-SD-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h @@ -311,8 +311,8 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h ; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v2.4s, v0.4s ; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NOFP16-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NOFP16-NEXT: mov d2, v0.d[1] +; CHECK-GI-NOFP16-NEXT: mov d3, v1.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] @@ -329,8 +329,8 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; ; CHECK-GI-FP16-LABEL: fmul_reduct_reassoc_v8f16: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-FP16-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-FP16-NEXT: mov d2, v0.d[1] +; CHECK-GI-FP16-NEXT: mov d3, v1.d[1] ; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v2.4h ; CHECK-GI-FP16-NEXT: fmul v1.4h, v1.4h, 
v3.4h ; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] @@ -359,7 +359,7 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-SD-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -369,8 +369,8 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: fmul v1.4s, v2.4s, v3.4s -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov s2, v0.s[1] @@ -389,7 +389,7 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -397,8 +397,8 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { ; ; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov s2, v0.s[1] @@ -416,9 +416,9 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x 
float> %a, <4 x float> %b) { ; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_init: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d3, v1.d[1] ; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-SD-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-SD-NEXT: mov d3, v2.d[1] ; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v1.s[1] ; CHECK-SD-NEXT: fmul v2.2s, v2.2s, v3.2s ; CHECK-SD-NEXT: fmul s0, s0, s1 @@ -428,9 +428,9 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa ; ; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_init: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-GI-NEXT: mov d3, v2.d[1] ; CHECK-GI-NEXT: mov s4, v1.s[1] ; CHECK-GI-NEXT: fmul v2.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: fmul s1, s1, s4 @@ -450,7 +450,7 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmul v1.4s, v1.4s, v2.4s ; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -459,8 +459,8 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { ; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov s2, v0.s[1] @@ -501,8 +501,8 @@ define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, 
<4 x float> %b) { ; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_extrause: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d2, v0.d[1] +; CHECK-SD-NEXT: mov d3, v1.d[1] ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v0.s[1] @@ -513,8 +513,8 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) ; ; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_extrause: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov s2, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index 0806f7da5c89c..c6617dbd20c2e 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -98,7 +98,7 @@ define i128 @test_v1i128(<1 x i128> %a) nounwind { define i64 @test_v2i64(<2 x i64> %a) nounwind { ; CHECK-SD-LABEL: test_v2i64: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: cmhi d2, d0, d1 ; CHECK-SD-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-SD-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll index c3b7161feefb5..e8a9a830dced4 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll @@ -36,7 +36,7 @@ define <8 x float> @sitofp_v8i8_float(<8 x i8> %a) { define <16 x float> @sitofp_v16i8_float(<16 x i8> %a) { ; CHECK-LABEL: sitofp_v16i8_float: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; 
CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b @@ -145,7 +145,7 @@ define <8 x float> @uitofp_v8i8_float(<8 x i8> %a) { define <16 x float> @uitofp_v16i8_float(<16 x i8> %a) { ; CHECK-LABEL: uitofp_v16i8_float: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b @@ -225,7 +225,7 @@ define <4 x double> @sitofp_v4i8_double(<4 x i8> %a) { ; CHECK-LABEL: sitofp_v4i8_double: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NEXT: shl v1.2s, v1.2s, #24 @@ -275,7 +275,7 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) { define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) { ; CHECK-LABEL: sitofp_v16i8_double: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: mov b2, v0.b[0] ; CHECK-NEXT: mov b3, v0.b[2] ; CHECK-NEXT: mov b4, v0.b[4] @@ -380,7 +380,7 @@ define <4 x double> @uitofp_v4i8_double(<4 x i8> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: movi d1, #0x0000ff000000ff -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: and v1.8b, v2.8b, v1.8b ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 @@ -425,7 +425,7 @@ define <8 x double> @uitofp_v8i8_double(<8 x i8> %a) { define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) { ; CHECK-LABEL: uitofp_v16i8_double: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: mov b3, v0.b[0] ; CHECK-NEXT: mov b4, v0.b[2] ; CHECK-NEXT: mov b5, v0.b[4] diff --git a/llvm/test/CodeGen/AArch64/vector-llrint.ll 
b/llvm/test/CodeGen/AArch64/vector-llrint.ll index d9a9e57fe0a63..d1fe25457de8e 100644 --- a/llvm/test/CodeGen/AArch64/vector-llrint.ll +++ b/llvm/test/CodeGen/AArch64/vector-llrint.ll @@ -88,7 +88,7 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half>) define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) nounwind { ; CHECK-SD-LABEL: llrint_v8i64_v8f16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: mov h4, v0.h[2] ; CHECK-SD-NEXT: mov h3, v0.h[1] ; CHECK-SD-NEXT: mov h7, v0.h[3] @@ -156,8 +156,8 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half>) define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) nounwind { ; CHECK-SD-LABEL: llrint_v16i64_v16f16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d2, v0.d[1] +; CHECK-SD-NEXT: mov d3, v1.d[1] ; CHECK-SD-NEXT: mov h17, v0.h[1] ; CHECK-SD-NEXT: mov h19, v0.h[2] ; CHECK-SD-NEXT: fcvt s18, h0 @@ -279,10 +279,10 @@ declare <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half>) define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) nounwind { ; CHECK-SD-LABEL: llrint_v32i64_v32f16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: ext v5.16b, v2.16b, v2.16b, #8 -; CHECK-SD-NEXT: ext v6.16b, v3.16b, v3.16b, #8 -; CHECK-SD-NEXT: ext v7.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d4, v1.d[1] +; CHECK-SD-NEXT: mov d5, v2.d[1] +; CHECK-SD-NEXT: mov d6, v3.d[1] +; CHECK-SD-NEXT: mov d7, v0.d[1] ; CHECK-SD-NEXT: mov h19, v0.h[1] ; CHECK-SD-NEXT: fcvt s21, h0 ; CHECK-SD-NEXT: mov h23, v1.h[2] @@ -560,7 +560,7 @@ declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>) define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) nounwind { ; CHECK-SD-LABEL: llrint_v4i64_v4f32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: frintx v0.2s, v0.2s ; 
CHECK-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s @@ -585,19 +585,19 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>) define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) nounwind { ; CHECK-SD-LABEL: llrint_v8i64_v8f32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d2, v0.d[1] +; CHECK-SD-NEXT: mov d3, v1.d[1] ; CHECK-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-SD-NEXT: frintx v2.2s, v2.2s ; CHECK-SD-NEXT: frintx v3.2s, v3.2s ; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s -; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s -; CHECK-SD-NEXT: fcvtl v4.2d, v2.2s +; CHECK-SD-NEXT: fcvtl v4.2d, v1.2s +; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s ; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: fcvtzs v2.2d, v1.2d -; CHECK-SD-NEXT: fcvtzs v1.2d, v4.2d +; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-SD-NEXT: fcvtzs v2.2d, v4.2d ; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d ; CHECK-SD-NEXT: ret ; @@ -622,10 +622,10 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>) define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) nounwind { ; CHECK-SD-LABEL: llrint_v16i64_v16f32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: ext v5.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v6.16b, v2.16b, v2.16b, #8 -; CHECK-SD-NEXT: ext v7.16b, v3.16b, v3.16b, #8 +; CHECK-SD-NEXT: mov d4, v1.d[1] +; CHECK-SD-NEXT: mov d5, v0.d[1] +; CHECK-SD-NEXT: mov d6, v2.d[1] +; CHECK-SD-NEXT: mov d7, v3.d[1] ; CHECK-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-SD-NEXT: frintx v2.2s, v2.2s @@ -683,70 +683,70 @@ declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>) define <32 x i64> @llrint_v32i64_v32f32(<32 x float> %x) nounwind { ; CHECK-SD-LABEL: llrint_v32i64_v32f32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ext v16.16b, v7.16b, v7.16b, #8 -; CHECK-SD-NEXT: ext v17.16b, 
v6.16b, v6.16b, #8 +; CHECK-SD-NEXT: mov d16, v7.d[1] +; CHECK-SD-NEXT: mov d17, v6.d[1] ; CHECK-SD-NEXT: frintx v7.2s, v7.2s +; CHECK-SD-NEXT: mov d18, v5.d[1] ; CHECK-SD-NEXT: frintx v6.2s, v6.2s -; CHECK-SD-NEXT: ext v18.16b, v5.16b, v5.16b, #8 -; CHECK-SD-NEXT: ext v21.16b, v4.16b, v4.16b, #8 -; CHECK-SD-NEXT: ext v22.16b, v2.16b, v2.16b, #8 +; CHECK-SD-NEXT: mov d19, v4.d[1] +; CHECK-SD-NEXT: mov d22, v3.d[1] ; CHECK-SD-NEXT: frintx v5.2s, v5.2s -; CHECK-SD-NEXT: ext v23.16b, v3.16b, v3.16b, #8 -; CHECK-SD-NEXT: frintx v4.2s, v4.2s -; CHECK-SD-NEXT: ext v19.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v20.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mov d20, v0.d[1] +; CHECK-SD-NEXT: frintx v21.2s, v1.2s +; CHECK-SD-NEXT: mov d1, v1.d[1] +; CHECK-SD-NEXT: mov d23, v2.d[1] ; CHECK-SD-NEXT: frintx v16.2s, v16.2s ; CHECK-SD-NEXT: frintx v17.2s, v17.2s ; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s -; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s ; CHECK-SD-NEXT: frintx v18.2s, v18.2s -; CHECK-SD-NEXT: frintx v21.2s, v21.2s -; CHECK-SD-NEXT: frintx v2.2s, v2.2s +; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s +; CHECK-SD-NEXT: frintx v19.2s, v19.2s +; CHECK-SD-NEXT: frintx v4.2s, v4.2s +; CHECK-SD-NEXT: frintx v22.2s, v22.2s ; CHECK-SD-NEXT: frintx v3.2s, v3.2s ; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s +; CHECK-SD-NEXT: frintx v2.2s, v2.2s ; CHECK-SD-NEXT: frintx v23.2s, v23.2s -; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s -; CHECK-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-SD-NEXT: fcvtl v16.2d, v16.2s ; CHECK-SD-NEXT: fcvtl v17.2d, v17.2s ; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d -; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d ; CHECK-SD-NEXT: fcvtl v18.2d, v18.2s -; CHECK-SD-NEXT: fcvtl v21.2d, v21.2s -; CHECK-SD-NEXT: frintx v20.2s, v20.2s +; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-SD-NEXT: fcvtl v19.2d, v19.2s +; CHECK-SD-NEXT: frintx v1.2s, v1.2s +; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s ; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s ; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d ; CHECK-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s 
-; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d ; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d ; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d +; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d ; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s ; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d ; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s ; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-SD-NEXT: stp q6, q17, [x8, #192] -; CHECK-SD-NEXT: fcvtl v6.2d, v23.2s -; CHECK-SD-NEXT: frintx v17.2s, v19.2s +; CHECK-SD-NEXT: frintx v6.2s, v20.2s +; CHECK-SD-NEXT: fcvtl v17.2d, v23.2s ; CHECK-SD-NEXT: stp q7, q16, [x8, #224] -; CHECK-SD-NEXT: frintx v7.2s, v22.2s +; CHECK-SD-NEXT: fcvtl v7.2d, v22.2s ; CHECK-SD-NEXT: fcvtzs v16.2d, v18.2d -; CHECK-SD-NEXT: fcvtzs v18.2d, v21.2d +; CHECK-SD-NEXT: fcvtzs v18.2d, v19.2d ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s ; CHECK-SD-NEXT: stp q5, q16, [x8, #160] -; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s -; CHECK-SD-NEXT: fcvtl v5.2d, v20.2s -; CHECK-SD-NEXT: stp q4, q18, [x8, #128] -; CHECK-SD-NEXT: fcvtl v4.2d, v17.2s -; CHECK-SD-NEXT: stp q3, q6, [x8, #96] ; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d +; CHECK-SD-NEXT: fcvtl v5.2d, v21.2s +; CHECK-SD-NEXT: stp q4, q18, [x8, #128] +; CHECK-SD-NEXT: fcvtzs v16.2d, v17.2d +; CHECK-SD-NEXT: fcvtzs v4.2d, v6.2d +; CHECK-SD-NEXT: stp q3, q7, [x8, #96] ; CHECK-SD-NEXT: fcvtzs v3.2d, v5.2d -; CHECK-SD-NEXT: stp q1, q3, [x8, #32] -; CHECK-SD-NEXT: stp q2, q7, [x8, #64] -; CHECK-SD-NEXT: fcvtzs v2.2d, v4.2d -; CHECK-SD-NEXT: stp q0, q2, [x8] +; CHECK-SD-NEXT: stp q2, q16, [x8, #64] +; CHECK-SD-NEXT: stp q0, q4, [x8] +; CHECK-SD-NEXT: stp q3, q1, [x8, #32] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: llrint_v32i64_v32f32: diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll index 2abe0b7ae2106..67e1f6a77a2ff 100644 --- a/llvm/test/CodeGen/AArch64/vector-lrint.ll +++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll @@ -158,7 +158,7 @@ declare <4 x iXLen> 
@llvm.lrint.v4iXLen.v4f16(<4 x half>) define <8 x iXLen> @lrint_v8f16(<8 x half> %x) nounwind { ; CHECK-i32-SD-LABEL: lrint_v8f16: ; CHECK-i32-SD: // %bb.0: -; CHECK-i32-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-i32-SD-NEXT: mov d1, v0.d[1] ; CHECK-i32-SD-NEXT: mov h2, v0.h[1] ; CHECK-i32-SD-NEXT: mov h4, v0.h[2] ; CHECK-i32-SD-NEXT: fcvt s7, h0 @@ -199,7 +199,7 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v8f16: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-i64-SD-NEXT: mov d1, v0.d[1] ; CHECK-i64-SD-NEXT: mov h4, v0.h[2] ; CHECK-i64-SD-NEXT: mov h3, v0.h[1] ; CHECK-i64-SD-NEXT: mov h7, v0.h[3] @@ -281,8 +281,8 @@ declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half>) define <16 x iXLen> @lrint_v16f16(<16 x half> %x) nounwind { ; CHECK-i32-SD-LABEL: lrint_v16f16: ; CHECK-i32-SD: // %bb.0: -; CHECK-i32-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-i32-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-i32-SD-NEXT: mov d2, v0.d[1] +; CHECK-i32-SD-NEXT: mov d3, v1.d[1] ; CHECK-i32-SD-NEXT: mov h4, v0.h[1] ; CHECK-i32-SD-NEXT: mov h5, v1.h[1] ; CHECK-i32-SD-NEXT: mov h16, v0.h[2] @@ -359,8 +359,8 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v16f16: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-i64-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-i64-SD-NEXT: mov d2, v0.d[1] +; CHECK-i64-SD-NEXT: mov d3, v1.d[1] ; CHECK-i64-SD-NEXT: mov h17, v0.h[1] ; CHECK-i64-SD-NEXT: mov h19, v0.h[2] ; CHECK-i64-SD-NEXT: fcvt s18, h0 @@ -507,9 +507,9 @@ define <32 x iXLen> @lrint_v32f16(<32 x half> %x) nounwind { ; CHECK-i32-SD-LABEL: lrint_v32f16: ; CHECK-i32-SD: // %bb.0: ; CHECK-i32-SD-NEXT: str x19, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-i32-SD-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-i32-SD-NEXT: ext v5.16b, v1.16b, v1.16b, #8 -; CHECK-i32-SD-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; CHECK-i32-SD-NEXT: mov d4, v0.d[1] +; CHECK-i32-SD-NEXT: mov d5, v1.d[1] +; CHECK-i32-SD-NEXT: mov d6, v2.d[1] ; CHECK-i32-SD-NEXT: mov h27, v3.h[2] ; CHECK-i32-SD-NEXT: mov h16, v4.h[2] ; CHECK-i32-SD-NEXT: mov h17, v4.h[3] @@ -526,7 +526,7 @@ define <32 x iXLen> @lrint_v32f16(<32 x half> %x) nounwind { ; CHECK-i32-SD-NEXT: fcvt s18, h18 ; CHECK-i32-SD-NEXT: fcvt s23, h7 ; CHECK-i32-SD-NEXT: fcvt s19, h19 -; CHECK-i32-SD-NEXT: ext v7.16b, v3.16b, v3.16b, #8 +; CHECK-i32-SD-NEXT: mov d7, v3.d[1] ; CHECK-i32-SD-NEXT: fcvt s20, h20 ; CHECK-i32-SD-NEXT: fcvt s21, h21 ; CHECK-i32-SD-NEXT: fcvt s22, h22 @@ -660,10 +660,10 @@ define <32 x iXLen> @lrint_v32f16(<32 x half> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v32f16: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-i64-SD-NEXT: ext v5.16b, v2.16b, v2.16b, #8 -; CHECK-i64-SD-NEXT: ext v6.16b, v3.16b, v3.16b, #8 -; CHECK-i64-SD-NEXT: ext v7.16b, v0.16b, v0.16b, #8 +; CHECK-i64-SD-NEXT: mov d4, v1.d[1] +; CHECK-i64-SD-NEXT: mov d5, v2.d[1] +; CHECK-i64-SD-NEXT: mov d6, v3.d[1] +; CHECK-i64-SD-NEXT: mov d7, v0.d[1] ; CHECK-i64-SD-NEXT: mov h19, v0.h[1] ; CHECK-i64-SD-NEXT: fcvt s21, h0 ; CHECK-i64-SD-NEXT: mov h23, v1.h[2] @@ -1009,7 +1009,7 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v4f32: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-i64-SD-NEXT: mov d1, v0.d[1] ; CHECK-i64-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-i64-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-i64-SD-NEXT: fcvtl v0.2d, v0.2s @@ -1042,19 +1042,19 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v8f32: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-i64-SD-NEXT: ext 
v3.16b, v1.16b, v1.16b, #8 +; CHECK-i64-SD-NEXT: mov d2, v0.d[1] +; CHECK-i64-SD-NEXT: mov d3, v1.d[1] ; CHECK-i64-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-i64-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-i64-SD-NEXT: frintx v2.2s, v2.2s ; CHECK-i64-SD-NEXT: frintx v3.2s, v3.2s ; CHECK-i64-SD-NEXT: fcvtl v0.2d, v0.2s -; CHECK-i64-SD-NEXT: fcvtl v1.2d, v1.2s -; CHECK-i64-SD-NEXT: fcvtl v4.2d, v2.2s +; CHECK-i64-SD-NEXT: fcvtl v4.2d, v1.2s +; CHECK-i64-SD-NEXT: fcvtl v2.2d, v2.2s ; CHECK-i64-SD-NEXT: fcvtl v3.2d, v3.2s ; CHECK-i64-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-i64-SD-NEXT: fcvtzs v2.2d, v1.2d -; CHECK-i64-SD-NEXT: fcvtzs v1.2d, v4.2d +; CHECK-i64-SD-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-i64-SD-NEXT: fcvtzs v2.2d, v4.2d ; CHECK-i64-SD-NEXT: fcvtzs v3.2d, v3.2d ; CHECK-i64-SD-NEXT: ret ; @@ -1091,10 +1091,10 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v16f32: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-i64-SD-NEXT: ext v5.16b, v0.16b, v0.16b, #8 -; CHECK-i64-SD-NEXT: ext v6.16b, v2.16b, v2.16b, #8 -; CHECK-i64-SD-NEXT: ext v7.16b, v3.16b, v3.16b, #8 +; CHECK-i64-SD-NEXT: mov d4, v1.d[1] +; CHECK-i64-SD-NEXT: mov d5, v0.d[1] +; CHECK-i64-SD-NEXT: mov d6, v2.d[1] +; CHECK-i64-SD-NEXT: mov d7, v3.d[1] ; CHECK-i64-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-i64-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-i64-SD-NEXT: frintx v2.2s, v2.2s @@ -1172,70 +1172,70 @@ define <32 x iXLen> @lrint_v32f32(<32 x float> %x) nounwind { ; ; CHECK-i64-SD-LABEL: lrint_v32f32: ; CHECK-i64-SD: // %bb.0: -; CHECK-i64-SD-NEXT: ext v16.16b, v7.16b, v7.16b, #8 -; CHECK-i64-SD-NEXT: ext v17.16b, v6.16b, v6.16b, #8 +; CHECK-i64-SD-NEXT: mov d16, v7.d[1] +; CHECK-i64-SD-NEXT: mov d17, v6.d[1] ; CHECK-i64-SD-NEXT: frintx v7.2s, v7.2s +; CHECK-i64-SD-NEXT: mov d18, v5.d[1] ; CHECK-i64-SD-NEXT: frintx v6.2s, v6.2s -; CHECK-i64-SD-NEXT: ext v18.16b, v5.16b, v5.16b, #8 -; CHECK-i64-SD-NEXT: ext v21.16b, v4.16b, v4.16b, #8 -; 
CHECK-i64-SD-NEXT: ext v22.16b, v2.16b, v2.16b, #8 +; CHECK-i64-SD-NEXT: mov d19, v4.d[1] +; CHECK-i64-SD-NEXT: mov d22, v3.d[1] ; CHECK-i64-SD-NEXT: frintx v5.2s, v5.2s -; CHECK-i64-SD-NEXT: ext v23.16b, v3.16b, v3.16b, #8 -; CHECK-i64-SD-NEXT: frintx v4.2s, v4.2s -; CHECK-i64-SD-NEXT: ext v19.16b, v0.16b, v0.16b, #8 -; CHECK-i64-SD-NEXT: ext v20.16b, v1.16b, v1.16b, #8 +; CHECK-i64-SD-NEXT: mov d20, v0.d[1] +; CHECK-i64-SD-NEXT: frintx v21.2s, v1.2s +; CHECK-i64-SD-NEXT: mov d1, v1.d[1] +; CHECK-i64-SD-NEXT: mov d23, v2.d[1] ; CHECK-i64-SD-NEXT: frintx v16.2s, v16.2s ; CHECK-i64-SD-NEXT: frintx v17.2s, v17.2s ; CHECK-i64-SD-NEXT: fcvtl v7.2d, v7.2s -; CHECK-i64-SD-NEXT: fcvtl v6.2d, v6.2s ; CHECK-i64-SD-NEXT: frintx v18.2s, v18.2s -; CHECK-i64-SD-NEXT: frintx v21.2s, v21.2s -; CHECK-i64-SD-NEXT: frintx v2.2s, v2.2s +; CHECK-i64-SD-NEXT: fcvtl v6.2d, v6.2s +; CHECK-i64-SD-NEXT: frintx v19.2s, v19.2s +; CHECK-i64-SD-NEXT: frintx v4.2s, v4.2s +; CHECK-i64-SD-NEXT: frintx v22.2s, v22.2s ; CHECK-i64-SD-NEXT: frintx v3.2s, v3.2s ; CHECK-i64-SD-NEXT: fcvtl v5.2d, v5.2s +; CHECK-i64-SD-NEXT: frintx v2.2s, v2.2s ; CHECK-i64-SD-NEXT: frintx v23.2s, v23.2s -; CHECK-i64-SD-NEXT: fcvtl v4.2d, v4.2s -; CHECK-i64-SD-NEXT: frintx v1.2s, v1.2s ; CHECK-i64-SD-NEXT: fcvtl v16.2d, v16.2s ; CHECK-i64-SD-NEXT: fcvtl v17.2d, v17.2s ; CHECK-i64-SD-NEXT: fcvtzs v7.2d, v7.2d -; CHECK-i64-SD-NEXT: fcvtzs v6.2d, v6.2d ; CHECK-i64-SD-NEXT: fcvtl v18.2d, v18.2s -; CHECK-i64-SD-NEXT: fcvtl v21.2d, v21.2s -; CHECK-i64-SD-NEXT: frintx v20.2s, v20.2s +; CHECK-i64-SD-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-i64-SD-NEXT: fcvtl v19.2d, v19.2s +; CHECK-i64-SD-NEXT: frintx v1.2s, v1.2s +; CHECK-i64-SD-NEXT: fcvtl v4.2d, v4.2s ; CHECK-i64-SD-NEXT: fcvtl v3.2d, v3.2s ; CHECK-i64-SD-NEXT: fcvtzs v5.2d, v5.2d ; CHECK-i64-SD-NEXT: frintx v0.2s, v0.2s ; CHECK-i64-SD-NEXT: fcvtl v2.2d, v2.2s -; CHECK-i64-SD-NEXT: fcvtzs v4.2d, v4.2d ; CHECK-i64-SD-NEXT: fcvtzs v16.2d, v16.2d ; CHECK-i64-SD-NEXT: fcvtzs v17.2d, 
v17.2d +; CHECK-i64-SD-NEXT: fcvtzs v4.2d, v4.2d ; CHECK-i64-SD-NEXT: fcvtl v1.2d, v1.2s ; CHECK-i64-SD-NEXT: fcvtzs v3.2d, v3.2d ; CHECK-i64-SD-NEXT: fcvtl v0.2d, v0.2s ; CHECK-i64-SD-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-i64-SD-NEXT: stp q6, q17, [x8, #192] -; CHECK-i64-SD-NEXT: fcvtl v6.2d, v23.2s -; CHECK-i64-SD-NEXT: frintx v17.2s, v19.2s +; CHECK-i64-SD-NEXT: frintx v6.2s, v20.2s +; CHECK-i64-SD-NEXT: fcvtl v17.2d, v23.2s ; CHECK-i64-SD-NEXT: stp q7, q16, [x8, #224] -; CHECK-i64-SD-NEXT: frintx v7.2s, v22.2s +; CHECK-i64-SD-NEXT: fcvtl v7.2d, v22.2s ; CHECK-i64-SD-NEXT: fcvtzs v16.2d, v18.2d -; CHECK-i64-SD-NEXT: fcvtzs v18.2d, v21.2d +; CHECK-i64-SD-NEXT: fcvtzs v18.2d, v19.2d ; CHECK-i64-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-i64-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-i64-SD-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-i64-SD-NEXT: fcvtl v6.2d, v6.2s ; CHECK-i64-SD-NEXT: stp q5, q16, [x8, #160] -; CHECK-i64-SD-NEXT: fcvtl v7.2d, v7.2s -; CHECK-i64-SD-NEXT: fcvtl v5.2d, v20.2s -; CHECK-i64-SD-NEXT: stp q4, q18, [x8, #128] -; CHECK-i64-SD-NEXT: fcvtl v4.2d, v17.2s -; CHECK-i64-SD-NEXT: stp q3, q6, [x8, #96] ; CHECK-i64-SD-NEXT: fcvtzs v7.2d, v7.2d +; CHECK-i64-SD-NEXT: fcvtl v5.2d, v21.2s +; CHECK-i64-SD-NEXT: stp q4, q18, [x8, #128] +; CHECK-i64-SD-NEXT: fcvtzs v16.2d, v17.2d +; CHECK-i64-SD-NEXT: fcvtzs v4.2d, v6.2d +; CHECK-i64-SD-NEXT: stp q3, q7, [x8, #96] ; CHECK-i64-SD-NEXT: fcvtzs v3.2d, v5.2d -; CHECK-i64-SD-NEXT: stp q1, q3, [x8, #32] -; CHECK-i64-SD-NEXT: stp q2, q7, [x8, #64] -; CHECK-i64-SD-NEXT: fcvtzs v2.2d, v4.2d -; CHECK-i64-SD-NEXT: stp q0, q2, [x8] +; CHECK-i64-SD-NEXT: stp q2, q16, [x8, #64] +; CHECK-i64-SD-NEXT: stp q0, q4, [x8] +; CHECK-i64-SD-NEXT: stp q3, q1, [x8, #32] ; CHECK-i64-SD-NEXT: ret ; ; CHECK-i64-GI-LABEL: lrint_v32f32: diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll index 4f2b9c5a62669..5ed427110ea2f 100644 --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ 
b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -516,14 +516,14 @@ define <16 x i32> @same_sext_used_in_cmp_unsigned_pred_and_select(<16 x i8> %a) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.16b v1, #10 ; CHECK-NEXT: sshll.8h v2, v0, #0 -; CHECK-NEXT: ext.16b v4, v2, v2, #8 +; CHECK-NEXT: mov d4, v2[1] ; CHECK-NEXT: cmhi.16b v1, v0, v1 ; CHECK-NEXT: sshll2.8h v0, v0, #0 ; CHECK-NEXT: sshll.8h v3, v1, #0 ; CHECK-NEXT: sshll2.8h v1, v1, #0 -; CHECK-NEXT: ext.16b v5, v0, v0, #8 -; CHECK-NEXT: ext.16b v6, v3, v3, #8 -; CHECK-NEXT: ext.16b v7, v1, v1, #8 +; CHECK-NEXT: mov d5, v0[1] +; CHECK-NEXT: mov d6, v3[1] +; CHECK-NEXT: mov d7, v1[1] ; CHECK-NEXT: and.8b v2, v3, v2 ; CHECK-NEXT: and.8b v1, v1, v0 ; CHECK-NEXT: sshll.4s v0, v2, #0 diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll index f3db88af0787c..cf917341f18b0 100644 --- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll @@ -4,7 +4,7 @@ define <2 x i64> @v2i64_02(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: v2i64_02: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -16,7 +16,7 @@ define <2 x i64> @v2i64_02(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @v2i64_13(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: v2i64_13: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -50,8 +50,8 @@ define <2 x i64> @v2i64_15913(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @v2i64_261014(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: v2i64_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s ; 
CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -63,8 +63,8 @@ define <2 x i64> @v2i64_261014(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: v2i64_37: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mov d1, v1.d[1] +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index a4caf0e8068d4..26c814ba4f621 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1717,7 +1717,7 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: sub x11, x9, #64 ; CHECK-BE-NEXT: sub x12, x9, #32 ; CHECK-BE-NEXT: ld1 { v6.2d }, [x9] -; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] +; CHECK-BE-NEXT: ld1 { v22.2d }, [x11] ; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b ; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b ; CHECK-BE-NEXT: ld1 { v19.2d }, [x12] @@ -1727,15 +1727,15 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: sub x14, x9, #48 ; CHECK-BE-NEXT: add x16, x9, #48 ; CHECK-BE-NEXT: add x17, x9, #16 -; CHECK-BE-NEXT: ld1 { v22.2d }, [x13] +; CHECK-BE-NEXT: ld1 { v21.2d }, [x13] ; CHECK-BE-NEXT: subs x8, x8, #16 ; CHECK-BE-NEXT: add x10, x10, #16 ; CHECK-BE-NEXT: rev32 v7.8b, v4.8b -; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; CHECK-BE-NEXT: mov d4, v4.d[1] ; CHECK-BE-NEXT: rev32 v17.8b, v2.8b -; CHECK-BE-NEXT: ext v18.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v20.16b, v3.16b, v3.16b, #8 -; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-BE-NEXT: mov d18, v5.d[1] +; CHECK-BE-NEXT: mov d20, v3.d[1] +; CHECK-BE-NEXT: mov d2, v2.d[1] ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: rev32 v3.8b, v3.8b ; CHECK-BE-NEXT: uaddw 
v7.2d, v16.2d, v7.2s @@ -1747,14 +1747,14 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: ld1 { v16.2d }, [x16] ; CHECK-BE-NEXT: ld1 { v18.2d }, [x14] ; CHECK-BE-NEXT: uaddw v5.2d, v19.2d, v5.2s -; CHECK-BE-NEXT: uaddw v3.2d, v21.2d, v3.2s +; CHECK-BE-NEXT: uaddw v3.2d, v22.2d, v3.2s ; CHECK-BE-NEXT: st1 { v7.2d }, [x15] ; CHECK-BE-NEXT: ld1 { v7.2d }, [x17] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] ; CHECK-BE-NEXT: add x9, x9, #128 ; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s ; CHECK-BE-NEXT: st1 { v5.2d }, [x12] -; CHECK-BE-NEXT: uaddw v5.2d, v22.2d, v17.2s +; CHECK-BE-NEXT: uaddw v5.2d, v21.2d, v17.2s ; CHECK-BE-NEXT: st1 { v3.2d }, [x11] ; CHECK-BE-NEXT: uaddw v3.2d, v18.2d, v20.2s ; CHECK-BE-NEXT: uaddw v2.2d, v7.2d, v2.2s @@ -2960,27 +2960,27 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b -; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 +; CHECK-BE-NEXT: mov d24, v18.d[1] ; CHECK-BE-NEXT: add x9, x0, #32 -; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 +; CHECK-BE-NEXT: mov d25, v20.d[1] ; CHECK-BE-NEXT: add x10, x0, #16 ; CHECK-BE-NEXT: subs w2, w2, #1 -; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 +; CHECK-BE-NEXT: mov d17, v5.d[1] +; CHECK-BE-NEXT: mov d19, v6.d[1] ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b +; CHECK-BE-NEXT: mov d7, v7.d[1] ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b -; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 -; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; CHECK-BE-NEXT: mov d4, v4.d[1] ; CHECK-BE-NEXT: rev32 v6.8b, v6.8b ; CHECK-BE-NEXT: rev32 v17.8b, v17.8b ; CHECK-BE-NEXT: rev32 v19.8b, v19.8b ; CHECK-BE-NEXT: umull v5.2d, v5.2s, v18.2s ; CHECK-BE-NEXT: umull v18.2d, v21.2s, v22.2s -; CHECK-BE-NEXT: ext v21.16b, v22.16b, v22.16b, #8 +; 
CHECK-BE-NEXT: mov d21, v22.d[1] ; CHECK-BE-NEXT: rev32 v7.8b, v7.8b ; CHECK-BE-NEXT: umull v22.2d, v23.2s, v16.2s -; CHECK-BE-NEXT: ext v16.16b, v16.16b, v16.16b, #8 +; CHECK-BE-NEXT: mov d16, v16.d[1] ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 292b7b28903ee..9ec760aa57e27 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -292,9 +292,8 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) { ; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: ushll v1.2d, v3.2s, #0 ; CHECK-SD-NEXT: mov v2.b[0], v1.b[0] -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-SD-NEXT: ret ; @@ -344,9 +343,8 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0 ; CHECK-SD-NEXT: ushll2 v2.2d, v2.4s, #0 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v3i16_v3i64: @@ -370,9 +368,8 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) { ; CHECK-SD-NEXT: ushll v3.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll2 v2.2d, v0.4s, #0 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-SD-NEXT: mov d1, v3.d[1] ; CHECK-SD-NEXT: fmov d0, d3 -; CHECK-SD-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v3i32_v3i64: @@ -443,17 +440,16 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) { ; CHECK-SD-LABEL: 
zext_v3i10_v3i64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s1, w2 ; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff +; CHECK-SD-NEXT: fmov s1, w2 ; CHECK-SD-NEXT: dup v2.2d, x8 ; CHECK-SD-NEXT: mov v0.s[1], w1 ; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s ; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b -; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov d1, v0.d[1] ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v3i10_v3i64: From 3fb3383f453109cab772005739f5f6ad1a0f7266 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 11 May 2026 09:36:44 +0200 Subject: [PATCH 229/538] Revert "[AA] No synchronization effects for never-escaping identified local" (#196890) Reverts llvm/llvm-project#193939 Caused buildbot failure. --- llvm/lib/Analysis/AliasAnalysis.cpp | 39 +++++----------- llvm/test/Analysis/BasicAA/atomics.ll | 46 +++++++++---------- .../test/Analysis/MemorySSA/atomic-clobber.ll | 2 +- .../Transforms/DeadStoreElimination/fence.ll | 8 ++++ llvm/test/Transforms/GVN/fence.ll | 4 +- .../GVN/simplify-icf-cache-invalidation.ll | 2 + llvm/test/Transforms/LICM/atomics.ll | 6 +-- 7 files changed, 49 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index ffb2dc0f4e041..1449a54d1de2b 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -458,34 +458,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) { // Helper method implementation //===----------------------------------------------------------------------===// -/// Get ModRefInfo for a synchronizing operation, such as a fence or stronger -/// than monotonic atomic load/store. 
-static ModRefInfo getSyncEffects(AAResults *AA, const MemoryLocation &Loc, - AAQueryInfo &AAQI) { - if (!Loc.Ptr) - return ModRefInfo::ModRef; - - // If the location is *never* captured, it cannot be affected by - // synchronizing operations. However, we cannot ignore locations that are - // only captured after the operation, as the synchronization may still have - // an effect if the object is only captured *later*. As such, set I to null - // and ReturnCaptures to true here. - const Value *Obj = getUnderlyingObject(Loc.Ptr); - if (capturesNothing(AAQI.CA->getCapturesBefore( - Obj, /*I=*/nullptr, /*OrAt=*/true, /*ReturnCaptures=*/true))) - return ModRefInfo::NoModRef; - - // If Loc is a constant memory location, the synchronization operation - // definitely could not modify it. - return AA->getModRefInfoMask(Loc); -} - ModRefInfo AAResults::getModRefInfo(const LoadInst *L, const MemoryLocation &Loc, AAQueryInfo &AAQI) { // Be conservative in the face of atomic. if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered)) - return getSyncEffects(this, Loc, AAQI); + return ModRefInfo::ModRef; // If the load address doesn't alias the given address, it doesn't read // or write the specified memory. @@ -503,7 +481,7 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S, AAQueryInfo &AAQI) { // Be conservative in the face of atomic. if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered)) - return getSyncEffects(this, Loc, AAQI); + return ModRefInfo::ModRef; if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(S), Loc, AAQI, S); @@ -537,9 +515,14 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *F, return ModRefInfo::NoModRef; } - return Result & getSyncEffects(this, Loc, AAQI); - } + // Apply the ModRef mask. This ensures that if Loc is a constant memory + // location, we take into account the fact that the fence definitely could + // not modify the memory location. 
+ if (!isNoModRef(Result)) + Result &= getModRefInfoMask(Loc); + return Result; + } return ModRefInfo::ModRef; } @@ -593,7 +576,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, AAQueryInfo &AAQI) { // Acquire/Release cmpxchg has properties that matter for arbitrary addresses. if (isStrongerThanMonotonic(CX->getSuccessOrdering())) - return getSyncEffects(this, Loc, AAQI); + return ModRefInfo::ModRef; if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(CX), Loc, AAQI, CX); @@ -611,7 +594,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, AAQueryInfo &AAQI) { // Acquire/Release atomicrmw has properties that matter for arbitrary addresses. if (isStrongerThanMonotonic(RMW->getOrdering())) - return getSyncEffects(this, Loc, AAQI); + return ModRefInfo::ModRef; if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(RMW), Loc, AAQI, RMW); diff --git a/llvm/test/Analysis/BasicAA/atomics.ll b/llvm/test/Analysis/BasicAA/atomics.ll index 1101466fe7055..db0417c758e92 100644 --- a/llvm/test/Analysis/BasicAA/atomics.ll +++ b/llvm/test/Analysis/BasicAA/atomics.ll @@ -8,29 +8,29 @@ declare noalias ptr @malloc(i64) ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 monotonic, align 4 ; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x monotonic, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x monotonic, align 4 -; CHECK: NoModRef: Ptr: i32* %a 
<-> fence release +; CHECK: Both ModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: NoModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %x acquire, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %6 = load atomic i32, ptr %x acquire, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> fence seq_cst +; CHECK: Both ModRef: Ptr: i32* %a <-> fence seq_cst ; CHECK: Both ModRef: Ptr: i32* %x <-> fence seq_cst -; CHECK: NoModRef: Ptr: i32* %a <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %9 = load atomic i32, ptr %x seq_cst, align 4 +; 
CHECK: Both ModRef: Ptr: i32* %a <-> %9 = load atomic i32, ptr %x seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %9 = load atomic i32, ptr %x seq_cst, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x seq_cst, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x seq_cst, align 4 define void @alloca_no_escape(ptr %x) { %a = alloca i32 @@ -83,15 +83,15 @@ define void @alloca_escape_after(ptr %x) { } ; CHECK-LABEL: Function: noalias_no_escape: -; CHECK: NoModRef: Ptr: i32* %a <-> fence release +; CHECK: Both ModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x acquire, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 define void @noalias_no_escape(ptr noalias %a, ptr %x) { store i32 0, ptr %a @@ -125,23 +125,21 @@ define void @noalias_escape_after(ptr noalias %a, ptr %x) { load atomic i32, ptr %x acquire, 
align 4 store atomic i32 0, ptr %x release, align 4 - call void @escape(ptr %a) - ret void } ; CHECK-LABEL: Function: malloc_no_escape: ; CHECK: Both ModRef: Ptr: i32* %a <-> %a = call ptr @malloc(i64 4) ; CHECK: Both ModRef: Ptr: i32* %x <-> %a = call ptr @malloc(i64 4) -; CHECK: NoModRef: Ptr: i32* %a <-> fence release +; CHECK: Both ModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x acquire, align 4 -; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 define void @malloc_no_escape(ptr %x) { %a = call ptr @malloc(i64 4) diff --git a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll index 86708ecbc58fd..326ec8b15283d 100644 --- a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll +++ b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll @@ -106,7 +106,7 @@ define void @seq_cst_clobber(ptr noalias %a, ptr noalias %b) { ; If AA gets more aggressive, we can find another way. 
; ; CHECK-LABEL: define void @check_aa_is_sane -define void @check_aa_is_sane(ptr noalias %a, ptr %b) { +define void @check_aa_is_sane(ptr noalias %a, ptr noalias %b) { ; CHECK: 1 = MemoryDef(liveOnEntry) ; CHECK-NEXT: cmpxchg ptr %a, i32 0, i32 1 acquire acquire cmpxchg ptr %a, i32 0, i32 1 acquire acquire diff --git a/llvm/test/Transforms/DeadStoreElimination/fence.ll b/llvm/test/Transforms/DeadStoreElimination/fence.ll index 3c02d715eb94d..b619b0035ce03 100644 --- a/llvm/test/Transforms/DeadStoreElimination/fence.ll +++ b/llvm/test/Transforms/DeadStoreElimination/fence.ll @@ -54,6 +54,7 @@ define void @test2(ptr %addr.i) { ret void } +; TODO: ; We DSE stack alloc'ed and byval locations, in the presence of fences. ; Fence does not make an otherwise thread local store visible. ; Right now the DSE in presence of fence is only done in end blocks (with no successors), @@ -62,6 +63,7 @@ define void @test2(ptr %addr.i) { define void @test3(ptr byval(i32) %addr.i) { ; CHECK-LABEL: define void @test3( ; CHECK-SAME: ptr byval(i32) [[ADDR_I:%.*]]) { +; CHECK-NEXT: store i32 5, ptr [[ADDR_I]], align 4 ; CHECK-NEXT: fence release ; CHECK-NEXT: ret void ; @@ -74,11 +76,13 @@ declare void @foo(ptr nocapture %p) declare noalias ptr @malloc(i32) +; TODO: ; DSE of stores in locations allocated through library calls. define void @test_nocapture() { ; CHECK-LABEL: define void @test_nocapture() { ; CHECK-NEXT: [[M:%.*]] = call ptr @malloc(i32 24) ; CHECK-NEXT: call void @foo(ptr [[M]]) +; CHECK-NEXT: store i8 4, ptr [[M]], align 1 ; CHECK-NEXT: fence release ; CHECK-NEXT: ret void ; @@ -89,10 +93,14 @@ define void @test_nocapture() { ret void } + +; TODO: ; This is a full fence, but it does not make a thread local store visible. ; We can DSE the store in presence of the fence. 
define void @fence_seq_cst() { ; CHECK-LABEL: define void @fence_seq_cst() { +; CHECK-NEXT: [[P1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, ptr [[P1]], align 4 ; CHECK-NEXT: fence seq_cst ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/GVN/fence.ll b/llvm/test/Transforms/GVN/fence.ll index 16c6b5143703d..f2b1538843681 100644 --- a/llvm/test/Transforms/GVN/fence.ll +++ b/llvm/test/Transforms/GVN/fence.ll @@ -37,9 +37,9 @@ define i32 @test2(ptr %addr.i) { ; ordering property (though it is that too), but a liveness ; property. We expect to eventually see the value of store by ; another thread when spinning on that location. -define i32 @test3(ptr %addr.i) { +define i32 @test3(ptr noalias %addr.i, ptr noalias %otheraddr) { ; CHECK-LABEL: define i32 @test3 -; CHECK-SAME: (ptr [[ADDR_I:%.*]]) { +; CHECK-SAME: (ptr noalias [[ADDR_I:%.*]], ptr noalias [[OTHERADDR:%.*]]) { ; CHECK-NEXT: fence acquire ; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[ADDR_I]], align 4 ; CHECK-NEXT: fence acquire diff --git a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll index fc85048ebdacf..f4a4155e94f80 100644 --- a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll +++ b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll @@ -28,6 +28,8 @@ define hidden void @eggs(ptr %arg, i1 %arg2, ptr %arg3, i32 %arg4, ptr %arg5) un ; CHECK-NEXT: br label %[[BB9]] ; CHECK: [[BB9]]: ; CHECK-NEXT: tail call void @quux(ptr [[ARG]], i1 [[ARG2]]) +; CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq ptr [[TMP17]], null ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/LICM/atomics.ll b/llvm/test/Transforms/LICM/atomics.ll index 1d29b038c9a53..2b3435ba2e7ef 100644 --- a/llvm/test/Transforms/LICM/atomics.ll +++ b/llvm/test/Transforms/LICM/atomics.ll @@ -239,7 +239,6 @@ define i32 @test7b(ptr nocapture noalias %x, ptr nocapture %y, ptr noalias 
nocap ; CHECK-LABEL: define i32 @test7b( ; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]], ptr noalias captures(none) [[Z:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store i32 5, ptr [[X]], align 4 ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[VALA:%.*]] = load atomic i32, ptr [[Y]] monotonic, align 4 @@ -248,6 +247,7 @@ define i32 @test7b(ptr nocapture noalias %x, ptr nocapture %y, ptr noalias nocap ; CHECK: [[END]]: ; CHECK-NEXT: [[VALA_LCSSA1:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ] ; CHECK-NEXT: [[VALA_LCSSA:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ] +; CHECK-NEXT: store i32 5, ptr [[X]], align 4 ; CHECK-NEXT: store atomic i32 [[VALA_LCSSA1]], ptr [[Z]] unordered, align 4 ; CHECK-NEXT: ret i32 [[VALA_LCSSA]] ; @@ -266,9 +266,9 @@ end: } -define i32 @test8(ptr nocapture %x, ptr nocapture noalias %y) { +define i32 @test8(ptr nocapture noalias %x, ptr nocapture %y) { ; CHECK-LABEL: define i32 @test8( -; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr noalias captures(none) [[Y:%.*]]) { +; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: From e2e2529742b70788acbf4b94c93d3943fbe576bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Mon, 11 May 2026 08:57:03 +0100 Subject: [PATCH 230/538] Update GitHub PR Greeter (#194307) Following these two discussions: * https://discourse.llvm.org/t/rfc-mention-our-ai-policy-in-the-greeting-message-for-first-time-contributors/, * https://discourse.llvm.org/t/concerns-about-influx-of-ai-generated-bug-fixes/, add a reference to the LLVM AI policy in the GH greeter. In addition: * Update the message to include links to other relevant policies as well, since these are often shared during PR review. * Add FAQ section and move some of the original content there. 
* Include a request for people to confirm that they have familiarised themselves with the policies. * Add `Hello @{self.author} :wave:` to make the greeting more personal. --- llvm/utils/git/github-automation.py | 36 +++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py index 6cd5953eba19c..bbd029e59cfbc 100755 --- a/llvm/utils/git/github-automation.py +++ b/llvm/utils/git/github-automation.py @@ -248,6 +248,7 @@ class PRGreeter: def __init__(self, token: str, repo: str, pr_number: int): repo = github.Github(auth=github.Auth.Token(token)).get_repo(repo) self.pr = repo.get_issue(pr_number).as_pull_request() + self.author = self.pr.user def run(self) -> bool: # We assume that this is only called for a PR that has just been opened @@ -257,19 +258,40 @@ def run(self) -> bool: comment = f"""\ {PRGreeter.COMMENT_TAG} -Thank you for submitting a Pull Request (PR) to the LLVM Project! +Hello @{self.author} :wave: -This PR will be automatically labeled and the relevant teams will be notified. +Thank you for submitting a Pull Request (PR) to the LLVM Project. Since this is your first PR, here are a few useful links covering our main contribution policies and review practices. -If you wish to, you can add reviewers by using the "Reviewers" section on this page. +* All contributions to LLVM must follow our [LLVM AI Tool Use Policy](https://llvm.org/docs/AIToolPolicy.html). In particular, if you used AI while working on this PR, remember to add a note to the PR description. +* The [LLVM Code-Review Policy and Practices](https://llvm.org/docs/CodeReview.html) document contains practical information about the PR process, including how patches are reviewed and accepted, and who can review a PR. +* Our [LLVM Developer Policy](https://llvm.org/docs/DeveloperPolicy.html) describes our expectations for code quality and commit summaries, and contains notes on our CI system.
-If this is not working for you, it is probably because you do not have write permissions for the repository. In which case you can instead tag reviewers by name in a comment by using `@` followed by their GitHub username. +Please reply to this message to confirm that you have read these policies, especially the LLVM AI Tool Use Policy, and that any AI tool usage has been noted in the PR description. -If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers. +--- + +### Frequently asked questions + +**How do I add reviewers?** + +This PR will be automatically labeled, and the relevant teams will be notified. For some parts of the project, reviewers may also be added automatically. + +You can also add reviewers manually using the **Reviewers** section on this page. If you cannot use that section, it is probably because you do not have write permissions for the repository. In that case, you can request a review by tagging reviewers in a comment using `@` followed by their GitHub username. + +**What if there are no comments?** + +If you have not received any comments on your PR after a week, you can request a review by pinging the PR with a comment such as “Ping”. The common courtesy ping rate is once a week. Please remember that you are asking for volunteer time from other developers. + +**Are any special GitHub settings required to contribute to LLVM?** + +We only require contributors to have a public email address associated with their GitHub commits; see this [section](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) of the LLVM Developer Policy for details. + +--- -If you have further questions, they may be answered by the [LLVM GitHub User Guide](https://llvm.org/docs/GitHub.html).
+If you have questions, feel free to leave a comment on this PR, or ask on [LLVM Discord](https://discord.com/invite/xS7Z362) or [LLVM Discourse](https://discourse.llvm.org/). -You can also ask questions in a comment on this PR, on the [LLVM Discord](https://discord.com/invite/xS7Z362) or on the [forums](https://discourse.llvm.org/).""" +Thank you, +The LLVM Community""" self.pr.as_issue().create_comment(comment) return True From c3196408e97213692e62d406acbba70b74fa1c3e Mon Sep 17 00:00:00 2001 From: SunilKuravinakop <98882378+SunilKuravinakop@users.noreply.github.com> Date: Mon, 11 May 2026 13:45:32 +0530 Subject: [PATCH 231/538] [flang] dummy arguments used as function calls (#196426) Adding an error when a dummy argument is used as a statement function. ``` SUBROUTINE a(foo) foo(c) = 0 END SUBROUTINE a ``` This PR now points out: 1) Dummy argument 'foo' may not be used as a statement function 2) 'foo' is not a callable procedure Handles issue [196424](https://github.com/llvm/llvm-project/issues/196424) --------- Co-authored-by: Sunil Kuravinakop --- flang/lib/Semantics/resolve-names.cpp | 5 +++++ flang/test/Semantics/stmt-func01.f90 | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f81eaf11618c4..b6c2e32b16a0e 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -4838,6 +4838,11 @@ bool SubprogramVisitor::HandleStmtFunction(const parser::StmtFunctionStmt &x) { name.symbol = nullptr; } else if (auto *entity{ultimate.detailsIf()}; entity && !ultimate.has()) { + if (entity->isDummy()) { + Say(name, + "Dummy argument '%s' may not be used as a statement function"_err_en_US); + return false; + } resultType = entity->type(); ultimate.details() = UnknownDetails{}; // will be replaced below } else { diff --git a/flang/test/Semantics/stmt-func01.f90 b/flang/test/Semantics/stmt-func01.f90 index d8ef9af25b389..d56aedc324e81 
100644 --- a/flang/test/Semantics/stmt-func01.f90 +++ b/flang/test/Semantics/stmt-func01.f90 @@ -98,3 +98,22 @@ subroutine s5 !ERROR: 'k' is already declared in this scoping unit k() = 0.0 end + +subroutine s6(b) + !ERROR: Dummy argument 'b' may not be used as a statement function + !ERROR: 'b' is not a callable procedure + b(c) = 0 +end + +subroutine s7 + entry e7(b) + !ERROR: Dummy argument 'b' may not be used as a statement function + !ERROR: 'b' is not a callable procedure + b(c) = 0 +end + +subroutine s8(p) + external p + !ERROR: 'p' has not been declared as an array or pointer-valued function + p(c) = 0 +end From b71b576baf618a000fb795f77512e07fe24cae86 Mon Sep 17 00:00:00 2001 From: jofrn <165626406+jofrn@users.noreply.github.com> Date: Mon, 11 May 2026 01:18:02 -0700 Subject: [PATCH 232/538] [SelectionDAG] Split vector types for atomic load (#165818) Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half. 
--- .../include/llvm/Target/TargetSelectionDAG.td | 14 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 34 ++ llvm/test/CodeGen/X86/atomic-load-store.ll | 353 +++++++++++++++++- 4 files changed, 398 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index ec8fbd84d5166..35848f76897b3 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -2111,6 +2111,20 @@ def atomic_load_64 : let MemoryVT = i64; } +def atomic_load_128_v2i64 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v2i64; +} + +def atomic_load_128_v4i32 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v4i32; +} + def atomic_load_nonext_8 : PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> { let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 84d5b454ba28e..a7d9974faee61 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -942,6 +942,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d363cf91f0e6c..81e9e301f2572 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1350,6 +1350,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_STEP_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; + case ISD::ATOMIC_LOAD: + SplitVecRes_ATOMIC_LOAD(cast(N), Lo, Hi); + break; case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; @@ -2347,6 +2350,37 @@ void DAGTypeLegalizer::SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, } } +void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, + SDValue &Hi) { + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Extended load during type legalization!"); + SDLoc dl(LD); + EVT VT = LD->getValueType(0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + EVT 
MemIntVT = + EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits()); + SDValue ALD = DAG.getAtomicLoad(LD->getExtensionType(), dl, MemIntVT, IntVT, + Ch, Ptr, LD->getMemOperand()); + + EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits()); + EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits()); + SDValue ExtractLo, ExtractHi; + SplitInteger(ALD, LoIntVT, HiIntVT, ExtractLo, ExtractHi); + + Lo = DAG.getBitcast(LoVT, ExtractLo); + Hi = DAG.getBitcast(HiVT, ExtractHi); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1)); +} + void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi) { assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!"); diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 7e15b9303887f..407a29e162b41 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3,CHECK-SSE2-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3,CHECK-SSE4-O3 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s 
--check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0,CHECK-SSE2-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0,CHECK-SSE4-O0 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0 @@ -295,6 +295,97 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { ret <2 x float> %ret } +define <2 x half> @atomic_vec2_half(ptr %x) { +; CHECK-SSE-O3-LABEL: atomic_vec2_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movl (%rdi), %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: shrl $16, %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movl (%rdi), %eax +; CHECK-SSE-O0-NEXT: movl %eax, %ecx +; CHECK-SSE-O0-NEXT: shrl $16, %ecx +; CHECK-SSE-O0-NEXT: movw %cx, %dx +; CHECK-SSE-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE-O0-NEXT: movw %dx, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, 
%xmm1 +; CHECK-SSE-O0-NEXT: movw %ax, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x half>, ptr %x acquire, align 4 + ret <2 x half> %ret +} +define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) { +; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movl (%rdi), %eax +; CHECK-SSE-O3-NEXT: movl %eax, %ecx +; CHECK-SSE-O3-NEXT: shrl $16, %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm0 +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movl (%rdi), %eax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: shrl $16, %eax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movl (%rdi), %eax +; CHECK-SSE-O0-NEXT: movl %eax, %ecx +; CHECK-SSE-O0-NEXT: shrl $16, %ecx +; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-SSE-O0-NEXT: movw %ax, %dx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %dx, %ax +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE-O0-NEXT: pinsrw $0, 
%eax, %xmm1 +; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movl (%rdi), %eax +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: shrl $16, %eax +; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4 + ret <2 x bfloat> %ret +} define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { ; CHECK-O3-LABEL: atomic_vec1_ptr: ; CHECK-O3: # %bb.0: @@ -585,6 +676,260 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind { ret <4 x i16> %ret } +define <4 x half> @atomic_vec4_half(ptr %x) nounwind { +; CHECK-SSE2-O3-LABEL: atomic_vec4_half: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O3-NEXT: movl %eax, %ecx +; CHECK-SSE2-O3-NEXT: shrl $16, %ecx +; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O3-NEXT: movq %rax, %rcx +; CHECK-SSE2-O3-NEXT: shrq $32, %rcx +; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE2-O3-NEXT: shrq $48, %rax +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_half: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O3-NEXT: movl %eax, %ecx +; CHECK-SSE4-O3-NEXT: shrl $16, %ecx +; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm1 +; 
CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE4-O3-NEXT: movq %rax, %rcx +; CHECK-SSE4-O3-NEXT: shrq $32, %rcx +; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE4-O3-NEXT: shrq $48, %rax +; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_half: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O0-NEXT: movl %eax, %ecx +; CHECK-SSE2-O0-NEXT: shrl $16, %ecx +; CHECK-SSE2-O0-NEXT: movw %cx, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE2-O0-NEXT: movw %ax, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm0 +; CHECK-SSE2-O0-NEXT: movq %rax, %rcx +; CHECK-SSE2-O0-NEXT: shrq $32, %rcx +; CHECK-SSE2-O0-NEXT: movw %cx, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE2-O0-NEXT: shrq $48, %rax +; CHECK-SSE2-O0-NEXT: movw %ax, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %cx, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm3 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; CHECK-SSE2-O0-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_half: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O0-NEXT: movl %eax, %ecx +; CHECK-SSE4-O0-NEXT: shrl $16, %ecx +; CHECK-SSE4-O0-NEXT: movw %cx, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE4-O0-NEXT: movw %ax, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm0 +; CHECK-SSE4-O0-NEXT: movq %rax, %rcx +; CHECK-SSE4-O0-NEXT: shrq $32, %rcx +; CHECK-SSE4-O0-NEXT: movw %cx, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE4-O0-NEXT: shrq $48, %rax +; CHECK-SSE4-O0-NEXT: movw %ax, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %cx, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm3 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x half>, ptr %x acquire, align 8 + ret <4 x half> %ret +} +define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind { +; CHECK-SSE2-O3-LABEL: atomic_vec4_bfloat: +; 
CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O3-NEXT: movl %eax, %ecx +; CHECK-SSE2-O3-NEXT: shrl $16, %ecx +; CHECK-SSE2-O3-NEXT: movq %rax, %rdx +; CHECK-SSE2-O3-NEXT: shrq $32, %rdx +; CHECK-SSE2-O3-NEXT: movl %eax, %esi +; CHECK-SSE2-O3-NEXT: shrq $48, %rax +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %edx, %xmm2 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %esi, %xmm0 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm3 +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_bfloat: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O3-NEXT: movl %eax, %ecx +; CHECK-SSE4-O3-NEXT: shrl $16, %ecx +; CHECK-SSE4-O3-NEXT: movq %rax, %rdx +; CHECK-SSE4-O3-NEXT: shrq $32, %rdx +; CHECK-SSE4-O3-NEXT: movl %eax, %esi +; CHECK-SSE4-O3-NEXT: shrq $48, %rax +; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE4-O3-NEXT: pinsrw $0, %edx, %xmm2 +; CHECK-SSE4-O3-NEXT: pinsrw $0, %esi, %xmm0 +; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm3 +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movq (%rdi), %rax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: movq %rax, %rcx +; CHECK-AVX-O3-NEXT: shrq $48, %rcx +; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: movq %rax, %rcx 
+; CHECK-AVX-O3-NEXT: shrq $32, %rcx +; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: shrl $16, %eax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_bfloat: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O0-NEXT: movl %eax, %ecx +; CHECK-SSE2-O0-NEXT: shrl $16, %ecx +; CHECK-SSE2-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-SSE2-O0-NEXT: movw %ax, %dx +; CHECK-SSE2-O0-NEXT: movq %rax, %rsi +; CHECK-SSE2-O0-NEXT: shrq $32, %rsi +; CHECK-SSE2-O0-NEXT: # kill: def $si killed $si killed $rsi +; CHECK-SSE2-O0-NEXT: shrq $48, %rax +; CHECK-SSE2-O0-NEXT: movw %ax, %di +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %di, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %si, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %dx, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %cx, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm2 +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_bfloat: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O0-NEXT: movl %eax, %ecx +; CHECK-SSE4-O0-NEXT: shrl $16, %ecx +; CHECK-SSE4-O0-NEXT: # 
kill: def $cx killed $cx killed $ecx +; CHECK-SSE4-O0-NEXT: movw %ax, %dx +; CHECK-SSE4-O0-NEXT: movq %rax, %rsi +; CHECK-SSE4-O0-NEXT: shrq $32, %rsi +; CHECK-SSE4-O0-NEXT: # kill: def $si killed $si killed $rsi +; CHECK-SSE4-O0-NEXT: shrq $48, %rax +; CHECK-SSE4-O0-NEXT: movw %ax, %di +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %di, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %si, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %dx, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %cx, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm2 +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_bfloat: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movq (%rdi), %rax +; CHECK-AVX-O0-NEXT: movq %rax, %rcx +; CHECK-AVX-O0-NEXT: shrq $48, %rcx +; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: movq %rax, %rcx +; CHECK-AVX-O0-NEXT: shrq $32, %rcx +; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-AVX-O0-NEXT: shrl $16, %eax +; CHECK-AVX-O0-NEXT: # 
kill: def $ax killed $ax killed $eax +; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8 + ret <4 x bfloat> %ret +} + define <4 x float> @atomic_vec4_float(ptr %x) nounwind { ; CHECK-SSE-O3-LABEL: atomic_vec4_float: ; CHECK-SSE-O3: # %bb.0: From d48575ff7eca98f5f730d41736980698831848ce Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 11 May 2026 10:25:02 +0200 Subject: [PATCH 233/538] Add support for Ubuntu 26.10 - Stonking Stingray (#196896) Co-authored-by: Oliver Reiche --- clang/include/clang/Driver/Distro.h | 3 ++- clang/lib/Driver/Distro.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index 0e17b30eb7e8d..aa4b3e1280920 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -74,6 +74,7 @@ class Distro { UbuntuPlucky, UbuntuQuesting, UbuntuResolute, + UbuntuStonking, UnknownDistro }; @@ -125,7 +126,7 @@ class Distro { } bool IsUbuntu() const { - return DistroVal >= UbuntuQuantal && DistroVal <= UbuntuResolute; + return DistroVal >= UbuntuQuantal && DistroVal <= UbuntuStonking; } bool IsAlpineLinux() const { return DistroVal == AlpineLinux; } diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index 37be87b57b9a7..585497e998372 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -89,6 +89,7 @@ static Distro::DistroType DetectLsbRelease(llvm::vfs::FileSystem &VFS) { .Case("plucky", Distro::UbuntuPlucky) .Case("questing", Distro::UbuntuQuesting) .Case("resolute", Distro::UbuntuResolute) + .Case("stonking", Distro::UbuntuStonking) .Default(Distro::UnknownDistro); return Version; } From d1b0c854354cb9bc0033b126fa1f71b2a6c721b2 Mon Sep 17 00:00:00 2001 From: Zeyi Xu Date: Mon, 11 May 2026 16:28:36 +0800 Subject: [PATCH 234/538] [clang-tidy] 
Remove hicpp modules [2/4] (#196870) This is part two of removing the hicpp-* checks. RFC: https://discourse.llvm.org/t/rfc-regarding-the-current-status-of-hicpp-checks/89883 Part of https://github.com/llvm/llvm-project/issues/183462 --- .../clang-tidy/hicpp/HICPPTidyModule.cpp | 25 ----------- clang-tools-extra/clangd/TidyFastChecks.inc | 9 ---- clang-tools-extra/clangd/TidyProvider.cpp | 2 - clang-tools-extra/docs/ReleaseNotes.rst | 27 ++++++++---- .../checks/hicpp/invalid-access-moved.rst | 9 ---- .../clang-tidy/checks/hicpp/member-init.rst | 9 ---- .../checks/hicpp/move-const-arg.rst | 9 ---- .../checks/hicpp/named-parameter.rst | 9 ---- .../checks/hicpp/new-delete-operators.rst | 8 ---- .../checks/hicpp/no-array-decay.rst | 10 ----- .../clang-tidy/checks/hicpp/noexcept-move.rst | 8 ---- .../checks/hicpp/signed-bitwise.rst | 9 ---- .../docs/clang-tidy/checks/list.rst | 8 ---- ...ate-conflicted-fixes-of-alias-checkers.cpp | 21 ++++------ .../duplicate-fixes-of-alias-checkers.cpp | 42 ++++++------------- 15 files changed, 41 insertions(+), 164 deletions(-) delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/invalid-access-moved.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/member-init.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/move-const-arg.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/named-parameter.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/new-delete-operators.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/no-array-decay.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/noexcept-move.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/signed-bitwise.rst diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp index 3679b70ab2117..8d08e0cb87046 100644 --- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp +++ 
b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp @@ -8,16 +8,11 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" -#include "../bugprone/SignedBitwiseCheck.h" #include "../bugprone/UndelegatedConstructorCheck.h" #include "../bugprone/UnhandledCodePathsCheck.h" -#include "../bugprone/UseAfterMoveCheck.h" #include "../cppcoreguidelines/NoMallocCheck.h" -#include "../cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h" -#include "../cppcoreguidelines/ProTypeMemberInitCheck.h" #include "../cppcoreguidelines/ProTypeVarargCheck.h" #include "../cppcoreguidelines/SpecialMemberFunctionsCheck.h" -#include "../misc/NewDeleteOverloadsCheck.h" #include "../misc/StaticAssertCheck.h" #include "../modernize/UseAutoCheck.h" #include "../modernize/UseEmplaceCheck.h" @@ -26,10 +21,7 @@ #include "../modernize/UseNoexceptCheck.h" #include "../modernize/UseNullptrCheck.h" #include "../modernize/UseOverrideCheck.h" -#include "../performance/MoveConstArgCheck.h" -#include "../performance/NoexceptMoveConstructorCheck.h" #include "../portability/NoAssemblerCheck.h" -#include "../readability/NamedParameterCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" namespace clang::tidy { @@ -41,23 +33,6 @@ class HICPPModule : public ClangTidyModule { void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { CheckFactories.registerCheck( "hicpp-multiway-paths-covered"); - CheckFactories.registerCheck( - "hicpp-signed-bitwise"); - CheckFactories.registerCheck( - "hicpp-named-parameter"); - CheckFactories.registerCheck( - "hicpp-invalid-access-moved"); - CheckFactories.registerCheck( - "hicpp-member-init"); - CheckFactories.registerCheck( - "hicpp-move-const-arg"); - CheckFactories.registerCheck( - "hicpp-new-delete-operators"); - CheckFactories.registerCheck( - "hicpp-noexcept-move"); - CheckFactories - .registerCheck( - "hicpp-no-array-decay"); CheckFactories.registerCheck( "hicpp-no-assembler"); CheckFactories.registerCheck( diff --git 
a/clang-tools-extra/clangd/TidyFastChecks.inc b/clang-tools-extra/clangd/TidyFastChecks.inc index c1a72bae035b7..e289e3cce99a5 100644 --- a/clang-tools-extra/clangd/TidyFastChecks.inc +++ b/clang-tools-extra/clangd/TidyFastChecks.inc @@ -272,17 +272,9 @@ FAST(google-runtime-float, 1.0) FAST(google-runtime-int, 2.0) FAST(google-runtime-operator, 1.0) FAST(google-upgrade-googletest-case, 1.0) -FAST(hicpp-invalid-access-moved, 9.0) -FAST(hicpp-member-init, 2.0) -FAST(hicpp-move-const-arg, 2.0) FAST(hicpp-multiway-paths-covered, -0.0) -FAST(hicpp-named-parameter, 1.0) -FAST(hicpp-new-delete-operators, 1.0) -FAST(hicpp-no-array-decay, 2.0) FAST(hicpp-no-assembler, 1.0) FAST(hicpp-no-malloc, 1.0) -FAST(hicpp-noexcept-move, -0.0) -FAST(hicpp-signed-bitwise, -1.0) FAST(hicpp-special-member-functions, -1.0) FAST(hicpp-static-assert, 2.0) FAST(hicpp-undelegated-constructor, 1.0) @@ -492,4 +484,3 @@ FAST(zircon-temporary-objects, 1.0) #undef FAST #undef SLOW - diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp index bfb0835af2245..aae7d6b126c5a 100644 --- a/clang-tools-extra/clangd/TidyProvider.cpp +++ b/clang-tools-extra/clangd/TidyProvider.cpp @@ -218,8 +218,6 @@ TidyProvider disableUnusableChecks(llvm::ArrayRef ExtraBadChecks) { // code, which is often the case when clangd // tries to build an AST. "-bugprone-use-after-move", - // Alias for bugprone-use-after-move. - "-hicpp-invalid-access-moved", // Check uses dataflow analysis, which might hang/crash unexpectedly on // incomplete code. "-bugprone-unchecked-optional-access", diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c5a6857c7077f..241ae52cffdd2 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -58,9 +58,9 @@ Potentially Breaking Changes - Removed the :program:`clang-tidy` ``hicpp`` module. All checks have been moved to the other modules. 
Use the replacement checks instead: - ================================== ========================================================= + ================================== ======================================================================== Removed check Replacement check - ================================== ========================================================= + ================================== ======================================================================== ``hicpp-avoid-c-arrays`` :doc:`modernize-avoid-c-arrays ` ``hicpp-avoid-goto`` :doc:`cppcoreguidelines-avoid-goto @@ -77,7 +77,23 @@ Potentially Breaking Changes ` ``hicpp-ignored-remove-result`` :doc:`bugprone-unused-return-value ` - ================================== ========================================================= + ``hicpp-invalid-access-moved`` :doc:`bugprone-use-after-move + ` + ``hicpp-member-init`` :doc:`cppcoreguidelines-pro-type-member-init + ` + ``hicpp-move-const-arg`` :doc:`performance-move-const-arg + ` + ``hicpp-named-parameter`` :doc:`readability-named-parameter + ` + ``hicpp-new-delete-operators`` :doc:`misc-new-delete-overloads + ` + ``hicpp-no-array-decay`` :doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay + ` + ``hicpp-noexcept-move`` :doc:`performance-noexcept-move-constructor + ` + ``hicpp-signed-bitwise`` :doc:`bugprone-signed-bitwise + ` + ================================== ======================================================================== Improvements to clangd ---------------------- @@ -258,11 +274,6 @@ New check aliases `. The `hicpp-no-assembler` name is kept as an alias. -- Renamed :doc:`hicpp-signed-bitwise ` - to :doc:`bugprone-signed-bitwise - `. The `hicpp-signed-bitwise` - name is kept as an alias. 
- - Renamed :doc:`performance-faster-string-find ` to :doc:`performance-prefer-single-char-overloads diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/invalid-access-moved.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/invalid-access-moved.rst deleted file mode 100644 index 05f7968287d8b..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/invalid-access-moved.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-invalid-access-moved - -hicpp-invalid-access-moved -========================== - -This check is an alias for -:doc:`bugprone-use-after-move <../bugprone/use-after-move>`. - -Implements parts of the `rule 8.4.1 `_ to check if moved-from objects are accessed. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/member-init.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/member-init.rst deleted file mode 100644 index fca19f4d407f9..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/member-init.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-member-init - -hicpp-member-init -================= - -This check is an alias for :doc:`cppcoreguidelines-pro-type-member-init <../cppcoreguidelines/pro-type-member-init>`. -Implements the check for -`rule 12.4.2 `_ -to initialize class members in the right order. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/move-const-arg.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/move-const-arg.rst deleted file mode 100644 index 6818e7b24a9dd..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/move-const-arg.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-move-const-arg - -hicpp-move-const-arg -==================== - -The `hicpp-move-const-arg` check is an alias, please see -:doc:`performance-move-const-arg <../performance/move-const-arg>` -for more information. -It enforces the `rule 17.3.1 `_. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/named-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/named-parameter.rst deleted file mode 100644 index 93b51400cb04c..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/named-parameter.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-named-parameter - -hicpp-named-parameter -===================== - -This check is an alias for :doc:`readability-named-parameter -<../readability/named-parameter>`. - -Implements `rule 8.2.1 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/new-delete-operators.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/new-delete-operators.rst deleted file mode 100644 index dcf3c756f9224..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/new-delete-operators.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-new-delete-operators - -hicpp-new-delete-operators -========================== - -This check is an alias for :doc:`misc-new-delete-overloads <../misc/new-delete-overloads>`. -Implements `rule 12.3.1 `_ to ensure -the `new` and `delete` operators have the correct signature. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-array-decay.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-array-decay.rst deleted file mode 100644 index 9a84fde7b0aae..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-array-decay.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. title:: clang-tidy - hicpp-no-array-decay - -hicpp-no-array-decay -==================== - -The `hicpp-no-array-decay` check is an alias, please see -:doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay -<../cppcoreguidelines/pro-bounds-array-to-pointer-decay>` -for more information. -It enforces the `rule 4.1.1 `_. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/noexcept-move.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/noexcept-move.rst deleted file mode 100644 index dee62948f3393..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/noexcept-move.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-noexcept-move - -hicpp-noexcept-move -=================== - -This check is an alias for :doc:`performance-noexcept-move-constructor -<../performance/noexcept-move-constructor>`. -Checks `rule 12.5.4 `_ to mark move assignment and move construction `noexcept`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/signed-bitwise.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/signed-bitwise.rst deleted file mode 100644 index dca5279534fd7..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/signed-bitwise.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-signed-bitwise - -hicpp-signed-bitwise -==================== - -The `hicpp-signed-bitwise` check is an alias, please see -`bugprone-signed-bitwise <../bugprone/signed-bitwise.html>`_ for more -information. 
- diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index f193c0920ec1b..f80df56d1a4ad 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -603,17 +603,9 @@ Check aliases :doc:`google-readability-casting `, :doc:`modernize-avoid-c-style-cast `, "Yes" :doc:`google-readability-function-size `, :doc:`readability-function-size `, :doc:`google-readability-namespace-comments `, :doc:`llvm-namespace-comment `, - :doc:`hicpp-invalid-access-moved `, :doc:`bugprone-use-after-move `, - :doc:`hicpp-member-init `, :doc:`cppcoreguidelines-pro-type-member-init `, "Yes" - :doc:`hicpp-move-const-arg `, :doc:`performance-move-const-arg `, "Yes" :doc:`hicpp-multiway-paths-covered `, :doc:`bugprone-unhandled-code-paths `, - :doc:`hicpp-named-parameter `, :doc:`readability-named-parameter `, "Yes" - :doc:`hicpp-new-delete-operators `, :doc:`misc-new-delete-overloads `, - :doc:`hicpp-no-array-decay `, :doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay `, :doc:`hicpp-no-assembler `, :doc:`portability-no-assembler `, :doc:`hicpp-no-malloc `, :doc:`cppcoreguidelines-no-malloc `, - :doc:`hicpp-noexcept-move `, :doc:`performance-noexcept-move-constructor `, "Yes" - :doc:`hicpp-signed-bitwise `, :doc:`bugprone-signed-bitwise `, :doc:`hicpp-special-member-functions `, :doc:`cppcoreguidelines-special-member-functions `, :doc:`hicpp-static-assert `, :doc:`misc-static-assert `, "Yes" :doc:`hicpp-undelegated-constructor `, :doc:`bugprone-undelegated-constructor `, diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp index 365908107e563..7549bcafd41ff 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp +++ 
b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp @@ -1,23 +1,20 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-type-member-init,hicpp-member-init,modernize-use-emplace,hicpp-use-emplace %t -- \ +// RUN: %check_clang_tidy %s cppcoreguidelines-use-default-member-init,modernize-use-default-member-init %t -- \ //// RUN: -config='{CheckOptions: { \ -//// RUN: cppcoreguidelines-pro-type-member-init.UseAssignment: true, \ +//// RUN: cppcoreguidelines-use-default-member-init.UseAssignment: true, \ //// RUN: }}' class Foo { public: - Foo() : _num1(0) - // CHECK-MESSAGES: warning: constructor does not initialize these fields: _num2 [cppcoreguidelines-pro-type-member-init,hicpp-member-init] - // CHECK-MESSAGES: note: cannot apply fix-it because an alias checker has suggested a different fix-it; please remove one of the checkers ('cppcoreguidelines-pro-type-member-init', 'hicpp-member-init') or ensure they are both configured the same - { - _num1 = 10; - } + Foo() : _num(0) + // CHECK-MESSAGES: warning: use default member initializer for '_num' [cppcoreguidelines-use-default-member-init,modernize-use-default-member-init] + // CHECK-MESSAGES: note: cannot apply fix-it because an alias checker has suggested a different fix-it; please remove one of the checkers ('cppcoreguidelines-use-default-member-init', 'modernize-use-default-member-init') or ensure they are both configured the same + {} int use_the_members() const { - return _num1 + _num2; + return _num; } private: - int _num1; - int _num2; - // CHECK-FIXES: int _num2; + int _num; + // CHECK-FIXES: int _num; }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp index 78a903a2feec0..4c5d5a957ff7e 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp +++ 
b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp @@ -1,39 +1,23 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-type-member-init,hicpp-member-init,modernize-use-emplace,hicpp-use-emplace %t - -namespace std { - -template -class vector { -public: - void push_back(const T &) {} - void push_back(T &&) {} - - template - void emplace_back(Args &&... args){}; -}; -} // namespace std +// RUN: %check_clang_tidy %s cppcoreguidelines-use-default-member-init,modernize-use-default-member-init,cppcoreguidelines-explicit-constructor,misc-explicit-constructor %t class Foo { public: - Foo() : _num1(0) - // CHECK-MESSAGES: warning: constructor does not initialize these fields: _num2 [cppcoreguidelines-pro-type-member-init,hicpp-member-init] - { - _num1 = 10; - } + Foo() : _num(0) + // CHECK-MESSAGES: warning: use default member initializer for '_num' [cppcoreguidelines-use-default-member-init,modernize-use-default-member-init] + {} int use_the_members() const { - return _num1 + _num2; + return _num; } private: - int _num1; - int _num2; - // CHECK-FIXES: int _num2{}; + int _num; + // CHECK-FIXES: int _num{0}; }; -void should_use_emplace(std::vector &v) { - v.push_back(Foo()); - // CHECK-FIXES: v.emplace_back(); - // CHECK-MESSAGES: warning: use emplace_back instead of push_back [hicpp-use-emplace,modernize-use-emplace] -} - +class Bar { +public: + Bar(int); + // CHECK-MESSAGES: warning: single-argument constructors must be marked explicit to avoid unintentional implicit conversions [cppcoreguidelines-explicit-constructor,misc-explicit-constructor] + // CHECK-FIXES: explicit Bar(int); +}; From 4211fdec02d8be76b4cbe35aede70e0e08350653 Mon Sep 17 00:00:00 2001 From: Jacob Crawley Date: Mon, 11 May 2026 10:33:29 +0100 Subject: [PATCH 235/538] [LV] Handle FSub Partial Reductions (#191186) Introduces a new RecurKind value 'FSub' in order to handle partial reductions of floating point values. 
This is done by following the existing method for integer partial reductions, doing a positive accumulation followed by a final subtraction in the middle block. --- llvm/include/llvm/Analysis/IVDescriptors.h | 5 + llvm/lib/Analysis/IVDescriptors.cpp | 36 +- .../AArch64/AArch64TargetTransformInfo.cpp | 5 +- llvm/lib/Transforms/Utils/LoopUtils.cpp | 4 + .../Transforms/Vectorize/LoopVectorize.cpp | 24 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 8 +- .../AArch64/partial-reduce-fsub-chained.ll | 38 +++ .../partial-reduce-sub-epilogue-vec.ll | 317 ++++++++++++++++++ .../AArch64/partial-reduce-sub.ll | 139 ++++++++ 11 files changed, 577 insertions(+), 17 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 05c58d1a20afb..2120eb8cd9914 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -46,6 +46,8 @@ enum class RecurKind { UMin, ///< Unsigned integer min implemented in terms of select(cmp()). UMax, ///< Unsigned integer max implemented in terms of select(cmp()). FAdd, ///< Sum of floats. + FAddChainWithSubs, ///< A chain of fadds and fsubs. + FSub, ///< Subtraction of floats. FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). @@ -245,6 +247,9 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is a floating point kind. LLVM_ABI static bool isFloatingPointRecurrenceKind(RecurKind Kind); + /// Returns true if the recurrence kind is for a sub operation. + LLVM_ABI static bool isSubRecurrenceKind(RecurKind Kind); + /// Returns true if the recurrence kind is an integer min/max kind. 
static bool isIntMinMaxRecurrenceKind(RecurKind Kind) { return Kind == RecurKind::UMin || Kind == RecurKind::UMax || diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 7549e14366d2c..22a519026f63f 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -92,6 +92,10 @@ static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT, return Phi; } +bool RecurrenceDescriptor::isSubRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::Sub || Kind == RecurKind::FSub; +} + /// Compute the minimal bit width needed to represent a reduction whose exit /// instruction is given by Exit. static std::pair computeRecurrenceType(Instruction *Exit, @@ -1009,13 +1013,18 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, return InstDesc(Kind == RecurKind::FMul, I, I->hasAllowReassoc() ? nullptr : I); case Instruction::FSub: + return InstDesc(Kind == RecurKind::FSub || + Kind == RecurKind::FAddChainWithSubs, + I, I->hasAllowReassoc() ? nullptr : I); case Instruction::FAdd: - return InstDesc(Kind == RecurKind::FAdd, I, - I->hasAllowReassoc() ? nullptr : I); + return InstDesc(Kind == RecurKind::FAdd || + Kind == RecurKind::FAddChainWithSubs, + I, I->hasAllowReassoc() ? nullptr : I); case Instruction::Select: - if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul || - Kind == RecurKind::Add || Kind == RecurKind::Mul || - Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) + if (isSubRecurrenceKind(Kind) || Kind == RecurKind::FAdd || + Kind == RecurKind::FMul || Kind == RecurKind::Add || + Kind == RecurKind::Mul || Kind == RecurKind::AddChainWithSubs || + Kind == RecurKind::FAddChainWithSubs) return isConditionalRdxPattern(I); if (isFindRecurrenceKind(Kind) && SE) return isFindPattern(L, OrigPhi, I, *SE); @@ -1104,10 +1113,20 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." 
<< *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FSub, TheLoop, RedDes, DB, AC, DT, SE)) { + LLVM_DEBUG(dbgs() << "Found an FSub reduction PHI." << *Phi << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FAddChainWithSubs, TheLoop, RedDes, DB, + AC, DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a chained FAdd-FSub reduction PHI." + << *Phi << "\n"); + return true; + } if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); @@ -1224,8 +1243,11 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMul: return Instruction::FMul; case RecurKind::FMulAdd: + case RecurKind::FAddChainWithSubs: case RecurKind::FAdd: return Instruction::FAdd; + case RecurKind::FSub: + return Instruction::FSub; case RecurKind::SMax: case RecurKind::SMin: case RecurKind::UMax: @@ -1302,6 +1324,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { Kind == RecurKind::AddChainWithSubs) return true; + if (Cur->getOpcode() == Instruction::FSub && + Kind == RecurKind::FAddChainWithSubs) + return true; + return Cur->getOpcode() == getOpcode(); }; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 2e3e7b73ba390..9d86cf7aec45c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5668,6 +5668,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( switch (RdxDesc.getRecurrenceKind()) { case RecurKind::Sub: case RecurKind::AddChainWithSubs: + case RecurKind::FAddChainWithSubs: case RecurKind::Add: case RecurKind::FAdd: case RecurKind::And: @@ -5989,7 +5990,7 @@ InstructionCost
AArch64TTIImpl::getPartialReductionCost( return Invalid; if ((Opcode != Instruction::Add && Opcode != Instruction::Sub && - Opcode != Instruction::FAdd) || + Opcode != Instruction::FAdd && Opcode != Instruction::FSub) || OpAExtend == TTI::PR_None) return Invalid; @@ -6056,7 +6057,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( NEONPred); }; - bool IsSub = Opcode == Instruction::Sub; + bool IsSub = (Opcode == Instruction::Sub) || (Opcode == Instruction::FSub); InstructionCost Cost = InputLT.first * TTI::TCC_Basic; // Integer partial sub-reductions that don't map to a specific instruction, // carry an extra cost for implementing a double negation: diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index cc965cb36c36e..f0586e4f0f464 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1110,6 +1110,8 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { case RecurKind::Xor: return Intrinsic::vector_reduce_xor; case RecurKind::FMulAdd: + case RecurKind::FAddChainWithSubs: + case RecurKind::FSub: case RecurKind::FAdd: return Intrinsic::vector_reduce_fadd; case RecurKind::FMul: @@ -1567,6 +1569,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, case RecurKind::FMaximumNum: return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); case RecurKind::FMulAdd: + case RecurKind::FAddChainWithSubs: + case RecurKind::FSub: case RecurKind::FAdd: return Builder.CreateFAddReduce(getIdentity(), Src); case RecurKind::FMul: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1ace2275e2b6d..e77e5024e2cc4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7767,16 +7767,28 @@ static SmallVector preparePlanForEpilogueVectorLoop( // reduction start value in a final subtraction. 
Update it to use the // resume value from the main vector loop. if (PhiR->getVFScaleFactor() > 1 && - PhiR->getRecurrenceKind() == RecurKind::Sub) { + RecurrenceDescriptor::isSubRecurrenceKind( + PhiR->getRecurrenceKind())) { auto *Sub = cast(RdxResult->getSingleUser()); - assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode"); + assert((Sub->getOpcode() == Instruction::Sub || + Sub->getOpcode() == Instruction::FSub) && + "Unexpected opcode"); assert(isa(Sub->getOperand(0)) && "Expected operand to match the original start value of the " "reduction"); - assert(VPlanPatternMatch::match(VPI->getOperand(0), - VPlanPatternMatch::m_ZeroInt()) && - "Expected start value for partial sub-reduction to start at " - "zero"); + // For integer sub-reductions, verify start value is zero. + // For FP sub-reductions, verify start value is negative zero. + [[maybe_unused]] auto StartValueIsIdentity = [&] { + Value *IdentityValue = getRecurrenceIdentity( + PhiR->getRecurrenceKind(), ResumeV->getType(), + PhiR->getFastMathFlags()); + auto *StartValue = dyn_cast(VPI->getOperand(0)); + return StartValue && StartValue->getValue() == IdentityValue; + }; + assert(StartValueIsIdentity() && + "Expected start value for partial sub-reduction to be zero " + "(or negative zero)"); + Sub->setOperand(0, StartVal); } else VPI->setOperand(0, StartVal); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f1a6eb2d7e8af..c41170758efd5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -29540,6 +29540,8 @@ class HorizontalReduction { // res = vv break; case RecurKind::Sub: + case RecurKind::FSub: + case RecurKind::FAddChainWithSubs: case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: @@ -29691,6 +29693,8 @@ class HorizontalReduction { // res = vv return VectorizedValue; case RecurKind::Sub: + case RecurKind::FSub: + case 
RecurKind::FAddChainWithSubs: case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: @@ -29794,6 +29798,8 @@ class HorizontalReduction { return Builder.CreateFMul(VectorizedValue, Scale); } case RecurKind::Sub: + case RecurKind::FSub: + case RecurKind::FAddChainWithSubs: case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 11a91dcd46867..25b2853d002a7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -578,6 +578,14 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { } } +static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind) { + if (Kind == RecurKind::Sub) + return Instruction::Add; + if (Kind == RecurKind::FSub) + return Instruction::FAdd; + llvm_unreachable("RecurKind should be Sub/FSub."); +} + Value *VPInstruction::generate(VPTransformState &State) { IRBuilderBase &Builder = State.Builder; @@ -791,8 +799,8 @@ Value *VPInstruction::generate(VPTransformState &State) { // For sub-recurrences, each part's reduction variable is already // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) Instruction::BinaryOps Opcode = - RK == RecurKind::Sub - ? Instruction::Add + RecurrenceDescriptor::isSubRecurrenceKind(RK) + ? 
getSubRecurOpcode(RK) : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); ReducedPartRdx = Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 32d89a34105a4..6b50494ea364b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -6013,6 +6013,9 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, ExtendedOp = NegRecipe; } + assert((Chain.RK != RecurKind::FAddChainWithSubs) && + "FSub chain reduction isn't supported"); + // FIXME: Do these transforms before invoking the cost-model. ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo); @@ -6064,7 +6067,7 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, // If this is the last value in a sub-reduction chain, then update the PHI // node to start at `0` and update the reduction-result to subtract from // the PHI's start value. - if (Chain.RK != RecurKind::Sub) + if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub) return; VPValue *OldStartValue = StartInst->getOperand(0); @@ -6075,7 +6078,8 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, assert(RdxResult && "Could not find reduction result"); VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult); - constexpr unsigned SubOpc = Instruction::BinaryOps::Sub; + unsigned SubOpc = Chain.RK == RecurKind::FSub ? 
Instruction::BinaryOps::FSub + : Instruction::BinaryOps::Sub; VPInstruction *NewResult = Builder.createNaryOp( SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc), RdxPhi->getDebugLoc()); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll new file mode 100644 index 0000000000000..f22be2c5fd977 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll @@ -0,0 +1,38 @@ +; REQUIRES: asserts +; RUN: not --crash opt -passes=loop-vectorize -S %s 2>&1 | FileCheck %s --check-prefix=ASSERTION + +; Tests a partial reduction with an fadd->fsub chain. +; There's an assertion preventing this type of partial reduction from +; being generated as the current codegen for this case is incorrect. + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; ASSERTION: (Chain.RK != RecurKind::FAddChainWithSubs) +define float @fadd_fsub_reduction(float %startval, ptr %src1, ptr %src2, ptr %src3) #0 { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %accum = phi float [ %startval, %entry ], [ %sub, %loop ] + %src1.gep = getelementptr half, ptr %src1, i32 %iv + %src1.load = load half, ptr %src1.gep, align 4 + %src1.load.ext = fpext half %src1.load to float + %src2.gep = getelementptr half, ptr %src2, i32 %iv + %src2.load = load half, ptr %src2.gep, align 4 + %src2.load.ext = fpext half %src2.load to float + %src3.gep = getelementptr half, ptr %src3, i32 %iv + %src3.load = load half, ptr %src3.gep, align 4 + %src3.load.ext = fpext half %src3.load to float + %mul1 = fmul reassoc contract float %src1.load.ext, %src2.load.ext + %add = fadd reassoc contract float %accum, %mul1 + %mul2 = fmul reassoc contract float %src3.load.ext, %src1.load.ext + %sub = fsub reassoc contract float %add, %mul2 + %iv.next = add i32 %iv, 1 + 
%exitcond.not = icmp eq i32 %iv, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret float %sub +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll index 8db3a3294d7bc..21c9b9cde13ba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll @@ -185,4 +185,321 @@ exit: ret i32 %sub } +define float @fsub_reduction(float %startval, ptr %src1, ptr %src2) #0 { +; CHECK-EPI-LABEL: define float @fsub_reduction( +; CHECK-EPI-SAME: float [[STARTVAL:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; CHECK-EPI-NEXT: [[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]: +; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK1:.*]] +; CHECK-EPI: [[VECTOR_MAIN_LOOP_ITER_CHECK1]]: +; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] +; CHECK-EPI: [[VECTOR_PH1]]: +; CHECK-EPI-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-EPI: [[VECTOR_PH]]: +; CHECK-EPI-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_PH]] ] +; CHECK-EPI-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_PH]] ] +; CHECK-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX1]] +; CHECK-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <16 x half>, ptr [[TMP0]], align 4 +; CHECK-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX1]] +; CHECK-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x half>, ptr [[TMP1]], align 4 +; CHECK-EPI-NEXT: [[TMP5:%.*]] = fpext <16 x half> [[WIDE_LOAD]] to <16 x float> +; CHECK-EPI-NEXT: [[TMP10:%.*]] = fpext <16 x half> [[WIDE_LOAD1]] to <16 x float> +; CHECK-EPI-NEXT: [[TMP11:%.*]] = fmul reassoc contract <16 x float> [[TMP5]], [[TMP10]] +; 
CHECK-EPI-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <8 x float> @llvm.vector.partial.reduce.fadd.v8f32.v16f32(<8 x float> [[VEC_PHI]], <16 x float> [[TMP11]]) +; CHECK-EPI-NEXT: [[INDEX_NEXT1]] = add nuw i32 [[INDEX1]], 16 +; CHECK-EPI-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT1]], 32 +; CHECK-EPI-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_PH]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-EPI: [[MIDDLE_BLOCK1]]: +; CHECK-EPI-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[PARTIAL_REDUCE]]) +; CHECK-EPI-NEXT: [[TMP7:%.*]] = fsub float [[STARTVAL]], [[TMP6]] +; CHECK-EPI-NEXT: br i1 false, label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK-EPI: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] +; CHECK-EPI: [[VEC_EPILOG_PH]]: +; CHECK-EPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] +; CHECK-EPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] +; CHECK-EPI-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-EPI: [[VECTOR_BODY]]: +; CHECK-EPI-NEXT: [[INDEX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-EPI-NEXT: [[VEC_PHI3:%.*]] = phi <2 x float> [ splat (float -0.000000e+00), %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-EPI-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX]] +; CHECK-EPI-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x half>, ptr [[TMP8]], align 4 +; CHECK-EPI-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX]] +; CHECK-EPI-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x half>, ptr [[TMP9]], align 4 +; CHECK-EPI-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[WIDE_LOAD4]] to <4 x float> +; 
CHECK-EPI-NEXT: [[TMP3:%.*]] = fpext <4 x half> [[WIDE_LOAD5]] to <4 x float> +; CHECK-EPI-NEXT: [[TMP4:%.*]] = fmul reassoc contract <4 x float> [[TMP2]], [[TMP3]] +; CHECK-EPI-NEXT: [[PARTIAL_REDUCE6]] = call reassoc contract <2 x float> @llvm.vector.partial.reduce.fadd.v2f32.v4f32(<2 x float> [[VEC_PHI3]], <4 x float> [[TMP4]]) +; CHECK-EPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-EPI-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 +; CHECK-EPI-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-EPI: [[MIDDLE_BLOCK]]: +; CHECK-EPI-NEXT: [[TMP14:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[PARTIAL_REDUCE6]]) +; CHECK-EPI-NEXT: [[TMP15:%.*]] = fsub float [[BC_MERGE_RDX]], [[TMP14]] +; CHECK-EPI-NEXT: br i1 false, label %[[EXIT]], label %[[VEC_EPILOG_VECTOR_BODY]] +; CHECK-EPI: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-EPI-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-EPI-NEXT: br label %[[LOOP:.*]] +; CHECK-EPI: [[LOOP]]: +; CHECK-EPI-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-EPI-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[SUB:%.*]], %[[LOOP]] ] +; CHECK-EPI-NEXT: [[SRC1_GEP:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[IV]] +; CHECK-EPI-NEXT: [[SRC1_LOAD:%.*]] = load half, ptr [[SRC1_GEP]], align 4 +; CHECK-EPI-NEXT: [[SRC1_LOAD_EXT:%.*]] = fpext half [[SRC1_LOAD]] to float +; CHECK-EPI-NEXT: [[SRC2_GEP:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[IV]] +; CHECK-EPI-NEXT: [[SRC2_LOAD:%.*]] = load half, ptr [[SRC2_GEP]], align 4 +; 
CHECK-EPI-NEXT: [[SRC2_LOAD_EXT:%.*]] = fpext half [[SRC2_LOAD]] to float +; CHECK-EPI-NEXT: [[MUL:%.*]] = fmul reassoc contract float [[SRC1_LOAD_EXT]], [[SRC2_LOAD_EXT]] +; CHECK-EPI-NEXT: [[SUB]] = fsub reassoc contract float [[ACCUM]], [[MUL]] +; CHECK-EPI-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-EPI-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 40 +; CHECK-EPI-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-EPI: [[EXIT]]: +; CHECK-EPI-NEXT: [[SUB_LCSSA:%.*]] = phi float [ [[SUB]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK1]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] +; CHECK-EPI-NEXT: ret float [[SUB_LCSSA]] +; +; CHECK-PARTIAL-RED-EPI-LABEL: define float @fsub_reduction( +; CHECK-PARTIAL-RED-EPI-SAME: float [[STARTVAL:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; CHECK-PARTIAL-RED-EPI-NEXT: [[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]: +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK1:.*]] +; CHECK-PARTIAL-RED-EPI: [[VECTOR_MAIN_LOOP_ITER_CHECK1]]: +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] +; CHECK-PARTIAL-RED-EPI: [[VECTOR_PH1]]: +; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-PARTIAL-RED-EPI: [[VECTOR_PH]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_PH]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE1:%.*]], %[[VECTOR_PH]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX1]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x half>, ptr [[TMP8]], align 4 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX1]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x half>, ptr [[TMP9]], align 4 +; 
CHECK-PARTIAL-RED-EPI-NEXT: [[TMP10:%.*]] = fpext <16 x half> [[WIDE_LOAD2]] to <16 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP11:%.*]] = fpext <16 x half> [[WIDE_LOAD3]] to <16 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP12:%.*]] = fmul reassoc contract <16 x float> [[TMP10]], [[TMP11]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE1]] = call reassoc contract <8 x float> @llvm.vector.partial.reduce.fadd.v8f32.v16f32(<8 x float> [[VEC_PHI1]], <16 x float> [[TMP12]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX_NEXT1]] = add nuw i32 [[INDEX1]], 16 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT1]], 32 +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_PH]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-PARTIAL-RED-EPI: [[MIDDLE_BLOCK1]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP14:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[PARTIAL_REDUCE1]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP7:%.*]] = fsub float [[STARTVAL]], [[TMP14]] +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] +; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_PH]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-PARTIAL-RED-EPI: [[VECTOR_BODY]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float 
-0.000000e+00), %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 4 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 4 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP4:%.*]] = fmul reassoc contract <8 x float> [[TMP2]], [[TMP3]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-PARTIAL-RED-EPI: [[MIDDLE_BLOCK]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP15:%.*]] = fsub float [[BC_MERGE_RDX]], [[TMP6]] +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[EXIT]], label %[[VEC_EPILOG_VECTOR_BODY]] +; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; 
CHECK-PARTIAL-RED-EPI-NEXT: br label %[[LOOP:.*]] +; CHECK-PARTIAL-RED-EPI: [[LOOP]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[SUB:%.*]], %[[LOOP]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC1_GEP:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[IV]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC1_LOAD:%.*]] = load half, ptr [[SRC1_GEP]], align 4 +; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC1_LOAD_EXT:%.*]] = fpext half [[SRC1_LOAD]] to float +; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC2_GEP:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[IV]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC2_LOAD:%.*]] = load half, ptr [[SRC2_GEP]], align 4 +; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC2_LOAD_EXT:%.*]] = fpext half [[SRC2_LOAD]] to float +; CHECK-PARTIAL-RED-EPI-NEXT: [[MUL:%.*]] = fmul reassoc contract float [[SRC1_LOAD_EXT]], [[SRC2_LOAD_EXT]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[SUB]] = fsub reassoc contract float [[ACCUM]], [[MUL]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-PARTIAL-RED-EPI-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 40 +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-PARTIAL-RED-EPI: [[EXIT]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[SUB_LCSSA:%.*]] = phi float [ [[SUB]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK1]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: ret float [[SUB_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %accum = phi float [ %startval, %entry ], [ %sub, %loop ] + %src1.gep = getelementptr half, ptr %src1, i32 %iv + %src1.load = load half, ptr %src1.gep, align 4 + %src1.load.ext = fpext half %src1.load to float + %src2.gep = getelementptr half, ptr %src2, i32 %iv + %src2.load = load half, ptr %src2.gep, align 
4 + %src2.load.ext = fpext half %src2.load to float + %mul = fmul reassoc contract float %src1.load.ext, %src2.load.ext + %sub = fsub reassoc contract float %accum, %mul + %iv.next = add i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, 40 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret float %sub +} + +define float @fsub_reduction_nsz(ptr %a, ptr %b, ptr %c, i64 %n) { +; CHECK-EPI-LABEL: define float @fsub_reduction_nsz( +; CHECK-EPI-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-EPI-NEXT: [[ITER_CHECK:.*]]: +; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK-EPI: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-EPI: [[VECTOR_PH]]: +; CHECK-EPI-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-EPI: [[VECTOR_BODY]]: +; CHECK-EPI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-EPI-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-EPI-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] +; CHECK-EPI-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP2]], align 2 +; CHECK-EPI-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-EPI-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-EPI-NEXT: [[TMP5:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP3]], [[TMP4]] +; CHECK-EPI-NEXT: [[PARTIAL_REDUCE:%.*]] = call reassoc nsz contract <4 x float> 
@llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP5]]) +; CHECK-EPI-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-EPI-NEXT: [[TMP7:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP6]], [[TMP4]] +; CHECK-EPI-NEXT: [[PARTIAL_REDUCE3]] = call reassoc nsz contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE]], <8 x float> [[TMP7]]) +; CHECK-EPI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-EPI-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-EPI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-EPI: [[MIDDLE_BLOCK]]: +; CHECK-EPI-NEXT: [[TMP9:%.*]] = call reassoc nsz contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PARTIAL_REDUCE3]]) +; CHECK-EPI-NEXT: [[TMP10:%.*]] = fsub float 0.000000e+00, [[TMP9]] +; CHECK-EPI-NEXT: br i1 true, label %[[FOR_EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK-EPI: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-EPI-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF10:![0-9]+]] +; CHECK-EPI: [[VEC_EPILOG_PH]]: +; CHECK-EPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-EPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-EPI-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK-EPI: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-EPI-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-EPI-NEXT: [[VEC_PHI5:%.*]] = phi <2 x float> [ zeroinitializer, %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-EPI-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX4]] +; 
CHECK-EPI-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x half>, ptr [[TMP11]], align 2 +; CHECK-EPI-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX4]] +; CHECK-EPI-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x half>, ptr [[TMP12]], align 2 +; CHECK-EPI-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX4]] +; CHECK-EPI-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x half>, ptr [[TMP13]], align 2 +; CHECK-EPI-NEXT: [[TMP14:%.*]] = fpext <4 x half> [[WIDE_LOAD7]] to <4 x float> +; CHECK-EPI-NEXT: [[TMP15:%.*]] = fpext <4 x half> [[WIDE_LOAD6]] to <4 x float> +; CHECK-EPI-NEXT: [[TMP16:%.*]] = fmul reassoc nsz contract <4 x float> [[TMP14]], [[TMP15]] +; CHECK-EPI-NEXT: [[PARTIAL_REDUCE9:%.*]] = call reassoc nsz contract <2 x float> @llvm.vector.partial.reduce.fadd.v2f32.v4f32(<2 x float> [[VEC_PHI5]], <4 x float> [[TMP16]]) +; CHECK-EPI-NEXT: [[TMP17:%.*]] = fpext <4 x half> [[WIDE_LOAD8]] to <4 x float> +; CHECK-EPI-NEXT: [[TMP18:%.*]] = fmul reassoc nsz contract <4 x float> [[TMP17]], [[TMP15]] +; CHECK-EPI-NEXT: [[PARTIAL_REDUCE10]] = call reassoc nsz contract <2 x float> @llvm.vector.partial.reduce.fadd.v2f32.v4f32(<2 x float> [[PARTIAL_REDUCE9]], <4 x float> [[TMP18]]) +; CHECK-EPI-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 +; CHECK-EPI-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 1024 +; CHECK-EPI-NEXT: br i1 [[TMP19]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-EPI: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-EPI-NEXT: [[TMP20:%.*]] = call reassoc nsz contract float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[PARTIAL_REDUCE10]]) +; CHECK-EPI-NEXT: [[TMP21:%.*]] = fsub float [[BC_MERGE_RDX]], [[TMP20]] +; CHECK-EPI-NEXT: br i1 true, label %[[FOR_EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK-EPI: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
%[[ITER_CHECK]] ] +; CHECK-EPI-NEXT: [[BC_MERGE_RDX12:%.*]] = phi float [ [[TMP21]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ] +; CHECK-EPI-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-EPI: [[FOR_BODY]]: +; CHECK-EPI-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-EPI-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX12]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SUB:%.*]], %[[FOR_BODY]] ] +; CHECK-EPI-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-EPI-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-EPI-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-EPI-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-EPI-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 2 +; CHECK-EPI-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-EPI-NEXT: [[GEP_C:%.*]] = getelementptr half, ptr [[C]], i64 [[IV]] +; CHECK-EPI-NEXT: [[LOAD_C:%.*]] = load half, ptr [[GEP_C]], align 2 +; CHECK-EPI-NEXT: [[EXT_C:%.*]] = fpext half [[LOAD_C]] to float +; CHECK-EPI-NEXT: [[MUL_AB:%.*]] = fmul reassoc nsz contract float [[EXT_B]], [[EXT_A]] +; CHECK-EPI-NEXT: [[MUL_AC:%.*]] = fmul reassoc nsz contract float [[EXT_C]], [[EXT_A]] +; CHECK-EPI-NEXT: [[SUB_AB:%.*]] = fsub reassoc nsz contract float [[ACCUM]], [[MUL_AB]] +; CHECK-EPI-NEXT: [[SUB]] = fsub reassoc nsz contract float [[SUB_AB]], [[MUL_AC]] +; CHECK-EPI-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-EPI-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-EPI-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-EPI: [[FOR_EXIT]]: +; CHECK-EPI-NEXT: [[SUB_LCSSA:%.*]] = phi float [ [[SUB]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-EPI-NEXT: ret float [[SUB_LCSSA]] +; +; 
CHECK-PARTIAL-RED-EPI-LABEL: define float @fsub_reduction_nsz( +; CHECK-PARTIAL-RED-EPI-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-PARTIAL-RED-EPI-NEXT: [[ENTRY:.*:]] +; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-PARTIAL-RED-EPI: [[VECTOR_PH]]: +; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-PARTIAL-RED-EPI: [[VECTOR_BODY]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP2]], align 2 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP5:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP3]], [[TMP4]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE:%.*]] = call reassoc nsz contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP5]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP7:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP6]], [[TMP4]] +; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE3]] = call reassoc nsz contract <4 x float> 
@llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE]], <8 x float> [[TMP7]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-PARTIAL-RED-EPI: [[MIDDLE_BLOCK]]: +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP9:%.*]] = call reassoc nsz contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PARTIAL_REDUCE3]]) +; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP10:%.*]] = fsub float 0.000000e+00, [[TMP9]] +; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-PARTIAL-RED-EPI: [[FOR_EXIT]]: +; CHECK-PARTIAL-RED-EPI-NEXT: ret float [[TMP10]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %sub, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 2 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 2 + %ext.b = fpext half %load.b to float + %gep.c = getelementptr half, ptr %c, i64 %iv + %load.c = load half, ptr %gep.c, align 2 + %ext.c = fpext half %load.c to float + %mul.ab = fmul nsz reassoc contract float %ext.b, %ext.a + %mul.ac = fmul nsz reassoc contract float %ext.c, %ext.a + %sub.ab = fsub nsz reassoc contract float %accum, %mul.ab + %sub = fsub nsz reassoc contract float %sub.ab, %mul.ac + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: + ret float %sub +} + attributes #0 = { vscale_range(1,16) "target-features"="+sve" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.width", i32 16} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll index 2aea67f6e5499..f7ed72f3a45e5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -269,6 +269,145 @@ exit: ret i64 %sub } +define float @fdotp_fsub(ptr %a, ptr %b, ptr %c) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @fdotp_fsub( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP8]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul reassoc contract <8 x float> [[TMP2]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE1:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = fpext <8 x half> 
[[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = fmul reassoc contract <8 x float> [[TMP9]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE1]], <8 x float> [[TMP10]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = fsub float 0.000000e+00, [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret float [[TMP7]] +; +; CHECK-INTERLEAVED-LABEL: define float @fdotp_fsub( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[TMP0]], i64 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], 
align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr half, ptr [[TMP2]], i64 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x half>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x half>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr half, ptr [[TMP16]], i64 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP16]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x half>, ptr [[TMP17]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[WIDE_LOAD3]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fmul reassoc contract <8 x float> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x half> [[WIDE_LOAD4]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul reassoc contract <8 x float> [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI1]], <8 x float> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = fpext <8 x half> [[WIDE_LOAD5]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fmul reassoc contract <8 x float> [[TMP18]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> 
@llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE1]], <8 x float> [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = fpext <8 x half> [[WIDE_LOAD6]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = fmul reassoc contract <8 x float> [[TMP14]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE7]], <8 x float> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = fsub float 0.000000e+00, [[TMP11]] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret float [[TMP12]] +; +; CHECK-MAXBW-LABEL: define float @fdotp_fsub( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; 
CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP8]], align 2 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = fmul reassoc contract <8 x float> [[TMP2]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE1:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = fmul reassoc contract <8 x float> [[TMP9]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE1]], <8 x float> [[TMP10]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fsub float 0.000000e+00, [[TMP6]] +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret float [[TMP7]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %sub, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 2 + %ext.a = fpext half %load.a to float + 
%gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 2 + %ext.b = fpext half %load.b to float + %gep.c = getelementptr half, ptr %c, i64 %iv + %load.c = load half, ptr %gep.c, align 2 + %ext.c = fpext half %load.c to float + %mul.ab = fmul reassoc contract float %ext.b, %ext.a + %mul.ac = fmul reassoc contract float %ext.c, %ext.a + %sub.ab = fsub reassoc contract float %accum, %mul.ab + %sub = fsub reassoc contract float %sub.ab, %mul.ac + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: + ret float %sub +} + !7 = distinct !{!7, !8, !9, !10} !8 = !{!"llvm.loop.mustprogress"} !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} From 2726c0f030a005adb24c0ddf9ecd0b80f3ff1e53 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 11 May 2026 10:35:00 +0100 Subject: [PATCH 236/538] [LV][NFC] Remove instcombine pass from RUN lines of simple tests (#196257) Most of the work done by the instcombine pass on these files involves canonicalising GEPs and shuffling code around. I don't believe there is any value running instcombine in these cases. 
--- .../AArch64/aarch64-predication.ll | 6 +-- .../sve2-histcnt-no-scalar-interleave.ll | 6 +-- .../AArch64/sve2-histcnt-too-many-deps.ll | 22 ++++---- .../LoopVectorize/AMDGPU/packed-math.ll | 31 +++++------ .../LoopVectorize/PowerPC/vsx-tsvc-s173.ll | 54 +++++++++---------- .../PowerPC/widened-massv-vfabi-attr.ll | 6 +-- .../LoopVectorize/reduction-inloop-min-max.ll | 18 +++---- .../LoopVectorize/runtime-check-readonly.ll | 8 +-- 8 files changed, 76 insertions(+), 75 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll index c3348f61e6dd7..d5b96fbd4ce4c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 ; REQUIRES: asserts ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -24,7 +24,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[PRED_UDIV_CONTINUE2]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [8 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load 
<2 x i64>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0 @@ -53,9 +53,9 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP16]]) ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP16]]) ; CHECK-NEXT: ret i64 [[TMP18]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-no-scalar-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-no-scalar-interleave.ll index 2b3e124b952ef..bc75e8000c334 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-no-scalar-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-no-scalar-interleave.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=2 -force-vector-width=1 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=2 -force-vector-width=1 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s ; REQUIRES: asserts ;; Make sure we don't interleave a histogram when vectorization is disabled. 
@@ -14,10 +14,10 @@ define void @simple_histogram_forced_scalar_interleave(ptr noalias %buckets, ptr ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll index 3d80b02ae585b..aeadbf7cd8f45 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 3 -; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=1 -max-dependences=2 -debug-only=loop-vectorize,loop-accesses -S 2>&1 | FileCheck %s -; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=1 -debug-only=loop-vectorize,loop-accesses -S 2>&1 | FileCheck %s --check-prefix=NORMAL_DEP_LIMIT +; RUN: opt < %s 
-mattr=+sve2 -passes=loop-vectorize -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=1 -max-dependences=2 -debug-only=loop-vectorize,loop-accesses -S 2>&1 | FileCheck %s +; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=1 -debug-only=loop-vectorize,loop-accesses -S 2>&1 | FileCheck %s --check-prefix=NORMAL_DEP_LIMIT ; REQUIRES: asserts target triple = "aarch64-unknown-linux-gnu" @@ -20,17 +20,17 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64 -; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[L_BUCKET]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 -; CHECK-NEXT: [[IDX_ADDR:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[ARRAY]], i64 [[IV]] +; CHECK-NEXT: [[IDX_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4 -; CHECK-NEXT: [[GEP_OTHER:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[OTHER]], i64 [[IV]] +; CHECK-NEXT: [[GEP_OTHER:%.*]] = getelementptr inbounds i32, ptr 
[[OTHER]], i64 [[IV]] ; CHECK-NEXT: [[L_OTHER:%.*]] = load i32, ptr [[GEP_OTHER]], align 4 ; CHECK-NEXT: [[ADD_OTHER:%.*]] = add i32 [[L_OTHER]], [[IV_TRUNC]] ; CHECK-NEXT: store i32 [[ADD_OTHER]], ptr [[GEP_OTHER]], align 4 @@ -78,14 +78,14 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT: vector.body: ; NORMAL_DEP_LIMIT-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; NORMAL_DEP_LIMIT-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ] -; NORMAL_DEP_LIMIT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[IV]] +; NORMAL_DEP_LIMIT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; NORMAL_DEP_LIMIT-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4, !alias.scope [[META0:![0-9]+]] ; NORMAL_DEP_LIMIT-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD]] to -; NORMAL_DEP_LIMIT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP11]] +; NORMAL_DEP_LIMIT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP11]] ; NORMAL_DEP_LIMIT-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP12]], i32 1, splat (i1 true)) -; NORMAL_DEP_LIMIT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[ARRAY]], i64 [[IV]] +; NORMAL_DEP_LIMIT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]] ; NORMAL_DEP_LIMIT-NEXT: store [[VEC_IND]], ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META5:![0-9]+]] -; NORMAL_DEP_LIMIT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i8], ptr [[OTHER]], i64 [[IV]] +; NORMAL_DEP_LIMIT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[OTHER]], i64 [[IV]] ; NORMAL_DEP_LIMIT-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP14]], align 4, !alias.scope [[META7:![0-9]+]], !noalias [[META0]] ; NORMAL_DEP_LIMIT-NEXT: [[TMP15:%.*]] = add [[WIDE_LOAD10]], [[VEC_IND]] ; 
NORMAL_DEP_LIMIT-NEXT: store [[TMP15]], ptr [[TMP14]], align 4, !alias.scope [[META7]], !noalias [[META0]] @@ -94,7 +94,7 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; NORMAL_DEP_LIMIT-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NORMAL_DEP_LIMIT: middle.block: -; NORMAL_DEP_LIMIT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; NORMAL_DEP_LIMIT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NORMAL_DEP_LIMIT-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; NORMAL_DEP_LIMIT: scalar.ph: ; diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll index 8f298bd678014..c803fc4cf308b 100644 --- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -passes=loop-vectorize,instcombine -S | FileCheck -check-prefix=GFX9 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -passes=loop-vectorize,instcombine -S | FileCheck -check-prefix=VI %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -passes=loop-vectorize,instcombine -S | FileCheck -check-prefix=CI %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -passes=loop-vectorize -S | FileCheck -check-prefix=GFX9 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -passes=loop-vectorize -S | FileCheck -check-prefix=VI %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -passes=loop-vectorize -S | FileCheck -check-prefix=CI %s define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; GFX9-LABEL: @vectorize_v2f16_loop( @@ -13,8 +13,8 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; 
GFX9-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; GFX9-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; GFX9-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; GFX9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2 x i8], ptr addrspace(1) [[S:%.*]], i64 [[INDEX]] -; GFX9-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[TMP0]], i64 4 +; GFX9-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDEX]] +; GFX9-NEXT: [[TMP1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP0]], i64 2 ; GFX9-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2 ; GFX9-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP1]], align 2 ; GFX9-NEXT: [[TMP2]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]] @@ -23,10 +23,10 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; GFX9-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; GFX9-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; GFX9: middle.block: -; GFX9-NEXT: br label [[FOR_BODY:%.*]] -; GFX9: for.end: ; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]] ; GFX9-NEXT: [[ADD_LCSSA:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0.000000e+00, <2 x half> [[BIN_RDX]]) +; GFX9-NEXT: br label [[FOR_END:%.*]] +; GFX9: for.end: ; GFX9-NEXT: ret half [[ADD_LCSSA]] ; ; VI-LABEL: @vectorize_v2f16_loop( @@ -38,8 +38,8 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; VI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VI-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; VI-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP3:%.*]], [[VECTOR_BODY]] ] -; VI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2 x i8], ptr addrspace(1) [[S:%.*]], i64 [[INDEX]] -; VI-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[TMP0]], i64 4 +; VI-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDEX]] +; VI-NEXT: [[TMP1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP0]], i64 2 ; VI-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2 ; VI-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP1]], align 2 ; VI-NEXT: [[TMP2]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]] @@ -48,11 +48,11 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; VI-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; VI-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VI: middle.block: -; VI-NEXT: br label [[FOR_BODY:%.*]] -; VI: for.end: ; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]] -; VI-NEXT: [[ADD_LCSSA:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0.000000e+00, <2 x half> [[BIN_RDX]]) -; VI-NEXT: ret half [[ADD_LCSSA]] +; VI-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0.000000e+00, <2 x half> [[BIN_RDX]]) +; VI-NEXT: br label [[FOR_END:%.*]] +; VI: for.end: +; VI-NEXT: ret half [[TMP5]] ; ; CI-LABEL: @vectorize_v2f16_loop( ; CI-NEXT: entry: @@ -60,14 +60,15 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; CI: for.body: ; CI-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CI-NEXT: [[Q_04:%.*]] = phi half [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [2 x i8], ptr addrspace(1) [[S:%.*]], i64 [[INDVARS_IV]] +; CI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDVARS_IV]] ; CI-NEXT: [[TMP0:%.*]] = 
load half, ptr addrspace(1) [[ARRAYIDX]], align 2 ; CI-NEXT: [[ADD]] = fadd fast half [[Q_04]], [[TMP0]] ; CI-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CI-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256 ; CI-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CI: for.end: -; CI-NEXT: ret half [[ADD]] +; CI-NEXT: [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], [[FOR_BODY]] ] +; CI-NEXT: ret half [[ADD_LCSSA]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll index 3ba1f64e4a2f7..7516261e4fdf6 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -passes=loop-vectorize,instcombine -S | FileCheck %s +; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -passes=loop-vectorize -S | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -24,14 +24,14 @@ define signext i32 @s173() { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr @global_data, i64 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 64 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 96 -; CHECK-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 112 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], ptr @global_data, i64 0, i32 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 12 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 24 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 28 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 @@ -40,14 +40,14 @@ define signext i32 @s173() { ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i8], ptr getelementptr inbounds nuw (i8, ptr @global_data, i64 128016), i64 [[INDEX]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 16 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 48 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 64 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 80 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 96 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 112 +; CHECK-NEXT: 
[[TMP9:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 12 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 20 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 24 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 28 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP10]], align 4 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP11]], align 4 @@ -64,15 +64,15 @@ define signext i32 @s173() { ; CHECK-NEXT: [[TMP22:%.*]] = fadd <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD13]] ; CHECK-NEXT: [[TMP23:%.*]] = fadd <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD14]] ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD15]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr [4 x i8], ptr @global_data, i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64000 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64016 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64032 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64048 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64064 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64080 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64096 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP25]], i64 64112 +; CHECK-NEXT: [[TMP25:%.*]] = add nsw i64 [[INDEX]], 16000 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds 
[[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 0, i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 8 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 16 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 20 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 24 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 28 ; CHECK-NEXT: store <4 x float> [[TMP17]], ptr [[TMP26]], align 4 ; CHECK-NEXT: store <4 x float> [[TMP18]], ptr [[TMP27]], align 4 ; CHECK-NEXT: store <4 x float> [[TMP19]], ptr [[TMP28]], align 4 @@ -87,7 +87,7 @@ define signext i32 @s173() { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[INC11]] = add nuw nsw i32 [[NL_022]], 1 +; CHECK-NEXT: [[INC11]] = add nsw i32 [[NL_022]], 1 ; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr @ntimes, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP35]], 10 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC11]], [[MUL]] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll index c5ca52b96bed8..7053edddd730c 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -vectorizer-maximize-bandwidth -mtriple=powerpc64le-- -S \ -; RUN: -passes=loop-simplify,loop-rotate,loop-vectorize,instcombine -force-vector-interleave=1 < %s | FileCheck %s +; RUN: -passes=loop-simplify,loop-rotate,loop-vectorize -force-vector-interleave=1 < %s | FileCheck %s 
define double @test(ptr %Arr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: @@ -11,7 +11,7 @@ define double @test(ptr %Arr) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[ARR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> ; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @__sind2_P8(<2 x double> [[TMP2]]) @@ -20,11 +20,11 @@ define double @test(ptr %Arr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP4]]) ; CHECK-NEXT: ret double [[TMP6]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll index f0598f10b3907..e6c80d34ab3a0 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 
-force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s define i32 @reduction_smin(ptr nocapture %A, ptr nocapture %B) { ; CHECK-LABEL: define i32 @reduction_smin @@ -11,7 +11,7 @@ define i32 @reduction_smin(ptr nocapture %A, ptr nocapture %B) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP1]], i32 [[VEC_PHI]]) @@ -53,7 +53,7 @@ define i32 @reduction_smin_select_ops_flipped(ptr nocapture %A, ptr nocapture %B ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 [[VEC_PHI]]) @@ -95,16 +95,16 @@ define i32 @reduction_smin_intrinsic(ptr nocapture %A, ptr nocapture %B) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1000), [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; entry: @@ -137,7 +137,7 @@ define i32 @reduction_umax(ptr nocapture %A, ptr nocapture %B) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 [[VEC_PHI]]) @@ -179,7 +179,7 @@ define i32 @reduction_umax_select_ops_flipped(ptr nocapture %A, ptr nocapture %B ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = 
phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 [[VEC_PHI]]) @@ -221,16 +221,16 @@ define i32 @reduction_umax_intrinsic(ptr nocapture %A, ptr nocapture %B) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1000), [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll index 0417ca88beb98..df3d9f97ff140 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ 
b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "vector.ph\:" --version 5 -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -7,11 +7,11 @@ define void @add_ints(ptr nocapture %A, ptr nocapture %B, ptr nocapture %C) { ; CHECK-LABEL: define void @add_ints( ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[C:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[C3:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64 ; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: -; CHECK-NEXT: [[A1:%.*]] = ptrtoaddr ptr [[A]] to i64 -; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 -; CHECK-NEXT: [[C3:%.*]] = ptrtoaddr ptr [[C]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[C3]] From e87ae8da5267002e05cc0452f98187d83a9a2b5e Mon Sep 17 00:00:00 2001 From: Gergo Stomfai Date: Mon, 11 May 2026 10:36:49 +0100 Subject: [PATCH 237/538] [GISel][X86] port X86PreLegalizerCombiner to npm (#182638) Porting X86PreLegalizerCombiner to npm as part of llvm/llvm-project#178192 --- .../X86/GISel/X86PreLegalizerCombiner.cpp | 88 +++++++++++++------ llvm/lib/Target/X86/X86.h | 12 ++- llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 6 ++ llvm/lib/Target/X86/X86PassRegistry.def | 1 + llvm/lib/Target/X86/X86TargetMachine.cpp | 
4 +- .../GlobalISel/prelegalizer-combiner-div.mir | 1 + .../prelegalizer-combiner-identity.mir | 1 + .../GlobalISel/prelegalizer-combiner-lshr.mir | 1 + .../GlobalISel/prelegalizer-combiner-mul.mir | 1 + .../GlobalISel/prelegalizer-combiner-or.mir | 1 + .../prelegalizer-combiner-ptr-add.mir | 1 + .../GlobalISel/prelegalizer-combiner-rem.mir | 1 + .../GlobalISel/prelegalizer-combiner-sub.mir | 1 + 13 files changed, 86 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp b/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp index 4aa58f0876466..d52bc3e19932a 100644 --- a/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp @@ -40,6 +40,16 @@ using namespace MIPatternMatch; namespace { +CombinerInfo createCombinerInfo(bool EnableOpt, const Function &F) { + CombinerInfo CInfo(/*AllowIllegalOps=*/true, /*ShouldLegalizeIllegal=*/false, + nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); + + // This is the first Combiner, so the input IR might contain dead + // instructions. 
+ CInfo.EnableFullDCE = true; + return CInfo; +} + #define GET_GICOMBINER_TYPES #include "X86GenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES @@ -55,8 +65,7 @@ class X86PreLegalizerCombinerImpl : public Combiner { MachineFunction &MF, CombinerInfo &CInfo, GISelValueTracking &VT, GISelCSEInfo *CSEInfo, const X86PreLegalizerCombinerImplRuleConfig &RuleConfig, - const X86Subtarget &STI, MachineDominatorTree *MDT, - const LegalizerInfo *LI); + MachineDominatorTree *MDT); static const char *getName() { return "X86PreLegalizerCombiner"; } @@ -78,10 +87,11 @@ X86PreLegalizerCombinerImpl::X86PreLegalizerCombinerImpl( MachineFunction &MF, CombinerInfo &CInfo, GISelValueTracking &VT, GISelCSEInfo *CSEInfo, const X86PreLegalizerCombinerImplRuleConfig &RuleConfig, - const X86Subtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) + MachineDominatorTree *MDT) : Combiner(MF, CInfo, &VT, CSEInfo), - Helper(Observer, B, /*IsPreLegalize=*/true, &VT, MDT, LI), - RuleConfig(RuleConfig), STI(STI), + Helper(Observer, B, /*IsPreLegalize=*/true, &VT, MDT, + MF.getSubtarget().getLegalizerInfo()), + RuleConfig(RuleConfig), STI(MF.getSubtarget()), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "X86GenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS @@ -92,13 +102,15 @@ bool X86PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { return tryCombineAllImpl(MI); } -class X86PreLegalizerCombiner : public MachineFunctionPass { +class X86PreLegalizerCombinerLegacy : public MachineFunctionPass { public: static char ID; - X86PreLegalizerCombiner(); + X86PreLegalizerCombinerLegacy(); - StringRef getPassName() const override { return "X86PreLegalizerCombiner"; } + StringRef getPassName() const override { + return "X86PreLegalizerCombinerLegacy"; + } bool runOnMachineFunction(MachineFunction &MF) override; @@ -109,7 +121,7 @@ class X86PreLegalizerCombiner : public MachineFunctionPass { }; } // end anonymous namespace -void 
X86PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { +void X86PreLegalizerCombinerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); @@ -122,12 +134,13 @@ void X86PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -X86PreLegalizerCombiner::X86PreLegalizerCombiner() : MachineFunctionPass(ID) { +X86PreLegalizerCombinerLegacy::X86PreLegalizerCombinerLegacy() + : MachineFunctionPass(ID) { if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } -bool X86PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { +bool X86PreLegalizerCombinerLegacy::runOnMachineFunction(MachineFunction &MF) { if (MF.getProperties().hasFailedISel()) return false; auto &TPC = getAnalysis(); @@ -136,10 +149,6 @@ bool X86PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelCSEAnalysisWrapper &Wrapper = getAnalysis().getCSEWrapper(); auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig()); - - const X86Subtarget &ST = MF.getSubtarget(); - const LegalizerInfo *LI = ST.getLegalizerInfo(); - const Function &F = MF.getFunction(); bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); @@ -147,31 +156,52 @@ bool X86PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { &getAnalysis().get(MF); MachineDominatorTree *MDT = &getAnalysis().getDomTree(); - CombinerInfo CInfo(/*AllowIllegalOps=*/true, /*ShouldLegalizeIllegal=*/false, - /*LegalizerInfo=*/LI, EnableOpt, F.hasOptSize(), - F.hasMinSize()); - - // This is the first Combiner, so the input IR might contain dead - // instructions. 
- CInfo.EnableFullDCE = true; - X86PreLegalizerCombinerImpl Impl(MF, CInfo, *VT, CSEInfo, RuleConfig, ST, MDT, - LI); + CombinerInfo CInfo = createCombinerInfo(EnableOpt, F); + X86PreLegalizerCombinerImpl Impl(MF, CInfo, *VT, CSEInfo, RuleConfig, MDT); return Impl.combineMachineInstrs(); } -char X86PreLegalizerCombiner::ID = 0; -INITIALIZE_PASS_BEGIN(X86PreLegalizerCombiner, DEBUG_TYPE, +char X86PreLegalizerCombinerLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(X86PreLegalizerCombinerLegacy, DEBUG_TYPE, "Combine X86 machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) -INITIALIZE_PASS_END(X86PreLegalizerCombiner, DEBUG_TYPE, +INITIALIZE_PASS_END(X86PreLegalizerCombinerLegacy, DEBUG_TYPE, "Combine X86 machine instrs before legalization", false, false) namespace llvm { -FunctionPass *createX86PreLegalizerCombiner() { - return new X86PreLegalizerCombiner(); + +PreservedAnalyses +X86PreLegalizerCombinerPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (MF.getProperties().hasFailedISel()) + return PreservedAnalyses::all(); + + X86PreLegalizerCombinerImplRuleConfig RuleConfig; + if (!RuleConfig.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + + auto &CSEInfo = MFAM.getResult(MF); + const Function &F = MF.getFunction(); + bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None; + GISelValueTracking &VT = MFAM.getResult(MF); + MachineDominatorTree &MDT = MFAM.getResult(MF); + CombinerInfo CInfo = createCombinerInfo(EnableOpt, F); + X86PreLegalizerCombinerImpl Impl(MF, CInfo, VT, CSEInfo.get(), RuleConfig, + &MDT); + Impl.combineMachineInstrs(); + + PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet(); + PA.preserve(); + PA.preserve(); + return PA; +} + +FunctionPass *createX86PreLegalizerCombinerLegacy() { + return new 
X86PreLegalizerCombinerLegacy(); } } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 8570115054062..f8b6a952d7232 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -411,7 +411,15 @@ class X86PostLegalizerCombinerPass }; FunctionPass *createX86PostLegalizerCombinerLegacy(); -FunctionPass *createX86PreLegalizerCombiner(); + +class X86PreLegalizerCombinerPass + : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +FunctionPass *createX86PreLegalizerCombinerLegacy(); class X86LoadValueInjectionLoadHardeningPass : public OptionalPassInfoMixin { @@ -497,7 +505,7 @@ void initializeX86SpeculativeLoadHardeningLegacyPass(PassRegistry &); void initializeX86SuppressAPXForRelocationLegacyPass(PassRegistry &); void initializeX86TileConfigLegacyPass(PassRegistry &); void initializeX86WinEHUnwindV2LegacyPass(PassRegistry &); -void initializeX86PreLegalizerCombinerPass(PassRegistry &); +void initializeX86PreLegalizerCombinerLegacyPass(PassRegistry &); void initializeX86PostLegalizerCombinerLegacyPass(PassRegistry &); namespace X86AS { diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index 18901c65f1be0..6e3846f7d40f9 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -46,6 +46,7 @@ class X86CodeGenPassBuilder void addIRPasses(PassManagerWrapper &PMW) const; void addPreISel(PassManagerWrapper &PMW) const; Error addInstSelector(PassManagerWrapper &PMW) const; + void addPreLegalizeMachineIR(PassManagerWrapper &PMW) const; void addILPOpts(PassManagerWrapper &PMW) const; void addPreRegBankSelect(PassManagerWrapper &PMW) const; void addMachineSSAOptimization(PassManagerWrapper &PMW) const; @@ -117,6 +118,11 @@ Error X86CodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const { return Error::success(); } +void 
X86CodeGenPassBuilder::addPreLegalizeMachineIR( + PassManagerWrapper &PMW) const { + addMachineFunctionPass(X86PreLegalizerCombinerPass(), PMW); +} + void X86CodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const { addMachineFunctionPass(EarlyIfConverterPass(), PMW); if (X86EnableMachineCombinerPass) { diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index 779d1b8eddfd5..45e7d0ebdbf7b 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -59,6 +59,7 @@ MACHINE_FUNCTION_PASS("x86-lvi-ret", X86LoadValueInjectionRetHardeningPass()) MACHINE_FUNCTION_PASS("x86-optimize-leas", X86OptimizeLEAsPass()) MACHINE_FUNCTION_PASS("x86-postlegalizer-combiner-pass", X86PostLegalizerCombinerPass()) MACHINE_FUNCTION_PASS("x86-pre-tile-config", X86PreTileConfigPass()) +MACHINE_FUNCTION_PASS("x86-prelegalizer-combiner-pass", X86PreLegalizerCombinerPass()) MACHINE_FUNCTION_PASS("x86-return-thunks", X86ReturnThunksPass()) MACHINE_FUNCTION_PASS("x86-seses", X86SpeculativeExecutionSideEffectSuppressionPass()) MACHINE_FUNCTION_PASS("x86-slh", X86SpeculativeLoadHardeningPass()) diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 6de566b39d045..5c8a508f37917 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -107,7 +107,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86DynAllocaExpanderLegacyPass(PR); initializeX86SuppressAPXForRelocationLegacyPass(PR); initializeX86WinEHUnwindV2LegacyPass(PR); - initializeX86PreLegalizerCombinerPass(PR); + initializeX86PreLegalizerCombinerLegacyPass(PR); initializeX86PostLegalizerCombinerLegacyPass(PR); } @@ -495,7 +495,7 @@ bool X86PassConfig::addGlobalInstructionSelect() { void X86PassConfig::addPreLegalizeMachineIR() { if (getOptLevel() != CodeGenOptLevel::None) { - addPass(createX86PreLegalizerCombiner()); + 
addPass(createX86PreLegalizerCombinerLegacy()); } } diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-div.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-div.mir index 717f0daaddcd6..dc9c458cf2b02 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-div.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-div.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: sdiv_0 diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-identity.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-identity.mir index ebc9d92301c57..d41198950fc10 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-identity.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-identity.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: right_ident_sub diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-lshr.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-lshr.mir index 4db7750258801..2ded48ba58d01 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-lshr.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-lshr.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: lshr_of_vec_zero diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-mul.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-mul.mir index 
fce8b011a31eb..ed13149ea086a 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-mul.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-mul.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: mul_0 diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-or.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-or.mir index 9b34333dd3c22..29433a21cbff3 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-or.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-or.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: i128_or_cst diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-ptr-add.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-ptr-add.mir index 4c3c5d2e1e7fd..2a8612ce3ee5b 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-ptr-add.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-ptr-add.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: ptradd_of_vec_zero diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-rem.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-rem.mir index 500223080f899..2b98c5792e07b 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-rem.mir +++ 
b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-rem.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: srem_0 diff --git a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-sub.mir b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-sub.mir index 05215130a30f2..26965e3ecad73 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-sub.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/prelegalizer-combiner-sub.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple x86_64 -run-pass=x86-prelegalizer-combiner %s -o - | FileCheck %s +# RUN: llc -mtriple x86_64 -passes=x86-prelegalizer-combiner-pass %s -o - | FileCheck %s --- name: dont_fold_sub From 96313bcfe874afacae9d7e5e73c16705e0a02ddf Mon Sep 17 00:00:00 2001 From: jofrn <165626406+jofrn@users.noreply.github.com> Date: Mon, 11 May 2026 02:39:32 -0700 Subject: [PATCH 238/538] [X86] Cast atomic vectors in IR to support floats (#148899) This commit casts floats to ints in an atomic load during AtomicExpand to support floating point types. It also is required to support 128 bit vectors in SSE/AVX. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 + llvm/lib/Target/X86/X86ISelLowering.h | 2 + llvm/lib/Target/X86/X86InstrCompiler.td | 15 + llvm/test/CodeGen/X86/atomic-load-store.ll | 386 ++++++--------------- 4 files changed, 122 insertions(+), 288 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f4529ddf4983d..3be7d35a08b6d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32956,6 +32956,13 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const { } } +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const { + if (LI->getType()->getScalarType()->isFloatingPointTy()) + return AtomicExpansionKind::CastToInteger; + return AtomicExpansionKind::None; +} + LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 0d05c5772a707..9a958525057b6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -892,6 +892,8 @@ namespace llvm { shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandLogicAtomicRMWInIR(const AtomicRMWInst *AI) const; + TargetLoweringBase::AtomicExpansionKind + shouldCastAtomicLoadInIR(LoadInst *LI) const override; void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 48291cdf91f72..6ab6f870f1bb8 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1227,6 +1227,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))), def : 
Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))), (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>; +// load atomic <2 x i64> +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>; +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>; +// load atomic <4 x i32> +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>; +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>; + // Floating point loads/stores. def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 407a29e162b41..00310f6d1f219 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat: @@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-SSE-O0-NEXT: movw (%rdi), %cx ; CHECK-SSE-O0-NEXT: # implicit-def: $eax ; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; 
CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat: @@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-AVX-O0-NEXT: movw (%rdi), %cx ; CHECK-AVX-O0-NEXT: # implicit-def: $eax ; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0 -; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x bfloat>, ptr %x acquire, align 2 ret <1 x bfloat> %ret @@ -298,11 +296,7 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { define <2 x half> @atomic_vec2_half(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_half: ; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE-O3-NEXT: shrl $16, %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec2_half: @@ -312,20 +306,7 @@ define <2 x half> @atomic_vec2_half(ptr %x) { ; ; CHECK-SSE-O0-LABEL: atomic_vec2_half: ; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movl (%rdi), %eax -; CHECK-SSE-O0-NEXT: movl %eax, %ecx -; CHECK-SSE-O0-NEXT: shrl $16, %ecx -; CHECK-SSE-O0-NEXT: movw %cx, %dx -; CHECK-SSE-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE-O0-NEXT: movw %dx, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE-O0-NEXT: movw %ax, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec2_half: @@ -338,50 +319,22 @@ define <2 x half> @atomic_vec2_half(ptr %x) { define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat: ; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movl (%rdi), %eax -; CHECK-SSE-O3-NEXT: movl %eax, %ecx -; CHECK-SSE-O3-NEXT: shrl $16, %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm0 -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movl (%rdi), %eax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: shrl $16, %eax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat: ; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movl (%rdi), %eax -; CHECK-SSE-O0-NEXT: movl %eax, %ecx -; CHECK-SSE-O0-NEXT: shrl $16, %ecx -; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-SSE-O0-NEXT: movw %ax, %dx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %dx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat: ; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movl 
(%rdi), %eax -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: shrl $16, %eax -; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4 ret <2 x bfloat> %ret @@ -418,13 +371,13 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec1_half: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec1_half: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec1_half: @@ -432,8 +385,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-SSE-O0-NEXT: movw (%rdi), %cx ; CHECK-SSE-O0-NEXT: # implicit-def: $eax ; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec1_half: @@ -441,8 +393,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-AVX-O0-NEXT: movw (%rdi), %cx ; CHECK-AVX-O0-NEXT: # implicit-def: $eax ; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0 -; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x half>, ptr %x acquire, align 2 ret <1 x half> %ret @@ -677,110 +628,20 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind { } define <4 x half> @atomic_vec4_half(ptr %x) nounwind { -; CHECK-SSE2-O3-LABEL: atomic_vec4_half: 
-; CHECK-SSE2-O3: # %bb.0: -; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O3-NEXT: movl %eax, %ecx -; CHECK-SSE2-O3-NEXT: shrl $16, %ecx -; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O3-NEXT: movq %rax, %rcx -; CHECK-SSE2-O3-NEXT: shrq $32, %rcx -; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE2-O3-NEXT: shrq $48, %rax -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-O3-NEXT: retq -; -; CHECK-SSE4-O3-LABEL: atomic_vec4_half: -; CHECK-SSE4-O3: # %bb.0: -; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O3-NEXT: movl %eax, %ecx -; CHECK-SSE4-O3-NEXT: shrl $16, %ecx -; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O3-NEXT: movq %rax, %rcx -; CHECK-SSE4-O3-NEXT: shrq $32, %rcx -; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE4-O3-NEXT: shrq $48, %rax -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero -; CHECK-SSE4-O3-NEXT: retq +; CHECK-SSE-O3-LABEL: atomic_vec4_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec4_half: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0 ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-SSE2-O0-LABEL: atomic_vec4_half: -; CHECK-SSE2-O0: # %bb.0: -; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O0-NEXT: 
movl %eax, %ecx -; CHECK-SSE2-O0-NEXT: shrl $16, %ecx -; CHECK-SSE2-O0-NEXT: movw %cx, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %dx, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE2-O0-NEXT: movw %ax, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %dx, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm0 -; CHECK-SSE2-O0-NEXT: movq %rax, %rcx -; CHECK-SSE2-O0-NEXT: shrq $32, %rcx -; CHECK-SSE2-O0-NEXT: movw %cx, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %dx, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE2-O0-NEXT: shrq $48, %rax -; CHECK-SSE2-O0-NEXT: movw %ax, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %cx, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm3 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-O0-NEXT: retq -; -; CHECK-SSE4-O0-LABEL: atomic_vec4_half: -; CHECK-SSE4-O0: # %bb.0: -; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O0-NEXT: movl %eax, %ecx -; CHECK-SSE4-O0-NEXT: shrl $16, %ecx -; CHECK-SSE4-O0-NEXT: movw %cx, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE4-O0-NEXT: movw %ax, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm0 -; CHECK-SSE4-O0-NEXT: movq %rax, %rcx -; 
CHECK-SSE4-O0-NEXT: shrq $32, %rcx -; CHECK-SSE4-O0-NEXT: movw %cx, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE4-O0-NEXT: shrq $48, %rax -; CHECK-SSE4-O0-NEXT: movw %ax, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %cx, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm3 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-SSE4-O0-NEXT: retq +; CHECK-SSE-O0-LABEL: atomic_vec4_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec4_half: ; CHECK-AVX-O0: # %bb.0: @@ -790,141 +651,24 @@ define <4 x half> @atomic_vec4_half(ptr %x) nounwind { ret <4 x half> %ret } define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind { -; CHECK-SSE2-O3-LABEL: atomic_vec4_bfloat: -; CHECK-SSE2-O3: # %bb.0: -; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O3-NEXT: movl %eax, %ecx -; CHECK-SSE2-O3-NEXT: shrl $16, %ecx -; CHECK-SSE2-O3-NEXT: movq %rax, %rdx -; CHECK-SSE2-O3-NEXT: shrq $32, %rdx -; CHECK-SSE2-O3-NEXT: movl %eax, %esi -; CHECK-SSE2-O3-NEXT: shrq $48, %rax -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %edx, %xmm2 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %esi, %xmm0 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm3 -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-O3-NEXT: retq -; -; CHECK-SSE4-O3-LABEL: atomic_vec4_bfloat: -; CHECK-SSE4-O3: # %bb.0: -; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O3-NEXT: movl %eax, %ecx -; CHECK-SSE4-O3-NEXT: shrl $16, %ecx -; CHECK-SSE4-O3-NEXT: movq %rax, %rdx -; CHECK-SSE4-O3-NEXT: shrq $32, %rdx -; CHECK-SSE4-O3-NEXT: movl %eax, %esi -; CHECK-SSE4-O3-NEXT: shrq $48, %rax -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %edx, %xmm2 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %esi, %xmm0 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm3 -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero -; CHECK-SSE4-O3-NEXT: retq +; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movq (%rdi), %rax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: movq %rax, %rcx -; CHECK-AVX-O3-NEXT: shrq $48, %rcx -; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: movq %rax, %rcx -; CHECK-AVX-O3-NEXT: shrq $32, %rcx -; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: shrl $16, %eax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0 ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-SSE2-O0-LABEL: atomic_vec4_bfloat: -; CHECK-SSE2-O0: # %bb.0: -; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O0-NEXT: movl %eax, %ecx -; CHECK-SSE2-O0-NEXT: shrl $16, %ecx -; CHECK-SSE2-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-SSE2-O0-NEXT: movw %ax, %dx -; CHECK-SSE2-O0-NEXT: movq %rax, 
%rsi -; CHECK-SSE2-O0-NEXT: shrq $32, %rsi -; CHECK-SSE2-O0-NEXT: # kill: def $si killed $si killed $rsi -; CHECK-SSE2-O0-NEXT: shrq $48, %rax -; CHECK-SSE2-O0-NEXT: movw %ax, %di -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %di, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %si, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %dx, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %cx, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm2 -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-O0-NEXT: retq -; -; CHECK-SSE4-O0-LABEL: atomic_vec4_bfloat: -; CHECK-SSE4-O0: # %bb.0: -; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O0-NEXT: movl %eax, %ecx -; CHECK-SSE4-O0-NEXT: shrl $16, %ecx -; CHECK-SSE4-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-SSE4-O0-NEXT: movw %ax, %dx -; CHECK-SSE4-O0-NEXT: movq %rax, %rsi -; CHECK-SSE4-O0-NEXT: shrq $32, %rsi -; CHECK-SSE4-O0-NEXT: # kill: def $si killed $si killed $rsi -; CHECK-SSE4-O0-NEXT: shrq $48, %rax -; CHECK-SSE4-O0-NEXT: movw %ax, %di -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %di, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %si, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: 
$xmm1 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %dx, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %cx, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm2 -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-SSE4-O0-NEXT: retq +; CHECK-SSE-O0-LABEL: atomic_vec4_bfloat: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec4_bfloat: ; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movq (%rdi), %rax -; CHECK-AVX-O0-NEXT: movq %rax, %rcx -; CHECK-AVX-O0-NEXT: shrq $48, %rcx -; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: movq %rax, %rcx -; CHECK-AVX-O0-NEXT: shrq $32, %rcx -; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-AVX-O0-NEXT: shrl $16, %eax -; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0 ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8 ret <4 x bfloat> %ret @@ -982,6 +726,72 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind { ret <4 x float> %ret } +define <4 x float> @atomic_vec4_float_align(ptr %x) 
nounwind { +; +; CHECK-SSE2-O3-LABEL: atomic_vec4_float_align: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movl $2, %esi +; CHECK-SSE2-O3-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O3-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_float_align: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: pushq %rbx +; CHECK-SSE4-O3-NEXT: xorl %eax, %eax +; CHECK-SSE4-O3-NEXT: xorl %edx, %edx +; CHECK-SSE4-O3-NEXT: xorl %ecx, %ecx +; CHECK-SSE4-O3-NEXT: xorl %ebx, %ebx +; CHECK-SSE4-O3-NEXT: lock cmpxchg16b (%rdi) +; CHECK-SSE4-O3-NEXT: movq %rdx, %xmm1 +; CHECK-SSE4-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE4-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE4-O3-NEXT: popq %rbx +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_float_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_float_align: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: movl $2, %esi +; CHECK-SSE2-O0-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_float_align: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: pushq %rbx +; CHECK-SSE4-O0-NEXT: xorl %eax, %eax +; CHECK-SSE4-O0-NEXT: movl %eax, %ebx +; CHECK-SSE4-O0-NEXT: movq %rbx, %rax +; CHECK-SSE4-O0-NEXT: movq %rbx, %rdx +; CHECK-SSE4-O0-NEXT: movq %rbx, %rcx +; CHECK-SSE4-O0-NEXT: lock cmpxchg16b (%rdi) +; CHECK-SSE4-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE4-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE4-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; 
CHECK-SSE4-O0-NEXT: popq %rbx +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_float_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x float>, ptr %x acquire, align 16 + ret <4 x float> %ret +} + define <8 x double> @atomic_vec8_double(ptr %x) nounwind { ; CHECK-SSE-O3-LABEL: atomic_vec8_double: ; CHECK-SSE-O3: # %bb.0: From 7aef15d44bf13fad3ae95546fee9b2ac15da8aa3 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Mon, 11 May 2026 11:42:37 +0200 Subject: [PATCH 239/538] [AMDGPU] Add VMovB64 subtarget feature (#196340) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++++++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 -- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++---- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index f12c404c035ae..c0647a5ce443e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -483,6 +483,10 @@ defm SBarrierLeaveImm : AMDGPUSubtargetFeature<"s-barrier-leave-imm", "s_barrier_leave takes an immediate operand" >; +defm VMovB64Inst : AMDGPUSubtargetFeature<"v-mov-b64-inst", + "Has v_mov_b64 instruction" +>; + defm GFX950Insts : AMDGPUSubtargetFeature<"gfx950-insts", "Additional instructions for GFX950+", /*GenPredicate=*/1, @@ -1763,6 +1767,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureLshlAddU64Inst, + FeatureVMovB64Inst, ]>; def FeatureISAVersion9_5_Common : FeatureSet< @@ -2120,6 +2125,7 @@ def FeatureISAVersion12_50_Common : FeatureSet< FeatureMcastLoadInsts, FeatureNoF16PseudoScalarTransInlineConstants, FeatureRealTrue16Insts, + FeatureVMovB64Inst, ]>; def FeatureISAVersion12_50 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 34e85ed2aa170..f0918a86be757 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5557,7 +5557,7 @@ bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const { auto &ST = DAG.getSubtarget(); const auto *TII = ST.getInstrInfo(); - if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant)) + if (!ST.hasVMovB64Inst() || (!SDConstant && !SDFPConstant)) return false; if (ST.has64BitLiterals()) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ec7dd92d6b10e..2e2797269fbcf 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -507,8 +507,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasMadF16() const; - bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; } - // Scalar and global loads support scale_offset bit. bool hasScaleOffset() const { return HasGFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3efae655b311e..451b5a4d3da6d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1018,7 +1018,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { - if (ST.hasMovB64()) { + if (ST.hasVMovB64Inst()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -1067,7 +1067,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, (RI.isProperlyAlignedRC(*RC) && (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. 
- if (ST.hasMovB64()) { + if (ST.hasVMovB64Inst()) { Opcode = AMDGPU::V_MOV_B64_e32; EltSize = 8; } else if (ST.hasPkMovB32()) { @@ -2159,7 +2159,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? assert(!SrcOp.isFPImm()); - if (ST.hasMovB64() && Mov64RC->contains(Dst)) { + if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) { MI.setDesc(Mov64Desc); if (SrcOp.isReg() || isInlineConstant(MI, 1) || isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals()) @@ -2703,7 +2703,7 @@ std::pair SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); - if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) && + if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) && AMDGPU::isLegalDPALU_DPPControl( ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); From 5e7ce91c68990bee67ddcb68969c2ae901e72a87 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Mon, 11 May 2026 11:43:19 +0200 Subject: [PATCH 240/538] [mlir][SPIR-V] Add CL.{exp2,exp10,log2,log10} ops (#196869) --- .../mlir/Dialect/SPIRV/IR/SPIRVCLOps.td | 84 ++++++++ mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir | 200 ++++++++++++++++++ mlir/test/Target/SPIRV/ocl-ops.mlir | 8 + 3 files changed, 292 insertions(+) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td index 71f9c9579db81..37989e6e7e54a 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td @@ -386,6 +386,48 @@ def SPIRV_CLExpOp : SPIRV_CLUnaryArithmeticOp<"exp", 19, SPIRV_Float> { // ----- +def SPIRV_CLExp2Op : SPIRV_CLUnaryArithmeticOp<"exp2", 20, SPIRV_Float> { + let summary = "Compute the base-2 exponential of x."; + + let description = [{ + Result Type and x must be floating-point or 
vector(2,3,4,8,16) of + floating-point values. + + All of the operands, including the Result Type operand, + must be of the same type. + + #### Example: + + ```mlir + %2 = spirv.CL.exp2 %0 : f32 + %3 = spirv.CL.exp2 %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPIRV_CLExp10Op : SPIRV_CLUnaryArithmeticOp<"exp10", 21, SPIRV_Float> { + let summary = "Compute the base-10 exponential of x."; + + let description = [{ + Result Type and x must be floating-point or vector(2,3,4,8,16) of + floating-point values. + + All of the operands, including the Result Type operand, + must be of the same type. + + #### Example: + + ```mlir + %2 = spirv.CL.exp10 %0 : f32 + %3 = spirv.CL.exp10 %1 : vector<3xf16> + ``` + }]; +} + +// ----- + def SPIRV_CLFAbsOp : SPIRV_CLUnaryArithmeticOp<"fabs", 23, SPIRV_Float> { let summary = "Absolute value of operand"; @@ -568,6 +610,48 @@ def SPIRV_CLLogOp : SPIRV_CLUnaryArithmeticOp<"log", 37, SPIRV_Float> { // ----- +def SPIRV_CLLog2Op : SPIRV_CLUnaryArithmeticOp<"log2", 38, SPIRV_Float> { + let summary = "Compute the base-2 logarithm of x."; + + let description = [{ + Result Type and x must be floating-point or vector(2,3,4,8,16) of + floating-point values. + + All of the operands, including the Result Type operand, must be of the + same type. + + #### Example: + + ```mlir + %2 = spirv.CL.log2 %0 : f32 + %3 = spirv.CL.log2 %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPIRV_CLLog10Op : SPIRV_CLUnaryArithmeticOp<"log10", 39, SPIRV_Float> { + let summary = "Compute the base-10 logarithm of x."; + + let description = [{ + Result Type and x must be floating-point or vector(2,3,4,8,16) of + floating-point values. + + All of the operands, including the Result Type operand, must be of the + same type. 
+ + #### Example: + + ```mlir + %2 = spirv.CL.log10 %0 : f32 + %3 = spirv.CL.log10 %1 : vector<3xf16> + ``` + }]; +} + +// ----- + def SPIRV_CLMixOp : SPIRV_CLTernaryArithmeticOp<"mix", 99, SPIRV_Float> { let summary = "Returns the linear blend of x & y implemented as: x + (y - x) * a"; diff --git a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir index d6d3c53f23356..68751211c5fa6 100644 --- a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir @@ -50,6 +50,206 @@ func.func @exp(%arg0 : i32) -> () { // ----- +//===----------------------------------------------------------------------===// +// spirv.CL.exp2 +//===----------------------------------------------------------------------===// + +func.func @exp2(%arg0 : f32) -> () { + // CHECK: spirv.CL.exp2 {{%.*}} : f32 + %2 = spirv.CL.exp2 %arg0 : f32 + return +} + +func.func @exp2vec(%arg0 : vector<3xf16>) -> () { + // CHECK: spirv.CL.exp2 {{%.*}} : vector<3xf16> + %2 = spirv.CL.exp2 %arg0 : vector<3xf16> + return +} + +// ----- + +func.func @exp2(%arg0 : i32) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values}} + %2 = spirv.CL.exp2 %arg0 : i32 + return +} + +// ----- + +func.func @exp2(%arg0 : vector<5xf32>) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values of length 2/3/4}} + %2 = spirv.CL.exp2 %arg0 : vector<5xf32> + return +} + +// ----- + +func.func @exp2(%arg0 : f32, %arg1 : f32) -> () { + // expected-error @+1 {{expected ':'}} + %2 = spirv.CL.exp2 %arg0, %arg1 : i32 + return +} + +// ----- + +func.func @exp2(%arg0 : i32) -> () { + // expected-error @+1 {{expected non-function type}} + %2 = spirv.CL.exp2 %arg0 : + return +} + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.CL.exp10 
+//===----------------------------------------------------------------------===// + +func.func @exp10(%arg0 : f32) -> () { + // CHECK: spirv.CL.exp10 {{%.*}} : f32 + %2 = spirv.CL.exp10 %arg0 : f32 + return +} + +func.func @exp10vec(%arg0 : vector<3xf16>) -> () { + // CHECK: spirv.CL.exp10 {{%.*}} : vector<3xf16> + %2 = spirv.CL.exp10 %arg0 : vector<3xf16> + return +} + +// ----- + +func.func @exp10(%arg0 : i32) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values}} + %2 = spirv.CL.exp10 %arg0 : i32 + return +} + +// ----- + +func.func @exp10(%arg0 : vector<5xf32>) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values of length 2/3/4}} + %2 = spirv.CL.exp10 %arg0 : vector<5xf32> + return +} + +// ----- + +func.func @exp10(%arg0 : f32, %arg1 : f32) -> () { + // expected-error @+1 {{expected ':'}} + %2 = spirv.CL.exp10 %arg0, %arg1 : i32 + return +} + +// ----- + +func.func @exp10(%arg0 : i32) -> () { + // expected-error @+1 {{expected non-function type}} + %2 = spirv.CL.exp10 %arg0 : + return +} + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.CL.log2 +//===----------------------------------------------------------------------===// + +func.func @log2(%arg0 : f32) -> () { + // CHECK: spirv.CL.log2 {{%.*}} : f32 + %2 = spirv.CL.log2 %arg0 : f32 + return +} + +func.func @log2vec(%arg0 : vector<3xf16>) -> () { + // CHECK: spirv.CL.log2 {{%.*}} : vector<3xf16> + %2 = spirv.CL.log2 %arg0 : vector<3xf16> + return +} + +// ----- + +func.func @log2(%arg0 : i32) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values}} + %2 = spirv.CL.log2 %arg0 : i32 + return +} + +// ----- + +func.func @log2(%arg0 : vector<5xf32>) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length 
vector of 16/32/64-bit float values of length 2/3/4}} + %2 = spirv.CL.log2 %arg0 : vector<5xf32> + return +} + +// ----- + +func.func @log2(%arg0 : f32, %arg1 : f32) -> () { + // expected-error @+1 {{expected ':'}} + %2 = spirv.CL.log2 %arg0, %arg1 : i32 + return +} + +// ----- + +func.func @log2(%arg0 : i32) -> () { + // expected-error @+1 {{expected non-function type}} + %2 = spirv.CL.log2 %arg0 : + return +} + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.CL.log10 +//===----------------------------------------------------------------------===// + +func.func @log10(%arg0 : f32) -> () { + // CHECK: spirv.CL.log10 {{%.*}} : f32 + %2 = spirv.CL.log10 %arg0 : f32 + return +} + +func.func @log10vec(%arg0 : vector<3xf16>) -> () { + // CHECK: spirv.CL.log10 {{%.*}} : vector<3xf16> + %2 = spirv.CL.log10 %arg0 : vector<3xf16> + return +} + +// ----- + +func.func @log10(%arg0 : i32) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values}} + %2 = spirv.CL.log10 %arg0 : i32 + return +} + +// ----- + +func.func @log10(%arg0 : vector<5xf32>) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or fixed-length vector of 16/32/64-bit float values of length 2/3/4}} + %2 = spirv.CL.log10 %arg0 : vector<5xf32> + return +} + +// ----- + +func.func @log10(%arg0 : f32, %arg1 : f32) -> () { + // expected-error @+1 {{expected ':'}} + %2 = spirv.CL.log10 %arg0, %arg1 : i32 + return +} + +// ----- + +func.func @log10(%arg0 : i32) -> () { + // expected-error @+1 {{expected non-function type}} + %2 = spirv.CL.log10 %arg0 : + return +} + +// ----- + //===----------------------------------------------------------------------===// // spirv.CL.fabs //===----------------------------------------------------------------------===// diff --git a/mlir/test/Target/SPIRV/ocl-ops.mlir b/mlir/test/Target/SPIRV/ocl-ops.mlir index 
7a4abbd9dd344..e43223e65db5c 100644 --- a/mlir/test/Target/SPIRV/ocl-ops.mlir +++ b/mlir/test/Target/SPIRV/ocl-ops.mlir @@ -17,6 +17,14 @@ spirv.module Physical64 OpenCL requires #spirv.vce Date: Mon, 11 May 2026 10:45:28 +0100 Subject: [PATCH 241/538] [Clang] Fix incorrect type for `__mfp8` in `extractelement` codegen (#192977) The codegen for extracting an element from an FP8 vector was emitting a simple `extractelement` with `i8` type for the extracted element. The `__mfp8` type is represented as `<1 x i8>` in LLVM IR. This codegen created inconsistency in Clang - some `__mfp8` expressions would correspond to LLVM IR values with `<1 x i8>` type and some to `i8` type. It also caused an assertion failure when the extracted element was passed as a function argument. This patch fixes the issue by inserting the extracted element into a `<1 x i8>`. --- clang/lib/CodeGen/CGExprScalar.cpp | 10 ++++- clang/test/CodeGen/AArch64/fp8-extract.c | 52 ++++++++++++++++++++++++ clang/test/CodeGen/arm-mfp8.c | 12 ++---- 3 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-extract.c diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index d60f1b37be50e..c8a8ec7b6d928 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2213,7 +2213,15 @@ Value *ScalarExprEmitter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) { if (CGF.SanOpts.has(SanitizerKind::ArrayBounds)) CGF.EmitBoundsCheck(E, E->getBase(), Idx, IdxTy, /*Accessed*/true); - return Builder.CreateExtractElement(Base, Idx, "vecext"); + Value *Ret = Builder.CreateExtractElement(Base, Idx, "vecext"); + + // Even being a scalar the `__mfp8` type corresponds to `<1 x i8>` in LLVM IR. 
+ if (E->getType()->isMFloat8Type()) + Ret = Builder.CreateInsertElement( + llvm::PoisonValue::get(llvm::FixedVectorType::get(CGF.Int8Ty, 1)), Ret, + uint64_t(0), "mfp8ext"); + + return Ret; } Value *ScalarExprEmitter::VisitMatrixSingleSubscriptExpr( diff --git a/clang/test/CodeGen/AArch64/fp8-extract.c b/clang/test/CodeGen/AArch64/fp8-extract.c new file mode 100644 index 0000000000000..7fa93c6f15305 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-extract.c @@ -0,0 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine | FileCheck %s -check-prefix CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -S -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +// Test for a n internal compiler error when extracting an element from an FP8 +// vector and passing it to a function. 
+ +// CHECK-LABEL: define dso_local void @test_var( +// CHECK-SAME: <16 x i8> [[V:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[V]], i32 [[I]] +// CHECK-NEXT: [[MFP8CAST:%.*]] = insertelement <1 x i8> poison, i8 [[VECEXT]], i64 0 +// CHECK-NEXT: call void @g(<1 x i8> [[MFP8CAST]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z8test_var14__Mfloat8x16_ti( +// CHECK-CXX-SAME: <16 x i8> [[V:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[V]], i32 [[I]] +// CHECK-CXX-NEXT: [[MFP8CAST:%.*]] = insertelement <1 x i8> poison, i8 [[VECEXT]], i64 0 +// CHECK-CXX-NEXT: call void @_Z1gu6__mfp8(<1 x i8> [[MFP8CAST]]) #[[ATTR2:[0-9]+]] +// CHECK-CXX-NEXT: ret void +// +void test_var(__Mfloat8x16_t v, int i) { + void g(__mfp8); + g(v[i]); +} + +// CHECK-LABEL: define dso_local void @test_cst( +// CHECK-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MFP8CAST:%.*]] = shufflevector <16 x i8> [[V]], <16 x i8> poison, <1 x i32> +// CHECK-NEXT: call void @g(<1 x i8> [[MFP8CAST]]) #[[ATTR2]] +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z8test_cst14__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[MFP8CAST:%.*]] = shufflevector <16 x i8> [[V]], <16 x i8> poison, <1 x i32> +// CHECK-CXX-NEXT: call void @_Z1gu6__mfp8(<1 x i8> [[MFP8CAST]]) #[[ATTR2]] +// CHECK-CXX-NEXT: ret void +// +void test_cst(__Mfloat8x16_t v) { + void g(__mfp8); + g(v[3]); +} diff --git a/clang/test/CodeGen/arm-mfp8.c b/clang/test/CodeGen/arm-mfp8.c index 9385b537f18b3..82df375a45686 100644 --- a/clang/test/CodeGen/arm-mfp8.c +++ b/clang/test/CodeGen/arm-mfp8.c @@ -64,20 +64,16 @@ __mfp8 func1n(__mfp8 mfp8) { // CHECK-C-LABEL: define 
dso_local <1 x i8> @test_extract_element( // CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 // CHECK-C-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]] -// CHECK-C-NEXT: store i8 [[VECEXT]], ptr [[RETVAL]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 -// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] +// CHECK-C-NEXT: [[MFP8CAST:%.*]] = insertelement <1 x i8> poison, i8 [[VECEXT]], i64 0 +// CHECK-C-NEXT: ret <1 x i8> [[MFP8CAST]] // // CHECK-CXX-LABEL: define dso_local <1 x i8> @_Z20test_extract_element14__Mfloat8x16_ti( // CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 // CHECK-CXX-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]] -// CHECK-CXX-NEXT: store i8 [[VECEXT]], ptr [[RETVAL]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 -// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[MFP8CAST:%.*]] = insertelement <1 x i8> poison, i8 [[VECEXT]], i64 0 +// CHECK-CXX-NEXT: ret <1 x i8> [[MFP8CAST]] // mfloat8_t test_extract_element(mfloat8x16_t x, int i) { return x[i]; From 3f091af9873f298d348541db77ccc159bda6884e Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Mon, 11 May 2026 10:57:01 +0100 Subject: [PATCH 242/538] [mlir][tosa] Add a pass to downgrade TOSA `1.1.draft` to `1.0` (#194971) This commit adds a pass that will allow 1.1.draft operations to be rewritten to their 1.0 counterparts where possible. The pass currently covers the following operations: - bool <-> fp32 casts via i8 bridge casts - bool gather/scatter with i32 indices via i8 payload rewrites Note that the downgrade is 'best-effort' and the pass does not perform any validation itself. 
The validation pass should be run after downgrading to check that the resulting IR was downgraded successfully. Motivation: This decouples the target specification version in legalizations and backends. Legalizations from higher level frameworks may be updated to support producing TOSA 1.1.draft variants of operations, while backends can still consume TOSA 1.0 IR after running the downgrade pass. --- .../mlir/Dialect/Tosa/Transforms/Passes.td | 11 ++ .../Dialect/Tosa/Transforms/CMakeLists.txt | 1 + .../Tosa/Transforms/TosaDowngrade1p1To1p0.cpp | 162 ++++++++++++++++++ .../Tosa/tosa-downgrade-1-1-to-1-0.mlir | 118 +++++++++++++ 4 files changed, 292 insertions(+) create mode 100644 mlir/lib/Dialect/Tosa/Transforms/TosaDowngrade1p1To1p0.cpp create mode 100644 mlir/test/Dialect/Tosa/tosa-downgrade-1-1-to-1-0.mlir diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td index 5979ce4962e55..005cbfab782df 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td @@ -185,6 +185,17 @@ def TosaAttachTarget : Pass<"tosa-attach-target", "ModuleOp"> { ]; } +def TosaDowngrade1p1To1p0Pass + : Pass<"tosa-downgrade-1-1-to-1-0", "func::FuncOp"> { + let summary = "Downgrade TOSA 1.1 specification constructs to TOSA 1.0"; + let description = [{ + Rewrites constructs which are only compatible in TOSA specification 1.1 and + above to their TOSA 1.0 counterparts where possible. Downgrading is best-effort + and validation should be performed afterwards to ensure compatibility with + the TOSA 1.0 specification. 
+ }]; +} + def TosaNarrowI64ToI32Pass : Pass<"tosa-narrow-i64-to-i32", "func::FuncOp"> { let summary = "Narrow I64 TOSA operations to I32"; let description = [{ diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt index e8a76fa3a1d21..1fd18bb5a395b 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ add_mlir_dialect_library(MLIRTosaTransforms TosaAttachTarget.cpp TosaArithConstantToConst.cpp TosaConvertIntegerTypeToSignless.cpp + TosaDowngrade1p1To1p0.cpp TosaDecomposeTransposeConv.cpp TosaDecomposeDepthwise.cpp TosaFolders.cpp diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDowngrade1p1To1p0.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDowngrade1p1To1p0.cpp new file mode 100644 index 0000000000000..cfd2dd9d29650 --- /dev/null +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDowngrade1p1To1p0.cpp @@ -0,0 +1,162 @@ +//===- TosaDowngrade1_1To1_0.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Rewrites constructs which are only compatible in TOSA specification 1.1 and +// above to their TOSA 1.0 counterparts where possible. Downgrading is +// best-effort and validation should be performed afterwards to ensure +// compatibility with the TOSA 1.0 specification. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tosa/Transforms/Passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +namespace tosa { +#define GEN_PASS_DEF_TOSADOWNGRADE1P1TO1P0PASS +#include "mlir/Dialect/Tosa/Transforms/Passes.h.inc" +} // namespace tosa +} // namespace mlir + +using namespace mlir; +using namespace mlir::tosa; + +namespace { + +class BoolFp32CastRewrite : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::CastOp op, + PatternRewriter &rewriter) const override { + const Value input = op.getInput(); + + const Type i1Type = rewriter.getI1Type(); + const Type f32Type = rewriter.getF32Type(); + + const Type inputElemType = getElementTypeOrSelf(input.getType()); + const Type outputElemType = getElementTypeOrSelf(op.getType()); + const bool isFp32ToBool = + inputElemType == f32Type && outputElemType == i1Type; + const bool isBoolToFp32 = + inputElemType == i1Type && outputElemType == f32Type; + + if (!isFp32ToBool && !isBoolToFp32) + return rewriter.notifyMatchFailure(op, + "expected cast between bool and f32"); + + const Type outputType = op.getType(); + const Type i8Type = rewriter.getI8Type(); + const Type intermediateType = cast(outputType).clone(i8Type); + + auto inner = + tosa::CastOp::create(rewriter, op.getLoc(), intermediateType, input); + auto outer = tosa::CastOp::create(rewriter, op.getLoc(), outputType, + inner.getOutput()); + rewriter.replaceOp(op, outer.getOutput()); + return success(); + } +}; + +class BoolGatherRewrite : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::GatherOp op, + PatternRewriter &rewriter) const override { + const Value values = op.getValues(); + const Value indices = op.getIndices(); + + const Type 
valuesType = values.getType(); + const Type resultType = op.getType(); + + const Type i1Type = rewriter.getI1Type(); + const Type i32Type = rewriter.getI32Type(); + if (getElementTypeOrSelf(valuesType) != i1Type || + getElementTypeOrSelf(indices.getType()) != i32Type) + return rewriter.notifyMatchFailure( + op, "expected values of bool type and indices of i32 type"); + + const Type i8Type = rewriter.getI8Type(); + const Type valuesI8Type = cast(valuesType).clone(i8Type); + const Type resultI8Type = cast(resultType).clone(i8Type); + + auto valuesToI8 = + tosa::CastOp::create(rewriter, op.getLoc(), valuesI8Type, values); + auto gatherI8 = tosa::GatherOp::create(rewriter, op.getLoc(), resultI8Type, + valuesToI8.getOutput(), indices); + auto i8ToBool = tosa::CastOp::create(rewriter, op.getLoc(), resultType, + gatherI8.getOutput()); + rewriter.replaceOp(op, i8ToBool.getOutput()); + return success(); + } +}; + +class BoolScatterRewrite : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::ScatterOp op, + PatternRewriter &rewriter) const override { + const Value valuesIn = op.getValuesIn(); + const Value indices = op.getIndices(); + + const Type valuesInType = valuesIn.getType(); + const Type i1Type = rewriter.getI1Type(); + const Type i32Type = rewriter.getI32Type(); + if (getElementTypeOrSelf(valuesInType) != i1Type || + getElementTypeOrSelf(indices.getType()) != i32Type) + return rewriter.notifyMatchFailure( + op, "expected values of bool type and indices of i32 type"); + + const Value input = op.getInput(); + const Type inputType = input.getType(); + const Type resultType = op.getType(); + + const Type i8Type = rewriter.getI8Type(); + const Type valuesInI8Type = cast(valuesInType).clone(i8Type); + const Type inputI8Type = cast(inputType).clone(i8Type); + const Type resultI8Type = cast(resultType).clone(i8Type); + + auto valuesInToI8 = + tosa::CastOp::create(rewriter, op.getLoc(), valuesInI8Type, 
valuesIn); + auto inputToI8 = + tosa::CastOp::create(rewriter, op.getLoc(), inputI8Type, input); + auto scatterI8 = tosa::ScatterOp::create( + rewriter, op.getLoc(), resultI8Type, valuesInToI8.getOutput(), indices, + inputToI8.getOutput()); + auto i8ToBool = tosa::CastOp::create(rewriter, op.getLoc(), resultType, + scatterI8.getValuesOut()); + rewriter.replaceOp(op, i8ToBool.getOutput()); + return success(); + } +}; + +struct TosaDowngrade1p1To1p0Pass + : public tosa::impl::TosaDowngrade1p1To1p0PassBase< + TosaDowngrade1p1To1p0Pass> { + using Base::Base; + + void runOnOperation() override { + MLIRContext &context = getContext(); + func::FuncOp func = getOperation(); + + RewritePatternSet patterns(&context); + patterns.add( + &context); + FrozenRewritePatternSet frozenPatterns(std::move(patterns)); + + if (failed(applyPatternsGreedily(func, frozenPatterns))) + return signalPassFailure(); + } +}; + +} // namespace diff --git a/mlir/test/Dialect/Tosa/tosa-downgrade-1-1-to-1-0.mlir b/mlir/test/Dialect/Tosa/tosa-downgrade-1-1-to-1-0.mlir new file mode 100644 index 0000000000000..5427d9119b7af --- /dev/null +++ b/mlir/test/Dialect/Tosa/tosa-downgrade-1-1-to-1-0.mlir @@ -0,0 +1,118 @@ +// RUN: mlir-opt --split-input-file --tosa-downgrade-1-1-to-1-0 %s | FileCheck %s + +// CHECK-LABEL: @test_bool_to_fp32 +// CHECK: %[[BOOL_TO_I8:.+]] = tosa.cast %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xi8> +// CHECK: %[[I8_TO_F32:.+]] = tosa.cast %[[BOOL_TO_I8]] : (tensor<13x21x3xi8>) -> tensor<13x21x3xf32> +// CHECK: return %[[I8_TO_F32]] +func.func @test_bool_to_fp32(%arg0: tensor<13x21x3xi1>) -> tensor<13x21x3xf32> { + %0 = tosa.cast %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- + +// CHECK-LABEL: @test_bool_to_fp32_unranked +// CHECK: %[[BOOL_TO_I8:.+]] = tosa.cast %arg0 : (tensor<*xi1>) -> tensor<*xi8> +// CHECK: %[[I8_TO_F32:.+]] = tosa.cast %[[BOOL_TO_I8]] : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: return %[[I8_TO_F32]] 
+func.func @test_bool_to_fp32_unranked(%arg0: tensor<*xi1>) -> tensor<*xf32> { + %0 = tosa.cast %arg0 : (tensor<*xi1>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// CHECK-LABEL: @test_fp32_to_bool_ranked_dynamic +// CHECK: %[[FP32_TO_I8:.+]] = tosa.cast %arg0 : (tensor<13x?x3xf32>) -> tensor<13x?x3xi8> +// CHECK: %[[I8_TO_BOOL:.+]] = tosa.cast %[[FP32_TO_I8]] : (tensor<13x?x3xi8>) -> tensor<13x?x3xi1> +// CHECK: return %[[I8_TO_BOOL]] +func.func @test_fp32_to_bool_ranked_dynamic(%arg0: tensor<13x?x3xf32>) -> tensor<13x?x3xi1> { + %0 = tosa.cast %arg0 : (tensor<13x?x3xf32>) -> tensor<13x?x3xi1> + return %0 : tensor<13x?x3xi1> +} + +// ----- + +// CHECK-LABEL: @test_unranked_fp32_to_bool +// CHECK: %[[FP32_TO_I8:.+]] = tosa.cast %arg0 : (tensor<*xf32>) -> tensor<*xi8> +// CHECK: %[[I8_TO_BOOL:.+]] = tosa.cast %[[FP32_TO_I8]] : (tensor<*xi8>) -> tensor<*xi1> +// CHECK: return %[[I8_TO_BOOL]] +func.func @test_unranked_fp32_to_bool(%arg0: tensor<*xf32>) -> tensor<*xi1> { + %0 = tosa.cast %arg0 : (tensor<*xf32>) -> tensor<*xi1> + return %0 : tensor<*xi1> +} + +// ----- + +// CHECK-LABEL: @test_preserve_bool_to_i8 +// CHECK: %[[CAST:.+]] = tosa.cast %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xi8> +// CHECK: return %[[CAST]] +func.func @test_preserve_bool_to_i8(%arg0: tensor<13x21x3xi1>) -> tensor<13x21x3xi8> { + %0 = tosa.cast %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> +} + +// ----- + +// CHECK-LABEL: @test_gather_bool_i32 +// CHECK: %[[VALUES_TO_I8:.+]] = tosa.cast %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xi8> +// CHECK: %[[GATHER_I8:.+]] = tosa.gather %[[VALUES_TO_I8]], %arg1 : (tensor<13x21x3xi8>, tensor<13x26xi32>) -> tensor<13x26x3xi8> +// CHECK: %[[I8_TO_BOOL:.+]] = tosa.cast %[[GATHER_I8]] : (tensor<13x26x3xi8>) -> tensor<13x26x3xi1> +// CHECK: return %[[I8_TO_BOOL]] +func.func @test_gather_bool_i32(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xi1> { + %0 = tosa.gather 
%arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x26xi32>) -> tensor<13x26x3xi1> + return %0 : tensor<13x26x3xi1> +} + +// ----- + +// CHECK-LABEL: @test_preserve_gather_bool_i64 +// CHECK: %[[GATHER:.+]] = tosa.gather %arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x26xi64>) -> tensor<13x26x3xi1> +// CHECK: return %[[GATHER]] +func.func @test_preserve_gather_bool_i64(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x26xi64>) -> tensor<13x26x3xi1> { + %0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x26xi64>) -> tensor<13x26x3xi1> + return %0 : tensor<13x26x3xi1> +} + +// ----- + +// CHECK-LABEL: @test_preserve_gather_i8_i32 +// CHECK: %[[GATHER:.+]] = tosa.gather %arg0, %arg1 : (tensor<13x21x3xi8>, tensor<13x26xi32>) -> tensor<13x26x3xi8> +// CHECK: return %[[GATHER]] +func.func @test_preserve_gather_i8_i32(%arg0: tensor<13x21x3xi8>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xi8> { + %0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xi8>, tensor<13x26xi32>) -> tensor<13x26x3xi8> + return %0 : tensor<13x26x3xi8> +} + +// ----- + +// CHECK-LABEL: @test_scatter_bool_i32 +// CHECK: %[[VALUES_IN_TO_I8:.+]] = tosa.cast %arg0 : (tensor<13x52x3xi1>) -> tensor<13x52x3xi8> +// CHECK: %[[INPUT_TO_I8:.+]] = tosa.cast %arg2 : (tensor<13x26x3xi1>) -> tensor<13x26x3xi8> +// CHECK: %[[SCATTER_I8:.+]] = tosa.scatter %[[VALUES_IN_TO_I8]], %arg1, %[[INPUT_TO_I8]] : (tensor<13x52x3xi8>, tensor<13x26xi32>, tensor<13x26x3xi8>) -> tensor<13x52x3xi8> +// CHECK: %[[I8_TO_BOOL:.+]] = tosa.cast %[[SCATTER_I8]] : (tensor<13x52x3xi8>) -> tensor<13x52x3xi1> +// CHECK: return %[[I8_TO_BOOL]] +func.func @test_scatter_bool_i32(%arg0: tensor<13x52x3xi1>, %arg1: tensor<13x26xi32>, %arg2: tensor<13x26x3xi1>) -> tensor<13x52x3xi1> { + %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x52x3xi1>, tensor<13x26xi32>, tensor<13x26x3xi1>) -> tensor<13x52x3xi1> + return %0 : tensor<13x52x3xi1> +} + +// ----- + +// CHECK-LABEL: @test_preserve_scatter_bool_i64 +// CHECK: %[[SCATTER:.+]] = tosa.scatter %arg0, 
%arg1, %arg2 : (tensor<13x52x3xi1>, tensor<13x26xi64>, tensor<13x26x3xi1>) -> tensor<13x52x3xi1> +// CHECK: return %[[SCATTER]] +func.func @test_preserve_scatter_bool_i64(%arg0: tensor<13x52x3xi1>, %arg1: tensor<13x26xi64>, %arg2: tensor<13x26x3xi1>) -> tensor<13x52x3xi1> { + %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x52x3xi1>, tensor<13x26xi64>, tensor<13x26x3xi1>) -> tensor<13x52x3xi1> + return %0 : tensor<13x52x3xi1> +} + +// ----- + +// CHECK-LABEL: @test_preserve_scatter_i8_i32 +// CHECK: %[[SCATTER:.+]] = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x52x3xi8>, tensor<13x26xi32>, tensor<13x26x3xi8>) -> tensor<13x52x3xi8> +// CHECK: return %[[SCATTER]] +func.func @test_preserve_scatter_i8_i32(%arg0: tensor<13x52x3xi8>, %arg1: tensor<13x26xi32>, %arg2: tensor<13x26x3xi8>) -> tensor<13x52x3xi8> { + %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x52x3xi8>, tensor<13x26xi32>, tensor<13x26x3xi8>) -> tensor<13x52x3xi8> + return %0 : tensor<13x52x3xi8> +} From c2b871b321ce3bcef71653f19ab1fc44694a8f3c Mon Sep 17 00:00:00 2001 From: Zhige Chen Date: Mon, 11 May 2026 18:12:28 +0800 Subject: [PATCH 243/538] [llubi] Upstream existing floating-point intrinsics (#196034) This PR upstreams existing floating-point intrinsics in the out-of-tree version of llubi. Including FP vector reduction, FP min/max operations, etc. Some minor bugs from #188453 are also fixed. 
--- llvm/test/tools/llubi/intr_fp_fma.ll | 30 ++ llvm/test/tools/llubi/intr_fp_fmuladd.ll | 19 + llvm/test/tools/llubi/intr_fp_fptoi_sat.ll | 40 ++ llvm/test/tools/llubi/intr_fp_is_fpclass.ll | 35 ++ llvm/test/tools/llubi/intr_fp_minmax.ll | 122 +++++ llvm/test/tools/llubi/intr_fp_unary.ll | 28 ++ .../test/tools/llubi/intr_fp_vector_reduce.ll | 82 ++++ .../llubi/intr_fp_vector_reduce_nondet.ll | 59 +++ llvm/tools/llubi/lib/Context.h | 3 + llvm/tools/llubi/lib/Interpreter.cpp | 442 +++++++++++++++--- llvm/tools/llubi/llubi.cpp | 5 + 11 files changed, 799 insertions(+), 66 deletions(-) create mode 100644 llvm/test/tools/llubi/intr_fp_fma.ll create mode 100644 llvm/test/tools/llubi/intr_fp_fmuladd.ll create mode 100644 llvm/test/tools/llubi/intr_fp_fptoi_sat.ll create mode 100644 llvm/test/tools/llubi/intr_fp_is_fpclass.ll create mode 100644 llvm/test/tools/llubi/intr_fp_minmax.ll create mode 100644 llvm/test/tools/llubi/intr_fp_unary.ll create mode 100644 llvm/test/tools/llubi/intr_fp_vector_reduce.ll create mode 100644 llvm/test/tools/llubi/intr_fp_vector_reduce_nondet.ll diff --git a/llvm/test/tools/llubi/intr_fp_fma.ll b/llvm/test/tools/llubi/intr_fp_fma.ll new file mode 100644 index 0000000000000..cd52f1ac7d0d4 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_fma.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %fma = call float @llvm.fma.f32(float 2.0, float 3.0, float 4.0) + %fma_poison_lhs = call float @llvm.fma.f32(float poison, float 3.0, float 4.0) + %fma_poison_rhs = call float @llvm.fma.f32(float 2.0, float poison, float 4.0) + %fma_poison_addend = call float @llvm.fma.f32(float 2.0, float 3.0, float poison) + %fma_vec = call <4 x float> @llvm.fma.v4f32(<4 x float> , <4 x float> , <4 x float> ) + + %fmuladd = call float @llvm.fmuladd.f32(float 2.0, float 3.0, float 4.0) + %fmuladd_poison_lhs = call float 
@llvm.fmuladd.f32(float poison, float 3.0, float 4.0) + %fmuladd_poison_rhs = call float @llvm.fmuladd.f32(float 2.0, float poison, float 4.0) + %fmuladd_poison_addend = call float @llvm.fmuladd.f32(float 2.0, float 3.0, float poison) + %fmuladd_vec = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> , <4 x float> , <4 x float> ) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %fma = call float @llvm.fma.f32(float 2.000000e+00, float 3.000000e+00, float 4.000000e+00) => float 1.000000e+01 +; CHECK-NEXT: %fma_poison_lhs = call float @llvm.fma.f32(float poison, float 3.000000e+00, float 4.000000e+00) => poison +; CHECK-NEXT: %fma_poison_rhs = call float @llvm.fma.f32(float 2.000000e+00, float poison, float 4.000000e+00) => poison +; CHECK-NEXT: %fma_poison_addend = call float @llvm.fma.f32(float 2.000000e+00, float 3.000000e+00, float poison) => poison +; CHECK-NEXT: %fma_vec = call <4 x float> @llvm.fma.v4f32(<4 x float> , <4 x float> , <4 x float> ) => { float 1.000000e+01, poison, float 1.700000e+01, poison } +; CHECK-NEXT: %fmuladd = call float @llvm.fmuladd.f32(float 2.000000e+00, float 3.000000e+00, float 4.000000e+00) => float 1.000000e+01 +; CHECK-NEXT: %fmuladd_poison_lhs = call float @llvm.fmuladd.f32(float poison, float 3.000000e+00, float 4.000000e+00) => poison +; CHECK-NEXT: %fmuladd_poison_rhs = call float @llvm.fmuladd.f32(float 2.000000e+00, float poison, float 4.000000e+00) => poison +; CHECK-NEXT: %fmuladd_poison_addend = call float @llvm.fmuladd.f32(float 2.000000e+00, float 3.000000e+00, float poison) => poison +; CHECK-NEXT: %fmuladd_vec = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> , <4 x float> , <4 x float> ) => { float 1.000000e+01, poison, float 1.700000e+01, poison } +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_fmuladd.ll b/llvm/test/tools/llubi/intr_fp_fmuladd.ll new file mode 100644 index 0000000000000..dd504a3e71a97 --- /dev/null +++ 
b/llvm/test/tools/llubi/intr_fp_fmuladd.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --verbose --fuse-fmuladd=false < %s 2>&1 | FileCheck %s + +define void @main() { + %fmuladd = call float @llvm.fmuladd.f32(float 2.0, float 3.0, float 4.0) + %fmuladd_poison_lhs = call float @llvm.fmuladd.f32(float poison, float 3.0, float 4.0) + %fmuladd_poison_rhs = call float @llvm.fmuladd.f32(float 2.0, float poison, float 4.0) + %fmuladd_poison_addend = call float @llvm.fmuladd.f32(float 2.0, float 3.0, float poison) + %fmuladd_vec = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> , <4 x float> , <4 x float> ) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %fmuladd = call float @llvm.fmuladd.f32(float 2.000000e+00, float 3.000000e+00, float 4.000000e+00) => float 1.000000e+01 +; CHECK-NEXT: %fmuladd_poison_lhs = call float @llvm.fmuladd.f32(float poison, float 3.000000e+00, float 4.000000e+00) => poison +; CHECK-NEXT: %fmuladd_poison_rhs = call float @llvm.fmuladd.f32(float 2.000000e+00, float poison, float 4.000000e+00) => poison +; CHECK-NEXT: %fmuladd_poison_addend = call float @llvm.fmuladd.f32(float 2.000000e+00, float 3.000000e+00, float poison) => poison +; CHECK-NEXT: %fmuladd_vec = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> , <4 x float> , <4 x float> ) => { float 1.000000e+01, poison, float 1.700000e+01, poison } +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_fptoi_sat.ll b/llvm/test/tools/llubi/intr_fp_fptoi_sat.ll new file mode 100644 index 0000000000000..5b80641d2afb9 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_fptoi_sat.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %spos = call i8 @llvm.fptosi.sat.i8.f32(float 127.75) + %sneg = call i8 
@llvm.fptosi.sat.i8.f32(float -128.75) + %slarge = call i8 @llvm.fptosi.sat.i8.f32(float 1.0e+10) + %slarge_neg = call i8 @llvm.fptosi.sat.i8.f32(float -1.0e+10) + %spoison = call i8 @llvm.fptosi.sat.i8.f32(float poison) + %svec = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> ) + %sqnan = call i8 @llvm.fptosi.sat.i8.f32(float +nan(0x42)) + %ssnan = call i8 @llvm.fptosi.sat.i8.f32(float +snan(0x42)) + + %upos = call i8 @llvm.fptoui.sat.i8.f32(float 255.75) + %uneg = call i8 @llvm.fptoui.sat.i8.f32(float -1.25) + %ularge = call i8 @llvm.fptoui.sat.i8.f32(float 1.0e+10) + %upoison = call i8 @llvm.fptoui.sat.i8.f32(float poison) + %uvec = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> ) + %uqnan = call i8 @llvm.fptoui.sat.i8.f32(float +nan(0x42)) + %usnan = call i8 @llvm.fptoui.sat.i8.f32(float +snan(0x42)) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %spos = call i8 @llvm.fptosi.sat.i8.f32(float 1.277500e+02) => i8 127 +; CHECK-NEXT: %sneg = call i8 @llvm.fptosi.sat.i8.f32(float -1.287500e+02) => i8 -128 +; CHECK-NEXT: %slarge = call i8 @llvm.fptosi.sat.i8.f32(float 1.000000e+10) => i8 127 +; CHECK-NEXT: %slarge_neg = call i8 @llvm.fptosi.sat.i8.f32(float -1.000000e+10) => i8 -128 +; CHECK-NEXT: %spoison = call i8 @llvm.fptosi.sat.i8.f32(float poison) => poison +; CHECK-NEXT: %svec = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> ) => { i8 1, poison, i8 127, i8 -128 } +; CHECK-NEXT: %sqnan = call i8 @llvm.fptosi.sat.i8.f32(float +nan(0x42)) => i8 0 +; CHECK-NEXT: %ssnan = call i8 @llvm.fptosi.sat.i8.f32(float +snan(0x42)) => i8 0 +; CHECK-NEXT: %upos = call i8 @llvm.fptoui.sat.i8.f32(float 2.557500e+02) => i8 -1 +; CHECK-NEXT: %uneg = call i8 @llvm.fptoui.sat.i8.f32(float -1.250000e+00) => i8 0 +; CHECK-NEXT: %ularge = call i8 @llvm.fptoui.sat.i8.f32(float 1.000000e+10) => i8 -1 +; CHECK-NEXT: %upoison = call i8 @llvm.fptoui.sat.i8.f32(float poison) => poison +; CHECK-NEXT: %uvec = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x 
float> ) => { i8 1, poison, i8 -1, i8 0 } +; CHECK-NEXT: %uqnan = call i8 @llvm.fptoui.sat.i8.f32(float +nan(0x42)) => i8 0 +; CHECK-NEXT: %usnan = call i8 @llvm.fptoui.sat.i8.f32(float +snan(0x42)) => i8 0 +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_is_fpclass.ll b/llvm/test/tools/llubi/intr_fp_is_fpclass.ll new file mode 100644 index 0000000000000..32614a2bf65d8 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_is_fpclass.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %is_snan = call i1 @llvm.is.fpclass.f32(float +snan(0x42), i32 1) + %is_qnan = call i1 @llvm.is.fpclass.f32(float +nan(0x42), i32 2) + %is_nan = call i1 @llvm.is.fpclass.f32(float -nan(0x42), i32 3) + %is_neg_inf = call i1 @llvm.is.fpclass.f32(float -inf, i32 4) + %is_neg_normal = call i1 @llvm.is.fpclass.f32(float -1.0, i32 8) + %is_neg_subnormal = call i1 @llvm.is.fpclass.f32(float -1.434929627468612680625899e-42, i32 16) + %is_neg_zero = call i1 @llvm.is.fpclass.f32(float -0.0, i32 32) + %is_pos_zero = call i1 @llvm.is.fpclass.f32(float 0.0, i32 64) + %is_pos_subnormal = call i1 @llvm.is.fpclass.f32(float 1.434929627468612680625899e-42, i32 128) + %is_pos_normal = call i1 @llvm.is.fpclass.f32(float 1.0, i32 256) + %is_pos_inf = call i1 @llvm.is.fpclass.f32(float +inf, i32 512) + %is_poison = call i1 @llvm.is.fpclass.f32(float poison, i32 256) + %is_vec = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> , i32 291) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %is_snan = call i1 @llvm.is.fpclass.f32(float +snan(0x42), i32 1) => T +; CHECK-NEXT: %is_qnan = call i1 @llvm.is.fpclass.f32(float +nan(0x42), i32 2) => T +; CHECK-NEXT: %is_nan = call i1 @llvm.is.fpclass.f32(float -nan(0x42), i32 3) => T +; CHECK-NEXT: %is_neg_inf = call i1 @llvm.is.fpclass.f32(float -inf, i32 4) 
=> T +; CHECK-NEXT: %is_neg_normal = call i1 @llvm.is.fpclass.f32(float -1.000000e+00, i32 8) => T +; CHECK-NEXT: %is_neg_subnormal = call i1 @llvm.is.fpclass.f32(float -1.434930e-42, i32 16) => T +; CHECK-NEXT: %is_neg_zero = call i1 @llvm.is.fpclass.f32(float -0.000000e+00, i32 32) => T +; CHECK-NEXT: %is_pos_zero = call i1 @llvm.is.fpclass.f32(float 0.000000e+00, i32 64) => T +; CHECK-NEXT: %is_pos_subnormal = call i1 @llvm.is.fpclass.f32(float 1.434930e-42, i32 128) => T +; CHECK-NEXT: %is_pos_normal = call i1 @llvm.is.fpclass.f32(float 1.000000e+00, i32 256) => T +; CHECK-NEXT: %is_pos_inf = call i1 @llvm.is.fpclass.f32(float +inf, i32 512) => T +; CHECK-NEXT: %is_poison = call i1 @llvm.is.fpclass.f32(float poison, i32 256) => poison +; CHECK-NEXT: %is_vec = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> , i32 291) => { T, poison, T, T } +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_minmax.ll b/llvm/test/tools/llubi/intr_fp_minmax.ll new file mode 100644 index 0000000000000..d6b34394607e4 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_minmax.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %maxnum = call float @llvm.maxnum.f32(float 1.0, float 2.0) + %maxnum_neg_zero = call float @llvm.maxnum.f32(float -0.0, float 0.0) + %maxnum_qnan_lhs = call float @llvm.maxnum.f32(float +nan(0x42), float 2.0) + %maxnum_qnan_rhs = call float @llvm.maxnum.f32(float 2.0, float +nan(0x42)) + %maxnum_snan_lhs = call float @llvm.maxnum.f32(float +snan(0x42), float 2.0) + %maxnum_snan_rhs = call float @llvm.maxnum.f32(float 2.0, float +snan(0x42)) + %maxnum_poison_lhs = call float @llvm.maxnum.f32(float poison, float 2.0) + %maxnum_poison_rhs = call float @llvm.maxnum.f32(float 1.0, float poison) + %maxnum_vec = call <4 x float> @llvm.maxnum.v4f32(<4 x float> , <4 x 
float> ) + + %minnum = call float @llvm.minnum.f32(float 1.0, float 2.0) + %minnum_neg_zero = call float @llvm.minnum.f32(float -0.0, float 0.0) + %minnum_qnan_lhs = call float @llvm.minnum.f32(float +nan(0x42), float 2.0) + %minnum_qnan_rhs = call float @llvm.minnum.f32(float 2.0, float +nan(0x42)) + %minnum_snan_lhs = call float @llvm.minnum.f32(float +snan(0x42), float 2.0) + %minnum_snan_rhs = call float @llvm.minnum.f32(float 2.0, float +snan(0x42)) + %minnum_poison_lhs = call float @llvm.minnum.f32(float poison, float 2.0) + %minnum_poison_rhs = call float @llvm.minnum.f32(float 1.0, float poison) + %minnum_vec = call <4 x float> @llvm.minnum.v4f32(<4 x float> , <4 x float> ) + + %maximum = call float @llvm.maximum.f32(float 1.0, float 2.0) + %maximum_neg_zero = call float @llvm.maximum.f32(float -0.0, float 0.0) + %maximum_qnan_lhs = call float @llvm.maximum.f32(float +nan(0x42), float 2.0) + %maximum_qnan_rhs = call float @llvm.maximum.f32(float 2.0, float +nan(0x42)) + %maximum_snan_lhs = call float @llvm.maximum.f32(float +snan(0x42), float 2.0) + %maximum_snan_rhs = call float @llvm.maximum.f32(float 2.0, float +snan(0x42)) + %maximum_poison_lhs = call float @llvm.maximum.f32(float poison, float 2.0) + %maximum_poison_rhs = call float @llvm.maximum.f32(float 1.0, float poison) + %maximum_vec = call <4 x float> @llvm.maximum.v4f32(<4 x float> , <4 x float> ) + + %minimum = call float @llvm.minimum.f32(float 1.0, float 2.0) + %minimum_neg_zero = call float @llvm.minimum.f32(float -0.0, float 0.0) + %minimum_qnan_lhs = call float @llvm.minimum.f32(float +nan(0x42), float 2.0) + %minimum_qnan_rhs = call float @llvm.minimum.f32(float 2.0, float +nan(0x42)) + %minimum_snan_lhs = call float @llvm.minimum.f32(float +snan(0x42), float 2.0) + %minimum_snan_rhs = call float @llvm.minimum.f32(float 2.0, float +snan(0x42)) + %minimum_poison_lhs = call float @llvm.minimum.f32(float poison, float 2.0) + %minimum_poison_rhs = call float @llvm.minimum.f32(float 1.0, 
float poison) + %minimum_vec = call <4 x float> @llvm.minimum.v4f32(<4 x float> , <4 x float> ) + + %maximumnum = call float @llvm.maximumnum.f32(float 1.0, float 2.0) + %maximumnum_neg_zero = call float @llvm.maximumnum.f32(float -0.0, float 0.0) + %maximumnum_qnan_lhs = call float @llvm.maximumnum.f32(float +nan(0x42), float 2.0) + %maximumnum_qnan_rhs = call float @llvm.maximumnum.f32(float 2.0, float +nan(0x42)) + %maximumnum_snan_lhs = call float @llvm.maximumnum.f32(float +snan(0x42), float 2.0) + %maximumnum_snan_rhs = call float @llvm.maximumnum.f32(float 2.0, float +snan(0x42)) + %maximumnum_poison_lhs = call float @llvm.maximumnum.f32(float poison, float 2.0) + %maximumnum_poison_rhs = call float @llvm.maximumnum.f32(float 1.0, float poison) + %maximumnum_vec = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> , <4 x float> ) + + %minimumnum = call float @llvm.minimumnum.f32(float 1.0, float 2.0) + %minimumnum_neg_zero = call float @llvm.minimumnum.f32(float -0.0, float 0.0) + %minimumnum_qnan_lhs = call float @llvm.minimumnum.f32(float +nan(0x42), float 2.0) + %minimumnum_qnan_rhs = call float @llvm.minimumnum.f32(float 2.0, float +nan(0x42)) + %minimumnum_snan_lhs = call float @llvm.minimumnum.f32(float +snan(0x42), float 2.0) + %minimumnum_snan_rhs = call float @llvm.minimumnum.f32(float 2.0, float +snan(0x42)) + %minimumnum_poison_lhs = call float @llvm.minimumnum.f32(float poison, float 2.0) + %minimumnum_poison_rhs = call float @llvm.minimumnum.f32(float 1.0, float poison) + %minimumnum_vec = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> , <4 x float> ) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %maxnum = call float @llvm.maxnum.f32(float 1.000000e+00, float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %maxnum_neg_zero = call float @llvm.maxnum.f32(float -0.000000e+00, float 0.000000e+00) => float 0.000000e+00 +; CHECK-NEXT: %maxnum_qnan_lhs = call float @llvm.maxnum.f32(float +nan(0x42), float 2.000000e+00) => 
float 2.000000e+00 +; CHECK-NEXT: %maxnum_qnan_rhs = call float @llvm.maxnum.f32(float 2.000000e+00, float +nan(0x42)) => float 2.000000e+00 +; CHECK-NEXT: %maxnum_snan_lhs = call float @llvm.maxnum.f32(float +snan(0x42), float 2.000000e+00) => float 0xFFC00042 +; CHECK-NEXT: %maxnum_snan_rhs = call float @llvm.maxnum.f32(float 2.000000e+00, float +snan(0x42)) => float 0x7F800042 +; CHECK-NEXT: %maxnum_poison_lhs = call float @llvm.maxnum.f32(float poison, float 2.000000e+00) => poison +; CHECK-NEXT: %maxnum_poison_rhs = call float @llvm.maxnum.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %maxnum_vec = call <4 x float> @llvm.maxnum.v4f32(<4 x float> , <4 x float> ) => { float 2.000000e+00, poison, float 2.000000e+00, poison } +; CHECK-NEXT: %minnum = call float @llvm.minnum.f32(float 1.000000e+00, float 2.000000e+00) => float 1.000000e+00 +; CHECK-NEXT: %minnum_neg_zero = call float @llvm.minnum.f32(float -0.000000e+00, float 0.000000e+00) => float -0.000000e+00 +; CHECK-NEXT: %minnum_qnan_lhs = call float @llvm.minnum.f32(float +nan(0x42), float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %minnum_qnan_rhs = call float @llvm.minnum.f32(float 2.000000e+00, float +nan(0x42)) => float 2.000000e+00 +; CHECK-NEXT: %minnum_snan_lhs = call float @llvm.minnum.f32(float +snan(0x42), float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %minnum_snan_rhs = call float @llvm.minnum.f32(float 2.000000e+00, float +snan(0x42)) => float 0x7F800042 +; CHECK-NEXT: %minnum_poison_lhs = call float @llvm.minnum.f32(float poison, float 2.000000e+00) => poison +; CHECK-NEXT: %minnum_poison_rhs = call float @llvm.minnum.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %minnum_vec = call <4 x float> @llvm.minnum.v4f32(<4 x float> , <4 x float> ) => { float 1.000000e+00, poison, float 2.000000e+00, poison } +; CHECK-NEXT: %maximum = call float @llvm.maximum.f32(float 1.000000e+00, float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: 
%maximum_neg_zero = call float @llvm.maximum.f32(float -0.000000e+00, float 0.000000e+00) => float 0.000000e+00 +; CHECK-NEXT: %maximum_qnan_lhs = call float @llvm.maximum.f32(float +nan(0x42), float 2.000000e+00) => float 0xFFC00000 +; CHECK-NEXT: %maximum_qnan_rhs = call float @llvm.maximum.f32(float 2.000000e+00, float +nan(0x42)) => float NaN +; CHECK-NEXT: %maximum_snan_lhs = call float @llvm.maximum.f32(float +snan(0x42), float 2.000000e+00) => float 0x7F800042 +; CHECK-NEXT: %maximum_snan_rhs = call float @llvm.maximum.f32(float 2.000000e+00, float +snan(0x42)) => float 0xFF800042 +; CHECK-NEXT: %maximum_poison_lhs = call float @llvm.maximum.f32(float poison, float 2.000000e+00) => poison +; CHECK-NEXT: %maximum_poison_rhs = call float @llvm.maximum.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %maximum_vec = call <4 x float> @llvm.maximum.v4f32(<4 x float> , <4 x float> ) => { float 2.000000e+00, poison, float 0xFFC00000, poison } +; CHECK-NEXT: %minimum = call float @llvm.minimum.f32(float 1.000000e+00, float 2.000000e+00) => float 1.000000e+00 +; CHECK-NEXT: %minimum_neg_zero = call float @llvm.minimum.f32(float -0.000000e+00, float 0.000000e+00) => float -0.000000e+00 +; CHECK-NEXT: %minimum_qnan_lhs = call float @llvm.minimum.f32(float +nan(0x42), float 2.000000e+00) => float 0xFFC00042 +; CHECK-NEXT: %minimum_qnan_rhs = call float @llvm.minimum.f32(float 2.000000e+00, float +nan(0x42)) => float NaN +; CHECK-NEXT: %minimum_snan_lhs = call float @llvm.minimum.f32(float +snan(0x42), float 2.000000e+00) => float 0x7FC00042 +; CHECK-NEXT: %minimum_snan_rhs = call float @llvm.minimum.f32(float 2.000000e+00, float +snan(0x42)) => float NaN +; CHECK-NEXT: %minimum_poison_lhs = call float @llvm.minimum.f32(float poison, float 2.000000e+00) => poison +; CHECK-NEXT: %minimum_poison_rhs = call float @llvm.minimum.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %minimum_vec = call <4 x float> @llvm.minimum.v4f32(<4 x float> , <4 x 
float> ) => { float 1.000000e+00, poison, float 0xFFC00042, poison } +; CHECK-NEXT: %maximumnum = call float @llvm.maximumnum.f32(float 1.000000e+00, float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %maximumnum_neg_zero = call float @llvm.maximumnum.f32(float -0.000000e+00, float 0.000000e+00) => float 0.000000e+00 +; CHECK-NEXT: %maximumnum_qnan_lhs = call float @llvm.maximumnum.f32(float +nan(0x42), float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %maximumnum_qnan_rhs = call float @llvm.maximumnum.f32(float 2.000000e+00, float +nan(0x42)) => float 2.000000e+00 +; CHECK-NEXT: %maximumnum_snan_lhs = call float @llvm.maximumnum.f32(float +snan(0x42), float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %maximumnum_snan_rhs = call float @llvm.maximumnum.f32(float 2.000000e+00, float +snan(0x42)) => float 2.000000e+00 +; CHECK-NEXT: %maximumnum_poison_lhs = call float @llvm.maximumnum.f32(float poison, float 2.000000e+00) => poison +; CHECK-NEXT: %maximumnum_poison_rhs = call float @llvm.maximumnum.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %maximumnum_vec = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> , <4 x float> ) => { float 2.000000e+00, poison, float 2.000000e+00, poison } +; CHECK-NEXT: %minimumnum = call float @llvm.minimumnum.f32(float 1.000000e+00, float 2.000000e+00) => float 1.000000e+00 +; CHECK-NEXT: %minimumnum_neg_zero = call float @llvm.minimumnum.f32(float -0.000000e+00, float 0.000000e+00) => float -0.000000e+00 +; CHECK-NEXT: %minimumnum_qnan_lhs = call float @llvm.minimumnum.f32(float +nan(0x42), float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %minimumnum_qnan_rhs = call float @llvm.minimumnum.f32(float 2.000000e+00, float +nan(0x42)) => float 2.000000e+00 +; CHECK-NEXT: %minimumnum_snan_lhs = call float @llvm.minimumnum.f32(float +snan(0x42), float 2.000000e+00) => float 2.000000e+00 +; CHECK-NEXT: %minimumnum_snan_rhs = call float @llvm.minimumnum.f32(float 2.000000e+00, float +snan(0x42)) 
=> float 2.000000e+00 +; CHECK-NEXT: %minimumnum_poison_lhs = call float @llvm.minimumnum.f32(float poison, float 2.000000e+00) => poison +; CHECK-NEXT: %minimumnum_poison_rhs = call float @llvm.minimumnum.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %minimumnum_vec = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> , <4 x float> ) => { float 1.000000e+00, poison, float 2.000000e+00, poison } +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_unary.ll b/llvm/test/tools/llubi/intr_fp_unary.ll new file mode 100644 index 0000000000000..269664bf14c34 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_unary.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %fabs = call float @llvm.fabs.f32(float -1.5) + %fabs_neg_zero = call float @llvm.fabs.f32(float -0.0) + %fabs_poison = call float @llvm.fabs.f32(float poison) + %fabs_vec = call <4 x float> @llvm.fabs.v4f32(<4 x float> ) + + %copysign = call float @llvm.copysign.f32(float 1.0, float -2.0) + %copysign_neg_zero = call float @llvm.copysign.f32(float 1.0, float -0.0) + %copysign_poison_mag = call float @llvm.copysign.f32(float poison, float -2.0) + %copysign_poison_sign = call float @llvm.copysign.f32(float 1.0, float poison) + %copysign_vec = call <4 x float> @llvm.copysign.v4f32(<4 x float> , <4 x float> ) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float -1.500000e+00) => float 1.500000e+00 +; CHECK-NEXT: %fabs_neg_zero = call float @llvm.fabs.f32(float -0.000000e+00) => float 0.000000e+00 +; CHECK-NEXT: %fabs_poison = call float @llvm.fabs.f32(float poison) => poison +; CHECK-NEXT: %fabs_vec = call <4 x float> @llvm.fabs.v4f32(<4 x float> ) => { float 1.000000e+00, poison, float 2.000000e+00, float 0.000000e+00 } +; CHECK-NEXT: %copysign = call 
float @llvm.copysign.f32(float 1.000000e+00, float -2.000000e+00) => float -1.000000e+00 +; CHECK-NEXT: %copysign_neg_zero = call float @llvm.copysign.f32(float 1.000000e+00, float -0.000000e+00) => float -1.000000e+00 +; CHECK-NEXT: %copysign_poison_mag = call float @llvm.copysign.f32(float poison, float -2.000000e+00) => poison +; CHECK-NEXT: %copysign_poison_sign = call float @llvm.copysign.f32(float 1.000000e+00, float poison) => poison +; CHECK-NEXT: %copysign_vec = call <4 x float> @llvm.copysign.v4f32(<4 x float> , <4 x float> ) => { float -1.000000e+00, poison, float 3.000000e+00, float 0.000000e+00 } +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_vector_reduce.ll b/llvm/test/tools/llubi/intr_fp_vector_reduce.ll new file mode 100644 index 0000000000000..feccc03b7f1c4 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_vector_reduce.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py UTC_ARGS: --version 6 +; RUN: llubi --vscale=4 --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %fadd = call float @llvm.vector.reduce.fadd.v4f32(float 1.0, <4 x float> ) + %fadd_poison_acc = call float @llvm.vector.reduce.fadd.v4f32(float poison, <4 x float> ) + %fadd_poison_vec = call float @llvm.vector.reduce.fadd.v4f32(float 1.0, <4 x float> ) + + %fmul = call float @llvm.vector.reduce.fmul.v4f32(float 2.0, <4 x float> ) + %fmul_poison_acc = call float @llvm.vector.reduce.fmul.v4f32(float poison, <4 x float> ) + %fmul_poison_vec = call float @llvm.vector.reduce.fmul.v4f32(float 2.0, <4 x float> ) + + %fmax = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_poison = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_snan = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + + %fmin = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_poison = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_snan = 
call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + + %fmaximum = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> ) + %fmaximum_poison = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> ) + + %fminimum = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> ) + %fminimum_poison = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> ) + + %sv_poison = insertelement splat (float 2.0), float poison, i64 1 + %sv_fadd = call float @llvm.vector.reduce.fadd.nxv4f32(float 1.0, splat (float 2.0)) + %sv_fadd_poison_acc = call float @llvm.vector.reduce.fadd.nxv4f32(float poison, splat (float 2.0)) + %sv_fadd_poison_vec = call float @llvm.vector.reduce.fadd.nxv4f32(float 1.0, %sv_poison) + + %sv_fmul = call float @llvm.vector.reduce.fmul.nxv4f32(float 2.0, splat (float 2.0)) + %sv_fmul_poison_acc = call float @llvm.vector.reduce.fmul.nxv4f32(float poison, splat (float 2.0)) + %sv_fmul_poison_vec = call float @llvm.vector.reduce.fmul.nxv4f32(float 2.0, %sv_poison) + + %sv_fmax = call float @llvm.vector.reduce.fmax.nxv4f32( splat (float 2.0)) + %sv_fmax_poison = call float @llvm.vector.reduce.fmax.nxv4f32( %sv_poison) + + %sv_fmin = call float @llvm.vector.reduce.fmin.nxv4f32( splat (float 2.0)) + %sv_fmin_poison = call float @llvm.vector.reduce.fmin.nxv4f32( %sv_poison) + + %sv_fmaximum = call float @llvm.vector.reduce.fmaximum.nxv4f32( splat (float 2.0)) + %sv_fmaximum_poison = call float @llvm.vector.reduce.fmaximum.nxv4f32( %sv_poison) + + %sv_fminimum = call float @llvm.vector.reduce.fminimum.nxv4f32( splat (float 2.0)) + %sv_fminimum_poison = call float @llvm.vector.reduce.fminimum.nxv4f32( %sv_poison) + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %fadd = call float @llvm.vector.reduce.fadd.v4f32(float 1.000000e+00, <4 x float> ) => float 1.500000e+01 +; CHECK-NEXT: %fadd_poison_acc = call float @llvm.vector.reduce.fadd.v4f32(float poison, <4 x float> ) => poison +; CHECK-NEXT: %fadd_poison_vec = call float 
@llvm.vector.reduce.fadd.v4f32(float 1.000000e+00, <4 x float> ) => poison +; CHECK-NEXT: %fmul = call float @llvm.vector.reduce.fmul.v4f32(float 2.000000e+00, <4 x float> ) => float 2.400000e+02 +; CHECK-NEXT: %fmul_poison_acc = call float @llvm.vector.reduce.fmul.v4f32(float poison, <4 x float> ) => poison +; CHECK-NEXT: %fmul_poison_vec = call float @llvm.vector.reduce.fmul.v4f32(float 2.000000e+00, <4 x float> ) => poison +; CHECK-NEXT: %fmax = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 5.000000e+00 +; CHECK-NEXT: %fmax_poison = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => poison +; CHECK-NEXT: %fmax_snan = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 5.000000e+00 +; CHECK-NEXT: %fmin = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_poison = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => poison +; CHECK-NEXT: %fmin_snan = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 0x7F800042 +; CHECK-NEXT: %fmaximum = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> ) => float 5.000000e+00 +; CHECK-NEXT: %fmaximum_poison = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> ) => poison +; CHECK-NEXT: %fminimum = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fminimum_poison = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> ) => poison +; CHECK-NEXT: %sv_poison = insertelement splat (float 2.000000e+00), float poison, i64 1 => { float 2.000000e+00, poison, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00 } +; CHECK-NEXT: %sv_fadd = call float @llvm.vector.reduce.fadd.nxv4f32(float 1.000000e+00, splat (float 2.000000e+00)) => float 
3.300000e+01 +; CHECK-NEXT: %sv_fadd_poison_acc = call float @llvm.vector.reduce.fadd.nxv4f32(float poison, splat (float 2.000000e+00)) => poison +; CHECK-NEXT: %sv_fadd_poison_vec = call float @llvm.vector.reduce.fadd.nxv4f32(float 1.000000e+00, %sv_poison) => poison +; CHECK-NEXT: %sv_fmul = call float @llvm.vector.reduce.fmul.nxv4f32(float 2.000000e+00, splat (float 2.000000e+00)) => float 1.310720e+05 +; CHECK-NEXT: %sv_fmul_poison_acc = call float @llvm.vector.reduce.fmul.nxv4f32(float poison, splat (float 2.000000e+00)) => poison +; CHECK-NEXT: %sv_fmul_poison_vec = call float @llvm.vector.reduce.fmul.nxv4f32(float 2.000000e+00, %sv_poison) => poison +; CHECK-NEXT: %sv_fmax = call float @llvm.vector.reduce.fmax.nxv4f32( splat (float 2.000000e+00)) => float 2.000000e+00 +; CHECK-NEXT: %sv_fmax_poison = call float @llvm.vector.reduce.fmax.nxv4f32( %sv_poison) => poison +; CHECK-NEXT: %sv_fmin = call float @llvm.vector.reduce.fmin.nxv4f32( splat (float 2.000000e+00)) => float 2.000000e+00 +; CHECK-NEXT: %sv_fmin_poison = call float @llvm.vector.reduce.fmin.nxv4f32( %sv_poison) => poison +; CHECK-NEXT: %sv_fmaximum = call float @llvm.vector.reduce.fmaximum.nxv4f32( splat (float 2.000000e+00)) => float 2.000000e+00 +; CHECK-NEXT: %sv_fmaximum_poison = call float @llvm.vector.reduce.fmaximum.nxv4f32( %sv_poison) => poison +; CHECK-NEXT: %sv_fminimum = call float @llvm.vector.reduce.fminimum.nxv4f32( splat (float 2.000000e+00)) => float 2.000000e+00 +; CHECK-NEXT: %sv_fminimum_poison = call float @llvm.vector.reduce.fminimum.nxv4f32( %sv_poison) => poison +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/test/tools/llubi/intr_fp_vector_reduce_nondet.ll b/llvm/test/tools/llubi/intr_fp_vector_reduce_nondet.ll new file mode 100644 index 0000000000000..c23b79e84b8a9 --- /dev/null +++ b/llvm/test/tools/llubi/intr_fp_vector_reduce_nondet.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llubi_test_checks.py 
UTC_ARGS: --version 6 +; RUN: llubi --verbose < %s 2>&1 | FileCheck %s + +define void @main() { + %fmax_1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_3 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_6 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_7 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_8 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_9 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_10 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_11 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + %fmax_12 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) + + %fmin_1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_3 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_6 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_7 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_8 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_9 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_10 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_11 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + %fmin_12 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) + + ret void +} +; CHECK: Entering function: main +; CHECK-NEXT: %fmax_1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 0x7F800042 +; 
CHECK-NEXT: %fmax_3 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 0x7FC00042 +; CHECK-NEXT: %fmax_4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 4.000000e+00 +; CHECK-NEXT: %fmax_5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_6 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_7 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_8 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 4.000000e+00 +; CHECK-NEXT: %fmax_9 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_10 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_11 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmax_12 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmin_1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 4.000000e+00 +; CHECK-NEXT: %fmin_3 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 4.000000e+00 +; CHECK-NEXT: %fmin_4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_6 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_7 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_8 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_9 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmin_10 = call float 
@llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 6.000000e+00 +; CHECK-NEXT: %fmin_11 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float 1.000000e+00 +; CHECK-NEXT: %fmin_12 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> ) => float NaN +; CHECK-NEXT: ret void +; CHECK-NEXT: Exiting function: main diff --git a/llvm/tools/llubi/lib/Context.h b/llvm/tools/llubi/lib/Context.h index 177778aa1f691..12a9924c70633 100644 --- a/llvm/tools/llubi/lib/Context.h +++ b/llvm/tools/llubi/lib/Context.h @@ -208,6 +208,7 @@ class Context { bool Deterministic = false; UndefValueBehavior UndefBehavior = UndefValueBehavior::NonDeterministic; NaNPropagationBehavior NaNBehavior = NaNPropagationBehavior::NonDeterministic; + bool FusedMultiplyAdd = false; std::mt19937_64 Rng; @@ -260,6 +261,7 @@ class Context { void setVScale(uint32_t VS) { VScale = VS; } void setMaxSteps(uint32_t MS) { MaxSteps = MS; } void setMaxStackDepth(uint32_t Depth) { MaxStackDepth = Depth; } + void setFusedMultiplyAdd(bool F) { FusedMultiplyAdd = F; } uint64_t getMemoryLimit() const { return MaxMem; } uint32_t getVScale() const { return VScale; } uint32_t getMaxSteps() const { return MaxSteps; } @@ -269,6 +271,7 @@ class Context { bool mayUseNonDeterminism() const { return !Deterministic; } UndefValueBehavior getEffectiveUndefValueBehavior() const; NaNPropagationBehavior getEffectiveNaNPropagationBehavior() const; + bool fuseMultiplyAdd() const { return FusedMultiplyAdd; } void setUndefValueBehavior(UndefValueBehavior UB) { UndefBehavior = UB; } void setNaNPropagationBehavior(NaNPropagationBehavior NaNBehav) { NaNBehavior = NaNBehav; diff --git a/llvm/tools/llubi/lib/Interpreter.cpp b/llvm/tools/llubi/lib/Interpreter.cpp index ad3d2f380506d..537a769790abf 100644 --- a/llvm/tools/llubi/lib/Interpreter.cpp +++ b/llvm/tools/llubi/lib/Interpreter.cpp @@ -165,7 +165,7 @@ class InstExecutor : public InstVisitor, } APFloat handleDenormal(APFloat Val, DenormalMode::DenormalModeKind Mode, - 
bool IsInput = false) { + bool IsInput) { if (!Val.isDenormal()) return Val; if (IsInput) { @@ -260,6 +260,20 @@ class InstExecutor : public InstVisitor, return Res; } + APFloat maybeQuietSNaN(APFloat Val) const { + if (Val.isSignaling() && Ctx.getRandomBool()) + return Val.makeQuiet(); + return Val; + } + + APFloat maxnumWithSNaNQuieting(const APFloat &LHS, const APFloat &RHS) { + return maxnum(maybeQuietSNaN(LHS), maybeQuietSNaN(RHS)); + } + + APFloat minnumWithSNaNQuieting(const APFloat &LHS, const APFloat &RHS) { + return minnum(maybeQuietSNaN(LHS), maybeQuietSNaN(RHS)); + } + void addPropagatedNaNCandidates(SmallVectorImpl &Candidates, ArrayRef Inputs, const fltSemantics &DstSem, bool QuietingMode, @@ -360,27 +374,11 @@ class InstExecutor : public InstVisitor, }); } - void visitFPUnOp(Instruction &I, - function_ref ScalarFn) { - FastMathFlags FMF = cast(I).getFastMathFlags(); - - visitUnOp(I, [&](const AnyValue &Operand) -> AnyValue { - if (Operand.isPoison()) - return AnyValue::poison(); - - // We don't flush denormals here since the only floating-point unary - // operation is fneg. And fneg is specified as a bitwise operation which - // only flips the sign bit of the input. 
- - AnyValue ValidatedOperand = - handleFMFFlags(Operand, FMF, /*IsInput=*/true); - if (ValidatedOperand.isPoison()) - return ValidatedOperand; - - APFloat Result = ScalarFn(ValidatedOperand.asFloat()); - - return handleFMFFlags(Result, FMF, /*IsInput=*/false); - }); + void visitBitwiseFPUnOp(Instruction &I, + function_ref ScalarFn) { + setResult(I, visitBitwiseFPUnOpWithResult( + I.getType(), cast(I).getFastMathFlags(), + getValue(I.getOperand(0)), ScalarFn)); } AnyValue @@ -394,6 +392,28 @@ class InstExecutor : public InstVisitor, }); } + AnyValue visitBitwiseFPUnOpWithResult( + Type *RetTy, const FastMathFlags &FMF, const AnyValue &Operand, + function_ref ScalarFn) { + return computeUnOp( + RetTy, Operand, [&](const AnyValue &OperandInner) -> AnyValue { + if (OperandInner.isPoison()) + return AnyValue::poison(); + + // We don't flush denormals here since bitwise floating-point + // operations only manipulate on certain bits of the operand. + + AnyValue ValidatedOperand = + handleFMFFlags(OperandInner, FMF, /*IsInput=*/true); + if (ValidatedOperand.isPoison()) + return ValidatedOperand; + + APFloat Result = ScalarFn(ValidatedOperand.asFloat()); + + return handleFMFFlags(Result, FMF, /*IsInput=*/false); + }); + } + AnyValue computeBinOp( Type *Ty, const AnyValue &LHS, const AnyValue &RHS, function_ref ScalarFn) { @@ -429,41 +449,10 @@ class InstExecutor : public InstVisitor, void visitFPBinOp( Instruction &I, function_ref ScalarFn) { - FastMathFlags FMF = cast(I).getFastMathFlags(); - DenormalMode DenormMode = getCurrentDenormalMode(I); - - if (!Ctx.isDefaultFPEnv()) - reportImmediateUB() << "Non-constrained floating-point operation assumes " - "default floating-point environment"; - - visitBinOp(I, [&](const AnyValue &LHS, const AnyValue &RHS) -> AnyValue { - if (LHS.isPoison() || RHS.isPoison()) - return AnyValue::poison(); - - AnyValue ValidatedLHS = handleFMFFlags(LHS, FMF, /*IsInput=*/true); - AnyValue ValidatedRHS = handleFMFFlags(RHS, FMF, 
/*IsInput=*/true); - if (ValidatedLHS.isPoison()) - return ValidatedLHS; - if (ValidatedRHS.isPoison()) - return ValidatedRHS; - - // Flush input denormals - APFloat FLHS = handleDenormal(ValidatedLHS.asFloat(), DenormMode.Input); - APFloat FRHS = handleDenormal(ValidatedRHS.asFloat(), DenormMode.Input); - - APFloat RawResult = ScalarFn(FLHS, FRHS); - - // Flush output denormals and handle fast-math flags. - AnyValue FResult = handleFMFFlags( - handleDenormal(RawResult, DenormMode.Output, /*IsInput=*/true), FMF, - /*IsInput=*/false); - - if (FResult.isPoison()) - return FResult; - - APFloat Result = FResult.asFloat(); - return applyNaNPropagation(Result, {&FLHS, &FRHS}); - }); + setResult(I, visitFPBinOpWithResult( + I.getType(), cast(I).getFastMathFlags(), + getValue(I.getOperand(0)), getValue(I.getOperand(1)), + ScalarFn)); } AnyValue visitIntBinOpWithResult( @@ -510,6 +499,53 @@ class InstExecutor : public InstVisitor, AnyValue(std::move(OverflowVec))}; } + AnyValue visitFPBinOpWithResult( + Type *RetTy, const FastMathFlags &FMF, const AnyValue &LHS, + const AnyValue &RHS, + function_ref ScalarFn) { + DenormalMode DenormMode = getCurrentDenormalMode(RetTy); + + if (!Ctx.isDefaultFPEnv()) + reportImmediateUB() << "Non-constrained floating-point operation assumes " + "default floating-point environment"; + + return computeBinOp( + RetTy, LHS, RHS, + [&](const AnyValue &LHSInner, const AnyValue &RHSInner) -> AnyValue { + if (LHSInner.isPoison() || RHSInner.isPoison()) + return AnyValue::poison(); + + AnyValue ValidatedLHS = + handleFMFFlags(LHSInner, FMF, /*IsInput=*/true); + AnyValue ValidatedRHS = + handleFMFFlags(RHSInner, FMF, /*IsInput=*/true); + if (ValidatedLHS.isPoison()) + return ValidatedLHS; + if (ValidatedRHS.isPoison()) + return ValidatedRHS; + + // Flush input denormals + APFloat FLHS = handleDenormal(ValidatedLHS.asFloat(), + DenormMode.Input, /*IsInput=*/true); + APFloat FRHS = handleDenormal(ValidatedRHS.asFloat(), + DenormMode.Input, 
/*IsInput=*/true); + + APFloat RawResult = ScalarFn(FLHS, FRHS); + + // Flush output denormals and handle fast-math flags. + AnyValue FResult = handleFMFFlags( + handleDenormal(RawResult, DenormMode.Output, /*IsInput=*/false), + FMF, + /*IsInput=*/false); + + if (FResult.isPoison()) + return FResult; + + APFloat Result = FResult.asFloat(); + return applyNaNPropagation(Result, {&FLHS, &FRHS}); + }); + } + AnyValue computeTriOp(Type *Ty, const AnyValue &Op1, const AnyValue &Op2, const AnyValue &Op3, @@ -569,6 +605,61 @@ class InstExecutor : public InstVisitor, }); } + AnyValue visitFPTriOpWithResult( + Type *RetTy, const FastMathFlags &FMF, const AnyValue &Op1, + const AnyValue &Op2, const AnyValue &Op3, + function_ref + ScalarFn) { + DenormalMode DenormMode = getCurrentDenormalMode(RetTy); + + if (!Ctx.isDefaultFPEnv()) + reportImmediateUB() << "Non-constrained floating-point operation assumes " + "default floating-point environment"; + + return computeTriOp( + RetTy, Op1, Op2, Op3, + [&](const AnyValue &Op1Inner, const AnyValue &Op2Inner, + const AnyValue &Op3Inner) -> AnyValue { + if (Op1Inner.isPoison() || Op2Inner.isPoison() || Op3Inner.isPoison()) + return AnyValue::poison(); + + AnyValue ValidatedOp1 = + handleFMFFlags(Op1Inner, FMF, /*IsInput=*/true); + AnyValue ValidatedOp2 = + handleFMFFlags(Op2Inner, FMF, /*IsInput=*/true); + AnyValue ValidatedOp3 = + handleFMFFlags(Op3Inner, FMF, /*IsInput=*/true); + if (ValidatedOp1.isPoison()) + return ValidatedOp1; + if (ValidatedOp2.isPoison()) + return ValidatedOp2; + if (ValidatedOp3.isPoison()) + return ValidatedOp3; + + // Flush input denormals + APFloat FOp1 = handleDenormal(ValidatedOp1.asFloat(), + DenormMode.Input, /*IsInput=*/true); + APFloat FOp2 = handleDenormal(ValidatedOp2.asFloat(), + DenormMode.Input, /*IsInput=*/true); + APFloat FOp3 = handleDenormal(ValidatedOp3.asFloat(), + DenormMode.Input, /*IsInput=*/true); + + APFloat RawResult = ScalarFn(FOp1, FOp2, FOp3); + + // Flush output denormals and 
handle fast-math flags. + AnyValue FResult = handleFMFFlags( + handleDenormal(RawResult, DenormMode.Output, /*IsInput=*/false), + FMF, + /*IsInput=*/false); + + if (FResult.isPoison()) + return FResult; + + APFloat Result = FResult.asFloat(); + return applyNaNPropagation(Result, {&FOp1, &FOp2, &FOp3}); + }); + } + void jumpTo(Instruction &Terminator, BasicBlock *DestBB) { if (!Handler.onBBJump(Terminator, *DestBB)) { setFailed(); @@ -691,9 +782,9 @@ class InstExecutor : public InstVisitor, return IdxInt.sext(IndexBitWidth); } - DenormalMode getCurrentDenormalMode(Instruction &I) { + DenormalMode getCurrentDenormalMode(Type *Ty) { return CurrentFrame->Func.getDenormalMode( - I.getOperand(0)->getType()->getScalarType()->getFltSemantics()); + Ty->getScalarType()->getFltSemantics()); } // Helper function to convert BooleanKind to bool. Report an immediate UB if @@ -816,6 +907,8 @@ class InstExecutor : public InstVisitor, AnyValue callIntrinsic(CallBase &CB, ArrayRef Args) { Intrinsic::ID IID = CB.getIntrinsicID(); Type *RetTy = CB.getType(); + const FastMathFlags FMF = + isa(CB) ? CB.getFastMathFlags() : FastMathFlags(); switch (IID) { case Intrinsic::assume: @@ -1304,6 +1397,218 @@ class InstExecutor : public InstVisitor, } return std::move(Res); } + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_fmaximum: + case Intrinsic::vector_reduce_fminimum: { + const auto DenormMode = getCurrentDenormalMode(RetTy); + const bool HasStart = IID == Intrinsic::vector_reduce_fadd || + IID == Intrinsic::vector_reduce_fmul; + const AnyValue &Vector = HasStart ? 
Args[1] : Args[0]; + std::optional Res; + if (HasStart) { + if (Args[0].isPoison()) + return AnyValue::poison(); + const AnyValue ValidatedStart = + handleFMFFlags(Args[0], FMF, /*IsInput=*/true); + if (ValidatedStart.isPoison()) + return AnyValue::poison(); + Res = handleDenormal(ValidatedStart.asFloat(), DenormMode.Input, + /*IsInput=*/true); + } + for (const auto &V : Vector.asAggregate()) { + if (V.isPoison()) + return AnyValue::poison(); + const AnyValue ValidatedOp = handleFMFFlags(V, FMF, /*IsInput=*/true); + if (ValidatedOp.isPoison()) + return AnyValue::poison(); + APFloat Op = handleDenormal(ValidatedOp.asFloat(), DenormMode.Input, + /*IsInput=*/true); + if (!Res) { + Res = std::move(Op); + continue; + } + switch (IID) { + case Intrinsic::vector_reduce_fadd: + *Res = *Res + Op; + break; + case Intrinsic::vector_reduce_fmul: + *Res = *Res * Op; + break; + case Intrinsic::vector_reduce_fmaximum: + *Res = maximum(*Res, Op); + break; + case Intrinsic::vector_reduce_fminimum: + *Res = minimum(*Res, Op); + break; + default: + llvm_unreachable("Unexpected intrinsic ID"); + } + } + assert(Res.has_value()); + const AnyValue ValidatedRes = + handleFMFFlags(*Res, FMF, /*IsInput=*/false); + if (ValidatedRes.isPoison()) + return AnyValue::poison(); + const APFloat FRes = + handleDenormal(ValidatedRes.asFloat(), DenormMode.Output, + /*IsInput=*/false); + SmallVector InputVec; + InputVec.reserve(Vector.asAggregate().size()); + transform( + Vector.asAggregate(), std::back_inserter(InputVec), + [](const AnyValue &V) -> const APFloat * { return &V.asFloat(); }); + return applyNaNPropagation(FRes, InputVec); + } + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: { + const auto DenormMode = getCurrentDenormalMode(RetTy); + const auto &Vector = Args[0].asAggregate(); + SmallVector InputFloats; + SmallVector InputVec; + InputFloats.reserve(Vector.size()); + InputVec.reserve(Vector.size()); + for (const auto &V : Vector) { + if (V.isPoison()) + return 
AnyValue::poison(); + const AnyValue ValidatedOp = handleFMFFlags(V, FMF, /*IsInput=*/true); + if (ValidatedOp.isPoison()) + return AnyValue::poison(); + InputFloats.push_back(handleDenormal(ValidatedOp.asFloat(), + DenormMode.Input, + /*IsInput=*/true)); + InputVec.push_back(&InputFloats.back()); + } + assert(!InputVec.empty()); + SmallVector Worklist(InputFloats); + const bool HasSNaN = + any_of(InputVec, [](const APFloat *V) { return V->isSignaling(); }); + while (Worklist.size() > 1) { + size_t LHSIdx = 0; + size_t RHSIdx = 1; + if (HasSNaN) { + LHSIdx = Ctx.getRandomUInt64() % Worklist.size(); + RHSIdx = Ctx.getRandomUInt64() % (Worklist.size() - 1); + if (RHSIdx >= LHSIdx) + ++RHSIdx; + } + + APFloat Res = + IID == Intrinsic::vector_reduce_fmax + ? maxnumWithSNaNQuieting(Worklist[LHSIdx], Worklist[RHSIdx]) + : minnumWithSNaNQuieting(Worklist[LHSIdx], Worklist[RHSIdx]); + if (LHSIdx < RHSIdx) + std::swap(LHSIdx, RHSIdx); + Worklist.erase(Worklist.begin() + LHSIdx); + Worklist.erase(Worklist.begin() + RHSIdx); + Worklist.push_back(std::move(Res)); + } + + AnyValue ValidatedRes = + handleFMFFlags(Worklist.front(), FMF, /*IsInput=*/false); + if (ValidatedRes.isPoison()) + return AnyValue::poison(); + APFloat FRes = handleDenormal(ValidatedRes.asFloat(), DenormMode.Output, + /*IsInput=*/false); + + return applyNaNPropagation(FRes, InputVec); + } + case Intrinsic::fabs: { + return visitBitwiseFPUnOpWithResult( + RetTy, FMF, Args[0], + [](const APFloat &Operand) -> APFloat { return abs(Operand); }); + } + case Intrinsic::fma: { + return visitFPTriOpWithResult( + RetTy, FMF, Args[0], Args[1], Args[2], + [](const APFloat &Op1, const APFloat &Op2, + const APFloat &Op3) -> APFloat { + auto Res = Op1; + Res.fusedMultiplyAdd(Op2, Op3, RoundingMode::NearestTiesToEven); + return Res; + }); + } + case Intrinsic::fmuladd: { + return visitFPTriOpWithResult( + RetTy, FMF, Args[0], Args[1], Args[2], + [&](const APFloat &Op1, const APFloat &Op2, + const APFloat &Op3) -> APFloat { 
+ if (Ctx.fuseMultiplyAdd()) { + auto Res = Op1; + Res.fusedMultiplyAdd(Op2, Op3, RoundingMode::NearestTiesToEven); + return Res; + } + return Op1 * Op2 + Op3; + }); + } + case Intrinsic::is_fpclass: { + const FPClassTest Mask = + static_cast(Args[1].asInteger().getZExtValue()); + return computeUnOp(RetTy, Args[0], [&](const AnyValue &Op) -> AnyValue { + if (Op.isPoison()) + return AnyValue::poison(); + return AnyValue::boolean( + static_cast(Op.asFloat().classify() & Mask)); + }); + } + case Intrinsic::copysign: { + return computeBinOp( + RetTy, Args[0], Args[1], + [&](const AnyValue &LHS, const AnyValue &RHS) -> AnyValue { + if (LHS.isPoison() || RHS.isPoison()) + return AnyValue::poison(); + const AnyValue ValidatedLHS = + handleFMFFlags(LHS, FMF, /*IsInput=*/true); + const AnyValue ValidatedRHS = + handleFMFFlags(RHS, FMF, /*IsInput=*/true); + if (ValidatedLHS.isPoison() || ValidatedRHS.isPoison()) + return AnyValue::poison(); + + return handleFMFFlags(APFloat::copySign(ValidatedLHS.asFloat(), + ValidatedRHS.asFloat()), + FMF, /*IsInput=*/false); + }); + } + case Intrinsic::maxnum: + case Intrinsic::minnum: + case Intrinsic::maximum: + case Intrinsic::minimum: + case Intrinsic::maximumnum: + case Intrinsic::minimumnum: { + return visitFPBinOpWithResult( + RetTy, FMF, Args[0], Args[1], + [&](const APFloat &LHS, const APFloat &RHS) -> APFloat { + switch (IID) { + case Intrinsic::maximum: + return maximum(LHS, RHS); + case Intrinsic::minimum: + return minimum(LHS, RHS); + case Intrinsic::maximumnum: + return maximumnum(LHS, RHS); + case Intrinsic::minimumnum: + return minimumnum(LHS, RHS); + case Intrinsic::maxnum: + return maxnumWithSNaNQuieting(LHS, RHS); + case Intrinsic::minnum: + return minnumWithSNaNQuieting(LHS, RHS); + default: + llvm_unreachable("Unexpected intrinsic ID"); + } + }); + } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + const auto BitWidth = RetTy->getScalarSizeInBits(); + return computeUnOp(RetTy, Args[0], [&](const AnyValue 
&Op) -> AnyValue { + if (Op.isPoison()) + return AnyValue::poison(); + const APFloat &Operand = Op.asFloat(); + APSInt V(BitWidth, IID == Intrinsic::fptoui_sat); + [[maybe_unused]] bool IsExact; + Operand.convertToInteger(V, APFloat::rmTowardZero, &IsExact); + return V; + }); + } default: Handler.onUnrecognizedInstruction(CB); setFailed(); @@ -1723,7 +2028,8 @@ class InstExecutor : public InstVisitor, } void visitFNeg(UnaryOperator &I) { - visitFPUnOp(I, [](const APFloat &Operand) -> APFloat { return -Operand; }); + visitBitwiseFPUnOp( + I, [](const APFloat &Operand) -> APFloat { return -Operand; }); } void visitTruncInst(TruncInst &Trunc) { @@ -1770,14 +2076,15 @@ class InstExecutor : public InstVisitor, return AnyValue::poison(); FastMathFlags FMF = cast(I).getFastMathFlags(); - DenormalMode DenormMode = getCurrentDenormalMode(I); + DenormalMode DenormMode = + getCurrentDenormalMode(I.getOperand(0)->getType()); auto ValidatedOperand = handleFMFFlags(Operand, FMF, /*IsInput=*/true); if (ValidatedOperand.isPoison()) return ValidatedOperand; - APFloat FOperand = - handleDenormal(ValidatedOperand.asFloat(), DenormMode.Input); + APFloat FOperand = handleDenormal(ValidatedOperand.asFloat(), + DenormMode.Input, /*IsInput=*/true); APFloat SourceNaN = FOperand; bool LosesInfo; @@ -1917,7 +2224,8 @@ class InstExecutor : public InstVisitor, } void visitFCmpInst(FCmpInst &I) { - DenormalMode DenormMode = getCurrentDenormalMode(I); + DenormalMode DenormMode = + getCurrentDenormalMode(I.getOperand(0)->getType()); FastMathFlags FMF = I.getFastMathFlags(); visitBinOp(I, [&](const AnyValue &LHS, const AnyValue &RHS) -> AnyValue { @@ -1931,8 +2239,10 @@ class InstExecutor : public InstVisitor, ValidateRes.isPoison()) return ValidateRes; - APFloat FLHS = handleDenormal(LHS.asFloat(), DenormMode.Input); - APFloat FRHS = handleDenormal(RHS.asFloat(), DenormMode.Input); + APFloat FLHS = + handleDenormal(LHS.asFloat(), DenormMode.Input, /*IsInput=*/true); + APFloat FRHS = + 
handleDenormal(RHS.asFloat(), DenormMode.Input, /*IsInput=*/true); return AnyValue::boolean(FCmpInst::compare(FLHS, FRHS, I.getPredicate())); }); diff --git a/llvm/tools/llubi/llubi.cpp b/llvm/tools/llubi/llubi.cpp index 396ace887d0fc..53946b4cb08f8 100644 --- a/llvm/tools/llubi/llubi.cpp +++ b/llvm/tools/llubi/llubi.cpp @@ -84,6 +84,10 @@ static cl::opt cl::desc("Disable interpreter-introduced non-determinism."), cl::init(false), cl::cat(InterpreterCategory)); +static cl::opt FuseFMulAdd("fuse-fmuladd", + cl::desc("Fuse llvm.fmuladd.* intrinsic"), + cl::init(true), cl::cat(InterpreterCategory)); + cl::opt UndefBehavior( "", cl::desc("Choose undef value behavior:"), cl::values(clEnumVal(ubi::UndefValueBehavior::NonDeterministic, @@ -234,6 +238,7 @@ int main(int argc, char **argv) { Ctx.setVScale(VScale); Ctx.setMaxSteps(MaxSteps); Ctx.setMaxStackDepth(MaxStackDepth); + Ctx.setFusedMultiplyAdd(FuseFMulAdd); Ctx.setDeterministic(Deterministic); Ctx.setUndefValueBehavior(UndefBehavior); Ctx.setNaNPropagationBehavior(NaNPropagationBehavior); From f0c84b3ac80c4c594045aff9b4f88ba564614361 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 11 May 2026 11:19:39 +0100 Subject: [PATCH 244/538] [GlobalISel] Skip match table for opcodes with no combines (#196017) Generate an opcode predicate for GICombiner matchers and use it to return from tryCombineAll before setting up matcher state and executing the match table. The opcode list is collected from the generated rules, so the guard stays in sync with the match table and avoids match-table overhead for instructions the combiner cannot handle. Improves CTMark geomean by -0.33% on stage1-aarch64-O0-g. https://llvm-compile-time-tracker.com/compare.php?from=ed50ea52004259af958bb3e5636268342c49ee62&to=aea6e13cbc76c500a2e0aaedced716b9508811a7&stat=instructions%3Au Also improves -O3 GISel geomean by -0.07%. 
Local results since this config isn't available on llvm-compile-time-tracker: ``` instructions:u diff old new 7zip 203476185395 203435124795 -0.02% Bullet 103700835951 103694675411 -0.01% ClamAV 52725697786 52671103456 -0.10% SPASS 41611892042 41564657601 -0.11% consumer-typeset 31700292250 31663189016 -0.12% kimwitu++ 39506799515 39520035914 0.03% lencod 66874010678 66804617092 -0.10% mafft 36550480158 36513382949 -0.10% sqlite3 33262506697 33216990490 -0.14% tramp3d-v4 76018141926 76024936414 0.01% geomean 56941557756 56903973091 -0.07% ``` Assisted-by: codex --- .../GlobalISelCombinerEmitter/match-table.td | 19 ++++++++- .../TableGen/GlobalISelCombinerEmitter.cpp | 39 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index a3f29015ffb0f..27b1dbc02b2e6 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -82,8 +82,25 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: return matchIConstant(State.MIs[0]->getOperand(1), 0); // CHECK-NEXT: } -// Verify we reset MatchData on each tryCombineAll +// Verify we gate on opcodes with generated combines and reset MatchData on each +// tryCombineAll. 
+// CHECK: static bool GenMyCombiner_canMatchOpcode(unsigned Opc) { +// CHECK-NEXT: switch (Opc) { +// CHECK-DAG: case TargetOpcode::COPY: +// CHECK-DAG: case TargetOpcode::G_AND: +// CHECK-DAG: case TargetOpcode::G_STORE: +// CHECK-DAG: case TargetOpcode::G_TRUNC: +// CHECK-DAG: case TargetOpcode::G_SEXT: +// CHECK-DAG: case TargetOpcode::G_ZEXT: +// CHECK: return true; +// CHECK-NEXT: default: +// CHECK-NEXT: return false; +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-EMPTY: // CHECK: bool GenMyCombiner::tryCombineAll(MachineInstr &I) const { +// CHECK-NEXT: if (!GenMyCombiner_canMatchOpcode(I.getOpcode())) +// CHECK-NEXT: return false; // CHECK-NEXT: const PredicateBitset AvailableFeatures = getAvailableFeatures(); // CHECK-NEXT: State.MIs.clear(); // CHECK-NEXT: State.MIs.push_back(&I); diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index cffc8ccb813ca..e01f2a2685049 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -41,6 +41,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSet.h" @@ -2404,6 +2405,9 @@ class GICombinerEmitter final : public GlobalISelMatchTableExecutorEmitter { // combine rule used to disable/enable it. std::vector> AllCombineRules; + // Opcodes handled by the generated matcher. + SmallSetVector MatchOpcodes; + // Keep track of all rules we've seen so far to ensure we don't process // the same rule twice. 
StringSet<> RulesSeen; @@ -2411,6 +2415,8 @@ class GICombinerEmitter final : public GlobalISelMatchTableExecutorEmitter { MatchTable buildMatchTable(MutableArrayRef Rules); void emitRuleConfigImpl(raw_ostream &OS); + void collectMatchOpcodes(ArrayRef Rules); + void emitCanMatchOpcodeFn(raw_ostream &OS, StringRef FnName) const; void emitAdditionalImpl(raw_ostream &OS) override; @@ -2557,9 +2563,41 @@ void GICombinerEmitter::emitRuleConfigImpl(raw_ostream &OS) { << "}\n\n"; } +void GICombinerEmitter::collectMatchOpcodes(ArrayRef Rules) { + for (const RuleMatcher &Rule : Rules) { + for (const CodeGenInstruction *I : + Rule.insnmatchers_front().getOpcodeMatcher().getAlternativeOpcodes()) + MatchOpcodes.insert(I); + } +} + +void GICombinerEmitter::emitCanMatchOpcodeFn(raw_ostream &OS, + StringRef FnName) const { + OS << "static bool " << FnName << "(unsigned Opc) {\n"; + if (MatchOpcodes.empty()) { + OS << " (void)Opc;\n" + << " return false;\n" + << "}\n\n"; + return; + } + + OS << " switch (Opc) {\n"; + for (const CodeGenInstruction *I : MatchOpcodes) + OS << " case " << I->Namespace << "::" << I->getName() << ":\n"; + OS << " return true;\n" + << " default:\n" + << " return false;\n" + << " }\n" + << "}\n\n"; +} + void GICombinerEmitter::emitAdditionalImpl(raw_ostream &OS) { + std::string CanMatchOpcodeFnName = (getClassName() + "_canMatchOpcode").str(); + emitCanMatchOpcodeFn(OS, CanMatchOpcodeFnName); OS << "bool " << getClassName() << "::" << getCombineAllMethodName() << "(MachineInstr &I) const {\n" + << " if (!" 
<< CanMatchOpcodeFnName << "(I.getOpcode()))\n" + << " return false;\n" << " const PredicateBitset AvailableFeatures = " "getAvailableFeatures();\n" << " State.MIs.clear();\n" @@ -2780,6 +2818,7 @@ void GICombinerEmitter::run(raw_ostream &OS) { return false; }); + collectMatchOpcodes(Rules); const MatchTable Table = buildMatchTable(Rules); Timer.startTimer("Emit combiner"); From 07b3b882e0bcfccb34bea6da0023ee24e4a64ade Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Mon, 11 May 2026 12:23:37 +0200 Subject: [PATCH 245/538] [LLVM|ADT] Fix shadowing warnings for SmallVector (#196907) Fixes the following warning with `-Wshadow-field`. ``` Parameter 'Size' shadows member inherited from type 'SmallVectorBase' ``` --- llvm/include/llvm/ADT/SmallVector.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 7f63b569643df..6b55581e3d625 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -137,7 +137,7 @@ class SmallVectorTemplateCommon } // Space after 'FirstEl' is clobbered, do not add any instance vars after it. - SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {} + SmallVectorTemplateCommon(size_t SizeArg) : Base(getFirstEl(), SizeArg) {} void grow_pod(size_t MinSize, size_t TSize) { Base::grow_pod(getFirstEl(), MinSize, TSize); @@ -342,7 +342,8 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { static constexpr bool TakesParamByValue = false; using ValueParamT = const T &; - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + SmallVectorTemplateBase(size_t SizeArg) + : SmallVectorTemplateCommon(SizeArg) {} static void destroy_range(T *S, T *E) { while (S != E) { @@ -493,14 +494,15 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { /// parameters by value. 
using ValueParamT = std::conditional_t; - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + SmallVectorTemplateBase(size_t SizeArg) + : SmallVectorTemplateCommon(SizeArg) {} // No need to do a destroy loop for POD's. static void destroy_range(T *, T *) {} /// Move the range [I, E) onto the uninitialized memory /// starting with "Dest", constructing elements into it as needed. - template + template static void uninitialized_move(It1 I, It1 E, It2 Dest) { // Just do a copy. uninitialized_copy(I, E, Dest); @@ -1229,14 +1231,12 @@ class LLVM_GSL_OWNER SmallVector : public SmallVectorImpl, this->destroy_range(this->begin(), this->end()); } - explicit SmallVector(size_t Size) - : SmallVectorImpl(N) { - this->resize(Size); + explicit SmallVector(size_t SizeArg) : SmallVectorImpl(N) { + this->resize(SizeArg); } - SmallVector(size_t Size, const T &Value) - : SmallVectorImpl(N) { - this->assign(Size, Value); + SmallVector(size_t SizeArg, const T &Value) : SmallVectorImpl(N) { + this->assign(SizeArg, Value); } template > From 363c871c8b5a50d10b58f409ff7125f2f41c08ca Mon Sep 17 00:00:00 2001 From: Konrad Kleine Date: Mon, 11 May 2026 12:33:44 +0200 Subject: [PATCH 246/538] libclc: group spirv archs in LIBCLC_ARCHS_SPIRV (#196911) This was done to remove repetitive comparisons. --- libclc/CMakeLists.txt | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index cf1834a7ece63..3f84458336950 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -23,7 +23,9 @@ option( ) # List of all supported architectures.
-set( LIBCLC_ARCHS_ALL amdgpu amdgcn nvptx64 spirv spirv32 spirv64 ) +set( LIBCLC_ARCHS_ALL amdgpu amdgcn nvptx64 ) +set( LIBCLC_ARCHS_SPIRV spirv spirv32 spirv64) +list( APPEND LIBCLC_ARCHS_ALL ${LIBCLC_ARCHS_SPIRV}) set(LIBCLC_TARGET ${LLVM_DEFAULT_TARGET_TRIPLE}) @@ -100,7 +102,7 @@ string( REPLACE "-" ";" TRIPLE ${LIBCLC_TARGET} ) list(GET TRIPLE 0 ARCH) list(GET TRIPLE 2 OS) -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) +if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) if(NOT OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND AND NOT llvm-spirv_exe) message(FATAL_ERROR "SPIR-V backend or llvm-spirv is required for libclc ${LIBCLC_TARGET}") endif() @@ -138,7 +140,7 @@ add_dependencies( libclc libclc-opencl-builtins ) # Determine the clang target triple. Vulkan and SPIR-V backend targets use the # triple directly; other SPIR-V targets fall back to the legacy SPIR target. set(clang_triple ${LIBCLC_TARGET}) -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) +if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) if(NOT OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND) if(ARCH STREQUAL spirv) set(clang_triple spir--) @@ -154,7 +156,7 @@ set(generic_addrspace_val 0) if(ARCH STREQUAL amdgcn) set(private_addrspace_val 5) endif() -if((ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) AND NOT OS STREQUAL vulkan) +if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV AND NOT OS STREQUAL vulkan) set(generic_addrspace_val 4) endif() @@ -163,7 +165,7 @@ set(target_compile_flags) set(target_extra_defines) set(opt_flags -O3) -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) +if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) if(OS STREQUAL vulkan) list(APPEND target_compile_flags -Wno-unknown-assumption -U__opencl_c_int64) else() @@ -181,7 +183,7 @@ if(ARCH STREQUAL amdgcn) list(APPEND _clc_overrides ${CLC_AMDGPU_SOURCES}) elseif(ARCH STREQUAL nvptx64 AND (OS STREQUAL nvidiacl OR OS STREQUAL cuda)) list(APPEND _clc_overrides 
${CLC_PTX_NVIDIACL_SOURCES}) -elseif(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) +elseif(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) if(OS STREQUAL vulkan) list(APPEND _clc_overrides ${CLC_VULKAN_SOURCES}) else() @@ -192,7 +194,7 @@ libclc_merge_sources(clc_sources ${CLC_GENERIC_SOURCES} ${_clc_overrides}) # Collect OpenCL sources. SPIR-V and Vulkan targets use self-contained # subsets while others merge with target-specific overrides. -if(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) +if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) if(OS STREQUAL vulkan) set(opencl_sources ${OPENCL_VULKAN_SOURCES}) else() From d5d7c9c59fae10c459885b81f83ee73eaf0ba36d Mon Sep 17 00:00:00 2001 From: Connector Switch Date: Mon, 11 May 2026 18:35:15 +0800 Subject: [PATCH 247/538] [libc++] Implement `ranges::fold_right_last` (completes P2322R6) (#195580) - Closes https://github.com/llvm/llvm-project/issues/105208. - Closes https://github.com/llvm/llvm-project/issues/174061. - Closes https://github.com/llvm/llvm-project/issues/174063. 
--- libcxx/docs/FeatureTestMacroTable.rst | 2 + libcxx/docs/ReleaseNotes/23.rst | 3 +- libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/include/__algorithm/ranges_fold.h | 25 +++ libcxx/include/algorithm | 14 +- libcxx/include/version | 2 + libcxx/modules/std/algorithm.inc | 2 - .../algorithms/nonmodifying/fold.bench.cpp | 16 +- .../libcxx/algorithms/nodiscard.verify.cpp | 4 + .../alg.fold/ranges.fold_right_last.pass.cpp | 204 ++++++++++++++++++ .../algorithm.version.compile.pass.cpp | 30 +++ .../version.version.compile.pass.cpp | 30 +++ .../niebloid.compile.pass.cpp | 1 + .../generate_feature_test_macro_components.py | 5 + 14 files changed, 330 insertions(+), 10 deletions(-) create mode 100644 libcxx/test/std/algorithms/alg.nonmodifying/alg.fold/ranges.fold_right_last.pass.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 941c1c23c7d8a..ae48eaed1f46b 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -380,6 +380,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_ranges_find_last`` ``202207L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_ranges_fold`` ``202207L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_ranges_iota`` ``202202L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_ranges_join_with`` ``202202L`` diff --git a/libcxx/docs/ReleaseNotes/23.rst b/libcxx/docs/ReleaseNotes/23.rst index a34e379b145fe..a55869a8bf783 100644 --- a/libcxx/docs/ReleaseNotes/23.rst +++ b/libcxx/docs/ReleaseNotes/23.rst @@ -46,8 +46,7 @@ Implemented Papers - P2781R9: ``std::constant_wrapper`` (`Github `__) - P3978R3: ``constant_wrapper`` should unwrap on call and subscript (`Github `__) - P2164R9: ``views::enumerate`` (`Github `__) -- P2322R6 (partial): ``ranges::fold_left_first``, 
``ranges::fold_left_first_with_iter``, ``ranges::fold_right`` are - supported (`Github `__) +- P2322R6: ``ranges::fold`` (`Github `__) - P4144R1: Remove ``span``'s ``initializer_list`` constructor for C++26 (`Github `__) Improvements and New Features diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 0fff5258729f7..eb580ea891f5b 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -65,7 +65,7 @@ "`P2286R8 `__","Formatting Ranges","2022-07 (Virtual)","|Complete|","16","`#105202 `__","" "`P2291R3 `__","Add Constexpr Modifiers to Functions ``to_chars`` and ``from_chars`` for Integral Types in ```` Header","2022-07 (Virtual)","|Complete|","16","`#105204 `__","" "`P2302R4 `__","``std::ranges::contains``","2022-07 (Virtual)","|Complete|","19","`#105206 `__","" -"`P2322R6 `__","``ranges::fold``","2022-07 (Virtual)","|Partial|","","Only ``fold_left_with_iter``, ``fold_left``, ``fold_left_first_with_iter``, ``fold_left_first`` and ``fold_right`` are implemented." +"`P2322R6 `__","``ranges::fold``","2022-07 (Virtual)","|Complete|","23","`#105208 `__","" "`P2374R4 `__","``views::cartesian_product``","2022-07 (Virtual)","","","`#105209 `__","" "`P2404R3 `__","Move-only types for ``equality_comparable_with``, ``totally_ordered_with``, and ``three_way_comparable_with``","2022-07 (Virtual)","|Complete|","22","`#105210 `__","Implemented as a DR in C++20." 
"`P2408R5 `__","Ranges iterators as inputs to non-Ranges algorithms","2022-07 (Virtual)","","","`#105211 `__","" diff --git a/libcxx/include/__algorithm/ranges_fold.h b/libcxx/include/__algorithm/ranges_fold.h index 2e36a7ce3a86c..87579b264296a 100644 --- a/libcxx/include/__algorithm/ranges_fold.h +++ b/libcxx/include/__algorithm/ranges_fold.h @@ -23,6 +23,7 @@ #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> #include <__iterator/next.h> +#include <__iterator/prev.h> #include <__iterator/reverse_iterator.h> #include <__ranges/access.h> #include <__ranges/concepts.h> @@ -234,6 +235,30 @@ struct __fold_right { }; inline constexpr auto fold_right = __fold_right(); + +struct __fold_right_last { + template _Sp, + __indirectly_binary_right_foldable, _Iter> _Func> + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Iter __first, _Sp __last, _Func __func) { + using _Up = decltype(fold_right(__first, __last, iter_value_t<_Iter>(*__first), __func)); + + if (__first == __last) + return optional<_Up>(); + + _Iter __tail = ranges::prev(ranges::next(__first, __last)); + return optional<_Up>( + in_place, ranges::fold_right(std::move(__first), __tail, iter_value_t<_Iter>(*__tail), std::move(__func))); + } + + template , iterator_t<_Range>> _Func> + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Range&& __range, _Func __func) { + return operator()(ranges::begin(__range), ranges::end(__range), std::ref(__func)); + } +}; + +inline constexpr auto fold_right_last = __fold_right_last(); } // namespace ranges #endif // _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm index d332b8ce4fd70..95446b44eaf9c 100644 --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -962,11 +962,21 @@ namespace ranges { template S, class T = iter_value_t, indirectly-binary-right-foldable F> - constexpr auto ranges::fold_right(I first, S last, T init, F f); + constexpr auto ranges::fold_right(I 
first, S last, T init, F f); // since C++23 template, indirectly-binary-right-foldable> F> - constexpr auto ranges::fold_right(R&& r, T init, F f); + constexpr auto ranges::fold_right(R&& r, T init, F f); // since C++23 + + template S, + indirectly-binary-right-foldable, I> F> + requires constructible_from, iter_reference_t> + constexpr auto ranges::fold_right_last(I first, S last, F f); // since C++23 + + template, iterator_t> F> + requires constructible_from, range_reference_t> + constexpr auto ranges::fold_right_last(R&& r, F f); // since C++23 template using fold_left_with_iter_result = in_value_result; // since C++23 diff --git a/libcxx/include/version b/libcxx/include/version index 8ff06dae7bb0a..1c683b67e5700 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -212,6 +212,7 @@ __cpp_lib_ranges_concat 202403L __cpp_lib_ranges_contains 202207L __cpp_lib_ranges_enumerate 202302L __cpp_lib_ranges_find_last 202207L +__cpp_lib_ranges_fold 202207L __cpp_lib_ranges_indices 202506L __cpp_lib_ranges_iota 202202L __cpp_lib_ranges_join_with 202202L @@ -529,6 +530,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_ranges_contains 202207L # define __cpp_lib_ranges_enumerate 202302L # define __cpp_lib_ranges_find_last 202207L +# define __cpp_lib_ranges_fold 202207L # define __cpp_lib_ranges_iota 202202L # define __cpp_lib_ranges_join_with 202202L # define __cpp_lib_ranges_repeat 202207L diff --git a/libcxx/modules/std/algorithm.inc b/libcxx/modules/std/algorithm.inc index 3a59726135fe7..1db72eff08757 100644 --- a/libcxx/modules/std/algorithm.inc +++ b/libcxx/modules/std/algorithm.inc @@ -170,9 +170,7 @@ export namespace std { using std::ranges::fold_left_with_iter; using std::ranges::fold_left_with_iter_result; using std::ranges::fold_right; -# if 0 using std::ranges::fold_right_last; -# endif #endif // _LIBCPP_STD_VER >= 23 } // namespace ranges diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/fold.bench.cpp 
b/libcxx/test/benchmarks/algorithms/nonmodifying/fold.bench.cpp index 1e061cee3a07a..95d71fad21a34 100644 --- a/libcxx/test/benchmarks/algorithms/nonmodifying/fold.bench.cpp +++ b/libcxx/test/benchmarks/algorithms/nonmodifying/fold.bench.cpp @@ -28,6 +28,12 @@ int main(int argc, char** argv) { auto std_ranges_fold_left_first = [](auto first, auto last, auto, auto func) { return std::ranges::fold_left_first(first, last, func); }; + auto std_ranges_fold_right = [](auto first, auto last, auto init, auto func) { + return std::ranges::fold_right(first, last, init, func); + }; + auto std_ranges_fold_right_last = [](auto first, auto last, auto, auto func) { + return std::ranges::fold_right_last(first, last, func); + }; // ranges::{fold_left,fold_left_first,fold_right,fold_right_last} { auto bm = [](std::string name, auto fold) { @@ -71,9 +77,13 @@ int main(int argc, char** argv) { bm.operator()>("rng::fold_left_first(deque)", std_ranges_fold_left_first); bm.operator()>("rng::fold_left_first(list)", std_ranges_fold_left_first); - bm.operator()>("rng::fold_right(vector)", std::ranges::fold_right); - bm.operator()>("rng::fold_right(deque)", std::ranges::fold_right); - bm.operator()>("rng::fold_right(list)", std::ranges::fold_right); + bm.operator()>("rng::fold_right(vector)", std_ranges_fold_right); + bm.operator()>("rng::fold_right(deque)", std_ranges_fold_right); + bm.operator()>("rng::fold_right(list)", std_ranges_fold_right); + + bm.operator()>("rng::fold_right_last(vector)", std_ranges_fold_right_last); + bm.operator()>("rng::fold_right_last(deque)", std_ranges_fold_right_last); + bm.operator()>("rng::fold_right_last(list)", std_ranges_fold_right_last); } benchmark::Initialize(&argc, argv); diff --git a/libcxx/test/libcxx/algorithms/nodiscard.verify.cpp b/libcxx/test/libcxx/algorithms/nodiscard.verify.cpp index 4530d8c9507f6..2bb7f59b24dcd 100644 --- a/libcxx/test/libcxx/algorithms/nodiscard.verify.cpp +++ b/libcxx/test/libcxx/algorithms/nodiscard.verify.cpp @@ -408,5 
+408,9 @@ void test() { // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} std::ranges::fold_right(iter, iter, 0, std::plus()); // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::fold_right_last(range, std::plus()); + // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} + std::ranges::fold_right_last(iter, iter, std::plus()); + // expected-warning@-1{{ignoring return value of function declared with 'nodiscard' attribute}} #endif } diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.fold/ranges.fold_right_last.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.fold/ranges.fold_right_last.pass.cpp new file mode 100644 index 0000000000000..d429762b96dce --- /dev/null +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.fold/ranges.fold_right_last.pass.cpp @@ -0,0 +1,204 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// REQUIRES: std-at-least-c++23 + +// template S, +// indirectly-binary-right-foldable, I> F> +// requires constructible_from, iter_reference_t> +// constexpr auto ranges::fold_right_last(I first, S last, F f); + +// template, iterator_t> F> +// requires constructible_from, range_reference_t> +// constexpr auto ranges::fold_right_last(R&& r, F f); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "test_range.h" +#include "invocable_with_telemetry.h" +#include "maths.h" + +using std::ranges::fold_right_last; + +template + requires std::copyable +constexpr void check_iterator(R& r, F f, std::optional const& expected) { + { + std::same_as> decltype(auto) result = fold_right_last(r.begin(), r.end(), f); + assert(result == expected); + } + + { + auto telemetry = invocable_telemetry(); + auto f2 = invocable_with_telemetry(f, telemetry); + std::same_as> decltype(auto) result = fold_right_last(r.begin(), r.end(), f2); + assert(result == expected); + if (expected.has_value()) { + assert(telemetry.invocations == std::ranges::distance(r) - 1); + assert(telemetry.moves == 1); + assert(telemetry.copies == 1); + } + } +} + +template + requires std::copyable +constexpr void check_lvalue_range(R& r, F f, std::optional const& expected) { + { + std::same_as> decltype(auto) result = fold_right_last(r, f); + assert(result == expected); + } + + { + auto telemetry = invocable_telemetry(); + auto f2 = invocable_with_telemetry(f, telemetry); + std::same_as> decltype(auto) result = fold_right_last(r, f2); + assert(result == expected); + if (expected.has_value()) { + assert(telemetry.invocations == std::ranges::distance(r) - 1); + assert(telemetry.moves == 0); + assert(telemetry.copies == 1); + } + } +} + +template + 
requires std::copyable +constexpr void check_rvalue_range(R& r, F f, std::optional const& expected) { + { + auto r2 = r; + std::same_as> decltype(auto) result = fold_right_last(std::move(r2), f); + assert(result == expected); + } + + { + auto telemetry = invocable_telemetry(); + auto f2 = invocable_with_telemetry(f, telemetry); + auto r2 = r; + std::same_as> decltype(auto) result = fold_right_last(std::move(r2), f2); + assert(result == expected); + if (expected.has_value()) { + assert(telemetry.invocations == std::ranges::distance(r) - 1); + assert(telemetry.moves == 0); + assert(telemetry.copies == 1); + } + } +} + +template + requires std::copyable +constexpr void check(R r, F f, std::optional const& expected) { + check_iterator(r, f, expected); + check_lvalue_range(r, f, expected); + check_rvalue_range(r, f, expected); +} + +constexpr void empty_range_test_case() { + auto const data = std::vector{}; + check(data, std::plus(), std::optional()); + check(data, std::multiplies(), std::optional()); +} + +constexpr void common_range_test_case() { + auto const data = std::vector{1, 2, 3, 4}; + check(data, std::plus(), std::optional(triangular_sum(data))); + check(data, std::multiplies(), std::optional(factorial(data.back()))); + + auto multiply_with_next = [n = 1](auto const x, auto const y) mutable { + auto const result = x * y * n; + n = x; + return static_cast(result); + }; + check(data, multiply_with_next, std::optional(factorial(data.size()) * factorial(data.size() - 1))); + + auto fib = [n = 1](auto x, auto) mutable { + auto old_x = x; + x += n; + n = old_x; + return x; + }; + check(data, fib, std::optional(fibonacci(data.back()))); +} + +constexpr void non_common_range_test_case() { + auto parse = [](std::string_view const s) { + return s == "zero" ? 0.0 + : s == "one" ? 1.0 + : s == "two" ? 2.0 + : s == "three" ? 3.0 + : s == "four" ? 4.0 + : s == "five" ? 5.0 + : s == "six" ? 6.0 + : s == "seven" ? 7.0 + : s == "eight" ? 8.0 + : s == "nine" ? 
9.0 + : (assert(false), 10.0); // the number here is arbitrary + }; + + { + auto data = std::vector{"five", "three", "two", "six", "one", "four"}; + auto range = data | std::views::transform(parse); + check(range, std::plus(), std::optional(triangular_sum(range))); + } +} + +constexpr bool test_case() { + empty_range_test_case(); + common_range_test_case(); + non_common_range_test_case(); + return true; +} + +// Most containers aren't constexpr +void runtime_only_test_case() { + { + auto const data = std::list{2, 4, 6, 8, 10, 12}; + auto const expected = triangular_sum(data); + check(data, std::plus(), std::optional(static_cast(expected))); + } + + { + auto const data = std::deque{-1.1, -2.2, -3.3, -4.4, -5.5, -6.6}; + auto plus = [](double const x, int const y) { return x + y; }; + auto const expected = -21.1; // -5.5 + int(- 6.6) = -5.5 + -6 = -11.5 + // -4.4 + int(-11.5) = -4.4 + -11 = -15.4 + // -3.3 + int(-15.4) = -3.3 + -15 = -18.3 + // -2.2 + int(-18.3) = -2.2 + -18 = -20.2 + // -1.1 + int(-20.2) = -1.1 + -20 = -21.1. 
+ check(data, plus, std::optional(expected)); + } + + { + auto const data = std::set{2, 4, 6, 8, 10, 12}; + auto const expected = triangular_sum(data); + check(data, std::plus(), std::optional(static_cast(expected))); + } +} + +int main(int, char**) { + test_case(); + static_assert(test_case()); + runtime_only_test_case(); + return 0; +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp index beecf87cd4aa7..657875444c687 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp @@ -52,6 +52,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_starts_ends_with # error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23" # endif @@ -102,6 +106,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_starts_ends_with # error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23" # endif @@ -167,6 +175,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_starts_ends_with # error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23" # endif @@ -241,6 +253,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error 
"__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_starts_ends_with # error "__cpp_lib_ranges_starts_ends_with should not be defined before c++23" # endif @@ -324,6 +340,13 @@ # error "__cpp_lib_ranges_find_last should have the value 202207L in c++23" # endif +# ifndef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should be defined in c++23" +# endif +# if __cpp_lib_ranges_fold != 202207L +# error "__cpp_lib_ranges_fold should have the value 202207L in c++23" +# endif + # ifndef __cpp_lib_ranges_starts_ends_with # error "__cpp_lib_ranges_starts_ends_with should be defined in c++23" # endif @@ -428,6 +451,13 @@ # error "__cpp_lib_ranges_find_last should have the value 202207L in c++26" # endif +# ifndef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should be defined in c++26" +# endif +# if __cpp_lib_ranges_fold != 202207L +# error "__cpp_lib_ranges_fold should have the value 202207L in c++26" +# endif + # ifndef __cpp_lib_ranges_starts_ends_with # error "__cpp_lib_ranges_starts_ends_with should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 3f15dd2fd1d06..dfee4b6d458db 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -680,6 +680,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_indices # error "__cpp_lib_ranges_indices should not be defined before c++26" # endif @@ -1644,6 +1648,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before 
c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_indices # error "__cpp_lib_ranges_indices should not be defined before c++26" # endif @@ -2773,6 +2781,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_indices # error "__cpp_lib_ranges_indices should not be defined before c++26" # endif @@ -4169,6 +4181,10 @@ # error "__cpp_lib_ranges_find_last should not be defined before c++23" # endif +# ifdef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should not be defined before c++23" +# endif + # ifdef __cpp_lib_ranges_indices # error "__cpp_lib_ranges_indices should not be defined before c++26" # endif @@ -5763,6 +5779,13 @@ # error "__cpp_lib_ranges_find_last should have the value 202207L in c++23" # endif +# ifndef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should be defined in c++23" +# endif +# if __cpp_lib_ranges_fold != 202207L +# error "__cpp_lib_ranges_fold should have the value 202207L in c++23" +# endif + # ifdef __cpp_lib_ranges_indices # error "__cpp_lib_ranges_indices should not be defined before c++26" # endif @@ -7702,6 +7725,13 @@ # error "__cpp_lib_ranges_find_last should have the value 202207L in c++26" # endif +# ifndef __cpp_lib_ranges_fold +# error "__cpp_lib_ranges_fold should be defined in c++26" +# endif +# if __cpp_lib_ranges_fold != 202207L +# error "__cpp_lib_ranges_fold should have the value 202207L in c++26" +# endif + # ifndef __cpp_lib_ranges_indices # error "__cpp_lib_ranges_indices should be defined in c++26" # endif diff --git a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp index 
7c67e7083e88d..080380712cf2a 100644 --- a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp +++ b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp @@ -102,6 +102,7 @@ static_assert(test(std::ranges::fold_left_with_iter, a, 0, std::plus())); static_assert(test(std::ranges::fold_left_first, a, std::plus())); static_assert(test(std::ranges::fold_left_first_with_iter, a, std::plus())); static_assert(test(std::ranges::fold_right, a, 0, std::plus())); +static_assert(test(std::ranges::fold_right_last, a, std::plus())); #endif static_assert(test(std::ranges::for_each, a, odd)); static_assert(test(std::ranges::for_each_n, a, 10, odd)); diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 3770f026009ca..2c1f92992bf1a 100644 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1142,6 +1142,11 @@ def add_version_header(tc): "values": {"c++23": 202207}, "headers": ["algorithm"], }, + { + "name": "__cpp_lib_ranges_fold", + "values": {"c++23": 202207}, + "headers": ["algorithm"], + }, { "name": "__cpp_lib_ranges_indices", "values": {"c++26": 202506}, From 0ddf1e9e55e4aef6cc6c8aa2a561f131156950de Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 11 May 2026 12:45:31 +0200 Subject: [PATCH 248/538] Revert "[LV] Handle FSub Partial Reductions" (#196922) Reverts llvm/llvm-project#191186. Causes test failures. 
--- llvm/include/llvm/Analysis/IVDescriptors.h | 5 - llvm/lib/Analysis/IVDescriptors.cpp | 36 +- .../AArch64/AArch64TargetTransformInfo.cpp | 5 +- llvm/lib/Transforms/Utils/LoopUtils.cpp | 4 - .../Transforms/Vectorize/LoopVectorize.cpp | 24 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 - .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 8 +- .../AArch64/partial-reduce-fsub-chained.ll | 38 --- .../partial-reduce-sub-epilogue-vec.ll | 317 ------------------ .../AArch64/partial-reduce-sub.ll | 139 -------- 11 files changed, 17 insertions(+), 577 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 2120eb8cd9914..05c58d1a20afb 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -46,8 +46,6 @@ enum class RecurKind { UMin, ///< Unsigned integer min implemented in terms of select(cmp()). UMax, ///< Unsigned integer max implemented in terms of select(cmp()). FAdd, ///< Sum of floats. - FAddChainWithSubs, ///< A chain of fadds and fsubs. - FSub, ///< Subtraction of floats. FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). @@ -247,9 +245,6 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is a floating point kind. LLVM_ABI static bool isFloatingPointRecurrenceKind(RecurKind Kind); - /// Returns true if the recurrence kind is for a sub operation. - LLVM_ABI static bool isSubRecurrenceKind(RecurKind Kind); - /// Returns true if the recurrence kind is an integer min/max kind. 
static bool isIntMinMaxRecurrenceKind(RecurKind Kind) { return Kind == RecurKind::UMin || Kind == RecurKind::UMax || diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 22a519026f63f..7549e14366d2c 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -92,10 +92,6 @@ static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT, return Phi; } -bool RecurrenceDescriptor::isSubRecurrenceKind(RecurKind Kind) { - return Kind == RecurKind::Sub || Kind == RecurKind::FSub; -} - /// Compute the minimal bit width needed to represent a reduction whose exit /// instruction is given by Exit. static std::pair computeRecurrenceType(Instruction *Exit, @@ -1013,18 +1009,13 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, return InstDesc(Kind == RecurKind::FMul, I, I->hasAllowReassoc() ? nullptr : I); case Instruction::FSub: - return InstDesc(Kind == RecurKind::FSub || - Kind == RecurKind::FAddChainWithSubs, - I, I->hasAllowReassoc() ? nullptr : I); case Instruction::FAdd: - return InstDesc(Kind == RecurKind::FAdd || - Kind == RecurKind::FAddChainWithSubs, - I, I->hasAllowReassoc() ? nullptr : I); + return InstDesc(Kind == RecurKind::FAdd, I, + I->hasAllowReassoc() ? nullptr : I); case Instruction::Select: - if (isSubRecurrenceKind(Kind) || Kind == RecurKind::FAdd || - Kind == RecurKind::FMul || Kind == RecurKind::Add || - Kind == RecurKind::Mul || Kind == RecurKind::AddChainWithSubs || - Kind == RecurKind::FAddChainWithSubs) + if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul || + Kind == RecurKind::Add || Kind == RecurKind::Mul || + Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); if (isFindRecurrenceKind(Kind) && SE) return isFindPattern(L, OrigPhi, I, *SE); @@ -1113,20 +1104,10 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." 
<< *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FSub, TheLoop, RedDes, DB, AC, DT, SE)) { - LLVM_DEBUG(dbgs() << "Found an FSub reduction PHI." << *Phi << "\n"); - return true; - } if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAddChainWithSubs, TheLoop, RedDes, DB, - AC, DT, SE)) { - LLVM_DEBUG(dbgs() << "Found a chained FADD-FSUB chained reduction PHI." - << *Phi << "\n"); - return true; - } if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, RedDes, DB, AC, DT, SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); @@ -1243,11 +1224,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMul: return Instruction::FMul; case RecurKind::FMulAdd: - case RecurKind::FAddChainWithSubs: case RecurKind::FAdd: return Instruction::FAdd; - case RecurKind::FSub: - return Instruction::FSub; case RecurKind::SMax: case RecurKind::SMin: case RecurKind::UMax: @@ -1324,10 +1302,6 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { Kind == RecurKind::AddChainWithSubs) return true; - if (Cur->getOpcode() == Instruction::FSub && - Kind == RecurKind::FAddChainWithSubs) - return true; - return Cur->getOpcode() == getOpcode(); }; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9d86cf7aec45c..2e3e7b73ba390 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5668,7 +5668,6 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( switch (RdxDesc.getRecurrenceKind()) { case RecurKind::Sub: case RecurKind::AddChainWithSubs: - case RecurKind::FAddChainWithSubs: case RecurKind::Add: case RecurKind::FAdd: case RecurKind::And: @@ -5990,7 +5989,7 @@ InstructionCost 
AArch64TTIImpl::getPartialReductionCost( return Invalid; if ((Opcode != Instruction::Add && Opcode != Instruction::Sub && - Opcode != Instruction::FAdd && Opcode != Instruction::FSub) || + Opcode != Instruction::FAdd) || OpAExtend == TTI::PR_None) return Invalid; @@ -6057,7 +6056,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( NEONPred); }; - bool IsSub = (Opcode == Instruction::Sub) || (Opcode == Instruction::FSub); + bool IsSub = Opcode == Instruction::Sub; InstructionCost Cost = InputLT.first * TTI::TCC_Basic; // Integer partial sub-reductions that don't map to a specific instruction, // carry an extra cost for implementing a double negation: diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index f0586e4f0f464..cc965cb36c36e 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1110,8 +1110,6 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { case RecurKind::Xor: return Intrinsic::vector_reduce_xor; case RecurKind::FMulAdd: - case RecurKind::FAddChainWithSubs: - case RecurKind::FSub: case RecurKind::FAdd: return Intrinsic::vector_reduce_fadd; case RecurKind::FMul: @@ -1569,8 +1567,6 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, case RecurKind::FMaximumNum: return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); case RecurKind::FMulAdd: - case RecurKind::FAddChainWithSubs: - case RecurKind::FSub: case RecurKind::FAdd: return Builder.CreateFAddReduce(getIdentity(), Src); case RecurKind::FMul: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e77e5024e2cc4..1ace2275e2b6d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7767,28 +7767,16 @@ static SmallVector preparePlanForEpilogueVectorLoop( // reduction start value in a final subtraction. 
Update it to use the // resume value from the main vector loop. if (PhiR->getVFScaleFactor() > 1 && - RecurrenceDescriptor::isSubRecurrenceKind( - PhiR->getRecurrenceKind())) { + PhiR->getRecurrenceKind() == RecurKind::Sub) { auto *Sub = cast(RdxResult->getSingleUser()); - assert((Sub->getOpcode() == Instruction::Sub || - Sub->getOpcode() == Instruction::FSub) && - "Unexpected opcode"); + assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode"); assert(isa(Sub->getOperand(0)) && "Expected operand to match the original start value of the " "reduction"); - // For integer sub-reductions, verify start value is zero. - // For FP sub-reductions, verify start value is negative zero. - [[maybe_unused]] auto StartValueIsIdentity = [&] { - Value *IdentityValue = getRecurrenceIdentity( - PhiR->getRecurrenceKind(), ResumeV->getType(), - PhiR->getFastMathFlags()); - auto *StartValue = dyn_cast(VPI->getOperand(0)); - return StartValue && StartValue->getValue() == IdentityValue; - }; - assert(StartValueIsIdentity() && - "Expected start value for partial sub-reduction to be zero " - "(or negative zero)"); - + assert(VPlanPatternMatch::match(VPI->getOperand(0), + VPlanPatternMatch::m_ZeroInt()) && + "Expected start value for partial sub-reduction to start at " + "zero"); Sub->setOperand(0, StartVal); } else VPI->setOperand(0, StartVal); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c41170758efd5..f1a6eb2d7e8af 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -29540,8 +29540,6 @@ class HorizontalReduction { // res = vv break; case RecurKind::Sub: - case RecurKind::FSub: - case RecurKind::FAddChainWithSubs: case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: @@ -29693,8 +29691,6 @@ class HorizontalReduction { // res = vv return VectorizedValue; case RecurKind::Sub: - case RecurKind::FSub: - case 
RecurKind::FAddChainWithSubs: case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: @@ -29798,8 +29794,6 @@ class HorizontalReduction { return Builder.CreateFMul(VectorizedValue, Scale); } case RecurKind::Sub: - case RecurKind::FSub: - case RecurKind::FAddChainWithSubs: case RecurKind::AddChainWithSubs: case RecurKind::Mul: case RecurKind::FMul: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 25b2853d002a7..11a91dcd46867 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -578,14 +578,6 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { } } -static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind) { - if (Kind == RecurKind::Sub) - return Instruction::Add; - if (Kind == RecurKind::FSub) - return Instruction::FAdd; - llvm_unreachable("RecurKind should be Sub/FSub."); -} - Value *VPInstruction::generate(VPTransformState &State) { IRBuilderBase &Builder = State.Builder; @@ -799,8 +791,8 @@ Value *VPInstruction::generate(VPTransformState &State) { // For sub-recurrences, each part's reduction variable is already // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) Instruction::BinaryOps Opcode = - RecurrenceDescriptor::isSubRecurrenceKind(RK) - ? getSubRecurOpcode(RK) + RK == RecurKind::Sub + ? 
Instruction::Add : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); ReducedPartRdx = Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 6b50494ea364b..32d89a34105a4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -6013,9 +6013,6 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, ExtendedOp = NegRecipe; } - assert((Chain.RK != RecurKind::FAddChainWithSubs) && - "FSub chain reduction isn't supported"); - // FIXME: Do these transforms before invoking the cost-model. ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo); @@ -6067,7 +6064,7 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, // If this is the last value in a sub-reduction chain, then update the PHI // node to start at `0` and update the reduction-result to subtract from // the PHI's start value. - if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub) + if (Chain.RK != RecurKind::Sub) return; VPValue *OldStartValue = StartInst->getOperand(0); @@ -6078,8 +6075,7 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain, assert(RdxResult && "Could not find reduction result"); VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult); - unsigned SubOpc = Chain.RK == RecurKind::FSub ? 
Instruction::BinaryOps::FSub - : Instruction::BinaryOps::Sub; + constexpr unsigned SubOpc = Instruction::BinaryOps::Sub; VPInstruction *NewResult = Builder.createNaryOp( SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc), RdxPhi->getDebugLoc()); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll deleted file mode 100644 index f22be2c5fd977..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fsub-chained.ll +++ /dev/null @@ -1,38 +0,0 @@ -; REQUIRES: asserts -; RUN: not --crash opt -passes=loop-vectorize -S %s 2>&1 | FileCheck %s --check-prefix=ASSERTION - -; Tests a partial reduction with an fadd->fsub chain. -; There's an assertion preventing this type of partial reduction from -; being generated as the current codegen for this case is incorrect. - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -; ASSERTION: (Chain.RK != RecurKind::FAddChainWithSubs) -define float @fadd_fsub_reduction(float %startval, ptr %src1, ptr %src2, ptr %src3) #0 { -entry: - br label %loop - -loop: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - %accum = phi float [ %startval, %entry ], [ %sub, %loop ] - %src1.gep = getelementptr half, ptr %src1, i32 %iv - %src1.load = load half, ptr %src1.gep, align 4 - %src1.load.ext = fpext half %src1.load to float - %src2.gep = getelementptr half, ptr %src2, i32 %iv - %src2.load = load half, ptr %src2.gep, align 4 - %src2.load.ext = fpext half %src2.load to float - %src3.gep = getelementptr half, ptr %src3, i32 %iv - %src3.load = load half, ptr %src3.gep, align 4 - %src3.load.ext = fpext half %src3.load to float - %mul1 = fmul reassoc contract float %src1.load.ext, %src2.load.ext - %add = fadd reassoc contract float %accum, %mul1 - %mul2 = fmul reassoc contract float %src3.load.ext, %src1.load.ext - %sub = fsub reassoc 
contract float %add, %mul2 - %iv.next = add i32 %iv, 1 - %exitcond.not = icmp eq i32 %iv, 1024 - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret float %sub -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll index 21c9b9cde13ba..8db3a3294d7bc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll @@ -185,321 +185,4 @@ exit: ret i32 %sub } -define float @fsub_reduction(float %startval, ptr %src1, ptr %src2) #0 { -; CHECK-EPI-LABEL: define float @fsub_reduction( -; CHECK-EPI-SAME: float [[STARTVAL:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { -; CHECK-EPI-NEXT: [[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]: -; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK1:.*]] -; CHECK-EPI: [[VECTOR_MAIN_LOOP_ITER_CHECK1]]: -; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] -; CHECK-EPI: [[VECTOR_PH1]]: -; CHECK-EPI-NEXT: br label %[[VECTOR_PH:.*]] -; CHECK-EPI: [[VECTOR_PH]]: -; CHECK-EPI-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_PH]] ] -; CHECK-EPI-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_PH]] ] -; CHECK-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX1]] -; CHECK-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <16 x half>, ptr [[TMP0]], align 4 -; CHECK-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX1]] -; CHECK-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x half>, ptr [[TMP1]], align 4 -; CHECK-EPI-NEXT: [[TMP5:%.*]] = fpext <16 x half> [[WIDE_LOAD]] to <16 x float> -; CHECK-EPI-NEXT: [[TMP10:%.*]] = fpext <16 x half> [[WIDE_LOAD1]] to <16 x float> -; CHECK-EPI-NEXT: [[TMP11:%.*]] = 
fmul reassoc contract <16 x float> [[TMP5]], [[TMP10]] -; CHECK-EPI-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <8 x float> @llvm.vector.partial.reduce.fadd.v8f32.v16f32(<8 x float> [[VEC_PHI]], <16 x float> [[TMP11]]) -; CHECK-EPI-NEXT: [[INDEX_NEXT1]] = add nuw i32 [[INDEX1]], 16 -; CHECK-EPI-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT1]], 32 -; CHECK-EPI-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_PH]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK-EPI: [[MIDDLE_BLOCK1]]: -; CHECK-EPI-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[PARTIAL_REDUCE]]) -; CHECK-EPI-NEXT: [[TMP7:%.*]] = fsub float [[STARTVAL]], [[TMP6]] -; CHECK-EPI-NEXT: br i1 false, label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK-EPI: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] -; CHECK-EPI: [[VEC_EPILOG_PH]]: -; CHECK-EPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] -; CHECK-EPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] -; CHECK-EPI-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK-EPI: [[VECTOR_BODY]]: -; CHECK-EPI-NEXT: [[INDEX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-EPI-NEXT: [[VEC_PHI3:%.*]] = phi <2 x float> [ splat (float -0.000000e+00), %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], %[[VECTOR_BODY]] ] -; CHECK-EPI-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX]] -; CHECK-EPI-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x half>, ptr [[TMP8]], align 4 -; CHECK-EPI-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX]] -; CHECK-EPI-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x half>, ptr [[TMP9]], align 4 -; CHECK-EPI-NEXT: [[TMP2:%.*]] = fpext 
<4 x half> [[WIDE_LOAD4]] to <4 x float> -; CHECK-EPI-NEXT: [[TMP3:%.*]] = fpext <4 x half> [[WIDE_LOAD5]] to <4 x float> -; CHECK-EPI-NEXT: [[TMP4:%.*]] = fmul reassoc contract <4 x float> [[TMP2]], [[TMP3]] -; CHECK-EPI-NEXT: [[PARTIAL_REDUCE6]] = call reassoc contract <2 x float> @llvm.vector.partial.reduce.fadd.v2f32.v4f32(<2 x float> [[VEC_PHI3]], <4 x float> [[TMP4]]) -; CHECK-EPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-EPI-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 -; CHECK-EPI-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK-EPI: [[MIDDLE_BLOCK]]: -; CHECK-EPI-NEXT: [[TMP14:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[PARTIAL_REDUCE6]]) -; CHECK-EPI-NEXT: [[TMP15:%.*]] = fsub float [[BC_MERGE_RDX]], [[TMP14]] -; CHECK-EPI-NEXT: br i1 false, label %[[EXIT]], label %[[VEC_EPILOG_VECTOR_BODY]] -; CHECK-EPI: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-EPI-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-EPI-NEXT: br label %[[LOOP:.*]] -; CHECK-EPI: [[LOOP]]: -; CHECK-EPI-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-EPI-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[SUB:%.*]], %[[LOOP]] ] -; CHECK-EPI-NEXT: [[SRC1_GEP:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[IV]] -; CHECK-EPI-NEXT: [[SRC1_LOAD:%.*]] = load half, ptr [[SRC1_GEP]], align 4 -; CHECK-EPI-NEXT: [[SRC1_LOAD_EXT:%.*]] = fpext half [[SRC1_LOAD]] to float -; CHECK-EPI-NEXT: [[SRC2_GEP:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[IV]] -; CHECK-EPI-NEXT: 
[[SRC2_LOAD:%.*]] = load half, ptr [[SRC2_GEP]], align 4 -; CHECK-EPI-NEXT: [[SRC2_LOAD_EXT:%.*]] = fpext half [[SRC2_LOAD]] to float -; CHECK-EPI-NEXT: [[MUL:%.*]] = fmul reassoc contract float [[SRC1_LOAD_EXT]], [[SRC2_LOAD_EXT]] -; CHECK-EPI-NEXT: [[SUB]] = fsub reassoc contract float [[ACCUM]], [[MUL]] -; CHECK-EPI-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-EPI-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 40 -; CHECK-EPI-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK-EPI: [[EXIT]]: -; CHECK-EPI-NEXT: [[SUB_LCSSA:%.*]] = phi float [ [[SUB]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK1]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] -; CHECK-EPI-NEXT: ret float [[SUB_LCSSA]] -; -; CHECK-PARTIAL-RED-EPI-LABEL: define float @fsub_reduction( -; CHECK-PARTIAL-RED-EPI-SAME: float [[STARTVAL:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { -; CHECK-PARTIAL-RED-EPI-NEXT: [[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]: -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK1:.*]] -; CHECK-PARTIAL-RED-EPI: [[VECTOR_MAIN_LOOP_ITER_CHECK1]]: -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] -; CHECK-PARTIAL-RED-EPI: [[VECTOR_PH1]]: -; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_PH:.*]] -; CHECK-PARTIAL-RED-EPI: [[VECTOR_PH]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_PH]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE1:%.*]], %[[VECTOR_PH]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX1]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x half>, ptr [[TMP8]], align 4 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX1]] -; CHECK-PARTIAL-RED-EPI-NEXT: 
[[WIDE_LOAD3:%.*]] = load <16 x half>, ptr [[TMP9]], align 4 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP10:%.*]] = fpext <16 x half> [[WIDE_LOAD2]] to <16 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP11:%.*]] = fpext <16 x half> [[WIDE_LOAD3]] to <16 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP12:%.*]] = fmul reassoc contract <16 x float> [[TMP10]], [[TMP11]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE1]] = call reassoc contract <8 x float> @llvm.vector.partial.reduce.fadd.v8f32.v16f32(<8 x float> [[VEC_PHI1]], <16 x float> [[TMP12]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX_NEXT1]] = add nuw i32 [[INDEX1]], 16 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT1]], 32 -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_PH]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK-PARTIAL-RED-EPI: [[MIDDLE_BLOCK1]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP14:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[PARTIAL_REDUCE1]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP7:%.*]] = fsub float [[STARTVAL]], [[TMP14]] -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_VECTOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] -; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_PH]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK1]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK-PARTIAL-RED-EPI: [[VECTOR_BODY]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; 
CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[INDEX]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 4 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[INDEX]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 4 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP4:%.*]] = fmul reassoc contract <8 x float> [[TMP2]], [[TMP3]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK-PARTIAL-RED-EPI: [[MIDDLE_BLOCK]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP15:%.*]] = fsub float [[BC_MERGE_RDX]], [[TMP6]] -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[EXIT]], label %[[VEC_EPILOG_VECTOR_BODY]] -; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 32, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], 
%[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[LOOP:.*]] -; CHECK-PARTIAL-RED-EPI: [[LOOP]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[SUB:%.*]], %[[LOOP]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC1_GEP:%.*]] = getelementptr half, ptr [[SRC1]], i32 [[IV]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC1_LOAD:%.*]] = load half, ptr [[SRC1_GEP]], align 4 -; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC1_LOAD_EXT:%.*]] = fpext half [[SRC1_LOAD]] to float -; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC2_GEP:%.*]] = getelementptr half, ptr [[SRC2]], i32 [[IV]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC2_LOAD:%.*]] = load half, ptr [[SRC2_GEP]], align 4 -; CHECK-PARTIAL-RED-EPI-NEXT: [[SRC2_LOAD_EXT:%.*]] = fpext half [[SRC2_LOAD]] to float -; CHECK-PARTIAL-RED-EPI-NEXT: [[MUL:%.*]] = fmul reassoc contract float [[SRC1_LOAD_EXT]], [[SRC2_LOAD_EXT]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[SUB]] = fsub reassoc contract float [[ACCUM]], [[MUL]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-PARTIAL-RED-EPI-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 40 -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK-PARTIAL-RED-EPI: [[EXIT]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[SUB_LCSSA:%.*]] = phi float [ [[SUB]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK1]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: ret float [[SUB_LCSSA]] -; -entry: - br label %loop - -loop: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - %accum = phi float [ %startval, %entry ], [ %sub, %loop ] - %src1.gep = getelementptr half, ptr %src1, i32 %iv - %src1.load = load half, ptr %src1.gep, align 4 - %src1.load.ext = fpext half %src1.load to float - %src2.gep = 
getelementptr half, ptr %src2, i32 %iv - %src2.load = load half, ptr %src2.gep, align 4 - %src2.load.ext = fpext half %src2.load to float - %mul = fmul reassoc contract float %src1.load.ext, %src2.load.ext - %sub = fsub reassoc contract float %accum, %mul - %iv.next = add i32 %iv, 1 - %exitcond.not = icmp eq i32 %iv, 40 - br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 - -exit: - ret float %sub -} - -define float @fsub_reduction_nsz(ptr %a, ptr %b, ptr %c, i64 %n) { -; CHECK-EPI-LABEL: define float @fsub_reduction_nsz( -; CHECK-EPI-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { -; CHECK-EPI-NEXT: [[ITER_CHECK:.*]]: -; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK-EPI: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-EPI-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK-EPI: [[VECTOR_PH]]: -; CHECK-EPI-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK-EPI: [[VECTOR_BODY]]: -; CHECK-EPI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-EPI-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ] -; CHECK-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] -; CHECK-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 -; CHECK-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] -; CHECK-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 -; CHECK-EPI-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] -; CHECK-EPI-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP2]], align 2 -; CHECK-EPI-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> -; CHECK-EPI-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> -; CHECK-EPI-NEXT: [[TMP5:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP3]], [[TMP4]] -; CHECK-EPI-NEXT: 
[[PARTIAL_REDUCE:%.*]] = call reassoc nsz contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP5]]) -; CHECK-EPI-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> -; CHECK-EPI-NEXT: [[TMP7:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP6]], [[TMP4]] -; CHECK-EPI-NEXT: [[PARTIAL_REDUCE3]] = call reassoc nsz contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE]], <8 x float> [[TMP7]]) -; CHECK-EPI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-EPI-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-EPI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK-EPI: [[MIDDLE_BLOCK]]: -; CHECK-EPI-NEXT: [[TMP9:%.*]] = call reassoc nsz contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PARTIAL_REDUCE3]]) -; CHECK-EPI-NEXT: [[TMP10:%.*]] = fsub float 0.000000e+00, [[TMP9]] -; CHECK-EPI-NEXT: br i1 true, label %[[FOR_EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK-EPI: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-EPI-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF10:![0-9]+]] -; CHECK-EPI: [[VEC_EPILOG_PH]]: -; CHECK-EPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-EPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-EPI-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK-EPI: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-EPI-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-EPI-NEXT: [[VEC_PHI5:%.*]] = phi <2 x float> [ zeroinitializer, %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-EPI-NEXT: [[TMP11:%.*]] 
= getelementptr half, ptr [[A]], i64 [[INDEX4]] -; CHECK-EPI-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x half>, ptr [[TMP11]], align 2 -; CHECK-EPI-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX4]] -; CHECK-EPI-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x half>, ptr [[TMP12]], align 2 -; CHECK-EPI-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX4]] -; CHECK-EPI-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x half>, ptr [[TMP13]], align 2 -; CHECK-EPI-NEXT: [[TMP14:%.*]] = fpext <4 x half> [[WIDE_LOAD7]] to <4 x float> -; CHECK-EPI-NEXT: [[TMP15:%.*]] = fpext <4 x half> [[WIDE_LOAD6]] to <4 x float> -; CHECK-EPI-NEXT: [[TMP16:%.*]] = fmul reassoc nsz contract <4 x float> [[TMP14]], [[TMP15]] -; CHECK-EPI-NEXT: [[PARTIAL_REDUCE9:%.*]] = call reassoc nsz contract <2 x float> @llvm.vector.partial.reduce.fadd.v2f32.v4f32(<2 x float> [[VEC_PHI5]], <4 x float> [[TMP16]]) -; CHECK-EPI-NEXT: [[TMP17:%.*]] = fpext <4 x half> [[WIDE_LOAD8]] to <4 x float> -; CHECK-EPI-NEXT: [[TMP18:%.*]] = fmul reassoc nsz contract <4 x float> [[TMP17]], [[TMP15]] -; CHECK-EPI-NEXT: [[PARTIAL_REDUCE10]] = call reassoc nsz contract <2 x float> @llvm.vector.partial.reduce.fadd.v2f32.v4f32(<2 x float> [[PARTIAL_REDUCE9]], <4 x float> [[TMP18]]) -; CHECK-EPI-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 -; CHECK-EPI-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 1024 -; CHECK-EPI-NEXT: br i1 [[TMP19]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK-EPI: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-EPI-NEXT: [[TMP20:%.*]] = call reassoc nsz contract float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[PARTIAL_REDUCE10]]) -; CHECK-EPI-NEXT: [[TMP21:%.*]] = fsub float [[BC_MERGE_RDX]], [[TMP20]] -; CHECK-EPI-NEXT: br i1 true, label %[[FOR_EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK-EPI: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], 
[ 1024, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-EPI-NEXT: [[BC_MERGE_RDX12:%.*]] = phi float [ [[TMP21]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ] -; CHECK-EPI-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-EPI: [[FOR_BODY]]: -; CHECK-EPI-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-EPI-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX12]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SUB:%.*]], %[[FOR_BODY]] ] -; CHECK-EPI-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] -; CHECK-EPI-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 -; CHECK-EPI-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float -; CHECK-EPI-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] -; CHECK-EPI-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 2 -; CHECK-EPI-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float -; CHECK-EPI-NEXT: [[GEP_C:%.*]] = getelementptr half, ptr [[C]], i64 [[IV]] -; CHECK-EPI-NEXT: [[LOAD_C:%.*]] = load half, ptr [[GEP_C]], align 2 -; CHECK-EPI-NEXT: [[EXT_C:%.*]] = fpext half [[LOAD_C]] to float -; CHECK-EPI-NEXT: [[MUL_AB:%.*]] = fmul reassoc nsz contract float [[EXT_B]], [[EXT_A]] -; CHECK-EPI-NEXT: [[MUL_AC:%.*]] = fmul reassoc nsz contract float [[EXT_C]], [[EXT_A]] -; CHECK-EPI-NEXT: [[SUB_AB:%.*]] = fsub reassoc nsz contract float [[ACCUM]], [[MUL_AB]] -; CHECK-EPI-NEXT: [[SUB]] = fsub reassoc nsz contract float [[SUB_AB]], [[MUL_AC]] -; CHECK-EPI-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-EPI-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-EPI-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK-EPI: [[FOR_EXIT]]: -; CHECK-EPI-NEXT: [[SUB_LCSSA:%.*]] = phi float [ [[SUB]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-EPI-NEXT: ret 
float [[SUB_LCSSA]] -; -; CHECK-PARTIAL-RED-EPI-LABEL: define float @fsub_reduction_nsz( -; CHECK-PARTIAL-RED-EPI-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { -; CHECK-PARTIAL-RED-EPI-NEXT: [[ENTRY:.*:]] -; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_PH:.*]] -; CHECK-PARTIAL-RED-EPI: [[VECTOR_PH]]: -; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK-PARTIAL-RED-EPI: [[VECTOR_BODY]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ] -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP2]], align 2 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP5:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP3]], [[TMP4]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE:%.*]] = call reassoc nsz contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP5]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP7:%.*]] = fmul reassoc nsz contract <8 x float> [[TMP6]], [[TMP4]] -; CHECK-PARTIAL-RED-EPI-NEXT: [[PARTIAL_REDUCE3]] = call reassoc nsz contract <4 x 
float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE]], <8 x float> [[TMP7]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-PARTIAL-RED-EPI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK-PARTIAL-RED-EPI: [[MIDDLE_BLOCK]]: -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP9:%.*]] = call reassoc nsz contract float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PARTIAL_REDUCE3]]) -; CHECK-PARTIAL-RED-EPI-NEXT: [[TMP10:%.*]] = fsub float 0.000000e+00, [[TMP9]] -; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[FOR_EXIT:.*]] -; CHECK-PARTIAL-RED-EPI: [[FOR_EXIT]]: -; CHECK-PARTIAL-RED-EPI-NEXT: ret float [[TMP10]] -; -entry: - br label %for.body - -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi float [ 0.0, %entry ], [ %sub, %for.body ] - %gep.a = getelementptr half, ptr %a, i64 %iv - %load.a = load half, ptr %gep.a, align 2 - %ext.a = fpext half %load.a to float - %gep.b = getelementptr half, ptr %b, i64 %iv - %load.b = load half, ptr %gep.b, align 2 - %ext.b = fpext half %load.b to float - %gep.c = getelementptr half, ptr %c, i64 %iv - %load.c = load half, ptr %gep.c, align 2 - %ext.c = fpext half %load.c to float - %mul.ab = fmul nsz reassoc contract float %ext.b, %ext.a - %mul.ac = fmul nsz reassoc contract float %ext.c, %ext.a - %sub.ab = fsub nsz reassoc contract float %accum, %mul.ab - %sub = fsub nsz reassoc contract float %sub.ab, %mul.ac - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: - ret float %sub -} - attributes #0 = { vscale_range(1,16) "target-features"="+sve" } - -!0 = distinct !{!0, !1} -!1 = !{!"llvm.loop.vectorize.width", i32 16} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll index f7ed72f3a45e5..2aea67f6e5499 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -269,145 +269,6 @@ exit: ret i64 %sub } -define float @fdotp_fsub(ptr %a, ptr %b, ptr %c) #0 { -; CHECK-INTERLEAVE1-LABEL: define float @fdotp_fsub( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP8]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul reassoc contract <8 x float> [[TMP2]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE1:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = fpext <8 x half> 
[[WIDE_LOAD2]] to <8 x float> -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = fmul reassoc contract <8 x float> [[TMP9]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE1]], <8 x float> [[TMP10]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = fsub float 0.000000e+00, [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: ret float [[TMP7]] -; -; CHECK-INTERLEAVED-LABEL: define float @fdotp_fsub( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { -; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[TMP0]], i64 8 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], 
align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr half, ptr [[TMP2]], i64 8 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x half>, ptr [[TMP2]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x half>, ptr [[TMP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr half, ptr [[TMP16]], i64 8 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP16]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x half>, ptr [[TMP17]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[WIDE_LOAD3]] to <8 x float> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fmul reassoc contract <8 x float> [[TMP4]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP6]]) -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x half> [[WIDE_LOAD4]] to <8 x float> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul reassoc contract <8 x float> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI1]], <8 x float> [[TMP9]]) -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = fpext <8 x half> [[WIDE_LOAD5]] to <8 x float> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fmul reassoc contract <8 x float> [[TMP18]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> 
@llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE1]], <8 x float> [[TMP13]]) -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = fpext <8 x half> [[WIDE_LOAD6]] to <8 x float> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = fmul reassoc contract <8 x float> [[TMP14]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE7]], <8 x float> [[TMP15]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = fsub float 0.000000e+00, [[TMP11]] -; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: ret float [[TMP12]] -; -; CHECK-MAXBW-LABEL: define float @fdotp_fsub( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-MAXBW: vector.body: -; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 -; 
CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[C]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP8]], align 2 -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = fmul reassoc contract <8 x float> [[TMP2]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE1:%.*]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = fmul reassoc contract <8 x float> [[TMP9]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call reassoc contract <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[PARTIAL_REDUCE1]], <8 x float> [[TMP10]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fsub float 0.000000e+00, [[TMP6]] -; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: for.exit: -; CHECK-MAXBW-NEXT: ret float [[TMP7]] -; -entry: - br label %for.body - -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %accum = phi float [ 0.0, %entry ], [ %sub, %for.body ] - %gep.a = getelementptr half, ptr %a, i64 %iv - %load.a = load half, ptr %gep.a, align 2 - %ext.a = fpext half %load.a to float - 
%gep.b = getelementptr half, ptr %b, i64 %iv - %load.b = load half, ptr %gep.b, align 2 - %ext.b = fpext half %load.b to float - %gep.c = getelementptr half, ptr %c, i64 %iv - %load.c = load half, ptr %gep.c, align 2 - %ext.c = fpext half %load.c to float - %mul.ab = fmul reassoc contract float %ext.b, %ext.a - %mul.ac = fmul reassoc contract float %ext.c, %ext.a - %sub.ab = fsub reassoc contract float %accum, %mul.ab - %sub = fsub reassoc contract float %sub.ab, %mul.ac - %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %for.exit, label %for.body - -for.exit: - ret float %sub -} - !7 = distinct !{!7, !8, !9, !10} !8 = !{!"llvm.loop.mustprogress"} !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} From c1056d17e1208d5bd96cb7eb1f4a369f559625ac Mon Sep 17 00:00:00 2001 From: Nerixyz Date: Mon, 11 May 2026 12:49:01 +0200 Subject: [PATCH 249/538] [lldb][Windows] Use WaitForDebugEventEx if available (#196817) This makes use of [`WaitForDebugEventEx`](https://learn.microsoft.com/en-us/windows/win32/api/debugapi/nf-debugapi-waitfordebugeventex) over `WaitForDebugEvent` if available (Windows 10+). The two functions are identical except for the handling of `OutputDebugStringW`. The `-Ex` version forwards the string as Unicode whereas the other version forwards ASCII strings. Since we don't handle these outputs yet, it shouldn't make any difference. Split from #196395. 
--- .../Process/Windows/Common/DebuggerThread.cpp | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp index 6359a63dfef91..8c35a2b9262cc 100644 --- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp @@ -25,6 +25,7 @@ #include "Plugins/Process/Windows/Common/ProcessWindowsLog.h" +#include "lldb/Utility/LLDBLog.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Threading.h" @@ -40,9 +41,45 @@ using namespace lldb; using namespace lldb_private; +typedef BOOL WINAPI WaitForDebugEventFn(LPDEBUG_EVENT, DWORD); +static WaitForDebugEventFn *g_wait_for_debug_event = nullptr; + +static WaitForDebugEventFn *GetWaitForDebugEventEx() { + HMODULE h_kernel32 = LoadLibraryW(L"kernel32.dll"); + if (!h_kernel32) { + llvm::Error err = llvm::errorCodeToError( + std::error_code(GetLastError(), std::system_category())); + LLDB_LOG_ERROR(GetLog(LLDBLog::Host), std::move(err), + "Could not load kernel32: {0}"); + return nullptr; + } + + return reinterpret_cast( + GetProcAddress(h_kernel32, "WaitForDebugEventEx")); +} + +/// WaitForDebugEventEx is only available on Windows 10+. This lazily checks if +/// the function is available and falls back to WaitForDebugEvent if +/// unavailable. The -Ex version ensures correct forwarding of +/// OutputDebugStringW events. +static void InitializeWaitForDebugEvent() { + if (g_wait_for_debug_event) + return; + + g_wait_for_debug_event = GetWaitForDebugEventEx(); + if (!g_wait_for_debug_event) { + LLDB_LOG( + GetLog(LLDBLog::Host), + "WaitForDebugEventEx unavailable, using WaitForDebugEvent instead. 
" + "Unicode strings from OutputDebugStringW might show incorrectly."); + g_wait_for_debug_event = &WaitForDebugEvent; + } +} + DebuggerThread::DebuggerThread(DebugDelegateSP debug_delegate) : m_debug_delegate(debug_delegate), m_pid_to_detach(0), m_is_shutting_down(false) { + InitializeWaitForDebugEvent(); m_debugging_ended_event = ::CreateEvent(nullptr, TRUE, FALSE, nullptr); } @@ -236,7 +273,7 @@ void DebuggerThread::DebugLoop() { LLDB_LOG_VERBOSE(log, "Entering WaitForDebugEvent loop"); while (should_debug) { LLDB_LOG_VERBOSE(log, "Calling WaitForDebugEvent"); - BOOL wait_result = WaitForDebugEvent(&dbe, INFINITE); + BOOL wait_result = g_wait_for_debug_event(&dbe, INFINITE); if (wait_result) { DWORD continue_status = DBG_CONTINUE; bool shutting_down = m_is_shutting_down; @@ -314,7 +351,7 @@ void DebuggerThread::DebugLoop() { // target threads are running at this time, there is possibility to // have some breakpoint exception between last WaitForDebugEvent and // DebugActiveProcessStop but ignore for now. 
- while (WaitForDebugEvent(&dbe, 0)) { + while (g_wait_for_debug_event(&dbe, 0)) { continue_status = DBG_CONTINUE; if (dbe.dwDebugEventCode == EXCEPTION_DEBUG_EVENT && !(dbe.u.Exception.ExceptionRecord.ExceptionCode == From 68e953423c14c6579d26f6eb109fe3077a45cb51 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 11 May 2026 12:58:50 +0200 Subject: [PATCH 250/538] [OCaml] Fix test after float printing changes (NFC) (#196925) --- llvm/test/Bindings/OCaml/core.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Bindings/OCaml/core.ml b/llvm/test/Bindings/OCaml/core.ml index 049236b05d291..f57042fea7878 100644 --- a/llvm/test/Bindings/OCaml/core.ml +++ b/llvm/test/Bindings/OCaml/core.ml @@ -161,8 +161,8 @@ let test_constants () = (* CHECK: const_single{{.*}}2.75 * CHECK: const_double{{.*}}3.1459 * CHECK: const_double_string{{.*}}2 - * CHECK: const_fake_fp128{{.*}}0xL00000000000000004000000000000000 - * CHECK: const_fp128_string{{.*}}0xLF3CB1CCF26FBC178452FB4EC7F91973F + * CHECK: const_fake_fp128{{.*}}2.000000e+00 + * CHECK: const_fp128_string{{.*}}1.000000e+400 *) begin group "real"; let cs = const_float float_type 2.75 in From 9b3f3b90e0fcdd0ef24da2615572bb2b71f84893 Mon Sep 17 00:00:00 2001 From: Artur Wojcik Date: Mon, 11 May 2026 13:01:48 +0200 Subject: [PATCH 251/538] [CMake] Don't pass --gc-sections to MSVC-style linkers when using clang's MSVC mode (#196393) The PR concerns Clang with a GNU-like command-line interface on Windows. The LLVM linker on Windows (lld-link.exe) does not understand the --gc-sections option. The PR excludes that option when compiling on Windows to remove a linker warning (and an error if warnings are treated as such). 
--- llvm/cmake/modules/AddLLVM.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 28858db434f91..e8a1b8b948b5a 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -382,7 +382,7 @@ function(add_link_opts target_name) set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,-z,discard-unused=sections") endif() - elseif(NOT MSVC AND NOT CMAKE_SYSTEM_NAME MATCHES "AIX|OS390") + elseif(NOT MSVC AND NOT CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC" AND NOT CMAKE_SYSTEM_NAME MATCHES "AIX|OS390") # TODO Revisit this later on z/OS. set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--gc-sections") From 6f31d41ad4ccfda7853bfd9c5cffd9177f08cda8 Mon Sep 17 00:00:00 2001 From: NeKon69 Date: Mon, 11 May 2026 14:11:17 +0300 Subject: [PATCH 252/538] [LifetimeSafety] Impove `[[clang::lifetimbound]]` violation diagnostics (#196824) Reports lifetimebound verification diagnostics at the attribute location, so declarations with the attribute now point at the declaration rather than only at the function definition. --- .../clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h | 3 ++- clang/lib/Sema/SemaLifetimeSafety.h | 5 +++-- clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp | 5 ++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 7ccf30ba14987..7b0799d923f40 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -110,7 +110,8 @@ class LifetimeSafetySemaHelper { // Reports misuse of [[clang::lifetimebound]] when parameter doesn't escape // through return. 
- virtual void reportLifetimeboundViolation(const ParmVarDecl *VD) {} + virtual void + reportLifetimeboundViolation(const ParmVarDecl *ParmWithLifetimebound) {} // Suggests lifetime bound annotations for implicit this. virtual void suggestLifetimeboundToImplicitThis(SuggestionScope Scope, diff --git a/clang/lib/Sema/SemaLifetimeSafety.h b/clang/lib/Sema/SemaLifetimeSafety.h index 5b1cf41445399..1ef28d8ba2cee 100644 --- a/clang/lib/Sema/SemaLifetimeSafety.h +++ b/clang/lib/Sema/SemaLifetimeSafety.h @@ -180,11 +180,12 @@ class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper { void reportLifetimeboundViolation( const ParmVarDecl *ParmWithLifetimebound) override { + const auto *Attr = ParmWithLifetimebound->getAttr(); StringRef ParamName = ParmWithLifetimebound->getName(); bool HasName = ParamName.size() > 0; - S.Diag(ParmWithLifetimebound->getLocation(), + S.Diag(Attr->getLocation(), diag::warn_lifetime_safety_param_lifetimebound_violation) - << HasName << ParamName << ParmWithLifetimebound->getSourceRange(); + << HasName << ParamName << Attr->getRange(); } void suggestLifetimeboundToImplicitThis(SuggestionScope Scope, diff --git a/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp b/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp index 941a3bb8ce1e3..5764647ca62e0 100644 --- a/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp +++ b/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp @@ -81,9 +81,8 @@ View unnamed_lifetimebound_param( return View(); } -// FIXME: Should warn on declaration, not definiton -View annotated_decl_but_not_def_not_returned(const MyObj &obj [[clang::lifetimebound]]); +View annotated_decl_but_not_def_not_returned(const MyObj &obj [[clang::lifetimebound]]); // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} -View annotated_decl_but_not_def_not_returned(const MyObj &obj) { // expected-warning {{could not verify that the return value can be lifetime bound to 'obj'}} +View 
annotated_decl_but_not_def_not_returned(const MyObj &obj) { return not_lb(obj); } From 2c0d0e1b3b6a69f27384687a6eb850ba47014254 Mon Sep 17 00:00:00 2001 From: jofrn <165626406+jofrn@users.noreply.github.com> Date: Mon, 11 May 2026 04:26:35 -0700 Subject: [PATCH 253/538] [AtomicExpand] Add bitcasts when expanding load atomic vector (#148900) AtomicExpand fails for aligned `load atomic ` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX. --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 19 +- llvm/test/CodeGen/ARM/atomic-load-store.ll | 51 ++++ llvm/test/CodeGen/X86/atomic-load-store.ll | 91 +++++- .../X86/expand-atomic-non-integer.ll | 287 ++++++++++++++---- 4 files changed, 382 insertions(+), 66 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 160382168489f..7327290f62970 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -567,7 +567,9 @@ LoadInst *AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) { NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n"); - Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType()); + Value *NewVal = LI->getType()->isPtrOrPtrVectorTy() + ? 
Builder.CreateIntToPtr(NewLI, LI->getType()) + : Builder.CreateBitCast(NewLI, LI->getType()); LI->replaceAllUsesWith(NewVal); LI->eraseFromParent(); return NewLI; @@ -2237,9 +2239,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( I->replaceAllUsesWith(V); } else if (HasResult) { Value *V; - if (UseSizedLibcall) - V = Builder.CreateBitOrPointerCast(Result, I->getType()); - else { + if (UseSizedLibcall) { + // Add bitcasts from Result's scalar type to I's vector type + auto *PtrTy = dyn_cast(I->getType()->getScalarType()); + auto *VTy = dyn_cast(I->getType()); + if (VTy && PtrTy && !Result->getType()->isVectorTy()) { + unsigned AS = PtrTy->getAddressSpace(); + Value *BC = Builder.CreateBitCast( + Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS))); + V = Builder.CreateIntToPtr(BC, I->getType()); + } else + V = Builder.CreateBitOrPointerCast(Result, I->getType()); + } else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); Builder.CreateLifetimeEnd(AllocaResult); diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index f3860a763dd5a..1af2832702296 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -987,3 +987,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { store atomic double %val1, ptr %ptr seq_cst, align 8 ret void } + +define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 { +; ARM-LABEL: atomic_vec1_ptr: +; ARM: @ %bb.0: +; ARM-NEXT: ldr r0, [r0] +; ARM-NEXT: dmb ish +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: atomic_vec1_ptr: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: ldr r0, [r0] +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: atomic_vec1_ptr: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: ldr r0, [r0] +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: atomic_vec1_ptr: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: movs r1, #0 +; THUMBONE-NEXT: 
mov r2, r1 +; THUMBONE-NEXT: bl __sync_val_compare_and_swap_4 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: atomic_vec1_ptr: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r1, #2 +; ARMV4-NEXT: bl __atomic_load_4 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: atomic_vec1_ptr: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: ldr r0, [r0] +; ARMV6-NEXT: mov r1, #0 +; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5 +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: atomic_vec1_ptr: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: ldr r0, [r0] +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: bx lr + %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 + ret <1 x ptr> %ret +} diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 00310f6d1f219..867a4acb791bc 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) { %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8 ret <2 x ptr addrspace(270)> %ret } +define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind { +; CHECK-SSE2-O3-LABEL: atomic_vec2_ptr_align: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movl $2, %esi +; CHECK-SSE2-O3-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O3-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec2_ptr_align: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movaps (%rdi), %xmm0 +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_ptr_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec2_ptr_align: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: 
movl $2, %esi +; CHECK-SSE2-O0-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec2_ptr_align: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movapd (%rdi), %xmm0 +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_ptr_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovapd (%rdi), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x ptr>, ptr %x acquire, align 16 + ret <2 x ptr> %ret +} +define <4 x ptr addrspace(270)> @atomic_vec4_ptr270(ptr %x) nounwind { +; CHECK-SSE2-O3-LABEL: atomic_vec4_ptr270: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movl $2, %esi +; CHECK-SSE2-O3-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O3-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_ptr270: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movaps (%rdi), %xmm0 +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_ptr270: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_ptr270: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: movl $2, %esi +; CHECK-SSE2-O0-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_ptr270: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movapd (%rdi), %xmm0 +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: 
atomic_vec4_ptr270: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovapd (%rdi), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x ptr addrspace(270)>, ptr %x acquire, align 16 + ret <4 x ptr addrspace(270)> %ret +} define <2 x i32> @atomic_vec2_i32_align(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_i32_align: @@ -727,7 +817,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind { } define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind { -; ; CHECK-SSE2-O3-LABEL: atomic_vec4_float_align: ; CHECK-SSE2-O3: # %bb.0: ; CHECK-SSE2-O3-NEXT: pushq %rax diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll index 82a0a7ab72cd0..9f973ac5531d1 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -S %s -passes='require,atomic-expand' -mtriple=x86_64-linux-gnu | FileCheck %s +; RUN: opt -S %s -passes='require,atomic-expand' -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK64 +; RUN: opt -S %s -passes='require,atomic-expand' -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK32 ; This file tests the functions `llvm::convertAtomicLoadToIntegerType` and ; `llvm::convertAtomicStoreToIntegerType`. 
If X86 stops using this @@ -94,98 +95,262 @@ define void @float_store_expand_addr1(ptr addrspace(1) %ptr, float %v) { } define void @pointer_cmpxchg_expand(ptr %ptr, ptr %v) { -; CHECK-LABEL: define void @pointer_cmpxchg_expand( -; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst monotonic, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 -; CHECK-NEXT: ret void +; CHECK64-LABEL: define void @pointer_cmpxchg_expand( +; CHECK64-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK64-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst monotonic, align 8 +; CHECK64-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK64-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK64-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK64-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK64-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @pointer_cmpxchg_expand( +; CHECK32-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK32-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i32 +; CHECK32-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 0, i32 [[TMP1]] seq_cst monotonic, align 4 +; CHECK32-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK32-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK32-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr +; CHECK32-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 
0 +; CHECK32-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK32-NEXT: ret void ; cmpxchg ptr %ptr, ptr null, ptr %v seq_cst monotonic ret void } define void @pointer_cmpxchg_expand2(ptr %ptr, ptr %v) { -; CHECK-LABEL: define void @pointer_cmpxchg_expand2( -; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] release monotonic, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 -; CHECK-NEXT: ret void +; CHECK64-LABEL: define void @pointer_cmpxchg_expand2( +; CHECK64-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK64-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] release monotonic, align 8 +; CHECK64-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK64-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK64-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK64-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK64-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @pointer_cmpxchg_expand2( +; CHECK32-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK32-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i32 +; CHECK32-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 0, i32 [[TMP1]] release monotonic, align 4 +; CHECK32-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK32-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK32-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr +; CHECK32-NEXT: 
[[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK32-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK32-NEXT: ret void ; cmpxchg ptr %ptr, ptr null, ptr %v release monotonic ret void } define void @pointer_cmpxchg_expand3(ptr %ptr, ptr %v) { -; CHECK-LABEL: define void @pointer_cmpxchg_expand3( -; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 -; CHECK-NEXT: ret void +; CHECK64-LABEL: define void @pointer_cmpxchg_expand3( +; CHECK64-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK64-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK64-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK64-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK64-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK64-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK64-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @pointer_cmpxchg_expand3( +; CHECK32-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK32-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i32 +; CHECK32-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 0, i32 [[TMP1]] seq_cst seq_cst, align 4 +; CHECK32-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK32-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK32-NEXT: 
[[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr +; CHECK32-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK32-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK32-NEXT: ret void ; cmpxchg ptr %ptr, ptr null, ptr %v seq_cst seq_cst ret void } define void @pointer_cmpxchg_expand4(ptr %ptr, ptr %v) { -; CHECK-LABEL: define void @pointer_cmpxchg_expand4( -; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg weak ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 -; CHECK-NEXT: ret void +; CHECK64-LABEL: define void @pointer_cmpxchg_expand4( +; CHECK64-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK64-NEXT: [[TMP2:%.*]] = cmpxchg weak ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK64-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK64-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK64-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK64-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK64-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @pointer_cmpxchg_expand4( +; CHECK32-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK32-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i32 +; CHECK32-NEXT: [[TMP2:%.*]] = cmpxchg weak ptr [[PTR]], i32 0, i32 [[TMP1]] seq_cst seq_cst, align 4 +; CHECK32-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK32-NEXT: 
[[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK32-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr +; CHECK32-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK32-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK32-NEXT: ret void ; cmpxchg weak ptr %ptr, ptr null, ptr %v seq_cst seq_cst ret void } define void @pointer_cmpxchg_expand5(ptr %ptr, ptr %v) { -; CHECK-LABEL: define void @pointer_cmpxchg_expand5( -; CHECK-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg volatile ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 -; CHECK-NEXT: ret void +; CHECK64-LABEL: define void @pointer_cmpxchg_expand5( +; CHECK64-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i64 +; CHECK64-NEXT: [[TMP2:%.*]] = cmpxchg volatile ptr [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK64-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK64-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK64-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK64-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK64-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @pointer_cmpxchg_expand5( +; CHECK32-SAME: ptr [[PTR:%.*]], ptr [[V:%.*]]) { +; CHECK32-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V]] to i32 +; CHECK32-NEXT: [[TMP2:%.*]] = cmpxchg volatile ptr [[PTR]], i32 0, i32 [[TMP1]] seq_cst seq_cst, align 4 +; 
CHECK32-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK32-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK32-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr +; CHECK32-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0 +; CHECK32-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK32-NEXT: ret void ; cmpxchg volatile ptr %ptr, ptr null, ptr %v seq_cst seq_cst ret void } define void @pointer_cmpxchg_expand6(ptr addrspace(1) %ptr, ptr addrspace(2) %v) { -; CHECK-LABEL: define void @pointer_cmpxchg_expand6( -; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[V:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(2) [[V]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(2) -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr addrspace(2), i1 } poison, ptr addrspace(2) [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr addrspace(2), i1 } [[TMP6]], i1 [[TMP4]], 1 -; CHECK-NEXT: ret void +; CHECK64-LABEL: define void @pointer_cmpxchg_expand6( +; CHECK64-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[V:%.*]]) { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(2) [[V]] to i64 +; CHECK64-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 0, i64 [[TMP1]] seq_cst seq_cst, align 8 +; CHECK64-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK64-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK64-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(2) +; CHECK64-NEXT: [[TMP6:%.*]] = insertvalue { ptr addrspace(2), i1 } poison, ptr addrspace(2) [[TMP5]], 0 +; CHECK64-NEXT: [[TMP7:%.*]] = insertvalue { ptr addrspace(2), i1 } [[TMP6]], i1 [[TMP4]], 1 
+; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @pointer_cmpxchg_expand6( +; CHECK32-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[V:%.*]]) { +; CHECK32-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(2) [[V]] to i32 +; CHECK32-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 0, i32 [[TMP1]] seq_cst seq_cst, align 4 +; CHECK32-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK32-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK32-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(2) +; CHECK32-NEXT: [[TMP6:%.*]] = insertvalue { ptr addrspace(2), i1 } poison, ptr addrspace(2) [[TMP5]], 0 +; CHECK32-NEXT: [[TMP7:%.*]] = insertvalue { ptr addrspace(2), i1 } [[TMP6]], i1 [[TMP4]], 1 +; CHECK32-NEXT: ret void ; cmpxchg ptr addrspace(1) %ptr, ptr addrspace(2) null, ptr addrspace(2) %v seq_cst seq_cst ret void } +define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind { +; CHECK64-LABEL: define <2 x ptr> @atomic_vec2_ptr_align( +; CHECK64-SAME: ptr [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK64-NEXT: [[TMP1:%.*]] = call i128 @__atomic_load_16(ptr [[X]], i32 2) +; CHECK64-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <2 x i64> +; CHECK64-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr> +; CHECK64-NEXT: ret <2 x ptr> [[TMP3]] +; +; CHECK32-LABEL: define <2 x ptr> @atomic_vec2_ptr_align( +; CHECK32-SAME: ptr [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK32-NEXT: [[RET:%.*]] = load atomic <2 x ptr>, ptr [[X]] acquire, align 16 +; CHECK32-NEXT: ret <2 x ptr> [[RET]] +; + %ret = load atomic <2 x ptr>, ptr %x acquire, align 16 + ret <2 x ptr> %ret +} + +define <4 x ptr addrspace(270)> @atomic_vec4_ptr_align(ptr %x) nounwind { +; CHECK64-LABEL: define <4 x ptr addrspace(270)> @atomic_vec4_ptr_align( +; CHECK64-SAME: ptr [[X:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = call i128 @__atomic_load_16(ptr [[X]], i32 2) +; CHECK64-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x i32> +; CHECK64-NEXT: 
[[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x ptr addrspace(270)> +; CHECK64-NEXT: ret <4 x ptr addrspace(270)> [[TMP3]] +; +; CHECK32-LABEL: define <4 x ptr addrspace(270)> @atomic_vec4_ptr_align( +; CHECK32-SAME: ptr [[X:%.*]]) #[[ATTR0]] { +; CHECK32-NEXT: [[TMP1:%.*]] = alloca <4 x ptr addrspace(270)>, align 16 +; CHECK32-NEXT: call void @llvm.lifetime.start.p0(ptr [[TMP1]]) +; CHECK32-NEXT: call void @__atomic_load(i32 16, ptr [[X]], ptr [[TMP1]], i32 2) +; CHECK32-NEXT: [[TMP2:%.*]] = load <4 x ptr addrspace(270)>, ptr [[TMP1]], align 16 +; CHECK32-NEXT: call void @llvm.lifetime.end.p0(ptr [[TMP1]]) +; CHECK32-NEXT: ret <4 x ptr addrspace(270)> [[TMP2]] +; + %ret = load atomic <4 x ptr addrspace(270)>, ptr %x acquire, align 16 + ret <4 x ptr addrspace(270)> %ret +} + +define <2 x i16> @atomic_vec2_i16(ptr %x) nounwind { +; CHECK-LABEL: define <2 x i16> @atomic_vec2_i16( +; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[RET:%.*]] = load atomic <2 x i16>, ptr [[X]] acquire, align 8 +; CHECK-NEXT: ret <2 x i16> [[RET]] +; + %ret = load atomic <2 x i16>, ptr %x acquire, align 8 + ret <2 x i16> %ret +} + +define <2 x half> @atomic_vec2_half(ptr %x) nounwind { +; CHECK-LABEL: define <2 x half> @atomic_vec2_half( +; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[X]] acquire, align 8 +; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[TMP1]] to <2 x half> +; CHECK-NEXT: ret <2 x half> [[RET]] +; + %ret = load atomic <2 x half>, ptr %x acquire, align 8 + ret <2 x half> %ret +} + +define <4 x i32> @atomic_vec4_i32(ptr %x) nounwind { +; CHECK64-LABEL: define <4 x i32> @atomic_vec4_i32( +; CHECK64-SAME: ptr [[X:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = call i128 @__atomic_load_16(ptr [[X]], i32 2) +; CHECK64-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x i32> +; CHECK64-NEXT: ret <4 x i32> [[TMP2]] +; +; CHECK32-LABEL: define <4 x i32> @atomic_vec4_i32( +; CHECK32-SAME: ptr [[X:%.*]]) #[[ATTR0]] { 
+; CHECK32-NEXT: [[TMP1:%.*]] = alloca <4 x i32>, align 16 +; CHECK32-NEXT: call void @llvm.lifetime.start.p0(ptr [[TMP1]]) +; CHECK32-NEXT: call void @__atomic_load(i32 16, ptr [[X]], ptr [[TMP1]], i32 2) +; CHECK32-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[TMP1]], align 16 +; CHECK32-NEXT: call void @llvm.lifetime.end.p0(ptr [[TMP1]]) +; CHECK32-NEXT: ret <4 x i32> [[TMP2]] +; + %ret = load atomic <4 x i32>, ptr %x acquire, align 16 + ret <4 x i32> %ret +} + +define <4 x float> @atomic_vec4_float(ptr %x) nounwind { +; CHECK64-LABEL: define <4 x float> @atomic_vec4_float( +; CHECK64-SAME: ptr [[X:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = call i128 @__atomic_load_16(ptr [[X]], i32 2) +; CHECK64-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x float> +; CHECK64-NEXT: ret <4 x float> [[TMP2]] +; +; CHECK32-LABEL: define <4 x float> @atomic_vec4_float( +; CHECK32-SAME: ptr [[X:%.*]]) #[[ATTR0]] { +; CHECK32-NEXT: [[TMP1:%.*]] = alloca <4 x float>, align 16 +; CHECK32-NEXT: call void @llvm.lifetime.start.p0(ptr [[TMP1]]) +; CHECK32-NEXT: call void @__atomic_load(i32 16, ptr [[X]], ptr [[TMP1]], i32 2) +; CHECK32-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP1]], align 16 +; CHECK32-NEXT: call void @llvm.lifetime.end.p0(ptr [[TMP1]]) +; CHECK32-NEXT: ret <4 x float> [[TMP2]] +; + %ret = load atomic <4 x float>, ptr %x acquire, align 16 + ret <4 x float> %ret +} From 72188570e33648b173fc8d4262aea97776192c52 Mon Sep 17 00:00:00 2001 From: Fujun Han Date: Mon, 11 May 2026 19:26:55 +0800 Subject: [PATCH 254/538] [MLIR] Make MLIRRegisterAllPasses depend on mlir-headers (#196913) RegisterAllPasses.cpp pulls in dialect Passes.h / generated Passes.h.inc via TableGen targets that are tied to mlir-headers, but add_mlir_library only adds mlir-generic-headers by default, so this TU can compile before those generated headers are ready and registerAllPasses() can miss passes (e.g. sporadic mlir-opt --help gaps). 
Add DEPENDS mlir-headers to MLIRRegisterAllPasses in mlir/lib/CMakeLists.txt so it waits for those outputs. Verified with ninja mlir-opt and mlir-opt --help | grep -E 'nvvm-attach-target|rocdl-attach-target' (or similar stable upstream passes in your tree). Signed-off-by: Fujun Han Co-authored-by: Cursor --- mlir/lib/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt index d7a6e28d98586..576942b78f4a8 100644 --- a/mlir/lib/CMakeLists.txt +++ b/mlir/lib/CMakeLists.txt @@ -41,6 +41,12 @@ add_mlir_library(MLIRRegisterAllPasses PARTIAL_SOURCES_INTENDED + # This TU includes dialect pass registration headers that depend on + # TableGen outputs (e.g. Passes.h.inc) wired into mlir-headers. Without this + # dependency it may compile before those headers are regenerated. + DEPENDS + mlir-headers + LINK_LIBS PUBLIC ${dialect_libs} # Some passes are part of the dialect libs ${conversion_libs} From 5c8c7bae4045d49869c6ee2a979419ad112e9964 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 11 May 2026 13:36:07 +0200 Subject: [PATCH 255/538] [clang][bytecode] Check destination size when initializing from an array initlist (#196916) --- clang/lib/AST/ByteCode/Compiler.cpp | 8 ++++---- clang/lib/AST/ByteCode/Interp.h | 16 ++++++++++++++++ clang/lib/AST/ByteCode/Opcodes.td | 1 + clang/test/AST/ByteCode/new-delete.cpp | 20 ++++++++++++++++++++ 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index faad6e0b4a230..beaeed09005b9 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2205,16 +2205,16 @@ bool Compiler::visitInitList(ArrayRef Inits, } if (QT->isArrayType()) { - if (Inits.size() == 1 && QT == Inits[0]->getType()) - return this->delegate(Inits[0]); - const ConstantArrayType *CAT = Ctx.getASTContext().getAsConstantArrayType(QT); uint64_t NumElems = CAT->getZExtSize(); - if 
(!this->emitCheckArraySize(NumElems, E)) + if (Initializing && !this->emitCheckArrayDestSize(NumElems, E)) return false; + if (Inits.size() == 1 && QT == Inits[0]->getType()) + return this->delegate(Inits[0]); + OptPrimType InitT = classify(CAT->getElementType()); unsigned ElementIndex = 0; for (const Expr *Init : Inits) { diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index fe2d99901d367..b620d1cce9010 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -3753,6 +3753,22 @@ inline bool CheckDecl(InterpState &S, CodePtr OpPC, const VarDecl *VD) { return true; } +/// Check if the destination array we're initializing can hold the \p NumElems +/// elements. +inline bool CheckArrayDestSize(InterpState &S, CodePtr OpPC, size_t NumElems) { + if (!CheckArraySize(S, OpPC, NumElems)) + return false; + + const Pointer &Ptr = S.Stk.peek(); + if (!Ptr.isUnknownSizeArray() && NumElems > Ptr.getNumElems()) { + S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_new_too_small) + << Ptr.getNumElems() << NumElems; + return false; + } + + return true; +} + inline bool Alloc(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { assert(Desc); diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 57ed71fb6f16b..0838263a53ede 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -458,6 +458,7 @@ def CheckLiteralType : Opcode { } def CheckArraySize : Opcode { let Args = [ArgUint64]; } +def CheckArrayDestSize : Opcode { let Args = [ArgUint64]; } def CheckFunctionDecl : Opcode { let Args = [ArgFunctionDecl]; } def CheckBitCast : Opcode { let Args = [ArgTypePtr, ArgBool]; } diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index ac2c2ff4a73c6..b6e6d333a4bcb 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -1197,6 +1197,26 @@ namespace vdtor { 
static_assert(vdtor_3(3) == 3); } +namespace ArrayDestSize { + template + constexpr T dynarray(int elems, int i) { + T *p; + if constexpr (sizeof(T) == 1) + p = new T[elems]{"fox"}; // both-note {{evaluated array bound 3 is too small to hold 4 explicitly initialized elements}} + else + p = new T[elems]{1, 2, 3}; // both-note {{evaluated array bound 2 is too small to hold 3 explicitly initialized elements}} + T n = p[i]; // both-note 4{{past-the-end}} + delete [] p; + return n; + } + static_assert(dynarray(4, 4) == 0); // both-error {{constant expression}} both-note {{in call}} + static_assert(dynarray(3, 3) == 0); // both-error {{constant expression}} both-note {{in call}} + static_assert(dynarray(2, 1) == 0); // both-error {{constant expression}} both-note {{in call}} + static_assert(dynarray(5, 5) == 0); // both-error {{constant expression}} both-note {{in call}} + static_assert(dynarray(4, 4) == 0); // both-error {{constant expression}} both-note {{in call}} + static_assert(dynarray(3, 2) == 'x'); // both-error {{constant expression}} both-note {{in call}} +} + #else /// Make sure we reject this prior to C++20 constexpr int a() { // both-error {{never produces a constant expression}} From 34502b0c7e076e658bd176030223029cd4402941 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Mon, 11 May 2026 14:51:05 +0300 Subject: [PATCH 256/538] [MLIR][GPU] Add gpu-lower-to-rocdl-pipeline meta-pass (#196751) Add `gpu-lower-to-rocdl-pipeline` meta-pass which lowers common MLIR dialects (gpu/arith/scf/vector) to binary, similar to the existing XeVM/NVVM pipelines. 
--- .../mlir/Dialect/GPU/Pipelines/Passes.h | 60 ++++++++ mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt | 5 + .../GPU/Pipelines/GPUToROCDLPipeline.cpp | 136 ++++++++++++++++++ mlir/lib/RegisterAllPasses.cpp | 1 + .../GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir | 69 +++++++++ 5 files changed, 271 insertions(+) create mode 100644 mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp create mode 100644 mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index ee3632ba149e5..6263ea63cbf22 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -64,6 +64,59 @@ struct GPUToNVVMPipelineOptions llvm::cl::init(true)}; }; +/// Options for the gpu to rocdl pipeline. +struct GPUToROCDLPipelineOptions + : public PassPipelineOptions { + PassOptions::Option indexBitWidth{ + *this, "index-bitwidth", + llvm::cl::desc("Bitwidth of the index type for the host (warning this " + "should be 64 until the GPU layering is fixed)"), + llvm::cl::init(64)}; + PassOptions::Option triple{ + *this, "triple", + llvm::cl::desc("AMDGPU target triple (e.g. amdgcn-amd-amdhsa)."), + llvm::cl::init("amdgcn-amd-amdhsa")}; + PassOptions::Option chip{ + *this, "chip", + llvm::cl::desc( + "AMDGPU target chip (e.g. gfx90a, gfx942, gfx1100). Required: " + "AMDGCN binaries are not forward-compatible across chip families.")}; + PassOptions::Option features{ + *this, "features", llvm::cl::desc("AMDGPU target features."), + llvm::cl::init("")}; + PassOptions::Option binaryFormat{ + *this, "binary-format", + llvm::cl::desc("Final GPU binary emission format (e.g. fatbin, binary, " + "isa, llvm, offloading)."), + llvm::cl::init("fatbin")}; + PassOptions::Option abiVersion{ + *this, "abi", + llvm::cl::desc("AMDHSA ABI version (e.g. 
\"500\", \"600\")."), + llvm::cl::init("600")}; + PassOptions::Option wave64{ + *this, "wave64", + llvm::cl::desc("Use Wave64 mode (default true; wave32 if false, " + "appropriate for RDNA / gfx10+ where supported)."), + llvm::cl::init(true)}; + PassOptions::Option optLevel{ + *this, "opt-level", + llvm::cl::desc("Optimization level for ROCDL/AMDGPU compilation."), + llvm::cl::init(2)}; + PassOptions::Option cmdOptions{ + *this, "rocdl-cmd-options", + llvm::cl::desc( + "Command line options to pass to the downstream AMDGPU compiler."), + llvm::cl::init("")}; + PassOptions::Option kernelUseBarePtrCallConv{ + *this, "kernel-bare-ptr-calling-convention", + llvm::cl::desc("Use bareptr calling convention for device kernels."), + llvm::cl::init(false)}; + PassOptions::Option hostUseBarePtrCallConv{ + *this, "host-bare-ptr-calling-convention", + llvm::cl::desc("Use bareptr calling convention for the host."), + llvm::cl::init(false)}; +}; + // Options for the gpu to xevm pipeline. struct GPUToXeVMPipelineOptions : public PassPipelineOptions { @@ -120,6 +173,12 @@ struct GPUToXeVMPipelineOptions void buildLowerToNVVMPassPipeline(OpPassManager &pm, const GPUToNVVMPipelineOptions &options); +/// Adds the GPU to ROCDL pipeline to the given pass manager. Transforms main +/// dialects (arith, memref, scf, vector, gpu) into ROCDL/AMDGPU. Begins with +/// GPU code regions, then handles host code. +void buildLowerToROCDLPassPipeline(OpPassManager &pm, + const GPUToROCDLPipelineOptions &options); + /// Adds the GPU to XeVM pipeline to the given pass manager. Transforms main /// dialects into XeVM targets. Begins with GPU code regions, then handles host /// code. @@ -128,6 +187,7 @@ void buildLowerToXeVMPassPipeline(OpPassManager &pm, /// Register all pipelines for the `gpu` dialect. 
void registerGPUToNVVMPipeline(); +void registerGPUToROCDLPipeline(); void registerGPUToXeVMPipeline(); } // namespace gpu diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt index 85b7b1ce90637..f523ccaee3f9f 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRGPUPipelines GPUToNVVMPipeline.cpp + GPUToROCDLPipeline.cpp GPUToXeVMPipeline.cpp ADDITIONAL_HEADER_DIRS @@ -12,8 +13,12 @@ add_mlir_dialect_library(MLIRGPUPipelines MLIRTransforms MLIRLinalgTransforms MLIRAffineToStandard + MLIRAMDGPUToROCDL + MLIRArithToLLVM + MLIRFuncToLLVM MLIRGPUToLLVMSPV MLIRGPUToNVVMTransforms + MLIRGPUToROCDLTransforms MLIRIndexToLLVM MLIRMathToLLVM MLIRMathToXeVM diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp new file mode 100644 index 0000000000000..1e5fd09a00a75 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp @@ -0,0 +1,136 @@ +//===- GPUToROCDLPipeline.cpp - Lowering pipeline to ROCDL/AMDGPU --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a sink pipeline that lowers a payload containing +// `gpu.launch` / `gpu.module` ops to AMDGPU/ROCDL and emits an AMDGCN binary +// blob via `gpu-module-to-binary`. It is the AMD counterpart of +// `gpu-lower-to-nvvm-pipeline` and `gpu-lower-to-xevm-pipeline`. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" +#include "mlir/Conversion/GPUToROCDL/Runtimes.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Pipelines/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { + +//===----------------------------------------------------------------------===// +// Common pipeline +//===----------------------------------------------------------------------===// +void buildCommonPassPipeline( + OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) { + // Lower AMDGPU dialect ops (e.g. amdgpu.lds_barrier, amdgpu.dpp, + // amdgpu.mfma, amdgpu.dot, ...) to ROCDL intrinsics first, while they may + // still live in unout-lined `gpu.launch` bodies. Mirrors the way NVVM's + // pipeline runs `convert-nvgpu-to-nvvm` before kernel outlining. 
+ ConvertAMDGPUToROCDLPassOptions amdgpuToROCDLOpt; + amdgpuToROCDLOpt.chipset = options.chip; + pm.addPass(createConvertAMDGPUToROCDLPass(amdgpuToROCDLOpt)); + + pm.addPass(createGpuKernelOutliningPass()); + pm.addPass(createConvertVectorToSCFPass()); + pm.addPass(createSCFToControlFlowPass()); + pm.addPass(createConvertFuncToLLVMPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + + GpuROCDLAttachTargetOptions rocdlTargetOptions; + rocdlTargetOptions.triple = options.triple; + rocdlTargetOptions.chip = options.chip; + rocdlTargetOptions.features = options.features; + rocdlTargetOptions.abiVersion = options.abiVersion; + rocdlTargetOptions.optLevel = options.optLevel; + rocdlTargetOptions.wave64Flag = options.wave64; + pm.addPass(createGpuROCDLAttachTarget(rocdlTargetOptions)); + + pm.addPass(createLowerAffinePass()); + pm.addPass(createArithToLLVMConversionPass()); + ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt; + convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth; + pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt)); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); +} + +//===----------------------------------------------------------------------===// +// GPUModule-specific stuff. +//===----------------------------------------------------------------------===// +void buildGpuPassPipeline(OpPassManager &pm, + const mlir::gpu::GPUToROCDLPipelineOptions &options) { + ConvertGpuOpsToROCDLOpsOptions opt; + opt.chipset = options.chip; + opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv; + opt.indexBitwidth = options.indexBitWidth; + // Always declare HIP as the runtime so that gpu.printf etc. lower to the + // matching runtime entry points exposed by `libmlir_rocm_runtime.so`. 
+ opt.runtime = mlir::gpu::amd::Runtime::HIP; + pm.addNestedPass(createConvertGpuOpsToROCDLOps(opt)); + pm.addNestedPass(createCanonicalizerPass()); + pm.addNestedPass(createCSEPass()); + pm.addNestedPass(createReconcileUnrealizedCastsPass()); +} + +//===----------------------------------------------------------------------===// +// Host Post-GPU pipeline +//===----------------------------------------------------------------------===// +void buildHostPostPipeline( + OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) { + GpuToLLVMConversionPassOptions opt; + opt.hostBarePtrCallConv = options.hostUseBarePtrCallConv; + opt.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv; + pm.addPass(createGpuToLLVMConversionPass(opt)); + + GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions; + gpuModuleToBinaryPassOptions.compilationTarget = options.binaryFormat; + gpuModuleToBinaryPassOptions.cmdOptions = options.cmdOptions; + pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions)); + pm.addPass(createConvertMathToLLVMPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); +} + +} // namespace + +void mlir::gpu::buildLowerToROCDLPassPipeline( + OpPassManager &pm, const GPUToROCDLPipelineOptions &options) { + // Common pipelines + buildCommonPassPipeline(pm, options); + + // GPUModule-specific stuff + buildGpuPassPipeline(pm, options); + + // Host post-GPUModule-specific stuff + buildHostPostPipeline(pm, options); +} + +void mlir::gpu::registerGPUToROCDLPipeline() { + PassPipelineRegistration( + "gpu-lower-to-rocdl-pipeline", + "The default pipeline lowers main dialects (arith, memref, scf, vector, " + "gpu) to ROCDL. 
It starts by lowering GPU code to the specified " + "compilation target (default is fatbin) then lowers the host code.", + buildLowerToROCDLPassPipeline); +} diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp index e1d5b1236c8a6..c645d737cb766 100644 --- a/mlir/lib/RegisterAllPasses.cpp +++ b/mlir/lib/RegisterAllPasses.cpp @@ -100,5 +100,6 @@ void mlir::registerAllPasses() { sparse_tensor::registerSparseTensorPipelines(); tosa::registerTosaToLinalgPipelines(); gpu::registerGPUToNVVMPipeline(); + gpu::registerGPUToROCDLPipeline(); gpu::registerGPUToXeVMPipeline(); } diff --git a/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir new file mode 100644 index 0000000000000..e4d2ad48644ae --- /dev/null +++ b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir @@ -0,0 +1,69 @@ +// RUN: mlir-opt %s \ +// RUN: --gpu-lower-to-rocdl-pipeline="chip=%chip" \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_rocm_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +// Mirror image of `vecadd.mlir`, but lowered through the +// `gpu-lower-to-rocdl-pipeline` meta-pass instead of the hand-rolled pass +// pipeline. Verifies that a single `--gpu-lower-to-rocdl-pipeline` invocation +// reproduces the same numeric output the multi-step recipe used to require. +// +// The kernel intentionally embeds an `amdgpu.sched_barrier` op (semantically a +// no-op, lowers to `rocdl.sched.barrier`) so that the test also exercises the +// `convert-amdgpu-to-rocdl` step that runs first inside the meta-pass. 
+ +func.func @vecadd(%arg0 : memref<5xf32>, %arg1 : memref<5xf32>, %arg2 : memref<5xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %block_dim = arith.constant 5 : index + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) { + %a = memref.load %arg0[%tx] : memref<5xf32> + %b = memref.load %arg1[%tx] : memref<5xf32> + amdgpu.sched_barrier allow = + %c = arith.addf %a, %b : f32 + memref.store %c, %arg2[%tx] : memref<5xf32> + gpu.terminator + } + return +} + +// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46] +func.func @main() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c5 = arith.constant 5 : index + %cf1dot23 = arith.constant 1.23 : f32 + %0 = memref.alloc() : memref<5xf32> + %1 = memref.alloc() : memref<5xf32> + %2 = memref.alloc() : memref<5xf32> + %3 = memref.cast %0 : memref<5xf32> to memref + %4 = memref.cast %1 : memref<5xf32> to memref + %5 = memref.cast %2 : memref<5xf32> to memref + scf.for %i = %c0 to %c5 step %c1 { + memref.store %cf1dot23, %3[%i] : memref + memref.store %cf1dot23, %4[%i] : memref + } + %6 = memref.cast %3 : memref to memref<*xf32> + %7 = memref.cast %4 : memref to memref<*xf32> + %8 = memref.cast %5 : memref to memref<*xf32> + gpu.host_register %6 : memref<*xf32> + gpu.host_register %7 : memref<*xf32> + gpu.host_register %8 : memref<*xf32> + %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref) -> (memref) + %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref) -> (memref) + %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref) -> (memref) + %12 = memref.cast %9 : memref to memref<5xf32> + %13 = memref.cast %10 : memref to memref<5xf32> + %14 = memref.cast %11 : memref to memref<5xf32> + + call @vecadd(%12, %13, %14) : (memref<5xf32>, memref<5xf32>, memref<5xf32>) -> () + call @printMemrefF32(%8) : (memref<*xf32>) -> () + return +} + +func.func private 
@mgpuMemGetDeviceMemRef1dFloat(%ptr : memref) -> (memref) +func.func private @printMemrefF32(%ptr : memref<*xf32>) From 1e8421904e34ce58b4b09a115aa058c6692395c5 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Mon, 11 May 2026 14:53:41 +0300 Subject: [PATCH 257/538] [mlir][dataflow] IntRange: Replace yield-based widening with per-state lattice budget (#196616) IntegerRangeAnalysis can hang on `scf.while` loops with dynamic bounds: a loop-carried range ratchets [0,0]->[0,1]->[0,2]->... by one per worklist visit, requiring up to 2^31 iterations on i32. The new `int-range-analysis-convergence.mlir` test reproduces this. The ratchet lives at framework merge sites (region successors, callable args) where the solver joins lattices via virtual `Lattice::join(const AbstractSparseLattice &)`. The pre-existing `isYieldedResult`/`isYieldedValue` heuristic in `IntegerRangeAnalysis::visitOperation` doesn't help: it runs in the transfer-function callback for inferrable-op results used by a terminator, not on the merge path. It is also harmful where it fires - slams to maxRange on the *second* visit (after, say, [1,1]->[1,2]), so naturally bounded accumulators (e.g. `arith.minsi`-clamped iter args) widen to [INT_MIN, INT_MAX]. Replace it with a per-state widening budget on `IntegerValueRangeLattice`: the lattice counts merge-site joins and forces the range to its max once the count hits `kIntegerRangeWideningBudget` (128). Only the virtual overload is overridden, so transfer-function joins via the non-virtual `join(const ValueT &)` are unaffected. The new `int-range-loop-iter-args.mlir` test pins the tighter bounds; the convergence test verifies termination. 
--- .../Analysis/DataFlow/IntegerRangeAnalysis.h | 28 ++++++ .../DataFlow/IntegerRangeAnalysis.cpp | 59 +++++------- .../Arith/int-range-analysis-convergence.mlir | 91 +++++++++++++++++++ .../Arith/int-range-loop-iter-args.mlir | 63 +++++++++++++ 4 files changed, 207 insertions(+), 34 deletions(-) create mode 100644 mlir/test/Dialect/Arith/int-range-analysis-convergence.mlir create mode 100644 mlir/test/Dialect/Arith/int-range-loop-iter-args.mlir diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h index 5b6ae9bf84265..8d75c4016355b 100644 --- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h @@ -26,9 +26,37 @@ class RewriterBase; namespace dataflow { /// This lattice element represents the integer value range of an SSA value. +/// +/// `join` overrides the base behaviour to apply per-state widening: once +/// the lattice has absorbed enough strictly-increasing merges the range is +/// forced to its max as a sound over-approximation. This is the sole +/// convergence guarantee for `IntegerRangeAnalysis` on loop-carried +/// values; without it, `scf.while` loops with dynamic bounds and nested +/// region ops can keep the solver ratcheting a loop-carried range by +1 +/// per worklist visit for up to 2^31 iterations on i32. The budget is +/// sized to be much larger than realistic merge counts on naturally +/// bounded accumulators (e.g. `arith.minsi`/`arith.andi`-clamped iter +/// args) so the analysis still converges to a tight range on those. +/// +/// Note that only the `(const AbstractSparseLattice &)` overload is +/// overridden, so the widening fires only at framework merge sites +/// (block-arg / region-successor / callable-arg joins) — +/// transfer-function updates that go through the non-virtual +/// `join(const ValueT &)` overload are unaffected. 
class IntegerValueRangeLattice : public Lattice { public: using Lattice::Lattice; + // The override below would otherwise hide the inherited + // `join(const ValueT &)` overload that callers (e.g. transfer functions) + // rely on for direct-value joins. + using Lattice::join; + + ChangeResult join(const AbstractSparseLattice &rhs) override; + +private: + /// Per-state merge-site change counter. Drives the widening budget in + /// `join`. + unsigned mergeChangeCount = 0; }; /// Integer range analysis determines the integer value range of SSA values diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp index b29fc28131806..613772c2b7404 100644 --- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp @@ -58,6 +58,29 @@ LogicalResult staticallyNonNegative(DataFlowSolver &solver, Operation *op) { } } // namespace mlir::dataflow +/// Number of merge-site joins a single integer-range lattice element is +/// allowed to absorb before `IntegerValueRangeLattice::join` forces it to +/// its max as a sound over-approximation. +/// +/// Trade-off: high enough that realistic loops with dynamic bounds (which +/// typically converge to a tight range in a small number of merge +/// iterations) are not widened prematurely; low enough that the +1 +/// ratchet pathology this widening exists to cut off (loop-carried ranges +/// growing by one per worklist visit) terminates after at most this many +/// extra solver iterations rather than ~2^31. 
+static constexpr unsigned kIntegerRangeWideningBudget = 128; + +ChangeResult IntegerValueRangeLattice::join(const AbstractSparseLattice &rhs) { + ChangeResult changed = Lattice::join(rhs); + if (mergeChangeCount >= kIntegerRangeWideningBudget) { + return changed | Lattice::join(IntegerValueRange::getMaxRange( + cast(getAnchor()))); + } + if (changed == ChangeResult::Change) + ++mergeChangeCount; + return changed; +} + LogicalResult IntegerRangeAnalysis::visitOperation( Operation *op, ArrayRef operands, ArrayRef results) { @@ -82,23 +105,7 @@ LogicalResult IntegerRangeAnalysis::visitOperation( LDBG() << "Inferred range " << attrs; IntegerValueRangeLattice *lattice = results[result.getResultNumber()]; - IntegerValueRange oldRange = lattice->getValue(); - - ChangeResult changed = lattice->join(attrs); - - // Catch loop results with loop variant bounds and conservatively make - // them [-inf, inf] so we don't circle around infinitely often (because - // the dataflow analysis in MLIR doesn't attempt to work out trip counts - // and often can't). 
- bool isYieldedResult = llvm::any_of(v.getUsers(), [](Operation *op) { - return op->hasTrait(); - }); - if (isYieldedResult && !oldRange.isUninitialized() && - !(lattice->getValue() == oldRange)) { - LDBG() << "Loop variant loop result detected"; - changed |= lattice->join(IntegerValueRange::getMaxRange(v)); - } - propagateIfChanged(lattice, changed); + propagateIfChanged(lattice, lattice->join(attrs)); }; inferrable.inferResultRangesFromOptional(argRanges, joinCallback); @@ -132,23 +139,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( std::distance(successor.getSuccessor()->getArguments().begin(), it); IntegerValueRangeLattice *lattice = nonSuccessorInputLattices[nonSuccessorInputIdx]; - IntegerValueRange oldRange = lattice->getValue(); - - ChangeResult changed = lattice->join(attrs); - - // Catch loop results with loop variant bounds and conservatively make - // them [-inf, inf] so we don't circle around infinitely often (because - // the dataflow analysis in MLIR doesn't attempt to work out trip counts - // and often can't). - bool isYieldedValue = llvm::any_of(v.getUsers(), [](Operation *op) { - return op->hasTrait(); - }); - if (isYieldedValue && !oldRange.isUninitialized() && - !(lattice->getValue() == oldRange)) { - LDBG() << "Loop variant loop result detected"; - changed |= lattice->join(IntegerValueRange::getMaxRange(v)); - } - propagateIfChanged(lattice, changed); + propagateIfChanged(lattice, lattice->join(attrs)); }; inferrable.inferResultRangesFromOptional(argRanges, joinCallback); diff --git a/mlir/test/Dialect/Arith/int-range-analysis-convergence.mlir b/mlir/test/Dialect/Arith/int-range-analysis-convergence.mlir new file mode 100644 index 0000000000000..a932d4b699a89 --- /dev/null +++ b/mlir/test/Dialect/Arith/int-range-analysis-convergence.mlir @@ -0,0 +1,91 @@ +// IntegerRangeAnalysis convergence on scf.while with dynamic bounds. +// +// The carry range ratchets [0,0]->[0,1]->[0,2]->... 
per worklist visit; +// nested scf.if layers with arith chains (addi, muli) bounded by remui +// create enough worklist cascade to defeat the solver's back-to-back +// convergence shortcut. The per-state widening budget on +// IntegerValueRangeLattice forces the range to its max after a bounded +// number of strict refinements, so the analysis terminates instead of +// hanging for ~minutes (or 2^31 iterations). +// +// We assert: +// - the analysis terminates and produces well-formed IR; +// - the loop-carried iter arg of the outer scf.while widens to +// [INT_MIN, INT_MAX] (the only sound result once the budget fires); +// - transfer-function results inside the body stay tight (e.g. +// `arith.remui ..., %c127` = [0, 126]), verifying the widening is +// scoped to framework merge sites, not transfer-function joins. +// +// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s + +// CHECK-LABEL: func.func @grouped_gemm_while_hang +// CHECK-SAME: (%[[N:.*]]: i32, %{{.*}}: i1) -> i32 +func.func @grouped_gemm_while_hang(%n: i32, %flag: i1) -> i32 { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %c3 = arith.constant 3 : i32 + %c7 = arith.constant 7 : i32 + %c127 = arith.constant 127 : i32 + %init = arith.cmpi slt, %c0, %n : i32 + + // CHECK: %[[OUTER:.*]]:2 = scf.while + %res:2 = scf.while (%a0 = %c0, %cond = %init) : (i32, i1) -> (i32, i1) { + scf.condition(%cond) %a0, %cond : i32, i1 + } do { + ^bb0(%b0: i32, %bc: i1): + %t0 = arith.addi %b0, %c1 : i32 + %ic = arith.cmpi slt, %t0, %n : i32 + + // CHECK: scf.while + %inner:2 = scf.while (%i0 = %t0, %iic = %ic) : (i32, i1) -> (i32, i1) { + scf.condition(%iic) %i0, %iic : i32, i1 + } do { + ^bb1(%j0: i32, %jc: i1): + + %L0 = scf.if %flag -> (i32) { + %a0_0 = arith.addi %j0, %c1 : i32 + %a0_1 = arith.muli %a0_0, %c7 : i32 + %a0_r = arith.remui %a0_1, %c127 : i32 + scf.yield %a0_r : i32 + } else { + %b0_0 = arith.addi %j0, %c3 : i32 + %b0_1 = arith.muli %b0_0, %c7 : i32 + %b0_r = arith.remui %b0_1, 
%c127 : i32 + scf.yield %b0_r : i32 + } + + %L1 = scf.if %flag -> (i32) { + %a1_0 = arith.addi %L0, %c1 : i32 + %a1_1 = arith.muli %a1_0, %c7 : i32 + %a1_r = arith.remui %a1_1, %c127 : i32 + scf.yield %a1_r : i32 + } else { + %b1_0 = arith.addi %L0, %c3 : i32 + %b1_1 = arith.muli %b1_0, %c7 : i32 + %b1_r = arith.remui %b1_1, %c127 : i32 + scf.yield %b1_r : i32 + } + + %nic = arith.cmpi slt, %L1, %n : i32 + // The yielded `arith.remui` result stays at [0, 126]: the widening + // budget only fires on virtual `Lattice::join` at framework merge + // sites, not on transfer-function joins for inferrable ops. + // CHECK: test.reflect_bounds {smax = 126 : si32, smin = 0 : si32, umax = 126 : ui32, umin = 0 : ui32} + %r_l1 = test.reflect_bounds %L1 : i32 + scf.yield %L1, %nic : i32, i1 + } + + %nc = arith.cmpi slt, %inner#0, %n : i32 + scf.yield %inner#0, %nc : i32, i1 + } + // The outer loop-carried iter arg goes through region-successor merges + // and is widened to maxRange after the budget is exhausted. The mere + // presence of these bounds here is the convergence assertion: without + // the patch the analysis would not terminate to print this attribute. + // CHECK: %[[BOUNDED:.*]] = test.reflect_bounds + // CHECK-SAME: {smax = 2147483647 : si32, smin = -2147483648 : si32, umax = 4294967295 : ui32, umin = 0 : ui32} + // CHECK-SAME: %[[OUTER]]#0 : i32 + %r = test.reflect_bounds %res#0 : i32 + // CHECK: return %[[BOUNDED]] : i32 + return %r : i32 +} diff --git a/mlir/test/Dialect/Arith/int-range-loop-iter-args.mlir b/mlir/test/Dialect/Arith/int-range-loop-iter-args.mlir new file mode 100644 index 0000000000000..24801875e257c --- /dev/null +++ b/mlir/test/Dialect/Arith/int-range-loop-iter-args.mlir @@ -0,0 +1,63 @@ +// RUN: mlir-opt --int-range-optimizations %s | FileCheck %s + +// Verify that `IntegerRangeAnalysis` infers tight bounds for loop-carried +// values that are structurally bounded inside the loop body (via +// `arith.minsi`, `arith.andi`, etc.). 
Convergence is guaranteed by the +// per-state widening budget on `IntegerValueRangeLattice`; the budget is +// large enough that these naturally bounded ratchets reach a fixpoint +// without being widened to `[INT_MIN, INT_MAX]`. + +// CHECK-LABEL: func @bounded_acc_for +// CHECK: test.reflect_bounds {smax = 10 : si32, smin = 0 : si32, umax = 10 : ui32, umin = 0 : ui32} +func.func @bounded_acc_for(%n: i32) -> i32 { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %c10 = arith.constant 10 : i32 + %res = scf.for %i = %c0 to %n step %c1 iter_args(%acc = %c0) -> i32 : i32 { + %incr = arith.addi %acc, %c1 : i32 + %clamped = arith.minsi %incr, %c10 : i32 + scf.yield %clamped : i32 + } + %r = test.reflect_bounds %res : i32 + return %r : i32 +} + +// The `arith.cmpi slt, %acc, 100` should fold to `true` once the analysis +// proves the iter arg stays in `[0, 10]`, exposing a downstream +// optimization that the previous yield-based widening masked. +// CHECK-LABEL: func @bounded_acc_while +// CHECK: %[[TRUE:.*]] = arith.constant true +// CHECK: scf.condition(%[[TRUE]]) +// CHECK: test.reflect_bounds {smax = 10 : si32, smin = 0 : si32, umax = 10 : ui32, umin = 0 : ui32} +func.func @bounded_acc_while() -> i32 { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %c10 = arith.constant 10 : i32 + %c100 = arith.constant 100 : i32 + %res = scf.while (%acc = %c0) : (i32) -> i32 { + %cond = arith.cmpi slt, %acc, %c100 : i32 + scf.condition(%cond) %acc : i32 + } do { + ^bb0(%a: i32): + %incr = arith.addi %a, %c1 : i32 + %clamped = arith.minsi %incr, %c10 : i32 + scf.yield %clamped : i32 + } + %r = test.reflect_bounds %res : i32 + return %r : i32 +} + +// CHECK-LABEL: func @bounded_mask_for +// CHECK: test.reflect_bounds {smax = 15 : si32, smin = 0 : si32, umax = 15 : ui32, umin = 0 : ui32} +func.func @bounded_mask_for(%n: i32) -> i32 { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %c15 = arith.constant 15 : i32 + %res = scf.for %i = %c0 
to %n step %c1 iter_args(%acc = %c0) -> i32 : i32 { + %incr = arith.addi %acc, %c1 : i32 + %masked = arith.andi %incr, %c15 : i32 + scf.yield %masked : i32 + } + %r = test.reflect_bounds %res : i32 + return %r : i32 +} From a5547d35922c14da0c40ac49e820ce75a48fe84d Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Mon, 11 May 2026 13:04:28 +0100 Subject: [PATCH 258/538] [Dexter] Add basic structured script parsing (#193710) See PSA: https://discourse.llvm.org/t/psa-planned-changes-to-dexter/90402 This patch begins adding support for "structured scripts" to Dexter, starting with some of the core classes and the ability to parse script files. This patch does not add the ability to actually run scripts, or any of the underlying functionality required to do so. NB: This patch adds a dependency on PyYAML, which is specified in a new requirements.txt file. --- .../debuginfo-tests/dexter/README.md | 7 +- .../dexter/dex/dextIR/DextIR.py | 4 +- .../dexter/dex/test_script/Nodes.py | 204 +++++++++++++++ .../dexter/dex/test_script/Script.py | 238 ++++++++++++++++++ .../dexter/dex/test_script/__init__.py | 0 .../dexter/dex/tools/TestToolBase.py | 6 + .../dexter/dex/tools/test/Tool.py | 34 ++- .../scripts/parser/bad-where-attr.test | 13 + .../scripts/parser/error-locations.test | 23 ++ .../scripts/parser/invalid-script-nodes.test | 24 ++ .../scripts/parser/valid-parse.test | 20 ++ .../debuginfo-tests/dexter/requirements.txt | 1 + cross-project-tests/lit.cfg.py | 60 ++++- 13 files changed, 621 insertions(+), 13 deletions(-) create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/test_script/Nodes.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/test_script/Script.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/bad-where-attr.test create mode 100644 
cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/error-locations.test create mode 100644 cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/invalid-script-nodes.test create mode 100644 cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/valid-parse.test create mode 100644 cross-project-tests/debuginfo-tests/dexter/requirements.txt diff --git a/cross-project-tests/debuginfo-tests/dexter/README.md b/cross-project-tests/debuginfo-tests/dexter/README.md index 44c43435b20d5..99aa6fb601da7 100644 --- a/cross-project-tests/debuginfo-tests/dexter/README.md +++ b/cross-project-tests/debuginfo-tests/dexter/README.md @@ -13,11 +13,12 @@ The following command evaluates your environment, listing the available and comp dexter.py list-debuggers ## Dependencies -[TODO] Add a requirements.txt or an install.py and document it here. -### Python 3.6 +See: requirements.txt -DExTer requires python version 3.6 or greater. +### Python 3.8 + +DExTer requires python version 3.8 or greater. 
### pywin32 python package diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/DextIR.py b/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/DextIR.py index 42500c4b9681d..eb5ad1b97afc5 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/DextIR.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/dextIR/DextIR.py @@ -6,10 +6,11 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception from collections import OrderedDict import os -from typing import List +from typing import List, Union from dex.dextIR.DebuggerIR import DebuggerIR from dex.dextIR.StepIR import StepIR, StepKind +from dex.test_script.Script import DexterScript def _step_kind_func(context, step): @@ -55,6 +56,7 @@ def __init__( self.debugger = debugger self.commands = commands self.steps: List[StepIR] = [] + self.script: Union[DexterScript, None] = None def __str__(self): colors = "rgby" diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/Nodes.py b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/Nodes.py new file mode 100644 index 0000000000000..875fe949bf2a2 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/Nodes.py @@ -0,0 +1,204 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""This file defines all of the Nodes used to create Dexter scripts. All Nodes must be registered with the yaml +constructor/representer in `setup_yaml_parser` before loading or printing any script. 
+""" + +import abc +from dataclasses import dataclass +from typing import Any, Dict, Optional, Union +import yaml +from dex.dextIR.ValueIR import ValueIR +from dex.utils.Exceptions import Error + + +def setup_yaml_parser(loader): + reg_classes = [ + Where, + Value, + DexRange, + ] + for c in reg_classes: + c.register_yaml(loader) + + +class DexterNodeError(Error): + """Class representing errors with Dexter node parsing.""" + + def __init__(self, node, msg): + super(DexterNodeError, self).__init__(msg) + self.msg = msg + self.node = node + + def __str__(self): + return f"Error with node: {self.node}: {self.msg}" + + +################### +## Structural Nodes: These are used as keys in the Script, and collectively define Dexter's actions when running a test: +## how it steps and navigates through the debuggee program, and what information it collects from the +## debugger. + + +class Where: + """One or more instances of this class define a range of steps in a debugging session. Any expects in the script + within scope of a "Where" will only be evaluated for the steps where the Where applies. 
+ """ + + def __init__(self, attributes: dict): + self.file: Optional[str] = attributes.pop("file", None) + self.function: Union[list[str], str, None] = attributes.pop("function", None) + self.lines: Union[int, DexRange, None] = attributes.pop("lines", None) + self.after_hit_count: Optional[int] = attributes.pop("after_hit_count", None) + self.for_hit_count: Optional[int] = attributes.pop("for_hit_count", None) + self.conditions: dict = attributes.pop("conditions", None) + if attributes: + raise DexterNodeError( + self, f"unexpected attributes {', '.join(attributes)}" + ) + if ( + not self.function + and not self.lines + and (self.for_hit_count or self.after_hit_count) + ): + raise DexterNodeError( + self, "can't check hit counts without an explicit lines or function arg" + ) + + def __repr__(self): + elts = [ + f"{name}={value}" + for name, value in self.get_attrs().items() + if value is not None + ] + return f"Where(" + ", ".join(elts) + ")" + + def get_attrs(self) -> Dict[str, Any]: + return { + "file": self.file, + "function": self.function, + "lines": self.lines, + "for_hit_count": self.for_hit_count, + "after_hit_count": self.after_hit_count, + "conditions": self.conditions, + } + + @staticmethod + def constructor(loader: yaml.Loader, node): + return Where(loader.construct_mapping(node)) + + @staticmethod + def representer(dumper: yaml.Dumper, data: "Where"): + mapping = { + name: value for name, value in data.get_attrs().items() if value is not None + } + return dumper.represent_mapping("!where", mapping, flow_style=True) + + @staticmethod + def register_yaml(loader): + yaml.add_constructor("!where", Where.constructor, loader) + yaml.add_representer(Where, Where.representer) + + def get_lines(self) -> range: + """Returns the range of line numbers that this Where references, returning an empty range if this Where does not + refer to any lines.""" + if not self.lines: + return range(-1) + if isinstance(self.lines, int): + return range(self.lines, self.lines + 
1) + assert isinstance( + self.lines, DexRange + ), f"Invalid type for lines: {self.lines}: ({type(self.lines)})" + return self.lines.to_range() + + +################### +## Expect Nodes: These nodes define the expected outputs from the debugger - they are the only nodes that produce +## metrics, and map to an expected value in the script. + + +class Expect: + """An expectation of some debugger state that will be compared to actual observed debugger state and generate one + or more metrics as a measurement of the difference. + Expects are largely evaluated independently, but may influence each other through the evaluation context. + """ + + @staticmethod + def get_variable_result(value: ValueIR) -> Optional[str]: + """For Expects that extract actual results from ValueIR, this method returns that result from the given value, + excluding any subvalues (i.e. struct members), or None if there is no valid result for this ValueIR. + """ + + @abc.abstractmethod + def get_watched_expr(self) -> str: + """Returns the list of expressions that this Expect wants to evaluate.""" + + +class Value(Expect): + def __init__(self, variable_name: str): + self.variable_name = variable_name + self.actual_values = None + + @staticmethod + def get_variable_result(value: ValueIR) -> Optional[str]: + if value.could_evaluate and not ( + value.is_irretrievable or value.is_optimized_away + ): + return value.value + return None + + def get_watched_expr(self) -> str: + return self.variable_name + + def __repr__(self): + return f"Value({self.variable_name})" + + @staticmethod + def constructor(loader: yaml.Loader, node): + return Value(loader.construct_scalar(node)) + + @staticmethod + def representer(dumper, data): + return dumper.represent_scalar("!value", data.variable_name) + + @staticmethod + def register_yaml(loader): + yaml.add_constructor("!value", Value.constructor, loader) + yaml.add_representer(Value, Value.representer) + + +############## +## Utility Nodes: Can be used anywhere in a 
script as a form of syntactic sugar. + + +@dataclass(frozen=True) +class DexRange: + start: int + stop: int + + def __repr__(self) -> str: + return f"[{self.start} - {self.stop}]" + + # We use an inclusive range in Dexter scripts, while python ranges are exclusive. + def to_range(self) -> range: + return range(self.start, self.stop + 1) + + @staticmethod + def constructor(loader: yaml.Loader, node): + range_seq = loader.construct_sequence(node) + if len(range_seq) != 2 or not all(isinstance(elt, int) for elt in range_seq): + raise DexterNodeError(node, "range must have exactly 2 int elements") + return DexRange(range_seq[0], range_seq[1]) + + @staticmethod + def representer(dumper, data: "DexRange"): + return dumper.represent_sequence("!range", [data.start, data.stop]) + + @staticmethod + def register_yaml(loader): + yaml.add_constructor("!range", DexRange.constructor, loader) + yaml.add_representer(DexRange, DexRange.representer) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/Script.py b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/Script.py new file mode 100644 index 0000000000000..a338f62f95af7 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/Script.py @@ -0,0 +1,238 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""This file defines the DexterScript class. Using Nodes as building blocks, the DexterScript defines a complete Dexter +test, a structured definition of locations, values, and actions used to drive a debugging session and evaluate the +results. 
+""" + +from pathlib import PurePath +import os +from typing import Any, Callable, Optional, Set +import yaml + +from dex.test_script.Nodes import ( + Expect, + Where, + setup_yaml_parser, +) + +from dex.utils.Exceptions import Error +from dex.utils.Timer import Timer + + +class DexterScriptError(Error): + pass + + +class Scope: + """Helper class used to simplify queries about the context of a Node in the Dexter Script. The context for a given + Node consists of some base context information in the root of the script, and then all Where nodes in the parent + chain of the current Node. Therefore each Script has a root Scope object, and each Node's context is given by a + Scope chain built from the root Scope and every Where between the root and the given Node. + """ + + def __init__( + self, + file: Optional[str] = None, + where: Optional[Where] = None, + parent_scope: "Optional[Scope]" = None, + ): + """Can be initialized with either a file for the default Scope, or with the properties of a Where + for any script-nested Scope. + """ + if where is not None: + assert ( + parent_scope is not None + ), "Scope for a Where node must have a parent scope!" + assert ( + file is None + ), "Scope for a Where node cannot have a separately-defined file!" + self.file = None + self.where = where + self.parent_scope = parent_scope + else: + assert ( + parent_scope is None + ), "Scope for a Root node cannot have a parent scope!" + self.file = file + self.where = None + self.parent_scope = None + + def add_where(self, where: Where): + """Adds `where` to this Scope's chain.""" + return Scope(where=where, parent_scope=self) + + +class DexterScript: + def __init__( + self, + context, + script_obj, + scope: Scope, + ): + self.context = context + self.script_obj = script_obj + self.root_scope = scope + # `visit_script` will validate the structure of the script, as it traverses the full script and raises an + # exception if it sees anything unexpected. 
+ self.visit_script() + + # If a truthy value is returned, abort further visiting and return that value. + def _visit_script( + self, script, scope: Scope, visit_where=None, visit_expect=None, visit_then=None + ) -> Any: + def do(visitor, *args): + if visitor: + return visitor(*args) + return None + + if not isinstance(script, dict): + raise DexterScriptError(f"Found unexpected node: {script}") + for key, value in script.items(): + if isinstance(key, Where): + if result := do(visit_where, key, scope): + return result + new_scope = scope.add_where(key) + if result := self._visit_script( + value, new_scope, visit_where, visit_expect, visit_then + ): + return result + elif isinstance(key, Expect): + if result := do(visit_expect, key, value, scope): + return result + else: + raise DexterScriptError(f"Found unexpected node: {key}") + + # Any visitor function provided may return a truthy value to abort the visit and return that value. + def visit_script( + self, + visit_where: Optional[Callable[[Where, Scope], Any]] = None, + visit_expect: Optional[Callable[[Expect, Any, Scope], Any]] = None, + ) -> Any: + """Visits all nodes in the script in pre-order traversal, calling any non-none provided visitor functions for + each respective node type. Note that we do not visit expected values independently of their associated expect; + instead, visit_expect accepts the Expect node and its expected value as an argument. 
+ + If any visit function returns a truthy value, traversal will early-exit and this function returns that value; + otherwise, this function returns None.""" + return self._visit_script( + self.script_obj, self.root_scope, visit_where, visit_expect + ) + + @property + def root_wheres(self) -> Set[Where]: + return set(node for node in self.script_obj if isinstance(node, Where)) + + def dump(self) -> str: + return yaml.dump(self.script_obj) + + +# Helper function to apply a line offset to the errors reported by YAML while loading, to account for the YAML documents +# being embedded in part of a file. +def try_load_yaml(yaml_doc, loader, line_offset=0): + """Helper function that loads a YAML document from within a file, where the document may start in the middle of the + file. In this case, the value of line_offset should be set to the start line of the YAML document, and this function + will fix-up any returned syntax errors to point to the correct line in the file.""" + try: + return yaml.load(yaml_doc, loader) + except yaml.MarkedYAMLError as e: + # MarkedYAMLError is an error with a 'Mark' pointing to the location of the error; this helper function applies + # our line offset to the provided mark if it is present. + def adjust_mark_loc(mark: Optional[yaml.Mark]) -> Optional[yaml.Mark]: + if mark is None: + return None + return yaml.Mark( + mark.name, + mark.index, + mark.line + line_offset, + mark.column, + mark.buffer, + mark.pointer, + ) + + # Adjust the error marks and then propagate the adjusted error. 
+ e.context_mark = adjust_mark_loc(e.context_mark) + e.problem_mark = adjust_mark_loc(e.problem_mark) + raise e + + +def get_script(context, file, loader) -> DexterScript: + """Searches the given file for a valid Dexter script, and returns the first valid script that it finds or raises an + Error if none is found.""" + if not os.path.exists(file): + raise Error(f"Provided script file '{file}' does not exist.") + with open(file, "r") as r: + lines = r.readlines() + if not lines: + raise Error(f"Provided script file '{file}' is empty.") + + numbered_lines = [(idx + 1, line) for idx, line in enumerate(lines)] + root_scope = Scope(file=str(file)) + start_line = None + attempted_scripts = [] + start_line = next((idx for idx, line in numbered_lines if line == "---\n"), None) + if start_line is None: + # If we saw no '---', then assume the whole file is a document and try to parse it. + try: + return DexterScript( + context, + try_load_yaml("\n".join(lines), loader), + root_scope, + ) + except (Error, yaml.YAMLError) as e: + raise Error(f"File '{file}' was not a valid Dexter script:\n{e}") + # If we have at least one valid document start, then check every document until we see one that is a valid Dexter + # test. 
+ while start_line is not None: + stop_line = next( + ( + idx + for idx, line in numbered_lines[start_line + 1 :] + if line.startswith("...") + ), + len(lines), + ) + try: + return DexterScript( + context, + try_load_yaml( + "\n".join(lines[start_line:stop_line]), loader, start_line + ), + root_scope, + ) + except (Error, yaml.YAMLError) as e: + attempted_scripts.append((start_line, e)) + start_line = next( + (idx for idx, line in numbered_lines[stop_line + 1 :] if line == "---\n"), + None, + ) + script_error_messages = "\n".join( + f"Script starting line {line}:\n{e}" for line, e in attempted_scripts + ) + raise Error( + f"No valid Dexter script found in file '{file}'; candidates:\n{script_error_messages}" + ) + + +def get_dexter_script(context, test_file, source_root_dir): + setup_yaml_parser(yaml.CLoader) + with Timer("parsing script"): + script = get_script(context, test_file, yaml.CLoader) + assert script.root_scope.file == test_file + source_files = set() + source_dir = source_root_dir if source_root_dir else str(test_file) + + def check_explicit_files(where: Where, _: Scope): + if not where.file: + return + declared_path = where.file + if not os.path.isabs(declared_path): + declared_path = os.path.join(source_dir, declared_path) + source_files.add(str(PurePath(declared_path))) + + script.visit_script(visit_where=check_explicit_files) + return script, source_files diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py index ecfc8ebcb1507..c153193a0baca 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py @@ -65,6 +65,12 @@ def add_tool_arguments(self, 
parser, defaults): default=None, help="if passed, result names will include relative path from this directory", ) + parser.add_argument( + "--use-script", + action="store_true", + default=False, + help="if passed, Dexter will look for a structured YAML script instead of dexter commands", + ) def handle_options(self, defaults): options = self.context.options diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index 693c05b97af7c..b295ce02735ee 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -19,6 +19,7 @@ from dex.debugger.DebuggerControllers.ConditionalController import ConditionalController from dex.dextIR.DextIR import DextIR from dex.heuristic import Heuristic +from dex.test_script.Script import get_dexter_script from dex.tools import TestToolBase from dex.utils.Exceptions import DebuggerException from dex.utils.Exceptions import BuildScriptException, HeuristicException @@ -106,6 +107,11 @@ def add_tool_arguments(self, parser, defaults): action="store_true", help="calculate the average score of every test run", ) + parser.add_argument( + "--skip-run", + action="store_true", + help="if true, skip running the debugger and produce no output; used for testing purposes", + ) super(Tool, self).add_tool_arguments(parser, defaults) def _init_debugger_controller(self): @@ -115,12 +121,26 @@ def _init_debugger_controller(self): dexter_version=self.context.version, ) - step_collection.commands, new_source_files = get_command_infos( - self.context.options.test_files, self.context.options.source_root_dir - ) + if self.context.options.use_script: + step_collection.script, new_source_files = get_dexter_script( + self.context, + self.context.options.test_files[0], + self.context.options.source_root_dir, + ) + assert ( + self.context.options.skip_run + ), "Debugging not yet 
supported with --use-script" + else: + step_collection.commands, new_source_files = get_command_infos( + self.context.options.test_files, self.context.options.source_root_dir + ) self.context.options.source_files.extend(list(new_source_files)) + # If we are not running a debugger, return the DextIR instead of a DebuggerController. + if self.context.options.skip_run: + return step_collection + cond_controller_cmds = ["DexLimitSteps", "DexStepFunction", "DexContinue"] if any(c in step_collection.commands for c in cond_controller_cmds): debugger_controller = ConditionalController(self.context, step_collection) @@ -132,6 +152,10 @@ def _init_debugger_controller(self): def _get_steps(self): """Generate a list of debugger steps from a test case.""" debugger_controller = self._init_debugger_controller() + + if self.context.options.skip_run: + self.context.logger.warning("Skipping run...") + return debugger_controller debugger_controller = run_debugger_subprocess( debugger_controller, self.context.working_directory.path ) @@ -227,6 +251,10 @@ def _run_test(self, test_name): self.context.options.binary, self.context.options.executable ) steps = self._get_steps() + if self.context.options.skip_run: + if steps.script is not None: + print(steps.script.dump()) + return self._record_steps(test_name, steps) heuristic_score = Heuristic(self.context, steps) self._record_score(test_name, heuristic_score) diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/bad-where-attr.test b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/bad-where-attr.test new file mode 100644 index 0000000000000..9d1d01413acec --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/bad-where-attr.test @@ -0,0 +1,13 @@ +RUN: not %dexter_regression_test_run --binary %s --use-script -- %s 2>&1 | FileCheck %s + +This is a test that when the script document starts part-way through a file, any syntax errors are reported with 
the +correct line number. + +CHECK: No valid Dexter script found in file +CHECK-NEXT: Script starting line 10 +CHECK-NEXT: Error with node: Where(file=main.cpp): unexpected attributes bees + +--- +!where {file: "main.cpp", bees: nose}: + !value x: 5 +... diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/error-locations.test b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/error-locations.test new file mode 100644 index 0000000000000..941d5963ccb61 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/error-locations.test @@ -0,0 +1,23 @@ +RUN: not %dexter_regression_test_run --binary %s --use-script -- %s 2>&1 | FileCheck %s + +This is a test that when the script document starts part-way through a file, any syntax errors are reported with the +correct line number. +NB: There is some weirdness with PyYAML where every comment line inside of a document causes 2 lines of offset for + subsequently-reported errors; this is out of Dexter's hands, so don't test for it here. + +CHECK: No valid Dexter script found in file + +CHECK-NEXT: Script starting line [[# @LINE + 3]] +CHECK-NEXT: could not determine a constructor for the tag '!not_a_real_node' +CHECK-NEXT: line [[# @LINE + 2]], column 1 +--- +!not_a_real_node {function: foo}: + !value x: 5 +... +CHECK-NEXT: Script starting line [[# @LINE + 3]] +CHECK-NEXT: could not determine a constructor for the tag '!also_not_a_real_node' +CHECK-NEXT: line [[# @LINE + 2]], column 1 +--- +!also_not_a_real_node {function: foo}: + !value x: 5 +... 
diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/invalid-script-nodes.test b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/invalid-script-nodes.test new file mode 100644 index 0000000000000..6989f475a11e5 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/invalid-script-nodes.test @@ -0,0 +1,24 @@ +RUN: not %dexter_regression_test_run --binary %s --use-script -- %s 2>&1 | FileCheck %s + +This is a test that Dexter validates that nodes appear in the correct place in scripts. + + +CHECK: No valid Dexter script found in file +CHECK-NEXT: Script starting line [[# @LINE + 1 ]] +--- +# CHECK-NEXT: Found unexpected node: not a dict +!where {function: foo}: not a dict +... + +CHECK-NEXT: Script starting line [[# @LINE + 1 ]] +--- +# CHECK-NEXT: Found unexpected node: Where(function=foo) +!where {function: foo} +... + +CHECK-NEXT: Script starting line [[# @LINE + 1 ]] +--- +# CHECK-NEXT: Found unexpected node: [{Value(x): 5}] +!where {function: foo}: +- !value x: 5 +... diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/valid-parse.test b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/valid-parse.test new file mode 100644 index 0000000000000..a639fadf576fa --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/parser/valid-parse.test @@ -0,0 +1,20 @@ +RUN: %dexter_regression_test_run --binary %s --use-script --skip-run -- %s 2>&1 | FileCheck %s + +Provides a valid test case and checks that we can parse it successfully and print the script back out correctly. + +CHECK: ? !where {function: foo} +CHECK-NEXT: : !value 'x': 5 +CHECK-NEXT: ? !where {file: lib.cpp, lines: !range [10, 20]} +CHECK-NEXT: : !value 'y': 10 +CHECK-NEXT: ? 
!where {lines: 5} +CHECK-NEXT: : !value 'z': bees + + +--- +!where {function: foo}: + !value x: 5 + !where {file: "lib.cpp", lines: !range [10, 20]}: + !value y: 10 +!where {lines: 5}: + !value z: bees +... diff --git a/cross-project-tests/debuginfo-tests/dexter/requirements.txt b/cross-project-tests/debuginfo-tests/dexter/requirements.txt new file mode 100644 index 0000000000000..17eef56b9ca9e --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/requirements.txt @@ -0,0 +1 @@ +PyYAML >= 6.0.0 diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index a6412329ba0ef..df45221d90ef3 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -110,6 +110,49 @@ def get_required_attr(config, attr_name): config.available_features.add("llvm-ar") +def check_dexter_requirements(): + # Determine whether Dexter's dependencies are available, and disable Dexter tests if not. + dexter_requirements_path = os.path.join( + config.cross_project_tests_src_root, + "debuginfo-tests", + "dexter", + "requirements.txt", + ) + if not os.path.isfile(dexter_requirements_path): + print( + f"Couldn't find Dexter requirements path at existed path: {dexter_requirements_path}" + ) + return False + with open(dexter_requirements_path) as req: + requirements_list = [ + req_str + for req_line in req + if (req_str := req_line.strip()) and not req_str.startswith("#") + ] + try: + from packaging.requirements import Requirement + from importlib.metadata import version + except Exception as e: + # If we don't have packaging, we can't check requirements - assume false. 
+ print(f"Missing required packages to check version: {e}") + return False + for req_str in requirements_list: + req = Requirement(req_str) + if req.marker and not req.marker.evaluate(): + continue + try: + current_version = version(req.name) + except BaseException as e: + print(f"Missing required packages for Dexter: {req_str}") + return False + if req.specifier and current_version not in req.specifier: + print( + f"Dexter Requirement {req_str} has incorrect installed version {current_version}" + ) + return False + return True + + def configure_dexter_substitutions(): """Configure substitutions for host platform and return list of dependencies""" # Produce dexter path, lldb path, and combine into the %dexter substitution @@ -223,17 +266,22 @@ def can_target_host(): # Dexter tests run on the host machine. If the host arch is supported add # 'dexter' as an available feature and force the dexter tests to use the host # triple. -if can_target_host(): +if not check_dexter_requirements(): + print( + "Missing or unable to verify dexter requirements; skipping dexter tests in the debuginfo-tests project." + ) +elif not can_target_host(): + print( + "Host triple {} not supported. Skipping dexter tests in the " + "debuginfo-tests project.".format(config.host_triple) + ) +else: if config.host_triple != config.target_triple: print("Forcing dexter tests to use host triple {}.".format(config.host_triple)) + dependencies = configure_dexter_substitutions() if all(d in config.available_features for d in dependencies): config.available_features.add("dexter") -else: - print( - "Host triple {} not supported. 
Skipping dexter tests in the " - "debuginfo-tests project.".format(config.host_triple) - ) tool_dirs = [config.llvm_tools_dir] From 64212c8bcc85a1de3e4a8cebde72b721f5ed27e8 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Mon, 11 May 2026 14:13:21 +0200 Subject: [PATCH 259/538] [NFC][SPIR-V] Use createVirtualRegister helper in selectSUCmp (#196905) Resolve the existing TODO that asks us to do that --- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 338f53c8d791d..4618e2df74ae0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -4295,11 +4295,8 @@ bool SPIRVInstructionSelector::selectSUCmp(Register ResVReg, BoolType = GR.getOrCreateSPIRVVectorType(BoolType, N, I, TII); Register BoolTypeReg = GR.getSPIRVTypeID(BoolType); // Build less-than-equal and less-than. - // TODO: replace with one-liner createVirtualRegister() from - // llvm/lib/Target/SPIRV/SPIRVUtils.cpp when PR #116609 is merged. - Register IsLessEqReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); - MRI->setType(IsLessEqReg, LLT::scalar(64)); - GR.assignSPIRVTypeToVReg(ResType, IsLessEqReg, MIRBuilder.getMF()); + Register IsLessEqReg = + createVirtualRegister(BoolType, &GR, MRI, MIRBuilder.getMF()); BuildMI(BB, I, I.getDebugLoc(), TII.get(IsSigned ? 
SPIRV::OpSLessThanEqual : SPIRV::OpULessThanEqual)) .addDef(IsLessEqReg) @@ -4307,9 +4304,8 @@ bool SPIRVInstructionSelector::selectSUCmp(Register ResVReg, .addUse(I.getOperand(1).getReg()) .addUse(I.getOperand(2).getReg()) .constrainAllUses(TII, TRI, RBI); - Register IsLessReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); - MRI->setType(IsLessReg, LLT::scalar(64)); - GR.assignSPIRVTypeToVReg(ResType, IsLessReg, MIRBuilder.getMF()); + Register IsLessReg = + createVirtualRegister(BoolType, &GR, MRI, MIRBuilder.getMF()); BuildMI(BB, I, I.getDebugLoc(), TII.get(IsSigned ? SPIRV::OpSLessThan : SPIRV::OpULessThan)) .addDef(IsLessReg) From c3628c7f125b1fe5dffaf78048322ff69cffd152 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 11 May 2026 14:30:14 +0200 Subject: [PATCH 260/538] Reapply [AA] No synchronization effects for never-escaping identified local (#196923) Relative to the previous attempt, this makes sure that the location does not alias with the pointer operand first. If it aliases, then we need to consider the direct ModRef effects of the instruction, not just the synchronization effects. ----- Fences and other synchronizing operations (such as atomic accesses stronger than monotonic) are modelled as reading and writing all memory, in order to enforce their implied ordering constraints. Currently, this happens even for identified function locals that do not escape. This patch excludes those objects. Notably, we can not reason based on captures-before here, because the synchronizing operation still has an effect even if the object only escapes later. The hope here is that with this restriction in place, it may be viable to respect potential synchronization inside non-nosync function calls. 
--- llvm/lib/Analysis/AliasAnalysis.cpp | 82 +++++++++++++------ llvm/test/Analysis/BasicAA/atomics.ll | 72 +++++++++++----- .../test/Analysis/MemorySSA/atomic-clobber.ll | 2 +- .../Transforms/DeadStoreElimination/fence.ll | 8 -- llvm/test/Transforms/GVN/fence.ll | 4 +- .../GVN/simplify-icf-cache-invalidation.ll | 2 - llvm/test/Transforms/LICM/atomics.ll | 6 +- 7 files changed, 111 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 1449a54d1de2b..8f311cd0bfeac 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -458,20 +458,48 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) { // Helper method implementation //===----------------------------------------------------------------------===// +/// Get ModRefInfo for a synchronizing operation, such as a fence or stronger +/// than monotonic atomic load/store. +static ModRefInfo getSyncEffects(AAResults *AA, const MemoryLocation &Loc, + AAQueryInfo &AAQI) { + if (!Loc.Ptr) + return ModRefInfo::ModRef; + + // If the location is *never* captured, it cannot be affected by + // synchronizing operations. However, we cannot ignore locations that are + // only captured after the operation, as the synchronization may still have + // an effect if the object is only captured *later*. As such, set I to null + // and ReturnCaptures to true here. + const Value *Obj = getUnderlyingObject(Loc.Ptr); + if (capturesNothing(AAQI.CA->getCapturesBefore( + Obj, /*I=*/nullptr, /*OrAt=*/true, /*ReturnCaptures=*/true))) + return ModRefInfo::NoModRef; + + // If Loc is a constant memory location, the synchronization operation + // definitely could not modify it. + return AA->getModRefInfoMask(Loc); +} + ModRefInfo AAResults::getModRefInfo(const LoadInst *L, const MemoryLocation &Loc, AAQueryInfo &AAQI) { - // Be conservative in the face of atomic. 
- if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered)) - return ModRefInfo::ModRef; - // If the load address doesn't alias the given address, it doesn't read // or write the specified memory. if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(L), Loc, AAQI, L); - if (AR == AliasResult::NoAlias) + if (AR == AliasResult::NoAlias) { + // Synchronization effects may affect locations that do not alias. + // FIXME: Should be isStrongerThanMonotonic(). + if (isStrongerThanUnordered(L->getOrdering())) + return getSyncEffects(this, Loc, AAQI); return ModRefInfo::NoModRef; + } } + + // Preserve the ordering requirement. + if (isStrongerThanUnordered(L->getOrdering())) + return ModRefInfo::ModRef; + // Otherwise, a load just reads. return ModRefInfo::Ref; } @@ -479,16 +507,17 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L, ModRefInfo AAResults::getModRefInfo(const StoreInst *S, const MemoryLocation &Loc, AAQueryInfo &AAQI) { - // Be conservative in the face of atomic. - if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered)) - return ModRefInfo::ModRef; - if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(S), Loc, AAQI, S); // If the store address cannot alias the pointer in question, then the // specified memory cannot be modified by the store. - if (AR == AliasResult::NoAlias) + if (AR == AliasResult::NoAlias) { + // Synchronization effects may affect locations that do not alias. + // FIXME: Should be isStrongerThanMonotonic(). + if (isStrongerThanUnordered(S->getOrdering())) + return getSyncEffects(this, Loc, AAQI); return ModRefInfo::NoModRef; + } // Examine the ModRef mask. If Mod isn't present, then return NoModRef. // This ensures that if Loc is a constant memory location, we take into @@ -498,6 +527,10 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S, return ModRefInfo::NoModRef; } + // Preserve the ordering requirement. 
+ if (isStrongerThanUnordered(S->getOrdering())) + return ModRefInfo::ModRef; + // Otherwise, a store just writes. return ModRefInfo::Mod; } @@ -515,14 +548,9 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *F, return ModRefInfo::NoModRef; } - // Apply the ModRef mask. This ensures that if Loc is a constant memory - // location, we take into account the fact that the fence definitely could - // not modify the memory location. - if (!isNoModRef(Result)) - Result &= getModRefInfoMask(Loc); - - return Result; + return Result & getSyncEffects(this, Loc, AAQI); } + return ModRefInfo::ModRef; } @@ -574,16 +602,16 @@ ModRefInfo AAResults::getModRefInfo(const CatchReturnInst *CatchRet, ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, const MemoryLocation &Loc, AAQueryInfo &AAQI) { - // Acquire/Release cmpxchg has properties that matter for arbitrary addresses. - if (isStrongerThanMonotonic(CX->getSuccessOrdering())) - return ModRefInfo::ModRef; - if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(CX), Loc, AAQI, CX); // If the cmpxchg address does not alias the location, it does not access // it. - if (AR == AliasResult::NoAlias) + if (AR == AliasResult::NoAlias) { + // Synchronization effects may affect locations that do not alias. + if (isStrongerThanMonotonic(CX->getSuccessOrdering())) + return getSyncEffects(this, Loc, AAQI); return ModRefInfo::NoModRef; + } } return ModRefInfo::ModRef; @@ -592,16 +620,16 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, const MemoryLocation &Loc, AAQueryInfo &AAQI) { - // Acquire/Release atomicrmw has properties that matter for arbitrary addresses. - if (isStrongerThanMonotonic(RMW->getOrdering())) - return ModRefInfo::ModRef; - if (Loc.Ptr) { AliasResult AR = alias(MemoryLocation::get(RMW), Loc, AAQI, RMW); // If the atomicrmw address does not alias the location, it does not access // it. 
- if (AR == AliasResult::NoAlias) + if (AR == AliasResult::NoAlias) { + // Synchronization effects may affect locations that do not alias. + if (isStrongerThanMonotonic(RMW->getOrdering())) + return getSyncEffects(this, Loc, AAQI); return ModRefInfo::NoModRef; + } } return ModRefInfo::ModRef; diff --git a/llvm/test/Analysis/BasicAA/atomics.ll b/llvm/test/Analysis/BasicAA/atomics.ll index db0417c758e92..9a96b94a34c78 100644 --- a/llvm/test/Analysis/BasicAA/atomics.ll +++ b/llvm/test/Analysis/BasicAA/atomics.ll @@ -8,29 +8,29 @@ declare noalias ptr @malloc(i64) ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 monotonic, align 4 ; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> fence release +; CHECK: NoModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: Both ModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; 
CHECK: Both ModRef: Ptr: i32* %x <-> %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %x acquire, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %6 = load atomic i32, ptr %x acquire, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> fence seq_cst +; CHECK: NoModRef: Ptr: i32* %a <-> fence seq_cst ; CHECK: Both ModRef: Ptr: i32* %x <-> fence seq_cst -; CHECK: Both ModRef: Ptr: i32* %a <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %9 = load atomic i32, ptr %x seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %9 = load atomic i32, ptr %x seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %9 = load atomic i32, ptr %x seq_cst, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x seq_cst, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x seq_cst, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x seq_cst, align 4 define void @alloca_no_escape(ptr %x) { %a = alloca i32 @@ -82,16 +82,42 @@ define void @alloca_escape_after(ptr %x) { ret void } +; CHECK-LABEL: Function: 
alloca_no_escape_aliasing: +; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %a, i32 1 monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %a, i32 0, i32 1 monotonic monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %a monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %a monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %4 = atomicrmw add ptr %a, i32 1 acq_rel, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %5 = cmpxchg ptr %a, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> %6 = load atomic i32, ptr %a acquire, align 4 +; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %a release, align 4 +define void @alloca_no_escape_aliasing() { + %a = alloca i32 + store i32 0, ptr %a + + atomicrmw add ptr %a, i32 1 monotonic + cmpxchg ptr %a, i32 0, i32 1 monotonic monotonic + load atomic i32, ptr %a monotonic, align 4 + store atomic i32 0, ptr %a monotonic, align 4 + + atomicrmw add ptr %a, i32 1 acq_rel + cmpxchg ptr %a, i32 0, i32 1 acq_rel monotonic + load atomic i32, ptr %a acquire, align 4 + store atomic i32 0, ptr %a release, align 4 + + ret void +} + ; CHECK-LABEL: Function: noalias_no_escape: -; CHECK: Both ModRef: Ptr: i32* %a <-> fence release +; CHECK: NoModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: 
Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x acquire, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 define void @noalias_no_escape(ptr noalias %a, ptr %x) { store i32 0, ptr %a @@ -125,21 +151,23 @@ define void @noalias_escape_after(ptr noalias %a, ptr %x) { load atomic i32, ptr %x acquire, align 4 store atomic i32 0, ptr %x release, align 4 + call void @escape(ptr %a) + ret void } ; CHECK-LABEL: Function: malloc_no_escape: ; CHECK: Both ModRef: Ptr: i32* %a <-> %a = call ptr @malloc(i64 4) ; CHECK: Both ModRef: Ptr: i32* %x <-> %a = call ptr @malloc(i64 4) -; CHECK: Both ModRef: Ptr: i32* %a <-> fence release +; CHECK: NoModRef: Ptr: i32* %a <-> fence release ; CHECK: Both ModRef: Ptr: i32* %x <-> fence release -; CHECK: Both ModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> %3 = load atomic i32, ptr %x acquire, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> %3 = load atomic i32, ptr %x acquire, align 4 -; CHECK: Both ModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x 
release, align 4 +; CHECK: NoModRef: Ptr: i32* %a <-> store atomic i32 0, ptr %x release, align 4 ; CHECK: Both ModRef: Ptr: i32* %x <-> store atomic i32 0, ptr %x release, align 4 define void @malloc_no_escape(ptr %x) { %a = call ptr @malloc(i64 4) diff --git a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll index 326ec8b15283d..86708ecbc58fd 100644 --- a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll +++ b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll @@ -106,7 +106,7 @@ define void @seq_cst_clobber(ptr noalias %a, ptr noalias %b) { ; If AA gets more aggressive, we can find another way. ; ; CHECK-LABEL: define void @check_aa_is_sane -define void @check_aa_is_sane(ptr noalias %a, ptr noalias %b) { +define void @check_aa_is_sane(ptr noalias %a, ptr %b) { ; CHECK: 1 = MemoryDef(liveOnEntry) ; CHECK-NEXT: cmpxchg ptr %a, i32 0, i32 1 acquire acquire cmpxchg ptr %a, i32 0, i32 1 acquire acquire diff --git a/llvm/test/Transforms/DeadStoreElimination/fence.ll b/llvm/test/Transforms/DeadStoreElimination/fence.ll index b619b0035ce03..3c02d715eb94d 100644 --- a/llvm/test/Transforms/DeadStoreElimination/fence.ll +++ b/llvm/test/Transforms/DeadStoreElimination/fence.ll @@ -54,7 +54,6 @@ define void @test2(ptr %addr.i) { ret void } -; TODO: ; We DSE stack alloc'ed and byval locations, in the presence of fences. ; Fence does not make an otherwise thread local store visible. ; Right now the DSE in presence of fence is only done in end blocks (with no successors), @@ -63,7 +62,6 @@ define void @test2(ptr %addr.i) { define void @test3(ptr byval(i32) %addr.i) { ; CHECK-LABEL: define void @test3( ; CHECK-SAME: ptr byval(i32) [[ADDR_I:%.*]]) { -; CHECK-NEXT: store i32 5, ptr [[ADDR_I]], align 4 ; CHECK-NEXT: fence release ; CHECK-NEXT: ret void ; @@ -76,13 +74,11 @@ declare void @foo(ptr nocapture %p) declare noalias ptr @malloc(i32) -; TODO: ; DSE of stores in locations allocated through library calls. 
define void @test_nocapture() { ; CHECK-LABEL: define void @test_nocapture() { ; CHECK-NEXT: [[M:%.*]] = call ptr @malloc(i32 24) ; CHECK-NEXT: call void @foo(ptr [[M]]) -; CHECK-NEXT: store i8 4, ptr [[M]], align 1 ; CHECK-NEXT: fence release ; CHECK-NEXT: ret void ; @@ -93,14 +89,10 @@ define void @test_nocapture() { ret void } - -; TODO: ; This is a full fence, but it does not make a thread local store visible. ; We can DSE the store in presence of the fence. define void @fence_seq_cst() { ; CHECK-LABEL: define void @fence_seq_cst() { -; CHECK-NEXT: [[P1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 0, ptr [[P1]], align 4 ; CHECK-NEXT: fence seq_cst ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/GVN/fence.ll b/llvm/test/Transforms/GVN/fence.ll index f2b1538843681..16c6b5143703d 100644 --- a/llvm/test/Transforms/GVN/fence.ll +++ b/llvm/test/Transforms/GVN/fence.ll @@ -37,9 +37,9 @@ define i32 @test2(ptr %addr.i) { ; ordering property (though it is that too), but a liveness ; property. We expect to eventually see the value of store by ; another thread when spinning on that location. 
-define i32 @test3(ptr noalias %addr.i, ptr noalias %otheraddr) { +define i32 @test3(ptr %addr.i) { ; CHECK-LABEL: define i32 @test3 -; CHECK-SAME: (ptr noalias [[ADDR_I:%.*]], ptr noalias [[OTHERADDR:%.*]]) { +; CHECK-SAME: (ptr [[ADDR_I:%.*]]) { ; CHECK-NEXT: fence acquire ; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[ADDR_I]], align 4 ; CHECK-NEXT: fence acquire diff --git a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll index f4a4155e94f80..fc85048ebdacf 100644 --- a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll +++ b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll @@ -28,8 +28,6 @@ define hidden void @eggs(ptr %arg, i1 %arg2, ptr %arg3, i32 %arg4, ptr %arg5) un ; CHECK-NEXT: br label %[[BB9]] ; CHECK: [[BB9]]: ; CHECK-NEXT: tail call void @quux(ptr [[ARG]], i1 [[ARG2]]) -; CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq ptr [[TMP17]], null ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/LICM/atomics.ll b/llvm/test/Transforms/LICM/atomics.ll index 2b3435ba2e7ef..1d29b038c9a53 100644 --- a/llvm/test/Transforms/LICM/atomics.ll +++ b/llvm/test/Transforms/LICM/atomics.ll @@ -239,6 +239,7 @@ define i32 @test7b(ptr nocapture noalias %x, ptr nocapture %y, ptr noalias nocap ; CHECK-LABEL: define i32 @test7b( ; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]], ptr noalias captures(none) [[Z:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 5, ptr [[X]], align 4 ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[VALA:%.*]] = load atomic i32, ptr [[Y]] monotonic, align 4 @@ -247,7 +248,6 @@ define i32 @test7b(ptr nocapture noalias %x, ptr nocapture %y, ptr noalias nocap ; CHECK: [[END]]: ; CHECK-NEXT: [[VALA_LCSSA1:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ] ; CHECK-NEXT: [[VALA_LCSSA:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ] -; CHECK-NEXT: store 
i32 5, ptr [[X]], align 4 ; CHECK-NEXT: store atomic i32 [[VALA_LCSSA1]], ptr [[Z]] unordered, align 4 ; CHECK-NEXT: ret i32 [[VALA_LCSSA]] ; @@ -266,9 +266,9 @@ end: } -define i32 @test8(ptr nocapture noalias %x, ptr nocapture %y) { +define i32 @test8(ptr nocapture %x, ptr nocapture noalias %y) { ; CHECK-LABEL: define i32 @test8( -; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) { +; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr noalias captures(none) [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: From fb69fcdb7e283185d0806c7cd30957cfbcc50aa5 Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Mon, 11 May 2026 18:03:51 +0530 Subject: [PATCH 261/538] [Flang][OpenMP] Fix COPYIN of derived types with allocatable components at -O3 (#196063) COPYIN of threadprivate derived types with allocatable components segfaults at -O3 because the OpenMP runtime zero-fills per-thread storage, leaving allocatable component descriptors with invalid metadata. This patch skips the copy on the master thread (where source and destination alias) and uses temporary_lhs assignment on worker threads so the runtime initializes descriptors before the deep copy. Assisted-by: Claude Opus 4.6 Fixes : [https://github.com/llvm/llvm-project/issues/196134](https://github.com/llvm/llvm-project/issues/196134) Minimal reproducing test-case : ``` program repro_o3_segv use omp_lib implicit none integer, parameter :: NT = 4 integer :: i, rc integer :: nThreads(NT), nThreads1(NT) type structure_1 integer, allocatable :: a(:) end type type structure_2 type(structure_1) :: struc1 end type type(structure_2), save :: struc2(2) !$omp threadprivate(struc2) rc = 0 nThreads = -999 nThreads1 = -999 ! Keep this: dynamic teams can change codegen/runtime behavior. call omp_set_dynamic(.true.)
call omp_set_num_threads(NT) allocate(struc2(1)%struc1%a(2)) struc2(1)%struc1%a(1) = 1 struc2(1)%struc1%a(2) = 2 !$omp parallel copyin(struc2) if (omp_get_thread_num() == NT-1) then ! Keep branch shape from original. ! struc2(1)%struc1%a(1) = 3 end if !$omp barrier allocate(struc2(2)%struc1%a(3)) nThreads(omp_get_thread_num()+1) = struc2(1)%struc1%a(1) struc2(2)%struc1%a(1) = omp_get_thread_num()+1 struc2(2)%struc1%a(2) = omp_get_thread_num()+2 !$omp end parallel do i = 1, NT if (nThreads(i) /= 1) rc = 1 end do struc2(2)%struc1%a(2) = -1 !$omp parallel copyin(struc2) nThreads(omp_get_thread_num()+1) = struc2(2)%struc1%a(1) nThreads1(omp_get_thread_num()+1) = struc2(2)%struc1%a(2) !$omp end parallel do i = 1, NT if (nThreads1(i) /= -1) rc = 1 end do if (rc /= 0) stop 1 deallocate(struc2(1)%struc1%a) deallocate(struc2(2)%struc1%a) end program ``` [>./flang -fopenmp -fopenmp-version=50 copyin_derrived_alloct.f90 -O3 > ./a.out Segmentation fault (core dumped) Co-authored-by: Chandra Ghale --- flang/lib/Lower/Bridge.cpp | 36 ++++++++ .../copyin-derived-allocatable-comp.f90 | 88 +++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 flang/test/Lower/OpenMP/copyin-derived-allocatable-comp.f90 diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c5709e1cd94d4..08bb9def8ad5f 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1548,6 +1548,42 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->genIfThen(loc, isAllocated) .genThen([&]() { copyData(lhs, rhs); }) .end(); + } else if (!isAllocatable && + flags.test(Fortran::semantics::Symbol::Flag::OmpCopyIn) && + hlfir::mayHaveAllocatableComponent(lhs.getType())) { + // For copyin of derived types with allocatable components where the + // variable itself is not allocatable: the threadprivate copy's + // allocatable component descriptors may be uninitialized (e.g., + // zero-filled by the OpenMP runtime). 
Use temporary_lhs semantics + // which routes through AssignTemporary at runtime. AssignTemporary + // first initializes the LHS descriptor metadata (setting + // attribute=CFI_attribute_allocatable, base_addr=nullptr for + // allocatable components via the Initialize runtime call), then + // performs the assignment with MaybeReallocate semantics for proper + // deep copy of allocatable components. + // + // On the master thread, the LHS and RHS resolve to the same + // threadprivate storage, so we must skip the temporary_lhs path + // (which would destroy the source data via Initialize before the + // copy). Use a runtime address comparison to distinguish threads. + mlir::Value lhsAddr = + fir::ConvertOp::create(*builder, loc, builder->getIndexType(), lhs); + mlir::Value rhsAddr = + fir::ConvertOp::create(*builder, loc, builder->getIndexType(), rhs); + mlir::Value sameAddr = mlir::arith::CmpIOp::create( + *builder, loc, mlir::arith::CmpIPredicate::eq, lhsAddr, rhsAddr); + builder->genIfThenElse(loc, sameAddr) + .genThen([&]() { + // Master thread: lhs and rhs are the same, nothing to do. + }) + .genElse([&]() { + hlfir::Entity r = hlfir::loadTrivialScalar(loc, *builder, rhs); + hlfir::AssignOp::create(*builder, loc, r, lhs, + /*realloc=*/false, + /*keep_lhs_length_if_realloc=*/false, + /*temporary_lhs=*/true); + }) + .end(); } else { copyData(lhs, rhs); } diff --git a/flang/test/Lower/OpenMP/copyin-derived-allocatable-comp.f90 b/flang/test/Lower/OpenMP/copyin-derived-allocatable-comp.f90 new file mode 100644 index 0000000000000..c3ca8cff569da --- /dev/null +++ b/flang/test/Lower/OpenMP/copyin-derived-allocatable-comp.f90 @@ -0,0 +1,88 @@ +! Test lowering of COPYIN clause for derived types with allocatable components. +! Threadprivate copies may have uninitialized allocatable component descriptors +! (zero-filled by the OpenMP runtime). For non-master threads, use +! temporary_lhs semantics so that AssignTemporary initializes descriptors +! 
before performing the deep copy. The master thread is skipped via a runtime +! address comparison (lhs == rhs means same threadprivate storage). + +! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPcopyin_derived_alloc_comp +subroutine copyin_derived_alloc_comp() + type inner + integer, allocatable :: a(:) + end type inner + type outer + type(inner) :: s + end type outer + type(outer), save :: x(2) + !$omp threadprivate(x) + + allocate(x(1)%s%a(3)) + x(1)%s%a = 42 + +! CHECK: omp.parallel { +! CHECK: omp.threadprivate +! CHECK: hlfir.declare +! CHECK: fir.convert {{.*}} -> index +! CHECK: fir.convert {{.*}} -> index +! CHECK: arith.cmpi eq, {{.*}} : index +! CHECK: fir.if +! CHECK: } else { +! CHECK: hlfir.assign {{.*}} temporary_lhs +! CHECK: } +! CHECK: omp.barrier + !$omp parallel copyin(x) + call sub(x(1)%s%a) + !$omp end parallel + deallocate(x(1)%s%a) +end subroutine + +! CHECK-LABEL: func.func @_QPcopyin_scalar_derived_alloc_comp +subroutine copyin_scalar_derived_alloc_comp() + type dt + integer, allocatable :: a(:) + end type dt + type(dt), save :: y + !$omp threadprivate(y) + allocate(y%a(5)) + y%a = 10 + +! CHECK: omp.parallel { +! CHECK: omp.threadprivate +! CHECK: hlfir.declare +! CHECK: fir.convert {{.*}} -> index +! CHECK: fir.convert {{.*}} -> index +! CHECK: arith.cmpi eq, {{.*}} : index +! CHECK: fir.if +! CHECK: } else { +! CHECK: hlfir.assign {{.*}} temporary_lhs +! CHECK: } +! CHECK: omp.barrier + !$omp parallel copyin(y) + call sub(y%a) + !$omp end parallel + deallocate(y%a) +end subroutine + +! Derived type WITHOUT allocatable components: plain assign, no address check. +! CHECK-LABEL: func.func @_QPcopyin_no_alloc_comp +subroutine copyin_no_alloc_comp() + type simple + integer :: val + end type simple + type(simple), save :: z + !$omp threadprivate(z) +! CHECK: omp.parallel { +! CHECK: omp.threadprivate +! CHECK: hlfir.declare +! 
CHECK-NOT: arith.cmpi +! CHECK: hlfir.assign +! CHECK-NOT: temporary_lhs +! CHECK: omp.barrier + !$omp parallel copyin(z) + call sub2(z%val) + !$omp end parallel +end subroutine + From 9ff5e1243da181f49617c2d5cecc3b978e06fae3 Mon Sep 17 00:00:00 2001 From: ShashwathiNavada Date: Mon, 11 May 2026 18:04:14 +0530 Subject: [PATCH 262/538] [Flang][Semantics] Treat host/use-associated objects as externally visible. (#192892) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a false semantic error in Flang where function result variables were incorrectly treated as externally visible in pure-definability checks. As a result, valid code assigning a pointer component of a function result (as in flang/test/Semantics/pure-function-result-pointer.f90) was rejected with “not definable in a pure subprogram.” The fix updates _FindExternallyVisibleObject_ to treat function result symbols as local, which matches Fortran semantics for function result variables. --- flang/lib/Semantics/tools.cpp | 15 +++--- .../pure-function-result-pointer.f90 | 46 +++++++++++++++++++ .../Semantics/pure-host-associated-result.f90 | 14 ++++++ 3 files changed, 68 insertions(+), 7 deletions(-) create mode 100644 flang/test/Semantics/pure-function-result-pointer.f90 create mode 100644 flang/test/Semantics/pure-host-associated-result.f90 diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index bd8002e8141b2..c1ea909f21ff1 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -331,7 +331,13 @@ const Symbol *FindExternallyVisibleObject( // TODO: Storage association with any object for which this predicate holds, // once EQUIVALENCE is supported. 
const Symbol &ultimate{GetAssociationRoot(object)}; - if (IsDummy(ultimate)) { + if (ultimate.owner().IsDerivedType()) { + return nullptr; + } else if (!IsDummy(ultimate) && + (IsUseAssociated(object, scope) || + IsHostAssociatedIntoSubprogram(object, scope))) { + return &object; + } else if (IsDummy(ultimate)) { if (IsIntentIn(ultimate)) { return &ultimate; } @@ -339,12 +345,7 @@ const Symbol *FindExternallyVisibleObject( IsPureProcedure(ultimate.owner()) && IsFunction(ultimate.owner())) { return &ultimate; } - } else if (ultimate.owner().IsDerivedType()) { - return nullptr; - } else if (&GetProgramUnitContaining(ultimate) != - &GetProgramUnitContaining(scope)) { - return &object; - } else if (const Symbol * block{FindCommonBlockContaining(ultimate)}) { + } else if (const Symbol *block{FindCommonBlockContaining(ultimate)}) { return block; } return nullptr; diff --git a/flang/test/Semantics/pure-function-result-pointer.f90 b/flang/test/Semantics/pure-function-result-pointer.f90 new file mode 100644 index 0000000000000..85e18156f906a --- /dev/null +++ b/flang/test/Semantics/pure-function-result-pointer.f90 @@ -0,0 +1,46 @@ +! 
RUN: %flang_fc1 -fsyntax-only %s + +module associated_func_call + implicit none + private + public :: type_t + public :: test_function_i + abstract interface + function test_function_i() result(passes) + implicit none + logical passes + end function + end interface + + type type_t + private + procedure(test_function_i), pointer, nopass :: test_function_ => null() + contains + generic :: operator(==) => equals + procedure, private :: equals + end type + + interface type_t + module function construct(test_function) result(test_description) + implicit none + procedure(test_function_i), intent(in), pointer :: test_function + type(type_t) test_description + end function + end interface + + interface + elemental module function equals(lhs, rhs) result(lhs_eq_rhs) + implicit none + class(type_t), intent(in) :: lhs, rhs + logical lhs_eq_rhs + end function + end interface + +contains + module procedure construct + test_description%test_function_ => test_function + end procedure + module procedure equals + lhs_eq_rhs = associated(lhs%test_function_, rhs%test_function_) + end procedure +end module associated_func_call diff --git a/flang/test/Semantics/pure-host-associated-result.f90 b/flang/test/Semantics/pure-host-associated-result.f90 new file mode 100644 index 0000000000000..3e0d8bdc6fb48 --- /dev/null +++ b/flang/test/Semantics/pure-host-associated-result.f90 @@ -0,0 +1,14 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 + +function test_func(x) result(i) + integer, pointer :: i + real :: x + x = func() +contains + pure real function func() + !ERROR: Left-hand side of assignment is not definable + !BECAUSE: 'i' is externally visible via 'i' and not definable in a pure subprogram + i = 0 + func = 0. 
+ end function +end function From 09820890585f3cbcee5419bc6beb08aa76e52af6 Mon Sep 17 00:00:00 2001 From: "forking-google-bazel-bot[bot]" <265904573+forking-google-bazel-bot[bot]@users.noreply.github.com> Date: Mon, 11 May 2026 07:48:46 -0500 Subject: [PATCH 263/538] [Bazel] Fixes 34502b0 (#196930) This fixes 34502b0c7e076e658bd176030223029cd4402941. Co-authored-by: Google Bazel Bot --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 63cc42179055a..456cf1e06f539 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5681,6 +5681,7 @@ cc_library( hdrs = ["include/mlir/Dialect/GPU/Pipelines/Passes.h"], includes = ["include"], deps = [ + ":AMDGPUToROCDL", ":AffineToStandard", ":ArithToLLVM", ":ConversionPasses", @@ -5689,6 +5690,7 @@ cc_library( ":GPUDialect", ":GPUToGPURuntimeTransforms", ":GPUToNVVMTransforms", + ":GPUToROCDLTransforms", ":GPUTransforms", ":IndexToLLVM", ":LLVMDialect", From f325d13bfdd084122bb74094f2c99177a1ad5b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 11 May 2026 15:00:08 +0200 Subject: [PATCH 264/538] [LV] Use isLegalMaskedLoadOrStore for interleaved accesses too (NFC) (#195243) isLegalMaskedLoadOrStore is now the central place for querying target capabilities for masked accesses. Access pattern legality checks are hoisted outside of it. 
--- .../Transforms/Vectorize/LoopVectorizationPlanner.cpp | 4 ---- .../Transforms/Vectorize/LoopVectorizationPlanner.h | 3 ++- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++------ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp index f29834d2f804e..33e3187d2d336 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp @@ -64,14 +64,10 @@ static cl::opt ForceTargetSupportsMaskedMemoryOps( bool VFSelectionContext::isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const { assert(isa(I) || isa(I)); - auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); const unsigned AS = getLoadStoreAddressSpace(I); const Align Alignment = getLoadStoreAlignment(I); - if (!Legal->isConsecutivePtr(Ty, Ptr)) - return false; - return ForceTargetSupportsMaskedMemoryOps || (isa(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS) : TTI.isLegalMaskedStore(Ty, Alignment, AS)); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 00b689326d770..565d05d234666 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -670,7 +670,8 @@ class VFSelectionContext { bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const; /// Returns true if the target machine supports masked loads or stores - /// for \p I's data type and alignment. + /// for \p I's data type and alignment. The caller must ensure the access is + /// consecutive or part of an interleave group. 
bool isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const; /// Returns true if the target machine can represent \p V as a masked gather diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1ace2275e2b6d..61c2d3cd228ec 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2372,7 +2372,9 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, return getCallWideningDecision(cast(I), VF).Kind == CM_Scalarize; case Instruction::Load: case Instruction::Store: { - return !Config.isLegalMaskedLoadOrStore(I, VF) && + bool IsConsecutive = Legal->isConsecutivePtr(getLoadStoreType(I), + getLoadStorePointerOperand(I)); + return !(IsConsecutive && Config.isLegalMaskedLoadOrStore(I, VF)) && !Config.isLegalGatherOrScatter(I, VF); } case Instruction::UDiv: @@ -2593,11 +2595,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (VF.isScalable() && NeedsMaskForGaps) return false; - auto *Ty = getLoadStoreType(I); - const Align Alignment = getLoadStoreAlignment(I); - unsigned AS = getLoadStoreAddressSpace(I); - return isa(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment, AS) - : TTI.isLegalMaskedStore(Ty, Alignment, AS); + return Config.isLegalMaskedLoadOrStore(I, VF); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( From b4e8f5911221dcac405cfcd4ac2cffadc0c4ddda Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Mon, 11 May 2026 15:02:41 +0200 Subject: [PATCH 265/538] [mlir][SPIR-V] Lower math.{exp2,log2,log10} operations (#196723) --- .../Conversion/MathToSPIRV/MathToSPIRV.cpp | 5 +++-- .../MathToSPIRV/math-to-opencl-spirv.mlir | 20 ++++++++----------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp index 8d850e01d5e62..ce603d4a85072 100644 --- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp +++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp @@ -562,8 +562,6 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter, // OpenCL patterns patterns.add< Log1pOpPattern, ExpM1OpPattern, - Log2Log10OpPattern, - Log2Log10OpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, @@ -573,9 +571,12 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, PowIOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir index 037f69e63a2dc..1b7d63fb38d36 100644 --- a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir +++ b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir @@ -20,14 +20,12 @@ func.func @float32_unary_scalar(%arg0: f32) { // CHECK: 
%[[ADDONE:.+]] = spirv.FAdd %[[ONE]], %{{.+}} // CHECK: spirv.CL.log %[[ADDONE]] %5 = math.log1p %arg0 : f32 - // CHECK: %[[LOG2_RECIPROCAL:.+]] = spirv.Constant 1.44269502 : f32 - // CHECK: %[[LOG0:.+]] = spirv.CL.log {{.+}} - // CHECK: spirv.FMul %[[LOG0]], %[[LOG2_RECIPROCAL]] + // CHECK: spirv.CL.log2 %{{.*}}: f32 %6 = math.log2 %arg0 : f32 - // CHECK: %[[LOG10_RECIPROCAL:.+]] = spirv.Constant 0.434294492 : f32 - // CHECK: %[[LOG1:.+]] = spirv.CL.log {{.+}} - // CHECK: spirv.FMul %[[LOG1]], %[[LOG10_RECIPROCAL]] + // CHECK: spirv.CL.log10 %{{.*}}: f32 %7 = math.log10 %arg0 : f32 + // CHECK: spirv.CL.exp2 %{{.*}}: f32 + %exp2_scalar = math.exp2 %arg0 : f32 // CHECK: spirv.CL.rint %{{.*}}: f32 %8 = math.roundeven %arg0 : f32 // CHECK: spirv.CL.rsqrt %{{.*}}: f32 @@ -85,14 +83,12 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) { // CHECK: %[[ADDONE:.+]] = spirv.FAdd %[[ONE]], %{{.+}} // CHECK: spirv.CL.log %[[ADDONE]] %5 = math.log1p %arg0 : vector<3xf32> - // CHECK: %[[LOG2_RECIPROCAL:.+]] = spirv.Constant dense<1.44269502> : vector<3xf32> - // CHECK: %[[LOG0:.+]] = spirv.CL.log {{.+}} - // CHECK: spirv.FMul %[[LOG0]], %[[LOG2_RECIPROCAL]] + // CHECK: spirv.CL.log2 %{{.*}}: vector<3xf32> %6 = math.log2 %arg0 : vector<3xf32> - // CHECK: %[[LOG10_RECIPROCAL:.+]] = spirv.Constant dense<0.434294492> : vector<3xf32> - // CHECK: %[[LOG1:.+]] = spirv.CL.log {{.+}} - // CHECK: spirv.FMul %[[LOG1]], %[[LOG10_RECIPROCAL]] + // CHECK: spirv.CL.log10 %{{.*}}: vector<3xf32> %7 = math.log10 %arg0 : vector<3xf32> + // CHECK: spirv.CL.exp2 %{{.*}}: vector<3xf32> + %exp2_vec = math.exp2 %arg0 : vector<3xf32> // CHECK: spirv.CL.rint %{{.*}}: vector<3xf32> %8 = math.roundeven %arg0 : vector<3xf32> // CHECK: spirv.CL.rsqrt %{{.*}}: vector<3xf32> From 0aa46198a57f689335de27d8d883ba32a808f347 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 09:18:41 -0400 Subject: [PATCH 266/538] [gn] port 07b5dfe9473c6 + deps (LLVMABI dep in clang) (#196944) Also adds build 
files for llvm/lib/ABI, which was dead code before 07b5dfe9473c6 (at least in the GN build). --- .../utils/gn/secondary/clang/lib/CodeGen/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/lib/ABI/BUILD.gn | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 llvm/utils/gn/secondary/llvm/lib/ABI/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn index 80f23019b2abd..f3112ec773fc4 100644 --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -14,6 +14,7 @@ static_library("CodeGen") { "//clang/lib/Basic", "//clang/lib/Frontend", "//clang/lib/Lex", + "//llvm/lib/ABI", "//llvm/lib/Analysis", "//llvm/lib/Bitcode/Reader", "//llvm/lib/CodeGen", diff --git a/llvm/utils/gn/secondary/llvm/lib/ABI/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ABI/BUILD.gn new file mode 100644 index 0000000000000..3d8e817a94bc7 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/ABI/BUILD.gn @@ -0,0 +1,15 @@ +static_library("ABI") { + output_name = "LLVMABI" + deps = [ + "//llvm/include/llvm/Config:config", + "//llvm/lib/IR", + "//llvm/lib/Support", + ] + sources = [ + "Types.cpp", + "FunctionInfo.cpp", + "TargetInfo.cpp", + "IRTypeMapper.cpp", + "Targets/BPF.cpp", + ] +} From 9a3193b1a0e487e54c5649b5fa4456249f651d9a Mon Sep 17 00:00:00 2001 From: David Pagan Date: Mon, 11 May 2026 06:18:44 -0700 Subject: [PATCH 267/538] [clang][OpenMP 6.0][CodeGen] Codegen for declare_target 'local' clause (#196431) Implement code generation for the OpenMP 6.0 declare_target 'local' clause, which creates device-only variables with per-device static storage. A 'local' variable exists in the device image with its static initializer and is always accessed directly by device code. This is the same as 'to'/'enter' without unified shared memory, except that no offload entry is registered. Using 'device_type(nohost)' with 'local' is not yet supported. 
Sema generates a warning and converts it to 'device_type(any)'. Testing: - Updated tests: clang/test/OpenMP/declare_target_messages.cpp clang/test/OpenMP/declare_target_ast_print.cpp - New tests: clang/test/OpenMP/declare_target_local_codegen.cpp clang/test/OpenMP/declare_target_local_usm_codegen.cpp offload/test/offloading/declare_target_local.cpp --- clang/docs/OpenMPSupport.rst | 3 +- clang/docs/ReleaseNotes.rst | 2 + .../clang/Basic/DiagnosticSemaKinds.td | 9 +- clang/lib/CodeGen/CGExpr.cpp | 19 +- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 26 +- clang/lib/CodeGen/CodeGenModule.cpp | 11 +- clang/lib/Sema/SemaOpenMP.cpp | 15 +- .../test/OpenMP/declare_target_ast_print.cpp | 12 +- .../OpenMP/declare_target_local_codegen.cpp | 430 ++++++++++++++++++ .../declare_target_local_usm_codegen.cpp | 52 +++ clang/test/OpenMP/declare_target_messages.cpp | 9 +- .../test/offloading/declare_target_local.cpp | 40 ++ 12 files changed, 581 insertions(+), 47 deletions(-) create mode 100644 clang/test/OpenMP/declare_target_local_codegen.cpp create mode 100644 clang/test/OpenMP/declare_target_local_usm_codegen.cpp create mode 100644 offload/test/offloading/declare_target_local.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 962fc717bc496..efe9726137625 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -595,7 +595,8 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Message and severity clauses | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/146093 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Local clause on declare target | :part:`In Progress` | :none:`unclaimed` | | +| Local clause on declare target | :good:`done` | :none:`unclaimed` | clang Parse/Sema: https://github.com/llvm/llvm-project/pull/186281 | +| | | | clang Codegen : https://github.com/llvm/llvm-project/pull/196431 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | groupprivate directive | :part:`In Progress` | :part:`partial` | Flang: kparzysz, mjklemm | | | | | | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 40b97b4c8bf4b..53f3d42270f1a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -810,6 +810,8 @@ OpenMP Support - Added support for ``transparent`` clause in task and taskloop directives. - Added support for ``use_device_ptr`` clause to accept an optional ``fallback`` modifier (``fb_nullify`` or ``fb_preserve``) with OpenMP >= 61. +- Added support for ``local`` clause with declare_target directive when + OpenMP >= 60. 
Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 879812f3de0d3..f687e759d1267 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12179,6 +12179,10 @@ def err_omp_declare_target_var_in_both_clauses def err_omp_declare_target_local_host_only : Error<"'local' clause is incompatible with 'device_type(host)'; " "local variables exist only on the device">; +def warn_omp_declare_target_local_nohost + : Warning<"'device_type(nohost)' is not yet supported with 'local' clause; " + "treating as 'device_type(any)'">, + InGroup; def warn_omp_not_in_target_context : Warning< "declaration is not declared in any declare target region">, InGroup; @@ -12601,11 +12605,6 @@ def err_omp_declare_target_has_local_vars : Error< def warn_omp_declare_target_after_first_use : Warning< "declaration marked as declare target after first use, it may lead to incorrect results">, InGroup; -def warn_omp_declare_target_local_not_implemented - : Warning<"'local' clause on 'declare_target' directive is not yet fully " - "implemented; " - "variable will be treated as 'enter'">, - InGroup; def err_omp_declare_variant_incompat_attributes : Error< "'#pragma omp declare variant' is not compatible with any target-specific attributes">; def warn_omp_declare_variant_score_not_constant diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 9107553652688..5764b59e538ae 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -3360,19 +3360,18 @@ static Address emitDeclTargetVarDeclLValue(CodeGenFunction &CGF, const VarDecl *VD, QualType T) { std::optional Res = OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD); - // Return an invalid address if variable is MT_To (or MT_Enter starting with - // OpenMP 5.2, or MT_Local in OpenMP 6.0) and unified memory is not enabled. 
- // For all other cases: MT_Link and MT_To (or MT_Enter/MT_Local) with unified - // memory, return a valid address. - if (!Res || ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && - !CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) + // Always return an invalid address for MT_Local, and also for + // MT_To/MT_Enter when unified memory is not enabled. These use direct + // access (global exists in device image). Otherwise, return a valid + // address. + if (!Res || *Res == OMPDeclareTargetDeclAttr::MT_Local || + ((*Res == OMPDeclareTargetDeclAttr::MT_To || + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && + !CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) return Address::invalid(); assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) || ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) && "Expected link clause OR to clause with unified memory enabled."); QualType PtrTy = CGF.getContext().getPointerType(VD->getType()); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a99a257c14a2a..7cdc206aea0c4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1529,12 +1529,14 @@ convertCaptureClause(const VarDecl *VD) { return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo; break; case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Enter: - case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Local: return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter; - break; case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Link: return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink; break; + case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Local: + // MT_Local 
variables don't need offload entry (device-local). + llvm_unreachable("MT_Local should not reach convertCaptureClause"); + break; default: return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryNone; break; @@ -7983,8 +7985,7 @@ class MappableExprsHandler { OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) { if ((*Res == OMPDeclareTargetDeclAttr::MT_Link) || ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) { RequiresReference = true; BP = CGF.CGM.getOpenMPRuntime().getAddrOfDeclareTargetVar(VD); @@ -11326,8 +11327,7 @@ bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) { cast(GD.getDecl())); if (!Res || *Res == OMPDeclareTargetDeclAttr::MT_Link || ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && HasRequiresUnifiedSharedMemory)) { DeferredGlobalVariables.insert(cast(GD.getDecl())); return true; @@ -11350,6 +11350,11 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD, VD->hasExternalStorage()) return; + // MT_Local variables use direct access with no host-device mapping. + // No offload entry needed — the device global keeps its own initializer. 
+ if (Res && *Res == OMPDeclareTargetDeclAttr::MT_Local) + return; + if (!Res) { if (CGM.getLangOpts().OpenMPIsTargetDevice) { // Register non-target variables being emitted in device code (debug info @@ -11396,10 +11401,11 @@ void CGOpenMPRuntime::emitDeferredTargetDecls() const { OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD); if (!Res) continue; - if ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && - !HasRequiresUnifiedSharedMemory) { + // MT_Local and MT_To/MT_Enter without USM are always emitted. + if (*Res == OMPDeclareTargetDeclAttr::MT_Local || + ((*Res == OMPDeclareTargetDeclAttr::MT_To || + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && + !HasRequiresUnifiedSharedMemory)) { CGM.EmitGlobal(VD); } else { assert((*Res == OMPDeclareTargetDeclAttr::MT_Link || diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 2d91b7eaa52dc..72a1f771962b1 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4491,16 +4491,15 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { bool UnifiedMemoryEnabled = getOpenMPRuntime().hasRequiresUnifiedSharedMemory(); - if ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && - !UnifiedMemoryEnabled) { + if (*Res == OMPDeclareTargetDeclAttr::MT_Local || + ((*Res == OMPDeclareTargetDeclAttr::MT_To || + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && + !UnifiedMemoryEnabled)) { (void)GetAddrOfGlobalVar(VD); } else { assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) || ((*Res == OMPDeclareTargetDeclAttr::MT_To || - *Res == OMPDeclareTargetDeclAttr::MT_Enter || - *Res == OMPDeclareTargetDeclAttr::MT_Local) && + *Res == OMPDeclareTargetDeclAttr::MT_Enter) && UnifiedMemoryEnabled)) && "Link clause or to clause with unified memory expected."); 
(void)getOpenMPRuntime().getAddrOfDeclareTargetVar(VD); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 53ded7a5e177e..d6f6bc919a31b 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -24859,13 +24859,18 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetName( if (!IndirectE) IsIndirect = true; } - // FIXME: 'local' clause is not yet implemented in CodeGen. For now, it is - // treated as 'enter'. For host compilation, 'local' is a no-op. + // FIXME: 'local' with 'device_type(nohost)' is not yet fully supported + // in codegen. Treat as 'device_type(any)' for now. The variable will + // exist on both host and device, but the host copy is unused. + auto DT = DTCI.DT; if (MT == OMPDeclareTargetDeclAttr::MT_Local && - getLangOpts().OpenMPIsTargetDevice) - Diag(Loc, diag::warn_omp_declare_target_local_not_implemented); + DT == OMPDeclareTargetDeclAttr::DT_NoHost) { + Diag(Loc, diag::warn_omp_declare_target_local_nohost); + DT = OMPDeclareTargetDeclAttr::DT_Any; + } + auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( - getASTContext(), MT, DTCI.DT, IndirectE, IsIndirect, Level, + getASTContext(), MT, DT, IndirectE, IsIndirect, Level, SourceRange(Loc, Loc)); ND->addAttr(A); if (ASTMutationListener *ML = getASTContext().getASTMutationListener()) diff --git a/clang/test/OpenMP/declare_target_ast_print.cpp b/clang/test/OpenMP/declare_target_ast_print.cpp index 3ebe261cf79f0..7b63c15dd455e 100644 --- a/clang/test/OpenMP/declare_target_ast_print.cpp +++ b/clang/test/OpenMP/declare_target_ast_print.cpp @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 // RUN: %clang_cc1 -verify -fopenmp -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52 -// RUN: 
%clang_cc1 -verify -fopenmp -fopenmp-version=60 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP60 +// RUN: %clang_cc1 -verify=omp60 -fopenmp -fopenmp-version=60 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP60 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50 @@ -133,18 +133,18 @@ int l1; // OMP60: #pragma omp end declare target int l2; -#pragma omp declare target device_type(nohost) local(l2) -// OMP60: #pragma omp declare target device_type(nohost) local +#pragma omp declare target device_type(nohost) local(l2) // omp60-warning {{'device_type(nohost)' is not yet supported with 'local' clause; treating as 'device_type(any)'}} +// OMP60: #pragma omp declare target local // OMP60: int l2; // OMP60: #pragma omp end declare target int l3; int a = 0; -#pragma omp declare target local(l3) device_type(nohost) local(a) -// OMP60: #pragma omp declare target device_type(nohost) local +#pragma omp declare target local(l3) device_type(nohost) local(a) // omp60-warning 2 {{'device_type(nohost)' is not yet supported with 'local' clause; treating as 'device_type(any)'}} +// OMP60: #pragma omp declare target local // OMP60: int l3; // OMP60: #pragma omp end declare target -// OMP60: #pragma omp declare target device_type(nohost) local +// OMP60: #pragma omp declare target local // OMP60: int a = 0; // OMP60: #pragma omp end declare target diff --git a/clang/test/OpenMP/declare_target_local_codegen.cpp b/clang/test/OpenMP/declare_target_local_codegen.cpp new file mode 100644 index 0000000000000..b82e8b3bba9ff --- /dev/null +++ b/clang/test/OpenMP/declare_target_local_codegen.cpp @@ -0,0 +1,430 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: 
--function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// RUN: %clang_cc1 -verify=omp60 -fopenmp -fopenmp-version=60 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=HOST +// RUN: %clang_cc1 -verify=omp60 -fopenmp -fopenmp-version=60 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -verify=omp60 -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -o - | FileCheck %s --check-prefix=DEVICE +// RUN: %clang_cc1 -verify=omp60 -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t +// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - | FileCheck %s --check-prefix=DEVICE + +#ifndef HEADER +#define HEADER + +// --------------------------------------------------------------------------- +// Explicit local clause (default device_type is 'any') +// --------------------------------------------------------------------------- +int local_scalar; +#pragma omp declare target local(local_scalar) + +int local_array[64]; +#pragma omp declare target local(local_array) + +// --------------------------------------------------------------------------- +// local + device_type(nohost) +// --------------------------------------------------------------------------- +int local_nohost_var; +#pragma omp declare target local(local_nohost_var) device_type(nohost) // omp60-warning {{'device_type(nohost)' is not yet supported with 'local' clause; 
treating as 'device_type(any)'}} + +double local_nohost_arr[32]; +#pragma omp declare target local(local_nohost_arr) device_type(nohost) // omp60-warning {{'device_type(nohost)' is not yet supported with 'local' clause; treating as 'device_type(any)'}} + +// --------------------------------------------------------------------------- +// Template with local variable +// --------------------------------------------------------------------------- +template +struct LocalStorage { + static T value; +}; + +template +T LocalStorage::value; + +#pragma omp declare target local(LocalStorage::value) +#pragma omp declare target local(LocalStorage::value) + +#pragma omp begin declare target +template +T read_local_storage() { + return LocalStorage::value; +} +#pragma omp end declare target + +// --------------------------------------------------------------------------- +// Non-template static data member with local +// --------------------------------------------------------------------------- +struct PlainStruct { + static int s_member; +}; +int PlainStruct::s_member; +#pragma omp declare target local(PlainStruct::s_member) + +// --------------------------------------------------------------------------- +// Initialized local variable +// --------------------------------------------------------------------------- +int local_init_var = 42; +#pragma omp declare target local(local_init_var) + +// --------------------------------------------------------------------------- +// Use local variables in a target region +// --------------------------------------------------------------------------- +int use_local_vars() { + int result = 0; + #pragma omp target map(from: result) + { + local_scalar = 42; + local_array[0] = 1; + LocalStorage::value = 100; + result = local_scalar + local_array[0] + + read_local_storage(); + } + return result; +} + +// --------------------------------------------------------------------------- +// Use nohost local variables in a target region +// 
--------------------------------------------------------------------------- +int use_nohost_local_vars() { + int result = 0; + #pragma omp target map(from: result) + { + local_nohost_var = 7; + result = local_nohost_var; + } + return result; +} + +// --------------------------------------------------------------------------- +// Use static data member, initialized var, and static local in target region +// --------------------------------------------------------------------------- +int use_new_local_vars() { + int result = 0; + #pragma omp target map(from: result) + { + PlainStruct::s_member = 55; + local_init_var = 77; + result = PlainStruct::s_member + local_init_var; + } + return result; +} + +#endif +// HOST-LABEL: define {{[^@]+}}@_Z14use_local_varsv +// HOST-SAME: () #[[ATTR0:[0-9]+]] { +// HOST-NEXT: entry: +// HOST-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +// HOST-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// HOST-NEXT: store i32 0, ptr [[RESULT]], align 4 +// HOST-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// HOST-NEXT: store ptr [[RESULT]], ptr [[TMP0]], align 8 +// HOST-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// HOST-NEXT: store ptr [[RESULT]], ptr [[TMP1]], align 8 +// HOST-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// HOST-NEXT: store ptr null, ptr [[TMP2]], align 8 +// HOST-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// HOST-NEXT: store ptr null, ptr [[TMP3]], align 8 +// HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// HOST-NEXT: store ptr 
null, ptr [[TMP4]], align 8 +// HOST-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// HOST-NEXT: store ptr null, ptr [[TMP5]], align 8 +// HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// HOST-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// HOST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// HOST-NEXT: store i32 4, ptr [[TMP8]], align 4 +// HOST-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// HOST-NEXT: store i32 2, ptr [[TMP9]], align 4 +// HOST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// HOST-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// HOST-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// HOST-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 +// HOST-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// HOST-NEXT: store ptr @.offload_sizes, ptr [[TMP12]], align 8 +// HOST-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// HOST-NEXT: store ptr @.offload_maptypes, ptr [[TMP13]], align 8 +// HOST-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// HOST-NEXT: store ptr null, ptr [[TMP14]], align 8 +// HOST-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// HOST-NEXT: store ptr null, ptr [[TMP15]], align 8 +// HOST-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// 
HOST-NEXT: store i64 0, ptr [[TMP16]], align 8 +// HOST-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// HOST-NEXT: store i64 0, ptr [[TMP17]], align 8 +// HOST-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// HOST-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4 +// HOST-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// HOST-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4 +// HOST-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// HOST-NEXT: store i32 0, ptr [[TMP20]], align 4 +// HOST-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z14use_local_varsv_l70.region_id, ptr [[KERNEL_ARGS]]) +// HOST-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// HOST-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// HOST: omp_offload.failed: +// HOST-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z14use_local_varsv_l70(ptr [[RESULT]], ptr null) #[[ATTR2:[0-9]+]] +// HOST-NEXT: br label [[OMP_OFFLOAD_CONT]] +// HOST: omp_offload.cont: +// HOST-NEXT: [[TMP23:%.*]] = load i32, ptr [[RESULT]], align 4 +// HOST-NEXT: ret i32 [[TMP23]] +// +// +// HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z14use_local_varsv_l70 +// HOST-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]], ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +// HOST-NEXT: entry: +// HOST-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR]], align 8 +// HOST-NEXT: store ptr [[DYN_PTR]], ptr 
[[DYN_PTR_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR]], align 8, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// HOST-NEXT: store i32 42, ptr @local_scalar, align 4 +// HOST-NEXT: store i32 1, ptr @local_array, align 4 +// HOST-NEXT: store i32 100, ptr @_ZN12LocalStorageIiE5valueE, align 4 +// HOST-NEXT: [[TMP1:%.*]] = load i32, ptr @local_scalar, align 4 +// HOST-NEXT: [[TMP2:%.*]] = load i32, ptr @local_array, align 4 +// HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// HOST-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z18read_local_storageIiET_v() +// HOST-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[CALL]] +// HOST-NEXT: store i32 [[ADD1]], ptr [[TMP0]], align 4 +// HOST-NEXT: ret void +// +// +// HOST-LABEL: define {{[^@]+}}@_Z18read_local_storageIiET_v +// HOST-SAME: () #[[ATTR0]] comdat { +// HOST-NEXT: entry: +// HOST-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN12LocalStorageIiE5valueE, align 4 +// HOST-NEXT: ret i32 [[TMP0]] +// +// +// HOST-LABEL: define {{[^@]+}}@_Z21use_nohost_local_varsv +// HOST-SAME: () #[[ATTR0]] { +// HOST-NEXT: entry: +// HOST-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +// HOST-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// HOST-NEXT: store i32 0, ptr [[RESULT]], align 4 +// HOST-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// HOST-NEXT: store ptr [[RESULT]], ptr [[TMP0]], align 8 +// HOST-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// HOST-NEXT: store ptr [[RESULT]], ptr [[TMP1]], align 8 +// HOST-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// HOST-NEXT: store ptr null, ptr [[TMP2]], 
align 8 +// HOST-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// HOST-NEXT: store ptr null, ptr [[TMP3]], align 8 +// HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// HOST-NEXT: store ptr null, ptr [[TMP4]], align 8 +// HOST-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// HOST-NEXT: store ptr null, ptr [[TMP5]], align 8 +// HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// HOST-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// HOST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// HOST-NEXT: store i32 4, ptr [[TMP8]], align 4 +// HOST-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// HOST-NEXT: store i32 2, ptr [[TMP9]], align 4 +// HOST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// HOST-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// HOST-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// HOST-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 +// HOST-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// HOST-NEXT: store ptr @.offload_sizes.1, ptr [[TMP12]], align 8 +// HOST-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// HOST-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP13]], align 8 +// HOST-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// HOST-NEXT: store ptr null, ptr [[TMP14]], align 8 +// HOST-NEXT: 
[[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// HOST-NEXT: store ptr null, ptr [[TMP15]], align 8 +// HOST-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// HOST-NEXT: store i64 0, ptr [[TMP16]], align 8 +// HOST-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// HOST-NEXT: store i64 0, ptr [[TMP17]], align 8 +// HOST-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// HOST-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4 +// HOST-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// HOST-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4 +// HOST-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// HOST-NEXT: store i32 0, ptr [[TMP20]], align 4 +// HOST-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21use_nohost_local_varsv_l86.region_id, ptr [[KERNEL_ARGS]]) +// HOST-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// HOST-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// HOST: omp_offload.failed: +// HOST-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21use_nohost_local_varsv_l86(ptr [[RESULT]], ptr null) #[[ATTR2]] +// HOST-NEXT: br label [[OMP_OFFLOAD_CONT]] +// HOST: omp_offload.cont: +// HOST-NEXT: [[TMP23:%.*]] = load i32, ptr [[RESULT]], align 4 +// HOST-NEXT: ret i32 [[TMP23]] +// +// +// HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21use_nohost_local_varsv_l86 +// HOST-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]], 
ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR1]] { +// HOST-NEXT: entry: +// HOST-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR]], align 8 +// HOST-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// HOST-NEXT: store i32 7, ptr @local_nohost_var, align 4 +// HOST-NEXT: [[TMP1:%.*]] = load i32, ptr @local_nohost_var, align 4 +// HOST-NEXT: store i32 [[TMP1]], ptr [[TMP0]], align 4 +// HOST-NEXT: ret void +// +// +// HOST-LABEL: define {{[^@]+}}@_Z18use_new_local_varsv +// HOST-SAME: () #[[ATTR0]] { +// HOST-NEXT: entry: +// HOST-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +// HOST-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x ptr], align 8 +// HOST-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// HOST-NEXT: store i32 0, ptr [[RESULT]], align 4 +// HOST-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// HOST-NEXT: store ptr [[RESULT]], ptr [[TMP0]], align 8 +// HOST-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// HOST-NEXT: store ptr [[RESULT]], ptr [[TMP1]], align 8 +// HOST-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// HOST-NEXT: store ptr null, ptr [[TMP2]], align 8 +// HOST-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// HOST-NEXT: store ptr null, ptr [[TMP3]], align 8 +// HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// HOST-NEXT: store ptr null, ptr [[TMP4]], align 8 +// HOST-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// HOST-NEXT: store ptr null, ptr [[TMP5]], align 8 +// HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// HOST-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// HOST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// HOST-NEXT: store i32 4, ptr [[TMP8]], align 4 +// HOST-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// HOST-NEXT: store i32 2, ptr [[TMP9]], align 4 +// HOST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// HOST-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// HOST-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// HOST-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 +// HOST-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// HOST-NEXT: store ptr @.offload_sizes.3, ptr [[TMP12]], align 8 +// HOST-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// HOST-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP13]], align 8 +// HOST-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// HOST-NEXT: store ptr null, ptr [[TMP14]], align 8 +// HOST-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// HOST-NEXT: store ptr null, ptr [[TMP15]], align 8 +// HOST-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// HOST-NEXT: store i64 0, ptr [[TMP16]], align 8 +// 
HOST-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// HOST-NEXT: store i64 0, ptr [[TMP17]], align 8 +// HOST-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// HOST-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4 +// HOST-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// HOST-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4 +// HOST-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// HOST-NEXT: store i32 0, ptr [[TMP20]], align 4 +// HOST-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z18use_new_local_varsv_l99.region_id, ptr [[KERNEL_ARGS]]) +// HOST-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// HOST-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// HOST: omp_offload.failed: +// HOST-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z18use_new_local_varsv_l99(ptr [[RESULT]], ptr null) #[[ATTR2]] +// HOST-NEXT: br label [[OMP_OFFLOAD_CONT]] +// HOST: omp_offload.cont: +// HOST-NEXT: [[TMP23:%.*]] = load i32, ptr [[RESULT]], align 4 +// HOST-NEXT: ret i32 [[TMP23]] +// +// +// HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z18use_new_local_varsv_l99 +// HOST-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]], ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR1]] { +// HOST-NEXT: entry: +// HOST-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR]], align 8 +// HOST-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = load 
ptr, ptr [[RESULT_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// HOST-NEXT: store i32 55, ptr @_ZN11PlainStruct8s_memberE, align 4 +// HOST-NEXT: store i32 77, ptr @local_init_var, align 4 +// HOST-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN11PlainStruct8s_memberE, align 4 +// HOST-NEXT: [[TMP2:%.*]] = load i32, ptr @local_init_var, align 4 +// HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +// HOST-NEXT: store i32 [[ADD]], ptr [[TMP0]], align 4 +// HOST-NEXT: ret void +// +// +// DEVICE-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z14use_local_varsv_l70 +// DEVICE-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]], ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// DEVICE-NEXT: entry: +// DEVICE-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// DEVICE-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// DEVICE-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr +// DEVICE-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// DEVICE-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8 +// DEVICE-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// DEVICE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// DEVICE-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z14use_local_varsv_l70_kernel_environment to ptr), ptr [[DYN_PTR]]) +// DEVICE-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// DEVICE-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// DEVICE: user_code.entry: +// DEVICE-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(1) @local_scalar to ptr), align 4 +// DEVICE-NEXT: store i32 1, ptr addrspacecast (ptr addrspace(1) @local_array to ptr), align 4 +// 
DEVICE-NEXT: store i32 100, ptr addrspacecast (ptr addrspace(1) @_ZN12LocalStorageIiE5valueE to ptr), align 4 +// DEVICE-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @local_scalar to ptr), align 4 +// DEVICE-NEXT: [[TMP3:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @local_array to ptr), align 4 +// DEVICE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +// DEVICE-NEXT: [[CALL:%.*]] = call noundef i32 @_Z18read_local_storageIiET_v() #[[ATTR2:[0-9]+]] +// DEVICE-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[CALL]] +// DEVICE-NEXT: store i32 [[ADD1]], ptr [[TMP0]], align 4 +// DEVICE-NEXT: call void @__kmpc_target_deinit() +// DEVICE-NEXT: ret void +// DEVICE: worker.exit: +// DEVICE-NEXT: ret void +// +// +// DEVICE-LABEL: define {{[^@]+}}@_Z18read_local_storageIiET_v +// DEVICE-SAME: () #[[ATTR1:[0-9]+]] comdat { +// DEVICE-NEXT: entry: +// DEVICE-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @_ZN12LocalStorageIiE5valueE to ptr), align 4 +// DEVICE-NEXT: ret i32 [[TMP0]] +// +// +// DEVICE-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21use_nohost_local_varsv_l86 +// DEVICE-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]], ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] { +// DEVICE-NEXT: entry: +// DEVICE-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// DEVICE-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// DEVICE-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr +// DEVICE-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// DEVICE-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8 +// DEVICE-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// DEVICE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]] +// DEVICE-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr 
addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21use_nohost_local_varsv_l86_kernel_environment to ptr), ptr [[DYN_PTR]]) +// DEVICE-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// DEVICE-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// DEVICE: user_code.entry: +// DEVICE-NEXT: store i32 7, ptr addrspacecast (ptr addrspace(1) @local_nohost_var to ptr), align 4 +// DEVICE-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @local_nohost_var to ptr), align 4 +// DEVICE-NEXT: store i32 [[TMP2]], ptr [[TMP0]], align 4 +// DEVICE-NEXT: call void @__kmpc_target_deinit() +// DEVICE-NEXT: ret void +// DEVICE: worker.exit: +// DEVICE-NEXT: ret void +// +// +// DEVICE-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z18use_new_local_varsv_l99 +// DEVICE-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]], ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] { +// DEVICE-NEXT: entry: +// DEVICE-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// DEVICE-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// DEVICE-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr +// DEVICE-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// DEVICE-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8 +// DEVICE-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// DEVICE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]] +// DEVICE-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z18use_new_local_varsv_l99_kernel_environment to ptr), ptr [[DYN_PTR]]) +// DEVICE-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// DEVICE-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[WORKER_EXIT:%.*]] +// DEVICE: user_code.entry: +// DEVICE-NEXT: store i32 55, ptr addrspacecast (ptr addrspace(1) @_ZN11PlainStruct8s_memberE to ptr), align 4 +// DEVICE-NEXT: store i32 77, ptr addrspacecast (ptr addrspace(1) @local_init_var to ptr), align 4 +// DEVICE-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @_ZN11PlainStruct8s_memberE to ptr), align 4 +// DEVICE-NEXT: [[TMP3:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @local_init_var to ptr), align 4 +// DEVICE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +// DEVICE-NEXT: store i32 [[ADD]], ptr [[TMP0]], align 4 +// DEVICE-NEXT: call void @__kmpc_target_deinit() +// DEVICE-NEXT: ret void +// DEVICE: worker.exit: +// DEVICE-NEXT: ret void +// diff --git a/clang/test/OpenMP/declare_target_local_usm_codegen.cpp b/clang/test/OpenMP/declare_target_local_usm_codegen.cpp new file mode 100644 index 0000000000000..d97d6f409d265 --- /dev/null +++ b/clang/test/OpenMP/declare_target_local_usm_codegen.cpp @@ -0,0 +1,52 @@ +// Test that declare target local variables are NOT affected by +// unified_shared_memory. Local variables always use direct access +// (no offload entry, no _decl_tgt_ref_ptr) regardless of USM. For +// comparison, enter variables with USM use pointer-reference indirection +// when normally they would also be direct access. +// +// CHECK lines not auto-generated because they are specifically verifying +// absence of ref ptr and offload entry for local variable and, by contrast, +// presence of ref ptr and offload entry for enter variable. 
+ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=HOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -o - | FileCheck %s --check-prefix=DEVICE + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +#pragma omp requires unified_shared_memory + +int local_var; +#pragma omp declare target local(local_var) + +int enter_var; +#pragma omp declare target enter(enter_var) + +// local_var: direct access, no ref ptr, no offload entry +// HOST-DAG: @local_var = global i32 0 +// HOST-NOT: @local_var_decl_tgt_ref_ptr + +// enter_var with USM: pointer-reference indirection +// HOST-DAG: @enter_var_decl_tgt_ref_ptr = weak global ptr @enter_var +// HOST-DAG: @.offloading.entry.enter_var_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @enter_var_decl_tgt_ref_ptr, ptr @.offloading.entry_name{{.*}}, i64 8, i64 0, ptr null }, section "llvm_offload_entries" + +// Device: local_var is a direct global, enter_var uses ref ptr +// DEVICE-DAG: @local_var = protected addrspace(1) global i32 0 +// DEVICE-NOT: @local_var_decl_tgt_ref_ptr +// DEVICE-DAG: @enter_var_decl_tgt_ref_ptr = weak global ptr null + +int use_vars() { + int result = 0; +#pragma omp target map(from: result) + { + local_var = 42; + enter_var = 10; + result = local_var + enter_var; + } + return result; +} + +#endif diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp index 9875bd95141fd..6fe477755dbe7 100644 --- a/clang/test/OpenMP/declare_target_messages.cpp +++ 
b/clang/test/OpenMP/declare_target_messages.cpp @@ -152,10 +152,14 @@ void func() {} // expected-note@+1 {{'func_local' defined here}} void func_local() {} -// dev60-warning@+3 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}} // omp60-error@+2 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type', 'indirect' or 'local' clauses expected}} // expected-error@+1 {{function name is not allowed in 'local' clause}} #pragma omp declare target local(func_local) allocate(a) + +// omp60-error@+1 {{unexpected 'local' clause, only 'device_type', 'indirect' clauses expected}} +#pragma omp begin declare target local +int begin_local_var; +#pragma omp end declare target #endif // _OPENMP void bar(); @@ -335,7 +339,6 @@ int y_enter_local; #pragma omp declare target local(y_enter_local) int y_local_enter; -// dev60-warning@+1 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}} #pragma omp declare target local(y_local_enter) // expected-error@+1 {{'y_local_enter' must not appear in both clauses 'local' and 'enter'}} #pragma omp declare target enter(y_local_enter) @@ -346,7 +349,6 @@ int y_link_local; #pragma omp declare target local(y_link_local) int y_local_link; -// dev60-warning@+1 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}} #pragma omp declare target local(y_local_link) // expected-error@+1 {{'y_local_link' must not appear in both clauses 'local' and 'link'}} #pragma omp declare target link(y_local_link) @@ -466,7 +468,6 @@ int MultiDevTy; #pragma omp declare target to(MultiDevTy) device_type(nohost) int counter = 0; -// dev60-warning@+9 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}} // omp52-error@+8 {{unexpected 'local' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}} // 
omp52-error@+7 {{expected at least one 'enter', 'link' or 'indirect' clause}} // omp51-error@+6 {{unexpected 'local' clause, only 'to', 'link', 'device_type' or 'indirect' clauses expected}} diff --git a/offload/test/offloading/declare_target_local.cpp b/offload/test/offloading/declare_target_local.cpp new file mode 100644 index 0000000000000..9bff4ea38a407 --- /dev/null +++ b/offload/test/offloading/declare_target_local.cpp @@ -0,0 +1,40 @@ +// clang-format off +// RUN: %libomptarget-compilexx-generic -fopenmp-version=60 +// RUN: %libomptarget-run-generic | %fcheck-generic +// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=60 +// RUN: %libomptarget-run-generic | %fcheck-generic +// clang-format on + +// Sanity test for OpenMP 6.0 declare target 'local' clause. +// Verify 'local' variable has device-local storage, that it has +// correct initial value, persists across target regions, and is +// independent from host copy. + +#include +int local_var = 42; +#pragma omp declare target local(local_var) + +int main() { + // Device should get the initializer value. + int init = -1; +#pragma omp target map(from : init) + init = local_var; + + // Device write should persist to a second region. + int persist = -1; +#pragma omp target + local_var = 100; +#pragma omp target map(from : persist) + persist = local_var; + + // Host copy should not be affected by device write. Should + // retain original value. + + // CHECK: PASS + if (init == 42 && persist == 100 && local_var == 42) + printf("PASS\n"); + else + printf("FAIL init=%d persist=%d host=%d\n", init, persist, local_var); + + return (init == 42 && persist == 100 && local_var == 42) ? 0 : 1; +} From 9830c433751d9d998f739d74b12a9a5a22928542 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 11 May 2026 15:20:48 +0200 Subject: [PATCH 268/538] [clang][bytecode] Fix a crash in Descriptor::getElemDataSize() (#196929) `FIXED_SIZE_INT_TYPE_SWITCH` does not handle `PT_Bool`, handle it explicitly before. 
--- clang/lib/AST/ByteCode/Descriptor.cpp | 2 ++ clang/test/AST/ByteCode/literals.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index 729df4f200e30..2d7561480f645 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -493,6 +493,8 @@ bool Descriptor::isUnion() const { return isRecord() && ElemRecord->isUnion(); } unsigned Descriptor::getElemDataSize() const { if ((isPrimitive() || isPrimitiveArray()) && isIntegerOrBoolType(getPrimType())) { + if (getPrimType() == PT_Bool) + return 1; FIXED_SIZE_INT_TYPE_SWITCH(getPrimType(), { return T::bitWidth() / 8; }); } return ElemSize; diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index f7eae07c8060a..8d39d6772a923 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -1503,3 +1503,7 @@ namespace ExternRedecl { constexpr int a = 10; static_assert(*p == 10, ""); } + +namespace GetElemDataSizeBool { + int foo[(intptr_t)(bool *)0]; // both-warning {{variable length array folded to constant array as an extension}} +} From fd30f5bb55a8b8868fabdca33675e7438da70a1a Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Mon, 11 May 2026 06:40:08 -0700 Subject: [PATCH 269/538] [CIR] Implement Namespace/global TLS CIR CodeGen (#196332) Unlike local TLS, global TLS functions need to be initialized upon their first use in a thread. First, all attempts to 'get' said TLS global are replaced with calls to a 'wrapper' function, which calls an 'init' alias function, then returns the global. While classic codegen manages to omit this in simple cases sometimes, this CIR implementation doesn't attempt to do such constant folding/inlining. The call to the 'init' is omitted if there is no ctor/dtor setup required, so sometimes the wrapper is just a 'no-op' (intentionally!). 
There are also two types of 'global' TLS functions: unordered, and ordered. Unordered are typically variable templates, and their 'init' function initializes JUST them. The rest are ordered, which requires all ordered initializations to happen as soon as any happen. The Wrapper: If necessary (omitted in a few places), calls the 'init' alias, then returns the global. Global Init Function (__tls_init): For ordered global TLS, a use of 1 TLS is required to initialize ALL of them. This function checks the global guard (__tls_guard), and calls the individual 'init' functions for each global(__cxx_global_var_init). Unordered ones do not get entered here. Individual init Functions (__cxx_global_var_init): Like the rest of our globals, these emit a __cxx_global_var_init function. However, they are not added to the global constructors list. These are identically emitted with one exception: Unordered Global TLS variables wrap said init in a guard check (ordered have their's guarded by __tls_guard in __tls_init). Init Alias: This alias is called by the wrapper function to make sure said variable is initialized before use. IN the case of ordered globals, this is an alias to __tls_init. IN the case of unordered globals, this goes directly to the __cxx_global_var_init for that variable. This patch implements the codegen part of the above by introducing a set of 3 strings on a GlobalOp in an attribute. if present, this will cause the above global tls behavior (with the next patch). At the moment, this only generates the attribute, and the lowering-prepare patches will come in future patches. 
--- .../include/clang/CIR/Dialect/IR/CIRAttrs.td | 59 +++++++++++++++++ clang/include/clang/CIR/Dialect/IR/CIROps.td | 2 + clang/lib/CIR/CodeGen/CIRGenCXX.cpp | 5 +- clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp | 39 +++++++++++ clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 7 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 15 +++-- clang/lib/CIR/CodeGen/CIRGenModule.h | 1 + clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 15 +++-- .../test/CIR/CodeGen/global-tls-dyn-init.cpp | 50 ++++++++++++++ .../CIR/CodeGen/global-tls-simple-init.cpp | 65 +++++++++++++++++++ .../test/CIR/CodeGen/global-tls-templates.cpp | 34 ++++++++++ clang/test/CIR/IR/invalid-tls.cir | 18 +++++ 12 files changed, 294 insertions(+), 16 deletions(-) create mode 100644 clang/test/CIR/CodeGen/global-tls-dyn-init.cpp create mode 100644 clang/test/CIR/CodeGen/global-tls-simple-init.cpp create mode 100644 clang/test/CIR/CodeGen/global-tls-templates.cpp diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td index 1520999e3f85f..4032d8219fff3 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td @@ -1530,6 +1530,65 @@ def CIR_StaticLocalGuardAttr : CIR_Attr<"StaticLocalGuard", let canHaveIllegalCXXABIType = 0; } +//===----------------------------------------------------------------------===// +// ThreadLocalGlobalWrapperInitAttr +//===----------------------------------------------------------------------===// + +def CIR_ThreadLocalGlobalWrapperInitAttr : CIR_Attr< + "ThreadLocalGlobalWrapperInit", "tls_wrapper_init"> { + let summary = "Wrapper and Init function names for thread local variables"; + let description = [{ + Contains the mangled name of the wrapper function, init function, and + guard variable for a namespace/global scope thread local variable. The + guard variable is optional, as it is only required for unordered thread + local variables, as ordered thread local variables share a guard. 
+ + Unordered global thread local variables (such as variable template + instantiations) are individually initialized when first used on a thread. + Ordered global thread local variables are ALL initialized together when + any that require initialization are referenced. + + This is accomplished by rewriting all calls to these variables as calls to + the wrapper. If the variable requires initialization, the wrapper calls + the init function, then returns the global variable reference. + + Throughout CIR though, these are just represented as normal `get_global` + calls to `global`s with `ctor`/`dtor` regions (if necessary). The + lowering-prepare pass manages the generation of the wrapper,x + initialization, and call rewrites. + + Example: + ``` + cir.global tls_dyn dyn_tls_refs = <"_ZTW7tls_var", "_ZTH7tls_var"> @_ZZZ7tls_var = ... + ... + cir.get_global thread_local @ZZZ7tls_var : !cir.ptr + ``` + }]; + let parameters = (ins + "mlir::StringAttr" : $wrapper_name, + "mlir::StringAttr" : $init_name, + OptionalParameter<"mlir::StringAttr">: $guard_name + ); + + let builders = [ + AttrBuilder<(ins "llvm::StringRef" + : $wrapper, "llvm::StringRef" + : $init, "llvm::StringRef" + : $guard), + [{ + mlir::StringAttr guardAttr; + if (!guard.empty()) + guardAttr = mlir::StringAttr::get($_ctxt, guard); + return $_get($_ctxt, mlir::StringAttr::get($_ctxt, wrapper), + mlir::StringAttr::get($_ctxt, init), + guardAttr); + }]>, + ]; + let assemblyFormat = + "`<` $wrapper_name `,` $init_name (`,` $guard_name^)? 
`>`"; + let canHaveIllegalCXXABIType = 0; +} + //===----------------------------------------------------------------------===// // UsualDeleteParamsAttr //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index b87286e846879..ed285201ae8f8 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2863,6 +2863,7 @@ def CIR_GlobalOp : CIR_Op<"global", [ CIR_GlobalLinkageKind:$linkage, OptionalAttr:$addr_space, OptionalAttr:$tls_model, + OptionalAttr:$dyn_tls_refs, OptionalAttr:$initial_value, UnitProp:$comdat, UnitProp:$constant, @@ -2885,6 +2886,7 @@ def CIR_GlobalOp : CIR_Op<"global", [ $linkage (`comdat` $comdat^)? ($tls_model^)? + (`dyn_tls_refs` `=` $dyn_tls_refs^)? (`dso_local` $dso_local^)? (`static_local_guard` `` $static_local_guard^)? (` ` custom($addr_space)^ )? diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp index b12307b124777..d1a9110125af7 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp @@ -295,7 +295,7 @@ void CIRGenModule::emitCXXSpecialVarDeclInit(const VarDecl *varDecl, builder.getInsertionBlock()}; scope.setAsGlobalInit(); builder.setInsertionPointToStart(block); - mlir::Value getGlobal = builder.createGetGlobal(addr); + mlir::Value getGlobal = builder.createGetGlobal(addr, varDecl->getTLSKind()); // If we're initializing a static local with a guard variable, set the flag // that indicates that. 
getGlobal.getDefiningOp().setStaticLocal( @@ -328,8 +328,7 @@ void CIRGenModule::emitCXXSpecialVarDeclInit(const VarDecl *varDecl, void CIRGenModule::emitCXXGlobalVarDeclInit(const VarDecl *varDecl, cir::GlobalOp addr, bool performInit) { - assert(!varDecl->isStaticLocal() && - varDecl->getTLSKind() == VarDecl::TLS_None); + assert(!varDecl->isStaticLocal()); // Create a CIRGenFunction to emit the initializer. While this isn't a true // function, the handling works the same way. diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp index 955f3f4815a06..03929e50f44f9 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp @@ -59,6 +59,45 @@ void CIRGenFunction::emitCXXGuardedInit(const VarDecl &varDecl, cgm.emitCXXStaticLocalVarDeclInit(&varDecl, globalOp, performInit); } +void CIRGenModule::setGlobalTlsReferences(const VarDecl &vd, + cir::GlobalOp globalOp) { + assert(!vd.isStaticLocal() && vd.getTLSKind()); + + // C doesn't need guarded thread-local init, because it can't have + // non-constant init. 
+ if (!getLangOpts().CPlusPlus) + return; + + if (globalOp.getTlsModel() != cir::TLS_Model::GeneralDynamic) + return; + + llvm::SmallString<256> wrapperFuncName; + llvm::SmallString<256> initFuncName; + llvm::SmallString<256> guardName; + + if (getCXXABI().getMangleContext().getKind() == MangleContext::MK_Itanium) { + llvm::raw_svector_ostream wrapperOut(wrapperFuncName); + llvm::raw_svector_ostream initOut(initFuncName); + llvm::raw_svector_ostream guardStream(guardName); + + auto &mc = cast(getCXXABI().getMangleContext()); + mc.mangleItaniumThreadLocalWrapper(&vd, wrapperOut); + mc.mangleItaniumThreadLocalInit(&vd, initOut); + if (globalOp.hasWeakLinkage() || globalOp.hasLinkOnceLinkage() || + isTemplateInstantiation(vd.getTemplateSpecializationKind())) { + getCXXABI().getMangleContext().mangleStaticGuardVariable(&vd, + guardStream); + } + + } else { + errorNYI(vd.getSourceRange(), + "setGlobalTlsReferences: non-itanium mangler"); + return; + } + globalOp.setDynTlsRefsAttr(cir::ThreadLocalGlobalWrapperInitAttr::get( + &getMLIRContext(), wrapperFuncName, initFuncName, guardName)); +} + void CIRGenModule::emitCXXGlobalVarDeclInitFunc(const VarDecl *vd, cir::GlobalOp addr, bool performInit) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index cb53430438219..b8cfb74fc81e4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -425,10 +425,9 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, const VarDecl *vd) { QualType t = e->getType(); - // If it's thread_local, emit a call to its wrapper function instead. - if (vd->getTLSKind() == VarDecl::TLS_Dynamic) - cgf.cgm.errorNYI(e->getSourceRange(), - "emitGlobalVarDeclLValue: thread_local variable"); + // In classic codegen, thread-locals get a wrapper function here. 
Rather than + // doing that, we instead treat this as a normal 'global', and leave it to + // lowerng-prepare to correctly generate the wrapper/etc. // Check if the variable is marked as declare target with link clause in // device codegen. diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 04e413aa916ec..8c39d94a6b2ec 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1160,11 +1160,8 @@ CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty, setLinkageForGV(gv, d); - if (d->getTLSKind()) { - if (d->getTLSKind() == VarDecl::TLS_Dynamic) - errorNYI(d->getSourceRange(), "getOrCreateCIRGlobal: TLS dynamic"); + if (d->getTLSKind()) setTLSMode(gv, *d); - } setGVProperties(gv, d); @@ -1508,7 +1505,8 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, setNonAliasAttributes(vd, gv); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); + if (vd->getTLSKind() && !vd->isStaticLocal()) + setTLSMode(gv, *vd); maybeSetTrivialComdat(*vd, gv); @@ -2774,6 +2772,13 @@ void CIRGenModule::setTLSMode(mlir::Operation *op, const VarDecl &d) { auto global = cast(op); global.setTlsModel(tlm); + + // For namespace-scope dyanmic TLS we need to set the wrapper, int, or guard + // info. 
+ if (d.isStaticLocal() || tlm != cir::TLS_Model::GeneralDynamic) + return; + + setGlobalTlsReferences(d, global); } void CIRGenModule::setCIRFunctionAttributes(GlobalDecl globalDecl, diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 1146e4561db0b..e2184ef8640f3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -648,6 +648,7 @@ class CIRGenModule : public CIRGenTypeCache { void emitCXXGlobalVarDeclInit(const VarDecl *varDecl, cir::GlobalOp addr, bool performInit); + void setGlobalTlsReferences(const VarDecl &vd, cir::GlobalOp globalOp); void emitCXXGlobalVarDeclInitFunc(const VarDecl *vd, cir::GlobalOp addr, bool performInit); diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 79d09b3ba1eb0..74ef856c5a067 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1903,12 +1903,19 @@ mlir::LogicalResult cir::GlobalOp::verify() { return failure(); } - if ((getStaticLocalGuard().has_value() || getTlsModel()) && + if ((getStaticLocalGuard().has_value()) && (!getCtorRegion().empty() || !getDtorRegion().empty())) return emitOpError( - "Cannot have a thread-local or static-local global-op " - "with a constructor or destructor, they require in-function " - "initialization via LocalInitOp"); + "Cannot have a static-local global-op with a constructor or " + "destructor, they require in-function initialization via LocalInitOp"); + + if (getDynTlsRefs()) { + if (getStaticLocalGuard().has_value()) + return emitOpError( + "cannot have both static local and dynamic tls references"); + if (!getTlsModel() || getTlsModel() != TLS_Model::GeneralDynamic) + return emitOpError("'dyn_tls_refs' only valid for dynamic tls"); + } if (getAliasee().has_value()) { if (getInitialValue().has_value() || !getCtorRegion().empty() || diff --git a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp 
b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp new file mode 100644 index 0000000000000..ef3c1e306f62d --- /dev/null +++ b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2>&1 | FileCheck %s --check-prefix=CIR-BEFORE-LPP + +int get_i(); +struct CtorDtor { + constexpr CtorDtor(int i) : i(i){} + ~CtorDtor(){} + int i; +}; + +thread_local CtorDtor tls_cd = 5; +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : !rec_CtorDtor dtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd : !cir.ptr +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () +// CIR-BEFORE-LPP: } + +thread_local CtorDtor tls_cd_dyn = get_i(); +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_dyn", "_ZTH10tls_cd_dyn"> @tls_cd_dyn = ctor : !rec_CtorDtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) : (!cir.ptr +// CIR-BEFORE-LPP: } dtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () +// CIR-BEFORE-LPP: } + +thread_local CtorDtor &tls_cd_ref = tls_cd_dyn; +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_ref", "_ZTH10tls_cd_ref"> @tls_cd_ref = ctor : !cir.ptr { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_ref : !cir.ptr> +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr +// CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !cir.ptr, 
!cir.ptr> +// CIR-BEFORE-LPP: } + +thread_local CtorDtor tls_cd_dyn_not_used = get_i(); +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW19tls_cd_dyn_not_used", "_ZTH19tls_cd_dyn_not_used"> @tls_cd_dyn_not_used = ctor : !rec_CtorDtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) +// CIR-BEFORE-LPP: } dtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () +// CIR-BEFORE-LPP: } + +void uses() { + auto a = tls_cd; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_cd : !cir.ptr + auto b = tls_cd_dyn; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_cd_dyn : !cir.ptr + auto c = tls_cd_ref; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_cd_ref : !cir.ptr> +} diff --git a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp new file mode 100644 index 0000000000000..fef55b0298c33 --- /dev/null +++ b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp @@ -0,0 +1,65 @@ +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2>&1 | FileCheck %s --check-prefix=CIR-BEFORE-LPP + +int get_i(); +struct CtorDtor { + constexpr CtorDtor(int i) : i(i){} + ~CtorDtor(){} + int i; +}; + +thread_local int tls_int = 5; +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW7tls_int", "_ZTH7tls_int"> @tls_int = #cir.int<5> : !s32i + +thread_local int tls_int_dyn = get_i(); +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_dyn", "_ZTH11tls_int_dyn"> @tls_int_dyn = ctor : !s32i { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global 
thread_local @tls_int_dyn : !cir.ptr +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr +// CIR-BEFORE-LPP: } + +thread_local int &tls_int_ref = tls_int_dyn; +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_ref", "_ZTH11tls_int_ref"> @tls_int_ref = ctor : !cir.ptr { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_ref : !cir.ptr> +// CIR-BEFORE-LPP: %[[GET_OTHER:.*]] = cir.get_global thread_local @tls_int_dyn : !cir.ptr +// CIR-BEFORE-LPP: cir.store {{.*}}%[[GET_OTHER]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> +// CIR-BEFORE-LPP: } + +thread_local int tls_int_self_init = tls_int_self_init + get_i(); +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17tls_int_self_init", "_ZTH17tls_int_self_init"> @tls_int_self_init = ctor : !s32i { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_self_init : !cir.ptr +// CIR-BEFORE-LPP: %[[GET_SELF:.*]] = cir.get_global thread_local @tls_int_self_init : !cir.ptr +// CIR-BEFORE-LPP: %[[LOAD_SELF:.*]] = cir.load {{.*}}%[[GET_SELF]] : !cir.ptr, !s32i +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: %[[ADD:.*]] = cir.add nsw %[[LOAD_SELF]], %[[CALL]] : !s32i +// CIR-BEFORE-LPP: cir.store {{.*}}%[[ADD]], %[[GET_GLOB]] : !s32i, !cir.ptr +// CIR-BEFORE-LPP: } + +extern thread_local int definitely_inited = 5; +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17definitely_inited", "_ZTH17definitely_inited"> @definitely_inited = #cir.int<5> : !s32i + +extern thread_local int definitely_inited_dyn = get_i(); +// CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW21definitely_inited_dyn", "_ZTH21definitely_inited_dyn"> @definitely_inited_dyn = ctor : !s32i { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local 
@definitely_inited_dyn : !cir.ptr +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr +// CIR-BEFORE-LPP: } + +extern thread_local int maybe_inited; +// CIR-BEFORE-LPP: cir.global "private" external tls_dyn dyn_tls_refs = <"_ZTW12maybe_inited", "_ZTH12maybe_inited"> @maybe_inited : !s32i + +void uses() { + auto a = tls_int; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_int : !cir.ptr + auto b = tls_int_dyn; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_int_dyn : !cir.ptr + auto c = tls_int_ref; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_int_ref : !cir.ptr> + auto d = tls_int_self_init; +// CIR-BEFORE-LPP: cir.get_global thread_local @tls_int_self_init : !cir.ptr + auto e = maybe_inited; +// CIR-BEFORE-LPP: cir.get_global thread_local @maybe_inited : !cir.ptr + auto f = definitely_inited; +// CIR-BEFORE-LPP: cir.get_global thread_local @definitely_inited : !cir.ptr + auto g = definitely_inited_dyn; +// CIR-BEFORE-LPP: cir.get_global thread_local @definitely_inited_dyn : !cir.ptr +} diff --git a/clang/test/CIR/CodeGen/global-tls-templates.cpp b/clang/test/CIR/CodeGen/global-tls-templates.cpp new file mode 100644 index 0000000000000..bad1f1440dde5 --- /dev/null +++ b/clang/test/CIR/CodeGen/global-tls-templates.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2>&1 | FileCheck %s --check-prefix=CIR-BEFORE-LPP + +int get_i(); +struct CtorDtor { + constexpr CtorDtor(int i) : i(i){} + ~CtorDtor(){} + int i; +}; + +template +thread_local T tls_templ = {get_i()}; + +// CIR-BEFORE-LPP-LABEL: cir.global linkonce_odr comdat tls_dyn dyn_tls_refs = <"_ZTW9tls_templIiE", "_ZTH9tls_templIiE", "_ZGV9tls_templIiE"> @_Z9tls_templIiE = ctor : !s32i { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local 
@_Z9tls_templIiE : !cir.ptr +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: cir.store{{.*}} %[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr +// CIR-BEFORE-LPP: } +// +// CIR-BEFORE-LPP-LABEL: cir.global linkonce_odr comdat tls_dyn dyn_tls_refs = <"_ZTW9tls_templI8CtorDtorE", "_ZTH9tls_templI8CtorDtorE", "_ZGV9tls_templI8CtorDtorE"> @_Z9tls_templI8CtorDtorE = ctor : !rec_CtorDtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr +// CIR-BEFORE-LPP: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) : (!cir.ptr +// CIR-BEFORE-LPP: } dtor { +// CIR-BEFORE-LPP: %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr +// CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () +// CIR-BEFORE-LPP: } + +// CIR-BEFORE-LPP-LABEL: cir.func{{.*}}@_Z4usesv +void uses() { + auto x = tls_templ; +// CIR-BEFORE-LPP: cir.get_global thread_local @_Z9tls_templIiE : !cir.ptr + auto y = tls_templ; +// CIR-BEFORE-LPP: cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr +} diff --git a/clang/test/CIR/IR/invalid-tls.cir b/clang/test/CIR/IR/invalid-tls.cir index 36df7fdb1e619..e33b7070eb790 100644 --- a/clang/test/CIR/IR/invalid-tls.cir +++ b/clang/test/CIR/IR/invalid-tls.cir @@ -11,3 +11,21 @@ module { } } +// ----- + +!s32i = !cir.int + +module { + // expected-error@+1{{op cannot have both static local and dynamic tls references}} +cir.global "private" internal tls_dyn dyn_tls_refs = <"asdf", "asdf", "asdf"> static_local_guard<"asdf"> @_ZZ1fvE1y : !s32i +} + +// ----- + +!s32i = !cir.int + +module { + // expected-error@+1{{'dyn_tls_refs' only valid for dynamic tls}} +cir.global "private" internal tls_local_dyn dyn_tls_refs = <"asdf", "asdf", "asdf"> @_ZZ1fvE1y : !s32i +} + From dd907ed5f58b6313dccbec9ea36e291c2ab2f984 Mon Sep 17 
00:00:00 2001 From: Antonio Frighetto Date: Mon, 11 May 2026 15:43:38 +0200 Subject: [PATCH 270/538] [StackColoring] Introduce test for PR196542 (NFC) (#196951) --- .../X86/sjlj-do-not-merge-stack-slots.ll | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll diff --git a/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll b/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll new file mode 100644 index 0000000000000..c59ca19ea1550 --- /dev/null +++ b/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s + +define void @sjlj_do_not_merge_stack_slots() nounwind { +; CHECK-LABEL: sjlj_do_not_merge_stack_slots: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: callq setjmp@PLT +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %else +; CHECK-NEXT: callq opaque@PLT +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_1: # %then +; CHECK-NEXT: callq escape@PLT +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq +entry: + %obj = alloca [8 x i8], align 4 + %buf = alloca [12 x i8], align 4 + %obj.1 = alloca [8 x i8], align 4 + call void @llvm.lifetime.start.p0(ptr %obj) + call void @llvm.lifetime.start.p0(ptr %buf) + %rv = call i32 @setjmp(ptr %buf) + %c = icmp eq i32 %rv, 0 + br i1 %c, label %then, label %else + +then: + call void @escape(ptr %obj) + br label %exit + +else: + call void @llvm.lifetime.start.p0(ptr %obj.1) + call void @opaque(ptr %obj.1) + call void @llvm.lifetime.end.p0(ptr %obj.1) + br label %exit + +exit: + call void @llvm.lifetime.end.p0(ptr %buf) + call void @llvm.lifetime.end.p0(ptr %obj) + ret void +} + +declare i32 @setjmp(ptr) returns_twice 
+declare void @escape(ptr) +declare void @opaque(ptr) From 6443657de563bfb5870529e92075a2c13e9f4acb Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Mon, 11 May 2026 21:54:32 +0800 Subject: [PATCH 271/538] [clang][AST] Teach `CXXTypeidExpr::isMostDerived` to use `isEffectivelyFinal` (#196544) Resolves #196476. When a class is marked final, typeid on references to that type can be resolved at compile time since the most-derived type is statically known. --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/AST/ExprCXX.cpp | 5 ++ clang/test/CodeGenCXX/typeid-most-derived.cpp | 57 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 clang/test/CodeGenCXX/typeid-most-derived.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 53f3d42270f1a..bd91b8723a5c6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -250,6 +250,9 @@ Non-comprehensive list of changes in this release enabling tools such as language servers and refactoring engines to accurately map source locations back to explicit instantiation sites. +- ``typeid`` on references and pointers of ``final`` types no longer emits a + vtable lookup at runtime. 
+ New Compiler Flags ------------------ - New option ``-fms-anonymous-structs`` / ``-fno-ms-anonymous-structs`` added diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index be1bd3ba913ea..40e129d03dcea 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -149,6 +149,11 @@ bool CXXTypeidExpr::isPotentiallyEvaluated() const { bool CXXTypeidExpr::isMostDerived(const ASTContext &Context) const { assert(!isTypeOperand() && "Cannot call isMostDerived for typeid(type)"); const Expr *E = getExprOperand()->IgnoreParenNoopCasts(Context); + + if (const CXXRecordDecl *RD = E->getType()->getAsCXXRecordDecl()) + if (RD->isEffectivelyFinal()) + return true; + if (const auto *DRE = dyn_cast(E)) { QualType Ty = DRE->getDecl()->getType(); if (!Ty->isPointerOrReferenceType()) diff --git a/clang/test/CodeGenCXX/typeid-most-derived.cpp b/clang/test/CodeGenCXX/typeid-most-derived.cpp new file mode 100644 index 0000000000000..2b6bb850ff415 --- /dev/null +++ b/clang/test/CodeGenCXX/typeid-most-derived.cpp @@ -0,0 +1,57 @@ +// RUN: %clang_cc1 %s -triple %itanium_abi_triple -Wno-unused-value -emit-llvm -o - -std=c++11 | FileCheck %s + +namespace std { + class type_info {}; +} + +struct Base { + virtual int foo() { return 42; } + virtual ~Base(); +}; + +struct NonFinal : Base {}; +struct Final final : Base { + int foo() override { return 84; } +}; + +// Most derived +void base_by_value(Base b) { typeid(b); } +// CHECK-LABEL: define {{.*}}void @_Z13base_by_value4Base +// CHECK-NOT: %vtable +// CHECK: ret void + +// Most derived +void final_ref(Final &f) { typeid(f); } +// CHECK-LABEL: define {{.*}}void @_Z9final_refR5Final +// CHECK-NOT: %vtable +// CHECK: ret void + +// Most derived +void final_deref(Final *f) { typeid(*f); } +// CHECK-LABEL: define {{.*}}void @_Z11final_derefP5Final +// CHECK-NOT: %vtable +// CHECK: ret void + +// Not most derived +void base_ref(Base &b) { typeid(b); } +// CHECK-LABEL: define {{.*}}void @_Z8base_refR4Base +// CHECK: 
%vtable +// CHECK: ret void + +// Not most derived +void base_deref(Base *b) { typeid(*b); } +// CHECK-LABEL: define {{.*}}void @_Z10base_derefP4Base +// CHECK: %vtable +// CHECK: ret void + +// Not most derived +void nonfinal_ref(NonFinal &d) { typeid(d); } +// CHECK-LABEL: define {{.*}}void @_Z12nonfinal_refR8NonFinal +// CHECK: %vtable +// CHECK: ret void + +// Not most derived +void nonfinal_deref(NonFinal *d) { typeid(*d); } +// CHECK-LABEL: define {{.*}}void @_Z14nonfinal_derefP8NonFinal +// CHECK: %vtable +// CHECK: ret void From 58aa43febb7f768183a788d4f2dbc26970ef6bbe Mon Sep 17 00:00:00 2001 From: Alexander Kornienko Date: Mon, 11 May 2026 16:12:45 +0200 Subject: [PATCH 272/538] Revert "[LLVM] Fix use-after-free in AlwaysInliner flatten worklist" (#196950) Reverts llvm/llvm-project#194485, which causes a 20x+ compilation time increase (https://github.com/llvm/llvm-project/pull/194485#issuecomment-4416941666) and an up to 20% runtime performance regression on fleetbench memory benchmarks (https://github.com/llvm/llvm-project/pull/194485#issuecomment-4421136222). 
--- llvm/lib/Transforms/IPO/AlwaysInliner.cpp | 81 ++++++++++++----------- llvm/test/Transforms/Inline/flatten.ll | 38 ----------- 2 files changed, 42 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index bdba6a1dbca53..080cb8ddb33fd 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -42,6 +42,7 @@ bool AlwaysInlineImpl( SmallSetVector Calls; bool Changed = false; SmallVector InlinedComdatFunctions; + SmallVector NeedFlattening; auto TryInline = [&](CallBase &CB, Function &Callee, OptimizationRemarkEmitter &ORE, const char *InlineReason, @@ -75,16 +76,52 @@ bool AlwaysInlineImpl( return true; }; - for (Function &F : M) { - if (!F.hasFnAttribute(Attribute::Flatten)) + for (Function &F : make_early_inc_range(M)) { + if (F.hasFnAttribute(Attribute::Flatten)) + NeedFlattening.push_back(&F); + + if (F.isPresplitCoroutine()) continue; + + if (F.isDeclaration() || !isInlineViable(F).isSuccess()) + continue; + + Calls.clear(); + + for (User *U : F.users()) + if (auto *CB = dyn_cast(U)) + if (CB->getCalledFunction() == &F && + CB->hasFnAttr(Attribute::AlwaysInline) && + !CB->getAttributes().hasFnAttr(Attribute::NoInline)) + Calls.insert(CB); + + for (CallBase *CB : Calls) { + OptimizationRemarkEmitter ORE(CB->getCaller()); + Changed |= TryInline(*CB, F, ORE, "always inline attribute"); + } + + F.removeDeadConstantUsers(); + if (F.hasFnAttribute(Attribute::AlwaysInline) && F.isDefTriviallyDead()) { + if (F.hasComdat()) { + InlinedComdatFunctions.push_back(&F); + } else { + if (FAM) + FAM->clear(F, F.getName()); + M.getFunctionList().erase(F); + Changed = true; + } + } + } + + // Flatten functions with the flatten attribute using a local worklist. 
+ for (Function *F : NeedFlattening) { SmallVector, 16> Worklist; SmallVector, 16> InlineHistory; SmallVector NewCallSites; - OptimizationRemarkEmitter ORE(&F); + OptimizationRemarkEmitter ORE(F); // Collect initial calls. - for (BasicBlock &BB : F) { + for (BasicBlock &BB : *F) { for (Instruction &I : BB) { if (auto *CB = dyn_cast(&I)) { Function *Callee = CB->getCalledFunction(); @@ -104,7 +141,7 @@ bool AlwaysInlineImpl( continue; // Detect recursion. - if (Callee == &F) { + if (Callee == F) { ORE.emit([&]() { return OptimizationRemarkMissed("inline", "NotInlined", CB->getDebugLoc(), CB->getParent()) @@ -142,40 +179,6 @@ bool AlwaysInlineImpl( } } - for (Function &F : make_early_inc_range(M)) { - if (F.isPresplitCoroutine()) - continue; - - if (F.isDeclaration() || !isInlineViable(F).isSuccess()) - continue; - - Calls.clear(); - - for (User *U : F.users()) - if (auto *CB = dyn_cast(U)) - if (CB->getCalledFunction() == &F && - CB->hasFnAttr(Attribute::AlwaysInline) && - !CB->getAttributes().hasFnAttr(Attribute::NoInline)) - Calls.insert(CB); - - for (CallBase *CB : Calls) { - OptimizationRemarkEmitter ORE(CB->getCaller()); - Changed |= TryInline(*CB, F, ORE, "always inline attribute"); - } - - F.removeDeadConstantUsers(); - if (F.hasFnAttribute(Attribute::AlwaysInline) && F.isDefTriviallyDead()) { - if (F.hasComdat()) { - InlinedComdatFunctions.push_back(&F); - } else { - if (FAM) - FAM->clear(F, F.getName()); - M.getFunctionList().erase(F); - Changed = true; - } - } - } - if (!InlinedComdatFunctions.empty()) { // Now we just have the comdat functions. Filter out the ones whose comdats // are not actually dead. 
diff --git a/llvm/test/Transforms/Inline/flatten.ll b/llvm/test/Transforms/Inline/flatten.ll index 4e246ade6b1ae..355739a99dac7 100644 --- a/llvm/test/Transforms/Inline/flatten.ll +++ b/llvm/test/Transforms/Inline/flatten.ll @@ -203,43 +203,5 @@ define i32 @test_mutual_recursion() flatten { ret i32 %r } -; Always-inline function with flatten that becomes dead after inlining. -define internal i32 @alwaysinline_flatten_callee() alwaysinline flatten { - ret i32 5 -} - -define i32 @test_alwaysinline_flatten() { -; ALWAYS-LABEL: define i32 @test_alwaysinline_flatten() { -; ALWAYS-NEXT: ret i32 5 -; -; INLINE-LABEL: define i32 @test_alwaysinline_flatten() { -; INLINE-NEXT: ret i32 5 -; -; MANDATORY-LABEL: define i32 @test_alwaysinline_flatten() { -; MANDATORY-NEXT: ret i32 5 -; - %r = call i32 @alwaysinline_flatten_callee() alwaysinline - ret i32 %r -} - -; Flatten with alwaysinline: callees are flattened first, then the -; flattened function is always-inlined into callers. -define internal i32 @inner() { - ret i32 7 -} - -define internal i32 @alwaysinline_flatten_two_levels() alwaysinline flatten { - %r = call i32 @inner() - ret i32 %r -} - -define i32 @test_alwaysinline_flatten_two_levels() { -; CHECK-LABEL: define i32 @test_alwaysinline_flatten_two_levels() { -; CHECK-NEXT: ret i32 7 -; - %r = call i32 @alwaysinline_flatten_two_levels() alwaysinline - ret i32 %r -} - ; Check that optimization remark is emitted for recursive calls during flattening. 
; REMARK: remark: {{.*}} 'test_direct_recursion' is not inlined into 'test_direct_recursion': recursive call during flattening From 29df0702c36ee8e02a65c2dd351b3b54b3fdf0d6 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Mon, 11 May 2026 16:22:46 +0200 Subject: [PATCH 273/538] [StackColoring] Treat all stack slots as conservative with `returns_twice` call-sites (#196542) Do not merge stack slots on disjoint paths if the function may call setjmp/sigsetjmp, as the current algorithm defaults to computing liveness analysis from the actual uses propagated through the CFG, rather than leveraging lifetime markers, thus making it unsound with `returns_twice` calls. Fixes: https://github.com/llvm/llvm-project/issues/196468. --- llvm/lib/CodeGen/StackColoring.cpp | 5 +++++ llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll | 9 +++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index 65f9ae5433195..cdb0ca5147728 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -719,6 +719,11 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { H.CatchObj.FrameIndex >= 0) ConservativeSlots.set(H.CatchObj.FrameIndex); + // Treat all stack slots as conservative if we happen to have calls to + // setjmp/sigsetjmp, as longjmp may re-enter the function on a different path. 
+ if (MF->exposesReturnsTwice()) + ConservativeSlots.set(); + LLVM_DEBUG(dumpBV("Conservative slots", ConservativeSlots)); // Step 2: compute begin/end sets for each block diff --git a/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll b/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll index c59ca19ea1550..ef2da73bbbf5b 100644 --- a/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll +++ b/llvm/test/CodeGen/X86/sjlj-do-not-merge-stack-slots.ll @@ -4,19 +4,20 @@ define void @sjlj_do_not_merge_stack_slots() nounwind { ; CHECK-LABEL: sjlj_do_not_merge_stack_slots: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: callq setjmp@PLT -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %else +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: callq opaque@PLT -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB0_1: # %then +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: callq escape@PLT -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq entry: %obj = alloca [8 x i8], align 4 From 86aa07a7f09942727c4f267f8b233334be268d13 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 11 May 2026 16:25:13 +0200 Subject: [PATCH 274/538] [InstCombine] Create `or` as `disjoint` where applicable (#193725) --- llvm/include/llvm/IR/IRBuilder.h | 4 ++++ .../lib/Target/RISCV/RISCVGatherScatterLowering.cpp | 2 +- .../Transforms/InstCombine/InstCombineAddSub.cpp | 4 ++-- .../Transforms/InstCombine/InstCombineAndOrXor.cpp | 13 ++++++------- llvm/test/Transforms/InstCombine/add.ll | 4 ++-- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index cb0fdeaecd1cc..a10a59a9ae9e7 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h 
@@ -1637,6 +1637,10 @@ class IRBuilderBase { return Accum; } + Value *CreateDisjointOr(Value *LHS, Value *RHS, const Twine &Name = "") { + return CreateOr(LHS, RHS, Name, true); + } + Value *CreateXor(Value *LHS, Value *RHS, const Twine &Name = "") { if (Value *V = Folder.FoldBinOp(Instruction::Xor, LHS, RHS)) return V; diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 25b5af8324e64..ab089c4ab0198 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -167,7 +167,7 @@ static std::pair matchStridedStart(Value *Start, default: llvm_unreachable("Unexpected opcode"); case Instruction::Or: - Start = Builder.CreateOr(Start, Splat, "", /*IsDisjoint=*/true); + Start = Builder.CreateDisjointOr(Start, Splat); break; case Instruction::Add: Start = Builder.CreateAdd(Start, Splat); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 462256c78a48b..192355b12c4da 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -933,7 +933,7 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { // If wrapping is not allowed, then the addition must set the sign bit: // X + (signmask) --> X | signmask if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap()) - return BinaryOperator::CreateOr(Op0, Op1); + return BinaryOperator::CreateDisjointOr(Op0, Op1); // If wrapping is allowed, then the addition flips the sign bit of LHS: // X + (signmask) --> X ^ signmask @@ -1962,7 +1962,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { haveNoCommonBitsSet(A, B, SQ.getWithInstruction(&I))) return replaceInstUsesWith( I, Builder.CreateIntrinsic(Intrinsic::ctpop, {I.getType()}, - {Builder.CreateOr(A, B)})); + {Builder.CreateDisjointOr(A, B)})); // Fold the log2_ceil idiom: // 
zext(ctpop(A) >u/!= 1) + (ctlz(A, true) ^ (BW - 1)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 52d5e28d20915..c9f2418fae6fd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3154,7 +3154,7 @@ static Value *matchOrConcat(Instruction &Or, InstCombiner::BuilderTy &Builder) { Value *NewLower = Builder.CreateZExt(Lo, Ty); Value *NewUpper = Builder.CreateZExt(Hi, Ty); NewUpper = Builder.CreateShl(NewUpper, HalfWidth); - Value *BinOp = Builder.CreateOr(NewLower, NewUpper); + Value *BinOp = Builder.CreateDisjointOr(NewLower, NewUpper); return Builder.CreateIntrinsic(id, Ty, BinOp); }; @@ -3964,16 +3964,16 @@ Value *InstCombinerImpl::reassociateDisjointOr(Value *LHS, Value *RHS) { Value *X, *Y; if (match(RHS, m_OneUse(m_DisjointOr(m_Value(X), m_Value(Y))))) { if (Value *Res = foldDisjointOr(LHS, X)) - return Builder.CreateOr(Res, Y, "", /*IsDisjoint=*/true); + return Builder.CreateDisjointOr(Res, Y); if (Value *Res = foldDisjointOr(LHS, Y)) - return Builder.CreateOr(Res, X, "", /*IsDisjoint=*/true); + return Builder.CreateDisjointOr(Res, X); } if (match(LHS, m_OneUse(m_DisjointOr(m_Value(X), m_Value(Y))))) { if (Value *Res = foldDisjointOr(X, RHS)) - return Builder.CreateOr(Res, Y, "", /*IsDisjoint=*/true); + return Builder.CreateDisjointOr(Res, Y); if (Value *Res = foldDisjointOr(Y, RHS)) - return Builder.CreateOr(Res, X, "", /*IsDisjoint=*/true); + return Builder.CreateDisjointOr(Res, X); } return nullptr; @@ -4438,8 +4438,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { match(Op0, m_Or(m_Value(A), m_ConstantInt(CI)))) { bool IsDisjointOuter = cast(I).isDisjoint(); bool IsDisjointInner = cast(Op0)->isDisjoint(); - Value *Inner = Builder.CreateOr(A, Op1); - cast(Inner)->setIsDisjoint(IsDisjointOuter); + Value *Inner = Builder.CreateOr(A, Op1, "", /*IsDisjoint=*/IsDisjointOuter); 
Inner->takeName(Op0); return IsDisjointOuter && IsDisjointInner ? BinaryOperator::CreateDisjointOr(Inner, CI) diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 4380491f07557..b4ea23d89dbc9 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -933,7 +933,7 @@ define <2 x i32> @xor_sign_bit_vec_splat(<2 x i32> %x) { define i8 @add_nsw_signbit(i8 %x) { ; CHECK-LABEL: @add_nsw_signbit( -; CHECK-NEXT: [[Y:%.*]] = or i8 [[X:%.*]], -128 +; CHECK-NEXT: [[Y:%.*]] = or disjoint i8 [[X:%.*]], -128 ; CHECK-NEXT: ret i8 [[Y]] ; %y = add nsw i8 %x, -128 @@ -944,7 +944,7 @@ define i8 @add_nsw_signbit(i8 %x) { define i8 @add_nuw_signbit(i8 %x) { ; CHECK-LABEL: @add_nuw_signbit( -; CHECK-NEXT: [[Y:%.*]] = or i8 [[X:%.*]], -128 +; CHECK-NEXT: [[Y:%.*]] = or disjoint i8 [[X:%.*]], -128 ; CHECK-NEXT: ret i8 [[Y]] ; %y = add nuw i8 %x, 128 From 912176016cfe18b6a848f0999e4aa94b6e323572 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 11 May 2026 09:27:24 -0500 Subject: [PATCH 275/538] [lldb] Release output lock across blocking el_wgetc in DisplayCompletions (#196686) DisplayCompletions held m_output_stream_sp->Lock() across the blocking el_wgetc() call used by the "More (Y/n/a)" pager. Because the lock is a recursive_mutex, this worked when Editline::Interrupt() ran on the same thread (the synchronous SIGINT handler), but deadlocks when Interrupt() runs on another thread: it blocks on the lock and can never call InterruptRead() to wake the editor thread. Mirror the pattern already used by Editline::GetCharacter: drop the lock across the blocking read and reacquire it afterward. The status check and the "^C\n" / "\n" prints stay under the lock. 
--- lldb/source/Host/common/Editline.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 39b0a649a7f60..833516b0b2c2d 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -1054,9 +1054,10 @@ void Editline::DisplayCompletions( Editline &editline, llvm::ArrayRef results) { assert(!results.empty()); - LockedStreamFile locked_stream = editline.m_output_stream_sp->Lock(); + std::optional locked_stream = + editline.m_output_stream_sp->Lock(); - fprintf(locked_stream.GetFile().GetStream(), + fprintf(locked_stream->GetFile().GetStream(), "\n" ANSI_CLEAR_BELOW "Available completions:\n"); /// Account for the current line, the line showing "Available completions" @@ -1075,14 +1076,20 @@ void Editline::DisplayCompletions( size_t cur_pos = 0; while (cur_pos < results.size()) { cur_pos += PrintCompletion( - locked_stream.GetFile().GetStream(), results.slice(cur_pos), max_len, + locked_stream->GetFile().GetStream(), results.slice(cur_pos), max_len, editline.GetTerminalWidth(), all ? std::nullopt : std::optional(page_size)); if (cur_pos >= results.size()) break; - fprintf(locked_stream.GetFile().GetStream(), "More (Y/n/a): "); + fprintf(locked_stream->GetFile().GetStream(), "More (Y/n/a): "); + + // Release the output lock across the blocking el_wgetc() so that + // Interrupt(), which may run on another thread, can acquire it to wake + // up the read. + locked_stream.reset(); + // The type for the output and the type for the parameter are different, // to allow interoperability with older versions of libedit. 
The container // for the reply must be as wide as what our implementation is using, @@ -1091,14 +1098,17 @@ void Editline::DisplayCompletions( EditLineGetCharType reply = L'n'; int got_char = el_wgetc(editline.m_editline, reinterpret_cast(&reply)); + + locked_stream.emplace(editline.m_output_stream_sp->Lock()); + // Check for a ^C or other interruption. if (editline.m_editor_status == EditorStatus::Interrupted) { editline.m_editor_status = EditorStatus::Editing; - fprintf(locked_stream.GetFile().GetStream(), "^C\n"); + fprintf(locked_stream->GetFile().GetStream(), "^C\n"); break; } - fprintf(locked_stream.GetFile().GetStream(), "\n"); + fprintf(locked_stream->GetFile().GetStream(), "\n"); if (got_char == -1 || reply == 'n') break; if (reply == 'a') From 30bda9f7c8f305bc8a16de3aaee71fc16a73e689 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 11 May 2026 16:31:59 +0200 Subject: [PATCH 276/538] [clang][bytecode] Check value-dependency before calling evaluateValue() (#196931) As always. --- clang/lib/AST/ByteCode/Compiler.cpp | 5 +++-- clang/test/AST/ByteCode/c.c | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index beaeed09005b9..ed464dbfadf71 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -7698,8 +7698,9 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { // For C. 
if (!Ctx.getLangOpts().CPlusPlus) { - if (VD->getInit() && DeclType.isConstant(Ctx.getASTContext()) && - !VD->isWeak() && VD->evaluateValue()) + if (VD->getInit() && !VD->getInit()->isValueDependent() && + DeclType.isConstant(Ctx.getASTContext()) && !VD->isWeak() && + VD->evaluateValue()) return revisit(VD, /*IsConstexprUnknown=*/false); return this->emitDummyPtr(D, E); } diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index 851f28ea77739..e82336e9731ba 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -457,3 +457,12 @@ void labelAndNull(void) { int bar = &*(void *)0 - &&baz; } // all-error {{use of void nonNumberRem(void) { *((int *)0) = (long)foo % 42; } // all-warning {{indirection of non-volatile null pointer will be deleted, not trap}} \ // all-note {{consider using __builtin_trap() or qualifying pointer with 'volatile'}} + +struct Oops { + int a; // all-note {{previous declaration is here}} + double a; // all-error {{duplicate member 'a'}} +}; +void evaluatevalue(void) { + const struct Oops s = {0, 0.}; + *(int *)(&s.a) = 42; // all-warning {{cast from 'const int *' to 'int *' drops const qualifier}} +} From a4c4836c1bb6bf5630756bfc010e76776d8a7668 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 11 May 2026 07:33:50 -0700 Subject: [PATCH 277/538] [NFC][LLVM][IRTests] Namespace cleanup (#196806) Remove llvm namespace surrounding entire .cpp files and instead use `using namespace` in these files. 
--- llvm/unittests/IR/BasicBlockTest.cpp | 4 ++-- llvm/unittests/IR/ConstantsTest.cpp | 4 ++-- llvm/unittests/IR/DroppedVariableStatsIRTest.cpp | 3 --- llvm/unittests/IR/InstructionsTest.cpp | 6 +++--- llvm/unittests/IR/TimePassesTest.cpp | 9 +++++---- llvm/unittests/IR/VerifierTest.cpp | 3 +-- 6 files changed, 13 insertions(+), 16 deletions(-) diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index d1824ba837843..0f5497abaa9c0 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -21,7 +21,8 @@ #include "gtest/gtest.h" #include -namespace llvm { +using namespace llvm; + namespace { TEST(BasicBlockTest, PhiRange) { @@ -546,4 +547,3 @@ TEST(BasicBlockTest, DiscardValueNames2) { } } // End anonymous namespace. -} // End llvm namespace. diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp index b97d38a7b37ad..6716ec581bd06 100644 --- a/llvm/unittests/IR/ConstantsTest.cpp +++ b/llvm/unittests/IR/ConstantsTest.cpp @@ -19,7 +19,8 @@ #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" -namespace llvm { +using namespace llvm; + namespace { // Check that use count checks treat ConstantData like they have no uses. 
@@ -920,4 +921,3 @@ TEST(ConstantsTest, ToConstantRangeConstantByteVector) { } } // end anonymous namespace -} // end namespace llvm diff --git a/llvm/unittests/IR/DroppedVariableStatsIRTest.cpp b/llvm/unittests/IR/DroppedVariableStatsIRTest.cpp index 20c66b93fcb1a..03e6f429e14f8 100644 --- a/llvm/unittests/IR/DroppedVariableStatsIRTest.cpp +++ b/llvm/unittests/IR/DroppedVariableStatsIRTest.cpp @@ -24,8 +24,6 @@ #include using namespace llvm; -namespace llvm { -void initializePassTest1Pass(PassRegistry &); static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; @@ -34,7 +32,6 @@ static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { Err.print("AbstractCallSiteTests", errs()); return Mod; } -} // namespace llvm namespace { diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp index b01569d216676..1c4361e4c0f20 100644 --- a/llvm/unittests/IR/InstructionsTest.cpp +++ b/llvm/unittests/IR/InstructionsTest.cpp @@ -33,8 +33,7 @@ #include "gtest/gtest.h" #include -namespace llvm { -namespace { +using namespace llvm; static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; @@ -44,6 +43,8 @@ static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { return Mod; } +namespace { + TEST(InstructionsTest, ReturnInst) { LLVMContext C; @@ -2011,4 +2012,3 @@ TEST(InstructionsTest, StripAndAccumulateConstantOffset) { } } // end anonymous namespace -} // end namespace llvm diff --git a/llvm/unittests/IR/TimePassesTest.cpp b/llvm/unittests/IR/TimePassesTest.cpp index 1faaf68f80a67..d5649427851f5 100644 --- a/llvm/unittests/IR/TimePassesTest.cpp +++ b/llvm/unittests/IR/TimePassesTest.cpp @@ -29,6 +29,8 @@ namespace llvm { void initializePass1Pass(PassRegistry &); void initializePass2Pass(PassRegistry &); +} // namespace llvm + namespace { struct Pass1 : public ModulePass { static char ID; @@ -56,7 +58,6 @@ struct Pass2 : public ModulePass { }; char Pass2::ID; } // namespace -} // 
namespace llvm INITIALIZE_PASS(Pass1, "Pass1", "Pass1", false, false) INITIALIZE_PASS(Pass2, "Pass2", "Pass2", false, false) @@ -75,8 +76,8 @@ TEST(TimePassesTest, LegacyCustomOut) { // Setup pass manager legacy::PassManager PM1; - PM1.add(new llvm::Pass1()); - PM1.add(new llvm::Pass2()); + PM1.add(new Pass1()); + PM1.add(new Pass2()); // Enable time-passes and run passes. TimePassesIsEnabled = true; @@ -100,7 +101,7 @@ TEST(TimePassesTest, LegacyCustomOut) { // Now run just a single pass to populate timers again. legacy::PassManager PM2; - PM2.add(new llvm::Pass2()); + PM2.add(new Pass2()); PM2.run(M); // Generate report again. diff --git a/llvm/unittests/IR/VerifierTest.cpp b/llvm/unittests/IR/VerifierTest.cpp index 1e31fc5e06f65..529ab56f25247 100644 --- a/llvm/unittests/IR/VerifierTest.cpp +++ b/llvm/unittests/IR/VerifierTest.cpp @@ -21,7 +21,7 @@ #include "llvm/IR/Module.h" #include "gtest/gtest.h" -namespace llvm { +using namespace llvm; namespace { TEST(VerifierTest, Branch_i1) { @@ -538,4 +538,3 @@ TEST(VerifierTest, DeeplyNested) { } } // end anonymous namespace -} // end namespace llvm From 693ad7e4f78697bce9bc9823615b19530e6d7a4e Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 11 May 2026 07:34:39 -0700 Subject: [PATCH 278/538] [NFC][LLVM] Namespace cleanup in LegacyPassManagerTest (#196811) Remove llvm namespace surrounding the entire file, and remove extra indentation of the code within the anonymous namespace. 
--- llvm/unittests/IR/LegacyPassManagerTest.cpp | 1129 +++++++++---------- 1 file changed, 562 insertions(+), 567 deletions(-) diff --git a/llvm/unittests/IR/LegacyPassManagerTest.cpp b/llvm/unittests/IR/LegacyPassManagerTest.cpp index cf455fc0a3028..3a3982763bb06 100644 --- a/llvm/unittests/IR/LegacyPassManagerTest.cpp +++ b/llvm/unittests/IR/LegacyPassManagerTest.cpp @@ -39,617 +39,612 @@ using namespace llvm; namespace llvm { - void initializeModuleNDMPass(PassRegistry&); - void initializeFPassPass(PassRegistry&); - void initializeCGPassPass(PassRegistry&); - void initializeLPassPass(PassRegistry&); - - namespace { - // ND = no deps - // NM = no modifications - struct ModuleNDNM: public ModulePass { - public: - static char run; - static char ID; - ModuleNDNM() : ModulePass(ID) { } - bool runOnModule(Module &M) override { - run++; - return false; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - } - }; - char ModuleNDNM::ID=0; - char ModuleNDNM::run=0; - - struct ModuleNDM : public ModulePass { - public: - static char run; - static char ID; - ModuleNDM() : ModulePass(ID) {} - bool runOnModule(Module &M) override { - run++; - return true; - } - }; - char ModuleNDM::ID=0; - char ModuleNDM::run=0; - - struct ModuleNDM2 : public ModulePass { - public: - static char run; - static char ID; - ModuleNDM2() : ModulePass(ID) {} - bool runOnModule(Module &M) override { - run++; - return true; - } - }; - char ModuleNDM2::ID=0; - char ModuleNDM2::run=0; - - struct ModuleDNM : public ModulePass { - public: - static char run; - static char ID; - ModuleDNM() : ModulePass(ID) { - initializeModuleNDMPass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override { - run++; - return false; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesAll(); - } - }; - char ModuleDNM::ID=0; - char ModuleDNM::run=0; - - template - struct PassTestBase : public P { - protected: - static 
int runc; - static bool initialized; - static bool finalized; - int allocated; - void run() { - EXPECT_TRUE(initialized); - EXPECT_FALSE(finalized); - EXPECT_EQ(0, allocated); - allocated++; - runc++; - } - public: - static char ID; - static void finishedOK(int run) { - EXPECT_GT(runc, 0); - EXPECT_TRUE(initialized); - EXPECT_TRUE(finalized); - EXPECT_EQ(run, runc); - } - PassTestBase() : P(ID), allocated(0) { - initialized = false; - finalized = false; - runc = 0; - } +void initializeModuleNDMPass(PassRegistry &); +void initializeFPassPass(PassRegistry &); +void initializeCGPassPass(PassRegistry &); +void initializeLPassPass(PassRegistry &); +} // namespace llvm + +namespace { +// ND = no deps +// NM = no modifications +struct ModuleNDNM : public ModulePass { +public: + static char run; + static char ID; + ModuleNDNM() : ModulePass(ID) {} + bool runOnModule(Module &M) override { + run++; + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; +char ModuleNDNM::ID = 0; +char ModuleNDNM::run = 0; + +struct ModuleNDM : public ModulePass { +public: + static char run; + static char ID; + ModuleNDM() : ModulePass(ID) {} + bool runOnModule(Module &M) override { + run++; + return true; + } +}; +char ModuleNDM::ID = 0; +char ModuleNDM::run = 0; + +struct ModuleNDM2 : public ModulePass { +public: + static char run; + static char ID; + ModuleNDM2() : ModulePass(ID) {} + bool runOnModule(Module &M) override { + run++; + return true; + } +}; +char ModuleNDM2::ID = 0; +char ModuleNDM2::run = 0; + +struct ModuleDNM : public ModulePass { +public: + static char run; + static char ID; + ModuleDNM() : ModulePass(ID) { + initializeModuleNDMPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + run++; + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesAll(); + } +}; +char ModuleDNM::ID = 0; +char ModuleDNM::run = 0; + +template 
struct PassTestBase : public P { +protected: + static int runc; + static bool initialized; + static bool finalized; + int allocated; + void run() { + EXPECT_TRUE(initialized); + EXPECT_FALSE(finalized); + EXPECT_EQ(0, allocated); + allocated++; + runc++; + } - void releaseMemory() override { - EXPECT_GT(runc, 0); - EXPECT_GT(allocated, 0); - allocated--; - } - }; - template char PassTestBase

::ID; - template int PassTestBase

::runc; - template bool PassTestBase

::initialized; - template bool PassTestBase

::finalized; - - template - struct PassTest : public PassTestBase

{ - public: +public: + static char ID; + static void finishedOK(int run) { + EXPECT_GT(runc, 0); + EXPECT_TRUE(initialized); + EXPECT_TRUE(finalized); + EXPECT_EQ(run, runc); + } + PassTestBase() : P(ID), allocated(0) { + initialized = false; + finalized = false; + runc = 0; + } + + void releaseMemory() override { + EXPECT_GT(runc, 0); + EXPECT_GT(allocated, 0); + allocated--; + } +}; +template char PassTestBase

::ID; +template int PassTestBase

::runc; +template bool PassTestBase

::initialized; +template bool PassTestBase

::finalized; + +template struct PassTest : public PassTestBase

{ +public: #ifndef _MSC_VER // MSVC complains that Pass is not base class. - using llvm::Pass::doInitialization; - using llvm::Pass::doFinalization; + using llvm::Pass::doFinalization; + using llvm::Pass::doInitialization; #endif - bool doInitialization(T &t) override { - EXPECT_FALSE(PassTestBase

::initialized); - PassTestBase

::initialized = true; - return false; - } - bool doFinalization(T &t) override { - EXPECT_FALSE(PassTestBase

::finalized); - PassTestBase

::finalized = true; - EXPECT_EQ(0, PassTestBase

::allocated); - return false; - } - }; - - struct CGPass : public PassTest { - public: - CGPass() { - initializeCGPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnSCC(CallGraphSCC &SCMM) override { - run(); - return false; - } - }; - - struct FPass : public PassTest { - public: - bool runOnFunction(Function &F) override { - // FIXME: PR4112 - // EXPECT_TRUE(getAnalysisIfAvailable()); - run(); - return false; - } - }; - - struct LPass : public PassTestBase { - private: - static int initcount; - static int fincount; - public: - LPass() { - initializeLPassPass(*PassRegistry::getPassRegistry()); - initcount = 0; fincount=0; - EXPECT_FALSE(initialized); - } - static void finishedOK(int run, int finalized) { - PassTestBase::finishedOK(run); - EXPECT_EQ(run, initcount); - EXPECT_EQ(finalized, fincount); - } - using llvm::Pass::doInitialization; - using llvm::Pass::doFinalization; - bool doInitialization(Loop* L, LPPassManager &LPM) override { - initialized = true; - initcount++; - return false; - } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - run(); - return false; - } - bool doFinalization() override { - fincount++; - finalized = true; - return false; - } - }; - int LPass::initcount=0; - int LPass::fincount=0; - - struct OnTheFlyTest: public ModulePass { - public: - static char ID; - OnTheFlyTest() : ModulePass(ID) { - initializeFPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override { - for (Module::iterator I=M.begin(),E=M.end(); I != E; ++I) { - Function &F = *I; - { - SCOPED_TRACE("Running on the fly function pass"); - getAnalysis(F); - } - } - return false; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + bool doInitialization(T &t) override { + EXPECT_FALSE(PassTestBase

::initialized); + PassTestBase

::initialized = true; + return false; + } + bool doFinalization(T &t) override { + EXPECT_FALSE(PassTestBase

::finalized); + PassTestBase

::finalized = true; + EXPECT_EQ(0, PassTestBase

::allocated); + return false; + } +}; + +struct CGPass : public PassTest { +public: + CGPass() { initializeCGPassPass(*PassRegistry::getPassRegistry()); } + bool runOnSCC(CallGraphSCC &SCMM) override { + run(); + return false; + } +}; + +struct FPass : public PassTest { +public: + bool runOnFunction(Function &F) override { + // FIXME: PR4112 + // EXPECT_TRUE(getAnalysisIfAvailable()); + run(); + return false; + } +}; + +struct LPass : public PassTestBase { +private: + static int initcount; + static int fincount; + +public: + LPass() { + initializeLPassPass(*PassRegistry::getPassRegistry()); + initcount = 0; + fincount = 0; + EXPECT_FALSE(initialized); + } + static void finishedOK(int run, int finalized) { + PassTestBase::finishedOK(run); + EXPECT_EQ(run, initcount); + EXPECT_EQ(finalized, fincount); + } + using llvm::Pass::doFinalization; + using llvm::Pass::doInitialization; + bool doInitialization(Loop *L, LPPassManager &LPM) override { + initialized = true; + initcount++; + return false; + } + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + run(); + return false; + } + bool doFinalization() override { + fincount++; + finalized = true; + return false; + } +}; +int LPass::initcount = 0; +int LPass::fincount = 0; + +struct OnTheFlyTest : public ModulePass { +public: + static char ID; + OnTheFlyTest() : ModulePass(ID) { + initializeFPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + { + SCOPED_TRACE("Running on the fly function pass"); + getAnalysis(F); } - }; - char OnTheFlyTest::ID=0; - - TEST(PassManager, RunOnce) { - LLVMContext Context; - Module M("test-once", Context); - struct ModuleNDNM *mNDNM = new ModuleNDNM(); - struct ModuleDNM *mDNM = new ModuleDNM(); - struct ModuleNDM *mNDM = new ModuleNDM(); - struct ModuleNDM2 *mNDM2 = new ModuleNDM2(); - - mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0; - - legacy::PassManager 
Passes; - Passes.add(mNDM2); - Passes.add(mNDM); - Passes.add(mNDNM); - Passes.add(mDNM); - - Passes.run(M); - // each pass must be run exactly once, since nothing invalidates them - EXPECT_EQ(1, mNDM->run); - EXPECT_EQ(1, mNDNM->run); - EXPECT_EQ(1, mDNM->run); - EXPECT_EQ(1, mNDM2->run); } + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } +}; +char OnTheFlyTest::ID = 0; + +TEST(PassManager, RunOnce) { + LLVMContext Context; + Module M("test-once", Context); + struct ModuleNDNM *mNDNM = new ModuleNDNM(); + struct ModuleDNM *mDNM = new ModuleDNM(); + struct ModuleNDM *mNDM = new ModuleNDM(); + struct ModuleNDM2 *mNDM2 = new ModuleNDM2(); + + mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0; + + legacy::PassManager Passes; + Passes.add(mNDM2); + Passes.add(mNDM); + Passes.add(mNDNM); + Passes.add(mDNM); + + Passes.run(M); + // each pass must be run exactly once, since nothing invalidates them + EXPECT_EQ(1, mNDM->run); + EXPECT_EQ(1, mNDNM->run); + EXPECT_EQ(1, mDNM->run); + EXPECT_EQ(1, mNDM2->run); +} - TEST(PassManager, ReRun) { - LLVMContext Context; - Module M("test-rerun", Context); - struct ModuleNDNM *mNDNM = new ModuleNDNM(); - struct ModuleDNM *mDNM = new ModuleDNM(); - struct ModuleNDM *mNDM = new ModuleNDM(); - struct ModuleNDM2 *mNDM2 = new ModuleNDM2(); - - mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0; - - legacy::PassManager Passes; - Passes.add(mNDM); - Passes.add(mNDNM); - Passes.add(mNDM2);// invalidates mNDM needed by mDNM - Passes.add(mDNM); - - Passes.run(M); - // Some passes must be rerun because a pass that modified the - // module/function was run in between - EXPECT_EQ(2, mNDM->run); - EXPECT_EQ(1, mNDNM->run); - EXPECT_EQ(1, mNDM2->run); - EXPECT_EQ(1, mDNM->run); - } +TEST(PassManager, ReRun) { + LLVMContext Context; + Module M("test-rerun", Context); + struct ModuleNDNM *mNDNM = new ModuleNDNM(); + struct ModuleDNM *mDNM = new ModuleDNM(); + struct ModuleNDM *mNDM = new 
ModuleNDM(); + struct ModuleNDM2 *mNDM2 = new ModuleNDM2(); + + mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0; + + legacy::PassManager Passes; + Passes.add(mNDM); + Passes.add(mNDNM); + Passes.add(mNDM2); // invalidates mNDM needed by mDNM + Passes.add(mDNM); + + Passes.run(M); + // Some passes must be rerun because a pass that modified the + // module/function was run in between + EXPECT_EQ(2, mNDM->run); + EXPECT_EQ(1, mNDNM->run); + EXPECT_EQ(1, mNDM2->run); + EXPECT_EQ(1, mDNM->run); +} - Module *makeLLVMModule(LLVMContext &Context); - - template - void MemoryTestHelper(int run) { - LLVMContext Context; - std::unique_ptr M(makeLLVMModule(Context)); - T *P = new T(); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); - T::finishedOK(run); - } +Module *makeLLVMModule(LLVMContext &Context); - template - void MemoryTestHelper(int run, int N) { - LLVMContext Context; - Module *M = makeLLVMModule(Context); - T *P = new T(); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); - T::finishedOK(run, N); - delete M; - } +template void MemoryTestHelper(int run) { + LLVMContext Context; + std::unique_ptr M(makeLLVMModule(Context)); + T *P = new T(); + legacy::PassManager Passes; + Passes.add(P); + Passes.run(*M); + T::finishedOK(run); +} - TEST(PassManager, Memory) { - // SCC#1: test1->test2->test3->test1 - // SCC#2: test4 - // SCC#3: indirect call node - { - SCOPED_TRACE("Callgraph pass"); - MemoryTestHelper(3); - } +template void MemoryTestHelper(int run, int N) { + LLVMContext Context; + Module *M = makeLLVMModule(Context); + T *P = new T(); + legacy::PassManager Passes; + Passes.add(P); + Passes.run(*M); + T::finishedOK(run, N); + delete M; +} - { - SCOPED_TRACE("Function pass"); - MemoryTestHelper(4);// 4 functions - } +TEST(PassManager, Memory) { + // SCC#1: test1->test2->test3->test1 + // SCC#2: test4 + // SCC#3: indirect call node + { + SCOPED_TRACE("Callgraph pass"); + MemoryTestHelper(3); + } - { - SCOPED_TRACE("Loop pass"); - 
MemoryTestHelper(2, 1); //2 loops, 1 function - } + { + SCOPED_TRACE("Function pass"); + MemoryTestHelper(4); // 4 functions + } - } + { + SCOPED_TRACE("Loop pass"); + MemoryTestHelper(2, 1); // 2 loops, 1 function + } +} - TEST(PassManager, MemoryOnTheFly) { - LLVMContext Context; - Module *M = makeLLVMModule(Context); - { - SCOPED_TRACE("Running OnTheFlyTest"); - struct OnTheFlyTest *O = new OnTheFlyTest(); - legacy::PassManager Passes; - Passes.add(O); - Passes.run(*M); +TEST(PassManager, MemoryOnTheFly) { + LLVMContext Context; + Module *M = makeLLVMModule(Context); + { + SCOPED_TRACE("Running OnTheFlyTest"); + struct OnTheFlyTest *O = new OnTheFlyTest(); + legacy::PassManager Passes; + Passes.add(O); + Passes.run(*M); + + FPass::finishedOK(4); + } + delete M; +} - FPass::finishedOK(4); - } - delete M; - } +// Skips or runs optional passes. +struct CustomOptPassGate : public OptPassGate { + bool Skip; + CustomOptPassGate(bool Skip) : Skip(Skip) {} + bool shouldRunPass(StringRef PassName, + StringRef IRDescription) const override { + return !Skip; + } + bool isEnabled() const override { return true; } +}; + +// Optional module pass. +struct ModuleOpt : public ModulePass { + char run = 0; + static char ID; + ModuleOpt() : ModulePass(ID) {} + bool runOnModule(Module &M) override { + if (!skipModule(M)) + run++; + return false; + } +}; +char ModuleOpt::ID = 0; - // Skips or runs optional passes. - struct CustomOptPassGate : public OptPassGate { - bool Skip; - CustomOptPassGate(bool Skip) : Skip(Skip) { } - bool shouldRunPass(StringRef PassName, - StringRef IRDescription) const override { - return !Skip; - } - bool isEnabled() const override { return true; } - }; - - // Optional module pass. 
- struct ModuleOpt: public ModulePass { - char run = 0; - static char ID; - ModuleOpt() : ModulePass(ID) { } - bool runOnModule(Module &M) override { - if (!skipModule(M)) - run++; - return false; - } - }; - char ModuleOpt::ID=0; +TEST(PassManager, CustomOptPassGate) { + LLVMContext Context0; + LLVMContext Context1; + LLVMContext Context2; + CustomOptPassGate SkipOptionalPasses(true); + CustomOptPassGate RunOptionalPasses(false); - TEST(PassManager, CustomOptPassGate) { - LLVMContext Context0; - LLVMContext Context1; - LLVMContext Context2; - CustomOptPassGate SkipOptionalPasses(true); - CustomOptPassGate RunOptionalPasses(false); + Module M0("custom-opt-bisect", Context0); + Module M1("custom-opt-bisect", Context1); + Module M2("custom-opt-bisect2", Context2); + struct ModuleOpt *mOpt0 = new ModuleOpt(); + struct ModuleOpt *mOpt1 = new ModuleOpt(); + struct ModuleOpt *mOpt2 = new ModuleOpt(); - Module M0("custom-opt-bisect", Context0); - Module M1("custom-opt-bisect", Context1); - Module M2("custom-opt-bisect2", Context2); - struct ModuleOpt *mOpt0 = new ModuleOpt(); - struct ModuleOpt *mOpt1 = new ModuleOpt(); - struct ModuleOpt *mOpt2 = new ModuleOpt(); + mOpt0->run = mOpt1->run = mOpt2->run = 0; - mOpt0->run = mOpt1->run = mOpt2->run = 0; + legacy::PassManager Passes0; + legacy::PassManager Passes1; + legacy::PassManager Passes2; - legacy::PassManager Passes0; - legacy::PassManager Passes1; - legacy::PassManager Passes2; + Passes0.add(mOpt0); + Passes1.add(mOpt1); + Passes2.add(mOpt2); - Passes0.add(mOpt0); - Passes1.add(mOpt1); - Passes2.add(mOpt2); + Context1.setOptPassGate(SkipOptionalPasses); + Context2.setOptPassGate(RunOptionalPasses); - Context1.setOptPassGate(SkipOptionalPasses); - Context2.setOptPassGate(RunOptionalPasses); + Passes0.run(M0); + Passes1.run(M1); + Passes2.run(M2); - Passes0.run(M0); - Passes1.run(M1); - Passes2.run(M2); + // By default optional passes are run. + EXPECT_EQ(1, mOpt0->run); - // By default optional passes are run. 
- EXPECT_EQ(1, mOpt0->run); + // The first context skips optional passes. + EXPECT_EQ(0, mOpt1->run); - // The first context skips optional passes. - EXPECT_EQ(0, mOpt1->run); + // The second context runs optional passes. + EXPECT_EQ(1, mOpt2->run); +} - // The second context runs optional passes. - EXPECT_EQ(1, mOpt2->run); - } +Module *makeLLVMModule(LLVMContext &Context) { + // Module Construction + Module *mod = new Module("test-mem", Context); + mod->setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-" + "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-" + "a:0:64-s:64:64-f80:128:128"); + mod->setTargetTriple(Triple("x86_64-unknown-linux-gnu")); + + // Type Definitions + std::vector FuncTy_0_args; + FunctionType *FuncTy_0 = FunctionType::get( + /*Result=*/IntegerType::get(Context, 32), + /*Params=*/FuncTy_0_args, + /*isVarArg=*/false); + + std::vector FuncTy_2_args; + FuncTy_2_args.push_back(IntegerType::get(Context, 1)); + FunctionType *FuncTy_2 = FunctionType::get( + /*Result=*/Type::getVoidTy(Context), + /*Params=*/FuncTy_2_args, + /*isVarArg=*/false); + + // Function Declarations + + Function *func_test1 = Function::Create( + /*Type=*/FuncTy_0, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"test1", mod); + func_test1->setCallingConv(CallingConv::C); + AttributeList func_test1_PAL; + func_test1->setAttributes(func_test1_PAL); + + Function *func_test2 = Function::Create( + /*Type=*/FuncTy_0, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"test2", mod); + func_test2->setCallingConv(CallingConv::C); + AttributeList func_test2_PAL; + func_test2->setAttributes(func_test2_PAL); + + Function *func_test3 = Function::Create( + /*Type=*/FuncTy_0, + /*Linkage=*/GlobalValue::InternalLinkage, + /*Name=*/"test3", mod); + func_test3->setCallingConv(CallingConv::C); + AttributeList func_test3_PAL; + func_test3->setAttributes(func_test3_PAL); + + Function *func_test4 = Function::Create( + /*Type=*/FuncTy_2, + 
/*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"test4", mod); + func_test4->setCallingConv(CallingConv::C); + AttributeList func_test4_PAL; + func_test4->setAttributes(func_test4_PAL); + + // Global Variable Declarations + + // Constant Definitions + + // Global Variable Definitions + + // Function Definitions + + // Function: test1 (func_test1) + { + + BasicBlock *label_entry = + BasicBlock::Create(Context, "entry", func_test1, nullptr); + + // Block entry (label_entry) + CallInst *int32_3 = CallInst::Create(func_test2, "", label_entry); + int32_3->setCallingConv(CallingConv::C); + int32_3->setTailCall(false); + AttributeList int32_3_PAL; + int32_3->setAttributes(int32_3_PAL); + + ReturnInst::Create(Context, int32_3, label_entry); + } - Module *makeLLVMModule(LLVMContext &Context) { - // Module Construction - Module *mod = new Module("test-mem", Context); - mod->setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-" - "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-" - "a:0:64-s:64:64-f80:128:128"); - mod->setTargetTriple(Triple("x86_64-unknown-linux-gnu")); - - // Type Definitions - std::vectorFuncTy_0_args; - FunctionType *FuncTy_0 = FunctionType::get( - /*Result=*/IntegerType::get(Context, 32), - /*Params=*/FuncTy_0_args, - /*isVarArg=*/false); - - std::vectorFuncTy_2_args; - FuncTy_2_args.push_back(IntegerType::get(Context, 1)); - FunctionType *FuncTy_2 = FunctionType::get( - /*Result=*/Type::getVoidTy(Context), - /*Params=*/FuncTy_2_args, - /*isVarArg=*/false); - - // Function Declarations - - Function* func_test1 = Function::Create( - /*Type=*/FuncTy_0, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"test1", mod); - func_test1->setCallingConv(CallingConv::C); - AttributeList func_test1_PAL; - func_test1->setAttributes(func_test1_PAL); - - Function* func_test2 = Function::Create( - /*Type=*/FuncTy_0, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"test2", mod); - func_test2->setCallingConv(CallingConv::C); - AttributeList 
func_test2_PAL; - func_test2->setAttributes(func_test2_PAL); - - Function* func_test3 = Function::Create( - /*Type=*/FuncTy_0, - /*Linkage=*/GlobalValue::InternalLinkage, - /*Name=*/"test3", mod); - func_test3->setCallingConv(CallingConv::C); - AttributeList func_test3_PAL; - func_test3->setAttributes(func_test3_PAL); - - Function* func_test4 = Function::Create( - /*Type=*/FuncTy_2, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"test4", mod); - func_test4->setCallingConv(CallingConv::C); - AttributeList func_test4_PAL; - func_test4->setAttributes(func_test4_PAL); - - // Global Variable Declarations - - - // Constant Definitions - - // Global Variable Definitions - - // Function Definitions - - // Function: test1 (func_test1) - { + // Function: test2 (func_test2) + { - BasicBlock *label_entry = - BasicBlock::Create(Context, "entry", func_test1, nullptr); + BasicBlock *label_entry_5 = + BasicBlock::Create(Context, "entry", func_test2, nullptr); - // Block entry (label_entry) - CallInst* int32_3 = CallInst::Create(func_test2, "", label_entry); - int32_3->setCallingConv(CallingConv::C); - int32_3->setTailCall(false); - AttributeList int32_3_PAL; - int32_3->setAttributes(int32_3_PAL); + // Block entry (label_entry_5) + CallInst *int32_6 = CallInst::Create(func_test3, "", label_entry_5); + int32_6->setCallingConv(CallingConv::C); + int32_6->setTailCall(false); + AttributeList int32_6_PAL; + int32_6->setAttributes(int32_6_PAL); - ReturnInst::Create(Context, int32_3, label_entry); - } + ReturnInst::Create(Context, int32_6, label_entry_5); + } - // Function: test2 (func_test2) - { + // Function: test3 (func_test3) + { - BasicBlock *label_entry_5 = - BasicBlock::Create(Context, "entry", func_test2, nullptr); + BasicBlock *label_entry_8 = + BasicBlock::Create(Context, "entry", func_test3, nullptr); - // Block entry (label_entry_5) - CallInst* int32_6 = CallInst::Create(func_test3, "", label_entry_5); - int32_6->setCallingConv(CallingConv::C); - 
int32_6->setTailCall(false); - AttributeList int32_6_PAL; - int32_6->setAttributes(int32_6_PAL); + // Block entry (label_entry_8) + CallInst *int32_9 = CallInst::Create(func_test1, "", label_entry_8); + int32_9->setCallingConv(CallingConv::C); + int32_9->setTailCall(false); + AttributeList int32_9_PAL; + int32_9->setAttributes(int32_9_PAL); - ReturnInst::Create(Context, int32_6, label_entry_5); - } + ReturnInst::Create(Context, int32_9, label_entry_8); + } - // Function: test3 (func_test3) - { + // Function: test4 (func_test4) + { + Function::arg_iterator args = func_test4->arg_begin(); + Value *int1_f = &*args++; + int1_f->setName("f"); + + BasicBlock *label_entry_11 = + BasicBlock::Create(Context, "entry", func_test4, nullptr); + BasicBlock *label_bb = + BasicBlock::Create(Context, "bb", func_test4, nullptr); + BasicBlock *label_bb1 = + BasicBlock::Create(Context, "bb1", func_test4, nullptr); + BasicBlock *label_return = + BasicBlock::Create(Context, "return", func_test4, nullptr); + + // Block entry (label_entry_11) + auto *AI = + new AllocaInst(func_test3->getType(), 0, "func3ptr", label_entry_11); + new StoreInst(func_test3, AI, label_entry_11); + UncondBrInst::Create(label_bb, label_entry_11); + + // Block bb (label_bb) + CondBrInst::Create(int1_f, label_bb, label_bb1, label_bb); + + // Block bb1 (label_bb1) + CondBrInst::Create(int1_f, label_bb1, label_return, label_bb1); + + // Block return (label_return) + ReturnInst::Create(Context, label_return); + } + return mod; +} - BasicBlock *label_entry_8 = - BasicBlock::Create(Context, "entry", func_test3, nullptr); +// Test for call graph SCC pass that replaces all callback call instructions +// with clones and updates CallGraph by calling CallGraph::replaceCallEdge() +// method. Test is expected to complete successfully after running pass on +// all SCCs in the test module. 
+struct CallbackCallsModifierPass : public CGPass { + bool runOnSCC(CallGraphSCC &SCC) override { + CGPass::run(); - // Block entry (label_entry_8) - CallInst* int32_9 = CallInst::Create(func_test1, "", label_entry_8); - int32_9->setCallingConv(CallingConv::C); - int32_9->setTailCall(false); - AttributeList int32_9_PAL; - int32_9->setAttributes(int32_9_PAL); + CallGraph &CG = const_cast(SCC.getCallGraph()); - ReturnInst::Create(Context, int32_9, label_entry_8); - } + bool Changed = false; + for (CallGraphNode *CGN : SCC) { + Function *F = CGN->getFunction(); + if (!F || F->isDeclaration()) + continue; - // Function: test4 (func_test4) - { - Function::arg_iterator args = func_test4->arg_begin(); - Value *int1_f = &*args++; - int1_f->setName("f"); - - BasicBlock *label_entry_11 = - BasicBlock::Create(Context, "entry", func_test4, nullptr); - BasicBlock *label_bb = - BasicBlock::Create(Context, "bb", func_test4, nullptr); - BasicBlock *label_bb1 = - BasicBlock::Create(Context, "bb1", func_test4, nullptr); - BasicBlock *label_return = - BasicBlock::Create(Context, "return", func_test4, nullptr); - - // Block entry (label_entry_11) - auto *AI = new AllocaInst(func_test3->getType(), 0, "func3ptr", - label_entry_11); - new StoreInst(func_test3, AI, label_entry_11); - UncondBrInst::Create(label_bb, label_entry_11); - - // Block bb (label_bb) - CondBrInst::Create(int1_f, label_bb, label_bb1, label_bb); - - // Block bb1 (label_bb1) - CondBrInst::Create(int1_f, label_bb1, label_return, label_bb1); - - // Block return (label_return) - ReturnInst::Create(Context, label_return); + SmallVector Calls; + for (Use &U : F->uses()) { + AbstractCallSite ACS(&U); + if (!ACS || !ACS.isCallbackCall() || !ACS.isCallee(&U)) + continue; + Calls.push_back(cast(ACS.getInstruction())); } - return mod; - } + if (Calls.empty()) + continue; + + for (CallBase *OldCB : Calls) { + CallGraphNode *CallerCGN = CG[OldCB->getParent()->getParent()]; + assert(any_of(*CallerCGN, + [CGN](const 
CallGraphNode::CallRecord &CallRecord) { + return CallRecord.second == CGN; + }) && + "function is not a callee"); + + CallBase *NewCB = cast(OldCB->clone()); + + NewCB->insertBefore(OldCB->getIterator()); + NewCB->takeName(OldCB); + + CallerCGN->replaceCallEdge(*OldCB, *NewCB, CG[F]); - // Test for call graph SCC pass that replaces all callback call instructions - // with clones and updates CallGraph by calling CallGraph::replaceCallEdge() - // method. Test is expected to complete successfully after running pass on - // all SCCs in the test module. - struct CallbackCallsModifierPass : public CGPass { - bool runOnSCC(CallGraphSCC &SCC) override { - CGPass::run(); - - CallGraph &CG = const_cast(SCC.getCallGraph()); - - bool Changed = false; - for (CallGraphNode *CGN : SCC) { - Function *F = CGN->getFunction(); - if (!F || F->isDeclaration()) - continue; - - SmallVector Calls; - for (Use &U : F->uses()) { - AbstractCallSite ACS(&U); - if (!ACS || !ACS.isCallbackCall() || !ACS.isCallee(&U)) - continue; - Calls.push_back(cast(ACS.getInstruction())); - } - if (Calls.empty()) - continue; - - for (CallBase *OldCB : Calls) { - CallGraphNode *CallerCGN = CG[OldCB->getParent()->getParent()]; - assert(any_of(*CallerCGN, - [CGN](const CallGraphNode::CallRecord &CallRecord) { - return CallRecord.second == CGN; - }) && - "function is not a callee"); - - CallBase *NewCB = cast(OldCB->clone()); - - NewCB->insertBefore(OldCB->getIterator()); - NewCB->takeName(OldCB); - - CallerCGN->replaceCallEdge(*OldCB, *NewCB, CG[F]); - - OldCB->replaceAllUsesWith(NewCB); - OldCB->eraseFromParent(); - } - Changed = true; - } - return Changed; + OldCB->replaceAllUsesWith(NewCB); + OldCB->eraseFromParent(); } - }; - - TEST(PassManager, CallbackCallsModifier0) { - LLVMContext Context; - - const char *IR = "define void @foo() {\n" - " call void @broker(ptr @callback0, ptr null)\n" - " call void @broker(ptr @callback1, ptr null)\n" - " ret void\n" - "}\n" - "\n" - "declare !callback !0 void 
@broker(ptr, ptr)\n" - "\n" - "define internal void @callback0(ptr %arg) {\n" - " ret void\n" - "}\n" - "\n" - "define internal void @callback1(ptr %arg) {\n" - " ret void\n" - "}\n" - "\n" - "!0 = !{!1}\n" - "!1 = !{i64 0, i64 1, i1 false}"; - - SMDiagnostic Err; - std::unique_ptr M = parseAssemblyString(IR, Err, Context); - if (!M) - Err.print("LegacyPassManagerTest", errs()); - - CallbackCallsModifierPass *P = new CallbackCallsModifierPass(); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); + Changed = true; } + return Changed; } +}; + +TEST(PassManager, CallbackCallsModifier0) { + LLVMContext Context; + + const char *IR = "define void @foo() {\n" + " call void @broker(ptr @callback0, ptr null)\n" + " call void @broker(ptr @callback1, ptr null)\n" + " ret void\n" + "}\n" + "\n" + "declare !callback !0 void @broker(ptr, ptr)\n" + "\n" + "define internal void @callback0(ptr %arg) {\n" + " ret void\n" + "}\n" + "\n" + "define internal void @callback1(ptr %arg) {\n" + " ret void\n" + "}\n" + "\n" + "!0 = !{!1}\n" + "!1 = !{i64 0, i64 1, i1 false}"; + + SMDiagnostic Err; + std::unique_ptr M = parseAssemblyString(IR, Err, Context); + if (!M) + Err.print("LegacyPassManagerTest", errs()); + + CallbackCallsModifierPass *P = new CallbackCallsModifierPass(); + legacy::PassManager Passes; + Passes.add(P); + Passes.run(*M); } +} // namespace INITIALIZE_PASS(ModuleNDM, "mndm", "mndm", false, false) INITIALIZE_PASS_BEGIN(CGPass, "cgp","cgp", false, false) From d2bc8385a11ccdec537684c64706456edc584fbf Mon Sep 17 00:00:00 2001 From: Aleksandr Popov <42888396+aleks-tmb@users.noreply.github.com> Date: Mon, 11 May 2026 16:38:42 +0200 Subject: [PATCH 279/538] [SE][LV] Add test: early-exit loop with umin trip count should vectorize (NFC) (#196942) See https://github.com/llvm/llvm-project/issues/196935 --- .../early-exit-umin-trip-count.ll | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 
llvm/test/Transforms/LoopVectorize/early-exit-umin-trip-count.ll diff --git a/llvm/test/Transforms/LoopVectorize/early-exit-umin-trip-count.ll b/llvm/test/Transforms/LoopVectorize/early-exit-umin-trip-count.ll new file mode 100644 index 0000000000000..36936ec9832a4 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/early-exit-umin-trip-count.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=loop-vectorize %s | FileCheck %s + +; TODO: The loop should be vectorized. The vectorizer needs to prove +; (A umin B) u<= A to establish memory safety from the dereferenceable +; assumption. +; +; long n = umin(count, length); +; for (long i = 0; i < n; i++) +; if (ptr[i] == 0) return null; +; ... +define ptr @main(i32 %length, ptr %ptr, i64 %0) #0 { +; CHECK-LABEL: define ptr @main( +; CHECK-SAME: i32 [[LENGTH:%.*]], ptr [[PTR:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[NULL_CHECK0:%.*]] = icmp eq i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[NULL_CHECK0]], label %[[DEOPT:.*]], label %[[PREHEADER:.*]] +; CHECK: [[PREHEADER]]: +; CHECK-NEXT: [[LENGTH_64:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-NEXT: [[EXIT:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 [[LENGTH_64]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[PTR]], i64 8), "dereferenceable"(ptr [[PTR]], i64 [[LENGTH_64]]) ] +; CHECK-NEXT: [[NULL_CHECK:%.*]] = icmp eq i32 [[LENGTH]], 0 +; CHECK-NEXT: br i1 [[NULL_CHECK]], label %[[DEOPT]], label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[SCALAR_PH]] ] +; CHECK-NEXT: [[ELEMENT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[IV]] +; CHECK-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[ELEMENT_GEP]], align 1 +; CHECK-NEXT: [[FOUND_CHECK:%.*]] = icmp eq i8 [[ELEMENT]], 0 +; CHECK-NEXT: br i1 
[[FOUND_CHECK]], label %[[FOUND:.*]], label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i64 [[IV_NEXT]], [[EXIT]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label %[[LOOP]], label %[[DEOPT_LOOPEXIT:.*]] +; CHECK: [[FOUND]]: +; CHECK-NEXT: ret ptr null +; CHECK: [[DEOPT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[DEOPT]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: unreachable +; +entry: + %null_check0 = icmp eq i64 %0, 0 + br i1 %null_check0, label %deopt, label %preheader + +preheader: ; preds = %entry + %length.64 = zext i32 %length to i64 + %exit = call i64 @llvm.umin.i64(i64 %0, i64 %length.64) + call void @llvm.assume(i1 true) [ "align"(ptr %ptr, i64 8), "dereferenceable"(ptr %ptr, i64 %length.64) ] + %null_check = icmp eq i32 %length, 0 + br i1 %null_check, label %deopt, label %loop + +loop: ; preds = %latch, %preheader + %iv = phi i64 [ %iv.next, %latch ], [ 0, %preheader ] + %element_gep = getelementptr i8, ptr %ptr, i64 %iv + %element = load i8, ptr %element_gep, align 1 + %found_check = icmp eq i8 %element, 0 + br i1 %found_check, label %found, label %latch + +latch: ; preds = %loop + %iv.next = add i64 %iv, 1 + %range_check = icmp ult i64 %iv.next, %exit + br i1 %range_check, label %loop, label %deopt + +found: ; preds = %loop + ret ptr null + +deopt: ; preds = %latch, %preheader + unreachable +} + +declare i64 @llvm.umin.i64(i64, i64) + +declare void @llvm.assume(i1 noundef) + +attributes #0 = { nofree nosync } From cc4922b276947e95aeed779bbf6a1e41f3a96582 Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Mon, 11 May 2026 10:43:19 -0400 Subject: [PATCH 280/538] [flang] Remove legacy (non-HLFIR) lowering and related options (#196205) Drop the user-facing options that selected the legacy non-HLFIR lowering path, the always-true `LowerToHighLevelFIR` lowering option, and the descriptor-discretization debug switch: - `-flang-experimental-hlfir` and `-flang-deprecated-no-hlfir` (flang driver 
and `-fc1`) - `-hlfir` / `--hlfir` (bbc) - `--use-desc-for-alloc` (bbc, debug) Remove every `if (lowerToHighLevelFIR()) { ... } else { ... }` branch in `lib/Lower/`, keeping the HLFIR side. Delete the now-unused legacy helpers in `Bridge.cpp` (`copyVarFIR`, both `genNoHLFIRPointerAssignment` overloads, the legacy block of `genAssignment`) and the always-empty `createMutableProperties` together with its four dead helpers in `Allocatable.cpp`. Drop the corresponding `alwaysUseBox` parameter from `createMutableBox`. Note: tests were modified in https://github.com/llvm/llvm-project/pull/196137 Assisted-by: AI --- clang/include/clang/Options/FlangOptions.td | 12 - clang/lib/Driver/ToolChains/Flang.cpp | 2 - flang/include/flang/Lower/Allocatable.h | 11 +- flang/include/flang/Lower/LoweringOptions.def | 3 - flang/lib/Frontend/CompilerInvocation.cpp | 19 - flang/lib/Frontend/FrontendActions.cpp | 8 +- flang/lib/Lower/Allocatable.cpp | 144 +--- flang/lib/Lower/Bridge.cpp | 681 +++--------------- flang/lib/Lower/CallInterface.cpp | 11 +- flang/lib/Lower/ConvertCall.cpp | 20 +- flang/lib/Lower/ConvertConstant.cpp | 12 - flang/lib/Lower/ConvertType.cpp | 30 +- flang/lib/Lower/ConvertVariable.cpp | 108 +-- flang/lib/Lower/HostAssociations.cpp | 7 +- flang/tools/bbc/bbc.cpp | 7 +- 15 files changed, 160 insertions(+), 915 deletions(-) diff --git a/clang/include/clang/Options/FlangOptions.td b/clang/include/clang/Options/FlangOptions.td index 1ab83b6ffbbad..50e4642358b71 100644 --- a/clang/include/clang/Options/FlangOptions.td +++ b/clang/include/clang/Options/FlangOptions.td @@ -114,18 +114,6 @@ def static_libflangrt : Flag<["-"], "static-libflangrt">, HelpText<"Link the flang-rt static library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; -//===----------------------------------------------------------------------===// -// FlangOption + NoXarchOption -//===----------------------------------------------------------------------===// - -def 
flang_experimental_hlfir : Flag<["-"], "flang-experimental-hlfir">, - Flags<[HelpHidden]>, Visibility<[FlangOption, FC1Option]>, - HelpText<"Use HLFIR lowering (experimental)">; - -def flang_deprecated_no_hlfir : Flag<["-"], "flang-deprecated-no-hlfir">, - Flags<[HelpHidden]>, Visibility<[FlangOption, FC1Option]>, - HelpText<"Do not use HLFIR lowering (deprecated)">; - //===----------------------------------------------------------------------===// // FlangOption + CoreOption + NoXarchOption //===----------------------------------------------------------------------===// diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index ce503b74295e4..082df7beb4b85 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -252,8 +252,6 @@ void Flang::addCodegenOptions(const ArgList &Args, Args.addAllArgs( CmdArgs, {options::OPT_fdo_concurrent_to_openmp_EQ, - options::OPT_flang_experimental_hlfir, - options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, options::OPT_finit_global_zero, options::OPT_fno_init_global_zero, options::OPT_frepack_arrays, diff --git a/flang/include/flang/Lower/Allocatable.h b/flang/include/flang/Lower/Allocatable.h index 0e89af94af40f..515fd20b2bcb5 100644 --- a/flang/include/flang/Lower/Allocatable.h +++ b/flang/include/flang/Lower/Allocatable.h @@ -68,11 +68,12 @@ void genDeallocateIfAllocated(AbstractConverter &converter, /// Create a MutableBoxValue for an allocatable or pointer entity. /// If the variables is a local variable that is not a dummy, it will be /// initialized to unallocated/diassociated status. 
-fir::MutableBoxValue -createMutableBox(AbstractConverter &converter, mlir::Location loc, - const pft::Variable &var, mlir::Value boxAddr, - mlir::ValueRange nonDeferredParams, bool alwaysUseBox, - unsigned allocator = kDefaultAllocator); +fir::MutableBoxValue createMutableBox(AbstractConverter &converter, + mlir::Location loc, + const pft::Variable &var, + mlir::Value boxAddr, + mlir::ValueRange nonDeferredParams, + unsigned allocator = kDefaultAllocator); /// Assign a boxed value to a boxed variable, \p box (known as a /// MutableBoxValue). Expression \p source will be lowered to build the diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 0b829bf3e08af..e89ad75704609 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -24,9 +24,6 @@ LOWERINGOPT(Name, Bits, Default) /// If true, lower transpose without a runtime call. ENUM_LOWERINGOPT(OptimizeTranspose, unsigned, 1, 1) -/// If true, lower to High level FIR before lowering to FIR. On by default. -ENUM_LOWERINGOPT(LowerToHighLevelFIR, unsigned, 1, 1) - /// If true, reverse PowerPC native vector element order. 
ENUM_LOWERINGOPT(NoPPCNativeVecElemOrder, unsigned, 1, 0) diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index e7f4762e167fb..7205eb4548968 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1629,25 +1629,6 @@ bool CompilerInvocation::createFromArgs( success = false; } - // -flang-experimental-hlfir - if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) || - args.hasArg(clang::options::OPT_emit_hlfir)) { - invoc.loweringOpts.setLowerToHighLevelFIR(true); - } - - // -flang-deprecated-no-hlfir - if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) && - !args.hasArg(clang::options::OPT_emit_hlfir)) { - if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) { - const unsigned diagID = diags.getCustomDiagID( - clang::DiagnosticsEngine::Error, - "Options '-flang-experimental-hlfir' and " - "'-flang-deprecated-no-hlfir' cannot be both specified"); - diags.Report(diagID); - } - invoc.loweringOpts.setLowerToHighLevelFIR(false); - } - // -fno-ppc-native-vector-element-order if (args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) { invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 4e058786a9a72..9ad1da0011ef4 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1345,8 +1345,6 @@ void CodeGenAction::executeAction() { clang::DiagnosticsEngine &diags = ci.getDiagnostics(); const CodeGenOptions &codeGenOpts = ci.getInvocation().getCodeGenOpts(); const TargetOptions &targetOpts = ci.getInvocation().getTargetOpts(); - Fortran::lower::LoweringOptions &loweringOpts = - ci.getInvocation().getLoweringOpts(); mlir::DefaultTimingManager &timingMgr = ci.getTimingManager(); mlir::TimingScope &timingScopeRoot = ci.getTimingScopeRoot(); @@ -1375,16 +1373,12 @@ void CodeGenAction::executeAction() { } 
if (action == BackendActionTy::Backend_EmitFIR) { - if (loweringOpts.getLowerToHighLevelFIR()) { - lowerHLFIRToFIR(); - } + lowerHLFIRToFIR(); mlirModule->print(ci.isOutputStreamNull() ? *os : ci.getOutputStream()); return; } if (action == BackendActionTy::Backend_EmitHLFIR) { - assert(loweringOpts.getLowerToHighLevelFIR() && - "Lowering must have been configured to emit HLFIR"); mlirModule->print(ci.isOutputStreamNull() ? *os : ci.getOutputStream()); return; } diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index 5cbfba23cffdf..8ca861105ce23 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -48,16 +48,6 @@ static llvm::cl::opt useAllocateRuntime( "use-alloc-runtime", llvm::cl::desc("Lower allocations to fortran runtime calls"), llvm::cl::init(false)); -/// Switch to force lowering of allocatable and pointers to descriptors in all -/// cases. This is now turned on by default since that is what will happen with -/// HLFIR lowering, so this allows getting early feedback of the impact. -/// If this turns out to cause performance regressions, a dedicated fir.box -/// "discretization pass" would make more sense to cover all the fir.box usage -/// (taking advantage of any future inlining for instance). -static llvm::cl::opt useDescForMutableBox( - "use-desc-for-alloc", - llvm::cl::desc("Always use descriptors for POINTER and ALLOCATABLE"), - llvm::cl::init(true)); //===----------------------------------------------------------------------===// // Error management @@ -1017,123 +1007,12 @@ void Fortran::lower::genDeallocateStmt( // MutableBoxValue creation implementation //===----------------------------------------------------------------------===// -/// Is this symbol a pointer to a pointer array that does not have the -/// CONTIGUOUS attribute ? 
-static inline bool -isNonContiguousArrayPointer(const Fortran::semantics::Symbol &sym) { - return Fortran::semantics::IsPointer(sym) && sym.Rank() != 0 && - !sym.attrs().test(Fortran::semantics::Attr::CONTIGUOUS); -} - -/// Is this symbol a polymorphic pointer? -static inline bool isPolymorphicPointer(const Fortran::semantics::Symbol &sym) { - return Fortran::semantics::IsPointer(sym) && - Fortran::semantics::IsPolymorphic(sym); -} - -/// Is this symbol a polymorphic allocatable? -static inline bool -isPolymorphicAllocatable(const Fortran::semantics::Symbol &sym) { - return Fortran::semantics::IsAllocatable(sym) && - Fortran::semantics::IsPolymorphic(sym); -} - -/// Is this a local procedure symbol in a procedure that contains internal -/// procedures ? -static bool mayBeCapturedInInternalProc(const Fortran::semantics::Symbol &sym) { - const Fortran::semantics::Scope &owner = sym.owner(); - Fortran::semantics::Scope::Kind kind = owner.kind(); - // Test if this is a procedure scope that contains a subprogram scope that is - // not an interface. - if (kind == Fortran::semantics::Scope::Kind::Subprogram || - kind == Fortran::semantics::Scope::Kind::MainProgram) - for (const Fortran::semantics::Scope &childScope : owner.children()) - if (childScope.kind() == Fortran::semantics::Scope::Kind::Subprogram) - if (const Fortran::semantics::Symbol *childSym = childScope.symbol()) - if (const auto *details = - childSym->detailsIf()) - if (!details->isInterface()) - return true; - return false; -} - -/// In case it is safe to track the properties in variables outside a -/// descriptor, create the variables to hold the mutable properties of the -/// entity var. The variables are not initialized here. 
-static fir::MutableProperties -createMutableProperties(Fortran::lower::AbstractConverter &converter, - mlir::Location loc, - const Fortran::lower::pft::Variable &var, - mlir::ValueRange nonDeferredParams, bool alwaysUseBox) { - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - const Fortran::semantics::Symbol &sym = var.getSymbol(); - // Globals and dummies may be associated, creating local variables would - // require keeping the values and descriptor before and after every single - // impure calls in the current scope (not only the ones taking the variable as - // arguments. All.) Volatile means the variable may change in ways not defined - // per Fortran, so lowering can most likely not keep the descriptor and values - // in sync as needed. - // Pointers to non contiguous arrays need to be represented with a fir.box to - // account for the discontiguity. - // Pointer/Allocatable in internal procedure are descriptors in the host link, - // and it would increase complexity to sync this descriptor with the local - // values every time the host link is escaping. - if (alwaysUseBox || var.isGlobal() || Fortran::semantics::IsDummy(sym) || - Fortran::semantics::IsFunctionResult(sym) || - sym.attrs().test(Fortran::semantics::Attr::VOLATILE) || - isNonContiguousArrayPointer(sym) || useAllocateRuntime || - useDescForMutableBox || mayBeCapturedInInternalProc(sym) || - isPolymorphicPointer(sym) || isPolymorphicAllocatable(sym)) - return {}; - fir::MutableProperties mutableProperties; - std::string name = converter.mangleName(sym); - mlir::Type baseAddrTy = converter.genType(sym); - if (auto boxType = mlir::dyn_cast(baseAddrTy)) - baseAddrTy = boxType.getEleTy(); - // Allocate and set a variable to hold the address. - // It will be set to null in setUnallocatedStatus. - mutableProperties.addr = - builder.allocateLocal(loc, baseAddrTy, name + ".addr", "", - /*shape=*/{}, /*typeparams=*/{}); - // Allocate variables to hold lower bounds and extents. 
- int rank = sym.Rank(); - mlir::Type idxTy = builder.getIndexType(); - for (decltype(rank) i = 0; i < rank; ++i) { - mlir::Value lboundVar = - builder.allocateLocal(loc, idxTy, name + ".lb" + std::to_string(i), "", - /*shape=*/{}, /*typeparams=*/{}); - mlir::Value extentVar = - builder.allocateLocal(loc, idxTy, name + ".ext" + std::to_string(i), "", - /*shape=*/{}, /*typeparams=*/{}); - mutableProperties.lbounds.emplace_back(lboundVar); - mutableProperties.extents.emplace_back(extentVar); - } - - // Allocate variable to hold deferred length parameters. - mlir::Type eleTy = baseAddrTy; - if (auto newTy = fir::dyn_cast_ptrEleTy(eleTy)) - eleTy = newTy; - if (auto seqTy = mlir::dyn_cast(eleTy)) - eleTy = seqTy.getEleTy(); - if (auto record = mlir::dyn_cast(eleTy)) - if (record.getNumLenParams() != 0) - TODO(loc, "deferred length type parameters."); - if (fir::isa_char(eleTy) && nonDeferredParams.empty()) { - mlir::Value lenVar = builder.allocateLocal( - loc, builder.getCharacterLengthType(), name + ".len", "", /*shape=*/{}, - /*typeparams=*/{}); - mutableProperties.deferredParams.emplace_back(lenVar); - } - return mutableProperties; -} - fir::MutableBoxValue Fortran::lower::createMutableBox( Fortran::lower::AbstractConverter &converter, mlir::Location loc, const Fortran::lower::pft::Variable &var, mlir::Value boxAddr, - mlir::ValueRange nonDeferredParams, bool alwaysUseBox, unsigned allocator) { - fir::MutableProperties mutableProperties = createMutableProperties( - converter, loc, var, nonDeferredParams, alwaysUseBox); - fir::MutableBoxValue box(boxAddr, nonDeferredParams, mutableProperties); + mlir::ValueRange nonDeferredParams, unsigned allocator) { + fir::MutableBoxValue box(boxAddr, nonDeferredParams, + /*mutableProperties=*/{}); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); if (!var.isGlobal() && !Fortran::semantics::IsDummy(var.getSymbol())) fir::factory::disassociateMutableBox(builder, loc, box, @@ -1163,22 +1042,9 @@ void 
Fortran::lower::associateMutableBox( cuf::genPointerSync(box.getAddr(), builder); return; } - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { - fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx); - fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds); - cuf::genPointerSync(box.getAddr(), builder); - return; - } - // The right hand side is not be evaluated into a temp. Array sections can - // typically be represented as a value of type `!fir.box`. However, an - // expression that uses vector subscripts cannot be emboxed. In that case, - // generate a reference to avoid having to later use a fir.rebox to implement - // the pointer association. - fir::ExtendedValue rhs = isArraySectionWithoutVectorSubscript(source) - ? converter.genExprBox(loc, source, stmtCtx) - : converter.genExprAddr(loc, source, stmtCtx); - + fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx); fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds); + cuf::genPointerSync(box.getAddr(), builder); } bool Fortran::lower::isWholeAllocatable(const Fortran::lower::SomeExpr &expr) { diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 08bb9def8ad5f..01482c1cb90b5 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -808,11 +808,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::lower::StatementContext &context, mlir::Location *locPtr = nullptr) override final { mlir::Location loc = locPtr ? 
*locPtr : toLocation(); - if (lowerToHighLevelFIR()) - return Fortran::lower::convertExprToAddress(loc, *this, expr, - localSymbols, context); - return Fortran::lower::createSomeExtendedAddress(loc, *this, expr, - localSymbols, context); + return Fortran::lower::convertExprToAddress(loc, *this, expr, localSymbols, + context); } fir::ExtendedValue @@ -820,21 +817,15 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::lower::StatementContext &context, mlir::Location *locPtr = nullptr) override final { mlir::Location loc = locPtr ? *locPtr : toLocation(); - if (lowerToHighLevelFIR()) - return Fortran::lower::convertExprToValue(loc, *this, expr, localSymbols, - context); - return Fortran::lower::createSomeExtendedExpression(loc, *this, expr, - localSymbols, context); + return Fortran::lower::convertExprToValue(loc, *this, expr, localSymbols, + context); } fir::ExtendedValue genExprBox(mlir::Location loc, const Fortran::lower::SomeExpr &expr, Fortran::lower::StatementContext &stmtCtx) override final { - if (lowerToHighLevelFIR()) - return Fortran::lower::convertExprToBox(loc, *this, expr, localSymbols, - stmtCtx); - return Fortran::lower::createBoxValue(loc, *this, expr, localSymbols, - stmtCtx); + return Fortran::lower::convertExprToBox(loc, *this, expr, localSymbols, + stmtCtx); } Fortran::evaluate::FoldingContext &getFoldingContext() override final { @@ -1378,56 +1369,51 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::lower::SymMap *symMap = nullptr, bool forceHlfirBase = false) { symMap = symMap ? 
symMap : &localSymbols; - if (lowerToHighLevelFIR()) { - if (std::optional var = - symMap->lookupVariableDefinition(sym)) { - auto exv = hlfir::translateToExtendedValue(toLocation(), *builder, *var, - forceHlfirBase); - return exv.match( - [](mlir::Value x) -> Fortran::lower::SymbolBox { - return Fortran::lower::SymbolBox::Intrinsic{x}; - }, - [](auto x) -> Fortran::lower::SymbolBox { return x; }); - } - - // Entry character result represented as an argument pair - // needs to be represented in the symbol table even before - // we can create DeclareOp for it. The temporary mapping - // is EmboxCharOp that conveys the address and length information. - // After mapSymbolAttributes is done, the mapping is replaced - // with the new DeclareOp, and the following table lookups - // do not reach here. - if (sym.IsFuncResult()) - if (const Fortran::semantics::DeclTypeSpec *declTy = sym.GetType()) - if (declTy->category() == - Fortran::semantics::DeclTypeSpec::Category::Character) - return symMap->lookupSymbol(sym); - - // Procedure dummies are not mapped with an hlfir.declare because - // they are not "variable" (cannot be assigned to), and it would - // make hlfir.declare more complex than it needs to to allow this. - // Do a regular lookup. - if (Fortran::semantics::IsProcedure(sym)) - return symMap->lookupSymbol(sym); - - // Commonblock names are not variables, but in some lowerings (like - // OpenMP) it is useful to maintain the address of the commonblock in an - // MLIR value and query it. hlfir.declare need not be created for these. - if (sym.detailsIf()) - return symMap->lookupSymbol(sym); - - // For symbols to be privatized in OMP, the symbol is mapped to an - // instance of `SymbolBox::Intrinsic` (i.e. a direct mapping to an MLIR - // SSA value). This MLIR SSA value is the block argument to the - // `omp.private`'s `alloc` block. If this is the case, we return this - // `SymbolBox::Intrinsic` value. 
- if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym)) - return v; - - return {}; - } + if (std::optional var = + symMap->lookupVariableDefinition(sym)) { + auto exv = hlfir::translateToExtendedValue(toLocation(), *builder, *var, + forceHlfirBase); + return exv.match( + [](mlir::Value x) -> Fortran::lower::SymbolBox { + return Fortran::lower::SymbolBox::Intrinsic{x}; + }, + [](auto x) -> Fortran::lower::SymbolBox { return x; }); + } + + // Entry character result represented as an argument pair + // needs to be represented in the symbol table even before + // we can create DeclareOp for it. The temporary mapping + // is EmboxCharOp that conveys the address and length information. + // After mapSymbolAttributes is done, the mapping is replaced + // with the new DeclareOp, and the following table lookups + // do not reach here. + if (sym.IsFuncResult()) + if (const Fortran::semantics::DeclTypeSpec *declTy = sym.GetType()) + if (declTy->category() == + Fortran::semantics::DeclTypeSpec::Category::Character) + return symMap->lookupSymbol(sym); + + // Procedure dummies are not mapped with an hlfir.declare because + // they are not "variable" (cannot be assigned to), and it would + // make hlfir.declare more complex than it needs to to allow this. + // Do a regular lookup. + if (Fortran::semantics::IsProcedure(sym)) + return symMap->lookupSymbol(sym); + + // Commonblock names are not variables, but in some lowerings (like + // OpenMP) it is useful to maintain the address of the commonblock in an + // MLIR value and query it. hlfir.declare need not be created for these. + if (sym.detailsIf()) + return symMap->lookupSymbol(sym); + + // For symbols to be privatized in OMP, the symbol is mapped to an + // instance of `SymbolBox::Intrinsic` (i.e. a direct mapping to an MLIR + // SSA value). This MLIR SSA value is the block argument to the + // `omp.private`'s `alloc` block. If this is the case, we return this + // `SymbolBox::Intrinsic` value. 
if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym)) return v; + return {}; } @@ -1451,13 +1437,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::ExtendedValue val, bool forced = false) { if (!forced && lookupSymbol(sym)) return false; - if (lowerToHighLevelFIR()) { - Fortran::lower::genDeclareSymbol(*this, localSymbols, sym, val, - fir::FortranVariableFlagsEnum::None, - forced); - } else { - localSymbols.addSymbol(sym, val, forced); - } + Fortran::lower::genDeclareSymbol(*this, localSymbols, sym, val, + fir::FortranVariableFlagsEnum::None, + forced); return true; } @@ -1466,17 +1448,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { const Fortran::lower::SymbolBox &rhs_sb, Fortran::semantics::Symbol::Flags flags) { mlir::Location loc = genLocation(sym.name()); - if (lowerToHighLevelFIR()) - copyVarHLFIR(loc, lhs_sb, rhs_sb, flags); - else - copyVarFIR(loc, sym, lhs_sb, rhs_sb); + copyVarHLFIR(loc, lhs_sb, rhs_sb, flags); } void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, Fortran::lower::SymbolBox src, Fortran::semantics::Symbol::Flags flags) { - assert(lowerToHighLevelFIR()); - bool isBoxAllocatable = dst.match( [](const fir::MutableBoxValue &box) { return box.isAllocatable(); }, [](const fir::FortranVariableOpInterface &box) { @@ -1500,7 +1477,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst, Fortran::lower::SymbolBox src, bool isAllocatable, bool isPointer, Fortran::semantics::Symbol::Flags flags) { - assert(lowerToHighLevelFIR()); hlfir::Entity lhs{dst.getAddr()}; hlfir::Entity rhs{src.getAddr()}; @@ -1589,26 +1565,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { } } - void copyVarFIR(mlir::Location loc, const Fortran::semantics::Symbol &sym, - const Fortran::lower::SymbolBox &lhs_sb, - const Fortran::lower::SymbolBox &rhs_sb) { - assert(!lowerToHighLevelFIR()); - fir::ExtendedValue 
lhs = symBoxToExtendedValue(lhs_sb); - fir::ExtendedValue rhs = symBoxToExtendedValue(rhs_sb); - mlir::Type symType = genType(sym); - if (auto seqTy = mlir::dyn_cast(symType)) { - Fortran::lower::StatementContext stmtCtx; - Fortran::lower::createSomeArrayAssignment(*this, lhs, rhs, localSymbols, - stmtCtx); - stmtCtx.finalizeAndReset(); - } else if (lhs.getBoxOf()) { - fir::factory::CharacterExprHelper{*builder, loc}.createAssign(lhs, rhs); - } else { - auto loadVal = fir::LoadOp::create(*builder, loc, fir::getBase(rhs)); - fir::StoreOp::create(*builder, loc, loadVal, fir::getBase(lhs)); - } - } - /// Map a block argument to a result or dummy symbol. This is not the /// definitive mapping. The specification expression have not been lowered /// yet. The final mapping will be done using this pre-mapping in @@ -2166,21 +2122,15 @@ class FirConverter : public Fortran::lower::AbstractConverter { dir->u); } - if (lowerToHighLevelFIR()) { - std::optional resultType; - if (stmt.typedCall->hasAlternateReturns()) - resultType = builder->getIndexType(); - auto hlfirRes = Fortran::lower::convertCallToHLFIR( - toLocation(), *this, *stmt.typedCall, resultType, localSymbols, - stmtCtx); - if (hlfirRes) - res = *hlfirRes; - } else { - // Call statement lowering shares code with function call lowering. - res = Fortran::lower::createSubroutineCall( - *this, *stmt.typedCall, explicitIterSpace, implicitIterSpace, - localSymbols, stmtCtx, /*isUserDefAssignment=*/false); - } + std::optional resultType; + if (stmt.typedCall->hasAlternateReturns()) + resultType = builder->getIndexType(); + auto hlfirRes = + Fortran::lower::convertCallToHLFIR(toLocation(), *this, *stmt.typedCall, + resultType, localSymbols, stmtCtx); + if (hlfirRes) + res = *hlfirRes; + stmtCtx.finalizeAndReset(); if (!res) return; // "Normal" subroutine call. 
@@ -3426,24 +3376,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::common::visit([&](const auto &x) { genFIR(x); }, stmt.u); } - void genFIR(const Fortran::parser::EndForallStmt &) { - if (!lowerToHighLevelFIR()) - cleanupExplicitSpace(); - } - - template - void prepareExplicitSpace(const A &forall) { - if (!explicitIterSpace.isActive()) - analyzeExplicitSpace(forall); - localSymbols.pushScope(); - explicitIterSpace.enter(); - } - - /// Cleanup all the FORALL context information when we exit. - void cleanupExplicitSpace() { - explicitIterSpace.leave(); - localSymbols.popScope(); - } + void genFIR(const Fortran::parser::EndForallStmt &) {} /// Generate FIR for a FORALL statement. void genFIR(const Fortran::parser::ForallStmt &stmt) { @@ -3452,31 +3385,19 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::common::Indirection>( stmt.t) .value(); - if (lowerToHighLevelFIR()) { - mlir::OpBuilder::InsertionGuard guard(*builder); - Fortran::lower::SymMapScope scope(localSymbols); - genForallNest(concurrentHeader); - genFIR(std::get>(stmt.t) - .statement); - return; - } - prepareExplicitSpace(stmt); - genFIR(concurrentHeader); + mlir::OpBuilder::InsertionGuard guard(*builder); + Fortran::lower::SymMapScope scope(localSymbols); + genForallNest(concurrentHeader); genFIR(std::get>(stmt.t) .statement); - cleanupExplicitSpace(); } /// Generate FIR for a FORALL construct. 
void genFIR(const Fortran::parser::ForallConstruct &forall) { setCurrentPositionAt(forall); mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint(); - if (lowerToHighLevelFIR()) - localSymbols.pushScope(); - else - prepareExplicitSpace(forall); + localSymbols.pushScope(); genNestedStatement( std::get< Fortran::parser::Statement>( @@ -3494,10 +3415,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { genNestedStatement( std::get>( forall.t)); - if (lowerToHighLevelFIR()) { - localSymbols.popScope(); - builder->restoreInsertionPoint(insertPt); - } + localSymbols.popScope(); + builder->restoreInsertionPoint(insertPt); } /// Lower the concurrent header specification. @@ -3507,10 +3426,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::common::Indirection>( stmt.t) .value(); - if (lowerToHighLevelFIR()) - genForallNest(concurrentHeader); - else - genFIR(concurrentHeader); + genForallNest(concurrentHeader); } /// Generate hlfir.forall and hlfir.forall_mask nest given a Forall @@ -4345,12 +4261,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::ExtendedValue genAssociateSelector(const Fortran::lower::SomeExpr &selector, Fortran::lower::StatementContext &stmtCtx) { - if (lowerToHighLevelFIR()) - return genExprAddr(selector, stmtCtx); - return Fortran::lower::isArraySectionWithoutVectorSubscript(selector) - ? 
Fortran::lower::createSomeArrayBox(*this, selector, - localSymbols, stmtCtx) - : genExprAddr(selector, stmtCtx); + return genExprAddr(selector, stmtCtx); } void genFIR(const Fortran::parser::AssociateConstruct &) { @@ -5153,10 +5064,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { inline fir::MutableBoxValue genExprMutableBox(mlir::Location loc, const Fortran::lower::SomeExpr &expr) override final { - if (lowerToHighLevelFIR()) - return Fortran::lower::convertExprToMutableBox(loc, *this, expr, - localSymbols); - return Fortran::lower::createMutableBox(loc, *this, expr, localSymbols); + return Fortran::lower::convertExprToMutableBox(loc, *this, expr, + localSymbols); } // Create the [newRank] array with the lower bounds to be passed to the @@ -5183,61 +5092,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { return fir::EmboxOp::create(*builder, loc, boxTy, boundArray, shapeOp); } - // Generate pointer assignment with possibly empty bounds-spec. R1035: a - // bounds-spec is a lower bound value. - void genNoHLFIRPointerAssignment( - mlir::Location loc, const Fortran::evaluate::Assignment &assign, - const Fortran::evaluate::Assignment::BoundsSpec &lbExprs) { - Fortran::lower::StatementContext stmtCtx; - - assert(!lowerToHighLevelFIR() && "code should not be called with HFLIR"); - if (Fortran::evaluate::IsProcedureDesignator(assign.rhs)) - TODO(loc, "procedure pointer assignment"); - - std::optional lhsType = - assign.lhs.GetType(); - // Delegate pointer association to unlimited polymorphic pointer - // to the runtime. element size, type code, attribute and of - // course base_addr might need to be updated. 
- if (lhsType && lhsType->IsPolymorphic()) { - if (explicitIterationSpace()) - TODO(loc, "polymorphic pointer assignment in FORALL"); - llvm::SmallVector lbounds; - for (const Fortran::evaluate::ExtentExpr &lbExpr : lbExprs) - lbounds.push_back( - fir::getBase(genExprValue(toEvExpr(lbExpr), stmtCtx))); - fir::MutableBoxValue lhsMutableBox = genExprMutableBox(loc, assign.lhs); - if (Fortran::evaluate::UnwrapExpr( - assign.rhs)) { - fir::factory::disassociateMutableBox(*builder, loc, lhsMutableBox); - return; - } - mlir::Value lhs = lhsMutableBox.getAddr(); - mlir::Value rhs = fir::getBase(genExprBox(loc, assign.rhs, stmtCtx)); - if (!lbounds.empty()) { - mlir::Value boundsDesc = createLboundArray(lbounds, loc); - Fortran::lower::genPointerAssociateLowerBounds(*builder, loc, lhs, rhs, - boundsDesc); - return; - } - Fortran::lower::genPointerAssociate(*builder, loc, lhs, rhs); - return; - } - - llvm::SmallVector lbounds; - for (const Fortran::evaluate::ExtentExpr &lbExpr : lbExprs) - lbounds.push_back(fir::getBase(genExprValue(toEvExpr(lbExpr), stmtCtx))); - if (explicitIterationSpace()) { - // Pointer assignment in FORALL context. Copy the rhs box value - // into the lhs box variable. - genArrayAssignment(assign, stmtCtx, lbounds); - return; - } - fir::MutableBoxValue lhs = genExprMutableBox(loc, assign.lhs); - Fortran::lower::associateMutableBox(*this, loc, lhs, assign.rhs, lbounds, - stmtCtx); - } - void genPointerAssignment(mlir::Location loc, const Fortran::evaluate::Assignment &assign) { if (isInsideHlfirForallOrWhere()) { @@ -5421,75 +5275,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { return fir::EmboxOp::create(*builder, loc, boxTy, boundArray, shapeOp); } - // Pointer assignment with bounds-remapping. R1036: a bounds-remapping is a - // pair, lower bound and upper bound. 
- void genNoHLFIRPointerAssignment( - mlir::Location loc, const Fortran::evaluate::Assignment &assign, - const Fortran::evaluate::Assignment::BoundsRemapping &boundExprs) { - assert(!lowerToHighLevelFIR() && "code should not be called with HFLIR"); - Fortran::lower::StatementContext stmtCtx; - llvm::SmallVector lbounds; - llvm::SmallVector ubounds; - for (const std::pair &pair : boundExprs) { - const Fortran::evaluate::ExtentExpr &lbExpr = pair.first; - const Fortran::evaluate::ExtentExpr &ubExpr = pair.second; - lbounds.push_back(fir::getBase(genExprValue(toEvExpr(lbExpr), stmtCtx))); - ubounds.push_back(fir::getBase(genExprValue(toEvExpr(ubExpr), stmtCtx))); - } - - std::optional lhsType = - assign.lhs.GetType(); - std::optional rhsType = - assign.rhs.GetType(); - // Polymorphic lhs/rhs need more care. See F2018 10.2.2.3. - if ((lhsType && lhsType->IsPolymorphic()) || - (rhsType && rhsType->IsPolymorphic())) { - if (explicitIterationSpace()) - TODO(loc, "polymorphic pointer assignment in FORALL"); - - fir::MutableBoxValue lhsMutableBox = genExprMutableBox(loc, assign.lhs); - if (Fortran::evaluate::UnwrapExpr( - assign.rhs)) { - fir::factory::disassociateMutableBox(*builder, loc, lhsMutableBox); - return; - } - mlir::Value lhs = lhsMutableBox.getAddr(); - mlir::Value rhs = fir::getBase(genExprBox(loc, assign.rhs, stmtCtx)); - mlir::Value boundsDesc = createBoundArray(lbounds, ubounds, loc); - Fortran::lower::genPointerAssociateRemapping( - *builder, loc, lhs, rhs, boundsDesc, - lhsType && rhsType && !lhsType->IsPolymorphic() && - rhsType->IsPolymorphic()); - return; - } - if (explicitIterationSpace()) { - // Pointer assignment in FORALL context. Copy the rhs box value - // into the lhs box variable. 
- genArrayAssignment(assign, stmtCtx, lbounds, ubounds); - return; - } - fir::MutableBoxValue lhs = genExprMutableBox(loc, assign.lhs); - if (Fortran::evaluate::UnwrapExpr( - assign.rhs)) { - fir::factory::disassociateMutableBox(*builder, loc, lhs); - return; - } - // Do not generate a temp in case rhs is an array section. - fir::ExtendedValue rhs = - Fortran::lower::isArraySectionWithoutVectorSubscript(assign.rhs) - ? Fortran::lower::createSomeArrayBox(*this, assign.rhs, - localSymbols, stmtCtx) - : genExprAddr(assign.rhs, stmtCtx); - fir::factory::associateMutableBoxWithRemap(*builder, loc, lhs, rhs, lbounds, - ubounds); - if (explicitIterationSpace()) { - mlir::ValueRange inners = explicitIterSpace.getInnerArgs(); - if (!inners.empty()) - fir::ResultOp::create(*builder, loc, inners); - } - } - /// Given converted LHS and RHS of the assignment, materialize any /// implicit conversion of the RHS to the LHS type. The front-end /// usually already makes those explicit, except for non-standard @@ -5871,233 +5656,23 @@ class FirConverter : public Fortran::lower::AbstractConverter { const llvm::ArrayRef &dirs = {}) { mlir::Location loc = toLocation(); - if (lowerToHighLevelFIR()) { - Fortran::common::visit( - Fortran::common::visitors{ - [&](const Fortran::evaluate::Assignment::Intrinsic &) { - genDataAssignment(assign, /*userDefinedAssignment=*/nullptr, - dirs); - }, - [&](const Fortran::evaluate::ProcedureRef &procRef) { - genDataAssignment(assign, /*userDefinedAssignment=*/&procRef, - dirs); - }, - [&](const Fortran::evaluate::Assignment::BoundsSpec &lbExprs) { - genPointerAssignment(loc, assign); - }, - [&](const Fortran::evaluate::Assignment::BoundsRemapping - &boundExprs) { genPointerAssignment(loc, assign); }, - }, - assign.u); - return; - } - if (explicitIterationSpace()) { - Fortran::lower::createArrayLoads(*this, explicitIterSpace, localSymbols); - explicitIterSpace.genLoopNest(); - } - Fortran::lower::StatementContext stmtCtx; Fortran::common::visit( 
Fortran::common::visitors{ - // [1] Plain old assignment. [&](const Fortran::evaluate::Assignment::Intrinsic &) { - const Fortran::semantics::Symbol *sym = - Fortran::evaluate::GetLastSymbol(assign.lhs); - - if (!sym) - TODO(loc, "assignment to pointer result of function reference"); - - std::optional lhsType = - assign.lhs.GetType(); - assert(lhsType && "lhs cannot be typeless"); - std::optional rhsType = - assign.rhs.GetType(); - - // Assignment to/from polymorphic entities are done with the - // runtime. - if (lhsType->IsPolymorphic() || - lhsType->IsUnlimitedPolymorphic() || - (rhsType && (rhsType->IsPolymorphic() || - rhsType->IsUnlimitedPolymorphic()))) { - mlir::Value lhs; - if (Fortran::lower::isWholeAllocatable(assign.lhs)) - lhs = genExprMutableBox(loc, assign.lhs).getAddr(); - else - lhs = fir::getBase(genExprBox(loc, assign.lhs, stmtCtx)); - mlir::Value rhs = - fir::getBase(genExprBox(loc, assign.rhs, stmtCtx)); - if ((lhsType->IsPolymorphic() || - lhsType->IsUnlimitedPolymorphic()) && - Fortran::lower::isWholeAllocatable(assign.lhs)) - fir::runtime::genAssignPolymorphic(*builder, loc, lhs, rhs); - else - fir::runtime::genAssign(*builder, loc, lhs, rhs); - return; - } - - // Note: No ad-hoc handling for pointers is required here. The - // target will be assigned as per 2018 10.2.1.3 p2. genExprAddr - // on a pointer returns the target address and not the address of - // the pointer variable. 
- - if (assign.lhs.Rank() > 0 || explicitIterationSpace()) { - if (isDerivedCategory(lhsType->category()) && - Fortran::semantics::IsFinalizable( - lhsType->GetDerivedTypeSpec())) - TODO(loc, "derived-type finalization with array assignment"); - // Array assignment - // See Fortran 2018 10.2.1.3 p5, p6, and p7 - genArrayAssignment(assign, stmtCtx); - return; - } - - // Scalar assignment - const bool isNumericScalar = - isNumericScalarCategory(lhsType->category()); - const bool isVector = - isDerivedCategory(lhsType->category()) && - lhsType->GetDerivedTypeSpec().IsVectorType(); - fir::ExtendedValue rhs = (isNumericScalar || isVector) - ? genExprValue(assign.rhs, stmtCtx) - : genExprAddr(assign.rhs, stmtCtx); - const bool lhsIsWholeAllocatable = - Fortran::lower::isWholeAllocatable(assign.lhs); - std::optional lhsRealloc; - std::optional lhsMutableBox; - - // Set flag to know if the LHS needs finalization. Polymorphic, - // unlimited polymorphic assignment will be done with genAssign. - // Assign runtime function performs the finalization. - bool needFinalization = !lhsType->IsPolymorphic() && - !lhsType->IsUnlimitedPolymorphic() && - (isDerivedCategory(lhsType->category()) && - Fortran::semantics::IsFinalizable( - lhsType->GetDerivedTypeSpec())); - - auto lhs = [&]() -> fir::ExtendedValue { - if (lhsIsWholeAllocatable) { - lhsMutableBox = genExprMutableBox(loc, assign.lhs); - // Finalize if needed. 
- if (needFinalization) { - mlir::Value isAllocated = - fir::factory::genIsAllocatedOrAssociatedTest( - *builder, loc, *lhsMutableBox); - builder->genIfThen(loc, isAllocated) - .genThen([&]() { - fir::runtime::genDerivedTypeDestroy( - *builder, loc, fir::getBase(*lhsMutableBox)); - }) - .end(); - needFinalization = false; - } - - llvm::SmallVector lengthParams; - if (const fir::CharBoxValue *charBox = rhs.getCharBox()) - lengthParams.push_back(charBox->getLen()); - else if (fir::isDerivedWithLenParameters(rhs)) - TODO(loc, "assignment to derived type allocatable with " - "LEN parameters"); - lhsRealloc = fir::factory::genReallocIfNeeded( - *builder, loc, *lhsMutableBox, - /*shape=*/{}, lengthParams); - return lhsRealloc->newValue; - } - return genExprAddr(assign.lhs, stmtCtx); - }(); - - if (isNumericScalar || isVector) { - // Fortran 2018 10.2.1.3 p8 and p9 - // Conversions should have been inserted by semantic analysis, - // but they can be incorrect between the rhs and lhs. Correct - // that here. - mlir::Value addr = fir::getBase(lhs); - mlir::Value val = fir::getBase(rhs); - // A function with multiple entry points returning different - // types tags all result variables with one of the largest - // types to allow them to share the same storage. Assignment - // to a result variable of one of the other types requires - // conversion to the actual type. - mlir::Type toTy = genType(assign.lhs); - - // If Cray pointee, need to handle the address - // Array is handled in genCoordinateOp. 
- if (sym->test(Fortran::semantics::Symbol::Flag::CrayPointee) && - sym->Rank() == 0) { - // get the corresponding Cray pointer - - const Fortran::semantics::Symbol &ptrSym = - Fortran::semantics::GetCrayPointer(*sym); - fir::ExtendedValue ptr = - getSymbolExtendedValue(ptrSym, nullptr); - mlir::Value ptrVal = fir::getBase(ptr); - mlir::Type ptrTy = genType(ptrSym); - - fir::ExtendedValue pte = - getSymbolExtendedValue(*sym, nullptr); - mlir::Value pteVal = fir::getBase(pte); - mlir::Value cnvrt = Fortran::lower::addCrayPointerInst( - loc, *builder, ptrVal, ptrTy, pteVal.getType()); - addr = fir::LoadOp::create(*builder, loc, cnvrt); - } - mlir::Value cast = - isVector ? val - : builder->convertWithSemantics(loc, toTy, val); - if (fir::dyn_cast_ptrEleTy(addr.getType()) != toTy) { - assert(isFuncResultDesignator(assign.lhs) && "type mismatch"); - addr = builder->createConvert( - toLocation(), builder->getRefType(toTy), addr); - } - fir::StoreOp::create(*builder, loc, cast, addr); - } else if (isCharacterCategory(lhsType->category())) { - // Fortran 2018 10.2.1.3 p10 and p11 - fir::factory::CharacterExprHelper{*builder, loc}.createAssign( - lhs, rhs); - } else if (isDerivedCategory(lhsType->category())) { - // Handle parent component. - if (Fortran::lower::isParentComponent(assign.lhs)) { - if (!mlir::isa(fir::getBase(lhs).getType())) - lhs = fir::getBase(builder->createBox(loc, lhs)); - lhs = Fortran::lower::updateBoxForParentComponent(*this, lhs, - assign.lhs); - } - - // Fortran 2018 10.2.1.3 p13 and p14 - // Recursively gen an assignment on each element pair. 
- fir::factory::genRecordAssignment(*builder, loc, lhs, rhs, - needFinalization); - } else { - llvm_unreachable("unknown category"); - } - if (lhsIsWholeAllocatable) { - assert(lhsRealloc.has_value()); - fir::factory::finalizeRealloc(*builder, loc, *lhsMutableBox, - /*lbounds=*/{}, - /*takeLboundsIfRealloc=*/false, - *lhsRealloc); - } + genDataAssignment(assign, /*userDefinedAssignment=*/nullptr, + dirs); }, - - // [2] User defined assignment. If the context is a scalar - // expression then call the procedure. [&](const Fortran::evaluate::ProcedureRef &procRef) { - Fortran::lower::StatementContext &ctx = - explicitIterationSpace() ? explicitIterSpace.stmtContext() - : stmtCtx; - Fortran::lower::createSubroutineCall( - *this, procRef, explicitIterSpace, implicitIterSpace, - localSymbols, ctx, /*isUserDefAssignment=*/true); + genDataAssignment(assign, /*userDefinedAssignment=*/&procRef, + dirs); }, - [&](const Fortran::evaluate::Assignment::BoundsSpec &lbExprs) { - return genNoHLFIRPointerAssignment(loc, assign, lbExprs); + genPointerAssignment(loc, assign); }, [&](const Fortran::evaluate::Assignment::BoundsRemapping - &boundExprs) { - return genNoHLFIRPointerAssignment(loc, assign, boundExprs); - }, + &boundExprs) { genPointerAssignment(loc, assign); }, }, assign.u); - if (explicitIterationSpace()) - Fortran::lower::createArrayMergeStores(*this, explicitIterSpace); } // Is the insertion point of the builder directly or indirectly set @@ -6121,26 +5696,17 @@ class FirConverter : public Fortran::lower::AbstractConverter { void genFIR(const Fortran::parser::WhereConstruct &c) { setCurrentPositionAt(c); mlir::Location loc = getCurrentLocation(); - hlfir::WhereOp whereOp; + auto whereOp = hlfir::WhereOp::create(*builder, loc); + builder->createBlock(&whereOp.getMaskRegion()); - if (!lowerToHighLevelFIR()) { - implicitIterSpace.growStack(); - } else { - whereOp = hlfir::WhereOp::create(*builder, loc); - builder->createBlock(&whereOp.getMaskRegion()); - } - - // Lower the 
where mask. For HLFIR, this is done in the hlfir.where mask - // region. + // Lower the where mask in the hlfir.where mask region. genNestedStatement( std::get< Fortran::parser::Statement>( c.t)); - // Lower WHERE body. For HLFIR, this is done in the hlfir.where body - // region. - if (whereOp) - builder->createBlock(&whereOp.getBody()); + // Lower WHERE body in the hlfir.where body region. + builder->createBlock(&whereOp.getBody()); for (const auto &body : std::get>(c.t)) @@ -6158,12 +5724,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { std::get>( c.t)); - if (whereOp) { - // For HLFIR, create fir.end terminator in the last hlfir.elsewhere, or - // in the hlfir.where if it had no elsewhere. - fir::FirEndOp::create(*builder, loc); - builder->setInsertionPointAfter(whereOp); - } + // Create fir.end terminator in the last hlfir.elsewhere, or in the + // hlfir.where if it had no elsewhere. + fir::FirEndOp::create(*builder, loc); + builder->setInsertionPointAfter(whereOp); } void genFIR(const Fortran::parser::WhereBodyConstruct &body) { Fortran::common::visit( @@ -6195,28 +5759,21 @@ class FirConverter : public Fortran::lower::AbstractConverter { void genFIR(const Fortran::parser::WhereConstructStmt &stmt) { const Fortran::semantics::SomeExpr *maskExpr = Fortran::semantics::GetExpr( std::get(stmt.t)); - if (lowerToHighLevelFIR()) - lowerWhereMaskToHlfir(getCurrentLocation(), maskExpr); - else - implicitIterSpace.append(maskExpr); + lowerWhereMaskToHlfir(getCurrentLocation(), maskExpr); } void genFIR(const Fortran::parser::WhereConstruct::MaskedElsewhere &ew) { setCurrentPositionAt(ew); mlir::Location loc = getCurrentLocation(); - hlfir::ElseWhereOp elsewhereOp; - if (lowerToHighLevelFIR()) { - elsewhereOp = hlfir::ElseWhereOp::create(*builder, loc); - // Lower mask in the mask region. - builder->createBlock(&elsewhereOp.getMaskRegion()); - } + auto elsewhereOp = hlfir::ElseWhereOp::create(*builder, loc); + // Lower mask in the mask region. 
+ builder->createBlock(&elsewhereOp.getMaskRegion()); genNestedStatement( std::get< Fortran::parser::Statement>( ew.t)); - // For HLFIR, lower the body in the hlfir.elsewhere body region. - if (elsewhereOp) - builder->createBlock(&elsewhereOp.getBody()); + // Lower the body in the hlfir.elsewhere body region. + builder->createBlock(&elsewhereOp.getBody()); for (const auto &body : std::get>(ew.t)) @@ -6225,18 +5782,13 @@ class FirConverter : public Fortran::lower::AbstractConverter { void genFIR(const Fortran::parser::MaskedElsewhereStmt &stmt) { const auto *maskExpr = Fortran::semantics::GetExpr( std::get(stmt.t)); - if (lowerToHighLevelFIR()) - lowerWhereMaskToHlfir(getCurrentLocation(), maskExpr); - else - implicitIterSpace.append(maskExpr); + lowerWhereMaskToHlfir(getCurrentLocation(), maskExpr); } void genFIR(const Fortran::parser::WhereConstruct::Elsewhere &ew) { setCurrentPositionAt(ew); - if (lowerToHighLevelFIR()) { - auto elsewhereOp = - hlfir::ElseWhereOp::create(*builder, getCurrentLocation()); - builder->createBlock(&elsewhereOp.getBody()); - } + auto elsewhereOp = + hlfir::ElseWhereOp::create(*builder, getCurrentLocation()); + builder->createBlock(&elsewhereOp.getBody()); genNestedStatement( std::get>( ew.t)); @@ -6244,35 +5796,22 @@ class FirConverter : public Fortran::lower::AbstractConverter { std::get>(ew.t)) genFIR(body); } - void genFIR(const Fortran::parser::ElsewhereStmt &stmt) { - if (!lowerToHighLevelFIR()) - implicitIterSpace.append(nullptr); - } - void genFIR(const Fortran::parser::EndWhereStmt &) { - if (!lowerToHighLevelFIR()) - implicitIterSpace.shrinkStack(); - } + void genFIR(const Fortran::parser::ElsewhereStmt &stmt) {} + void genFIR(const Fortran::parser::EndWhereStmt &) {} void genFIR(const Fortran::parser::WhereStmt &stmt) { Fortran::lower::StatementContext stmtCtx; const auto &assign = std::get(stmt.t); const auto *mask = Fortran::semantics::GetExpr( std::get(stmt.t)); - if (lowerToHighLevelFIR()) { - mlir::Location loc = 
getCurrentLocation(); - auto whereOp = hlfir::WhereOp::create(*builder, loc); - builder->createBlock(&whereOp.getMaskRegion()); - lowerWhereMaskToHlfir(loc, mask); - builder->createBlock(&whereOp.getBody()); - genAssignment(*assign.typedAssignment->v); - fir::FirEndOp::create(*builder, loc); - builder->setInsertionPointAfter(whereOp); - return; - } - implicitIterSpace.growStack(); - implicitIterSpace.append(mask); + mlir::Location loc = getCurrentLocation(); + auto whereOp = hlfir::WhereOp::create(*builder, loc); + builder->createBlock(&whereOp.getMaskRegion()); + lowerWhereMaskToHlfir(loc, mask); + builder->createBlock(&whereOp.getBody()); genAssignment(*assign.typedAssignment->v); - implicitIterSpace.shrinkStack(); + fir::FirEndOp::create(*builder, loc); + builder->setInsertionPointAfter(whereOp); } void genFIR(const Fortran::parser::PointerAssignmentStmt &stmt) { @@ -6518,7 +6057,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { // Always generate fir.dummy_scope even if there are no arguments. // It is currently used to create proper TBAA forest. - if (lowerToHighLevelFIR()) { + { mlir::Value scopeOp = fir::DummyScopeOp::create(*builder, toLocation()); setDummyArgsScope(scopeOp); } @@ -7308,10 +6847,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { void createRuntimeTypeInfoGlobals() {} - bool lowerToHighLevelFIR() const { - return bridge.getLoweringOptions().getLowerToHighLevelFIR(); - } - // Returns the mangling prefix for the given constant expression. std::string getConstantExprManglePrefix(mlir::Location loc, const Fortran::lower::SomeExpr &expr, diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index cd94f4d363061..e9059581c690a 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -226,8 +226,7 @@ Fortran::lower::CallerInterface::characterize() const { // ProcedureDesignator has no interface, or may mismatch in case of implicit // interface. 
if (!characteristic->HasExplicitInterface() || - (converter.getLoweringOptions().getLowerToHighLevelFIR() && - isExternalDefinedInSameCompilationUnit(procRef.proc()) && + (isExternalDefinedInSameCompilationUnit(procRef.proc()) && characteristic->CanBeCalledViaImplicitInterface())) { // In HLFIR lowering, calls to subprogram with implicit interfaces are // always prepared according to the actual arguments. This is to support @@ -1173,9 +1172,6 @@ class Fortran::lower::CallInterfaceImpl { addPassedArg(PassEntityBy::MutableBox, entity, characteristics); } else if (obj.IsPassedByDescriptor(isBindC)) { // Pass as fir.box or fir.class - if (isValueAttr && - !getConverter().getLoweringOptions().getLowerToHighLevelFIR()) - TODO(loc, "assumed shape dummy argument with VALUE attribute"); addFirOperand(boxType, nextPassedArgPosition(), Property::Box, attrs); addPassedArg(PassEntityBy::Box, entity, characteristics); } else if (dynamicType.category() == @@ -1233,11 +1229,6 @@ class Fortran::lower::CallInterfaceImpl { const DummyCharacteristics *characteristics, const Fortran::evaluate::characteristics::DummyProcedure &proc, const FortranEntity &entity) { - if (!interface.converter.getLoweringOptions().getLowerToHighLevelFIR() && - proc.attrs.test( - Fortran::evaluate::characteristics::DummyProcedure::Attr::Pointer)) - TODO(interface.converter.getCurrentLocation(), - "procedure pointer arguments"); const Fortran::evaluate::characteristics::Procedure &procedure = proc.procedure.value(); mlir::Type funcType = diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index d9204976ea7d3..e6c89122bde23 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -145,12 +145,11 @@ static bool mustCastFuncOpToCopeWithImplicitInterfaceMismatch( // mismatch on the arguments. The argument are always prepared according // to the implicit interface. 
Cast the actual function if any of the // argument mismatch cannot be dealt with a simple fir.convert. - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) - for (auto [actualType, dummyType] : - llvm::zip(callSiteType.getInputs(), funcOpType.getInputs())) - if (actualType != dummyType && - !fir::ConvertOp::canBeConverted(actualType, dummyType)) - return true; + for (auto [actualType, dummyType] : + llvm::zip(callSiteType.getInputs(), funcOpType.getInputs())) + if (actualType != dummyType && + !fir::ConvertOp::canBeConverted(actualType, dummyType)) + return true; return false; } @@ -408,7 +407,6 @@ Fortran::lower::genCallOpAndResult( } } const bool isExprCall = - converter.getLoweringOptions().getLowerToHighLevelFIR() && callSiteType.getNumResults() == 1 && llvm::isa(callSiteType.getResult(0)); @@ -622,8 +620,6 @@ Fortran::lower::genCallOpAndResult( // With the lowering to HLFIR, box arguments have already been built // according to the attributes, rank, bounds, and type they should have. // Do not attempt any reboxing here that could break this. - bool legacyLowering = - !converter.getLoweringOptions().getLowerToHighLevelFIR(); // When dealing with a dummy character argument (fir.boxchar), the // effective argument might be a non-character raw pointer. This may // happen when calling an implicit interface that was previously called @@ -635,7 +631,7 @@ Fortran::lower::genCallOpAndResult( cast = builder.createVolatileCast(loc, isVolatile, fst); cast = builder.convertWithSemantics(loc, snd, cast, allowCharacterConversions, - /*allowRebox=*/legacyLowering); + /*allowRebox=*/false); } } operands.push_back(cast); @@ -845,9 +841,7 @@ Fortran::lower::genCallOpAndResult( // In HLFIR, this is skipped when the result does not need to be finalized // because the result is moved to an expression that will deal with the // finalization. 
- if (allocatedResult && - (mustFinalizeResult || - !converter.getLoweringOptions().getLowerToHighLevelFIR())) { + if (allocatedResult && mustFinalizeResult) { // The result must be optionally destroyed (if it is of a derived type // that may need finalization or deallocation of the components). // For an allocatable result we have to free the memory allocated diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp index 0feb78e7fe9a1..41d8703bfc8e1 100644 --- a/flang/lib/Lower/ConvertConstant.cpp +++ b/flang/lib/Lower/ConvertConstant.cpp @@ -497,18 +497,6 @@ static mlir::Value genInlinedStructureCtorLitImpl( fir::FirOpBuilder &builder = converter.getFirOpBuilder(); auto recTy = mlir::cast(type); - if (!converter.getLoweringOptions().getLowerToHighLevelFIR()) { - mlir::Value res = fir::UndefOp::create(builder, loc, recTy); - for (const auto &[sym, expr] : ctor.values()) { - // Parent components need more work because they do not appear in the - // fir.rec type. - if (sym->test(Fortran::semantics::Symbol::Flag::ParentComp)) - TODO(loc, "parent component in structure constructor"); - res = genStructureComponentInit(converter, loc, sym, expr.value(), res); - } - return res; - } - auto fieldTy = fir::FieldType::get(recTy.getContext()); mlir::Value res{}; // When the first structure component values belong to some parent type PT diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 0d343968374f0..d2a5a978756fd 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -409,7 +409,7 @@ struct TypeBuilderImpl { // Gather the record type fields. // (1) The data components. - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { + { size_t prev_offset{0}; unsigned padCounter{0}; // In HLFIR the parent component is the first fir.type component. 
@@ -454,34 +454,6 @@ struct TypeBuilderImpl { } } } - } else { - for (const auto &component : - Fortran::semantics::OrderedComponentIterator(tySpec)) { - // In the lowering to FIR the parent component does not appear in the - // fir.type and its components are inlined at the beginning of the - // fir.type<>. - // FIXME: this strategy leads to bugs because padding should be inserted - // after the component of the parents so that the next components do not - // end-up in the parent storage if the sum of the parent's component - // storage size is not a multiple of the parent type storage alignment. - - // Lowering is assuming non deferred component lower bounds are - // always 1. Catch any situations where this is not true for now. - if (componentHasNonDefaultLowerBounds(component)) - TODO(converter.genLocation(component.name()), - "derived type components with non default lower bounds"); - if (IsProcedure(component)) - TODO(converter.genLocation(component.name()), "procedure components"); - mlir::Type ty = genSymbolType(component); - // Do not add the parent component (component of the parents are - // added and should be sufficient, the parent component would - // duplicate the fields). Note that genSymbolType must be called above - // on it so that the dispatch table for the parent type still gets - // emitted as needed. 
- if (component.test(Fortran::semantics::Symbol::Flag::ParentComp)) - continue; - cs.emplace_back(converter.getRecordTypeFieldName(component), ty); - } } mlir::Location loc = converter.genLocation(typeSymbol.name()); diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 05b3fb4a5d370..91222490fc395 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -66,15 +66,10 @@ static mlir::Value genScalarValue(Fortran::lower::AbstractConverter &converter, Fortran::lower::StatementContext &context) { // This does not use the AbstractConverter member function to override the // symbol mapping to be used expression lowering. - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { - hlfir::EntityWithAttributes loweredExpr = - Fortran::lower::convertExprToHLFIR(loc, converter, expr, symMap, - context); - return hlfir::loadTrivialScalar(loc, converter.getFirOpBuilder(), - loweredExpr); - } - return fir::getBase(Fortran::lower::createSomeExtendedExpression( - loc, converter, expr, symMap, context)); + hlfir::EntityWithAttributes loweredExpr = + Fortran::lower::convertExprToHLFIR(loc, converter, expr, symMap, context); + return hlfir::loadTrivialScalar(loc, converter.getFirOpBuilder(), + loweredExpr); } /// Does this variable have a default initialization? @@ -255,11 +250,8 @@ fir::ExtendedValue Fortran::lower::genExtAddrInInitializer( storeMap); } - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) - return Fortran::lower::convertExprToAddress(loc, converter, addr, - globalOpSymMap, stmtCtx); - return Fortran::lower::createInitializerAddress(loc, converter, addr, - globalOpSymMap, stmtCtx); + return Fortran::lower::convertExprToAddress(loc, converter, addr, + globalOpSymMap, stmtCtx); } /// create initial-data-target fir.box in a global initializer region. 
@@ -326,23 +318,11 @@ mlir::Value Fortran::lower::genInitialDataTarget( mlir::Value targetBox; mlir::Value targetShift; - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { + { auto target = Fortran::lower::convertExprToBox( loc, converter, initialTarget, globalOpSymMap, stmtCtx); targetBox = fir::getBase(target); targetShift = builder.createShape(loc, target); - } else { - if (initialTarget.Rank() > 0) { - auto target = Fortran::lower::createSomeArrayBox(converter, initialTarget, - globalOpSymMap, stmtCtx); - targetBox = fir::getBase(target); - targetShift = builder.createShape(loc, target); - } else { - fir::ExtendedValue addr = Fortran::lower::createInitializerAddress( - loc, converter, initialTarget, globalOpSymMap, stmtCtx); - targetBox = builder.createBox(loc, addr); - // Nothing to do for targetShift, the target is a scalar. - } } // The targetBox is a fir.box, not a fir.box> as it should for // pointers (this matters to get the POINTER attribute correctly inside the @@ -446,36 +426,21 @@ static mlir::Value genDefaultInitializerValue( const Fortran::semantics::DeclTypeSpec *declTy = sym.GetType(); assert(declTy && "var with default initialization must have a type"); - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { - // In HLFIR, the parent type is the first component, while in FIR there is - // not parent component in the fir.type and the component of the parent are - // "inlined" at the beginning of the fir.type. 
- const Fortran::semantics::Symbol &typeSymbol = - declTy->derivedTypeSpec().typeSymbol(); - const Fortran::semantics::Scope *derivedScope = - declTy->derivedTypeSpec().GetScope(); - assert(derivedScope && "failed to retrieve derived type scope"); - for (const auto &componentName : - typeSymbol.get() - .componentNames()) { - auto scopeIter = derivedScope->find(componentName); - assert(scopeIter != derivedScope->cend() && - "failed to find derived type component symbol"); - const Fortran::semantics::Symbol &component = scopeIter->second.get(); - initialValue = genComponentDefaultInit(converter, loc, component, recTy, - initialValue, stmtCtx); - } - } else { - Fortran::semantics::OrderedComponentIterator components( - declTy->derivedTypeSpec()); - for (const auto &component : components) { - // Skip parent components, the sub-components of parent types are part of - // components and will be looped through right after. - if (component.test(Fortran::semantics::Symbol::Flag::ParentComp)) - continue; - initialValue = genComponentDefaultInit(converter, loc, component, recTy, - initialValue, stmtCtx); - } + // In HLFIR, the parent type is the first component of the fir.type. 
+ const Fortran::semantics::Symbol &typeSymbol = + declTy->derivedTypeSpec().typeSymbol(); + const Fortran::semantics::Scope *derivedScope = + declTy->derivedTypeSpec().GetScope(); + assert(derivedScope && "failed to retrieve derived type scope"); + for (const auto &componentName : + typeSymbol.get() + .componentNames()) { + auto scopeIter = derivedScope->find(componentName); + assert(scopeIter != derivedScope->cend() && + "failed to find derived type component symbol"); + const Fortran::semantics::Symbol &component = scopeIter->second.get(); + initialValue = genComponentDefaultInit(converter, loc, component, recTy, + initialValue, stmtCtx); } if (sequenceType) { @@ -544,10 +509,6 @@ fir::GlobalOp Fortran::lower::defineGlobal( if (global && globalIsInitialized(global)) return global; - if (!converter.getLoweringOptions().getLowerToHighLevelFIR() && - Fortran::semantics::IsProcedurePointer(sym)) - TODO(loc, "procedure pointer globals"); - const auto *oeDetails = sym.detailsIf(); @@ -1935,8 +1896,7 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, // Commonblock names are not variables, but in some lowerings (like OpenMP) it // is useful to maintain the address of the commonblock in an MLIR value and // query it. hlfir.declare need not be created for these. 
- if (converter.getLoweringOptions().getLowerToHighLevelFIR() && - (!Fortran::semantics::IsProcedure(sym) || + if ((!Fortran::semantics::IsProcedure(sym) || Fortran::semantics::IsPointer(sym)) && !sym.detailsIf()) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); @@ -2043,8 +2003,7 @@ void Fortran::lower::genDeclareSymbol( Fortran::lower::SymMap &symMap, const Fortran::semantics::Symbol &sym, const fir::ExtendedValue &exv, fir::FortranVariableFlagsEnum extraFlags, bool force) { - if (converter.getLoweringOptions().getLowerToHighLevelFIR() && - (!Fortran::semantics::IsProcedure(sym) || + if ((!Fortran::semantics::IsProcedure(sym) || Fortran::semantics::IsPointer(sym.GetUltimate())) && !sym.detailsIf()) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); @@ -2087,10 +2046,6 @@ genAllocatableOrPointerDeclare(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symMap, const Fortran::semantics::Symbol &sym, fir::MutableBoxValue box, bool force = false) { - if (!converter.getLoweringOptions().getLowerToHighLevelFIR()) { - symMap.addAllocatableOrPointer(sym, box, force); - return; - } assert(!box.isDescribedByVariables() && "HLFIR alloctables/pointers must be fir.ref"); mlir::Value base = box.getAddr(); @@ -2125,15 +2080,10 @@ static void genBoxDeclare(Fortran::lower::AbstractConverter &converter, llvm::ArrayRef explicitParams, llvm::ArrayRef explicitExtents, bool replace = false) { - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { - fir::BoxValue boxValue{box, lbounds, explicitParams, explicitExtents}; - Fortran::lower::genDeclareSymbol( - converter, symMap, sym, std::move(boxValue), - fir::FortranVariableFlagsEnum::None, replace); - return; - } - symMap.addBoxSymbol(sym, box, lbounds, explicitParams, explicitExtents, - replace); + fir::BoxValue boxValue{box, lbounds, explicitParams, explicitExtents}; + Fortran::lower::genDeclareSymbol(converter, symMap, sym, std::move(boxValue), + fir::FortranVariableFlagsEnum::None, + 
replace); } /// Lower specification expressions and attributes of variable \p var and @@ -2228,8 +2178,6 @@ void Fortran::lower::mapSymbolAttributes( } fir::MutableBoxValue box = Fortran::lower::createMutableBox( converter, loc, var, boxAlloc, nonDeferredLenParams, - /*alwaysUseBox=*/ - converter.getLoweringOptions().getLowerToHighLevelFIR(), Fortran::lower::getAllocatorIdx(var.getSymbol())); genAllocatableOrPointerDeclare(converter, symMap, var.getSymbol(), box, replace); diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp index ad6aba1d28ae4..93d0d748197d2 100644 --- a/flang/lib/Lower/HostAssociations.cpp +++ b/flang/lib/Lower/HostAssociations.cpp @@ -74,11 +74,8 @@ static void bindCapturedSymbol(const Fortran::semantics::Symbol &sym, fir::ExtendedValue val, Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symMap) { - if (converter.getLoweringOptions().getLowerToHighLevelFIR()) - Fortran::lower::genDeclareSymbol(converter, symMap, sym, val, - fir::FortranVariableFlagsEnum::host_assoc); - else - symMap.addSymbol(sym, val); + Fortran::lower::genDeclareSymbol(converter, symMap, sym, val, + fir::FortranVariableFlagsEnum::host_assoc); } namespace { diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index a21865f9c5ffe..bc6e9eb67e132 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -217,10 +217,6 @@ static llvm::cl::opt enableNoPPCNativeVecElemOrder( llvm::cl::desc("no PowerPC native vector element order."), llvm::cl::init(false)); -static llvm::cl::opt useHLFIR("hlfir", - llvm::cl::desc("Lower to high level FIR"), - llvm::cl::init(true)); - static llvm::cl::opt enableCUDA("fcuda", llvm::cl::desc("enable CUDA Fortran"), llvm::cl::init(false)); @@ -468,7 +464,6 @@ static llvm::LogicalResult convertFortranSourceToMLIR( // Use default lowering options for bbc. 
Fortran::lower::LoweringOptions loweringOptions{}; loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); - loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setIntegerWrapAround(integerWrapAround); loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); @@ -550,7 +545,7 @@ static llvm::LogicalResult convertFortranSourceToMLIR( return mlir::failure(); } - if (emitFIR && useHLFIR) { + if (emitFIR) { // lower HLFIR to FIR fir::EnableOpenMP enableOmp = enableOpenMP ? fir::EnableOpenMP::Full : fir::EnableOpenMP::None; From bd0aad55315be39e8fc3cff0aa1f8b0a6bbce640 Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Mon, 11 May 2026 07:43:45 -0700 Subject: [PATCH 281/538] [clang][NFC] Mark CWG988 as implemented and add a test (#196889) [CWG988](https://wg21.link/cwg988) specifies that reference collapsing is performed when trying to form a reference to a `decltype`. Clang implements this since 2.7: https://godbolt.org/z/vYzKbv8x7 (and I checked a few versions after that to make sure there were no regressions). 
--- clang/test/CXX/drs/cwg9xx.cpp | 11 +++++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/cwg9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp index c8fdca3a45c3e..d7385ab6db859 100644 --- a/clang/test/CXX/drs/cwg9xx.cpp +++ b/clang/test/CXX/drs/cwg9xx.cpp @@ -169,6 +169,17 @@ enum struct E4 : int { e = static_cast(E4()) }; #endif } // namespace cwg977 +namespace cwg988 { // cwg988: 2.7 +#if __cplusplus >= 201103L +void f(int& lvalue_ref, int&& rvalue_ref) { + static_assert(__is_same(decltype(lvalue_ref)&, int&), ""); + static_assert(__is_same(decltype(lvalue_ref)&&, int&), ""); + static_assert(__is_same(decltype(rvalue_ref)&, int&), ""); + static_assert(__is_same(decltype(rvalue_ref)&&, int&&), ""); +} +#endif +} // namespace cwg988 + namespace cwg990 { // cwg990: 3.5 #if __cplusplus >= 201103L struct A { // #cwg990-A diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index ace1f3b9aba33..5fb86ecb85393 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -6693,7 +6693,7 @@

C++ defect report implementation status

[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype - Unknown + Clang 2.7 989 From c51634486b6092d89a57224a7fc78002ada71986 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 11 May 2026 07:44:02 -0700 Subject: [PATCH 282/538] [PM] Make InvalidateAllAnalysesPass Optional (#196956) Similar reasoning to 221a24e94f7b03ea881df34cc8867c58ac8fdb52. Making this required means we end up with assertion failures in the LPM around LCSSA. This is a bit unfortunate given it would be nice to ensure we can trivially invalidate analyses on optnone functions, but this matches the old behavior and prevents and assertion failure for now. --- llvm/include/llvm/IR/PassManager.h | 2 +- llvm/test/Other/lpm-require-analysis-optnone.ll | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 7d9254e7d8e98..22e02853bc567 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -972,7 +972,7 @@ struct InvalidateAnalysisPass /// Because this preserves no analyses, any analysis passes queried after this /// pass runs will recompute fresh results. struct InvalidateAllAnalysesPass - : RequiredPassInfoMixin { + : OptionalPassInfoMixin { /// Run this pass over some unit of IR. template PreservedAnalyses run(IRUnitT &, AnalysisManagerT &, ExtraArgTs &&...) { diff --git a/llvm/test/Other/lpm-require-analysis-optnone.ll b/llvm/test/Other/lpm-require-analysis-optnone.ll index 4e3c406852565..abfa539fef4d4 100644 --- a/llvm/test/Other/lpm-require-analysis-optnone.ll +++ b/llvm/test/Other/lpm-require-analysis-optnone.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes="require" -S | FileCheck %s +; RUN: opt < %s -passes="loop(invalidate)" -S | FileCheck %s ;; Test that if we have a loop out of LCSSA in an optnone function, we do not ;; assert when we require a loop analysis. 
From b4c9e713fa40621c0f49b4a868a06d8bb6d06806 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 11 May 2026 16:51:36 +0200 Subject: [PATCH 283/538] [clang][bytecode] Allow IntAP(S) in CastIntegralFixedPoint op (#196949) --- clang/lib/AST/ByteCode/Opcodes.td | 2 +- clang/test/AST/ByteCode/fixed-point.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 0838263a53ede..bc2dc7f2fc02c 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -763,7 +763,7 @@ def CastPointerIntegralAPS : Opcode { let Args = [ArgUint32]; } def CastIntegralFixedPoint : Opcode { - let Types = [FixedSizeIntegralTypeClass]; + let Types = [IntegralTypeClass]; let Args = [ArgUint32]; let HasGroup = 1; } diff --git a/clang/test/AST/ByteCode/fixed-point.cpp b/clang/test/AST/ByteCode/fixed-point.cpp index c8baa1972536a..fb44558fc037b 100644 --- a/clang/test/AST/ByteCode/fixed-point.cpp +++ b/clang/test/AST/ByteCode/fixed-point.cpp @@ -20,6 +20,11 @@ static_assert(A == 0.0k); static_assert(A == 0); static_assert(!A); +#ifdef __SIZEOF_INT128__ +constexpr __int128 i128 = 42; +static_assert(i128 == 42.0k, ""); +#endif + constexpr bool toBool() { if (A) return true; From ab963e88b83f5cf9788efbfcea3a7336d66da72a Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Mon, 11 May 2026 22:51:38 +0800 Subject: [PATCH 284/538] [DAGCombiner] Fix abs(add) to abdu miscompile in foldABSToABD (#196782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The abs(add(x, y)) → abdu(x, -y) fold added in #186659 is incorrect when both operands are known non-negative and their sum does not overflow signed. When both x and y are non-negative and `x + y < 2^31`, `abs(x + y) = x + y`, but `abdu(x, -y) = 2^32 - y - x ≠ x + y`. For example, `abs(add(0, 1)) = 1`, but `abdu(0, -1) = 0xFFFFFFFF`. 
Related: #185467 #175801 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +-- llvm/test/CodeGen/X86/abds.ll | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bf4d4cc261e42..3d65c82f051a8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12025,10 +12025,8 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) { return CreateZextedAbd(ISD::ABDS); // fold (abs (sub x, y)) -> abdu(x, y) - // fold (abs (add x, -y)) -> abdu(x, y) bool Op1SignBitIsOne = DAG.computeKnownBits(Op1).isNegative(); - bool AbsOpWillNUW = DAG.SignBitIsZero(Op0) && - (IsAdd ? DAG.SignBitIsZero(Op1) : Op1SignBitIsOne); + bool AbsOpWillNUW = !IsAdd && DAG.SignBitIsZero(Op0) && Op1SignBitIsOne; if (hasOperation(ISD::ABDU, VT) && AbsOpWillNUW) return CreateZextedAbd(ISD::ABDU); diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 5948af563d152..a3056e9426643 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -1389,6 +1389,32 @@ define i32 @PR185467(i32 range(i32 0, 2147483647) %v) { ret i32 %absx } +define i32 @abs_add_known_positive(i32 %a) { +; X86-LABEL: abs_add_known_positive: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abs_add_known_positive: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF +; X64-NEXT: leal 1(%rdi), %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovsl %ecx, %eax +; X64-NEXT: retq + %x = and i32 %a, 2147483647 + %add = add i32 %x, 1 + %abs = call i32 @llvm.abs.i32(i32 %add, i1 false) + ret i32 %abs +} + 
declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) From b49288ff950d00bd44a6f116d3caa195435b2479 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Mon, 11 May 2026 22:52:21 +0800 Subject: [PATCH 285/538] [SelectionDAG] Drop unnecessary lower bound check in `lowerRangeToAssertZExt` (#196785) Drop the `Lo.isMinValue()` check in `lowerRangeToAssertZExt`. The check was introduced in 2bba779272a2 when the bit width was computed via `logBase2(Hi)`, which required `Lo == 0` for correctness. It is no longer needed since 9e04befb0979 when we switched to `getUnsignedMax().getActiveBits()` for bit width. The change in `DAGCombiner.cpp` is to prevent a regression in `llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll`. I wasn't able to construct an individual test for it. --------- Co-authored-by: nikic --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 + .../SelectionDAG/SelectionDAGBuilder.cpp | 4 - llvm/test/CodeGen/X86/call-range-attr.ll | 73 +++++++++++++++++++ 3 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/X86/call-range-attr.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3d65c82f051a8..14bf2b704c4da 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16213,6 +16213,10 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { AssertVT == cast(N0.getOperand(1))->getVT()) return N0; + // fold (assert?ext c, vt) -> c + if (isa(N0)) + return N0; + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).getOpcode() == Opcode) { // We have an assert, truncate, assert sandwich. 
Make one stronger assert diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 68ae86d8d561f..5753d74168e59 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -10852,10 +10852,6 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, if (!CR || CR->isFullSet() || CR->isEmptySet() || CR->isUpperWrapped()) return Op; - APInt Lo = CR->getUnsignedMin(); - if (!Lo.isMinValue()) - return Op; - APInt Hi = CR->getUnsignedMax(); unsigned Bits = std::max(Hi.getActiveBits(), static_cast(IntegerType::MIN_INT_BITS)); diff --git a/llvm/test/CodeGen/X86/call-range-attr.ll b/llvm/test/CodeGen/X86/call-range-attr.ll new file mode 100644 index 0000000000000..80e03e91d12dc --- /dev/null +++ b/llvm/test/CodeGen/X86/call-range-attr.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +declare i64 @returns_i64() + +define i64 @call_range_nonzero_lo() nounwind { +; CHECK-LABEL: call_range_nonzero_lo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq returns_i64@PLT +; CHECK-NEXT: andq $-8, %rax +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq + %v = call range(i64 1, 2305843009213693952) i64 @returns_i64() + %r = and i64 %v, 2305843009213693944 + ret i64 %r +} + +define i64 @call_range_zero_lo() nounwind { +; CHECK-LABEL: call_range_zero_lo: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq returns_i64@PLT +; CHECK-NEXT: andl $-8, %eax +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq + %v = call range(i64 0, 256) i64 @returns_i64() + %r = and i64 %v, 248 + ret i64 %r +} + +define i64 @call_range_narrow() nounwind { +; CHECK-LABEL: call_range_narrow: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq returns_i64@PLT +; CHECK-NEXT: andl $-8, %eax +; 
CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq + %v = call range(i64 100, 256) i64 @returns_i64() + %r = and i64 %v, 248 + ret i64 %r +} + +; Negative tests + +define i64 @call_no_range() nounwind { +; CHECK-LABEL: call_no_range: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq returns_i64@PLT +; CHECK-NEXT: movabsq $2305843009213693944, %rcx # imm = 0x1FFFFFFFFFFFFFF8 +; CHECK-NEXT: andq %rcx, %rax +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq + %v = call i64 @returns_i64() + %r = and i64 %v, 2305843009213693944 + ret i64 %r +} + +define i64 @call_wrapped_range() nounwind { +; CHECK-LABEL: call_wrapped_range: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq returns_i64@PLT +; CHECK-NEXT: movabsq $2305843009213693944, %rcx # imm = 0x1FFFFFFFFFFFFFF8 +; CHECK-NEXT: andq %rcx, %rax +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq + %v = call range(i64 -100, 100) i64 @returns_i64() + %r = and i64 %v, 2305843009213693944 + ret i64 %r +} From 2144d0a732e4366c2c4961ce495ae2ee575ce30e Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Mon, 11 May 2026 17:01:33 +0200 Subject: [PATCH 286/538] [SPIR-V] Add support for OpFMod intrinsic (#193172) Add the `spv.fmod` intrinsic and lower it directly to `SPIRV::OpFMod` covering scalar and vector cases --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 18 ++++++++++++++++++ llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 4 ++++ .../scalar-floating-point-arithmetic.ll | 18 +++++++++++++++++- .../vector-floating-point-arithmetic.ll | 19 ++++++++++++++++++- 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 4b663d7b78e37..70644c402c38a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2139,6 +2139,21 @@ static bool generateMulExtendedInst(const SPIRV::IncomingCall *Call, return true; } +static bool generateArithmeticInst(const SPIRV::IncomingCall *Call, + 
MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + + auto MIB = MIRBuilder.buildInstr(Opcode) + .addDef(Call->ReturnRegister) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)); + for (Register Arg : Call->Arguments) + MIB.addUse(Arg); + return true; +} + static bool generateGetQueryInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -3346,6 +3361,7 @@ mapBuiltinToOpcode(const StringRef DemangledCall, case SPIRV::AsyncCopy: case SPIRV::LoadStore: case SPIRV::CoopMatr: + case SPIRV::Arithmetic: if (const auto *R = SPIRV::lookupNativeBuiltin(Call->Builtin->Name, Call->Builtin->Set)) return std::make_tuple(Call->Builtin->Group, R->Opcode, 0); @@ -3459,6 +3475,8 @@ std::optional lowerBuiltin(const StringRef DemangledCall, return generateICarryBorrowInst(Call.get(), MIRBuilder, GR); case SPIRV::MulExtended: return generateMulExtendedInst(Call.get(), MIRBuilder, GR); + case SPIRV::Arithmetic: + return generateArithmeticInst(Call.get(), MIRBuilder, GR); case SPIRV::GetQuery: return generateGetQueryInst(Call.get(), MIRBuilder, GR); case SPIRV::ImageSizeQuery: diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index ad192883d162b..806d283ff715f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -72,6 +72,7 @@ def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; def Pipe : BuiltinGroup; def PredicatedLoadStore : BuiltinGroup; +def Arithmetic : BuiltinGroup; def ArbitraryPrecisionFixedPoint : BuiltinGroup; def BlockingPipes : BuiltinGroup; def ImageChannelDataTypes : BuiltinGroup; @@ -697,6 +698,9 @@ defm : DemangledNativeBuiltin<"__spirv_UMulExtended", GLSL_std_450, MulExtended, defm : DemangledNativeBuiltin<"__spirv_SMulExtended", OpenCL_std, MulExtended, 2, 3, 
OpSMulExtended>; defm : DemangledNativeBuiltin<"__spirv_SMulExtended", GLSL_std_450, MulExtended, 2, 3, OpSMulExtended>; +// Arithmetic builtin records: +defm : DemangledNativeBuiltin<"__spirv_FMod", OpenCL_std, Arithmetic, 2, 2, OpFMod>; + // cl_intel_split_work_group_barrier defm : DemangledNativeBuiltin<"intel_work_group_barrier_arrive", OpenCL_std, Barrier, 1, 2, OpControlBarrierArriveINTEL>; defm : DemangledNativeBuiltin<"__spirv_ControlBarrierArriveINTEL", OpenCL_std, Barrier, 3, 3, OpControlBarrierArriveINTEL>; diff --git a/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll b/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll index b04db946f36d0..bdad95a1615bc 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll @@ -8,7 +8,7 @@ ; CHECK-DAG: OpName [[SCALAR_FDIV:%.+]] "scalar_fdiv" ; CHECK-DAG: OpName [[SCALAR_FREM:%.+]] "scalar_frem" ; CHECK-DAG: OpName [[SCALAR_FMA:%.+]] "scalar_fma" -;; FIXME: add test for OpFMod +; CHECK-DAG: OpName [[SCALAR_FMOD:%.+]] "scalar_fmod" ; CHECK-NOT: DAG-FENCE @@ -109,6 +109,22 @@ define float @scalar_frem(float %a, float %b) { ; CHECK: OpReturnValue [[C]] ; CHECK-NEXT: OpFunctionEnd +;; Test fmod on scalar: +define spir_func float @scalar_fmod(float %a, float %b) { + %c = call spir_func float @_Z12__spirv_FModff(float %a, float %b) + ret float %c +} + +declare spir_func float @_Z12__spirv_FModff(float, float) + +; CHECK: [[SCALAR_FMOD]] = OpFunction [[SCALAR]] None [[SCALAR_FN]] +; CHECK-NEXT: [[A:%.+]] = OpFunctionParameter [[SCALAR]] +; CHECK-NEXT: [[B:%.+]] = OpFunctionParameter [[SCALAR]] +; CHECK: OpLabel +; CHECK: [[C:%.+]] = OpFMod [[SCALAR]] [[A]] [[B]] +; CHECK: OpReturnValue [[C]] +; CHECK-NEXT: OpFunctionEnd + declare float @llvm.fma.f32(float, float, float) ;; Test fma on scalar: diff --git 
a/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll b/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll index 0b0e505829ed1..e79b3a6595a0f 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll @@ -7,7 +7,7 @@ ; CHECK-DAG: OpName [[VECTOR_FMUL:%.+]] "vector_fmul" ; CHECK-DAG: OpName [[VECTOR_FDIV:%.+]] "vector_fdiv" ; CHECK-DAG: OpName [[VECTOR_FREM:%.+]] "vector_frem" -;; TODO: add test for OpFMod +; CHECK-DAG: OpName [[VECTOR_FMOD:%.+]] "vector_fmod" ; CHECK-NOT: DAG-FENCE @@ -106,3 +106,20 @@ define <2 x half> @vector_frem(<2 x half> %a, <2 x half> %b) { ; CHECK: [[C:%.+]] = OpFRem [[VECTOR]] [[A]] [[B]] ; CHECK: OpReturnValue [[C]] ; CHECK-NEXT: OpFunctionEnd + + +;; Test fmod on vector: +define spir_func <2 x half> @vector_fmod(<2 x half> %a, <2 x half> %b) { + %c = call spir_func <2 x half> @_Z12__spirv_FModDv2_DhS_(<2 x half> %a, <2 x half> %b) + ret <2 x half> %c +} + +declare spir_func <2 x half> @_Z12__spirv_FModDv2_DhS_(<2 x half>, <2 x half>) + +; CHECK: [[VECTOR_FMOD]] = OpFunction [[VECTOR]] None [[VECTOR_FN]] +; CHECK-NEXT: [[A:%.+]] = OpFunctionParameter [[VECTOR]] +; CHECK-NEXT: [[B:%.+]] = OpFunctionParameter [[VECTOR]] +; CHECK: OpLabel +; CHECK: [[C:%.+]] = OpFMod [[VECTOR]] [[A]] [[B]] +; CHECK: OpReturnValue [[C]] +; CHECK-NEXT: OpFunctionEnd From 2fcf77664f6501548ba0475bc28c2f03bb3801c7 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Mon, 11 May 2026 17:06:19 +0200 Subject: [PATCH 287/538] [clang] Forward `-fvalidate-ast-input-files-content` when loading AST dumps `-fvalidate-ast-input-files-content` is silently ignored when loading PCH files for additional translation units. 
This triggers an import failure when modification time of some of input files changes (for example, the AST dump is being reused for a subsequent cross-translation-unit re-analysis with Clang Static Analyzer after a fresh code checkout). This makes it difficult to use cached AST dumps. This patch enables the user to control validation of AST input files, without imposing it on them. -- CPP-8312, CPP-8025 --- clang/lib/Frontend/ASTUnit.cpp | 5 ++- clang/test/Analysis/ctu/reusable-pch.c | 49 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 clang/test/Analysis/ctu/reusable-pch.c diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index 05ae1f348f920..83fe82365b008 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -784,7 +784,10 @@ std::unique_ptr ASTUnit::LoadFromASTFile( *AST->PP, *AST->ModCache, AST->Ctx.get(), PCHContainerRdr, *AST->CodeGenOpts, ArrayRef>(), /*isysroot=*/"", - /*DisableValidationKind=*/disableValid, AllowASTWithCompilerErrors); + /*DisableValidationKind=*/disableValid, AllowASTWithCompilerErrors, + /*AllowConfigurationMismatch=*/false, + /*ValidateSystemInputs=*/false, + /*ForceValidateUserInputs=*/true, HSOpts.ValidateASTInputFilesContent); // Attach the AST reader to the AST context as an external AST source, so that // declarations will be deserialized from the AST file as needed. diff --git a/clang/test/Analysis/ctu/reusable-pch.c b/clang/test/Analysis/ctu/reusable-pch.c new file mode 100644 index 0000000000000..ddcccc9cf6150 --- /dev/null +++ b/clang/test/Analysis/ctu/reusable-pch.c @@ -0,0 +1,49 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// DEFINE: %{ctu_analysis} = %clang_analyze_cc1 \ +// DEFINE: -analyzer-checker=core \ +// DEFINE: -analyzer-config experimental-enable-naive-ctu-analysis=true \ +// DEFINE: -analyzer-config ctu-dir=%t \ +// DEFINE: -verify + +// Step 1: Build PCH and defmap. 
+// RUN: %clang_cc1 -x c -emit-pch -fvalidate-ast-input-files-content -o %t/other.c.ast %t/other.c +// RUN: %clang_extdef_map %t/other.c -- -c -x c > %t/externalDefMap.tmp.txt +// RUN: sed -e 's| .*other\.c| other.c.ast|' %t/externalDefMap.tmp.txt > %t/externalDefMap.txt + +// Step 2a: Run CTU using the PCH - the division by zero is found via inlining. +// RUN: %{ctu_analysis} %t/main.c + +// Step 2b: Run with content validation - no difference. +// RUN: %{ctu_analysis} %t/main.c -fvalidate-ast-input-files-content + +// Step 3: Set mtime of the source from which PCH was built to the year 3000 (way in the future). +// RUN: touch -t 300001010000 %t/other.c + +// Step 4a: Run CTU using the "stale" PCH, and it should still load it and find the division by zero bug. +// RUN: %{ctu_analysis} -fvalidate-ast-input-files-content %t/main.c + +// Step 4b: Run without content validation: CTU import failure +// RUN: not %{ctu_analysis} %t/main.c 2>&1 | FileCheck %s + +//--- main.c +// Without CTU, always_zero() has an unknown return value so no bug is found. +// With CTU, always_zero() is inlined and its return value (0) is known, +// exposing the division by zero. + +// CHECK: fatal error: file '{{.*}}other.c' has been modified since the precompiled file '{{.*}}other.c.ast' was built +// CHECK: note: mtime changed from expected +// CHECK: note: earlier input file validation has covered only user files +// CHECK: import of an external symbol for CTU failed: Failed to load external AST source. 
+ +int always_zero(void); + +void f(void) { + int x = always_zero(); + (void)(1 / x); // expected-warning{{Division by zero}} +} + +//--- other.c +int always_zero(void) { return 0; } From fca1083fd68676e670267c5cc91c8ac0eeef1ddb Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 11 May 2026 10:09:08 -0500 Subject: [PATCH 288/538] [lldb] Handle SIGINT via the MainLoop signal thread (on POSIX) (#196687) The driver's async SIGINT handler called SBDebugger::DispatchInputInterrupt directly. That is not async-signal-safe and can lead to a crash. Register SIGINT with the existing signal-thread MainLoop instead so DispatchInputInterrupt runs in normal thread context. The Windows path is unchanged and keeps the legacy async handler. While DispatchInputInterrupt runs, the callback temporarily installs SIG_DFL so a second Ctrl-C still hard-terminates the process, preserving the escape hatch users rely on when the debugger is unresponsive. Moving SIGINT off the main thread means a Ctrl-C no longer interrupts blocking syscalls there (e.g. a Python REPL waiting on input or sleeping), so Python never observes the queued interrupt and KeyboardInterrupt is not raised. To restore that behavior, after dispatching the interrupt the callback re-raises SIGINT on the main thread via pthread_kill; the resulting EINTR lets Python pick up the pending interrupt. A skip flag suppresses the re-entry that this self-send produces. Because the callback only ever runs on the signal thread, the flag and the captured main-thread id live in the lambda's captures and need no synchronization. 
rdar://158218595 --- lldb/tools/driver/Driver.cpp | 60 +++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index d47d3daf1c3fc..e58286f9ff41e 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -43,6 +43,9 @@ #include #include #include +#ifndef _WIN32 +#include +#endif #include #include #include @@ -651,11 +654,10 @@ void Driver::UpdateWindowSize() { } } -void sigint_handler(int signo) { #ifdef _WIN32 +void sigint_handler(int signo) { // Restore handler as it is not persistent on Windows. signal(SIGINT, sigint_handler); -#endif static std::atomic_flag g_interrupt_sent = ATOMIC_FLAG_INIT; if (g_driver != nullptr) { @@ -668,6 +670,7 @@ void sigint_handler(int signo) { _exit(signo); } +#endif static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) { std::string usage_str = tool_name.str() + " [options]"; @@ -781,15 +784,64 @@ int main(int argc, char const *argv[]) { // Setup LLDB signal handlers once the debugger has been initialized. SBDebugger::PrintDiagnosticsOnError(); - // FIXME: Migrate the SIGINT handler to be handled by the signal loop below. +#ifdef _WIN32 signal(SIGINT, sigint_handler); -#if !defined(_WIN32) +#else signal(SIGPIPE, SIG_IGN); + // Capture the main thread's id so the signal thread can target it. + pthread_t main_thread = pthread_self(); + + // Set when the signal thread sends itself a SIGINT to wake the main thread. + // The next callback invocation observes this flag and skips the work. A + // plain bool is sufficient because the callback only ever runs on the + // signal thread; it lives outside the lambda because MainLoopPosix copies + // the callback on every dispatch, which would discard in-lambda state. + bool skip_next_sigint = false; + // Handle signals in a MainLoop running on a separate thread. 
MainLoop signal_loop; Status signal_status; + auto sigint_handler = signal_loop.RegisterSignal( + SIGINT, + [&, main_thread](MainLoopBase &) { + // Skip the self-sent wakeup SIGINT queued at the end of the previous + // invocation. + if (std::exchange(skip_next_sigint, false)) + return; + + // Temporarily restore the default disposition so that a second SIGINT + // delivered while DispatchInputInterrupt is running hard-terminates + // the process. This preserves the "double Ctrl-C to force exit" + // escape hatch users rely on when the debugger is unresponsive. + struct sigaction old_action; + struct sigaction new_action = {}; + new_action.sa_handler = SIG_DFL; + sigemptyset(&new_action.sa_mask); + + int ret = sigaction(SIGINT, &new_action, &old_action); + UNUSED_IF_ASSERT_DISABLED(ret); + assert(ret == 0 && "sigaction failed"); + + if (g_driver) + g_driver->GetDebugger().DispatchInputInterrupt(); + + ret = sigaction(SIGINT, &old_action, nullptr); + UNUSED_IF_ASSERT_DISABLED(ret); + assert(ret == 0 && "sigaction failed"); + + // Wake the main thread so any blocking syscall (e.g. the Python REPL + // waiting on input or sleeping) returns with EINTR. This lets Python + // observe the pending interrupt queued by DispatchInputInterrupt and + // raise KeyboardInterrupt. Flag the resulting callback invocation so + // it's skipped rather than re-running DispatchInputInterrupt. 
+ skip_next_sigint = true; + pthread_kill(main_thread, SIGINT); + }, + signal_status); + assert(sigint_handler && signal_status.Success()); + auto sigwinch_handler = signal_loop.RegisterSignal( SIGWINCH, [&](MainLoopBase &) { From 0237e9a0d89614f1a3a638f7d27e6be898cae61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susan=20Tan=20=28=E3=82=B9-=E3=82=B6=E3=83=B3=E3=80=80?= =?UTF-8?q?=E3=82=BF=E3=83=B3=29?= Date: Mon, 11 May 2026 11:13:22 -0400 Subject: [PATCH 289/538] [flang][FIRToMemRef] [flang][fir-to-memref] Lower complex projected slices via memref<...x2xT> reinterpretation (#196123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the `fir.array_coor` site, reinterpret the `memref>` as `memref` via `fir.convert`, then append the component index (0=re, 1=im) as the final memref index. Loads and stores then operate directly on a scalar `T`-sized location. --- .../lib/Optimizer/Transforms/FIRToMemRef.cpp | 81 +++++---- .../FIRToMemRef/slice-projected.mlir | 154 +++++++++++++----- 2 files changed, 163 insertions(+), 72 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/FIRToMemRef.cpp b/flang/lib/Optimizer/Transforms/FIRToMemRef.cpp index f316700f6f341..5b353a2e3f689 100644 --- a/flang/lib/Optimizer/Transforms/FIRToMemRef.cpp +++ b/flang/lib/Optimizer/Transforms/FIRToMemRef.cpp @@ -162,6 +162,8 @@ class FIRToMemRef : public fir::impl::FIRToMemRefBase { SmallVector shiftVec; SmallVector sliceVec; bool hasProjectedSlice = false; + // Constant value of the first projected-slice field, if any. + std::optional projectedSliceStart; }; template @@ -183,6 +185,15 @@ class FIRToMemRef : public fir::impl::FIRToMemRefBase { return sliceOp && !sliceOp.getFields().empty(); } + // Returns the constant first projected-slice field, if available. 
+ static std::optional + getProjectedSliceStartIfConstant(fir::SliceOp sliceOp) { + auto fields = sliceOp.getFields(); + if (fields.empty()) + return std::nullopt; + return fir::getIntIfConstant(fields.front()); + } + unsigned getRankFromEmbox(fir::EmboxOp embox) const { auto memrefType = embox.getMemref().getType(); Type unwrappedType = fir::unwrapRefType(memrefType); @@ -325,15 +336,12 @@ void FIRToMemRef::collectSliceInfoFrom(OpTy op, SliceInfo &info) const { } if (auto sliceOp = getSliceOp(op.getSlice())) { - // A slice path changes the physical projection of the boxed entity (for - // example, `complex -> real` for `%re`). Preserve shape/shift for logical - // indexing, but do not treat the triplets alone as layout information. if (hasProjectedSlice(sliceOp)) { info.hasProjectedSlice = true; - } else { - auto triples = sliceOp.getTriples(); - info.sliceVec.append(triples.begin(), triples.end()); + info.projectedSliceStart = getProjectedSliceStartIfConstant(sliceOp); } + auto triples = sliceOp.getTriples(); + info.sliceVec.append(triples.begin(), triples.end()); } } } @@ -485,9 +493,6 @@ FIRToMemRef::getMemrefIndices(fir::ArrayCoorOp arrayCoorOp, Operation *memref, rank = getRankFromEmbox(embox); } - // Projected boxed slices leave `sliceVec` empty on purpose: indices are - // computed in the logical section coordinate space, while stride/base come - // later from the box descriptor. SmallVector &shiftVec = sliceInfo.shiftVec; SmallVector &sliceVec = sliceInfo.sliceVec; SmallVector sliceLbs, sliceStrides; @@ -638,6 +643,41 @@ FIRToMemRef::convertArrayCoorOp(Operation *memOp, fir::ArrayCoorOp arrayCoorOp, bool isDescriptor = mlir::isa(firMemref.getType()) || firMemref.getDefiningOp() != nullptr; + // For complex projections, reinterpret memref> as + // memref and append the component index (0=re, 1=im) so that + // each load/store touches exactly sizeof(T) bytes. 
+ SliceInfo sliceInfo; + collectSliceInfoFrom(arrayCoorOp, sliceInfo); + if (auto embox = firMemref.getDefiningOp()) + collectSliceInfoFrom(embox, sliceInfo); + else if (auto rebox = firMemref.getDefiningOp()) + collectSliceInfoFrom(rebox, sliceInfo); + auto srcTy = cast((*converted).getType()); + if (sliceInfo.hasProjectedSlice) { + if (auto complexTy = dyn_cast(srcTy.getElementType())) { + if (!sliceInfo.projectedSliceStart || + (*sliceInfo.projectedSliceStart != 0 && + *sliceInfo.projectedSliceStart != 1)) { + LLVM_DEBUG( + llvm::dbgs() + << "FIRToMemRef: projected complex slice selector must be constant " + "0 (real) or 1 (imaginary), bailing out of conversion\n"); + return failure(); + } + auto projection = *sliceInfo.projectedSliceStart; + SmallVector shape(srcTy.getShape()); + shape.push_back(2); + Value compMemref = + fir::ConvertOp::create( + rewriter, loc, MemRefType::get(shape, complexTy.getElementType()), + *converted) + .getResult(); + indices.push_back( + arith::ConstantIndexOp::create(rewriter, loc, projection)); + return std::pair{compMemref, indices}; + } + } + // Static shape does not imply contiguous layout for descriptor-backed // entities (e.g. boxed array sections with non-unit stride). Keep the // reinterpret-cast path so descriptor strides are preserved. 
@@ -645,7 +685,6 @@ FIRToMemRef::convertArrayCoorOp(Operation *memOp, fir::ArrayCoorOp arrayCoorOp, return std::pair{*converted, indices}; unsigned rank = arrayCoorOp.getIndices().size(); - if (auto embox = firMemref.getDefiningOp()) rank = getRankFromEmbox(embox); @@ -654,29 +693,17 @@ FIRToMemRef::convertArrayCoorOp(Operation *memOp, fir::ArrayCoorOp arrayCoorOp, SmallVector strides; strides.reserve(rank); - SliceInfo sliceInfo; - collectSliceInfoFrom(arrayCoorOp, sliceInfo); - - Value box = firMemref; - if (!isa(firMemref)) { - if (auto embox = firMemref.getDefiningOp()) { - collectSliceInfoFrom(embox, sliceInfo); - } else if (auto rebox = firMemref.getDefiningOp()) { - collectSliceInfoFrom(rebox, sliceInfo); - } - } - SmallVector &shapeVec = sliceInfo.shapeVec; if (sliceInfo.hasProjectedSlice || shapeVec.empty()) { // Projected slices carry their physical layout in the descriptor. Rebuild // the MemRef view from box metadata instead of from slice triplets. auto boxElementSize = - fir::BoxEleSizeOp::create(rewriter, loc, indexTy, box); + fir::BoxEleSizeOp::create(rewriter, loc, indexTy, firMemref); for (unsigned i = 0; i < rank; ++i) { Value dim = arith::ConstantIndexOp::create(rewriter, loc, rank - i - 1); auto boxDims = fir::BoxDimsOp::create(rewriter, loc, indexTy, indexTy, - indexTy, box, dim); + indexTy, firMemref, dim); Value extent = boxDims->getResult(1); sizes.push_back(castTypeToIndexType(extent, rewriter)); @@ -803,9 +830,7 @@ FIRToMemRef::getFIRConvert(Operation *memOp, Operation *op, "the same, bailing out of conversion\n"); return failure(); } - // Keep `box_addr` on the projected box so the descriptor remains the - // source of truth for projected element type and stride. 
- if (!projectedSlice && embox.getSlice() && + if (embox.getSlice() && embox.getSlice().getDefiningOp()) { Type originalType = embox.getMemref().getType(); basePtr = embox.getMemref(); @@ -1243,7 +1268,7 @@ void FIRToMemRef::rewriteStoreOp(fir::StoreOp store, PatternRewriter &rewriter, value = createTypeConversion(rewriter, store.getLoc(), convertedType, value); - Attribute attr = (store.getOperation())->getAttr("tbaa"); + Attribute attr = store.getOperation()->getAttr("tbaa"); memref::StoreOp storeOp = rewriter.replaceOpWithNewOp( store, value, converted, indices); if (attr) diff --git a/flang/test/Transforms/FIRToMemRef/slice-projected.mlir b/flang/test/Transforms/FIRToMemRef/slice-projected.mlir index 7b0fbdf748173..17af59086122c 100644 --- a/flang/test/Transforms/FIRToMemRef/slice-projected.mlir +++ b/flang/test/Transforms/FIRToMemRef/slice-projected.mlir @@ -2,12 +2,9 @@ // Tests for fir.slice with a path component (projected component slice). // A projected slice changes the element type of the boxed view, e.g. -// z%re projects complex -> f32. The layout (strides / base address) -// must come from the projected box descriptor, NOT from reconstructing the -// triplets, because memref.reinterpret_cast requires the same element type -// on both sides and the triplet strides are in storage-element units -// (complex) while the MemRef strides must be in projected-element units -// (f32). +// z%re projects complex -> f32. The pass bypasses the box descriptor +// and reinterprets the underlying complex array as memref<...x2xf32>, then +// appends the component index (0=re, 1=im) as the final memref index. // // Derived from: // complex, target :: z(4) = 0. @@ -16,40 +13,19 @@ // r = r + z(4:1:-1)%re // ---------------------------------------------------------------------------- -// Forward projected slice: z(1:4:1)%re -// The slice path %c0 projects complex -> f32 (real part). 
-// Expected lowering: -// - fir.box_addr on the projected box (!fir.box>) -// - fir.convert to memref<4xf32> (NOT to memref<4xcomplex>) -// - index = i - 1 (1-based, no triplet arithmetic) -// - strides from fir.box_dims / fir.box_elesize on the projected box +// Forward projected slice load: z(1:4:1)%re +// The fir.convert appears inside the loop body (insertion point tracks the +// array_coor inside the loop). elemIdx = (i - 1) * step + (lb - 1) = i - 1 +// for step=1, lb=1. Indices are reversed (col-major → row-major) but for 1D +// that is a no-op. // ---------------------------------------------------------------------------- // CHECK-LABEL: func.func @projected_slice_fwd -// CHECK: [[C1:%.*]] = arith.constant 1 : index -// CHECK: [[C4:%.*]] = arith.constant 4 : index -// CHECK: [[C0:%.*]] = arith.constant 0 : index -// CHECK: [[SHAPE:%.*]] = fir.shape [[C4]] : (index) -> !fir.shape<1> -// CHECK: [[SLICE:%.*]] = fir.slice [[C1]], [[C4]], [[C1]] path [[C0]] : (index, index, index, index) -> !fir.slice<1> -// CHECK: [[EMBOX:%.*]] = fir.embox %arg0([[SHAPE]]) {{\[}}[[SLICE]]{{\]}} : (!fir.ref>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> -// CHECK: fir.do_loop [[I:%.*]] = [[C1]] to [[C4]] step [[C1]] unordered { -// Projected box_addr gives f32 pointer, not complex. -// CHECK: [[BOXADDR:%.*]] = fir.box_addr [[EMBOX]] : (!fir.box>) -> !fir.ref> -// CHECK: [[CONVERT:%.*]] = fir.convert [[BOXADDR]] : (!fir.ref>) -> memref<4xf32> -// Index: i-1 (1-based). The lowering emits: delta=i-1, scaled=delta*1, -// offset=1-1=0, finalIdx=scaled+offset. The addi result is what feeds the load. -// CHECK: [[C1_0:%.*]] = arith.constant 1 : index -// CHECK: [[DELTA:%.*]] = arith.subi [[I]], [[C1_0]] : index -// CHECK: [[SCALED:%.*]] = arith.muli [[DELTA]], [[C1_0]] : index -// CHECK: [[OFFSET:%.*]] = arith.subi [[C1_0]], [[C1_0]] : index -// CHECK: [[IDX:%.*]] = arith.addi [[SCALED]], [[OFFSET]] : index -// Layout: extent and stride come from the projected box descriptor. 
-// CHECK: [[ELE:%.*]] = fir.box_elesize [[EMBOX]] : (!fir.box>) -> index -// CHECK: [[C0_0:%.*]] = arith.constant 0 : index -// CHECK: [[DIMS:%.*]]:3 = fir.box_dims [[EMBOX]], [[C0_0]] : (!fir.box>, index) -> (index, index, index) -// CHECK: [[STRIDE:%.*]] = arith.divsi [[DIMS]]#2, [[ELE]] : index -// CHECK: [[C0_1:%.*]] = arith.constant 0 : index -// CHECK: [[VIEW:%.*]] = memref.reinterpret_cast [[CONVERT]] to offset: {{\[}}[[C0_1]]{{\]}}, sizes: {{\[}}[[DIMS]]#1{{\]}}, strides: {{\[}}[[STRIDE]]{{\]}} : memref<4xf32> to memref> -// CHECK: memref.load [[VIEW]]{{\[}}[[IDX]]{{\]}} : memref> +// CHECK: fir.do_loop [[I:%.*]] = +// CHECK: [[MEMREF:%.*]] = fir.convert %arg0 : (!fir.ref>>) -> memref<4xcomplex> +// CHECK: [[IDX:%.*]] = arith.addi +// CHECK: [[COMP:%.*]] = fir.convert [[MEMREF]] : (memref<4xcomplex>) -> memref<4x2xf32> +// CHECK: arith.constant 0 +// CHECK: memref.load [[COMP]][[[IDX]], {{%.*}}] : memref<4x2xf32> func.func @projected_slice_fwd(%arg0: !fir.ref>>) { %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index @@ -64,16 +40,106 @@ func.func @projected_slice_fwd(%arg0: !fir.ref>>) { return } +// ---------------------------------------------------------------------------- +// Backward projected slice load: z(4:1:-1)%re +// step = -1, lb = 4 → elemIdx = (i - 1) * (-1) + (4 - 1) = 3 - (i-1) +// ---------------------------------------------------------------------------- +// CHECK-LABEL: func.func @projected_slice_bwd +// CHECK: fir.do_loop [[I:%.*]] = +// CHECK: [[MEMREF:%.*]] = fir.convert %arg0 : (!fir.ref>>) -> memref<4xcomplex> +// CHECK: [[IDX:%.*]] = arith.addi +// CHECK: [[COMP:%.*]] = fir.convert [[MEMREF]] : (memref<4xcomplex>) -> memref<4x2xf32> +// CHECK: arith.constant 0 +// CHECK: memref.load [[COMP]][[[IDX]], {{%.*}}] : memref<4x2xf32> +func.func @projected_slice_bwd(%arg0: !fir.ref>>) { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cm1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %shape = 
fir.shape %c4 : (index) -> !fir.shape<1> + %slice = fir.slice %c4, %c1, %cm1 path %c0 : (index, index, index, index) -> !fir.slice<1> + %embox = fir.embox %arg0(%shape) [%slice] : (!fir.ref>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> + fir.do_loop %i = %c1 to %c4 step %c1 unordered { + %coor = fir.array_coor %embox %i : (!fir.box>, index) -> !fir.ref + %val = fir.load %coor : !fir.ref + } + return +} + +// ---------------------------------------------------------------------------- +// Imaginary component store: z(1:4:1)%im = val +// Direct scalar store — no read-modify-write, no complex.create. +// ---------------------------------------------------------------------------- +// CHECK-LABEL: func.func @projected_slice_store_im +// CHECK: fir.do_loop [[I:%.*]] = +// CHECK: [[MEMREF:%.*]] = fir.convert %arg0 : (!fir.ref>>) -> memref<4xcomplex> +// CHECK: [[IDX:%.*]] = arith.addi +// CHECK: [[COMP:%.*]] = fir.convert [[MEMREF]] : (memref<4xcomplex>) -> memref<4x2xf32> +// CHECK: arith.constant 1 +// CHECK: memref.store %arg1, [[COMP]][[[IDX]], {{%.*}}] : memref<4x2xf32> +func.func @projected_slice_store_im(%arg0: !fir.ref>>, + %arg1: f32) { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c1_im = arith.constant 1 : index // imaginary component index + %shape = fir.shape %c4 : (index) -> !fir.shape<1> + %slice = fir.slice %c1, %c4, %c1 path %c1_im : (index, index, index, index) -> !fir.slice<1> + %embox = fir.embox %arg0(%shape) [%slice] : (!fir.ref>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> + fir.do_loop %i = %c1 to %c4 step %c1 unordered { + %coor = fir.array_coor %embox %i : (!fir.box>, index) -> !fir.ref + fir.store %arg1 to %coor : !fir.ref + } + return +} + +// ---------------------------------------------------------------------------- +// 2-D boxed projected slice load: z(1:2:1, 1:3:1)%re +// Storage: !fir.array<2x3xcomplex> +// +// convertMemrefType reverses Fortran column-major extents to MLIR row-major: +// !fir.ref>> → 
memref<3x2xcomplex> +// Reinterpret adds the component dimension: +// memref<3x2xcomplex> → memref<3x2x2xf32> +// +// Per-dimension element index (0-based, column-major): +// elemIdx_i = (i-1)*1 + (1-1) = i-1 (Fortran dim 1, size 2) +// elemIdx_j = (j-1)*1 + (1-1) = j-1 (Fortran dim 2, size 3) +// +// After reversing for MLIR row-major access: +// memref.load [elemIdx_j, elemIdx_i, 0] +// ---------------------------------------------------------------------------- +// CHECK-LABEL: func.func @projected_slice_2d +// CHECK: fir.do_loop [[I:%.*]] = +// CHECK: fir.do_loop [[J:%.*]] = +// CHECK: [[MEMREF:%.*]] = fir.convert %arg0 : (!fir.ref>>) -> memref<3x2xcomplex> +// CHECK: [[IDX_I:%.*]] = arith.addi +// CHECK: [[IDX_J:%.*]] = arith.addi +// CHECK: [[COMP:%.*]] = fir.convert [[MEMREF]] : (memref<3x2xcomplex>) -> memref<3x2x2xf32> +// CHECK: arith.constant 0 +// CHECK: memref.load [[COMP]][[[IDX_J]], [[IDX_I]], {{%.*}}] : memref<3x2x2xf32> +func.func @projected_slice_2d(%arg0: !fir.ref>>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %shape = fir.shape %c2, %c3 : (index, index) -> !fir.shape<2> + %slice = fir.slice %c1, %c2, %c1, %c1, %c3, %c1 path %c0 : (index, index, index, index, index, index, index) -> !fir.slice<2> + %embox = fir.embox %arg0(%shape) [%slice] : (!fir.ref>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box> + fir.do_loop %i = %c1 to %c2 step %c1 unordered { + fir.do_loop %j = %c1 to %c3 step %c1 unordered { + %coor = fir.array_coor %embox %i, %j : (!fir.box>, index, index) -> !fir.ref + %val = fir.load %coor : !fir.ref + } + } + return +} + // ---------------------------------------------------------------------------- // Derived-type component projection: a%x where a : TYPE{x:f64, y:complex} // // This is NOT a complex projection — the storage element is the derived type T, -// not complex. 
FIRToMemRef cannot safely compute element-unit strides via -// divsi(byte_stride, elesize) because sizeof(T)/sizeof(component) may not be an -// integer (e.g. sizeof(T)=24, sizeof(complex)=16 -> 1.5, truncated to 1). -// -// The pass must leave fir.array_coor and fir.store/fir.load unconverted; -// downstream FIR-to-LLVM lowering handles them correctly via the descriptor. +// not complex. FIRToMemRef cannot safely handle this; downstream +// FIR-to-LLVM lowering handles it correctly via the descriptor. // // CHECK-LABEL: func.func @derived_component_not_projected // The fir.array_coor must survive (not be erased). From 5e8f473bd2fb90fa9209478f0b21640124915dff Mon Sep 17 00:00:00 2001 From: Milad Fa <46688537+miladfarca@users.noreply.github.com> Date: Mon, 11 May 2026 11:17:27 -0400 Subject: [PATCH 290/538] [libc] Fix BigInt shift on big-endian platforms (#196957) BigInt<128> stores the value in two separate word sized array slots with the low 64 bits being stored in val[0] and high 64 bits in val[1]. This can't be reinterpreted as a 128 bit value on big-endian platforms because the values are reversed. 
This has caused test failures on s390x builds of V8: https://issues.chromium.org/issues/511831894 --------- Co-authored-by: Guillaume Chatelet --- libc/src/__support/big_int.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h index bb9cefd67b552..a6dcff27ca167 100644 --- a/libc/src/__support/big_int.h +++ b/libc/src/__support/big_int.h @@ -259,7 +259,8 @@ LIBC_INLINE constexpr cpp::array shift(cpp::array array, constexpr size_t WORD_BITS = cpp::numeric_limits::digits; #ifdef LIBC_TYPES_HAS_INT128 constexpr size_t TOTAL_BITS = N * WORD_BITS; - if constexpr (TOTAL_BITS == 128) { + if constexpr (TOTAL_BITS == 128 && + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) { using type = cpp::conditional_t; auto tmp = cpp::bit_cast(array); if constexpr (direction == LEFT) From 92dad1322826c4df2c211ab57182c0dcaa458996 Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Mon, 11 May 2026 20:58:51 +0530 Subject: [PATCH 291/538] [AArch64] Additional Tablegen patterns for `shuffle(zext(...))` and `shuffle(sext(...))` to `uaddlp` (#195120) --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 39 +++++++ llvm/test/CodeGen/AArch64/addp-shuffle.ll | 64 +++-------- llvm/test/CodeGen/AArch64/uaddlp.ll | 111 ++++++++++++++++++++ 3 files changed, 166 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 4a8f11cef2713..c7d749ffe81e1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8659,6 +8659,7 @@ def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))), def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>; +// uaddlp from shuffle + zext def : Pat<(add (AArch64bici (v8i16 (bitconvert v16i8:$src)), (i32 255), (i32 8)), (AArch64vlshr (v8i16 (AArch64NvCast v16i8:$src)), (i32 8))), 
(UADDLPv16i8_v8i16 V128:$src)>; @@ -8674,6 +8675,44 @@ def : Pat<(add (v2i64 (zext (v2i32 (AArch64zip1 (v2i32 (extract_subvector v4i32: (v2i32 (extract_subvector v4i32:$src, (i64 2)))))))), (UADDLPv4i32_v2i64 V128:$src)>; +// uaddlp from zext + shuffle +def : Pat<(add (AArch64uzp1 (v8i16 (zext (v8i8 (extract_subvector v16i8:$src, (i64 0))))), + (v8i16 (zext (v8i8 (extract_subvector v16i8:$src, (i64 8)))))), + (AArch64uzp2 (v8i16 (zext (v8i8 (extract_subvector v16i8:$src, (i64 0))))), + (v8i16 (zext (v8i8 (extract_subvector v16i8:$src, (i64 8))))))), + (UADDLPv16i8_v8i16 V128:$src)>; + +def : Pat<(add (AArch64uzp1 (v4i32 (zext (v4i16 (extract_subvector v8i16:$src, (i64 0))))), + (v4i32 (zext (v4i16 (extract_subvector v8i16:$src, (i64 4)))))), + (AArch64uzp2 (v4i32 (zext (v4i16 (extract_subvector v8i16:$src, (i64 0))))), + (v4i32 (zext (v4i16 (extract_subvector v8i16:$src, (i64 4))))))), + (UADDLPv8i16_v4i32 V128:$src)>; + +def : Pat<(add (AArch64zip1 (v2i64 (zext (v2i32 (extract_subvector v4i32:$src, (i64 0))))), + (v2i64 (zext (v2i32 (extract_subvector v4i32:$src, (i64 2)))))), + (AArch64zip2 (v2i64 (zext (v2i32 (extract_subvector v4i32:$src, (i64 0))))), + (v2i64 (zext (v2i32 (extract_subvector v4i32:$src, (i64 2))))))), + (UADDLPv4i32_v2i64 V128:$src)>; + +// saddlp from sext + shuffle +def : Pat<(add (AArch64uzp1 (v8i16 (sext (v8i8 (extract_subvector v16i8:$src, (i64 0))))), + (v8i16 (sext (v8i8 (extract_subvector v16i8:$src, (i64 8)))))), + (AArch64uzp2 (v8i16 (sext (v8i8 (extract_subvector v16i8:$src, (i64 0))))), + (v8i16 (sext (v8i8 (extract_subvector v16i8:$src, (i64 8))))))), + (SADDLPv16i8_v8i16 V128:$src)>; + +def : Pat<(add (AArch64uzp1 (v4i32 (sext (v4i16 (extract_subvector v8i16:$src, (i64 0))))), + (v4i32 (sext (v4i16 (extract_subvector v8i16:$src, (i64 4)))))), + (AArch64uzp2 (v4i32 (sext (v4i16 (extract_subvector v8i16:$src, (i64 0))))), + (v4i32 (sext (v4i16 (extract_subvector v8i16:$src, (i64 4))))))), + (SADDLPv8i16_v4i32 V128:$src)>; + +def : 
Pat<(add (AArch64zip1 (v2i64 (sext (v2i32 (extract_subvector v4i32:$src, (i64 0))))), + (v2i64 (sext (v2i32 (extract_subvector v4i32:$src, (i64 2)))))), + (AArch64zip2 (v2i64 (sext (v2i32 (extract_subvector v4i32:$src, (i64 0))))), + (v2i64 (sext (v2i32 (extract_subvector v4i32:$src, (i64 2))))))), + (SADDLPv4i32_v2i64 V128:$src)>; + //------------------------------------------------------------------------------ // AdvSIMD modified immediate instructions //------------------------------------------------------------------------------ diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll index e71d23921785c..281fe3120ef14 100644 --- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll @@ -274,12 +274,8 @@ define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOFP16-SD: // %bb.0: ; CHECK-NOFP16-SD-NEXT: umull v3.8h, v1.8b, v2.8b ; CHECK-NOFP16-SD-NEXT: umull2 v1.8h, v1.16b, v2.16b -; CHECK-NOFP16-SD-NEXT: ushll2 v2.4s, v3.8h, #0 -; CHECK-NOFP16-SD-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NOFP16-SD-NEXT: ushll2 v4.4s, v1.8h, #0 -; CHECK-NOFP16-SD-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NOFP16-SD-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-NOFP16-SD-NEXT: addp v1.4s, v1.4s, v4.4s +; CHECK-NOFP16-SD-NEXT: uaddlp v1.4s, v1.8h +; CHECK-NOFP16-SD-NEXT: uaddlp v2.4s, v3.8h ; CHECK-NOFP16-SD-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-NOFP16-SD-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NOFP16-SD-NEXT: ret @@ -288,12 +284,8 @@ define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-FP16-SD: // %bb.0: ; CHECK-FP16-SD-NEXT: umull v3.8h, v1.8b, v2.8b ; CHECK-FP16-SD-NEXT: umull2 v1.8h, v1.16b, v2.16b -; CHECK-FP16-SD-NEXT: ushll2 v2.4s, v3.8h, #0 -; CHECK-FP16-SD-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-FP16-SD-NEXT: ushll2 v4.4s, v1.8h, #0 -; CHECK-FP16-SD-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-FP16-SD-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-FP16-SD-NEXT: addp v1.4s, v1.4s, 
v4.4s +; CHECK-FP16-SD-NEXT: uaddlp v1.4s, v1.8h +; CHECK-FP16-SD-NEXT: uaddlp v2.4s, v3.8h ; CHECK-FP16-SD-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-FP16-SD-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-FP16-SD-NEXT: ret @@ -302,12 +294,8 @@ define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOFP16-GI: // %bb.0: ; CHECK-NOFP16-GI-NEXT: umull v3.8h, v1.8b, v2.8b ; CHECK-NOFP16-GI-NEXT: umull2 v1.8h, v1.16b, v2.16b -; CHECK-NOFP16-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-NOFP16-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NOFP16-GI-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NOFP16-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NOFP16-GI-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: addp v1.4s, v4.4s, v1.4s +; CHECK-NOFP16-GI-NEXT: uaddlp v2.4s, v3.8h +; CHECK-NOFP16-GI-NEXT: uaddlp v1.4s, v1.8h ; CHECK-NOFP16-GI-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-NOFP16-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NOFP16-GI-NEXT: ret @@ -316,12 +304,8 @@ define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-FP16-GI: // %bb.0: ; CHECK-FP16-GI-NEXT: umull v3.8h, v1.8b, v2.8b ; CHECK-FP16-GI-NEXT: umull2 v1.8h, v1.16b, v2.16b -; CHECK-FP16-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-FP16-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-FP16-GI-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-FP16-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-FP16-GI-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-FP16-GI-NEXT: addp v1.4s, v4.4s, v1.4s +; CHECK-FP16-GI-NEXT: uaddlp v2.4s, v3.8h +; CHECK-FP16-GI-NEXT: uaddlp v1.4s, v1.8h ; CHECK-FP16-GI-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-FP16-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-FP16-GI-NEXT: ret @@ -343,12 +327,8 @@ define <4 x i32> @sdot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOFP16-SD: // %bb.0: ; CHECK-NOFP16-SD-NEXT: smull v3.8h, v1.8b, v2.8b ; CHECK-NOFP16-SD-NEXT: smull2 v1.8h, v1.16b, v2.16b -; CHECK-NOFP16-SD-NEXT: sshll2 v2.4s, v3.8h, #0 -; CHECK-NOFP16-SD-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-NOFP16-SD-NEXT: sshll2 v4.4s, 
v1.8h, #0 -; CHECK-NOFP16-SD-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NOFP16-SD-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-NOFP16-SD-NEXT: addp v1.4s, v1.4s, v4.4s +; CHECK-NOFP16-SD-NEXT: saddlp v1.4s, v1.8h +; CHECK-NOFP16-SD-NEXT: saddlp v2.4s, v3.8h ; CHECK-NOFP16-SD-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-NOFP16-SD-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NOFP16-SD-NEXT: ret @@ -357,12 +337,8 @@ define <4 x i32> @sdot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-FP16-SD: // %bb.0: ; CHECK-FP16-SD-NEXT: smull v3.8h, v1.8b, v2.8b ; CHECK-FP16-SD-NEXT: smull2 v1.8h, v1.16b, v2.16b -; CHECK-FP16-SD-NEXT: sshll2 v2.4s, v3.8h, #0 -; CHECK-FP16-SD-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-FP16-SD-NEXT: sshll2 v4.4s, v1.8h, #0 -; CHECK-FP16-SD-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-FP16-SD-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-FP16-SD-NEXT: addp v1.4s, v1.4s, v4.4s +; CHECK-FP16-SD-NEXT: saddlp v1.4s, v1.8h +; CHECK-FP16-SD-NEXT: saddlp v2.4s, v3.8h ; CHECK-FP16-SD-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-FP16-SD-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-FP16-SD-NEXT: ret @@ -371,12 +347,8 @@ define <4 x i32> @sdot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOFP16-GI: // %bb.0: ; CHECK-NOFP16-GI-NEXT: smull v3.8h, v1.8b, v2.8b ; CHECK-NOFP16-GI-NEXT: smull2 v1.8h, v1.16b, v2.16b -; CHECK-NOFP16-GI-NEXT: sshll v2.4s, v3.4h, #0 -; CHECK-NOFP16-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-NOFP16-GI-NEXT: sshll v4.4s, v1.4h, #0 -; CHECK-NOFP16-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-NOFP16-GI-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-NOFP16-GI-NEXT: addp v1.4s, v4.4s, v1.4s +; CHECK-NOFP16-GI-NEXT: saddlp v2.4s, v3.8h +; CHECK-NOFP16-GI-NEXT: saddlp v1.4s, v1.8h ; CHECK-NOFP16-GI-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-NOFP16-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NOFP16-GI-NEXT: ret @@ -385,12 +357,8 @@ define <4 x i32> @sdot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) { ; CHECK-FP16-GI: // %bb.0: ; CHECK-FP16-GI-NEXT: smull v3.8h, v1.8b, v2.8b ; CHECK-FP16-GI-NEXT: smull2 
v1.8h, v1.16b, v2.16b -; CHECK-FP16-GI-NEXT: sshll v2.4s, v3.4h, #0 -; CHECK-FP16-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-FP16-GI-NEXT: sshll v4.4s, v1.4h, #0 -; CHECK-FP16-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-FP16-GI-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-FP16-GI-NEXT: addp v1.4s, v4.4s, v1.4s +; CHECK-FP16-GI-NEXT: saddlp v2.4s, v3.8h +; CHECK-FP16-GI-NEXT: saddlp v1.4s, v1.8h ; CHECK-FP16-GI-NEXT: addp v1.4s, v2.4s, v1.4s ; CHECK-FP16-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-FP16-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/uaddlp.ll b/llvm/test/CodeGen/AArch64/uaddlp.ll index 015aab48e7255..ccfb1cc55e643 100644 --- a/llvm/test/CodeGen/AArch64/uaddlp.ll +++ b/llvm/test/CodeGen/AArch64/uaddlp.ll @@ -72,3 +72,114 @@ start: %4 = add nuw nsw <8 x i16> %2, %3 ret <8 x i16> %4 } + +define <8 x i16> @vpaddlq_v16i8_zext_shuffle(<16 x i8> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v16i8_zext_shuffle: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: uaddlp v0.8h, v0.16b +; CHECK-NEXT: ret +start: + %0 = zext <16 x i8> %a to <16 x i16> + %1 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> + %2 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> + %3 = add nuw nsw <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <4 x i32> @vpaddlq_v8i16_zext_shuffle(<8 x i16> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v8i16_zext_shuffle: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: uaddlp v0.4s, v0.8h +; CHECK-NEXT: ret +start: + %0 = zext <8 x i16> %a to <8 x i32> + %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> + %2 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> + %3 = add nuw nsw <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <2 x i64> @vpaddlq_v4i32_zext_shuffle(<4 x i32> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v4i32_zext_shuffle: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: uaddlp v0.2d, v0.4s +; CHECK-NEXT: ret +start: + %0 = zext <4 x i32> %a to <4 x i64> + %1 = shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> + %2 = 
shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> + %3 = add nuw nsw <2 x i64> %1, %2 + ret <2 x i64> %3 +} + +define <4 x i32> @vpaddlq_v8i16_zext_shuffle_neg(<8 x i16> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v8i16_zext_shuffle_neg: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v3.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v2.s[2], v0.s[3] +; CHECK-NEXT: mov v3.s[1], v1.s[0] +; CHECK-NEXT: add v0.4s, v2.4s, v3.4s +; CHECK-NEXT: ret +start: + %0 = zext <8 x i16> %a to <8 x i32> + %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> + %2 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> + %3 = add nuw nsw <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <8 x i16> @vpaddlq_v16i8_sext_shuffle(<16 x i8> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v16i8_sext_shuffle: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: saddlp v0.8h, v0.16b +; CHECK-NEXT: ret +start: + %0 = sext <16 x i8> %a to <16 x i16> + %1 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> + %2 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> + %3 = add nuw nsw <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <4 x i32> @vpaddlq_v8i16_sext_shuffle(<8 x i16> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v8i16_sext_shuffle: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: saddlp v0.4s, v0.8h +; CHECK-NEXT: ret +start: + %0 = sext <8 x i16> %a to <8 x i32> + %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> + %2 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> + %3 = add nuw nsw <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <2 x i64> @vpaddlq_v4i32_sext_shuffle(<4 x i32> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v4i32_sext_shuffle: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: saddlp v0.2d, v0.4s +; CHECK-NEXT: ret +start: + %0 = sext <4 x i32> %a to <4 x i64> + %1 = shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> + %2 = 
shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> + %3 = add nuw nsw <2 x i64> %1, %2 + ret <2 x i64> %3 +} + +define <2 x i64> @vpaddlq_v4i32_sext_shuffle_neg(<4 x i32> %a) unnamed_addr { +; CHECK-LABEL: vpaddlq_v4i32_sext_shuffle_neg: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: ret +start: + %0 = sext <4 x i32> %a to <4 x i64> + %1 = shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> + %2 = shufflevector <4 x i64> %0, <4 x i64> poison, <2 x i32> + %3 = add nuw nsw <2 x i64> %1, %2 + ret <2 x i64> %3 +} From 056d6e095814e15373e092428454a98d17c9fb59 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 11 May 2026 11:32:09 -0400 Subject: [PATCH 292/538] [AMDGPU] Add VOP1 DPP8 pseudo infrastructure (#196736) --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 37 ++++++++++++---------- llvm/lib/Target/AMDGPU/VOPInstructions.td | 25 +++++++++++++++ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 9955d60fb2849..62386077b25e2 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -105,6 +105,10 @@ class VOP1_DPP_Pseudo pattern=[]> : VOP_DPP_Pseudo { } +class VOP1_DPP8_Pseudo pattern=[]> : + VOP_DPP8_Pseudo { +} + multiclass VOP1Inst { // We only want to set this on the basic, non-SDWA or DPP forms. 
@@ -123,8 +127,11 @@ multiclass VOP1Inst ; - if P.HasExtDPP then + if P.HasExtDPP then { def _dpp : VOP1_DPP_Pseudo ; + def _dpp8 : VOP1_DPP8_Pseudo ; + } + if P.HasExtVOP3DPP then def _e64_dpp : VOP3_DPP_Pseudo { @@ -972,8 +979,8 @@ class VOP1_DPP16_Gen op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let True16Predicate = ps.True16Predicate; } -class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : - VOP_DPP8 { +class VOP1_DPP8 op, VOP1_DPP8_Pseudo ps, int subtarget, VOPProfile p = ps.Pfl> : + VOP_DPP8, SIMCInstr { let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; let SchedRW = ps.SchedRW; @@ -988,8 +995,8 @@ class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : let Inst{31-25} = 0x3f; } -class VOP1_DPP8_Gen op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : - VOP1_DPP8 { +class VOP1_DPP8_Gen op, VOP1_DPP8_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : + VOP1_DPP8 { let AssemblerPredicate = Gen.AssemblerPredicate; let DecoderNamespace = Gen.DecoderNamespace; let True16Predicate = ps.True16Predicate; @@ -1047,7 +1054,7 @@ multiclass VOP1_Real_dpp_with_name op, string opName, multiclass VOP1_Real_dpp8 op, string opName = NAME> { defvar ps = !cast(opName#"_e32"); if !not(ps.Pfl.HasExt64BitDPP) then - def _dpp8#Gen.Suffix : VOP1_DPP8_Gen; + def _dpp8#Gen.Suffix : VOP1_DPP8_Gen(opName#"_dpp8"), Gen>; } multiclass VOP1_Real_dpp8_with_name op, string opName, @@ -1311,7 +1318,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP1_Real_dpp8_gfx10 op> { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")>; + def _dpp8_gfx10 : VOP1_DPP8(NAME#"_dpp8"), SIEncodingFamily.GFX10>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" @@ -1828,14 +1835,10 @@ defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>; defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>; } -class MovDPP8Pattern : GCNPat < - (vt (int_amdgcn_mov_dpp8 vt:$src, 
timm:$dpp8)), - (Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> { - let OtherPredicates = [Pred]; -} - -foreach vt = Reg32Types.types in { - def : MovDPP8Pattern; - def : MovDPP8Pattern; - def : MovDPP8Pattern; +let OtherPredicates = [HasDPP8] in { + foreach vt = Reg32Types.types in { + def : GCNPat< + (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)), + (V_MOV_B32_dpp8 VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))>; + } } diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 82545a472cf17..4f9b679e05ea3 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1229,6 +1229,31 @@ class VOP_DPP8_Base : VOP_DPP8_Base, VOP_DPP8e

; +class VOP_DPP8_Pseudo pattern=[]> : + VOP_Pseudo { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + + let VALU = 1; + let DPP = 1; + let Size = 8; + let IsPacked = P.IsPacked; + + let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); + let mayRaiseFPException = ReadsModeReg; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); + let isConvergent = 1; + + string AsmOperands = P.AsmDPP8; + + let AsmMatchConverter = "cvtDPP8"; + let AssemblerPredicate = HasDPP8; + let AsmVariantName = AMDGPUAsmVariants.Disable; + let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); +} + class VOP3_DPP8_Base : VOP_DPP8_Base { let OutOperandList = P.OutsVOP3DPP8; From 5b6b40169c1ce140e12dd6c0a82270ba742ad767 Mon Sep 17 00:00:00 2001 From: quic-k Date: Mon, 11 May 2026 21:12:33 +0530 Subject: [PATCH 293/538] [Clang][Hexagon] Driver changes for H2+Picolibc (#195795) depends on https://github.com/llvm/llvm-project/pull/195621 Signed-off-by: Kushal Pal --- clang/lib/Driver/ToolChains/Hexagon.cpp | 18 ++- .../include/c++/v1/.keep | 0 .../test/Driver/hexagon-toolchain-picolibc.c | 118 ++++++++++++++++++ 3 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/Inputs/hexagon_tree/Tools/target/picolibc/hexagon-unknown-h2-elf/include/c++/v1/.keep diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index e0e5ab5c1fde7..41f03e01b69c1 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -414,8 +414,14 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, if (!IsShared) { if (HTC.GetCStdlibType(Args) == ToolChain::CST_Picolibc) { SmallString<128> Crt0 = LibraryDir; - llvm::sys::path::append(Crt0, "crt0-semihost.o"); - CmdArgs.push_back(Args.MakeArgString(Crt0)); + if (HTC.getTriple().isOSH2()) { + llvm::sys::path::append(Crt0, "crt0-noflash-hosted.o"); + CmdArgs.push_back(Args.MakeArgString(Crt0)); + } else if 
(HTC.getTriple().isOSUnknown()) { + llvm::sys::path::append(Crt0, "crt0-semihost.o"); + CmdArgs.push_back(Args.MakeArgString(Crt0)); + } + // Known OS other than H2: no semihost crt0; OS provides its own. } else { if (HasStandalone) { SmallString<128> Crt0SA = LibraryDir; @@ -468,7 +474,13 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, if (!IsShared) { if (HTC.GetCStdlibType(Args) == ToolChain::CST_Picolibc) { - CmdArgs.push_back("-lsemihost"); + if (HTC.getTriple().isOSH2()) { + CmdArgs.push_back("-lh2"); + CmdArgs.push_back("-lsyscall_wrapper"); + } else if (HTC.getTriple().isOSUnknown()) { + CmdArgs.push_back("-lsemihost"); + } + // Known OS other than H2: no semihost lib; OS provides its own. } else { for (StringRef Lib : OsLibs) CmdArgs.push_back(Args.MakeArgString("-l" + Lib)); diff --git a/clang/test/Driver/Inputs/hexagon_tree/Tools/target/picolibc/hexagon-unknown-h2-elf/include/c++/v1/.keep b/clang/test/Driver/Inputs/hexagon_tree/Tools/target/picolibc/hexagon-unknown-h2-elf/include/c++/v1/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/hexagon-toolchain-picolibc.c b/clang/test/Driver/hexagon-toolchain-picolibc.c index b8ac8a9c5c88d..8282a4da81636 100644 --- a/clang/test/Driver/hexagon-toolchain-picolibc.c +++ b/clang/test/Driver/hexagon-toolchain-picolibc.c @@ -118,3 +118,121 @@ // CHECK-LIBPATHS-PIC: "-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-none-elf{{/|\\\\}}lib{{/|\\\\}}v68{{/|\\\\}}G0{{/|\\\\}}pic" // CHECK-LIBPATHS-PIC: "-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-none-elf{{/|\\\\}}lib{{/|\\\\}}v68{{/|\\\\}}G0" // CHECK-LIBPATHS-PIC: 
"-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-none-elf{{/|\\\\}}lib{{/|\\\\}}v68" + +// ============================================================================= +// H2 OS tests (--target=hexagon-h2-elf --cstdlib=picolibc) +// Differences from hexagon-none-elf: crt0-noflash-hosted.o, -lh2 -lsyscall_wrapper +// ============================================================================= + +// ----------------------------------------------------------------------------- +// Test standard include paths for H2 +// ----------------------------------------------------------------------------- +// RUN: %clang -### --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin %s 2>&1 | FileCheck -check-prefix=CHECK-H2-C-INCLUDES %s +// CHECK-H2-C-INCLUDES: "-cc1" {{.*}} "-internal-isystem" "{{.*}}{{/|\\\\}}lib{{/|\\\\}}clang{{/|\\\\}}{{[0-9]+}}{{/|\\\\}}include" +// CHECK-H2-C-INCLUDES: "-internal-externc-isystem" "{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}include" + +// RUN: %clangxx -### --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin %s 2>&1 | FileCheck -check-prefix=CHECK-H2-CXX-INCLUDES %s +// CHECK-H2-CXX-INCLUDES: "-cc1" {{.*}} "-internal-isystem" "{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" +// CHECK-H2-CXX-INCLUDES: "-internal-isystem" "{{.*}}{{/|\\\\}}lib{{/|\\\\}}clang{{/|\\\\}}{{[0-9]+}}{{/|\\\\}}include" +// CHECK-H2-CXX-INCLUDES: "-internal-externc-isystem" 
"{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}include" + +// ----------------------------------------------------------------------------- +// H2 start files: crt0-noflash-hosted.o (not crt0-semihost.o) +// ----------------------------------------------------------------------------- +// RUN: %clang --target=hexagon-h2-elf --cstdlib=picolibc -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-H2-STARTUP +// CHECK-H2-STARTUP: "{{.*}}crt0-noflash-hosted.o" +// CHECK-H2-STARTUP-NOT: "{{.*}}crt0-semihost.o" + +// RUN: %clang --target=hexagon-h2-elf --cstdlib=picolibc -nostartfiles -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-H2-NOSTART +// CHECK-H2-NOSTART-NOT: "{{.*}}crt0-noflash-hosted.o" + +// ----------------------------------------------------------------------------- +// H2: -nostdlib, -nostartfiles, -nodefaultlibs, -nolibc +// ----------------------------------------------------------------------------- +// RUN: %clangxx -### --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv68 \ +// RUN: -nostdlib %s 2>&1 | FileCheck -check-prefix=CHECK-H2-NOSTDLIB %s +// CHECK-H2-NOSTDLIB: "-cc1" +// CHECK-H2-NOSTDLIB: {{hexagon-link|ld}} +// CHECK-H2-NOSTDLIB-NOT: "{{.*}}crt0-noflash-hosted.o" +// CHECK-H2-NOSTDLIB-NOT: "-lc++" +// CHECK-H2-NOSTDLIB-NOT: "-lm" +// CHECK-H2-NOSTDLIB-NOT: "--start-group" +// CHECK-H2-NOSTDLIB-NOT: "-lh2" +// CHECK-H2-NOSTDLIB-NOT: "-lsyscall_wrapper" +// CHECK-H2-NOSTDLIB-NOT: "-lc" +// CHECK-H2-NOSTDLIB-NOT: "-lclang_rt.builtins" +// CHECK-H2-NOSTDLIB-NOT: "--end-group" + +// RUN: %clangxx -### --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv68 \ +// RUN: -nostartfiles %s 2>&1 | FileCheck -check-prefix=CHECK-H2-NOSTARTFILES %s +// CHECK-H2-NOSTARTFILES: "-cc1" +// 
CHECK-H2-NOSTARTFILES: {{hexagon-link|ld}} +// CHECK-H2-NOSTARTFILES-NOT: "{{.*}}crt0-noflash-hosted.o" +// CHECK-H2-NOSTARTFILES: "-lc++" "-lc++abi" "-lunwind" "-lm" "--start-group" "-lh2" "-lsyscall_wrapper" "-lc" "-lclang_rt.builtins" "--end-group" + +// RUN: %clangxx -### --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv68 \ +// RUN: -nodefaultlibs %s 2>&1 | FileCheck -check-prefix=CHECK-H2-NODEFAULTLIBS %s +// CHECK-H2-NODEFAULTLIBS: "-cc1" +// CHECK-H2-NODEFAULTLIBS: {{hexagon-link|ld}} +// CHECK-H2-NODEFAULTLIBS: "{{.*}}crt0-noflash-hosted.o" +// CHECK-H2-NODEFAULTLIBS-NOT: "-lc++" +// CHECK-H2-NODEFAULTLIBS-NOT: "-lm" +// CHECK-H2-NODEFAULTLIBS-NOT: "--start-group" +// CHECK-H2-NODEFAULTLIBS-NOT: "-lh2" +// CHECK-H2-NODEFAULTLIBS-NOT: "-lsyscall_wrapper" +// CHECK-H2-NODEFAULTLIBS-NOT: "-lc" +// CHECK-H2-NODEFAULTLIBS-NOT: "-lclang_rt.builtins" +// CHECK-H2-NODEFAULTLIBS-NOT: "--end-group" + +// RUN: %clangxx -### --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv68 \ +// RUN: -nolibc %s 2>&1 | FileCheck -check-prefix=CHECK-H2-NOLIBC %s +// CHECK-H2-NOLIBC: "-cc1" +// CHECK-H2-NOLIBC: {{hexagon-link|ld}} +// CHECK-H2-NOLIBC: "{{.*}}crt0-noflash-hosted.o" +// CHECK-H2-NOLIBC-SAME: "-lc++" +// CHECK-H2-NOLIBC-SAME: "-lm" +// CHECK-H2-NOLIBC-SAME: "--start-group" +// CHECK-H2-NOLIBC-SAME: "-lh2" +// CHECK-H2-NOLIBC-SAME: "-lsyscall_wrapper" +// CHECK-H2-NOLIBC-NOT: "-lc" +// CHECK-H2-NOLIBC-SAME: "-lclang_rt.builtins" +// CHECK-H2-NOLIBC-SAME: "--end-group" + +// ----------------------------------------------------------------------------- +// H2: compiler-rt is forced (not -lgcc) +// ----------------------------------------------------------------------------- +// RUN: %clang --target=hexagon-h2-elf --cstdlib=picolibc -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-H2-RTLIB +// RUN: %clangxx 
--target=hexagon-h2-elf --cstdlib=picolibc -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-H2-RTLIB +// CHECK-H2-RTLIB: "-lclang_rt.builtins" +// CHECK-H2-RTLIB-NOT: "-lgcc" + +// ----------------------------------------------------------------------------- +// H2: libunwind linked for C++ but not C +// ----------------------------------------------------------------------------- +// RUN: %clangxx --target=hexagon-h2-elf --cstdlib=picolibc -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-H2-CXX-UNWIND +// CHECK-H2-CXX-UNWIND: "-lunwind" + +// ----------------------------------------------------------------------------- +// H2: library search paths use target/picolibc/hexagon-unknown-h2-elf/ +// ----------------------------------------------------------------------------- +// RUN: %clang --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv68 -### %s 2>&1 | FileCheck -check-prefix=CHECK-H2-LIBPATHS %s +// CHECK-H2-LIBPATHS: "-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}lib{{/|\\\\}}v68" +// CHECK-H2-LIBPATHS-NOT: "-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}lib{{/|\\\\}}v68{{/|\\\\}}G0" + +// RUN: %clang --target=hexagon-h2-elf --cstdlib=picolibc \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv68 -G0 -### %s 2>&1 | FileCheck -check-prefix=CHECK-H2-LIBPATHS-G0 %s +// CHECK-H2-LIBPATHS-G0: "-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}lib{{/|\\\\}}v68{{/|\\\\}}G0" +// CHECK-H2-LIBPATHS-G0: 
"-L{{.*}}{{/|\\\\}}Inputs{{/|\\\\}}hexagon_tree{{/|\\\\}}Tools{{/|\\\\}}bin{{/|\\\\}}..{{/|\\\\}}target{{/|\\\\}}picolibc{{/|\\\\}}hexagon-unknown-h2-elf{{/|\\\\}}lib{{/|\\\\}}v68" From b018d3a75e93b7ce804d3e990fe67404ffb47b14 Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Mon, 11 May 2026 17:42:42 +0200 Subject: [PATCH 294/538] [llvm][tools][llvm-objdump] Fix nested-offload-binary.test (#196912) In little endian systems the embedded image hex is incorrect and the test fails on Solaris/sparcv9. Switching to generate the inner image on the fly and patch the outer image CONTENT field. --- .../llvm-objdump/Offloading/Inputs/inner.yaml | 16 ++++++++++++ .../Offloading/nested-offload-binary.test | 25 +++---------------- 2 files changed, 20 insertions(+), 21 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/Offloading/Inputs/inner.yaml diff --git a/llvm/test/tools/llvm-objdump/Offloading/Inputs/inner.yaml b/llvm/test/tools/llvm-objdump/Offloading/Inputs/inner.yaml new file mode 100644 index 0000000000000..41859a5095938 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/Offloading/Inputs/inner.yaml @@ -0,0 +1,16 @@ +!Offload +Members: + - ImageKind: IMG_Bitcode + OffloadKind: OFK_OpenMP + String: + - Key: "triple" + Value: "x-y-z" + - Key: "arch" + Value: "arch1" + - ImageKind: IMG_Bitcode + OffloadKind: OFK_OpenMP + String: + - Key: "triple" + Value: "x-y-z" + - Key: "arch" + Value: "arch2" diff --git a/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test b/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test index d46180242144e..d09b2643fec96 100644 --- a/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test +++ b/llvm/test/tools/llvm-objdump/Offloading/nested-offload-binary.test @@ -1,24 +1,7 @@ ## Test that llvm-objdump can display nested OffloadBinary images. 
-## The content blobs below were generated from the following YAML Input -##!Offload -##Members: -## - ImageKind: IMG_Bitcode -## OffloadKind: OFK_OpenMP -## String: -## - Key: "triple" -## Value: "x-y-z" -## - Key: "arch" -## Value: "arch1" -## - ImageKind: IMG_Bitcode -## OffloadKind: OFK_OpenMP -## String: -## - Key: "triple" -## Value: "x-y-z" -## - Key: "arch" -## Value: "arch2" - -# RUN: yaml2obj %s -o %t.bin +# RUN: yaml2obj %S/Inputs/inner.yaml -o %t-inner.bin +# RUN: %python -c "import sys; hex = open('%/t-inner.bin', 'rb').read().hex(); test = open('%/s', 'r').read(); sys.stdout.write(test.replace('INNER_IMAGE', hex))" | yaml2obj - -o %t.bin # RUN: llvm-objdump --offloading %t.bin | FileCheck --match-full-lines --strict-whitespace --implicit-check-not={{.}} %s !Offload @@ -30,7 +13,7 @@ Members: Value: "x-y-z" - Key: "arch" Value: "none" - Content: 10ff10ad02000000f00000000000000020000000000000000200000000000000020001000000000070000000000000000200000000000000f00000000000000000000000000000000200010000000000a0000000000000000200000000000000f0000000000000000000000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e9000000000000000500000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e300000000000000050000000000000000782d792d7a006172636800747269706c650061726368320061726368310000 + Content: INNER_IMAGE - ImageKind: IMG_Object OffloadKind: OFK_OpenMP String: @@ -38,7 +21,7 @@ Members: Value: "a-b-c" - Key: "arch" Value: "none" - Content: 10ff10ad02000000f00000000000000020000000000000000200000000000000020001000000000070000000000000000200000000000000f00000000000000000000000000000000200010000000000a0000000000000000200000000000000f0000000000000000000000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e9000000000000000500000000000000dc00000000000000d1000000000000000500000000000000d700000000000000e300000000000000050000000000000000782d792d7a006172636800747269706c650061726368320061726368310000 + 
Content: INNER_IMAGE # CHECK:OFFLOADING IMAGE [0]: # CHECK-NEXT:kind elf From cbf814d239b7a434d4a67606f5fbc4a234d0b51d Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham Date: Mon, 11 May 2026 18:43:35 +0300 Subject: [PATCH 295/538] [CIR][AArch64] Lower vfmaq_v f32/f64 (#195602) Lower `BI__builtin_neon_vfmaq_v` in CIR for the `vfmaq_f32` and `vfmaq_f64` ACLE wrappers. This is split out from the broader fused multiply-accumulate work and only covers `BI__builtin_neon_vfmaq_v`. The related `vfma_v`, `vfmaq_f16`, lane, laneq, and scalar forms remain outside this PR. Tests move the existing `vfmaq_f32` and `vfmaq_f64` coverage from `neon-intrinsics.c` into `neon/vfmaq.c`, preserve the original LLVM checks, and add ClangIR coverage. Validation: rebuilt `clang` and ran the focused `vfmaq.c` lit test. Part of #185382 Split from feedback on #188190 --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 38 +++++++---- clang/test/CodeGen/AArch64/neon-intrinsics.c | 38 ----------- .../AArch64/neon/fused-multiple-fullfp16.c | 47 +++++++++++++ .../CodeGen/AArch64/neon/fused-multiply.c | 67 +++++++++++++++++++ .../CodeGen/AArch64/v8.2a-neon-intrinsics.c | 19 ------ 5 files changed, 140 insertions(+), 69 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c create mode 100644 clang/test/CodeGen/AArch64/neon/fused-multiply.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 26c64c6b26dca..abed7ff84069e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -85,6 +85,17 @@ findARMVectorIntrinsicInMap(ArrayRef intrinsicMap, //===----------------------------------------------------------------------===// // Generic helpers //===----------------------------------------------------------------------===// +// Emit an intrinsic where all operands are of the same type as the result. 
+// Depending on mode, this may be a constrained floating-point intrinsic. +static mlir::Value +emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc, + StringRef intrName, mlir::Type retTy, + llvm::SmallVector &ops) { + assert(!cir::MissingFeatures::emitConstrainedFPCall()); + + return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops); +} + static llvm::StringRef getLLVMIntrNameNoPrefix(llvm::Intrinsic::ID intrID) { llvm::StringRef llvmIntrName = llvm::Intrinsic::getBaseName(intrID); assert(llvmIntrName.starts_with("llvm.") && "Not an LLVM intrinsic!"); @@ -703,7 +714,21 @@ static mlir::Value emitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vext_v: case NEON::BI__builtin_neon_vextq_v: case NEON::BI__builtin_neon_vfma_v: - case NEON::BI__builtin_neon_vfmaq_v: + cgf.cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + ctx.BuiltinInfo.getName(builtinID)); + return mlir::Value{}; + case NEON::BI__builtin_neon_vfmaq_v: { + // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2) + // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator) + // Reorder arguments to match LLVM fma signature + mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty); + mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty); + mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty); + llvm::SmallVector fmaOps = {op1, op2, op0}; + return emitCallMaybeConstrainedBuiltin(cgf.getBuilder(), loc, "fma", ty, + fmaOps); + } case NEON::BI__builtin_neon_vld1_v: case NEON::BI__builtin_neon_vld1q_v: case NEON::BI__builtin_neon_vld1_x2_v: @@ -888,17 +913,6 @@ static mlir::Value emitCommonNeonBuiltinExpr( } } -// Emit an intrinsic where all operands are of the same type as the result. -// Depending on mode, this may be a constrained floating-point intrinsic. 
-static mlir::Value -emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc, - StringRef intrName, mlir::Type retTy, - llvm::SmallVector &ops) { - assert(!cir::MissingFeatures::emitConstrainedFPCall()); - - return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops); -} - bool CIRGenFunction::getAArch64SVEProcessedOperands( unsigned builtinID, const CallExpr *expr, SmallVectorImpl &ops, SVETypeFlags typeFlags) { diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index 5a1cbc492cd85..aaba75b9c835a 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -886,44 +886,6 @@ float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vfma_f32(v1, v2, v3); } -// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32( -// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) -// CHECK-NEXT: ret <4 x float> [[TMP9]] -// -float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - return vfmaq_f32(v1, v2, v3); -} - -// CHECK-LABEL: define 
dso_local <2 x double> @test_vfmaq_f64( -// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]]) -// CHECK-NEXT: ret <2 x double> [[TMP9]] -// -float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - return vfmaq_f64(v1, v2, v3); -} - // CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32( // CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c new file mode 100644 index 0000000000000..af9330865796d --- /dev/null +++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c @@ -0,0 +1,47 @@ +// REQUIRES: aarch64-registered-target || arm-registered-target + +// RUN: %clang_cc1_cg_arm64_neon -target-feature +fullfp16 -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 -fclangir -emit-llvm %s -disable-O0-optnone | opt 
-S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %} +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 -fclangir -emit-cir %s -disable-O0-optnone | FileCheck %s --check-prefixes=ALL,CIR %} + +// ALL: {{[Mm]}}odule + +//============================================================================= +// NOTES +// +// This file contains fullfp16 tests that were originally located in: +// * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c +// The main difference is the use of RUN lines that enable ClangIR lowering. +// This file currently covers the f16 wrapper that lowers through +// BI__builtin_neon_vfmaq_v. +// +// ACLE section headings based on v2025Q2 of the ACLE specification: +// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2 +// +//============================================================================= + +#include + +//===------------------------------------------------------===// +// 2.6.1.9.3 Fused multiply-accumulate, vector quad forms +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vfmaq_f16( +// CIR-LABEL: @vfmaq_f16( +float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16> + +// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast 
<16 x i8> [[A_BYTES]] to <8 x half> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half> +// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B_CAST]], <8 x half> [[C_CAST]], <8 x half> [[A_CAST]]) +// LLVM-NEXT: ret <8 x half> [[FMA]] + return vfmaq_f16(a, b, c); +} diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c new file mode 100644 index 0000000000000..2501f54fb5427 --- /dev/null +++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c @@ -0,0 +1,67 @@ +// REQUIRES: aarch64-registered-target || arm-registered-target + +// RUN: %clang_cc1_cg_arm64_neon -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %} +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-cir %s -disable-O0-optnone | FileCheck %s --check-prefixes=ALL,CIR %} + +// ALL: {{[Mm]}}odule + +//============================================================================= +// NOTES +// +// This file contains tests that were originally located in: +// * clang/test/CodeGen/AArch64/neon-intrinsics.c +// The main difference is the use of RUN lines that enable ClangIR lowering. +// This file currently covers the f32/f64 wrappers that lower through +// BI__builtin_neon_vfmaq_v. 
+// +// ACLE section headings based on v2025Q2 of the ACLE specification: +// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate +// +//============================================================================= + +#include + +//===------------------------------------------------------===// +// 2.1.1.2.5 Fused multiply-accumulate, vector quad forms +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vfmaq_f32( +// CIR-LABEL: @vfmaq_f32( +float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x float> [[C]] to <4 x i32> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i32> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <4 x float> +// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B_CAST]], <4 x float> [[C_CAST]], <4 x float> [[A_CAST]]) +// LLVM-NEXT: ret <4 x float> [[FMA]] + return vfmaq_f32(a, b, c); +} + +// LLVM-LABEL: @test_vfmaq_f64( +// CIR-LABEL: @vfmaq_f64( +float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 
x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <2 x double> [[C]] to <2 x i64> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <2 x i64> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <2 x double> +// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B_CAST]], <2 x double> [[C_CAST]], <2 x double> [[A_CAST]]) +// LLVM-NEXT: ret <2 x double> [[FMA]] + return vfmaq_f64(a, b, c); +} diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c index b8380bd8ed6d4..ff1c206fc6350 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c @@ -1621,25 +1621,6 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_f16(a, b, c); } -// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16 -// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> 
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) -// CHECK-NEXT: ret <8 x half> [[TMP9]] -// -float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmaq_f16(a, b, c); -} - // CHECK-LABEL: define {{[^@]+}}@test_vfms_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: From 8d98bd4afa7759f5ad727f2120f83668b97ce0cd Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 11 May 2026 16:54:04 +0100 Subject: [PATCH 296/538] [VPlan] Avoid erroneously marking PredPHI as using scalars (#195511) PredInstPHIRecipe can use wide values, and indeed, we have several tests demonstrating this behavior. Strip the erroneous always-true usesScalars member, falling back to usesFirstLaneOnly as usual. --- llvm/lib/Transforms/Vectorize/VPlan.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 51193964bdd83..63436c79e9a98 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3479,13 +3479,6 @@ class LLVM_ABI_FOR_TEST VPPredInstPHIRecipe : public VPSingleDefRecipe { return 0; } - /// Returns true if the recipe uses scalars of operand \p Op. - bool usesScalars(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - return true; - } - protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
From 6fd79a2aa9435f3f02f4bd138dfb2c6485318d42 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Mon, 11 May 2026 18:01:59 +0200 Subject: [PATCH 297/538] [SPIR-V] Reject OpSelect with scalar result and vector condition (#193745) Per SPIR-V spec, scalar Result Type requires a scalar bool condition. So, vector cond branches under a scalar result are unreachable --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 6 ++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 8 +- .../SPIRV/select-invalid-vector-cond.ll | 16 ++++ llvm/test/CodeGen/SPIRV/select.ll | 77 +++++++++++++++++++ 4 files changed, 104 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/select-invalid-vector-cond.ll create mode 100644 llvm/test/CodeGen/SPIRV/select.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 70644c402c38a..02a4a41e6295e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2471,6 +2471,12 @@ static bool generateSampleImageInst(const StringRef DemangledCall, static bool generateSelectInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder) { + const MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + LLT ResTy = MRI->getType(Call->ReturnRegister); + LLT CondTy = MRI->getType(Call->Arguments[0]); + if (!ResTy.isVector() && CondTy.isVector()) + report_fatal_error("OpSelect with a scalar result requires a scalar " + "boolean condition"); MIRBuilder.buildSelect(Call->ReturnRegister, Call->Arguments[0], Call->Arguments[1], Call->Arguments[2]); return true; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 4618e2df74ae0..616a274be9867 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -4203,12 +4203,14 @@ bool SPIRVInstructionSelector::selectSelect(Register ResVReg, Opcode = IsScalarBool ? 
SPIRV::OpSelectVISCond : SPIRV::OpSelectVIVCond; } } else { + assert(IsScalarBool && "OpSelect with a scalar result requires a scalar " + "boolean condition"); if (IsFloatTy) { - Opcode = IsScalarBool ? SPIRV::OpSelectSFSCond : SPIRV::OpSelectVFVCond; + Opcode = SPIRV::OpSelectSFSCond; } else if (IsPtrTy) { - Opcode = IsScalarBool ? SPIRV::OpSelectSPSCond : SPIRV::OpSelectVPVCond; + Opcode = SPIRV::OpSelectSPSCond; } else { - Opcode = IsScalarBool ? SPIRV::OpSelectSISCond : SPIRV::OpSelectVIVCond; + Opcode = SPIRV::OpSelectSISCond; } } BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) diff --git a/llvm/test/CodeGen/SPIRV/select-invalid-vector-cond.ll b/llvm/test/CodeGen/SPIRV/select-invalid-vector-cond.ll new file mode 100644 index 0000000000000..a92d25f43ea94 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/select-invalid-vector-cond.ll @@ -0,0 +1,16 @@ +; RUN: not --crash llc -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not --crash llc -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; __spirv_Select can pair a vector boolean condition with scalar operands, +; which is malformed for OpSelect and must be diagnosed. 
+ +; CHECK: LLVM ERROR: OpSelect with a scalar result requires a scalar boolean condition + +define spir_kernel void @bad_select(i32 %a, i32 %b, ptr addrspace(1) %out, <4 x i1> %cond) { +entry: + %call = call spir_func i32 @_Z14__spirv_SelectDv4_bii(<4 x i1> %cond, i32 %a, i32 %b) + store i32 %call, ptr addrspace(1) %out + ret void +} + +declare spir_func i32 @_Z14__spirv_SelectDv4_bii(<4 x i1>, i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/select.ll b/llvm/test/CodeGen/SPIRV/select.ll new file mode 100644 index 0000000000000..019f713f739d4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/select.ll @@ -0,0 +1,77 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; OpSelect: condition and object component counts must match. + +; CHECK-DAG: %[[#Bool:]] = OpTypeBool +; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#F32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#I8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#PtrI8:]] = OpTypePointer Function %[[#I8]] +; CHECK-DAG: %[[#V4I32:]] = OpTypeVector %[[#I32]] 4 +; CHECK-DAG: %[[#V4F32:]] = OpTypeVector %[[#F32]] 4 +; CHECK-DAG: %[[#V4Bool:]] = OpTypeVector %[[#Bool]] 4 + +; Scalar result, scalar cond. 
+; CHECK: OpFunction +; CHECK: %[[#SC:]] = OpFunctionParameter %[[#Bool]] +; CHECK: %[[#SA:]] = OpFunctionParameter %[[#I32]] +; CHECK: %[[#SB:]] = OpFunctionParameter %[[#I32]] +; CHECK: %{{[0-9]+}} = OpSelect %[[#I32]] %[[#SC]] %[[#SA]] %[[#SB]] +define i32 @sel_i32_scond(i1 %c, i32 %a, i32 %b) { + %r = select i1 %c, i32 %a, i32 %b + ret i32 %r +} + +; CHECK: OpFunction +; CHECK: %[[#FC:]] = OpFunctionParameter %[[#Bool]] +; CHECK: %[[#FA:]] = OpFunctionParameter %[[#F32]] +; CHECK: %[[#FB:]] = OpFunctionParameter %[[#F32]] +; CHECK: %{{[0-9]+}} = OpSelect %[[#F32]] %[[#FC]] %[[#FA]] %[[#FB]] +define float @sel_f32_scond(i1 %c, float %a, float %b) { + %r = select i1 %c, float %a, float %b + ret float %r +} + +; CHECK: OpFunction +; CHECK: %[[#PC:]] = OpFunctionParameter %[[#Bool]] +; CHECK: %[[#PA:]] = OpFunctionParameter %[[#PtrI8]] +; CHECK: %[[#PB:]] = OpFunctionParameter %[[#PtrI8]] +; CHECK: %{{[0-9]+}} = OpSelect %[[#PtrI8]] %[[#PC]] %[[#PA]] %[[#PB]] +define ptr @sel_ptr_scond(i1 %c, ptr %a, ptr %b) { + %r = select i1 %c, ptr %a, ptr %b + ret ptr %r +} + +; Vector result, scalar (broadcast) cond. +; CHECK: OpFunction +; CHECK: %[[#VSC:]] = OpFunctionParameter %[[#Bool]] +; CHECK: %[[#VSA:]] = OpFunctionParameter %[[#V4I32]] +; CHECK: %[[#VSB:]] = OpFunctionParameter %[[#V4I32]] +; CHECK: %{{[0-9]+}} = OpSelect %[[#V4I32]] %[[#VSC]] %[[#VSA]] %[[#VSB]] +define <4 x i32> @sel_v4i32_scond(i1 %c, <4 x i32> %a, <4 x i32> %b) { + %r = select i1 %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %r +} + +; Vector result, vector cond. 
+; CHECK: OpFunction +; CHECK: %[[#VVIC:]] = OpFunctionParameter %[[#V4Bool]] +; CHECK: %[[#VVIA:]] = OpFunctionParameter %[[#V4I32]] +; CHECK: %[[#VVIB:]] = OpFunctionParameter %[[#V4I32]] +; CHECK: %{{[0-9]+}} = OpSelect %[[#V4I32]] %[[#VVIC]] %[[#VVIA]] %[[#VVIB]] +define <4 x i32> @sel_v4i32_vcond(<4 x i1> %c, <4 x i32> %a, <4 x i32> %b) { + %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %r +} + +; CHECK: OpFunction +; CHECK: %[[#VVFC:]] = OpFunctionParameter %[[#V4Bool]] +; CHECK: %[[#VVFA:]] = OpFunctionParameter %[[#V4F32]] +; CHECK: %[[#VVFB:]] = OpFunctionParameter %[[#V4F32]] +; CHECK: %{{[0-9]+}} = OpSelect %[[#V4F32]] %[[#VVFC]] %[[#VVFA]] %[[#VVFB]] +define <4 x float> @sel_v4f32_vcond(<4 x i1> %c, <4 x float> %a, <4 x float> %b) { + %r = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %r +} + From 254f21a48934b4095e45eedc67f3897a7a1dfbcb Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Tue, 12 May 2026 00:19:29 +0800 Subject: [PATCH 298/538] [mlir] Add erase sub-region dominate tree logic in DominanceInfoBase::invalidate method (#192469) Fix the issue: a region may contain nested sub-regions. When we remove the dominate tree of a parent region and delete the region, the dominate trees of its sub-regions should also be removed. --- mlir/include/mlir/IR/Dominance.h | 5 ++++- mlir/lib/IR/Dominance.cpp | 18 ++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h index 9e1254c1dfe1e..70924a2e9ae59 100644 --- a/mlir/include/mlir/IR/Dominance.h +++ b/mlir/include/mlir/IR/Dominance.h @@ -47,7 +47,10 @@ class DominanceInfoBase { /// Invalidate dominance info. This can be used by clients that make major /// changes to the CFG and don't have a good way to update it. 
void invalidate(); - void invalidate(Region *region); + + /// Invalidate dominance info of the \p region, if \p recursive is true, it + /// invalidates the dominance information of child regions. + void invalidate(Region *region, bool recursive = true); /// Finds the nearest common dominator block for the two given blocks a /// and b. If no common dominator can be found, this function will return diff --git a/mlir/lib/IR/Dominance.cpp b/mlir/lib/IR/Dominance.cpp index 0e53b431b5d31..79fb41f2e6b30 100644 --- a/mlir/lib/IR/Dominance.cpp +++ b/mlir/lib/IR/Dominance.cpp @@ -41,12 +41,18 @@ void DominanceInfoBase::invalidate() { } template -void DominanceInfoBase::invalidate(Region *region) { - auto it = dominanceInfos.find(region); - if (it != dominanceInfos.end()) { - delete it->second.getPointer(); - dominanceInfos.erase(it); - } +void DominanceInfoBase::invalidate(Region *region, bool recursive) { + auto invalidate = [&](Region *r) { + auto it = dominanceInfos.find(r); + if (it != dominanceInfos.end()) { + delete it->second.getPointer(); + dominanceInfos.erase(it); + } + }; + if (recursive) + region->walk([&](Region *r) { invalidate(r); }); + else + invalidate(region); } /// Return the dom tree and "hasSSADominance" bit for the given region. 
The From 0bc6312cf066f4f6b9ed51553d00d9d00b802496 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 11 May 2026 17:22:04 +0100 Subject: [PATCH 299/538] clang: Avoid some unnecessary uses of MakeArgString for hip (#196376) --- clang/lib/Driver/ToolChains/HIPAMD.cpp | 29 ++++++++++++-------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index b7732c9274725..7e38ecbb9b9f0 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -80,7 +80,7 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, // Add features to mattr such as cumode std::string MAttrString = "-plugin-opt=-mattr="; for (auto OneFeature : unifyTargetFeatures(Features)) { - MAttrString.append(Args.MakeArgString(OneFeature)); + MAttrString.append(Args.MakeArgStringRef(OneFeature)); if (OneFeature != Features.back()) MAttrString.append(","); } @@ -91,10 +91,9 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, // Since AMDGPU backend currently does not support ISA-level linking, all // called functions need to be imported. 
if (IsThinLTO) { - LldArgs.push_back(Args.MakeArgString("-plugin-opt=-force-import-all")); - LldArgs.push_back(Args.MakeArgString("-plugin-opt=-avail-extern-to-local")); - LldArgs.push_back(Args.MakeArgString( - "-plugin-opt=-avail-extern-gv-in-addrspace-to-local=3")); + LldArgs.push_back("-plugin-opt=-force-import-all"); + LldArgs.push_back("-plugin-opt=-avail-extern-to-local"); + LldArgs.push_back("-plugin-opt=-avail-extern-gv-in-addrspace-to-local=3"); } for (const Arg *A : Args.filtered(options::OPT_mllvm)) { @@ -126,7 +125,7 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, LldArgs.push_back( Args.MakeArgString(Twine("-plugin-opt=") + SplitArg.second)); } else { - LldArgs.push_back(Args.MakeArgString(ArgVal)); + LldArgs.push_back(Args.MakeArgStringRef(ArgVal)); } Arg->claim(); } @@ -143,7 +142,7 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, LldArgs.push_back("--no-whole-archive"); - const char *Lld = Args.MakeArgString(getToolChain().GetProgramPath("lld")); + const char *Lld = Args.MakeArgStringRef(getToolChain().GetProgramPath("lld")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Lld, LldArgs, Inputs, Output)); } @@ -175,11 +174,10 @@ void AMDGCN::Linker::constructLinkAndEmitSpirvCommand( // compiled to SPIR-V. 
llvm::opt::ArgStringList CmdArgs; - const char *Triple = - C.getArgs().MakeArgString("-triple=spirv64-amd-amdhsa"); - CmdArgs.append({"-cc1", Triple, "-emit-obj", "-disable-llvm-optzns", - LinkedBCFile.getFilename(), "-o", Output.getFilename()}); + CmdArgs.append({"-cc1", "-triple=spirv64-amd-amdhsa", "-emit-obj", + "-disable-llvm-optzns", LinkedBCFile.getFilename(), "-o", + Output.getFilename()}); const Driver &Driver = getToolChain().getDriver(); const char *Exec = Driver.getClangProgramPath(); @@ -254,9 +252,8 @@ void HIPAMDToolChain::addClangTargetOptions( StringRef MaxThreadsPerBlock = DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); if (!MaxThreadsPerBlock.empty()) { - std::string ArgStr = - (Twine("--gpu-max-threads-per-block=") + MaxThreadsPerBlock).str(); - CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr)); + CC1Args.push_back(DriverArgs.MakeArgString( + Twine("--gpu-max-threads-per-block=") + MaxThreadsPerBlock)); } // Default to "hidden" visibility, as object level linking will not be @@ -287,7 +284,7 @@ void HIPAMDToolChain::addClangTargetOptions( for (auto BCFile : getDeviceLibs(DriverArgs, DeviceOffloadingKind)) { CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode" : "-mlink-bitcode-file"); - CC1Args.push_back(DriverArgs.MakeArgString(BCFile.Path)); + CC1Args.push_back(DriverArgs.MakeArgStringRef(BCFile.Path)); } } @@ -389,7 +386,7 @@ HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs, // Find in --hip-device-lib-path and HIP_LIBRARY_PATH. 
for (StringRef Path : RocmInstallation->getRocmDeviceLibPathArg()) - LibraryPaths.push_back(DriverArgs.MakeArgString(Path)); + LibraryPaths.push_back(DriverArgs.MakeArgStringRef(Path)); addDirectoryList(DriverArgs, LibraryPaths, "", "HIP_DEVICE_LIB_PATH"); From 515cb37549e764ce927a9692861f03a95fae4fab Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 11 May 2026 09:22:31 -0700 Subject: [PATCH 300/538] [LLVM] Add validation to check the number of intrinsic args (#196563) Add validation to check the number of intrinsic args. In service of that, extend `getIntrinsicInfoTableEntries` to return several things: * ArrayRef to the Table (as a convenience). * Number of arguments * Does the intrinsic have variable arguments. This avoids some code duplication at various callers of `getIntrinsicInfoTableEntries`. To have `getIntrinsicInfoTableEntries` correctly compute the number of arguments based on trip count of the while loop, move parsing of element type for `IIT_SAME_VEC_WIDTH_ARG` to a recursive call from `DecodeIITType`. 
--- clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 12 +-- llvm/include/llvm/IR/Intrinsics.h | 15 +++- llvm/lib/IR/Intrinsics.cpp | 84 ++++++++++--------- ...implicit-intrinsic-declaration-invalid4.ll | 9 ++ 4 files changed, 66 insertions(+), 54 deletions(-) create mode 100644 llvm/test/Assembler/implicit-intrinsic-declaration-invalid4.ll diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index fe932834e9b55..868bca404949b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -955,21 +955,13 @@ static cir::FuncType getIntrinsicType(CIRGenFunction &cgf, using namespace llvm::Intrinsic; SmallVector table; - getIntrinsicInfoTableEntries(id, table); + auto [tableRef, _, isVarArg] = getIntrinsicInfoTableEntries(id, table); - ArrayRef tableRef = table; mlir::Type resultTy = decodeFixedType(cgf, tableRef, context); SmallVector argTypes; - bool isVarArg = false; - while (!tableRef.empty()) { - IITDescriptor::IITDescriptorKind kind = tableRef.front().Kind; - if (kind == IITDescriptor::VarArg) { - isVarArg = true; - break; // VarArg is last - } + while (!tableRef.empty()) argTypes.push_back(decodeFixedType(cgf, tableRef, context)); - } // CIR convention: no explicit void return type if (isa(resultTy)) diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index ae931c7961161..af97285fcee70 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -20,6 +20,7 @@ #include "llvm/Support/TypeSize.h" #include #include +#include namespace llvm { @@ -259,10 +260,16 @@ struct IITDescriptor { /// Returns true if \p id has a struct return type. LLVM_ABI bool hasStructReturnType(ID id); -/// Return the IIT table descriptor for the specified intrinsic into an array -/// of IITDescriptors. 
-LLVM_ABI void getIntrinsicInfoTableEntries(ID id, - SmallVectorImpl &T); +/// Fill the IIT table descriptor for the intrinsic \p id into an array +/// of IITDescriptors. Returns a tuple of 3 values: +/// - ArrayRef for the descriptor table (for convenience). +/// - Number of arguments. +/// - if it's a variable argument intrinsic. +/// +/// Note that for VarArg intrinsics, the last IIT `VarArg` token will be +/// consumed and not a part of the returned ArrayRef. +LLVM_ABI std::tuple, unsigned, bool> +getIntrinsicInfoTableEntries(ID id, SmallVectorImpl &T); /// Returns true if \p FT is a valid function type for intrinsic \p ID. If /// `ID` is an overloaded intrinsic, the overload types are pushed into the diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ff57d335c9a13..89aef32938c75 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -38,10 +38,9 @@ using namespace llvm; // Forward declaration of static functions. -static bool isIntrinsicVarArg(ArrayRef &Infos, - bool Consume); static bool isSignatureValid(FunctionType *FTy, ArrayRef &Infos, + unsigned NumArgs, bool IsVarArg, SmallVectorImpl &OverloadTys, raw_ostream &OS); @@ -398,6 +397,8 @@ DecodeIITType(unsigned &NextElt, ArrayRef Infos, unsigned OverloadIndex = Infos[NextElt++]; OutputTable.push_back( IITDescriptor::get(IITDescriptor::SameVecWidth, OverloadIndex)); + // IIT_SAME_VEC_WIDTH_ARG entry is followed by the element type. 
+ DecodeIITType(NextElt, Infos, OutputTable); return; } case IIT_VEC_OF_ANYPTRS_TO_ELT: { @@ -451,8 +452,9 @@ DecodeIITType(unsigned &NextElt, ArrayRef Infos, #define GET_INTRINSIC_GENERATOR_GLOBAL #include "llvm/IR/IntrinsicImpl.inc" -void Intrinsic::getIntrinsicInfoTableEntries( - ID id, SmallVectorImpl &T) { +std::tuple, unsigned, bool> +Intrinsic::getIntrinsicInfoTableEntries(ID id, + SmallVectorImpl &T) { // Note that `FixedEncodingTy` is defined in IntrinsicImpl.inc and can be // uint16_t or uint32_t based on the the value of `Use16BitFixedEncoding` in // IntrinsicEmitter.cpp. @@ -497,8 +499,21 @@ void Intrinsic::getIntrinsicInfoTableEntries( // Okay, decode the table into the output vector of IITDescriptors. DecodeIITType(NextElt, IITEntries, T); - while (IITEntries[NextElt] != IIT_Done) + unsigned NumArgs = 0; + while (IITEntries[NextElt] != IIT_Done) { DecodeIITType(NextElt, IITEntries, T); + ++NumArgs; + } + + ArrayRef TableRef = T; + + bool IsVarArg = false; + if (TableRef.back().Kind == Intrinsic::IITDescriptor::VarArg) { + IsVarArg = true; + TableRef.consume_back(); + --NumArgs; + } + return {TableRef, NumArgs, IsVarArg}; } static Type *DecodeFixedType(ArrayRef &Infos, @@ -511,8 +526,6 @@ static Type *DecodeFixedType(ArrayRef &Infos, switch (D.Kind) { case IITDescriptor::Void: return Type::getVoidTy(Context); - case IITDescriptor::VarArg: - return Type::getVoidTy(Context); case IITDescriptor::MMX: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); case IITDescriptor::AMX: @@ -589,6 +602,10 @@ static Type *DecodeFixedType(ArrayRef &Infos, assert(VTy && "Expected overload type to be a Vector Type"); return VectorType::getInteger(VTy); } + case IITDescriptor::VarArg: + // VarArg token should be consumed by `getIntrinsicInfoTableEntries`, so we + // should never see it here. 
+ llvm_unreachable("IITDescriptor::VarArg not expected"); } llvm_unreachable("unhandled"); } @@ -596,10 +613,7 @@ static Type *DecodeFixedType(ArrayRef &Infos, FunctionType *Intrinsic::getType(LLVMContext &Context, ID id, ArrayRef OverloadTys) { SmallVector Table; - getIntrinsicInfoTableEntries(id, Table); - ArrayRef TableRef = Table; - - bool IsVarArg = isIntrinsicVarArg(TableRef, /*Consume=*/true); + auto [TableRef, _, IsVarArg] = getIntrinsicInfoTableEntries(id, Table); Type *ResultTy = DecodeFixedType(TableRef, OverloadTys, Context); @@ -777,16 +791,13 @@ Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, Type *RetTy, // Get the intrinsic signature metadata. SmallVector Table; - getIntrinsicInfoTableEntries(id, Table); - ArrayRef TableRef = Table; - bool IsVarArg = isIntrinsicVarArg(TableRef, /*Consume=*/false); - + auto [TableRef, NumArgs, IsVarArg] = getIntrinsicInfoTableEntries(id, Table); FunctionType *FTy = FunctionType::get(RetTy, ArgTys, IsVarArg); // Automatically determine the overloaded types. SmallVector OverloadTys; - [[maybe_unused]] bool IsValid = - ::isSignatureValid(FTy, TableRef, OverloadTys, nulls()); + [[maybe_unused]] bool IsValid = ::isSignatureValid( + FTy, TableRef, NumArgs, IsVarArg, OverloadTys, nulls()); assert(IsValid && "intrinsic signature mismatch"); return getOrInsertIntrinsicDeclarationImpl(M, id, OverloadTys, FTy); } @@ -859,8 +870,6 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, switch (D.Kind) { case IITDescriptor::Void: return !Ty->isVoidTy(); - case IITDescriptor::VarArg: - return true; case IITDescriptor::MMX: { FixedVectorType *VT = dyn_cast(Ty); return !VT || VT->getNumElements() != 1 || @@ -1050,44 +1059,40 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, return true; return ThisArgVecTy != VectorType::getInteger(ReferenceType); } + case IITDescriptor::VarArg: + // VarArg token should be consumed by `getIntrinsicInfoTableEntries`, so we + // should never see it here. 
+ llvm_unreachable("IITDescriptor::VarArg not expected"); } llvm_unreachable("unhandled"); } -/// Returns true if the intrinsic is a VarArg intrinsics. If \p Consume is true -/// the IITDescriptor for the VarArg is consumed and removed from \p Infos, else -/// it stays unchanged. -static bool isIntrinsicVarArg(ArrayRef &Infos, - bool Consume) { - if (!Infos.empty() && Infos.back().Kind == Intrinsic::IITDescriptor::VarArg) { - if (Consume) - Infos.consume_back(); - return true; - } - return false; -} - /// Return true if the function type \p FTy is a valid type signature for the -/// type constraints specified in the .td file, represented by \p Infos. -/// The overloaded type for the intrinsic are pushed to the OverloadTys vector. +/// type constraints specified in the .td file, represented by \p Infos and +/// \p IsVarArg. The overloaded types for the intrinsic are pushed to the +/// \p OverloadTys vector. /// /// If the type is not valid, returns false and prints an error message to /// \p OS. static bool isSignatureValid(FunctionType *FTy, ArrayRef &Infos, + unsigned NumArgs, bool IsVarArg, SmallVectorImpl &OverloadTys, raw_ostream &OS) { - bool IsVarArg = isIntrinsicVarArg(Infos, /*Consume=*/true); - SmallVector DeferredChecks; if (matchIntrinsicType(FTy->getReturnType(), Infos, OverloadTys, DeferredChecks, false)) { OS << "intrinsic has incorrect return type!"; return false; } - unsigned NumDeferredReturnChecks = DeferredChecks.size(); + if (FTy->getNumParams() != NumArgs) { + OS << "intrinsic has incorrect number of args. 
Expected " << NumArgs + << ", but got " << FTy->getNumParams(); + return false; + } + for (Type *Ty : FTy->params()) { if (matchIntrinsicType(Ty, Infos, OverloadTys, DeferredChecks, false)) { OS << "intrinsic has incorrect argument type!"; @@ -1137,10 +1142,9 @@ bool Intrinsic::isSignatureValid(Intrinsic::ID ID, FunctionType *FT, return false; SmallVector Table; - getIntrinsicInfoTableEntries(ID, Table); - ArrayRef TableRef = Table; + auto [TableRef, NumArgs, IsVarArg] = getIntrinsicInfoTableEntries(ID, Table); - return ::isSignatureValid(FT, TableRef, OverloadTys, OS); + return ::isSignatureValid(FT, TableRef, NumArgs, IsVarArg, OverloadTys, OS); } bool Intrinsic::isSignatureValid(Function *F, diff --git a/llvm/test/Assembler/implicit-intrinsic-declaration-invalid4.ll b/llvm/test/Assembler/implicit-intrinsic-declaration-invalid4.ll new file mode 100644 index 0000000000000..1a4e378519306 --- /dev/null +++ b/llvm/test/Assembler/implicit-intrinsic-declaration-invalid4.ll @@ -0,0 +1,9 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +; Use of intrinsic with an invalid signature should be rejected. + +; CHECK: intrinsic has incorrect number of args. Expected 1, but got 2 +define void @test(float %a) { + call float @llvm.ceil.f32(float %a, float %a) + ret void +} From fc25712d1b472d2d38a648fc286082aa8b3d8a29 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Mon, 11 May 2026 09:27:01 -0700 Subject: [PATCH 301/538] [lldb] Rename GetInstanceVariableName to GetInstanceName (NFC) (#196453) Based on Jim's comments (https://github.com/llvm/llvm-project/pull/195187#discussion_r3205135577) which highlights that it is incorrect to call this/self an "instance variable". I went with "instance name" to leave out the word "object", since not all instances values are objects. 
--- lldb/include/lldb/Symbol/SymbolContext.h | 6 +++--- lldb/include/lldb/Target/Language.h | 2 +- lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h | 2 +- lldb/source/Plugins/Language/ObjC/ObjCLanguage.h | 2 +- .../Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h | 2 +- lldb/source/Symbol/SymbolContext.cpp | 4 ++-- lldb/source/Target/StackFrame.cpp | 6 +++--- lldb/source/ValueObject/DILEval.cpp | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h index 777317c5200a3..f7295cdaaebc3 100644 --- a/lldb/include/lldb/Symbol/SymbolContext.h +++ b/lldb/include/lldb/Symbol/SymbolContext.h @@ -269,13 +269,13 @@ class SymbolContext { /// represented by this symbol context object, nullptr otherwise. Block *GetFunctionBlock(); - /// Determines the name of the instance variable for the this decl context. + /// Determines the name of the instance for this decl context. /// /// For C++ the name is "this", for Objective-C the name is "self". /// /// \return - /// Returns a StringRef for the name of the instance variable. - llvm::StringRef GetInstanceVariableName(); + /// Returns a StringRef for the name of the instance. 
+ llvm::StringRef GetInstanceName(); /// Sorts the types in TypeMap according to SymbolContext to TypeList /// diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h index 4b8d4eaf262a0..9205b5e339977 100644 --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -468,7 +468,7 @@ class Language : public PluginInterface { return ConstString(); } - virtual llvm::StringRef GetInstanceVariableName() { return {}; } + virtual llvm::StringRef GetInstanceName() { return {}; } /// Given a symbol context list of matches which supposedly represent the /// same file and line number in a CU, erases those that should be ignored diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h index 53fae3fe71e3a..e9b0e80d8860b 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h @@ -229,7 +229,7 @@ class CPlusPlusLanguage : public Language { static llvm::Expected SubstituteStructorAliases_ItaniumMangle(llvm::StringRef mangled_name); - llvm::StringRef GetInstanceVariableName() override { return "this"; } + llvm::StringRef GetInstanceName() override { return "this"; } FormatEntity::Entry GetFunctionNameFormat() const override; diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index cee80ed0b7f40..7b8f56063fe8b 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -184,7 +184,7 @@ class ObjCLanguage : public Language { return false; } - llvm::StringRef GetInstanceVariableName() override { return "self"; } + llvm::StringRef GetInstanceName() override { return "self"; } virtual std::optional GetBooleanFromString(llvm::StringRef str) const override; diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h 
b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h index ab45acd9c68db..9506fec391a9a 100644 --- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h @@ -37,7 +37,7 @@ class ObjCPlusPlusLanguage : public Language { static lldb_private::Language *CreateInstance(lldb::LanguageType language); - llvm::StringRef GetInstanceVariableName() override { return "self"; } + llvm::StringRef GetInstanceName() override { return "self"; } virtual std::optional GetBooleanFromString(llvm::StringRef str) const override; diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index dde39fe8f5738..b36992e12c688 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -530,7 +530,7 @@ Block *SymbolContext::GetFunctionBlock() { return nullptr; } -llvm::StringRef SymbolContext::GetInstanceVariableName() { +llvm::StringRef SymbolContext::GetInstanceName() { LanguageType lang_type = eLanguageTypeUnknown; if (Block *function_block = GetFunctionBlock()) @@ -541,7 +541,7 @@ llvm::StringRef SymbolContext::GetInstanceVariableName() { lang_type = GetLanguage(); if (auto *lang = Language::FindPlugin(lang_type)) - return lang->GetInstanceVariableName(); + return lang->GetInstanceName(); return {}; } diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index c5939011ea91c..ff21a5c8fe7d1 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -637,9 +637,9 @@ ValueObjectSP StackFrame::LegacyGetValueForVariableExpressionPath( // Check for direct ivars access which helps us with implicit access to // ivars using "this" or "self". 
GetSymbolContext(eSymbolContextFunction | eSymbolContextBlock); - llvm::StringRef instance_var_name = m_sc.GetInstanceVariableName(); - if (!instance_var_name.empty()) { - var_sp = variable_list->FindVariable(ConstString(instance_var_name)); + llvm::StringRef instance_name = m_sc.GetInstanceName(); + if (!instance_name.empty()) { + var_sp = variable_list->FindVariable(ConstString(instance_name)); if (var_sp) { separator_idx = 0; if (Type *var_type = var_sp->GetType()) diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp index 38db893c923b8..54ccfe76330a2 100644 --- a/lldb/source/ValueObject/DILEval.cpp +++ b/lldb/source/ValueObject/DILEval.cpp @@ -358,8 +358,8 @@ lldb::ValueObjectSP LookupIdentifier(llvm::StringRef name_ref, // Try looking for an instance variable (class member). SymbolContext sc = stack_frame->GetSymbolContext( lldb::eSymbolContextFunction | lldb::eSymbolContextBlock); - llvm::StringRef ivar_name = sc.GetInstanceVariableName(); - value_sp = stack_frame->FindVariable(ConstString(ivar_name)); + llvm::StringRef instance_name = sc.GetInstanceName(); + value_sp = stack_frame->FindVariable(ConstString(instance_name)); if (value_sp) value_sp = value_sp->GetChildMemberWithName(name_ref); From d3f679582b69ef0388792f2054479bb0976d0520 Mon Sep 17 00:00:00 2001 From: Vito Secona <77039267+secona@users.noreply.github.com> Date: Mon, 11 May 2026 23:27:45 +0700 Subject: [PATCH 302/538] [mlir][SparseTensor] Terminology cleanup PIV -> PCV (NFC) (#196707) This PR standardizes terminology in the MLIR sparsifier by replacing the PIV (Pointer, Index, Value) terminology with the PCV (Position, Coordinate, Value) terminology established in https://reviews.llvm.org/D144773. The changes include renaming template parameters and error macros. 
--- .../mlir/ExecutionEngine/SparseTensor/File.h | 8 ++++---- .../ExecutionEngine/SparseTensor/Storage.cpp | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h index 7e2190dc28084..a6f199c0a0c1e 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h @@ -199,16 +199,16 @@ class SparseTensorReader final { /// Allocates a new sparse-tensor storage object with the given encoding, /// initializes it by reading all the elements from the file, and then - /// closes the file. Templated on P, I, and V. - template - SparseTensorStorage * + /// closes the file. Templated on P, C, and V. + template + SparseTensorStorage * readSparseTensor(uint64_t lvlRank, const uint64_t *lvlSizes, const LevelType *lvlTypes, const uint64_t *dim2lvl, const uint64_t *lvl2dim) { const uint64_t dimRank = getRank(); MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim); auto *lvlCOO = readCOO(map, lvlSizes); - auto *tensor = SparseTensorStorage::newFromCOO( + auto *tensor = SparseTensorStorage::newFromCOO( dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, lvlCOO); delete lvlCOO; diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp index acb2d1bb5bed6..fd61e9b83ec02 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -50,20 +50,20 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT } // Helper macro for wrong "partial method specialization" errors. -#define FATAL_PIV(NAME) \ - fprintf(stderr, " type mismatch for: " #NAME); \ +#define FATAL_PCV(NAME) \ + fprintf(stderr, " type mismatch for: " #NAME); \ exit(1); #define IMPL_GETPOSITIONS(PNAME, P) \ void SparseTensorStorageBase::getPositions(std::vector

**, uint64_t) { \ - FATAL_PIV("getPositions" #PNAME); \ + FATAL_PCV("getPositions" #PNAME); \ } MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETPOSITIONS) #undef IMPL_GETPOSITIONS #define IMPL_GETCOORDINATES(CNAME, C) \ void SparseTensorStorageBase::getCoordinates(std::vector **, uint64_t) { \ - FATAL_PIV("getCoordinates" #CNAME); \ + FATAL_PCV("getCoordinates" #CNAME); \ } MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETCOORDINATES) #undef IMPL_GETCOORDINATES @@ -71,21 +71,21 @@ MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETCOORDINATES) #define IMPL_GETCOORDINATESBUFFER(CNAME, C) \ void SparseTensorStorageBase::getCoordinatesBuffer(std::vector **, \ uint64_t) { \ - FATAL_PIV("getCoordinatesBuffer" #CNAME); \ + FATAL_PCV("getCoordinatesBuffer" #CNAME); \ } MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETCOORDINATESBUFFER) #undef IMPL_GETCOORDINATESBUFFER #define IMPL_GETVALUES(VNAME, V) \ void SparseTensorStorageBase::getValues(std::vector **) { \ - FATAL_PIV("getValues" #VNAME); \ + FATAL_PCV("getValues" #VNAME); \ } MLIR_SPARSETENSOR_FOREVERY_V(IMPL_GETVALUES) #undef IMPL_GETVALUES #define IMPL_LEXINSERT(VNAME, V) \ void SparseTensorStorageBase::lexInsert(const uint64_t *, V) { \ - FATAL_PIV("lexInsert" #VNAME); \ + FATAL_PCV("lexInsert" #VNAME); \ } MLIR_SPARSETENSOR_FOREVERY_V(IMPL_LEXINSERT) #undef IMPL_LEXINSERT @@ -93,9 +93,9 @@ MLIR_SPARSETENSOR_FOREVERY_V(IMPL_LEXINSERT) #define IMPL_EXPINSERT(VNAME, V) \ void SparseTensorStorageBase::expInsert(uint64_t *, V *, bool *, uint64_t *, \ uint64_t, uint64_t) { \ - FATAL_PIV("expInsert" #VNAME); \ + FATAL_PCV("expInsert" #VNAME); \ } MLIR_SPARSETENSOR_FOREVERY_V(IMPL_EXPINSERT) #undef IMPL_EXPINSERT -#undef FATAL_PIV +#undef FATAL_PCV From a8b0d14ccf14395726ad138adacb4156cf35eaaf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 11 May 2026 09:31:48 -0700 Subject: [PATCH 303/538] [AArch64] Improve v1i64/v2i64 clmulh. (#196694) We can use pmull/pmull2 to compute the full product then take the high half. 
--------- Co-authored-by: Matthew Devereau --- .../Target/AArch64/AArch64ISelLowering.cpp | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 9 ++++ llvm/test/CodeGen/AArch64/clmul-fixed.ll | 41 ++++++------------- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8dd94e1418400..8608bfe0a8205 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1533,6 +1533,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasAES()) { setOperationAction(ISD::CLMUL, {MVT::i16, MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::CLMUL, {MVT::v1i64, MVT::v2i64}, Legal); + setOperationAction(ISD::CLMULH, {MVT::v1i64, MVT::v2i64}, Legal); } } else /* !isNeonAvailable */ { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c7d749ffe81e1..a8ca9f0f39a53 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7983,6 +7983,15 @@ let Predicates = [HasAES] in { (i64 1), (v2i64 (PMULLv2i64 $Rn, $Rm)), (i64 0)))>; + + def : Pat<(clmulh v1i64:$Rn, v1i64:$Rm), + (EXTRACT_SUBREG (v2i64 (EXTv16i8 (v2i64 (PMULLv1i64 $Rn, $Rm)), + (v2i64 (PMULLv1i64 $Rn, $Rm)), + 8)), dsub)>; + def : Pat<(clmulh v2i64:$Rn, v2i64:$Rm), + (ZIP2v2i64 (v2i64 (PMULLv1i64 (v1i64 (EXTRACT_SUBREG $Rn, dsub)), + (v1i64 (EXTRACT_SUBREG $Rm, dsub)))), + (v2i64 (PMULLv2i64 $Rn, $Rm)))>; } def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)), diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll index c0d1116230067..5360edda0c5eb 100644 --- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll +++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll @@ -6536,15 +6536,13 @@ define <2 x i64> @clmulr_v2i64_neon(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; CHECK-AES-LABEL: 
clmulr_v2i64_neon: ; CHECK-AES: // %bb.0: -; CHECK-AES-NEXT: rev64 v1.16b, v1.16b -; CHECK-AES-NEXT: rev64 v0.16b, v0.16b -; CHECK-AES-NEXT: rbit v1.16b, v1.16b -; CHECK-AES-NEXT: rbit v0.16b, v0.16b ; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d ; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d -; CHECK-AES-NEXT: mov v0.d[1], v2.d[0] -; CHECK-AES-NEXT: rev64 v0.16b, v0.16b -; CHECK-AES-NEXT: rbit v0.16b, v0.16b +; CHECK-AES-NEXT: zip2 v1.2d, v0.2d, v2.2d +; CHECK-AES-NEXT: mov v3.16b, v0.16b +; CHECK-AES-NEXT: mov v3.d[1], v2.d[0] +; CHECK-AES-NEXT: add v0.2d, v1.2d, v1.2d +; CHECK-AES-NEXT: usra v0.2d, v3.2d, #63 ; CHECK-AES-NEXT: ret %a.ext = zext <2 x i64> %a to <2 x i128> %b.ext = zext <2 x i64> %b to <2 x i128> @@ -7015,13 +7013,10 @@ define <1 x i64> @clmulr_v1i64_neon(<1 x i64> %a, <1 x i64> %b) nounwind { ; ; CHECK-AES-LABEL: clmulr_v1i64_neon: ; CHECK-AES: // %bb.0: -; CHECK-AES-NEXT: rev64 v1.8b, v1.8b -; CHECK-AES-NEXT: rev64 v0.8b, v0.8b -; CHECK-AES-NEXT: rbit v1.8b, v1.8b -; CHECK-AES-NEXT: rbit v0.8b, v0.8b -; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d -; CHECK-AES-NEXT: rev64 v0.8b, v0.8b -; CHECK-AES-NEXT: rbit v0.8b, v0.8b +; CHECK-AES-NEXT: pmull v1.1q, v0.1d, v1.1d +; CHECK-AES-NEXT: ext v0.16b, v1.16b, v1.16b, #8 +; CHECK-AES-NEXT: shl d0, d0, #1 +; CHECK-AES-NEXT: usra d0, d1, #63 ; CHECK-AES-NEXT: ret %a.ext = zext <1 x i64> %a to <1 x i128> %b.ext = zext <1 x i64> %b to <1 x i128> @@ -8089,16 +8084,9 @@ define <2 x i64> @clmulh_v2i64_neon(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; CHECK-AES-LABEL: clmulh_v2i64_neon: ; CHECK-AES: // %bb.0: -; CHECK-AES-NEXT: rev64 v1.16b, v1.16b -; CHECK-AES-NEXT: rev64 v0.16b, v0.16b -; CHECK-AES-NEXT: rbit v1.16b, v1.16b -; CHECK-AES-NEXT: rbit v0.16b, v0.16b ; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d ; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d -; CHECK-AES-NEXT: mov v0.d[1], v2.d[0] -; CHECK-AES-NEXT: rev64 v0.16b, v0.16b -; CHECK-AES-NEXT: rbit v0.16b, v0.16b -; CHECK-AES-NEXT: ushr v0.2d, v0.2d, #1 +; 
CHECK-AES-NEXT: zip2 v0.2d, v0.2d, v2.2d ; CHECK-AES-NEXT: ret %a.ext = zext <2 x i64> %a to <2 x i128> %b.ext = zext <2 x i64> %b to <2 x i128> @@ -8570,14 +8558,9 @@ define <1 x i64> @clmulh_v1i64_neon(<1 x i64> %a, <1 x i64> %b) nounwind { ; ; CHECK-AES-LABEL: clmulh_v1i64_neon: ; CHECK-AES: // %bb.0: -; CHECK-AES-NEXT: rev64 v1.8b, v1.8b -; CHECK-AES-NEXT: rev64 v0.8b, v0.8b -; CHECK-AES-NEXT: rbit v1.8b, v1.8b -; CHECK-AES-NEXT: rbit v0.8b, v0.8b ; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d -; CHECK-AES-NEXT: rev64 v0.8b, v0.8b -; CHECK-AES-NEXT: rbit v0.8b, v0.8b -; CHECK-AES-NEXT: ushr d0, d0, #1 +; CHECK-AES-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-AES-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-AES-NEXT: ret %a.ext = zext <1 x i64> %a to <1 x i128> %b.ext = zext <1 x i64> %b to <1 x i128> From bcb84e234980ca42ee47e21e4c97180c788c946a Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Mon, 11 May 2026 18:32:50 +0200 Subject: [PATCH 304/538] [AMDGPU] Replace vdst_in opcode exclusion list with position check (#196946) Use getNamedOperandIdx to detect if vdst_in has already been added by a prior converter, instead of maintaining a hardcoded opcode list. --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 48 +++---------------- 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3ae957906fd82..5bacb6628474d 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9601,49 +9601,13 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Inst.addOperand(Inst.getOperand(0)); } - // Adding vdst_in operand is already covered for these DPP instructions in - // cvtVOP3DPP. 
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && - !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx11 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx11 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx11 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx11 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx11 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx11 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx11 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx11 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx11 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx11 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx11 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx11 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp_gfx1250 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp8_gfx1250 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 || - Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 || - Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 || - Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 || - Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 || - Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 || - Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 || - Opc == 
AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 || - Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 || - Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 || - Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 || - Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) { + // Append vdst_in only if a previous converter (cvtVOP3DPP for DPP variants, + // cvtVOP3 for byte_sel variants) hasn't already placed it. Use the position + // of the named operand to detect that, the same way cvtVOP3DPP does + // internally. + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + if (VdstInIdx != -1 && VdstInIdx == static_cast(Inst.getNumOperands())) Inst.addOperand(Inst.getOperand(0)); - } int BitOp3Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::bitop3); if (BitOp3Idx != -1) { From 178651f31b318e19f6e2f7cecd659e6ac63fa8c4 Mon Sep 17 00:00:00 2001 From: "Ivan R. Ivanov" Date: Mon, 11 May 2026 18:38:13 +0200 Subject: [PATCH 305/538] [Hashing] Silence compiler warning for empty parameter packs (#196941) --- llvm/include/llvm/ADT/Hashing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ADT/Hashing.h b/llvm/include/llvm/ADT/Hashing.h index c3cc37683c79a..6d679d815a417 100644 --- a/llvm/include/llvm/ADT/Hashing.h +++ b/llvm/include/llvm/ADT/Hashing.h @@ -327,7 +327,7 @@ template hash_code hash_combine(const Ts &...args) { // Round up so `data()` is non-null when Total == 0; combine_bytes won't // read the buffer in that case (len=0 short-circuits in xxh3_64bits). std::array(1, Total)> buf; - size_t off = 0; + [[maybe_unused]] size_t off = 0; (hashing::detail::store_hashable_data(buf.data(), off, args), ...); return hashing::detail::combine_bytes(buf.data(), Total); } From e09449601b4ded2fa1f20955fc2a1be2795eef37 Mon Sep 17 00:00:00 2001 From: Arseniy Zaostrovnykh Date: Mon, 11 May 2026 18:39:44 +0200 Subject: [PATCH 306/538] fixup! 
[clang] Forward `-fvalidate-ast-input-files-content` when loading AST dumps Apparently `-t 300001010000` is not a universally valid date format. This should fix the buildbot failure caused by #196298 Replace with an in-line python script that should be more portable. --- clang/test/Analysis/ctu/reusable-pch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Analysis/ctu/reusable-pch.c b/clang/test/Analysis/ctu/reusable-pch.c index ddcccc9cf6150..5d757f9499998 100644 --- a/clang/test/Analysis/ctu/reusable-pch.c +++ b/clang/test/Analysis/ctu/reusable-pch.c @@ -19,8 +19,8 @@ // Step 2b: Run with content validation - no difference. // RUN: %{ctu_analysis} %t/main.c -fvalidate-ast-input-files-content -// Step 3: Set mtime of the source from which PCH was built to the year 3000 (way in the future). -// RUN: touch -t 300001010000 %t/other.c +// Step 3: Advance mtime of the source from which PCH was built. +// RUN: %python -c "import os, sys, time; os.utime(sys.argv[1], (time.time() + 120, time.time() + 120))" %t/other.c // Step 4a: Run CTU using the "stale" PCH, and it should still load it and find the division by zero bug. 
// RUN: %{ctu_analysis} -fvalidate-ast-input-files-content %t/main.c From 8e7ff6e962ba98b1eb459e39af5310e4b59e49a9 Mon Sep 17 00:00:00 2001 From: Jiahao Guo Date: Tue, 12 May 2026 00:43:03 +0800 Subject: [PATCH 307/538] [CIR][AArch64] Lower NEON vuzp1/2 intrinsics (#195527) ### Summary part of : https://github.com/llvm/llvm-project/issues/185382 Lower vuzp1 and vuzp2 intrinsics in https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#unzip-elements All the intrinsics are handled inline in llvm-project/build/lib/clang/23/include/arm_neon.h like: ``` #ifdef __LITTLE_ENDIAN__ __ai __attribute__((target("neon"))) int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) { int8x8_t __ret; __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14); return __ret; } #else __ai __attribute__((target("neon"))) int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) { int8x8_t __ret; int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, __lane_reverse_64_8); int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, __lane_reverse_64_8); __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14); __ret = __builtin_shufflevector(__ret, __ret, __lane_reverse_64_8); return __ret; } #endif ``` So no additional special lowering logic is needed. 
--- .../fp8-intrinsics/acle_neon_fp8_untyped.c | 40 -- clang/test/CodeGen/AArch64/neon-perm.c | 422 +------------- clang/test/CodeGen/AArch64/neon/perm.c | 529 +++++++++++++++++- clang/test/CodeGen/AArch64/poly64.c | 20 - 4 files changed, 529 insertions(+), 482 deletions(-) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c index 0a9a88fc249b0..e386d2cca2cb1 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -631,26 +631,6 @@ mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { return vtrn1q_mf8(a, b); } -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -mfloat8x8_t test_vuzp1_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vuzp1_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -mfloat8x16_t test_vuzp1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vuzp1q_mf8(a, b); -} - // CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_mf8( // CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -671,26 +651,6 @@ mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { return vtrn2q_mf8(a, b); } -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> 
[[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -mfloat8x8_t test_vuzp2_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vuzp2_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -mfloat8x16_t test_vuzp2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vuzp2q_mf8(a, b); -} - // CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl1_mf8( // CHECK-SAME: <16 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c index 7e3745b952f50..79cf97f10ae40 100644 --- a/clang/test/CodeGen/AArch64/neon-perm.c +++ b/clang/test/CodeGen/AArch64/neon-perm.c @@ -6,428 +6,8 @@ #include -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) { - return vuzp1_s8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) { - return vuzp1q_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) { - return vuzp1_s16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) { - return vuzp1q_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) { - return vuzp1_s32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) { - return vuzp1q_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) { - return vuzp1q_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: 
[[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) { - return vuzp1_u8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) { - return vuzp1q_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) { - return vuzp1_u16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) { - return vuzp1q_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) { - return vuzp1_u32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef 
[[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) { - return vuzp1q_u32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) { - return vuzp1q_u64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x float> @test_vuzp1_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] -// -float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) { - return vuzp1_f32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x float> @test_vuzp1q_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] -// -float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) { - return vuzp1q_f32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vuzp1q_f64( -// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] -// -float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) { - return vuzp1q_f64(a, b); -} - -// CHECK-LABEL: 
define dso_local <8 x i8> @test_vuzp1_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) { - return vuzp1_p8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) { - return vuzp1q_p8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) { - return vuzp1_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) { - return vuzp1q_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) { - return vuzp2_s8(a, b); -} - 
-// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) { - return vuzp2q_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) { - return vuzp2_s16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) { - return vuzp2q_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) { - return vuzp2_s32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) { 
- return vuzp2q_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) { - return vuzp2q_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) { - return vuzp2_u8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) { - return vuzp2q_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) { - return vuzp2_u16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -uint16x8_t 
test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) { - return vuzp2q_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) { - return vuzp2_u32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) { - return vuzp2q_u32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) { - return vuzp2q_u64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x float> @test_vuzp2_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] -// -float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) { - return vuzp2_f32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x float> @test_vuzp2q_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> 
[[B]], <4 x i32> -// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] -// -float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) { - return vuzp2q_f32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vuzp2q_f64( -// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] -// -float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) { - return vuzp2q_f64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) { - return vuzp2_p8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) { - return vuzp2q_p8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) { - return vuzp2_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) { - return vuzp2q_p16(a, b); -} - // CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> // CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] diff --git a/clang/test/CodeGen/AArch64/neon/perm.c b/clang/test/CodeGen/AArch64/neon/perm.c index 6c3a9a44a7084..c90eb8290db55 100644 --- a/clang/test/CodeGen/AArch64/neon/perm.c +++ b/clang/test/CodeGen/AArch64/neon/perm.c @@ -12,7 +12,6 @@ // ACLE section headings based on v2025Q2 of the ACLE specification: // * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#zip-elements // -// TODO: Migrate the unzip elements intrinsics in https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#unzip-elements //============================================================================= #include @@ -1280,6 +1279,534 @@ mfloat8x16x2_t test_vuzpq_mf8(mfloat8x16_t a, mfloat8x16_t b) { return vuzpq_mf8(a, b); } +// LLVM-LABEL: @test_vuzp1_s8( +// CIR-LABEL: @vuzp1_s8( +int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp1_s8(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_s8( +// CIR-LABEL: @vuzp1q_s8( +int8x16_t 
test_vuzp1q_s8(int8x16_t a, int8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i, #cir.int<16> : !s64i, #cir.int<18> : !s64i, #cir.int<20> : !s64i, #cir.int<22> : !s64i, #cir.int<24> : !s64i, #cir.int<26> : !s64i, #cir.int<28> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !s8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp1q_s8(a, b); +} + +// LLVM-LABEL: @test_vuzp1_s16( +// CIR-LABEL: @vuzp1_s16( +int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s16i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[VUZP]] + return vuzp1_s16(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_s16( +// CIR-LABEL: @vuzp1q_s16( +int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[VUZP]] + return vuzp1q_s16(a, b); +} + +// LLVM-LABEL: @test_vuzp1_s32( +// CIR-LABEL: @vuzp1_s32( +int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s32i>) 
[#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[VUZP]] + return vuzp1_s32(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_s32( +// CIR-LABEL: @vuzp1q_s32( +int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[VUZP]] + return vuzp1q_s32(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_s64( +// CIR-LABEL: @vuzp1q_s64( +int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[VUZP]] + return vuzp1q_s64(a, b); +} + +// LLVM-LABEL: @test_vuzp1_u8( +// CIR-LABEL: @vuzp1_u8( +uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp1_u8(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_u8( +// CIR-LABEL: @vuzp1q_u8( +uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) { +// CIR: 
cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i, #cir.int<16> : !s64i, #cir.int<18> : !s64i, #cir.int<20> : !s64i, #cir.int<22> : !s64i, #cir.int<24> : !s64i, #cir.int<26> : !s64i, #cir.int<28> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp1q_u8(a, b); +} + +// LLVM-LABEL: @test_vuzp1_u16( +// CIR-LABEL: @vuzp1_u16( +uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[VUZP]] + return vuzp1_u16(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_u16( +// CIR-LABEL: @vuzp1q_u16( +uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[VUZP]] + return vuzp1q_u16(a, b); +} + +// LLVM-LABEL: @test_vuzp1_u32( +// CIR-LABEL: @vuzp1_u32( +uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : 
!cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[VUZP]] + return vuzp1_u32(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_u32( +// CIR-LABEL: @vuzp1q_u32( +uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[VUZP]] + return vuzp1q_u32(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_u64( +// CIR-LABEL: @vuzp1q_u64( +uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[VUZP]] + return vuzp1q_u64(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_p64( +// CIR-LABEL: @vuzp1q_p64( +poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[VUZP]] + return vuzp1q_p64(a, b); +} + +// LLVM-LABEL: @test_vuzp1_f32( +// CIR-LABEL: @vuzp1_f32( +float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x 
float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: ret <2 x float> [[VUZP]] + return vuzp1_f32(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_f32( +// CIR-LABEL: @vuzp1q_f32( +float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: ret <4 x float> [[VUZP]] + return vuzp1q_f32(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_f64( +// CIR-LABEL: @vuzp1q_f64( +float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// LLVM: ret <2 x double> [[VUZP]] + return vuzp1q_f64(a, b); +} + +// LLVM-LABEL: @test_vuzp1_p8( +// CIR-LABEL: @vuzp1_p8( +poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp1_p8(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_p8( +// CIR-LABEL: @vuzp1q_p8( +poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : 
!cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i, #cir.int<16> : !s64i, #cir.int<18> : !s64i, #cir.int<20> : !s64i, #cir.int<22> : !s64i, #cir.int<24> : !s64i, #cir.int<26> : !s64i, #cir.int<28> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp1q_p8(a, b); +} + +// LLVM-LABEL: @test_vuzp1_p16( +// CIR-LABEL: @vuzp1_p16( +poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[VUZP]] + return vuzp1_p16(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_p16( +// CIR-LABEL: @vuzp1q_p16( +poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[VUZP]] + return vuzp1q_p16(a, b); +} + +// LLVM-LABEL: @test_vuzp1_mf8( +// CIR-LABEL: @vuzp1_mf8( +mfloat8x8_t test_vuzp1_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, 
#cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp1_mf8(a, b); +} + +// LLVM-LABEL: @test_vuzp1q_mf8( +// CIR-LABEL: @vuzp1q_mf8( +mfloat8x16_t test_vuzp1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i, #cir.int<4> : !s64i, #cir.int<6> : !s64i, #cir.int<8> : !s64i, #cir.int<10> : !s64i, #cir.int<12> : !s64i, #cir.int<14> : !s64i, #cir.int<16> : !s64i, #cir.int<18> : !s64i, #cir.int<20> : !s64i, #cir.int<22> : !s64i, #cir.int<24> : !s64i, #cir.int<26> : !s64i, #cir.int<28> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp1q_mf8(a, b); +} + +// LLVM-LABEL: @test_vuzp2_s8( +// CIR-LABEL: @vuzp2_s8( +int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp2_s8(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_s8( +// CIR-LABEL: @vuzp2q_s8( +int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, 
#cir.int<13> : !s64i, #cir.int<15> : !s64i, #cir.int<17> : !s64i, #cir.int<19> : !s64i, #cir.int<21> : !s64i, #cir.int<23> : !s64i, #cir.int<25> : !s64i, #cir.int<27> : !s64i, #cir.int<29> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !s8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp2q_s8(a, b); +} + +// LLVM-LABEL: @test_vuzp2_s16( +// CIR-LABEL: @vuzp2_s16( +int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s16i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[VUZP]] + return vuzp2_s16(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_s16( +// CIR-LABEL: @vuzp2q_s16( +int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[VUZP]] + return vuzp2q_s16(a, b); +} + +// LLVM-LABEL: @test_vuzp2_s32( +// CIR-LABEL: @vuzp2_s32( +int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> 
[[VUZP]] + return vuzp2_s32(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_s32( +// CIR-LABEL: @vuzp2q_s32( +int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[VUZP]] + return vuzp2q_s32(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_s64( +// CIR-LABEL: @vuzp2q_s64( +int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[VUZP]] + return vuzp2q_s64(a, b); +} + +// LLVM-LABEL: @test_vuzp2_u8( +// CIR-LABEL: @vuzp2_u8( +uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp2_u8(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_u8( +// CIR-LABEL: @vuzp2q_u8( +uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i, #cir.int<17> : !s64i, #cir.int<19> : 
!s64i, #cir.int<21> : !s64i, #cir.int<23> : !s64i, #cir.int<25> : !s64i, #cir.int<27> : !s64i, #cir.int<29> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp2q_u8(a, b); +} + +// LLVM-LABEL: @test_vuzp2_u16( +// CIR-LABEL: @vuzp2_u16( +uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[VUZP]] + return vuzp2_u16(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_u16( +// CIR-LABEL: @vuzp2q_u16( +uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[VUZP]] + return vuzp2q_u16(a, b); +} + +// LLVM-LABEL: @test_vuzp2_u32( +// CIR-LABEL: @vuzp2_u32( +uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[VUZP]] + return vuzp2_u32(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_u32( 
+// CIR-LABEL: @vuzp2q_u32( +uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[VUZP]] + return vuzp2q_u32(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_u64( +// CIR-LABEL: @vuzp2q_u64( +uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[VUZP]] + return vuzp2q_u64(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_p64( +// CIR-LABEL: @vuzp2q_p64( +poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[VUZP]] + return vuzp2q_p64(a, b); +} + +// LLVM-LABEL: @test_vuzp2_f32( +// CIR-LABEL: @vuzp2_f32( +float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: ret <2 x float> [[VUZP]] + return vuzp2_f32(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_f32( +// CIR-LABEL: @vuzp2q_f32( +float32x4_t 
test_vuzp2q_f32(float32x4_t a, float32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: ret <4 x float> [[VUZP]] + return vuzp2q_f32(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_f64( +// CIR-LABEL: @vuzp2q_f64( +float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// LLVM: ret <2 x double> [[VUZP]] + return vuzp2q_f64(a, b); +} + +// LLVM-LABEL: @test_vuzp2_p8( +// CIR-LABEL: @vuzp2_p8( +poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[VUZP]] + return vuzp2_p8(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_p8( +// CIR-LABEL: @vuzp2q_p8( +poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i, #cir.int<17> : !s64i, #cir.int<19> : !s64i, #cir.int<21> : !s64i, #cir.int<23> : !s64i, 
#cir.int<25> : !s64i, #cir.int<27> : !s64i, #cir.int<29> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp2q_p8(a, b); +} + +// LLVM-LABEL: @test_vuzp2_p16( +// CIR-LABEL: @vuzp2_p16( +poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[VUZP]] + return vuzp2_p16(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_p16( +// CIR-LABEL: @vuzp2q_p16( +poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[VUZP]] + return vuzp2q_p16(a, b); +} + +// LLVM-LABEL: @test_vuzp2_mf8( +// CIR-LABEL: @vuzp2_mf8( +mfloat8x8_t test_vuzp2_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> 
[[VUZP]] + return vuzp2_mf8(a, b); +} + +// LLVM-LABEL: @test_vuzp2q_mf8( +// CIR-LABEL: @vuzp2q_mf8( +mfloat8x16_t test_vuzp2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i, #cir.int<5> : !s64i, #cir.int<7> : !s64i, #cir.int<9> : !s64i, #cir.int<11> : !s64i, #cir.int<13> : !s64i, #cir.int<15> : !s64i, #cir.int<17> : !s64i, #cir.int<19> : !s64i, #cir.int<21> : !s64i, #cir.int<23> : !s64i, #cir.int<25> : !s64i, #cir.int<27> : !s64i, #cir.int<29> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[VUZP:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[VUZP]] + return vuzp2q_mf8(a, b); +} + //===------------------------------------------------------===// // 2.1.9.14. Unzip elements` // https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#unzip-elements-1 diff --git a/clang/test/CodeGen/AArch64/poly64.c b/clang/test/CodeGen/AArch64/poly64.c index 5e808f07a6f56..1a7eceefa6a58 100644 --- a/clang/test/CodeGen/AArch64/poly64.c +++ b/clang/test/CodeGen/AArch64/poly64.c @@ -515,26 +515,6 @@ poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) { return vextq_p64(a, b, 1); } -// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_p64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) { - return vuzp1q_p64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_p64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 
x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) { - return vuzp2q_u64(a, b); -} - // CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_p64( // CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] From a18fbe5890acb2a4e2e379e1c65826c9e17bc221 Mon Sep 17 00:00:00 2001 From: David Stone Date: Mon, 11 May 2026 10:47:57 -0600 Subject: [PATCH 308/538] [clang-tidy][NFC] Use `std::array` instead of `SmallVector` in `UseTrailingReturnTypeCheck` (#196992) We know we have exactly 2 `Token` in the array. There is no need to use `SmallVector` here. --- .../clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index 8bc06afaeb33c..27be649c151bf 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -228,7 +228,7 @@ classifyToken(const FunctionDecl &F, Preprocessor &PP, Token Tok) { Token End; End.startToken(); End.setKind(tok::eof); - const SmallVector Stream{Tok, End}; + const std::array Stream{Tok, End}; // FIXME: do not report these token to Preprocessor.TokenWatcher. PP.EnterTokenStream(Stream, false, /*IsReinject=*/false); From 7377bac59b4aea64da09873b44df1430571e93c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Mon, 11 May 2026 18:50:37 +0200 Subject: [PATCH 309/538] [clang-tidy][clang][diagnostics] Unify system header/macro supression (#193774) Before this change, if Clang-Tidy ran with `--system-headers=false` (the default config), it suppressed all reports where the location was in a system header or in the expansion of a macro from a system header. This was problematic because e.g. 
practically all reports of 'clang-diagnostic-invalid-offsetof' (a/k/a `-Winvalid-offsetof`) are in the expansion of a system macro (namely 'offsetof'), so Clang-Tidy was unable to display these reports. In the clang frontend there was a more refined logic: certain diagnostics have the 'ShowInSystemHeader' and/or 'ShowInSystemMacro' properties which ensure that they are printed (by the clang frontend) even if the compiler wouldn't otherwise print diagnostics from system headers/macros. This commit exposes this more refined logic under the name `DiagnosticIDs::shouldSuppressAsSystemWarning` and invokes this logic from Clang-Tidy to ensure that the `--system-headers=false` flag of Clang-Tidy cannot suppress 'clang-diagnostic-*' reports that are "protected by" the properties 'ShowInSystemHeader' or 'ShowInSystemMacro'. The changes in `DiagnosticIDs.{cpp,h}` are NFC, they do not affect the behavior of 'clang'. This commit does not influence the reporting of "native" tidy reports and 'clang-analyzer-*' reports. However, in the future it would be possible to extend this logic and assign 'ShowInSystemHeader' and/or 'ShowInSystemMacro' properties to Clang-Tidy checks that need them. 
--- .../ClangTidyDiagnosticConsumer.cpp | 21 ++++++-- .../clang-tidy/ClangTidyDiagnosticConsumer.h | 9 +++- clang-tools-extra/docs/ReleaseNotes.rst | 7 +++ .../Inputs/system-headers/mock_cstddef.h | 1 + .../system-macro-diagnostic.cpp | 28 +++++++++++ clang/include/clang/Basic/DiagnosticIDs.h | 9 ++++ clang/lib/Basic/DiagnosticIDs.cpp | 48 ++++++++++++------- 7 files changed, 99 insertions(+), 24 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/Inputs/system-headers/mock_cstddef.h create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/system-macro-diagnostic.cpp diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index f7232645a329c..88d0a433bc7fb 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -312,6 +312,10 @@ std::string ClangTidyContext::getCheckName(unsigned DiagnosticID) const { return ""; } +bool ClangTidyContext::isCompilerDiagnostic(unsigned DiagnosticID) const { + return !CheckNamesByDiagnosticID.contains(DiagnosticID); +} + ClangTidyDiagnosticConsumer::ClangTidyDiagnosticConsumer( ClangTidyContext &Ctx, DiagnosticsEngine *ExternalDiagEngine, bool RemoveIncompatibleErrors, bool GetFixesFromNotes, @@ -470,7 +474,7 @@ void ClangTidyDiagnosticConsumer::HandleDiagnostic( } if (Info.hasSourceManager()) - checkFilters(Info.getLocation(), Info.getSourceManager()); + checkFilters(Info.getLocation(), Info.getID(), Info.getSourceManager()); for (const auto &Error : SuppressionErrors) Context.diag(Error); @@ -567,6 +571,7 @@ void ClangTidyDiagnosticConsumer::forwardDiagnostic(const Diagnostic &Info) { } void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, + unsigned DiagnosticID, const SourceManager &Sources) { // Invalid location may mean a diagnostic in a command line, don't skip these. 
if (!Location.isValid()) { @@ -575,9 +580,17 @@ void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, return; } - if (!Context.getOptions().SystemHeaders.value_or(false) && - (Sources.isInSystemHeader(Location) || Sources.isInSystemMacro(Location))) - return; + if (!Context.getOptions().SystemHeaders.value_or(false)) { + if (Context.isCompilerDiagnostic(DiagnosticID)) { + if (Context.DiagEngine->getDiagnosticIDs()->shouldSuppressAsSystemWarning( + DiagnosticID, Location, *Context.DiagEngine)) + return; + } else { + if (Sources.isInSystemHeader(Location) || + Sources.isInSystemMacro(Location)) + return; + } + } // FIXME: We start with a conservative approach here, but the actual type of // location needed depends on the check (in particular, where this check wants diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h index 8de5778dfefb0..c76f58bc4cc86 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h @@ -146,6 +146,10 @@ class ClangTidyContext { /// diagnostic ID. std::string getCheckName(unsigned DiagnosticID) const; + /// Returns true if this clang-tidy check is in fact a compiler warning + /// exposed as a 'clang-diagnostic-*' check. + bool isCompilerDiagnostic(unsigned DiagnosticID) const; + /// Returns \c true if the check is enabled for the \c CurrentFile. /// /// The \c CurrentFile can be changed using \c setCurrentFile. @@ -319,8 +323,9 @@ class ClangTidyDiagnosticConsumer : public DiagnosticConsumer { llvm::Regex *getExcludeHeaderFilter(); /// Updates \c LastErrorRelatesToUserCode and LastErrorPassesLineFilter - /// according to the diagnostic \p Location. - void checkFilters(SourceLocation Location, const SourceManager &Sources); + /// according to the diagnostic kind \p DiagnosticID and the \p Location. 
+ void checkFilters(SourceLocation Location, unsigned DiagnosticID, + const SourceManager &Sources); bool passesLineFilter(StringRef FileName, unsigned LineNumber) const; void forwardDiagnostic(const Diagnostic &Info); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 241ae52cffdd2..1b469cad4eb9c 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -163,6 +163,13 @@ Improvements to clang-tidy - Improved :program:`clang-tidy` ``-store-check-profile`` by generating valid JSON when the source file path contains characters that require JSON escaping. +- Ensured that :program:`clang-tidy` and the clang compiler uses the same logic + for the suppression of compiler diagnostics in system headers and expansions + of macros defined in system headers. Previously the default setting of tidy + overzealously suppressed some diagnostics that would have been emitted by the + compiler. (E.g. tidy suppressed many ``clang-diagnostic-invalid-offsetof`` + reports because they usually occur in expansion of the macro ``offsetof``.) 
+ New checks ^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/system-headers/mock_cstddef.h b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/system-headers/mock_cstddef.h new file mode 100644 index 0000000000000..641e3e7f072ea --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/system-headers/mock_cstddef.h @@ -0,0 +1 @@ +#define offsetof(t, d) __builtin_offsetof(t, d) diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/system-macro-diagnostic.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/system-macro-diagnostic.cpp new file mode 100644 index 0000000000000..ac098b3449bce --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/system-macro-diagnostic.cpp @@ -0,0 +1,28 @@ +// RUN: clang-tidy -checks='-*,clang-diagnostic-invalid-offsetof,concurrency-mt-unsafe' -header-filter='.*' -system-headers=true %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-SYSTEM-HEADERS %s +// RUN: clang-tidy -checks='-*,clang-diagnostic-invalid-offsetof,concurrency-mt-unsafe' -header-filter='.*' -system-headers=false %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-NO-SYSTEM-HEADERS %s + +// Validate that we don't get the diagnostics when the 'clang-diagnostic-*' +// check is disabled: +// RUN: clang-tidy -checks='-*,concurrency-mt-unsafe' -header-filter='.*' -system-headers=true %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -implicit-check-not='{{warning:|error:}}' %s + +// FIXME: The check 'concurrency-mt-unsafe' is completely unrelated to this +// test, it is only added to the RUN lines because Clang-Tidy aborts the +// analysis with "Error: no checks enabled." if all the enabled checks are +// 'clang-diagnostic-*' checks (i.e. compiler warnings). +// Once GH#192713 is resolved, remove 'concurrency-mt-unsafe'. 
+ +#include + +struct D { + virtual void f() {} + virtual ~D() {} + int i; +}; + +int main() { + // Previously Clang-Tidy was suppressing this -Winvalid-offsetof report + // because the error location is in a system macro (namely, 'offsetof'). + (void) offsetof(D, i); + // CHECK-SYSTEM-HEADERS: :[[@LINE-1]]:10: warning: 'offsetof' on non-standard-layout type 'D' [clang-diagnostic-invalid-offsetof] + // CHECK-NO-SYSTEM-HEADERS: :[[@LINE-2]]:10: warning: 'offsetof' on non-standard-layout type 'D' [clang-diagnostic-invalid-offsetof] +} diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 09e2d12dd040e..47f63933e1e2f 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -488,6 +488,15 @@ class DiagnosticIDs : public RefCountedBase { static unsigned getCXXCompatDiagId(const LangOptions &LangOpts, unsigned CompatDiagId); + /// Return true if either of the following two conditions hold: + /// 1. \p Loc is in a system header and the diagnostic kind \p DiagID does + /// not have the property 'ShowInSystemHeader'. + /// 2. \p Loc is in the expansion of a macro defined in a system header and + /// the diagnostic kind \p DiagID does not have the property + /// 'ShowInSystemMacro'. + bool shouldSuppressAsSystemWarning(unsigned DiagID, SourceLocation Loc, + const DiagnosticsEngine &Diag) const; + private: /// Classify the specified diagnostic ID into a Level, consumable by /// the DiagnosticClient. diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index 0aa543c8bcb1d..d9c5e4082c1a7 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -620,18 +620,33 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, if (!Diag.hasSourceManager()) return Result; - const auto &SM = Diag.getSourceManager(); - // If we are in a system header, we ignore it. 
We look at the diagnostic class - // because we also want to ignore extensions and warnings in -Werror and - // -pedantic-errors modes, which *map* warnings/extensions to errors. - // // We check both the location-specific state and the ForceSystemWarnings // override. In some cases (like template instantiations from system modules), // the location-specific state might have suppression enabled, but the // engine might have an override (e.g. AllowWarningInSystemHeaders) to show // the warning. if (State->SuppressSystemWarnings && !Diag.getForceSystemWarnings() && - Loc.isValid() && SM.isInSystemHeader(SM.getExpansionLoc(Loc))) { + shouldSuppressAsSystemWarning(DiagID, Loc, Diag)) { + return diag::Severity::Ignored; + } + + // Clang-diagnostics pragmas always take precedence over suppression mapping. + if (!Mapping.isPragma() && Diag.isSuppressedViaMapping(DiagID, Loc)) + return diag::Severity::Ignored; + + return Result; +} + +bool DiagnosticIDs::shouldSuppressAsSystemWarning( + unsigned DiagID, SourceLocation Loc, const DiagnosticsEngine &Diag) const { + if (!Loc.isValid()) + return false; + + bool IsCustomDiag = DiagnosticIDs::IsCustomDiag(DiagID); + const auto &SM = Diag.getSourceManager(); + + // If we are in a system header, we ignore it. + if (SM.isInSystemHeader(SM.getExpansionLoc(Loc))) { bool ShowInSystemHeader = true; if (IsCustomDiag) ShowInSystemHeader = @@ -640,25 +655,22 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, ShowInSystemHeader = Rec->WarnShowInSystemHeader; if (!ShowInSystemHeader) - return diag::Severity::Ignored; + return true; } - // We also ignore warnings due to system macros. As above, we respect the - // ForceSystemWarnings override. - if (State->SuppressSystemWarnings && !Diag.getForceSystemWarnings() && - Loc.isValid()) { - + // We also ignore warnings due to system macros. 
+ if (Loc.isValid()) { bool ShowInSystemMacro = true; + + // FIXME: Respect the "show in system macro" information in the + // CustomDiagInfo (which is currently ignored). + if (const StaticDiagInfoRec *Rec = GetDiagInfo(DiagID)) ShowInSystemMacro = Rec->WarnShowInSystemMacro; if (!ShowInSystemMacro && SM.isInSystemMacro(Loc)) - return diag::Severity::Ignored; + return true; } - // Clang-diagnostics pragmas always take precedence over suppression mapping. - if (!Mapping.isPragma() && Diag.isSuppressedViaMapping(DiagID, Loc)) - return diag::Severity::Ignored; - - return Result; + return false; } DiagnosticIDs::Class DiagnosticIDs::getDiagClass(unsigned DiagID) const { From e40ce9ab4dfc9e78e2f682e0b42caf65dc23f66f Mon Sep 17 00:00:00 2001 From: Andrei Sabalenka Date: Mon, 11 May 2026 20:04:49 +0300 Subject: [PATCH 310/538] [clang][AST] Treat unsigned _BitInt(1) as boolean-valued (#195793) Update `Expr::isKnownToHaveBooleanValue` to recognize `unsigned _BitInt(1)` when semantic boolean-ness is not required. --- clang/lib/AST/Expr.cpp | 5 ++++ clang/unittests/AST/ASTExprTest.cpp | 46 +++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 64d61dbc3d128..eb452f5da6787 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -142,6 +142,11 @@ bool Expr::isKnownToHaveBooleanValue(bool Semantic) const { // If this is a non-scalar-integer type, we don't care enough to try. 
if (!E->getType()->isIntegralOrEnumerationType()) return false; + if (!Semantic) + if (const auto *BIT = E->getType()->getAs(); + BIT && BIT->isUnsigned() && BIT->getNumBits() == 1) + return true; + if (const UnaryOperator *UO = dyn_cast(E)) { switch (UO->getOpcode()) { case UO_Plus: diff --git a/clang/unittests/AST/ASTExprTest.cpp b/clang/unittests/AST/ASTExprTest.cpp index adaceb76de8b4..6e5683c68eb4d 100644 --- a/clang/unittests/AST/ASTExprTest.cpp +++ b/clang/unittests/AST/ASTExprTest.cpp @@ -108,3 +108,49 @@ TEST(ASTExpr, InitListIsConstantInitialized) { (void)FooInit->updateInit(Ctx, 2, Ref); EXPECT_FALSE(FooInit->isConstantInitializer(Ctx, false)); } + +TEST(ASTExpr, IsKnownToHaveBooleanValue) { + auto AST = tooling::buildASTFromCodeWithArgs( + R"c( + struct S { + int signed_bf1 : 1; + unsigned unsigned_bf1 : 1; + unsigned unsigned_bf2 : 2; + }; + + _Bool bool_value; + int int_value; + unsigned _BitInt(1) unsigned_bitint1; + unsigned _BitInt(2) unsigned_bitint2; + struct S s; + + void f(void) { + int from_bool = bool_value; + int from_int = int_value; + int from_signed_bitfield1 = s.signed_bf1; + int from_bitfield1 = s.unsigned_bf1; + int from_bitfield2 = s.unsigned_bf2; + int from_bitint1 = unsigned_bitint1; + int from_bitint2 = unsigned_bitint2; + } + )c", + {"-std=c23"}, "input.c"); + ASSERT_TRUE(AST); + + auto ExpectKnown = [&](const char *Name, bool Semantic, bool NonSemantic) { + const VarDecl *VD = getVariableNode(AST.get(), Name); + ASSERT_NE(VD, nullptr); + ASSERT_TRUE(VD->hasInit()); + const Expr *Init = VD->getInit(); + EXPECT_EQ(Semantic, Init->isKnownToHaveBooleanValue(true)) << Name; + EXPECT_EQ(NonSemantic, Init->isKnownToHaveBooleanValue(false)) << Name; + }; + + ExpectKnown("from_bool", true, true); + ExpectKnown("from_int", false, false); + ExpectKnown("from_signed_bitfield1", false, false); + ExpectKnown("from_bitfield1", false, true); + ExpectKnown("from_bitfield2", false, false); + ExpectKnown("from_bitint1", false, true); + 
ExpectKnown("from_bitint2", false, false); +} From 70fd1fc6644cd21ef2ade71a178ec9b076c9d6b7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 11 May 2026 18:06:07 +0100 Subject: [PATCH 311/538] [X86] ctpop-combine.ll - add initial test coverage for #196434 (#196990) --- llvm/test/CodeGen/X86/ctpop-combine.ll | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll index 73152e9f909cf..06937af137968 100644 --- a/llvm/test/CodeGen/X86/ctpop-combine.ll +++ b/llvm/test/CodeGen/X86/ctpop-combine.ll @@ -102,6 +102,44 @@ define i8 @test4(i8 %x) nounwind readnone { ret i8 %and } +; PR196434 +define i32 @ctpop_i64_i32_bounds(i64 %x) nounwind readnone { +; POPCOUNT-LABEL: ctpop_i64_i32_bounds: +; POPCOUNT: # %bb.0: +; POPCOUNT-NEXT: popcntq %rdi, %rax +; POPCOUNT-NEXT: # kill: def $eax killed $eax killed $rax +; POPCOUNT-NEXT: retq +; +; NO-POPCOUNT-LABEL: ctpop_i64_i32_bounds: +; NO-POPCOUNT: # %bb.0: +; NO-POPCOUNT-NEXT: movq %rdi, %rax +; NO-POPCOUNT-NEXT: shrq %rax +; NO-POPCOUNT-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; NO-POPCOUNT-NEXT: andq %rax, %rcx +; NO-POPCOUNT-NEXT: subq %rcx, %rdi +; NO-POPCOUNT-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; NO-POPCOUNT-NEXT: movq %rdi, %rcx +; NO-POPCOUNT-NEXT: andq %rax, %rcx +; NO-POPCOUNT-NEXT: shrq $2, %rdi +; NO-POPCOUNT-NEXT: andq %rdi, %rax +; NO-POPCOUNT-NEXT: addq %rcx, %rax +; NO-POPCOUNT-NEXT: movq %rax, %rcx +; NO-POPCOUNT-NEXT: shrq $4, %rcx +; NO-POPCOUNT-NEXT: addq %rax, %rcx +; NO-POPCOUNT-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; NO-POPCOUNT-NEXT: andq %rcx, %rdx +; NO-POPCOUNT-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; NO-POPCOUNT-NEXT: imulq %rdx, %rax +; NO-POPCOUNT-NEXT: shrq $56, %rax +; NO-POPCOUNT-NEXT: # kill: def $eax killed $eax killed $rax +; NO-POPCOUNT-NEXT: retq + %cmp = icmp ult i64 %x, 
4294967296 + tail call void @llvm.assume(i1 %cmp) + %count = tail call range(i64 0, 33) i64 @llvm.ctpop.i64(i64 %x) + %cast = trunc nuw nsw i64 %count to i32 + ret i32 %cast +} + define i32 @ctpop_eq_one(i64 %x) nounwind readnone { ; POPCOUNT-LABEL: ctpop_eq_one: ; POPCOUNT: # %bb.0: From c7818b2302df620c024c3e8b78e522742e0cc907 Mon Sep 17 00:00:00 2001 From: Sean Fertile Date: Mon, 11 May 2026 13:07:32 -0400 Subject: [PATCH 312/538] XCOFF no inline ptr glue (#193786) Adds an option for out of line indirect calls. `-maix-use-ptrgl` enables the option, and when enabled indirect calls will load the address of the function descriptor of the callee into r11, keeping all other arguments the same and do a direct branch to `._ptrgl` which contains the indirect call sequence code. The call is followed by a toc restore since ptrglue does a tail call and cannot restore the toc itself. The negative option `-mno-aix-use-ptrgl` is also added to revert back to the default behaviour of emitting the indirect call sequence inline at the call site. 
--------- Co-authored-by: Hubert Tong --- clang/include/clang/Options/Options.td | 6 ++ clang/lib/Driver/ToolChains/Arch/PPC.cpp | 9 ++ clang/test/CodeGen/PowerPC/aix-ptr-glue.c | 18 ++++ clang/test/Driver/ppc-use-ptr-glue.c | 21 +++++ .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 4 + llvm/lib/Target/PowerPC/P10InstrResources.td | 10 +++ llvm/lib/Target/PowerPC/P9InstrResources.td | 9 ++ llvm/lib/Target/PowerPC/PPC.td | 6 ++ llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 53 +++++++++-- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 28 ++++++ llvm/lib/Target/PowerPC/PPCInstrInfo.td | 37 ++++++++ llvm/lib/Target/PowerPC/PPCScheduleP7.td | 8 ++ llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 4 + llvm/lib/Target/PowerPC/PPCSubtarget.h | 4 + .../CodeGen/PowerPC/aix-mixed-inline-glue.ll | 31 +++++++ .../PowerPC/aix-no-inline-glue-strictfp.ll | 18 ++++ .../CodeGen/PowerPC/aix-no-inline-glue.ll | 87 +++++++++++++++++++ 18 files changed, 350 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/PowerPC/aix-ptr-glue.c create mode 100644 clang/test/Driver/ppc-use-ptr-glue.c create mode 100644 llvm/test/CodeGen/PowerPC/aix-mixed-inline-glue.ll create mode 100644 llvm/test/CodeGen/PowerPC/aix-no-inline-glue-strictfp.ll create mode 100644 llvm/test/CodeGen/PowerPC/aix-no-inline-glue.ll diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 5eeabf4c33b76..5fdc60adc8986 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5976,6 +5976,12 @@ def mxcoff_roptr : Flag<["-"], "mxcoff-roptr">, Group, Flags<[TargetSpecific]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Place constant objects with relocatable address values in the RO data section and add -bforceimprw to the linker flags (AIX only)">; def mno_xcoff_roptr : Flag<["-"], "mno-xcoff-roptr">, Group, TargetSpecific; +def maix_use_ptrgl : Flag<["-"], "maix-use-ptrgl">, Group, + 
Flags<[TargetSpecific]>, + HelpText<"Use ._ptrgl routine for indirect calls (AIX only)">; +def mnoaix_use_ptrgl : Flag<["-"], "mno-aix-use-ptrgl">, Group, TargetSpecific, + HelpText<"Emit indirect calls inline (AIX only) (default)">; + let Flags = [TargetSpecific] in { def mvx : Flag<["-"], "mvx">, Group; diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 17051980f34fb..d060d8ae2fac8 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -83,6 +83,15 @@ void ppc::getPPCTargetFeatures(const Driver &D, const llvm::Triple &Triple, true) && Triple.isOSAIX()) Features.push_back("+modern-aix-as"); + + if (Arg *A = Args.getLastArg(options::OPT_mnoaix_use_ptrgl, + options::OPT_maix_use_ptrgl)) { + if (!Triple.isOSAIX()) + D.Diag(diag::err_drv_unsupported_opt_for_target) + << A->getAsString(Args) << Triple.str(); + else if (A->getOption().matches(options::OPT_maix_use_ptrgl)) + Features.push_back("+use-ptrgl-helper"); + } } ppc::ReadGOTPtrMode ppc::getPPCReadGOTPtrMode(const Driver &D, const llvm::Triple &Triple, diff --git a/clang/test/CodeGen/PowerPC/aix-ptr-glue.c b/clang/test/CodeGen/PowerPC/aix-ptr-glue.c new file mode 100644 index 0000000000000..de9b847550a36 --- /dev/null +++ b/clang/test/CodeGen/PowerPC/aix-ptr-glue.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix -target-feature +use-ptrgl-helper \ +// RUN: %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -target-feature +use-ptrgl-helper \ +// RUN: %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -triple powerpc-unknown-aix -emit-llvm -o - | \ +// RUN: FileCheck %s --check-prefix=DIS + +int test(void) { + return 0; +} + +// CHECK: test() #0 { +// CHECK: attributes #0 = { +// CHECK-SAME: "target-features"={{"|"[^"]*,}}+use-ptrgl-helper{{"|,[^"]*"}} + +// DIS-NOT: +use-ptrgl-helper diff --git a/clang/test/Driver/ppc-use-ptr-glue.c 
b/clang/test/Driver/ppc-use-ptr-glue.c new file mode 100644 index 0000000000000..01a1ccdc036f6 --- /dev/null +++ b/clang/test/Driver/ppc-use-ptr-glue.c @@ -0,0 +1,21 @@ +// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -maix-use-ptrgl %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=PTR_GLUE + +// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -maix-use-ptrgl %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=PTR_GLUE + +// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mno-aix-use-ptrgl %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=INLINE_GLUE + +// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=INLINE_GLUE + +// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -maix-use-ptrgl -mno-aix-use-ptrgl %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=INLINE_GLUE + +// RUN: not %clang -### --target=powerpc64le-unknown-linux-gnu -maix-use-ptrgl %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=ERR + +// PTR_GLUE: "-target-feature" "+use-ptrgl-helper" +// INLINE_GLUE-NOT: "+use-ptrgl-helper" +// ERR: error: unsupported option '-maix-use-ptrgl' for target 'powerpc64le-unknown-linux-gnu' diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index b28304b07e1a3..64427e97f729c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -154,6 +154,10 @@ bool PPCMCCodeEmitter::isNoTOCCallInstr(const MCInst &MI) const { case PPC::TCRETURNri: case PPC::BCTRL_LWZinto_toc: case PPC::BCTRL_LWZinto_toc_RM: + case PPC::BL_LWZinto_toc: + case PPC::BL_LWZinto_toc_RM: + case PPC::BL8_LDinto_toc: + case PPC::BL8_LDinto_toc_RM: case PPC::TAILBCTR: case PPC::TAILB: case PPC::TAILBA: diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index 842174239cc4c..91c23622c99cd 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td 
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -1233,6 +1233,16 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_PAIR, P10W_FX_3C, P10FX_Read], MTCRF, MTCRF8 )>; +// 2 Cycle Branch operation, 1 input operand followed by a +// 6 cycle Load operation, 0 input operands. +def : InstRW<[P10W_BR_2C, P10W_DISP_ANY, P10BR_Read, P10W_LD_6C, P10W_DISP_ANY], + (instrs + BL_LWZinto_toc, + BL_LWZinto_toc_RM, + BL8_LDinto_toc, + BL8_LDinto_toc_RM +)>; + // 6 Cycles Load operations, 0 input operands def : InstRW<[P10W_LD_6C, P10W_DISP_ANY], (instrs diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index 106faf1f8e8d2..b72671eefc7fd 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -1296,6 +1296,15 @@ def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], // Branch Instructions +// Pseduo instruction that encapsulates a branch and a toc load. +def : InstRW<[P9_BR_2C, DISP_BR_1C, P9_LS_4C, IP_AGEN_1C, DISP_1C], + (instrs + BL_LWZinto_toc, + BL_LWZinto_toc_RM, + BL8_LDinto_toc, + BL8_LDinto_toc_RM +)>; + // Two Cycle Branch def : InstRW<[P9_BR_2C, DISP_BR_1C], (instrs diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 7b4bae60f7e74..2beeeeaf785fa 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -362,6 +362,12 @@ def FeaturePredictableSelectIsExpensive : def FeatureFastMFLR : SubtargetFeature<"fast-MFLR", "HasFastMFLR", "true", "MFLR is a fast instruction">; +// When enabled indirect calls will place the address of the descriptor +// into r11 and do a direct branch to the ._ptrgl routine. +def FeaturePointerGlueHelper : + SubtargetFeature<"use-ptrgl-helper", "UsePointerGlueHelper", "true", + "Use ._ptrgl for indirect calls">; + //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. 
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8df059e34c626..3720724095c32 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -3235,7 +3235,11 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::BL8: case PPC::BL: case PPC::BL8_NOP: - case PPC::BL_NOP: { + case PPC::BL_NOP: + case PPC::BL_LWZinto_toc: + case PPC::BL_LWZinto_toc_RM: + case PPC::BL8_LDinto_toc: + case PPC::BL8_LDinto_toc_RM: { const MachineOperand &MO = MI->getOperand(0); if (MO.isSymbol()) { auto *S = static_cast( diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index e959100d713dd..0f43555b6bca4 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -5363,8 +5363,11 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags, // immediately followed by a load of the TOC pointer from the stack save // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC // as it is not saved or used. - RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC - : PPCISD::BCTRL; + if (Subtarget.usePointerGlueHelper()) + RetOpc = PPCISD::BL_LOAD_TOC; + else + RetOpc = isTOCSaveRestoreRequired(Subtarget) ? 
PPCISD::BCTRL_LOAD_TOC + : PPCISD::BCTRL; } else if (Subtarget.isUsingPCRelativeCalls()) { assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI."); RetOpc = PPCISD::CALL_NOTOC; @@ -5393,6 +5396,9 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags, case PPCISD::BCTRL: RetOpc = PPCISD::BCTRL_RM; break; + case PPCISD::BL_LOAD_TOC: + RetOpc = PPCISD::BL_LOAD_TOC_RM; + break; case PPCISD::CALL_NOTOC: RetOpc = PPCISD::CALL_NOTOC_RM; break; @@ -5604,6 +5610,23 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee, prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl); } +static void prepareOutOfLineGlueCall(SelectionDAG &DAG, SDValue &Callee, + SDValue &Glue, SDValue &Chain, + SDValue CallSeqStart, const CallBase *CB, + const SDLoc &dl, bool hasNest, + const PPCSubtarget &Subtarget) { + // On AIX there is a feature ("out of line glue code") which uses a special + // trampoline function ._ptrgl to do the indirect call. If this option is + // enabled we instead simply load the address of the descriptor into gpr11, + // with the arguments in the 'normal' registers and branch to the ._ptrgl + // stub. + const MCRegister PtrGlueReg = Subtarget.getGlueCodeDescriptorRegister(); + SDValue MoveToPhysicalReg = + DAG.getCopyToReg(Chain, dl, PtrGlueReg, Callee, Glue); + Chain = MoveToPhysicalReg.getValue(0); + Glue = MoveToPhysicalReg.getValue(1); +} + static void buildCallOperands(SmallVectorImpl &Ops, PPCTargetLowering::CallFlags CFlags, const SDLoc &dl, @@ -5621,7 +5644,12 @@ buildCallOperands(SmallVectorImpl &Ops, // If it's a direct call pass the callee as the second operand. if (!CFlags.IsIndirect) Ops.push_back(Callee); - else { + else if (Subtarget.usePointerGlueHelper()) { + Ops.push_back(Callee); + // Add the register used to pass the descriptor address. 
+ Ops.push_back( + DAG.getRegister(Subtarget.getGlueCodeDescriptorRegister(), RegVT)); + } else { assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect."); // For the TOC based ABIs, we have saved the TOC pointer to the linkage area @@ -5703,11 +5731,20 @@ SDValue PPCTargetLowering::FinishCall( if (!CFlags.IsIndirect) Callee = transformCallee(Callee, DAG, dl, Subtarget); - else if (Subtarget.usesFunctionDescriptors()) - prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB, - dl, CFlags.HasNest, Subtarget); - else + else if (Subtarget.usesFunctionDescriptors()) { + if (Subtarget.usePointerGlueHelper()) { + prepareOutOfLineGlueCall(DAG, Callee, Glue, Chain, CallSeqStart, CB, dl, + CFlags.HasNest, Subtarget); + SDValue PtrGlueCallee = + DAG.getExternalSymbol("_ptrgl", getPointerTy(DAG.getDataLayout())); + Callee = transformCallee(PtrGlueCallee, DAG, dl, Subtarget); + } else { + prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB, + dl, CFlags.HasNest, Subtarget); + } + } else { prepareIndirectCall(DAG, Callee, Glue, Chain, dl); + } // Build the operand list for the call instruction. SmallVector Ops; @@ -7751,7 +7788,7 @@ SDValue PPCTargetLowering::LowerCall_AIX( // For indirect calls, we need to save the TOC base to the stack for // restoration after the call. 
- if (CFlags.IsIndirect) { + if (CFlags.IsIndirect && !Subtarget.usePointerGlueHelper()) { assert(!CFlags.IsTailCall && "Indirect tail-calls not supported."); const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister(); const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index cd4bb9f9a21bb..3c130077f3988 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -207,6 +207,26 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, Requires<[IsPPC64]>; } +let Predicates = [IsAIX] in { + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR8, X2], Uses = [RM, X1], RST = 2, RA = 1, D = 40 in { + def BL8_LDinto_toc : IForm_and_DForm_1<18, 0, 1, 58, + (outs), (ins calltarget:$LI), + "bl $LI\n\tld 2, 40(1)", IIC_BrB, + []>, + Requires<[IsPPC64]>; + } + + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR8, X2, RM], Uses = [RM, X1], RST = 2, RA = 1, D = 40 in { + def BL8_LDinto_toc_RM : IForm_and_DForm_1<18, 0, 1, 58, + (outs), (ins calltarget:$LI), + "bl $LI\n\tld 2, 40(1)", IIC_BrB, + []>, + Requires<[IsPPC64]>; + } +} + } // Interpretation64Bit // FIXME: Duplicating this for the asm parser should be unnecessary, but the @@ -259,6 +279,14 @@ def : Pat<(PPCcall_rm (i64 mcsym:$dst)), def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)), (BL8_NOP_RM mcsym:$dst)>; +let Predicates = [IsAIX] in { + def : Pat<(PPCbl_load_toc (i64 texternalsym:$dst)), + (BL8_LDinto_toc texternalsym:$dst)>; + + def : Pat<(PPCbl_load_toc_rm (i64 texternalsym:$dst)), + (BL8_LDinto_toc_RM texternalsym:$dst)>; +} + // Atomic operations // FIXME: some of these might be used with constant operands. This will result // in constant materialization instructions that may be redundant. 
We currently diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index ad976cb317c2c..eb4099b532336 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -554,6 +554,11 @@ def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def PPCbl_load_toc : SDNode<"PPCISD::BL_LOAD_TOC", + SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + // The variants that implicitly define rounding mode for calls with // strictfp semantics. def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, @@ -572,6 +577,10 @@ def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", SDTypeProfile<0, 1, []>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def PPCbl_load_toc_rm : SDNode<"PPCISD::BL_LOAD_TOC_RM", + SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; // Return with a glue operand, matched by 'blr' def PPCretglue : SDNode<"PPCISD::RET_GLUE", SDTNone, @@ -1862,6 +1871,25 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, } +let Predicates = [IsAIX] in { + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2], Uses = [RM, R1], RST = 2, RA = 1, D = 20 in { + def BL_LWZinto_toc : IForm_and_DForm_1<18, 0, 1, 32, + (outs), (ins calltarget:$LI), + "bl $LI\n\tlwz 2, 20(1)", IIC_BrB, + []>, Requires<[IsPPC32]>; + } + + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2, RM], Uses = [RM, R1], RST = 2, RA = 1, D = 20 in { + def BL_LWZinto_toc_RM : + IForm_and_DForm_1<18, 0, 1, 32, + (outs), (ins calltarget:$LI), + "bl $LI\n\tlwz 2, 20(1)", IIC_BrB, + []>, Requires<[IsPPC32]>; + } +} + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, Defs = [LR, R2, RM], Uses = [CTR, RM], RST = 2 in { def BCTRL_LWZinto_toc_RM: @@ -3395,6 +3423,14 @@ def : Pat<(PPCcall_nop (i32 mcsym:$dst)), def : Pat<(PPCcall_nop (i32 
texternalsym:$dst)), (BL_NOP texternalsym:$dst)>; +let Predicates = [IsAIX] in { + def : Pat<(PPCbl_load_toc (i32 texternalsym:$dst)), + (BL_LWZinto_toc texternalsym:$dst)>; + + def : Pat<(PPCbl_load_toc_rm (i32 texternalsym:$dst)), + (BL_LWZinto_toc_RM texternalsym:$dst)>; +} + def : Pat<(PPCcall_rm (i32 mcsym:$dst)), (BL_RM mcsym:$dst)>; @@ -3404,6 +3440,7 @@ def : Pat<(PPCcall_nop_rm (i32 mcsym:$dst)), def : Pat<(PPCcall_nop_rm (i32 texternalsym:$dst)), (BL_NOP_RM texternalsym:$dst)>; + def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/llvm/lib/Target/PowerPC/PPCScheduleP7.td index bf7f2f7a9c999..ed8fadaaef74c 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP7.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP7.td @@ -115,6 +115,14 @@ let SchedModel = P7Model in { def : InstRW<[P7_BRU_NONE, P7_DISP_BR], (instregex "^B(L)?(A)?(8)?(_NOP|_NOTOC)?(_TLS|_RM)?(_)?$")>; + def : InstRW<[P7_BRU_NONE, P7_DISP_BR, P7_LSU_2C, P7_DISP_LS], + (instrs + BL_LWZinto_toc, + BL_LWZinto_toc_RM, + BL8_LDinto_toc, + BL8_LDinto_toc_RM + )>; + def : InstRW<[P7_BRU_3C, P7_DISP_BR], (instrs BDZLRLp, BDZLRm, BDZLRp, BDZLm, BDZLp, BDZm, BDZp, BDNZ, BDNZ8, BDNZA, BDNZAm, BDNZAp, BDNZL, BDNZLA, BDNZLAm, BDNZLAp, BDNZLR, diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 85e022a2ba6fc..b31ac006ce3a7 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -144,6 +144,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, report_fatal_error("The aix-shared-lib-tls-model-opt attribute " "is only supported on AIX in 64-bit mode.\n", false); + + if (UsePointerGlueHelper && !getTargetTriple().isOSAIX()) + report_fatal_error("use-ptrgl-helper feature is only supported on AIX\n", + false); } bool PPCSubtarget::enableMachineScheduler() const { return true; } diff --git 
a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 7d933588025fe..eec0e141debd4 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -286,6 +286,10 @@ class PPCSubtarget : public PPCGenSubtargetInfo { return IsPPC64 ? PPC::X1 : PPC::R1; } + MCRegister getGlueCodeDescriptorRegister() const { + return IsPPC64 ? PPC::X11 : PPC::R11; + } + bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } bool isPredictableSelectIsExpensive() const { diff --git a/llvm/test/CodeGen/PowerPC/aix-mixed-inline-glue.ll b/llvm/test/CodeGen/PowerPC/aix-mixed-inline-glue.ll new file mode 100644 index 0000000000000..1c4e41409e752 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-mixed-inline-glue.ll @@ -0,0 +1,31 @@ +; RUN: llc -stop-after=finalize-isel -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck %s + +; RUN: llc -stop-after=finalize-isel -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK64 + +define i32 @OutOfLine(ptr noundef readonly captures(none) %fp) #0 { +entry: + %call = tail call i32 %fp() + ret i32 %call +} + +define i32 @InLine(ptr noundef readonly captures(none) %fp) #1 { +entry: + %call = tail call i32 %fp() + ret i32 %call +} + +attributes #0 = {"target-features"="+use-ptrgl-helper"} +attributes #1 = {"target-features"="-use-ptrgl-helper"} + +; CHECK: name: OutOfLine +; CHECK: BL_LWZinto_toc &"._ptrgl[PR]", csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit $rm, implicit $r1, implicit $r11, implicit $r2, implicit-def $r1, implicit-def $r3 +; CHECK: name: InLine +; CHECK: BCTRL_LWZinto_toc 20, $r1, csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit $ctr, implicit $rm, implicit $r11, implicit $r2, implicit-def $r1, implicit-def $r3 + +; CHECK64: name: OutOfLine +; CHECK64: BL8_LDinto_toc &"._ptrgl[PR]", csr_ppc64, implicit-def 
dead $lr8, implicit-def dead $x2, implicit $rm, implicit $x1, implicit $x11, implicit $x2, implicit-def $r1, implicit-def $x3 +; CHECK64: name: InLine +; CHECK64: BCTRL8_LDinto_toc 40, $x1, csr_ppc64, implicit-def dead $lr8, implicit-def dead $x2, implicit $ctr8, implicit $rm, implicit $x11, implicit $x2, implicit-def $r1, implicit-def $x3 diff --git a/llvm/test/CodeGen/PowerPC/aix-no-inline-glue-strictfp.ll b/llvm/test/CodeGen/PowerPC/aix-no-inline-glue-strictfp.ll new file mode 100644 index 0000000000000..dd0e88431d9c2 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-no-inline-glue-strictfp.ll @@ -0,0 +1,18 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -stop-after=finalize-isel -mattr=+use-ptrgl-helper < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-ibm-aix-xcoff \ +; RUN: -stop-after=finalize-isel -mattr=+use-ptrgl-helper < %s | \ +; RUN: FileCheck --check-prefix=CHECK64 %s + +define i32 @has_strictfp(ptr noundef readonly captures(none) %fp) #0 { +entry: + %call = tail call i32 %fp() strictfp + ret i32 %call +} + +attributes #0 = { strictfp } + +; CHECK: BL_LWZinto_toc_RM &"._ptrgl[PR]", csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit-def dead $rm, implicit $rm, implicit $r1, implicit $r11, implicit $r2, implicit-def $r1, implicit-def $r3 + +; CHECK64: BL8_LDinto_toc_RM &"._ptrgl[PR]", csr_ppc64, implicit-def dead $lr8, implicit-def dead $x2, implicit-def dead $rm, implicit $rm, implicit $x1, implicit $x11, implicit $x2, implicit-def $r1, implicit-def $x3 diff --git a/llvm/test/CodeGen/PowerPC/aix-no-inline-glue.ll b/llvm/test/CodeGen/PowerPC/aix-no-inline-glue.ll new file mode 100644 index 0000000000000..088b5ab3f26d0 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-no-inline-glue.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -mattr=+use-ptrgl-helper < %s | FileCheck --check-prefixes=CHECK,CHECK32 %s + 
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-ibm-aix-xcoff \ +; RUN: -mattr=+use-ptrgl-helper < %s | FileCheck --check-prefixes=CHECK,CHECK64 %s + +; RUN: llc -stop-after=finalize-isel -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple powerpc-ibm-aix-xcoff -mattr=+use-ptrgl-helper < %s | \ +; RUN: FileCheck --check-prefix=MIR32 %s + +; RUN: llc -stop-after=finalize-isel -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff -mattr=+use-ptrgl-helper < %s | \ +; RUN: FileCheck --check-prefix=MIR64 %s + +; RUN: not llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-unknown-linux \ +; RUN: -mattr=+use-ptrgl-helper 2>&1 < %s | FileCheck --check-prefix=ERROR %s + +; ERROR: use-ptrgl-helper feature is only supported on AIX + +@a = dso_local global i32 55, align 4 +@d = dso_local global double 3.141590e+00, align 8 +@fp = dso_local global ptr null, align 8 + +define i32 @caller1(ptr noundef readonly captures(none) %fp) { +entry: + %call = tail call i32 %fp(i32 signext 1, i32 signext 2, i32 signext 3) + ret i32 %call +} + +; CHECK-LABEL: .caller1 +; CHECK-DAG: mr 11, 3 +; CHECK-DAG: li 3, 1 +; CHECK-DAG: li 4, 2 +; CHECK-DAG: li 5, 3 +; CHECK: bl ._ptrgl[PR] +; CHECK32-NEXT: lwz 2, 20(1) +; CHECK64-NEXT: ld 2, 40(1) + +; MIR32: name: caller1 +; MIR32: %0:gprc = COPY $r3 +; MIR32: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; MIR32: $r11 = COPY %0 +; MIR32: BL_LWZinto_toc &"._ptrgl[PR]", csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit $rm, implicit $r1, implicit $r11, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3 +; MIR32: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; MIR64: name: caller1 +; MIR64: %0:g8rc = COPY $x3 +; MIR64: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; MIR64: $x11 = COPY %0 +; MIR64: BL8_LDinto_toc &"._ptrgl[PR]", csr_ppc64, implicit-def dead $lr8, implicit-def dead $x2, implicit $rm, implicit $x1, 
implicit $x11, implicit $x3, implicit $x4, implicit $x5, implicit $x2, implicit-def $r1, implicit-def $x3 +; MIR64: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define dso_local zeroext i1 @caller2() { +entry: + %0 = load ptr, ptr @fp + %1 = load i32, ptr @a + %2 = load double, ptr @d + %call = tail call zeroext i1 %0(i32 noundef signext %1, double noundef %2, ptr noundef nonnull @a) + ret i1 %call +} + +; CHECK-LABEL: .caller2 +; CHECK64: ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2) # @fp +; CHECK64: ld 11, 0([[REG]]) +; CHECK32: lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2) # @fp +; CHECK32: lwz 11, 0([[REG]]) +; CHECK: bl ._ptrgl[PR] +; CHECK32-NEXT: lwz 2, 20(1) +; CHECK64-NEXT: ld 2, 40(1) + +; MIR32: name: caller2 +; MIR32: %0:gprc_and_gprc_nor0 = LWZtoc @fp, $r2 :: (load (s32) from got) +; MIR32: %1:gprc = LWZ 0, killed %0 :: (dereferenceable load (s32) from @fp, align 8) +; MIR32: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; MIR32: $r11 = COPY %1 +; MIR32: BL_LWZinto_toc &"._ptrgl[PR]", csr_aix32, implicit-def dead $lr, implicit-def dead $r2, implicit $rm, implicit $r1, implicit $r11, implicit $r3, implicit $f1, implicit $r6, implicit $r2, implicit-def $r1, implicit-def $r3 +; MIR32: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; MIR64: name: caller2 +; MIR64: %0:g8rc_and_g8rc_nox0 = LDtoc @fp, $x2 :: (load (s64) from got) +; MIR64: %1:g8rc = LD 0, killed %0 :: (dereferenceable load (s64) from @fp) +; MIR64: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; MIR64: $x11 = COPY %1 +; MIR64: BL8_LDinto_toc &"._ptrgl[PR]", csr_ppc64, implicit-def dead $lr8, implicit-def dead $x2, implicit $rm, implicit $x1, implicit $x11, implicit $x3, implicit $f1, implicit $x5, implicit $x2, implicit-def $r1, implicit-def $x3 +; MIR64: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +; CHECK: .extern ._ptrgl[PR] From 5560a4e8777f44f390899426365b0ef3118d6d0e Mon Sep 17 00:00:00 2001 From: Lucas Ramirez 
<11032120+lucas-rami@users.noreply.github.com> Date: Mon, 11 May 2026 19:29:40 +0200 Subject: [PATCH 313/538] [AMDGPU] Fix inconsistencies in RP tracking advance/reset behavior (#196595) Some of the variants of `advance` and `reset` in the `GCNRPTracker` and `GCNDownwardRPTracker` had unclear/inconsistent semantics on their return value. This aims to clarify that through improved documentation and light functional changes. These inconsistencies ultimately triggered an assert in `GCNRPTaget::saveRP` on a complex kernel during scheduling. `GCNScheduleDAGMILive::getRealRegPressure` would incorrectly return a null pressure for a non-empty region which only had debug values. Such regions can arise if the `PreRARematStage` rematerializes all non-debug instructions out of their original region, leaving only debug values. Attempting to rematerialize registers across that same region afterwards would trigger the assert. --- .../Target/AMDGPU/AMDGPUNextUseAnalysis.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 96 +++++++++++++------ llvm/lib/Target/AMDGPU/GCNRegPressure.h | 53 +++++----- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 18 ++-- .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 8 +- .../Target/AMDGPU/GCNRegPressureTest.cpp | 51 ++++------ 6 files changed, 127 insertions(+), 101 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp index f6fe48bf5bb49..cad1513f280fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp @@ -2452,7 +2452,7 @@ void printNextUseDistancesAsJson(json::OStream &J, const MachineFunction &MF, for (const MachineInstr &MI : MBB) { // Update register pressure tracker if (!PrevMI || PrevMI->getOpcode() == AMDGPU::PHI) - RPTracker.reset(MI); + RPTracker.reset(MI, MBB.end()); RPTracker.advance(); UseDistancePair Furthest; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp 
b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 683e658aa4fb4..42a430da0acb8 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -14,6 +14,7 @@ #include "GCNRegPressure.h" #include "AMDGPU.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" @@ -520,28 +521,54 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, return LiveRegs; } -void GCNRPTracker::reset(const MachineInstr &MI, - const LiveRegSet *LiveRegsCopy, - bool After) { - const MachineFunction &MF = *MI.getMF(); - MRI = &MF.getRegInfo(); - if (LiveRegsCopy) { - if (&LiveRegs != LiveRegsCopy) - LiveRegs = *LiveRegsCopy; - } else { - LiveRegs = After ? getLiveRegsAfter(MI, LIS) - : getLiveRegsBefore(MI, LIS); +void GCNRPTracker::reset(const MachineInstr &MI, bool After) { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + if (!MI.isDebugInstr()) { + SlotIndex SI = LIS.getInstructionIndex(MI); + if (After) + SI = SI.getDeadSlot(); + reset(MRI, SI); + return; } - MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + // Look for the first valid index after the provided debug MI. + MachineBasicBlock::const_iterator It = MI.getIterator(), + MBBEnd = MI.getParent()->end(); + MachineBasicBlock::const_iterator NonDbgMI = + skipDebugInstructionsForward(It, MBBEnd); + if (NonDbgMI == MBBEnd) { + // There are no non-debug instructions between MI and the end of the + // block, so we reset the tracker at the end of the block. + reset(*MI.getParent(), /*End=*/true); + return; + } + // MI is a debug instruction, so register pressure before or after it is + // identical. Since we moved forward to find a non-debug instruction + // in the block, we reset the tracker before that instruction, i.e., at its + // base index. 
+ reset(MRI, LIS.getInstructionIndex(*NonDbgMI)); +} + +void GCNRPTracker::reset(const MachineBasicBlock &MBB, bool End) { + SlotIndex SI = End ? LIS.getSlotIndexes()->getMBBLastIdx(&MBB) + : LIS.getMBBStartIdx(&MBB); + reset(MBB.getParent()->getRegInfo(), SI); +} + +void GCNRPTracker::reset(const MachineRegisterInfo &MRI, SlotIndex SI) { + this->MRI = &MRI; + LastTrackedMI = nullptr; + LiveRegs = llvm::getLiveRegs(SI, LIS, MRI); + MaxPressure = CurPressure = getRegPressure(MRI, LiveRegs); } -void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, - const LiveRegSet &LiveRegs_) { - MRI = &MRI_; - LiveRegs = LiveRegs_; +void GCNRPTracker::reset(const MachineRegisterInfo &MRI, + const LiveRegSet &LiveRegs) { + this->MRI = &MRI; LastTrackedMI = nullptr; - MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_); + if (&this->LiveRegs != &LiveRegs) + this->LiveRegs = LiveRegs; + MaxPressure = CurPressure = getRegPressure(MRI, LiveRegs); } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp @@ -621,16 +648,25 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { // GCNDownwardRPTracker bool GCNDownwardRPTracker::reset(const MachineInstr &MI, + MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy) { - MRI = &MI.getMF()->getRegInfo(); - LastTrackedMI = nullptr; MBBEnd = MI.getParent()->end(); + assert(End == MBBEnd || + End->getParent()->end() == MBBEnd && "end unrelated to MI block"); NextMI = &MI; - NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); - if (NextMI == MBBEnd) - return false; - GCNRPTracker::reset(*NextMI, LiveRegsCopy, false); - return true; + NextMI = skipDebugInstructionsForward(NextMI, End); + + // Do not use the MI to compute live registers when a set is provided. + // Otherwise the first non-debug instruction after the provided one (or the + // end of the block, if no such instruction exists) serves as the basis to + // compute a live register set. 
+ if (LiveRegsCopy) + GCNRPTracker::reset(MI.getMF()->getRegInfo(), *LiveRegsCopy); + else if (NextMI != MBBEnd) + GCNRPTracker::reset(*NextMI, /*After=*/false); + else + GCNRPTracker::reset(*MI.getParent(), /*End=*/true); + return NextMI != End; } bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, @@ -738,15 +774,17 @@ bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) { } bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) { - while (NextMI != End) - if (!advance()) return false; - return true; + bool AnyAdvance = false; + while (NextMI != End && advance()) + AnyAdvance = true; + return AnyAdvance; } bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy) { - reset(*Begin, LiveRegsCopy); + if (!reset(*Begin, End, LiveRegsCopy)) + return false; return advance(End); } @@ -962,7 +1000,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { RPAtMBBEnd = getRegPressure(MRI, LiveIn); } else { GCNDownwardRPTracker RPT(LIS); - RPT.reset(MBB.front()); + RPT.reset(MBB.front(), MBB.end()); LiveIn = RPT.getLiveRegs(); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 00cb617a55fa7..85757eb718d46 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -331,8 +331,15 @@ class GCNRPTracker { GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} - void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, - bool After); + /// Resets tracker before or \p After the provided \p MI, which can be a debug + /// instruction. + void reset(const MachineInstr &MI, bool After); + + /// Resets tracker at the start or \p End of the \p MBB. + void reset(const MachineBasicBlock &MBB, bool End); + + /// Resets tracker at the specified slot index \p SI. 
+ void reset(const MachineRegisterInfo &MRI, SlotIndex SI); /// Mostly copy/paste from CodeGen/RegisterPressure.cpp void bumpDeadDefs(ArrayRef DeadDefs); @@ -340,8 +347,9 @@ class GCNRPTracker { LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const; public: - // reset tracker and set live register set to the specified value. - void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); + /// Resets tracker with the provided \p LiveRegs. + void reset(const MachineRegisterInfo &MRI, const LiveRegSet &LiveRegs); + // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } @@ -369,21 +377,9 @@ class GCNUpwardRPTracker : public GCNRPTracker { using GCNRPTracker::reset; - /// reset tracker at the specified slot index \p SI. - void reset(const MachineRegisterInfo &MRI, SlotIndex SI) { - GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); - } - - /// reset tracker to the end of the \p MBB. - void reset(const MachineBasicBlock &MBB) { - SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); - reset(MBB.getParent()->getRegInfo(), MBBLastSlot); - } - - /// reset tracker to the point just after \p MI (in program order). - void reset(const MachineInstr &MI) { - reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot()); - } + /// Resets tracker to the point just after \p MI (in program order), which can + /// be a debug instruction. + void reset(const MachineInstr &MI) { reset(MI, /*After=*/true); } /// Move to the state of RP just before the \p MI . If \p UseInternalIterator /// is set, also update the internal iterators. Setting \p UseInternalIterator @@ -428,10 +424,12 @@ class GCNDownwardRPTracker : public GCNRPTracker { return Res; } - /// Reset tracker to the point before the \p MI - /// filling \p LiveRegs upon this point using LIS. - /// \p returns false if block is empty except debug values. 
- bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + /// Reset tracker to the point before the \p MI filling \p LiveRegs upon this + /// point using LIS. \p End must be between the MI and the end of its parent + /// block (inclusive). \p returns false if the range [MI, End) is empty except + /// debug values. + bool reset(const MachineInstr &MI, MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegs = nullptr); /// Move to the state right before the next MI or after the end of MBB. /// \p returns false if reached end of the block. @@ -462,10 +460,15 @@ class GCNDownwardRPTracker : public GCNRPTracker { /// \p MI and use LIS for RP calculations. bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true); - /// Advance instructions until before \p End. + /// Advance instructions until before \p End using internal iterators to + /// process instructions in program order. Returns whether iterators actually + /// had to advance to reach \p End. bool advance(MachineBasicBlock::const_iterator End); - /// Reset to \p Begin and advance to \p End. + /// Reset tracker to \p Begin (filling \p LiveRegs upon this point using LIS) + /// and advance to \p End, which must be between \p Begin and the end of its + /// parent block (inclusive). \p returns false if the range [Begin, End) is + /// empty except debug values. 
bool advance(MachineBasicBlock::const_iterator Begin, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy = nullptr); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 727ae68e88bfb..8a6c1d5e21e85 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1075,7 +1075,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); if (LiveInIt != MBBLiveIns.end()) { auto LiveIn = std::move(LiveInIt->second); - RPTracker.reset(*MBB->begin(), &LiveIn); + RPTracker.reset(*MBB->begin(), MBB->end(), &LiveIn); MBBLiveIns.erase(LiveInIt); } else { I = Rgn.first; @@ -1083,7 +1083,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, #ifdef EXPENSIVE_CHECKS assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); #endif - RPTracker.reset(*I, &LRS); + RPTracker.reset(*I, I->getParent()->end(), &LRS); } for (;;) { @@ -1207,16 +1207,10 @@ void GCNScheduleDAGMILive::runSchedStages() { } if (S.useGCNTrackers()) { - GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); - GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); - GCNRPTracker::LiveRegSet *RegionLiveIns = - &LiveIns[Stage->getRegionIdx()]; - - reinterpret_cast(DownwardTracker) - ->reset(MRI, *RegionLiveIns); - reinterpret_cast(UpwardTracker) - ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( - Stage->getRegionIdx())); + const unsigned RegionIdx = Stage->getRegionIdx(); + S.getDownwardTracker()->reset(MRI, LiveIns[RegionIdx]); + S.getUpwardTracker()->reset( + MRI, RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx)); } ScheduleDAGMILive::schedule(); diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 982034189892c..905c2afa9fcd8 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -290,7 +290,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) { continue; if (!RPT.getNext().isValid()) - RPT.reset(MI); + RPT.reset(MI, MBB.end()); else { // Advance the state to the current MI. RPT.advance(MachineBasicBlock::const_iterator(MI)); RPT.advanceBeforeNext(); @@ -299,7 +299,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) { const GCNRPTracker::LiveRegSet LiveRegsCopy(RPT.getLiveRegs()); RegUse Defs, Uses; if (!processRegUses(MI, Defs, Uses, RPT)) { - RPT.reset(MI, &LiveRegsCopy); + RPT.reset(MI, MBB.end(), &LiveRegsCopy); continue; } @@ -323,7 +323,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) { ++Length; } if (Length < 2) { - RPT.reset(MI, &LiveRegsCopy); + RPT.reset(MI, MBB.end(), &LiveRegsCopy); continue; } @@ -391,7 +391,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) { } // Restore the state after processing the end of the bundle. - RPT.reset(MI, &LiveRegsCopy); + RPT.reset(MI, MBB.end(), &LiveRegsCopy); if (!Kill) continue; diff --git a/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp b/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp index ad84f4df65288..d907ee269a448 100644 --- a/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp +++ b/llvm/unittests/Target/AMDGPU/GCNRegPressureTest.cpp @@ -82,25 +82,18 @@ body: | // which would return false in this case. // // There aren't any non-debug instruction between the beginning of bb1 and - // Dbg1 (exclusive). However, the call to reset takes the end of the MBB as - // the limit, so it pushes the beginning of the block up to %2's def and - // considers the reset successful. - EXPECT_TRUE(RPTracker.reset(*MBB1.begin(), &MBB1LiveIns)); - EXPECT_TRUE(RPTrackerNoLiveIns.reset(*MBB1.begin(), nullptr)); - // advance then unnecessarily processes instructions in order until the end - // of the block, even though it is already past Dbg1. 
It still returns false - // because it is stopped by the end of block delimiter, not the end - // iterator. + // Dbg1 (exclusive), the reset is therefore unsuccessful. The advance caller + // returns early on a failure to reset. Calling advance after this does + // nothing and produces false because the internal iterator already points + // to the second debug instruction. + EXPECT_FALSE(RPTracker.reset(*MBB1.begin(), Dbg1, &MBB1LiveIns)); + EXPECT_FALSE(RPTrackerNoLiveIns.reset(*MBB1.begin(), Dbg1, nullptr)); EXPECT_FALSE(RPTracker.advance(Dbg1)); EXPECT_FALSE(RPTrackerNoLiveIns.advance(Dbg1)); - // In that case, the maximum pressure is also the pressure induced by the - // block's live-ins plus %2's def i.e., 3 VGPRs. This is confusing because - // %2's def is outside the [Begin,End) range we passed to advance, and there - // is no indication that a false return value should make the tracked - // pressure invalid. - EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 3U); - EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 3U); + // Register pressure should be the one at the block's live-ins. + EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 2U); + EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 2U); } } @@ -137,20 +130,18 @@ body: | // The following unpacks a call to // advance(MBB1.begin(), MBB1.end(), [MBB1LiveIns|nullptr]) - // which would return true in this case. + // which would return false in this case. // // There aren't any non-debug instruction in bb.2, the reset is therefore - // unsuccessful. However the advance caller discards that return value and - // proceeds to calling its override. - EXPECT_FALSE(RPTracker.reset(*MBB1.begin(), &MBB1LiveIns)); - EXPECT_FALSE(RPTrackerNoLiveIns.reset(*MBB1.begin(), nullptr)); - // advance then produces true even though no advancement actually happened. 
- EXPECT_TRUE(RPTracker.advance(MBB1.end())); - EXPECT_TRUE(RPTrackerNoLiveIns.advance(MBB1.end())); - - // In that case, the maximum pressure is unchanged from the beginning since - // reset was unsuccessful. This is confusing because the top-level advance - // call produced true, yet the block's live-in pressure was not considered. - EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 0U); - EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 0U); + // unsuccessful. The advance caller returns early on a failure to reset. + // Calling advance after this does nothing and produces false because the + // internal iterator is already at the block's end. + EXPECT_FALSE(RPTracker.reset(*MBB1.begin(), MBB1.end(), &MBB1LiveIns)); + EXPECT_FALSE(RPTrackerNoLiveIns.reset(*MBB1.begin(), MBB1.end(), nullptr)); + EXPECT_FALSE(RPTracker.advance(MBB1.end())); + EXPECT_FALSE(RPTrackerNoLiveIns.advance(MBB1.end())); + + // Register pressure should be the one at the block's live-ins. + EXPECT_EQ(RPTracker.moveMaxPressure().getVGPRNum(false), 1U); + EXPECT_EQ(RPTrackerNoLiveIns.moveMaxPressure().getVGPRNum(false), 1U); } From 8eed046baeed8ead2ed555751f4e42552a0e41b2 Mon Sep 17 00:00:00 2001 From: adams381 Date: Mon, 11 May 2026 12:29:54 -0500 Subject: [PATCH 314/538] [mlir][ABI] Add Test target + classification injection helper (#195725) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First in a series of PRs splitting #192119 / #192124 per @andykaylor's review request to break them down by ArgKind and replace the C++ unit tests with `cir-opt`-driven `.cir` tests. This one is dialect-agnostic so it can land before the CIR side. Two pieces: 1. A test ABI target at `mlir/lib/ABI/Targets/Test/`. Predictable rules approximating x86_64 SysV thresholds (Direct / Extend / Indirect / Ignore). The header and `.cpp` are both explicit that this is not a real ABI target — real ones live next to the LLVM ABI library. 2. 
A `parseClassificationAttr` helper that reads a plain `DictionaryAttr` and returns a `FunctionClassification`. Lets tests inject any classification (including shapes the test target itself doesn't produce) so rewriter behavior can be validated against real-ABI-shaped expectations without waiting for #194433. The schema is documented in the parser source comment; unknown keys cause a parse error so the schema stays honest as it grows. The injection schema only covers fields that `mlir::abi::ArgClassification` currently has. Comparing it to `llvm::abi::ArgInfo` (in `llvm/include/llvm/ABI/FunctionInfo.h`) turned up ~5 missing concepts on the MLIR side: `direct_offset` (multi-eightbyte returns), `indirect_addr_space`, `indirect_realign`, an `extend_kind` tristate (we currently collapse to a single bool, losing the no-extend case), and function-level `calling_conv` / `num_required` for variadic. Those get added to the struct and the schema by whichever subsequent PR first needs them, rather than landing here as unused fields. 19 gtest cases for the classifier and parser (well-formed inputs, missing required keys, unknown-key rejection, alignment validation, kind validation). `ninja check-clang-cir-codegen` and `ninja check-clang-cir` both still pass at the previous baseline. One thing to flag: the test target's automatic classifier never produces the `Expand` kind. That kind is only reachable through the injection driver. Fine for what this PR is for, but worth noting since it's the one ArgKind that doesn't get exercised by the test-target driver. 
Co-authored-by: Cursor --- .../mlir/ABI/Targets/Test/TestTarget.h | 98 ++++++ mlir/lib/ABI/CMakeLists.txt | 1 + mlir/lib/ABI/Targets/Test/TestTarget.cpp | 260 ++++++++++++++ mlir/unittests/ABI/CMakeLists.txt | 1 + mlir/unittests/ABI/TestTargetTest.cpp | 327 ++++++++++++++++++ 5 files changed, 687 insertions(+) create mode 100644 mlir/include/mlir/ABI/Targets/Test/TestTarget.h create mode 100644 mlir/lib/ABI/Targets/Test/TestTarget.cpp create mode 100644 mlir/unittests/ABI/TestTargetTest.cpp diff --git a/mlir/include/mlir/ABI/Targets/Test/TestTarget.h b/mlir/include/mlir/ABI/Targets/Test/TestTarget.h new file mode 100644 index 0000000000000..4404d47f8df45 --- /dev/null +++ b/mlir/include/mlir/ABI/Targets/Test/TestTarget.h @@ -0,0 +1,98 @@ +//===- TestTarget.h - Predictable test ABI target --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the test ABI target, a predictable, dialect-agnostic +// classifier used to exercise the MLIR ABIRewriteContext infrastructure +// without depending on any real ABI. See TestTarget.cpp for the rules +// and the rationale. +// +// It also declares parseClassificationAttr, the helper used by the +// classification-injection driver: tests can attach an arbitrary +// FunctionClassification to a function via a plain mlir::DictionaryAttr, +// and the rewriter pass reads it back through this parser. This lets +// tests verify rewriter output against any classification (including +// shapes the test target itself doesn't produce) without needing a real +// ABIInfo. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ABI_TARGETS_TEST_TESTTARGET_H +#define MLIR_ABI_TARGETS_TEST_TESTTARGET_H + +#include "mlir/ABI/ABIRewriteContext.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/Interfaces/DataLayoutInterfaces.h" +#include "llvm/Support/Error.h" + +namespace mlir { +namespace abi { +namespace test { + +/// Classify a function signature using the test target's predictable rules. +/// +/// The rules approximate x86_64 SysV thresholds for reviewer familiarity +/// (see TestTarget.cpp for the full list) but are not a substitute for +/// testing against a real ABIInfo. Real-ABI-shaped tests should use the +/// classification-injection driver via `parseClassificationAttr` below. +/// +/// \param argTypes Argument types of the function. +/// \param returnType Return type of the function. +/// \param dl DataLayout used for size and alignment queries. +FunctionClassification classify(ArrayRef argTypes, Type returnType, + const DataLayout &dl); + +/// Parse a `FunctionClassification` from a plain MLIR DictionaryAttr. +/// +/// Schema (all keys are required unless marked optional): +/// +/// { +/// return = { kind = "", ...per-kind keys... }, +/// args = [ { kind = "", ...per-kind keys... }, ... ] +/// } +/// +/// Per-arg/return dictionary keys: +/// kind: StringAttr. One of "direct", "extend", "indirect", +/// "ignore", "expand". +/// +/// For kind = "direct" (all optional): +/// coerced_type: TypeAttr. ABI-coerced type, if different from the +/// original. +/// can_flatten: BoolAttr. Defaults to true. +/// +/// For kind = "extend" (coerced_type required, sign_extend optional): +/// coerced_type: TypeAttr. Required; the extended integer type. +/// sign_extend: BoolAttr. Defaults to false (zero-extend). +/// +/// For kind = "indirect" (indirect_align required, byval optional): +/// indirect_align: IntegerAttr. 
Required; alignment of the pointed-to +/// object in bytes. +/// byval: BoolAttr. Defaults to true. +/// +/// For kind = "ignore" / "expand": no extra keys. +/// +/// Future schema additions tracked in projects/daily_log.md (Step 0c +/// field-mapping table). When we add new fields to ArgClassification +/// (e.g. direct_offset, extend_kind tristate, indirect_addr_space, +/// indirect_realign), the corresponding optional keys go here. +/// +/// Unknown keys cause a parse error (no silent ignore — keeps schema +/// honest as it grows). +/// +/// \param attr The dictionary attribute to parse. +/// \param emitError Diagnostic sink for parse errors. +/// \returns The parsed classification, or std::nullopt on error. +std::optional +parseClassificationAttr(DictionaryAttr attr, + function_ref emitError); + +} // namespace test +} // namespace abi +} // namespace mlir + +#endif // MLIR_ABI_TARGETS_TEST_TESTTARGET_H diff --git a/mlir/lib/ABI/CMakeLists.txt b/mlir/lib/ABI/CMakeLists.txt index eb434d25dd390..caa353d26ece3 100644 --- a/mlir/lib/ABI/CMakeLists.txt +++ b/mlir/lib/ABI/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_library(MLIRABI ABITypeMapper.cpp + Targets/Test/TestTarget.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/ABI diff --git a/mlir/lib/ABI/Targets/Test/TestTarget.cpp b/mlir/lib/ABI/Targets/Test/TestTarget.cpp new file mode 100644 index 0000000000000..51510b0c18009 --- /dev/null +++ b/mlir/lib/ABI/Targets/Test/TestTarget.cpp @@ -0,0 +1,260 @@ +//===- TestTarget.cpp - Predictable test ABI target ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// **NOT A REAL ABI TARGET.** +// +// This file implements a predictable, dialect-agnostic ABI classifier for +// testing the MLIR ABIRewriteContext infrastructure. The rules approximate +// x86_64 SysV thresholds (Direct / Extend / Indirect / Ignore / Expand) so +// the generated classifications are familiar to reviewers, but they are +// NOT a substitute for testing against the real x86_64 ABIInfo. Real +// ABI targets live alongside the LLVM ABI library in `llvm/lib/ABI/Targets/`. +// +// Real-ABI-shaped tests use the classification-injection driver via +// `parseClassificationAttr`, which lets tests construct any +// FunctionClassification (including shapes the test target itself does +// not produce) by attaching a DictionaryAttr to the function. +// +// Rules: +// - mlir::NoneType → Ignore +// - IntegerType with width < 32 → Extend (zero-extend by +// default; tests using the +// injection driver can +// override to signed) +// - IntegerType with width >= 32 → Direct +// - FloatType, VectorType, MemRefType → Direct +// - Anything else with DataLayout size 0 → Ignore +// - Anything else with DataLayout size <= 16 → Direct (coerced to the +// same type — no actual +// coercion in the test +// target; PR C handles +// non-trivial coercion) +// - Anything else with DataLayout size > 16 → Indirect with byval=true +// (sret on returns) and +// alignment from +// DataLayout +// +//===----------------------------------------------------------------------===// + +#include "mlir/ABI/Targets/Test/TestTarget.h" +#include "mlir/IR/BuiltinTypes.h" +#include "llvm/Support/Alignment.h" + +using namespace mlir; +using namespace mlir::abi; +using namespace mlir::abi::test; + +namespace { + +/// Indirect-vs-direct cutoff in bytes. Chosen to match x86_64 SysV's +/// 16-byte register-passing window for reviewer familiarity. 
+constexpr uint64_t IndirectCutoffBytes = 16; + +/// Below this width (in bits) integers get an extension attribute. +/// Chosen to match x86_64 SysV (32-bit register width) for reviewer +/// familiarity. +constexpr unsigned ExtendBelowBits = 32; + +ArgClassification classifyOne(Type type, const DataLayout &dl) { + if (isa(type)) + return ArgClassification::getIgnore(); + + if (auto intTy = dyn_cast(type)) { + if (intTy.getWidth() < ExtendBelowBits) { + Type i32Ty = IntegerType::get(type.getContext(), ExtendBelowBits); + return ArgClassification::getExtend(i32Ty, /*signExt=*/intTy.isSigned()); + } + return ArgClassification::getDirect(); + } + + if (auto indexTy = dyn_cast(type)) { + llvm::TypeSize sizeInBits = dl.getTypeSizeInBits(indexTy); + if (sizeInBits.getFixedValue() < ExtendBelowBits) { + Type i32Ty = IntegerType::get(type.getContext(), ExtendBelowBits); + return ArgClassification::getExtend(i32Ty, /*signExt=*/true); + } + return ArgClassification::getDirect(); + } + + if (isa(type)) + return ArgClassification::getDirect(); + + // For dialect-specific types: query DataLayout via + // DataLayoutTypeInterface. Types that don't implement the interface + // (e.g. dialect-specific void / unit-style sentinel types used as a + // function's "no return value" marker) are treated as Ignore so that + // the test target degrades gracefully rather than crashing on unknown + // types. 
+ if (!isa(type)) + return ArgClassification::getIgnore(); + + llvm::TypeSize sizeInBits = dl.getTypeSizeInBits(type); + if (sizeInBits.isZero()) + return ArgClassification::getIgnore(); + + uint64_t sizeInBytes = (sizeInBits.getFixedValue() + 7) / 8; + if (sizeInBytes <= IndirectCutoffBytes) + return ArgClassification::getDirect(); + + uint64_t alignBytes = dl.getTypeABIAlignment(type); + return ArgClassification::getIndirect(llvm::Align(alignBytes), + /*byVal=*/true); +} + +} // namespace + +FunctionClassification mlir::abi::test::classify(ArrayRef argTypes, + Type returnType, + const DataLayout &dl) { + FunctionClassification fc; + fc.returnInfo = classifyOne(returnType, dl); + fc.argInfos.reserve(argTypes.size()); + for (Type t : argTypes) + fc.argInfos.push_back(classifyOne(t, dl)); + return fc; +} + +namespace { + +/// Set of dictionary keys this parser knows about. Any key not in this +/// set causes a parse error (no silent ignore). Updated when new +/// optional keys are added to the schema. +constexpr StringRef knownArgKeys[] = { + "kind", "coerced_type", "sign_extend", + "can_flatten", "indirect_align", "byval", +}; + +bool isKnownArgKey(StringRef key) { + for (StringRef k : knownArgKeys) + if (k == key) + return true; + return false; +} + +/// Parse a single ArgClassification dictionary. Returns std::nullopt on +/// any error (with the diagnostic emitted via \p emitError). 
+std::optional +parseOne(DictionaryAttr argDict, function_ref emitError) { + StringAttr kindAttr = argDict.getAs("kind"); + if (!kindAttr) { + emitError() << "missing required 'kind' StringAttr"; + return std::nullopt; + } + + for (NamedAttribute na : argDict) + if (!isKnownArgKey(na.getName().getValue())) { + emitError() << "unknown key '" << na.getName().getValue() + << "' in classification dictionary; allowed keys are " + << "kind, coerced_type, sign_extend, can_flatten, " + << "indirect_align, byval"; + return std::nullopt; + } + + StringRef kind = kindAttr.getValue(); + + if (kind == "direct") { + Type coerced; + if (auto t = argDict.getAs("coerced_type")) + coerced = t.getValue(); + auto c = ArgClassification::getDirect(coerced); + if (auto cf = argDict.getAs("can_flatten")) + c.canFlatten = cf.getValue(); + return c; + } + + if (kind == "extend") { + auto coerced = argDict.getAs("coerced_type"); + if (!coerced) { + emitError() << "kind='extend' requires 'coerced_type' TypeAttr"; + return std::nullopt; + } + bool signExt = false; + if (auto se = argDict.getAs("sign_extend")) + signExt = se.getValue(); + return ArgClassification::getExtend(coerced.getValue(), signExt); + } + + if (kind == "indirect") { + auto align = argDict.getAs("indirect_align"); + if (!align) { + emitError() << "kind='indirect' requires 'indirect_align' IntegerAttr"; + return std::nullopt; + } + if (align.getInt() <= 0 || !llvm::isPowerOf2_64(align.getInt())) { + emitError() << "'indirect_align' must be a positive power of 2; got " + << align.getInt(); + return std::nullopt; + } + bool byVal = true; + if (auto bv = argDict.getAs("byval")) + byVal = bv.getValue(); + return ArgClassification::getIndirect(llvm::Align(align.getInt()), byVal); + } + + if (kind == "ignore") { + return ArgClassification::getIgnore(); + } + + if (kind == "expand") { + ArgClassification c; + c.kind = ArgKind::Expand; + return c; + } + + emitError() << "unknown kind='" << kind + << "'; expected one of direct, 
extend, indirect, ignore, expand"; + return std::nullopt; +} + +} // namespace + +std::optional mlir::abi::test::parseClassificationAttr( + DictionaryAttr attr, function_ref emitError) { + auto returnDict = attr.getAs("return"); + if (!returnDict) { + emitError() << "missing required 'return' DictionaryAttr"; + return std::nullopt; + } + + auto argsArr = attr.getAs("args"); + if (!argsArr) { + emitError() << "missing required 'args' ArrayAttr"; + return std::nullopt; + } + + for (NamedAttribute na : attr) { + StringRef k = na.getName().getValue(); + if (k != "return" && k != "args") { + emitError() << "unknown top-level key '" << k + << "'; only 'return' and 'args' are allowed"; + return std::nullopt; + } + } + + FunctionClassification fc; + + std::optional ret = parseOne(returnDict, emitError); + if (!ret) + return std::nullopt; + fc.returnInfo = *ret; + + fc.argInfos.reserve(argsArr.size()); + for (Attribute a : argsArr) { + auto d = dyn_cast(a); + if (!d) { + emitError() << "'args' entries must be DictionaryAttrs"; + return std::nullopt; + } + std::optional ac = parseOne(d, emitError); + if (!ac) + return std::nullopt; + fc.argInfos.push_back(*ac); + } + + return fc; +} diff --git a/mlir/unittests/ABI/CMakeLists.txt b/mlir/unittests/ABI/CMakeLists.txt index 39f955a8efea6..1113ed9516f9c 100644 --- a/mlir/unittests/ABI/CMakeLists.txt +++ b/mlir/unittests/ABI/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_unittest(MLIRABITests ABIRewriteContextTest.cpp ABITypeMapperTest.cpp + TestTargetTest.cpp ) mlir_target_link_libraries(MLIRABITests diff --git a/mlir/unittests/ABI/TestTargetTest.cpp b/mlir/unittests/ABI/TestTargetTest.cpp new file mode 100644 index 0000000000000..a83f2290b0b58 --- /dev/null +++ b/mlir/unittests/ABI/TestTargetTest.cpp @@ -0,0 +1,327 @@ +//===- TestTargetTest.cpp - Unit tests for the test ABI target -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/ABI/Targets/Test/TestTarget.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/MLIRContext.h" +#include + +using namespace mlir; +using namespace mlir::abi; +using namespace mlir::abi::test; + +namespace { + +class TestTargetClassifyTest : public ::testing::Test { +protected: + TestTargetClassifyTest() + : module(ModuleOp::create(UnknownLoc::get(&context))), dl(*module) { + context.loadDialect(); + } + + MLIRContext context; + OwningOpRef module; + DataLayout dl; +}; + +TEST_F(TestTargetClassifyTest, IgnoresNoneType) { + auto noneTy = NoneType::get(&context); + FunctionClassification fc = classify({}, noneTy, dl); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Ignore); +} + +TEST_F(TestTargetClassifyTest, ExtendsNarrowSignedInteger) { + auto i8 = IntegerType::get(&context, 8, IntegerType::Signed); + FunctionClassification fc = classify({i8}, NoneType::get(&context), dl); + ASSERT_EQ(fc.argInfos.size(), 1u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Extend); + EXPECT_TRUE(fc.argInfos[0].signExtend); + auto coerced = dyn_cast(fc.argInfos[0].coercedType); + ASSERT_TRUE(coerced); + EXPECT_EQ(coerced.getWidth(), 32u); +} + +TEST_F(TestTargetClassifyTest, ExtendsNarrowSignlessIntegerAsZeroExt) { + auto i8 = IntegerType::get(&context, 8); + FunctionClassification fc = classify({i8}, NoneType::get(&context), dl); + ASSERT_EQ(fc.argInfos.size(), 1u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Extend); + EXPECT_FALSE(fc.argInfos[0].signExtend); +} + +TEST_F(TestTargetClassifyTest, RegisterSizedIntegerIsDirect) { + auto i32 = IntegerType::get(&context, 32); + auto i64 = IntegerType::get(&context, 64); + 
FunctionClassification fc = classify({i32, i64}, NoneType::get(&context), dl); + ASSERT_EQ(fc.argInfos.size(), 2u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Direct); + EXPECT_EQ(fc.argInfos[1].kind, ArgKind::Direct); +} + +TEST_F(TestTargetClassifyTest, IndexTypeIsDirect) { + // The default DataLayout reports IndexType as 64 bits, which is at or above + // the extension threshold and should classify as Direct. + auto idx = IndexType::get(&context); + FunctionClassification fc = classify({idx}, idx, dl); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Direct); + ASSERT_EQ(fc.argInfos.size(), 1u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Direct); +} + +TEST_F(TestTargetClassifyTest, FloatIsDirect) { + auto f32 = Float32Type::get(&context); + FunctionClassification fc = classify({f32}, f32, dl); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Direct); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Direct); +} + +TEST_F(TestTargetClassifyTest, FunctionLevelReturnAndArgsClassifiedTogether) { + auto i32 = IntegerType::get(&context, 32); + auto f64 = Float64Type::get(&context); + FunctionClassification fc = classify({i32, f64}, i32, dl); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Direct); + ASSERT_EQ(fc.argInfos.size(), 2u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Direct); + EXPECT_EQ(fc.argInfos[1].kind, ArgKind::Direct); +} + +TEST_F(TestTargetClassifyTest, + TypeWithoutDataLayoutInterfaceClassifiedAsIgnore) { + // FunctionType does not implement DataLayoutTypeInterface. The classifier + // must treat it as Ignore rather than crashing in dl.getTypeSizeInBits(). + // This guards against the same crash for dialect-specific void / sentinel + // types (e.g. cir::VoidType) used as a function's "no return value" marker. 
+ auto i32 = IntegerType::get(&context, 32); + auto fnTy = FunctionType::get(&context, {i32}, {i32}); + FunctionClassification fc = classify({}, fnTy, dl); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Ignore); +} + +class TestTargetParseTest : public ::testing::Test { +protected: + TestTargetParseTest() : builder(&context) { + // Suppress diagnostic printing during tests; capture into lastError + // for assertions instead. + context.getDiagEngine().registerHandler([this](Diagnostic &diag) { + lastError = diag.str(); + return success(); + }); + } + + /// Convenience: parse and assert success, returning the result. + FunctionClassification parseOk(DictionaryAttr attr) { + auto loc = UnknownLoc::get(&context); + auto result = + parseClassificationAttr(attr, [&]() { return mlir::emitError(loc); }); + EXPECT_TRUE(result.has_value()) + << "parseClassificationAttr failed: " << lastError; + return result.value_or(FunctionClassification{}); + } + + /// Convenience: parse and assert failure with a substring match. 
+ void parseError(DictionaryAttr attr, StringRef expectedSubstr) { + auto loc = UnknownLoc::get(&context); + lastError.clear(); + auto result = + parseClassificationAttr(attr, [&]() { return mlir::emitError(loc); }); + EXPECT_FALSE(result.has_value()); + EXPECT_NE(lastError.find(expectedSubstr.str()), std::string::npos) + << "expected error containing '" << expectedSubstr << "' but got '" + << lastError << "'"; + } + + DictionaryAttr makeArg(ArrayRef entries) { + return DictionaryAttr::get(&context, entries); + } + + MLIRContext context; + OpBuilder builder; + std::string lastError; +}; + +TEST_F(TestTargetParseTest, ParsesDirectReturnAndOneDirectArg) { + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({direct})), + }); + + auto fc = parseOk(attr); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Direct); + ASSERT_EQ(fc.argInfos.size(), 1u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Direct); +} + +TEST_F(TestTargetParseTest, ParsesExtendWithCoercedTypeAndSignExtend) { + auto i32 = IntegerType::get(&context, 32); + auto extend = makeArg({ + builder.getNamedAttr("kind", builder.getStringAttr("extend")), + builder.getNamedAttr("coerced_type", TypeAttr::get(i32)), + builder.getNamedAttr("sign_extend", builder.getBoolAttr(true)), + }); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({extend})), + }); + + auto fc = parseOk(attr); + ASSERT_EQ(fc.argInfos.size(), 1u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Extend); + EXPECT_TRUE(fc.argInfos[0].signExtend); + EXPECT_EQ(fc.argInfos[0].coercedType, i32); +} + +TEST_F(TestTargetParseTest, ParsesIndirectWithAlignAndByval) { + auto direct = + makeArg({builder.getNamedAttr("kind", 
builder.getStringAttr("direct"))}); + auto indirect = makeArg({ + builder.getNamedAttr("kind", builder.getStringAttr("indirect")), + builder.getNamedAttr("indirect_align", builder.getI64IntegerAttr(16)), + builder.getNamedAttr("byval", builder.getBoolAttr(false)), + }); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({indirect})), + }); + + auto fc = parseOk(attr); + ASSERT_EQ(fc.argInfos.size(), 1u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Indirect); + EXPECT_EQ(fc.argInfos[0].indirectAlign, llvm::Align(16)); + EXPECT_FALSE(fc.argInfos[0].byVal); +} + +TEST_F(TestTargetParseTest, ParsesIgnoreAndExpand) { + auto ignore = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("ignore"))}); + auto expand = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("expand"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", ignore), + builder.getNamedAttr("args", builder.getArrayAttr({expand, ignore})), + }); + + auto fc = parseOk(attr); + EXPECT_EQ(fc.returnInfo.kind, ArgKind::Ignore); + ASSERT_EQ(fc.argInfos.size(), 2u); + EXPECT_EQ(fc.argInfos[0].kind, ArgKind::Expand); + EXPECT_EQ(fc.argInfos[1].kind, ArgKind::Ignore); +} + +TEST_F(TestTargetParseTest, RejectsMissingReturn) { + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("args", builder.getArrayAttr({direct})), + }); + parseError(attr, "missing required 'return'"); +} + +TEST_F(TestTargetParseTest, RejectsMissingArgs) { + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + }); + parseError(attr, "missing required 'args'"); +} + +TEST_F(TestTargetParseTest, RejectsUnknownTopLevelKey) { + auto direct = + makeArg({builder.getNamedAttr("kind", 
builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({})), + builder.getNamedAttr("future_field", builder.getStringAttr("hello")), + }); + parseError(attr, "unknown top-level key 'future_field'"); +} + +TEST_F(TestTargetParseTest, RejectsUnknownArgKey) { + auto badArg = makeArg({ + builder.getNamedAttr("kind", builder.getStringAttr("direct")), + builder.getNamedAttr("future_field", builder.getBoolAttr(true)), + }); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({badArg})), + }); + parseError(attr, "unknown key 'future_field'"); +} + +TEST_F(TestTargetParseTest, RejectsExtendWithoutCoercedType) { + auto badExtend = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("extend"))}); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({badExtend})), + }); + parseError(attr, "kind='extend' requires 'coerced_type'"); +} + +TEST_F(TestTargetParseTest, RejectsIndirectWithoutAlign) { + auto badIndirect = makeArg( + {builder.getNamedAttr("kind", builder.getStringAttr("indirect"))}); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({badIndirect})), + }); + parseError(attr, "kind='indirect' requires 'indirect_align'"); +} + +TEST_F(TestTargetParseTest, RejectsIndirectWithNonPowerOfTwoAlign) { + auto badIndirect = makeArg({ + builder.getNamedAttr("kind", builder.getStringAttr("indirect")), + 
builder.getNamedAttr("indirect_align", builder.getI64IntegerAttr(7)), + }); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({badIndirect})), + }); + parseError(attr, "must be a positive power of 2"); +} + +TEST_F(TestTargetParseTest, RejectsUnknownKind) { + auto bad = makeArg( + {builder.getNamedAttr("kind", builder.getStringAttr("invalid_kind"))}); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({bad})), + }); + parseError(attr, "unknown kind='invalid_kind'"); +} + +TEST_F(TestTargetParseTest, RejectsMissingKind) { + auto bad = makeArg({}); + auto direct = + makeArg({builder.getNamedAttr("kind", builder.getStringAttr("direct"))}); + auto attr = builder.getDictionaryAttr({ + builder.getNamedAttr("return", direct), + builder.getNamedAttr("args", builder.getArrayAttr({bad})), + }); + parseError(attr, "missing required 'kind'"); +} + +} // namespace From 8fb21f9db2ee52b6c210d87a880cfe9549b29603 Mon Sep 17 00:00:00 2001 From: adams381 Date: Mon, 11 May 2026 12:30:23 -0500 Subject: [PATCH 315/538] [CIR] Use SymbolTableCollection in CallOp/TryCallOp LLVM lowering (#195916) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `CIRToLLVMCallOpLowering` and `CIRToLLVMTryCallOpLowering` used the static `mlir::SymbolTable::lookupNearestSymbolFrom` to resolve each direct call's callee. That static lookup does a linear scan of every operation in the module (O(M) per call), so a function with N call sites took O(N × M) total — quadratic in module size. 
Add a `SymbolTableCollection` member to both patterns via `customLLVMLoweringConstructorDecl` and use its caching `lookupNearestSymbolFrom`. The first lookup builds the symbol table (O(M)), then subsequent lookups are O(1) hash-based. Measured on Eigen's `bdcsvd.cpp` (heavy template instantiation, many call sites): the 36.98% self-time hotspot from this lookup is eliminated, and overall CIR compile time drops by roughly 2x on the slowest tests. Stacked behind #195883 (the equivalent fix for `CIRGenModule::applyReplacements`). Made with [Cursor](https://cursor.com) Co-authored-by: Cursor --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 6 ++++++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 17 ++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index ed285201ae8f8..9d9aaec1b275a 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4082,6 +4082,9 @@ def CIR_CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> { $_state.addTypes(resType); }]> ]; + + let customLLVMLoweringConstructorDecl = + LoweringBuilders<(ins "mlir::SymbolTableCollection &":$symbolTables)>; } def CIR_TryCallOp : CIR_CallOpBase<"try_call",[ @@ -4171,6 +4174,9 @@ def CIR_TryCallOp : CIR_CallOpBase<"try_call",[ $_state.addSuccessors(unwindDest); }]> ]; + + let customLLVMLoweringConstructorDecl = + LoweringBuilders<(ins "mlir::SymbolTableCollection &":$symbolTables)>; } //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index dc6b407af3abb..dca079eff1752 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1720,6 +1720,7 @@ static mlir::LogicalResult rewriteCallOrInvoke(mlir::Operation *op, 
mlir::ValueRange callOperands, mlir::ConversionPatternRewriter &rewriter, const mlir::TypeConverter *converter, + mlir::SymbolTableCollection &symbolTables, mlir::FlatSymbolRefAttr calleeAttr, mlir::Block *continueBlock = nullptr, mlir::Block *landingPadBlock = nullptr) { @@ -1750,7 +1751,7 @@ rewriteCallOrInvoke(mlir::Operation *op, mlir::ValueRange callOperands, if (calleeAttr) { // direct call mlir::Operation *callee = - mlir::SymbolTable::lookupNearestSymbolFrom(op, calleeAttr); + symbolTables.lookupNearestSymbolFrom(op, calleeAttr); if (auto fn = mlir::dyn_cast(callee)) { llvmFnTy = converter->convertType( fn.getFunctionType()); @@ -1820,16 +1821,17 @@ mlir::LogicalResult CIRToLLVMCallOpLowering::matchAndRewrite( cir::CallOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { return rewriteCallOrInvoke(op.getOperation(), adaptor.getOperands(), rewriter, - getTypeConverter(), op.getCalleeAttr()); + getTypeConverter(), symbolTables, + op.getCalleeAttr()); } mlir::LogicalResult CIRToLLVMTryCallOpLowering::matchAndRewrite( cir::TryCallOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { assert(!cir::MissingFeatures::opCallCallConv()); - return rewriteCallOrInvoke(op.getOperation(), adaptor.getOperands(), rewriter, - getTypeConverter(), op.getCalleeAttr(), - op.getNormalDest(), op.getUnwindDest()); + return rewriteCallOrInvoke( + op.getOperation(), adaptor.getOperands(), rewriter, getTypeConverter(), + symbolTables, op.getCalleeAttr(), op.getNormalDest(), op.getUnwindDest()); } mlir::LogicalResult CIRToLLVMReturnAddrOpLowering::matchAndRewrite( @@ -3778,9 +3780,14 @@ void ConvertCIRToLLVMPass::runOnOperation() { /// of unresolved `BlockAddressOp`s until they are matched with the /// corresponding `BlockTagOp` in `resolveBlockAddressOp`. LLVMBlockAddressInfo blockInfoAddr; + /// Cached symbol table collection used by call lowering patterns to avoid + /// repeated O(M) module-wide symbol scans for every call site. 
+ mlir::SymbolTableCollection symbolTables; mlir::RewritePatternSet patterns(&getContext()); patterns.add( converter, patterns.getContext(), dl, blockInfoAddr); + patterns.add( + converter, patterns.getContext(), dl, symbolTables); patterns.add< #define GET_LLVM_LOWERING_PATTERNS_LIST From d8eca503e62c9315e16387b70ef2f582aa1cb03d Mon Sep 17 00:00:00 2001 From: adams381 Date: Mon, 11 May 2026 12:32:26 -0500 Subject: [PATCH 316/538] [CIR] Use SymbolTableCollection in LoweringPrepare's getCalledFunction (#195919) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `LoweringPreparePass::lowerTrivialCopyCall` calls `getCalledFunction` once per `cir.call`, which used the static `mlir::SymbolTable::lookupNearestSymbolFrom` — that function does a linear scan of every operation in the module on each call, giving O(N × M) total for N calls in an M-operation module. The pass already maintains a `SymbolTableCollection` (used by `lowerStoreOfConstAggregate` and `lowerLocalInitOp`). Thread it through `getCalledFunction` and `lowerTrivialCopyCall` so the per-call lookup uses the cached hash-based path instead. Measured on Eigen's `bdcsvd.cpp` (heavy template instantiation): the 15.06% self-time hotspot from `getCalledFunction` is eliminated, contributing roughly another 2x compile-time speedup on the slowest tests. The cold path in `getOrCreateDtorFunc` (per-global, not per-call) is left on the static lookup with an explanatory comment to avoid threading `symbolTables` through `emitGlobalGuardedDtorRegion` and its callers. Companion to #195883 (`applyReplacements`) and #195916 (`CIRToLLVMCallOpLowering`) — same algorithmic anti-pattern in three different passes. 
Made with [Cursor](https://cursor.com) Co-authored-by: Cursor --- .../Dialect/Transforms/LoweringPrepare.cpp | 98 ++++++++++++------- 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 673ddf2da4cd6..ec16c162dbad4 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -67,23 +67,25 @@ static SmallString<128> getTransformedFileName(mlir::ModuleOp mlirModule) { return fileName; } -/// Return the FuncOp called by `callOp`. -static cir::FuncOp getCalledFunction(cir::CallOp callOp) { - mlir::SymbolRefAttr sym = llvm::dyn_cast_if_present( - callOp.getCallableForCallee()); - if (!sym) - return nullptr; - return dyn_cast_or_null( - mlir::SymbolTable::lookupNearestSymbolFrom(callOp, sym)); -} - namespace { struct LoweringPreparePass : public impl::LoweringPrepareBase { LoweringPreparePass() = default; + + // `mlir::SymbolTableCollection` is move-only (it owns lazily-created + // `unique_ptr` entries), which makes the implicit copy + // constructor ill-formed. MLIR's `clonePass()` requires copy + // construction, so define one explicitly. Per-run state members + // (dynamic initializers, guard maps, symbol-table cache, etc.) all + // start fresh in the cloned pass, which matches MLIR convention for + // pass clones and is more correct than the previous default-generated + // behavior that silently copied them. 
+ LoweringPreparePass(const LoweringPreparePass &other) + : impl::LoweringPrepareBase(other) {} + void runOnOperation() override; - void runOnOp(mlir::Operation *op, mlir::SymbolTableCollection &symbolTables); + void runOnOp(mlir::Operation *op); void lowerCastOp(cir::CastOp op); void lowerComplexDivOp(cir::ComplexDivOp op); void lowerComplexMulOp(cir::ComplexMulOp op); @@ -93,10 +95,13 @@ struct LoweringPreparePass void lowerArrayDtor(cir::ArrayDtor op); void lowerArrayCtor(cir::ArrayCtor op); void lowerTrivialCopyCall(cir::CallOp op); - void lowerStoreOfConstAggregate(cir::StoreOp op, - mlir::SymbolTableCollection &symbolTables); - void lowerLocalInitOp(cir::LocalInitOp op, - mlir::SymbolTableCollection &symbolTables); + void lowerStoreOfConstAggregate(cir::StoreOp op); + void lowerLocalInitOp(cir::LocalInitOp op); + + /// Return the FuncOp called by `callOp`. Uses the cached `symbolTables` + /// member to avoid the O(M) module-wide scan that the static + /// `mlir::SymbolTable::lookupNearestSymbolFrom` would do per call. + cir::FuncOp getCalledFunction(cir::CallOp callOp); /// Return a private constant cir::GlobalOp with the given type and initial /// value, suitable for backing a memcpy-initialized local aggregate. @@ -105,11 +110,11 @@ struct LoweringPreparePass /// already has a matching type and initial value, that global is reused. /// Otherwise a new global is created with the next available `.` suffix /// (matching CIRGenBuilder::createVersionedGlobal and OGCG behavior). 
- cir::GlobalOp - getOrCreateConstAggregateGlobal(CIRBaseBuilderTy &builder, - mlir::SymbolTableCollection &symbolTables, - mlir::Location loc, llvm::StringRef baseName, - mlir::Type ty, mlir::TypedAttr constant); + cir::GlobalOp getOrCreateConstAggregateGlobal(CIRBaseBuilderTy &builder, + mlir::Location loc, + llvm::StringRef baseName, + mlir::Type ty, + mlir::TypedAttr constant); /// Build the function that initializes the specified global cir::FuncOp buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op); @@ -227,6 +232,26 @@ struct LoweringPreparePass /// Tracks current module. mlir::ModuleOp mlirModule; + /// Cached symbol tables used to avoid repeated O(M) module-wide scans + /// during per-call/per-global symbol lookups. Lazily populated on first + /// use. Pass methods access this directly rather than threading it + /// through helper signatures (see PR feedback on #195919). + /// + /// Invariant: every site that mutates the module's symbol table either + /// (a) keeps `symbolTables` in sync via + /// `symbolTables.getSymbolTable(mlirModule).insert(...)` (as + /// `getOrCreateConstAggregateGlobal` does), or (b) creates a symbol + /// that is never resolved through the cache later. Today + /// `buildRuntimeFunction` and `getOrCreateRuntimeVariable` fall in the + /// (b) bucket: their callers either use a separate map + /// (`cudaKernelMap`, `staticLocalDeclGuardMap`, `dynamicInitializers`) + /// or the static `mlir::SymbolTable::lookupNearestSymbolFrom`, never + /// the cached path. If a future change adds a cached lookup of a + /// freshly created symbol, the corresponding create site MUST move + /// to bucket (a) (insert into the cache or call + /// `invalidateSymbolTable`). + mlir::SymbolTableCollection symbolTables; + /// Tracks existing dynamic initializers. 
llvm::StringMap dynamicInitializerNames; llvm::SmallVector dynamicInitializers; @@ -1319,8 +1344,7 @@ void LoweringPreparePass::handleStaticLocal(cir::GlobalOp globalOp, builder.getInsertionBlock()->push_back(ret); } -void LoweringPreparePass::lowerLocalInitOp( - cir::LocalInitOp initOp, mlir::SymbolTableCollection &symbolTables) { +void LoweringPreparePass::lowerLocalInitOp(cir::LocalInitOp initOp) { // If we don't actually need to initialize anything anymore, we're done here. if (initOp.getCtorRegion().empty() && initOp.getDtorRegion().empty()) { @@ -1692,6 +1716,14 @@ void LoweringPreparePass::lowerArrayCtor(cir::ArrayCtor op) { /*isCtor=*/true); } +cir::FuncOp LoweringPreparePass::getCalledFunction(cir::CallOp callOp) { + mlir::SymbolRefAttr sym = llvm::dyn_cast_if_present( + callOp.getCallableForCallee()); + if (!sym) + return nullptr; + return symbolTables.lookupNearestSymbolFrom(callOp, sym); +} + void LoweringPreparePass::lowerTrivialCopyCall(cir::CallOp op) { cir::FuncOp funcOp = getCalledFunction(op); if (!funcOp) @@ -1712,9 +1744,8 @@ void LoweringPreparePass::lowerTrivialCopyCall(cir::CallOp op) { } cir::GlobalOp LoweringPreparePass::getOrCreateConstAggregateGlobal( - CIRBaseBuilderTy &builder, mlir::SymbolTableCollection &symbolTables, - mlir::Location loc, llvm::StringRef baseName, mlir::Type ty, - mlir::TypedAttr constant) { + CIRBaseBuilderTy &builder, mlir::Location loc, llvm::StringRef baseName, + mlir::Type ty, mlir::TypedAttr constant) { // Look up (and lazily populate) the per-base-name cache. llvm::SmallVector &versions = constAggregateGlobals[baseName]; @@ -1770,8 +1801,7 @@ cir::GlobalOp LoweringPreparePass::getOrCreateConstAggregateGlobal( return gv; } -void LoweringPreparePass::lowerStoreOfConstAggregate( - cir::StoreOp op, mlir::SymbolTableCollection &symbolTables) { +void LoweringPreparePass::lowerStoreOfConstAggregate(cir::StoreOp op) { // Check if the value operand is a cir.const with aggregate type. 
auto constOp = op.getValue().getDefiningOp(); if (!constOp) @@ -1813,8 +1843,8 @@ void LoweringPreparePass::lowerStoreOfConstAggregate( // Check for existing globals and create a new global with a unique name // if no match is found. - cir::GlobalOp gv = getOrCreateConstAggregateGlobal( - builder, symbolTables, op.getLoc(), baseName, ty, constant); + cir::GlobalOp gv = getOrCreateConstAggregateGlobal(builder, op.getLoc(), + baseName, ty, constant); // Now replace the store with get_global + copy. builder.setInsertionPoint(op); @@ -1834,8 +1864,7 @@ void LoweringPreparePass::lowerStoreOfConstAggregate( constOp.erase(); } -void LoweringPreparePass::runOnOp(mlir::Operation *op, - mlir::SymbolTableCollection &symbolTables) { +void LoweringPreparePass::runOnOp(mlir::Operation *op) { if (auto arrayCtor = dyn_cast(op)) { lowerArrayCtor(arrayCtor); } else if (auto arrayDtor = dyn_cast(op)) { @@ -1853,7 +1882,7 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op, } else if (auto callOp = dyn_cast(op)) { lowerTrivialCopyCall(callOp); } else if (auto storeOp = dyn_cast(op)) { - lowerStoreOfConstAggregate(storeOp, symbolTables); + lowerStoreOfConstAggregate(storeOp); } else if (auto fnOp = dyn_cast(op)) { if (auto globalCtor = fnOp.getGlobalCtorPriority()) globalCtorList.emplace_back(fnOp.getName(), globalCtor.value()); @@ -1869,7 +1898,7 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op, } else if (auto threeWayCmp = dyn_cast(op)) { lowerThreeWayCmpOp(threeWayCmp); } else if (auto initOp = dyn_cast(op)) { - lowerLocalInitOp(initOp, symbolTables); + lowerLocalInitOp(initOp); } } @@ -2251,7 +2280,6 @@ void LoweringPreparePass::runOnOperation() { mlirModule = cast<::mlir::ModuleOp>(op); llvm::SmallVector opsToTransform; - mlir::SymbolTableCollection symbolTables; op->walk([&](mlir::Operation *op) { if (mlir::isagetLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) From 8f3f74dfe2909bacfd45f0c62d3363c4a8565ab0 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: 
Mon, 11 May 2026 10:33:30 -0700 Subject: [PATCH 317/538] [SPIRV] Make sure spirv-tools are copied with exe suffix (#196658) If a user provides the spirv-tools for LLVM_INCLUDE_SPIRV_TOOLS_TESTS, we need to make sure that they're copied into the bin dir with the appropriate platform suffix (ie, .exe on windows). Otherwise, lit's `add_tool_substitutions` won't be able to find them, and after #192462 this can lead to silently using versions that happen to be in your path. --- llvm/tools/spirv-tools/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/tools/spirv-tools/CMakeLists.txt b/llvm/tools/spirv-tools/CMakeLists.txt index 5db7aec997593..e99f10fed0c80 100644 --- a/llvm/tools/spirv-tools/CMakeLists.txt +++ b/llvm/tools/spirv-tools/CMakeLists.txt @@ -42,9 +42,9 @@ endif () # Link the provided or just built binaries. if (SPIRV_DIS) add_custom_target(spirv-dis - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_DIS}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-dis") + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_DIS}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-dis${CMAKE_EXECUTABLE_SUFFIX}") else () - add_custom_target(spirv-dis + add_custom_target(spirv-dis${CMAKE_EXECUTABLE_SUFFIX} COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${BINARY_DIR}/tools/spirv-dis${CMAKE_EXECUTABLE_SUFFIX}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-dis${CMAKE_EXECUTABLE_SUFFIX}" DEPENDS SPIRVTools ) @@ -52,7 +52,7 @@ endif () if (SPIRV_VAL) add_custom_target(spirv-val - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_VAL}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-val") + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_VAL}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-val${CMAKE_EXECUTABLE_SUFFIX}") else () add_custom_target(spirv-val COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${BINARY_DIR}/tools/spirv-val${CMAKE_EXECUTABLE_SUFFIX}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-val${CMAKE_EXECUTABLE_SUFFIX}" @@ -62,7 +62,7 @@ endif 
() if (SPIRV_AS) add_custom_target(spirv-as - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_AS}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as") + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_AS}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as${CMAKE_EXECUTABLE_SUFFIX}") else () add_custom_target(spirv-as COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${BINARY_DIR}/tools/spirv-as${CMAKE_EXECUTABLE_SUFFIX}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as${CMAKE_EXECUTABLE_SUFFIX}" @@ -72,7 +72,7 @@ endif () if (SPIRV_LINK) add_custom_target(spirv-link - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_LINK}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-link") + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_LINK}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-link${CMAKE_EXECUTABLE_SUFFIX}") else () add_custom_target(spirv-link COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${BINARY_DIR}/tools/spirv-link${CMAKE_EXECUTABLE_SUFFIX}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-link${CMAKE_EXECUTABLE_SUFFIX}" From 4f41bd4d04fdb2c48fdd2f7fa134bd6ee0a9a73d Mon Sep 17 00:00:00 2001 From: albertbolt1 <45144020+albertbolt1@users.noreply.github.com> Date: Mon, 11 May 2026 23:03:51 +0530 Subject: [PATCH 318/538] [CIR][AArch64] Upstream for vector vector shl (#191655) Part of #185382 --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 27 +- clang/test/CodeGen/AArch64/neon-intrinsics.c | 234 ---------------- clang/test/CodeGen/AArch64/neon/intrinsics.c | 254 ++++++++++++++++++ 3 files changed, 278 insertions(+), 237 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index abed7ff84069e..c142b69f6be6e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -900,17 +900,38 @@ static mlir::Value emitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vbfdot_f32: case NEON::BI__builtin_neon_vbfdotq_f32: case 
NEON::BI__builtin_neon___a32_vcvt_bf16_f32: - default: cgf.cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented AArch64 builtin call: ") + ctx.BuiltinInfo.getName(builtinID)); return mlir::Value{}; + } + // The switch stmt is intended to help catch NYI cases and will be removed + // once the CIR implementation is complete. Avoid adding specialized + // code in cases - that should only be required for a handful of examples. + switch (builtinID) { + default: cgf.cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented AArch64 builtin call: ") + - ctx.BuiltinInfo.getName(builtinID)); - return mlir::Value{}; + cgf.getContext().BuiltinInfo.getName(builtinID)); + break; + case NEON::BI__builtin_neon_vshl_v: + case NEON::BI__builtin_neon_vshlq_v: { + llvm::StringRef llvmIntrName = + getLLVMIntrNameNoPrefix(static_cast( + usgn ? llvmIntrinsic : altLLVMIntrinsic)); + + mlir::Value result = + emitNeonCall(cgf.getCIRGenModule(), cgf.getBuilder(), + /*argTypes=*/{vTy, vTy}, ops, llvmIntrName, + /*funcResTy=*/vTy, loc); + mlir::Type resultType = cgf.convertType(expr->getType()); + return cgf.getBuilder().createBitcast(result, resultType); + } } + + // NYI + return nullptr; } bool CIRGenFunction::getAArch64SVEProcessedOperands( diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index aaba75b9c835a..442850bcf0d40 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -3756,240 +3756,6 @@ uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); } -// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) -// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]] -// -int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { - return 
vshl_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) -// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> -// CHECK-NEXT: ret <4 x i16> [[TMP2]] -// -int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { - return vshl_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) -// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[TMP2]] -// -int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { - return vshl_s32(a, b); -} - -// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_s64( -// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 
x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> -// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) -// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 -// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 -// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] -// -int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { - return vshl_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) -// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]] -// -uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { - return vshl_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) -// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> -// CHECK-NEXT: ret <4 x i16> [[TMP2]] -// -uint16x4_t test_vshl_u16(uint16x4_t a, 
int16x4_t b) { - return vshl_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) -// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[TMP2]] -// -uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { - return vshl_u32(a, b); -} - -// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_u64( -// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> -// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) -// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 -// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 -// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] -// -uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { - return vshl_u64(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_s8( -// CHECK-SAME: <16 x 
i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) -// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]] -// -int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { - return vshlq_s8(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) -// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[TMP2]] -// -int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { - return vshlq_s16(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) -// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> -// 
CHECK-NEXT: ret <4 x i32> [[TMP2]] -// -int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { - return vshlq_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) -// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[TMP2]] -// -int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) { - return vshlq_s64(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) -// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]] -// -uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { - return vshlq_u8(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> 
[[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) -// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[TMP2]] -// -uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { - return vshlq_u16(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) -// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP2]] -// -uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { - return vshlq_u32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> -// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) -// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> -// CHECK-NEXT: 
ret <2 x i64> [[TMP2]] -// -uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { - return vshlq_u64(a, b); -} - // CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_s8( // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c index 7e71ea4c00422..b4fbdcc5436ed 100644 --- a/clang/test/CodeGen/AArch64/neon/intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c @@ -2439,6 +2439,260 @@ uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); } +// LLVM-LABEL: test_vshl_s8 +// CIR-LABEL: vshl_s8 +int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<8 x !s8i>, !cir.vector<8 x !s8i>) -> !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// LLVM: ret <8 x i8> [[VSHL_V_I]] + return vshl_s8(a, b); +} + +// LLVM-LABEL: test_vshl_s16 +// CIR-LABEL: vshl_s16 +int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<4 x !s16i>, !cir.vector<4 x !s16i>) -> !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// LLVM: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// LLVM: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) +// LLVM: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <8 x i8> 
[[VSHL_V3_I]] to <4 x i16> +// LLVM: ret <4 x i16> [[TMP2]] + return vshl_s16(a, b); +} + +// LLVM-LABEL: test_vshl_s32 +// CIR-LABEL: vshl_s32 +int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<2 x !s32i>, !cir.vector<2 x !s32i>) -> !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// LLVM: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// LLVM: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) +// LLVM: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// LLVM: ret <2 x i32> [[TMP2]] + return vshl_s32(a, b); +} + +// LLVM-LABEL: test_vshl_s64 +// CIR-LABEL: vshl_s64 +int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<1 x !s64i>, !cir.vector<1 x !s64i>) -> !cir.vector<1 x !s64i> + +// LLVM: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// LLVM: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// LLVM: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// LLVM: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) +// LLVM: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 +// LLVM: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// LLVM-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] + return 
vshl_s64(a, b); +} + +// LLVM-LABEL: test_vshl_u8 +// CIR-LABEL: vshl_u8 +uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<8 x !u8i>, !cir.vector<8 x !u8i>) -> !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) #[[ATTR0]] { +// LLVM: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// LLVM: ret <8 x i8> [[VSHL_V_I]] + return vshl_u8(a, b); +} + +// LLVM-LABEL: test_vshl_u16 +// CIR-LABEL: vshl_u16 +uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<4 x !u16i>, !cir.vector<4 x !u16i>) -> !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// LLVM: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// LLVM: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) +// LLVM: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// LLVM: ret <4 x i16> [[TMP2]] + return vshl_u16(a, b); +} + +// LLVM-LABEL: test_vshl_u32 +// CIR-LABEL: vshl_u32 +uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<2 x !u32i>, !cir.vector<2 x !u32i>) -> !cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// LLVM: [[VSHL_V_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x i32> +// LLVM: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// LLVM: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) +// LLVM: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// LLVM: ret <2 x i32> [[TMP2]] + return vshl_u32(a, b); +} + +// LLVM-LABEL: test_vshl_u64 +// CIR-LABEL: vshl_u64 +uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<1 x !u64i>, !cir.vector<1 x !u64i>) -> !cir.vector<1 x !u64i> + +// LLVM-SAME: <1 x i64> {{.*}} [[A:%.*]], <1 x i64> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// LLVM: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// LLVM: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// LLVM: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) +// LLVM: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 +// LLVM: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// LLVM: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] + return vshl_u64(a, b); +} + +// LLVM-LABEL: test_vshlq_s8 +// CIR-LABEL: vshlq_s8 +int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>) -> !cir.vector<16 x !s8i> + +// LLVM: <16 x i8> {{.*}} [[A:%.*]], <16 x i8> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// LLVM: ret <16 x i8> [[VSHLQ_V_I]] + return vshlq_s8(a, 
b); +} + +// LLVM-LABEL: test_vshlq_s16 +// CIR-LABEL: vshlq_s16 +int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>) -> !cir.vector<8 x !s16i> + + +// LLVM: <8 x i16> {{.*}} [[A:%.*]], <8 x i16> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// LLVM: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// LLVM: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// LLVM: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) +// LLVM: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// LLVM: ret <8 x i16> [[TMP2]] +// + return vshlq_s16(a, b); +} + +// LLVM-LABEL: test_vshlq_s32 +// CIR-LABEL: vshlq_s32 +int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> + +// LLVM: <4 x i32> {{.*}} [[A:%.*]], <4 x i32> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// LLVM: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// LLVM: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LLVM: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) +// LLVM: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// LLVM: ret <4 x i32> [[TMP2]] + return vshlq_s32(a, b); +} + +// LLVM-LABEL: test_vshlq_s64 +// CIR-LABEL: vshlq_s64 +int64x2_t 
test_vshlq_s64(int64x2_t a, int64x2_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !cir.vector<2 x !s64i> + +// LLVM: <2 x i64> {{.*}} [[A:%.*]], <2 x i64> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// LLVM: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// LLVM: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LLVM: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) +// LLVM: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// LLVM: ret <2 x i64> [[TMP2]] + return vshlq_s64(a, b); +} + +// LLVM-LABEL: test_vshlq_u8 +// CIR-LABEL: vshlq_u8 +uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<16 x !u8i>, !cir.vector<16 x !u8i>) -> !cir.vector<16 x !u8i> + +// LLVM: <16 x i8> {{.*}} [[A:%.*]], <16 x i8> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// LLVM: ret <16 x i8> [[VSHLQ_V_I]] + return vshlq_u8(a, b); +} + +// LLVM-LABEL: test_vshlq_u16 +// CIR-LABEL: vshlq_u16 +uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<8 x !u16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !u16i> + + +// LLVM: <8 x i16> {{.*}} [[A:%.*]], <8 x i16> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// LLVM: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// LLVM: [[VSHLQ_V1_I:%.*]] = bitcast 
<16 x i8> [[TMP1]] to <8 x i16> +// LLVM: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) +// LLVM: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// LLVM: ret <8 x i16> [[TMP2]] +// + return vshlq_u16(a, b); +} + + +// LLVM-LABEL: test_vshlq_u32 +// CIR-LABEL: vshlq_u32 +uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<4 x !u32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i> + +// LLVM: <4 x i32> {{.*}} [[A:%.*]], <4 x i32> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// LLVM: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// LLVM: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// LLVM: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) +// LLVM: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// LLVM: ret <4 x i32> [[TMP2]] + return vshlq_u32(a, b); +} + +// LLVM-LABEL: test_vshlq_u64 +// CIR-LABEL: vshlq_u64 +uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { +// CIR: [[RES:%.*]] = cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!cir.vector<2 x !u64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !u64i> + +// LLVM: <2 x i64> {{.*}} [[A:%.*]], <2 x i64> {{.*}} [[B:%.*]]) #{{.*}} { +// LLVM: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// LLVM: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// LLVM: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// LLVM: [[VSHLQ_V2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) +// LLVM: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// LLVM: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// LLVM: ret <2 x i64> [[TMP2]] + return vshlq_u64(a, b); +} + //===------------------------------------------------------===// // 2.1.3.2.1 Vector shift right //===------------------------------------------------------===// From 170a788aac7fa98c89e4b4d38c89a680a36f9cf4 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Mon, 11 May 2026 19:38:32 +0200 Subject: [PATCH 319/538] [mlir][SPIR-V] Lower math.{exp2,log2} to GLSL.std.450 ops (#196994) Map: - math.exp2 -> spirv.GL.Exp2 - math.log2 -> spirv.GL.Log2 (before it was lowered through decomposition) --- .../Conversion/MathToSPIRV/MathToSPIRV.cpp | 82 ++++++++----------- .../MathToSPIRV/math-to-gl-spirv.mlir | 12 +-- 2 files changed, 42 insertions(+), 52 deletions(-) diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp index ce603d4a85072..01285c6c0ec09 100644 --- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp +++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp @@ -297,25 +297,18 @@ struct Log1pOpPattern final : public OpConversionPattern { } }; -/// Converts math.log2 and math.log10 to SPIR-V ops. +/// Converts math.log10 to GLSL SPIR-V ops. /// -/// SPIR-V does not have direct operations for log2 and log10. Explicitly -/// lower to these operations using: -/// log2(x) = log(x) * 1/log(2) +/// GLSL.std.450 has no Log10 instruction. 
Lower it as: /// log10(x) = log(x) * 1/log(10) +struct Log10OpPattern final : public OpConversionPattern { + using Base::Base; -template -struct Log2Log10OpPattern final : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - using typename OpConversionPattern::OpAdaptor; - - static constexpr double log2Reciprocal = - 1.442695040888963407359924681001892137426645954152985934135449407; static constexpr double log10Reciprocal = 0.4342944819032518276511289189166050822943970058036665661144537832; LogicalResult - matchAndRewrite(MathLogOp operation, OpAdaptor adaptor, + matchAndRewrite(math::Log10Op operation, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { assert(adaptor.getOperands().size() == 1); if (LogicalResult res = checkSourceOpTypes(rewriter, operation); @@ -342,14 +335,11 @@ struct Log2Log10OpPattern final : public OpConversionPattern { vectorType, FloatAttr::get(elemType, value).getValue())); } } - - llvm_unreachable("unimplemented types for log2/log10"); + llvm_unreachable("unimplemented type for log10"); }; - Value constantValue = getConstantValue( - std::is_same() ? 
log2Reciprocal - : log10Reciprocal); - Value log = SpirvLogOp::create(rewriter, loc, adaptor.getOperand()); + Value constantValue = getConstantValue(log10Reciprocal); + Value log = spirv::GLLogOp::create(rewriter, loc, adaptor.getOperand()); rewriter.replaceOpWithNewOp(operation, type, log, constantValue); return success(); @@ -530,34 +520,34 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter, typeConverter, patterns.getContext()); // GLSL patterns - patterns - .add, - Log2Log10OpPattern, - Log2Log10OpPattern, - ExpM1OpPattern, PowFOpPattern, RoundOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern>( - typeConverter, patterns.getContext()); + patterns.add< + CountLeadingZerosPattern, Log1pOpPattern, Log10OpPattern, + ExpM1OpPattern, PowFOpPattern, RoundOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + 
CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern>( + typeConverter, patterns.getContext()); // OpenCL patterns patterns.add< diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir index 608abffd8bd82..8eb533eeff2a9 100644 --- a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir +++ b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir @@ -12,6 +12,8 @@ func.func @float32_unary_scalar(%arg0: f32) { %1 = math.cos %arg0 : f32 // CHECK: spirv.GL.Exp %{{.*}}: f32 %2 = math.exp %arg0 : f32 + // CHECK: spirv.GL.Exp2 %{{.*}}: f32 + %exp2 = math.exp2 %arg0 : f32 // CHECK: %[[EXP:.+]] = spirv.GL.Exp %arg0 // CHECK: %[[ONE:.+]] = spirv.Constant 1.000000e+00 : f32 // CHECK: spirv.FSub %[[EXP]], %[[ONE]] @@ -22,9 +24,7 @@ func.func @float32_unary_scalar(%arg0: f32) { // CHECK: %[[ADDONE:.+]] = spirv.FAdd %[[ONE]], %{{.+}} // CHECK: spirv.GL.Log %[[ADDONE]] %5 = math.log1p %arg0 : f32 - // CHECK: %[[LOG2_RECIPROCAL:.+]] = spirv.Constant 1.44269502 : f32 - // CHECK: %[[LOG0:.+]] = spirv.GL.Log {{.+}} - // CHECK: spirv.FMul %[[LOG0]], %[[LOG2_RECIPROCAL]] + // CHECK: spirv.GL.Log2 %{{.*}}: f32 %6 = math.log2 %arg0 : f32 // CHECK: %[[LOG10_RECIPROCAL:.+]] = spirv.Constant 0.434294492 : f32 // CHECK: %[[LOG1:.+]] = spirv.GL.Log {{.+}} @@ -73,6 +73,8 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) { %1 = math.cos %arg0 : vector<3xf32> // CHECK: spirv.GL.Exp %{{.*}}: vector<3xf32> %2 = math.exp %arg0 : vector<3xf32> + // CHECK: spirv.GL.Exp2 %{{.*}}: vector<3xf32> + %exp2 = math.exp2 %arg0 : vector<3xf32> // CHECK: %[[EXP:.+]] = spirv.GL.Exp %arg0 // CHECK: %[[ONE:.+]] = spirv.Constant dense<1.000000e+00> : vector<3xf32> // CHECK: spirv.FSub %[[EXP]], %[[ONE]] @@ -83,9 +85,7 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) { // CHECK: %[[ADDONE:.+]] = 
spirv.FAdd %[[ONE]], %{{.+}} // CHECK: spirv.GL.Log %[[ADDONE]] %5 = math.log1p %arg0 : vector<3xf32> - // CHECK: %[[LOG2_RECIPROCAL:.+]] = spirv.Constant dense<1.44269502> : vector<3xf32> - // CHECK: %[[LOG0:.+]] = spirv.GL.Log {{.+}} - // CHECK: spirv.FMul %[[LOG0]], %[[LOG2_RECIPROCAL]] + // CHECK: spirv.GL.Log2 %{{.*}}: vector<3xf32> %6 = math.log2 %arg0 : vector<3xf32> // CHECK: %[[LOG10_RECIPROCAL:.+]] = spirv.Constant dense<0.434294492> : vector<3xf32> // CHECK: %[[LOG1:.+]] = spirv.GL.Log {{.+}} From 36f60d848643f9ebd8b0c957fb645bf6a8f68860 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 11 May 2026 19:39:17 +0200 Subject: [PATCH 320/538] [libc] s/in_addr/struct in_addr/ (#196937) in_addr should be a regular tagged struct, not a typedef-of-an-anonymous-struct. C++ doesn't care about this as much as C does, but even in C++ one cannot use `struct Foo` syntax to refer to a struct if it was not declared that way. --- libc/include/CMakeLists.txt | 2 +- libc/include/arpa/inet.yaml | 4 ++-- libc/include/llvm-libc-types/CMakeLists.txt | 2 +- .../llvm-libc-types/{in_addr.h => struct_in_addr.h} | 12 ++++++------ libc/src/arpa/inet/CMakeLists.txt | 4 ++-- libc/src/arpa/inet/inet_addr.cpp | 4 ++-- libc/src/arpa/inet/inet_aton.cpp | 2 +- libc/src/arpa/inet/inet_aton.h | 4 ++-- libc/test/src/arpa/inet/inet_aton_test.cpp | 2 +- 9 files changed, 18 insertions(+), 18 deletions(-) rename libc/include/llvm-libc-types/{in_addr.h => struct_in_addr.h} (62%) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index fac2b40ca45a8..e5f96ab19d9f1 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -205,8 +205,8 @@ add_header_macro( DEPENDS .llvm_libc_common_h .inttypes - .llvm-libc-types.in_addr .llvm-libc-types.in_addr_t + .llvm-libc-types.struct_in_addr ) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/netinet) diff --git a/libc/include/arpa/inet.yaml b/libc/include/arpa/inet.yaml index a0601d700d1b4..e6016152a1466 100644 --- 
a/libc/include/arpa/inet.yaml +++ b/libc/include/arpa/inet.yaml @@ -3,8 +3,8 @@ standards: - posix macros: [] types: - - type_name: in_addr - type_name: in_addr_t + - type_name: struct_in_addr enums: [] objects: [] functions: @@ -32,7 +32,7 @@ functions: return_type: int arguments: - type: const char * - - type: in_addr * + - type: struct in_addr * - name: ntohl standards: - POSIX diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index ad294279e0f03..e967b43d81df0 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -57,7 +57,7 @@ add_header(gid_t HDR gid_t.h) add_header(uid_t HDR uid_t.h) add_header(imaxdiv_t HDR imaxdiv_t.h) add_header(in_addr_t HDR in_addr_t.h) -add_header(in_addr HDR in_addr.h DEPENDS .in_addr_t) +add_header(struct_in_addr HDR struct_in_addr.h DEPENDS .in_addr_t) add_header(ino_t HDR ino_t.h) add_header(key_t HDR key_t.h) add_header(mbstate_t HDR mbstate_t.h) diff --git a/libc/include/llvm-libc-types/in_addr.h b/libc/include/llvm-libc-types/struct_in_addr.h similarity index 62% rename from libc/include/llvm-libc-types/in_addr.h rename to libc/include/llvm-libc-types/struct_in_addr.h index ab42142e323b2..2ff5694dfb173 100644 --- a/libc/include/llvm-libc-types/in_addr.h +++ b/libc/include/llvm-libc-types/struct_in_addr.h @@ -1,4 +1,4 @@ -//===-- Definition of in_addr type ----------------------------------------===// +//===-- Definition of struct in_addr --------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_TYPES_IN_ADDR_H -#define LLVM_LIBC_TYPES_IN_ADDR_H +#ifndef LLVM_LIBC_TYPES_STRUCT_IN_ADDR_H +#define LLVM_LIBC_TYPES_STRUCT_IN_ADDR_H #include "in_addr_t.h" -typedef struct { +struct in_addr { in_addr_t s_addr; -} in_addr; +}; -#endif // LLVM_LIBC_TYPES_IN_ADDR_H +#endif // LLVM_LIBC_TYPES_STRUCT_IN_ADDR_H diff --git a/libc/src/arpa/inet/CMakeLists.txt b/libc/src/arpa/inet/CMakeLists.txt index 3b3d0f43b8586..7b421c30e07fa 100644 --- a/libc/src/arpa/inet/CMakeLists.txt +++ b/libc/src/arpa/inet/CMakeLists.txt @@ -30,7 +30,7 @@ add_entrypoint_object( inet_aton.h DEPENDS libc.include.arpa_inet - libc.include.llvm-libc-types.in_addr + libc.include.llvm-libc-types.struct_in_addr libc.src.__support.common libc.src.__support.str_to_integer ) @@ -44,7 +44,7 @@ add_entrypoint_object( DEPENDS libc.include.arpa_inet libc.include.llvm-libc-macros.netinet_in_macros - libc.include.llvm-libc-types.in_addr + libc.include.llvm-libc-types.struct_in_addr libc.include.llvm-libc-types.in_addr_t libc.src.__support.common libc.src.arpa.inet.inet_aton diff --git a/libc/src/arpa/inet/inet_addr.cpp b/libc/src/arpa/inet/inet_addr.cpp index 8ce88c0df8aec..00a4a5ad22ecd 100644 --- a/libc/src/arpa/inet/inet_addr.cpp +++ b/libc/src/arpa/inet/inet_addr.cpp @@ -8,15 +8,15 @@ #include "src/arpa/inet/inet_addr.h" #include "include/llvm-libc-macros/netinet-in-macros.h" -#include "include/llvm-libc-types/in_addr.h" #include "include/llvm-libc-types/in_addr_t.h" +#include "include/llvm-libc-types/struct_in_addr.h" #include "src/__support/common.h" #include "src/arpa/inet/inet_aton.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(in_addr_t, inet_addr, (const char *cp)) { - in_addr addr; + struct in_addr addr; return inet_aton(cp, &addr) ? 
addr.s_addr : INADDR_NONE; } diff --git a/libc/src/arpa/inet/inet_aton.cpp b/libc/src/arpa/inet/inet_aton.cpp index 71419cb9a00c8..c7f002ec393aa 100644 --- a/libc/src/arpa/inet/inet_aton.cpp +++ b/libc/src/arpa/inet/inet_aton.cpp @@ -13,7 +13,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, inet_aton, (const char *cp, in_addr *inp)) { +LLVM_LIBC_FUNCTION(int, inet_aton, (const char *cp, struct in_addr *inp)) { constexpr int IPV4_MAX_DOT_NUM = 3; unsigned long parts[IPV4_MAX_DOT_NUM + 1] = {0}; int dot_num = 0; diff --git a/libc/src/arpa/inet/inet_aton.h b/libc/src/arpa/inet/inet_aton.h index ea387d1f6b2f6..5d97ad5547016 100644 --- a/libc/src/arpa/inet/inet_aton.h +++ b/libc/src/arpa/inet/inet_aton.h @@ -9,12 +9,12 @@ #ifndef LLVM_LIBC_SRC_ARPA_INET_INET_ATON_H #define LLVM_LIBC_SRC_ARPA_INET_INET_ATON_H -#include "include/llvm-libc-types/in_addr.h" +#include "include/llvm-libc-types/struct_in_addr.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { -int inet_aton(const char *cp, in_addr *inp); +int inet_aton(const char *cp, struct in_addr *inp); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/arpa/inet/inet_aton_test.cpp b/libc/test/src/arpa/inet/inet_aton_test.cpp index c9c97870e0dff..7757ba7ad4d07 100644 --- a/libc/test/src/arpa/inet/inet_aton_test.cpp +++ b/libc/test/src/arpa/inet/inet_aton_test.cpp @@ -13,7 +13,7 @@ namespace LIBC_NAMESPACE_DECL { TEST(LlvmLibcInetAton, ValidTest) { - in_addr a; + struct in_addr a; // a.b.c.d a.s_addr = 0; From 220424715083cb0ea31e2fb5403136fec4030152 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Mon, 11 May 2026 10:50:33 -0700 Subject: [PATCH 321/538] [libc] C11 threads: Add cnd_timedwait and mtx_trylock. (#195966) Use recently added support for timed_wait in CndVar and try_lock in Mutex to implement the C11 functions `cnd_timedwait` and `mtx_trylock` (counterparts of `pthread_cond_timedwait` and `pthread_mutex_trylock`, respectively). 
Fixes a minor edge case in `ensure_monotonicity` conversion where the converted timeout could fall before epoch - clamp it to be 0 / epoch in that case. Assisted by: Gemini --- libc/config/linux/aarch64/entrypoints.txt | 2 + libc/config/linux/riscv/entrypoints.txt | 2 + libc/config/linux/x86_64/entrypoints.txt | 2 + libc/include/CMakeLists.txt | 1 + libc/include/threads.yaml | 15 ++++ libc/src/__support/threads/mutex.h | 4 +- libc/src/__support/time/clock_conversion.h | 2 +- libc/src/__support/time/monotonicity.h | 4 + libc/src/threads/CMakeLists.txt | 18 +++++ libc/src/threads/cnd_timedwait.h | 22 ++++++ libc/src/threads/linux/CMakeLists.txt | 14 ++++ libc/src/threads/linux/cnd_timedwait.cpp | 57 +++++++++++++ libc/src/threads/mtx_trylock.cpp | 27 +++++++ libc/src/threads/mtx_trylock.h | 21 +++++ .../integration/src/threads/CMakeLists.txt | 4 + .../test/integration/src/threads/cnd_test.cpp | 79 ++++++++++++++++++- .../test/integration/src/threads/mtx_test.cpp | 27 +++++++ 17 files changed, 294 insertions(+), 7 deletions(-) create mode 100644 libc/src/threads/cnd_timedwait.h create mode 100644 libc/src/threads/linux/cnd_timedwait.cpp create mode 100644 libc/src/threads/mtx_trylock.cpp create mode 100644 libc/src/threads/mtx_trylock.h diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 9994a9294173d..0c8fb3c8dbc15 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -1195,10 +1195,12 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.threads.cnd_destroy libc.src.threads.cnd_init libc.src.threads.cnd_signal + libc.src.threads.cnd_timedwait libc.src.threads.cnd_wait libc.src.threads.mtx_destroy libc.src.threads.mtx_init libc.src.threads.mtx_lock + libc.src.threads.mtx_trylock libc.src.threads.mtx_unlock libc.src.threads.thrd_create libc.src.threads.thrd_current diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 
2748b2b8e6a5d..99a5c820159f8 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1318,10 +1318,12 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.threads.cnd_destroy libc.src.threads.cnd_init libc.src.threads.cnd_signal + libc.src.threads.cnd_timedwait libc.src.threads.cnd_wait libc.src.threads.mtx_destroy libc.src.threads.mtx_init libc.src.threads.mtx_lock + libc.src.threads.mtx_trylock libc.src.threads.mtx_unlock libc.src.threads.thrd_create libc.src.threads.thrd_current diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 4b551ced82138..45fdde6454880 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1392,10 +1392,12 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.threads.cnd_destroy libc.src.threads.cnd_init libc.src.threads.cnd_signal + libc.src.threads.cnd_timedwait libc.src.threads.cnd_wait libc.src.threads.mtx_destroy libc.src.threads.mtx_init libc.src.threads.mtx_lock + libc.src.threads.mtx_trylock libc.src.threads.mtx_unlock libc.src.threads.thrd_create libc.src.threads.thrd_current diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index e5f96ab19d9f1..b1b4a4fd20982 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -340,6 +340,7 @@ add_header_macro( .llvm-libc-types.thrd_start_t .llvm-libc-types.tss_t .llvm-libc-types.tss_dtor_t + .llvm-libc-types.struct_timespec ) add_header_macro( diff --git a/libc/include/threads.yaml b/libc/include/threads.yaml index 6f7796e090ab6..1e8e42c80587c 100644 --- a/libc/include/threads.yaml +++ b/libc/include/threads.yaml @@ -13,6 +13,7 @@ types: - type_name: thrd_t - type_name: tss_t - type_name: tss_dtor_t + - type_name: struct_timespec enums: - name: mtx_plain value: null @@ -63,6 +64,14 @@ functions: return_type: int arguments: - type: cnd_t * + - name: cnd_timedwait + standards: + - stdc + return_type: int + arguments: + - type: cnd_t 
*__restrict + - type: mtx_t *__restrict + - type: const struct timespec *__restrict - name: cnd_wait standards: - stdc @@ -89,6 +98,12 @@ functions: return_type: int arguments: - type: mtx_t * + - name: mtx_trylock + standards: + - stdc + return_type: int + arguments: + - type: mtx_t * - name: mtx_unlock standards: - stdc diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h index d4d13093a2ce7..b2e2bda66eb8c 100644 --- a/libc/src/__support/threads/mutex.h +++ b/libc/src/__support/threads/mutex.h @@ -21,7 +21,7 @@ // Mutex with non-static methods having the following signature: // // MutexError lock(); -// MutexError trylock(); +// MutexError try_lock(); // MutexError timed_lock(...); // MutexError unlock(); // MutexError reset(); // Used to reset inconsistent robust mutexes. @@ -61,7 +61,7 @@ struct Mutex { LIBC_INLINE MutexError lock() { return MutexError::NONE; } LIBC_INLINE MutexError unlock() { return MutexError::NONE; } LIBC_INLINE MutexError reset() { return MutexError::NONE; } - LIBC_INLINE MutexError trylock() { return MutexError::NONE; } + LIBC_INLINE MutexError try_lock() { return MutexError::NONE; } LIBC_INLINE bool is_robust() const { return false; } }; diff --git a/libc/src/__support/time/clock_conversion.h b/libc/src/__support/time/clock_conversion.h index 54ea4b6ca1430..658ff52f046e8 100644 --- a/libc/src/__support/time/clock_conversion.h +++ b/libc/src/__support/time/clock_conversion.h @@ -48,7 +48,7 @@ LIBC_INLINE timespec convert_clock(timespec input, clockid_t from, output.tv_sec = input.tv_sec - from_time.tv_sec + to_time.tv_sec; output.tv_nsec = input.tv_nsec - from_time.tv_nsec + to_time.tv_nsec; - if (output.tv_nsec > 1_s_ns) { + if (output.tv_nsec >= 1_s_ns) { output.tv_sec++; output.tv_nsec -= 1_s_ns; } else if (output.tv_nsec < 0) { diff --git a/libc/src/__support/time/monotonicity.h b/libc/src/__support/time/monotonicity.h index a8c7a56f4c6cc..544f8c31db980 100644 --- 
a/libc/src/__support/time/monotonicity.h +++ b/libc/src/__support/time/monotonicity.h @@ -31,6 +31,10 @@ LIBC_INLINE void ensure_monotonicity(AbsTimeout &timeout) { convert_clock(timeout.get_timespec(), CLOCK_REALTIME, CLOCK_MONOTONIC), false); + // Clamp the timeout to epoch if becomes negative after the conversion. + if (!res.has_value() && res.error() == AbsTimeout::Error::BeforeEpoch) + res = AbsTimeout::from_timespec(timespec{0, 0}, false); + LIBC_ASSERT(res.has_value()); if (!res.has_value()) __builtin_unreachable(); diff --git a/libc/src/threads/CMakeLists.txt b/libc/src/threads/CMakeLists.txt index 17dea3920a07a..db9ab33bbbf62 100644 --- a/libc/src/threads/CMakeLists.txt +++ b/libc/src/threads/CMakeLists.txt @@ -117,6 +117,17 @@ add_entrypoint_object( libc.src.__support.threads.mutex ) +add_entrypoint_object( + mtx_trylock + SRCS + mtx_trylock.cpp + HDRS + mtx_trylock.h + DEPENDS + libc.include.threads + libc.src.__support.threads.mutex +) + add_entrypoint_object( mtx_unlock SRCS @@ -193,6 +204,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.cnd_wait ) +add_entrypoint_object( + cnd_timedwait + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.cnd_timedwait +) + add_entrypoint_object( cnd_signal ALIAS diff --git a/libc/src/threads/cnd_timedwait.h b/libc/src/threads/cnd_timedwait.h new file mode 100644 index 0000000000000..f15eb9f6d243e --- /dev/null +++ b/libc/src/threads/cnd_timedwait.h @@ -0,0 +1,22 @@ +//===-- Internal header for cnd_timedwait -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_THREADS_CND_TIMEDWAIT_H +#define LLVM_LIBC_SRC_THREADS_CND_TIMEDWAIT_H + +#include "src/__support/macros/config.h" +#include + +namespace LIBC_NAMESPACE_DECL { + +int cnd_timedwait(cnd_t *__restrict cond, mtx_t *__restrict mutex, + const struct timespec *__restrict time_point); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_THREADS_CND_TIMEDWAIT_H diff --git a/libc/src/threads/linux/CMakeLists.txt b/libc/src/threads/linux/CMakeLists.txt index c33c48919be2c..e4b38621bd699 100644 --- a/libc/src/threads/linux/CMakeLists.txt +++ b/libc/src/threads/linux/CMakeLists.txt @@ -48,6 +48,20 @@ add_entrypoint_object( libc.src.__support.threads.CndVar ) +add_entrypoint_object( + cnd_timedwait + SRCS + cnd_timedwait.cpp + HDRS + ../cnd_timedwait.h + DEPENDS + libc.include.threads + libc.src.__support.threads.mutex + libc.src.__support.threads.CndVar + libc.src.__support.time.abs_timeout + libc.src.__support.macros.null_check +) + add_entrypoint_object( cnd_signal SRCS diff --git a/libc/src/threads/linux/cnd_timedwait.cpp b/libc/src/threads/linux/cnd_timedwait.cpp new file mode 100644 index 0000000000000..853abd07c4cc8 --- /dev/null +++ b/libc/src/threads/linux/cnd_timedwait.cpp @@ -0,0 +1,57 @@ +//===-- Linux implementation of the cnd_timedwait function ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/threads/cnd_timedwait.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" +#include "src/__support/threads/CndVar.h" +#include "src/__support/threads/mutex.h" +#include "src/__support/time/abs_timeout.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +static_assert(sizeof(CndVar) == sizeof(cnd_t)); +static_assert(sizeof(Mutex) == sizeof(mtx_t) && + alignof(Mutex) == alignof(mtx_t)); + +LLVM_LIBC_FUNCTION(int, cnd_timedwait, + (cnd_t *__restrict cond, mtx_t *__restrict mtx, + const struct timespec *__restrict time_point)) { + LIBC_CRASH_ON_NULLPTR(time_point); + CndVar *cndvar = reinterpret_cast(cond); + Mutex *mutex = reinterpret_cast(mtx); + + // time_point is TIME_UTC-based, so we assume realtime clock here. + auto timeout = + internal::AbsTimeout::from_timespec(*time_point, /*realtime=*/true); + + if (!timeout.has_value()) { + switch (timeout.error()) { + case internal::AbsTimeout::Error::BeforeEpoch: + return thrd_timedout; + case internal::AbsTimeout::Error::Invalid: + return thrd_error; + } + __builtin_unreachable(); + } + + switch (cndvar->wait(mutex, timeout.value())) { + case CndVarResult::Success: + return thrd_success; + case CndVarResult::Timeout: + return thrd_timedout; + case CndVarResult::MutexError: + return thrd_error; + } + __builtin_unreachable(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/threads/mtx_trylock.cpp b/libc/src/threads/mtx_trylock.cpp new file mode 100644 index 0000000000000..a3cde97ee2b30 --- /dev/null +++ b/libc/src/threads/mtx_trylock.cpp @@ -0,0 +1,27 @@ +//===-- Linux implementation of the mtx_trylock function ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/threads/mtx_trylock.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/threads/mutex.h" + +#include // For mtx_t definition. + +namespace LIBC_NAMESPACE_DECL { + +// The implementation currently handles only plain mutexes. +LLVM_LIBC_FUNCTION(int, mtx_trylock, (mtx_t * mutex)) { + auto *m = reinterpret_cast(mutex); + auto err = m->try_lock(); + if (err == MutexError::BUSY) + return thrd_busy; + return err == MutexError::NONE ? thrd_success : thrd_error; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/threads/mtx_trylock.h b/libc/src/threads/mtx_trylock.h new file mode 100644 index 0000000000000..d6ba640a9d20a --- /dev/null +++ b/libc/src/threads/mtx_trylock.h @@ -0,0 +1,21 @@ +//===-- Internal header for mtx_trylock -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_THREADS_MTX_TRYLOCK_H +#define LLVM_LIBC_SRC_THREADS_MTX_TRYLOCK_H + +#include "src/__support/macros/config.h" +#include + +namespace LIBC_NAMESPACE_DECL { + +int mtx_trylock(mtx_t *mutex); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_THREADS_MTX_TRYLOCK_H diff --git a/libc/test/integration/src/threads/CMakeLists.txt b/libc/test/integration/src/threads/CMakeLists.txt index 68e346a26457a..3355a27a309cf 100644 --- a/libc/test/integration/src/threads/CMakeLists.txt +++ b/libc/test/integration/src/threads/CMakeLists.txt @@ -12,6 +12,7 @@ add_integration_test( libc.src.threads.mtx_destroy libc.src.threads.mtx_init libc.src.threads.mtx_lock + libc.src.threads.mtx_trylock libc.src.threads.mtx_unlock libc.src.threads.thrd_create libc.src.threads.thrd_join @@ -104,12 +105,14 @@ add_integration_test( cnd_test.cpp DEPENDS libc.include.threads + libc.include.time libc.src.__support.CPP.atomic libc.src.threads.cnd_init libc.src.threads.cnd_broadcast libc.src.threads.cnd_signal libc.src.threads.cnd_destroy libc.src.threads.cnd_wait + libc.src.threads.cnd_timedwait libc.src.threads.mtx_destroy libc.src.threads.mtx_init libc.src.threads.mtx_lock @@ -117,4 +120,5 @@ add_integration_test( libc.src.threads.thrd_create libc.src.threads.thrd_join libc.src.threads.linux.threads_utils + libc.src.time.timespec_get ) diff --git a/libc/test/integration/src/threads/cnd_test.cpp b/libc/test/integration/src/threads/cnd_test.cpp index 4eaab9ac08bc1..297d95e3149a2 100644 --- a/libc/test/integration/src/threads/cnd_test.cpp +++ b/libc/test/integration/src/threads/cnd_test.cpp @@ -11,6 +11,7 @@ #include "src/threads/cnd_destroy.h" #include "src/threads/cnd_init.h" #include "src/threads/cnd_signal.h" +#include "src/threads/cnd_timedwait.h" #include "src/threads/cnd_wait.h" #include "src/threads/mtx_destroy.h" 
#include "src/threads/mtx_init.h" @@ -18,10 +19,20 @@ #include "src/threads/mtx_unlock.h" #include "src/threads/thrd_create.h" #include "src/threads/thrd_join.h" +#include "src/time/timespec_get.h" #include "test/IntegrationTest/test.h" #include +#include // for TIME_UTC + +static void add_ns(struct timespec &ts, long ns) { + ts.tv_nsec += ns; + if (ts.tv_nsec >= 1'000'000'000) { + ++ts.tv_sec; + ts.tv_nsec -= 1'000'000'000; + } +} namespace wait_notify_broadcast_test { @@ -100,6 +111,11 @@ namespace single_waiter_test { mtx_t waiter_mtx, main_thread_mtx; cnd_t waiter_cnd, main_thread_cnd; +enum class WaitMode { + Default, + Timed, +}; + int waiter_thread_func([[maybe_unused]] void *unused) { LIBC_NAMESPACE::mtx_lock(&waiter_mtx); @@ -113,7 +129,7 @@ int waiter_thread_func([[maybe_unused]] void *unused) { return 0x600D; } -void single_waiter_test() { +void single_waiter_test(WaitMode wait_mode) { ASSERT_EQ(LIBC_NAMESPACE::mtx_init(&waiter_mtx, mtx_plain), int(thrd_success)); ASSERT_EQ(LIBC_NAMESPACE::mtx_init(&main_thread_mtx, mtx_plain), @@ -126,8 +142,18 @@ void single_waiter_test() { thrd_t waiter_thread; LIBC_NAMESPACE::thrd_create(&waiter_thread, waiter_thread_func, nullptr); - ASSERT_EQ(LIBC_NAMESPACE::cnd_wait(&main_thread_cnd, &main_thread_mtx), - int(thrd_success)); + if (wait_mode == WaitMode::Default) { + ASSERT_EQ(LIBC_NAMESPACE::cnd_wait(&main_thread_cnd, &main_thread_mtx), + int(thrd_success)); + } else { + ASSERT_EQ(wait_mode, WaitMode::Timed); + struct timespec ts; + ASSERT_EQ(LIBC_NAMESPACE::timespec_get(&ts, TIME_UTC), TIME_UTC); + add_ns(ts, 50'000); + int result = + LIBC_NAMESPACE::cnd_timedwait(&main_thread_cnd, &main_thread_mtx, &ts); + ASSERT_TRUE(result == int(thrd_success) || result == int(thrd_timedout)); + } ASSERT_EQ(LIBC_NAMESPACE::mtx_unlock(&main_thread_mtx), int(thrd_success)); ASSERT_EQ(LIBC_NAMESPACE::mtx_lock(&waiter_mtx), int(thrd_success)); @@ -146,8 +172,53 @@ void single_waiter_test() { } // namespace single_waiter_test 
+namespace timed_wait_test { + +void timeout_test() { + cnd_t cnd; + mtx_t mtx; + ASSERT_EQ(LIBC_NAMESPACE::cnd_init(&cnd), int(thrd_success)); + ASSERT_EQ(LIBC_NAMESPACE::mtx_init(&mtx, mtx_plain), int(thrd_success)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_lock(&mtx), int(thrd_success)); + + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 0; + ASSERT_EQ(LIBC_NAMESPACE::cnd_timedwait(&cnd, &mtx, &ts), int(thrd_timedout)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_unlock(&mtx), int(thrd_success)); + + LIBC_NAMESPACE::cnd_destroy(&cnd); + LIBC_NAMESPACE::mtx_destroy(&mtx); +} + +void future_timeout_test() { + cnd_t cnd; + mtx_t mtx; + ASSERT_EQ(LIBC_NAMESPACE::cnd_init(&cnd), int(thrd_success)); + ASSERT_EQ(LIBC_NAMESPACE::mtx_init(&mtx, mtx_plain), int(thrd_success)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_lock(&mtx), int(thrd_success)); + + struct timespec ts; + ASSERT_EQ(LIBC_NAMESPACE::timespec_get(&ts, TIME_UTC), TIME_UTC); + add_ns(ts, 50'000); + ASSERT_EQ(LIBC_NAMESPACE::cnd_timedwait(&cnd, &mtx, &ts), int(thrd_timedout)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_unlock(&mtx), int(thrd_success)); + + LIBC_NAMESPACE::cnd_destroy(&cnd); + LIBC_NAMESPACE::mtx_destroy(&mtx); +} + +} // namespace timed_wait_test + TEST_MAIN() { wait_notify_broadcast_test::wait_notify_broadcast_test(); - single_waiter_test::single_waiter_test(); + single_waiter_test::single_waiter_test(single_waiter_test::WaitMode::Default); + single_waiter_test::single_waiter_test(single_waiter_test::WaitMode::Timed); + timed_wait_test::timeout_test(); + timed_wait_test::future_timeout_test(); return 0; } diff --git a/libc/test/integration/src/threads/mtx_test.cpp b/libc/test/integration/src/threads/mtx_test.cpp index 909c4f2c5c760..4609e97d25fa9 100644 --- a/libc/test/integration/src/threads/mtx_test.cpp +++ b/libc/test/integration/src/threads/mtx_test.cpp @@ -10,6 +10,7 @@ #include "src/threads/mtx_destroy.h" #include "src/threads/mtx_init.h" #include "src/threads/mtx_lock.h" +#include "src/threads/mtx_trylock.h" 
#include "src/threads/mtx_unlock.h" #include "src/threads/thrd_create.h" #include "src/threads/thrd_join.h" @@ -230,10 +231,36 @@ void multiple_waiters() { LIBC_NAMESPACE::mtx_destroy(&counter_lock); } +void trylock_test() { + mtx_t plain_mutex; + ASSERT_EQ(LIBC_NAMESPACE::mtx_init(&plain_mutex, mtx_plain), + int(thrd_success)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_trylock(&plain_mutex), int(thrd_success)); + ASSERT_EQ(LIBC_NAMESPACE::mtx_trylock(&plain_mutex), int(thrd_busy)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_unlock(&plain_mutex), int(thrd_success)); + + LIBC_NAMESPACE::mtx_destroy(&plain_mutex); + + mtx_t recursive_mutex; + ASSERT_EQ(LIBC_NAMESPACE::mtx_init(&recursive_mutex, mtx_recursive), + int(thrd_success)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_trylock(&recursive_mutex), int(thrd_success)); + ASSERT_EQ(LIBC_NAMESPACE::mtx_trylock(&recursive_mutex), int(thrd_success)); + + ASSERT_EQ(LIBC_NAMESPACE::mtx_unlock(&recursive_mutex), int(thrd_success)); + ASSERT_EQ(LIBC_NAMESPACE::mtx_unlock(&recursive_mutex), int(thrd_success)); + + LIBC_NAMESPACE::mtx_destroy(&recursive_mutex); +} + TEST_MAIN() { relay_counter(); wait_and_step(); recursive_mutex_test(); multiple_waiters(); + trylock_test(); return 0; } From a2ecad477dae379697626223794d30633d386e0d Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Mon, 11 May 2026 20:01:07 +0200 Subject: [PATCH 322/538] [AMDGPU] Add subtarget features for MTBUF and formatted MUBUF instructions. 
(#196315) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 33 +++++++----- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 -- .../atomic_optimizations_mul_one.ll | 38 +++++++------- .../AMDGPU/load-local-redundant-copies.ll | 52 +++++++++---------- llvm/test/CodeGen/AMDGPU/mubuf.ll | 2 +- llvm/test/CodeGen/AMDGPU/wait.ll | 4 +- llvm/test/MC/AMDGPU/reg-syntax-extra.s | 1 - 7 files changed, 68 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c0647a5ce443e..dd7b4dee76c45 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -483,6 +483,14 @@ defm SBarrierLeaveImm : AMDGPUSubtargetFeature<"s-barrier-leave-imm", "s_barrier_leave takes an immediate operand" >; +defm MTBUFInsts : AMDGPUSubtargetFeature<"mtbuf-insts", + "Has memory typed buffer instructions." +>; + +defm FormattedMUBUFInsts : AMDGPUSubtargetFeature<"formatted-mubuf-insts", + "Has formatted memory untyped buffer instructions." +>; + defm VMovB64Inst : AMDGPUSubtargetFeature<"v-mov-b64-inst", "Has v_mov_b64 instruction" >; @@ -1389,7 +1397,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, - FeatureSadInsts, FeatureCvtPkNormVOP2Insts, FeatureDX10ClampAndIEEEMode + FeatureSadInsts, FeatureCvtPkNormVOP2Insts, FeatureDX10ClampAndIEEEMode, + FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; @@ -1405,7 +1414,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts, - FeatureDX10ClampAndIEEEMode, FeatureInstCacheLineSize64 + FeatureDX10ClampAndIEEEMode, FeatureInstCacheLineSize64, + 
FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; @@ -1425,7 +1435,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts, FeatureDX10ClampAndIEEEMode, - FeatureInstCacheLineSize64 + FeatureInstCacheLineSize64, FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; @@ -1448,7 +1458,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts, FeatureDX10ClampAndIEEEMode, - FeatureInstCacheLineSize64 + FeatureInstCacheLineSize64, FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; @@ -1476,7 +1486,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts, FeatureDX10ClampAndIEEEMode, FeatureFlatOffsetBits12, - FeatureInstCacheLineSize64 + FeatureInstCacheLineSize64, FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; @@ -1502,7 +1512,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts, - FeatureInstCacheLineSize128 + FeatureInstCacheLineSize128, FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; @@ -1554,7 +1564,8 @@ def FeatureGFX13 : GCNSubtargetFeatureGeneration<"GFX13", FeatureIEEEMinimumMaximumInsts, FeatureSALUMinimumMaximumInsts, FeatureMinimum3Maximum3F32, FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics, FeatureFlatOffsetBits24, - FeatureFlatSignedOffset, FeatureInstCacheLineSize128 + FeatureFlatSignedOffset, FeatureInstCacheLineSize128, + FeatureMTBUFInsts, FeatureFormattedMUBUFInsts ] >; 
//===----------------------------------------------------------------------===// @@ -2043,6 +2054,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureCvtPkNormVOP3Insts, FeatureNoF16PseudoScalarTransInlineConstants, FeatureRealTrue16Insts, + FeatureMTBUFInsts, + FeatureFormattedMUBUFInsts, ]>; def FeatureISAVersion12_50_Common : FeatureSet< @@ -2605,12 +2618,6 @@ def D16PreservesUnusedBits : def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; -def HasMTBUFInsts : Predicate<"Subtarget->hasMTBUFInsts()">, - AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>; - -def HasFormattedMUBUFInsts : Predicate<"Subtarget->hasFormattedMUBUFInsts()">, - AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>; - def HasExportInsts : Predicate<"Subtarget->hasExportInsts()">, AssemblerPredicate<(any_of FeatureGFX13Insts, (all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts)))>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 2e2797269fbcf..5f580ac0577d5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -359,10 +359,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasAtomicCSub() const { return HasGFX10_BEncoding; } - bool hasMTBUFInsts() const { return !hasGFX1250Insts(); } - - bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); } - bool hasExportInsts() const { return !hasGFX940Insts() && !hasGFX1250Insts(); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll index 65bc2d73b36b6..4cb566721348f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; 
RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s -; RUN: llc -global-isel -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) @@ -43,9 +43,9 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) { ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_add v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm .entry: @@ -139,9 +139,9 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { ; GCN-NEXT: s_cbranch_execz .LBB2_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_sub v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB2_2: ; GCN-NEXT: s_endpgm .entry: @@ -237,9 +237,9 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_xor v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB4_2: ; GCN-NEXT: s_endpgm .entry: @@ -337,9 +337,9 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: s_cbranch_execz .LBB6_2 ; 
GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_add v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB6_2: ; GCN-NEXT: s_endpgm .entry: @@ -437,9 +437,9 @@ define amdgpu_cs void @atomic_ptr_sub(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: s_cbranch_execz .LBB8_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_sub v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB8_2: ; GCN-NEXT: s_endpgm .entry: @@ -539,9 +539,9 @@ define amdgpu_cs void @atomic_ptr_xor(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_xor v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB10_2: ; GCN-NEXT: s_endpgm .entry: diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll index 157b4fbe6803d..eb3386b64e72b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s ; Test that checks for redundant copies to temporary stack slot produced by ; expandUnalignedLoad. 
@@ -9,15 +9,15 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2 ; CHECK: ; %bb.0: ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; CHECK-NEXT: ds_read_b32 v3, v1 ; CHECK-NEXT: ds_read_b32 v2, v2 ; CHECK-NEXT: ds_read_b32 v1, v4 ; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: exp mrt0, off, off, off, off -; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen ; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float poison, float poison, float poison, float poison, i1 false, i1 false) @@ -36,8 +36,8 @@ define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: ds_read_b32 v2, v2 ; CHECK-NEXT: ds_read_b32 v5, v4 ; CHECK-NEXT: ds_read_b32 v4, v6 @@ -62,43 +62,43 @@ define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8) inreg %arg3, i32 %arg4, ptr addrspace(3) %arg5, ptr addrspace(3) %arg6) { ; CHECK-LABEL: test_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s7, s5 -; CHECK-NEXT: s_mov_b32 s6, s4 -; CHECK-NEXT: s_mov_b32 s5, s3 -; CHECK-NEXT: s_mov_b32 s4, s2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v1 -; CHECK-NEXT: v_mov_b32_e32 
v10, s0 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v1 ; CHECK-NEXT: ds_read_b32 v6, v0 ; CHECK-NEXT: ds_read_b32 v5, v3 ; CHECK-NEXT: ds_read_b32 v4, v4 ; CHECK-NEXT: ds_read_b32 v8, v7 ; CHECK-NEXT: ds_read_b32 v7, v9 ; CHECK-NEXT: ds_read_b32 v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v2 +; CHECK-NEXT: s_mov_b32 s7, s5 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s5, s3 +; CHECK-NEXT: s_mov_b32 s4, s2 +; CHECK-NEXT: v_mov_b32_e32 v9, s0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc -; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc +; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc +; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v2 ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: ds_read_b32 v5, v11 -; CHECK-NEXT: ds_read_b32 v4, v12 -; CHECK-NEXT: ds_read_b32 v3, v0 -; CHECK-NEXT: ds_read_b32 v1, v1 -; CHECK-NEXT: ds_read_b32 v0, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 4, v2 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 20, v2 +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2 +; CHECK-NEXT: ds_read_b32 v5, v0 +; CHECK-NEXT: ds_read_b32 v4, v1 +; CHECK-NEXT: ds_read_b32 v3, v3 +; CHECK-NEXT: ds_read_b32 v1, v6 +; CHECK-NEXT: ds_read_b32 v0, v7 ; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: exp 
mrt0, off, off, off, off ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc -; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc +; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc +; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc +; CHECK-NEXT: exp mrt0, off, off, off, off ; CHECK-NEXT: s_endpgm %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4 %vec11 = shufflevector <6 x float> %load1, <6 x float> poison, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll index 2f59d75800b26..438b63b559de9 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -show-mc-encoding < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck %s ;;;==========================================================================;;; ;;; MUBUF LOAD TESTS diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll index 10090e31d5788..04f6df8042a2c 100644 --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT -; RUN: llc -mtriple=amdgcn --misched=ilpmax < %s | FileCheck -strict-whitespace %s 
--check-prefix=ILPMAX +; RUN: llc -mtriple=amdgcn --misched=ilpmax -mcpu=tahiti < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX ; RUN: llc -mtriple=amdgcn --misched=ilpmax -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX ; The ilpmax scheduler is used for the second test to get the ordering we want for the test. diff --git a/llvm/test/MC/AMDGPU/reg-syntax-extra.s b/llvm/test/MC/AMDGPU/reg-syntax-extra.s index 77d74b043f819..dc77f4d89fa01 100644 --- a/llvm/test/MC/AMDGPU/reg-syntax-extra.s +++ b/llvm/test/MC/AMDGPU/reg-syntax-extra.s @@ -1,5 +1,4 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 -// RUN: not llvm-mc -triple=amdgcn -show-encoding %s | FileCheck --check-prefixes=GCN,SICI %s // RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti -show-encoding %s | FileCheck --check-prefixes=GCN,SICI %s // RUN: not llvm-mc -triple=amdgcn -mcpu=fiji -show-encoding %s | FileCheck --check-prefixes=GCN,VI %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck --check-prefixes=GCN,GFX10 %s From 4d12a16b78391f5542e1bc1e748927f1f5cc6b44 Mon Sep 17 00:00:00 2001 From: Roy Shi Date: Mon, 11 May 2026 11:04:25 -0700 Subject: [PATCH 323/538] [gsymutil] Fix crash caused by infinite recursion in `unwrapReferencedTypedefType()` (#196448) Fixing a crash in `llvm-gsymutil` caused by infinite recursion in `unwrapReferencedTypedefType()` when unwrapping self-referencing typedef cycle in DWARF, which was created by #168734. Later, #195749 fixed the DWARF generation, but `llvm-gsymutil` can still crash on already generated bad DWARFs. To prevent `llvm-gsymutil` from crashing, a `DenseSet` is used to detect cycle. When it does, it returns the typedef that forms the cycle instead of continuing to unwrap infinitely. The implementation is changed to a loop instead of recursion for performance reasons. 
A test `TestDWARFTypedefCycleDoesNotCrash` is added to repro the issue and to confirm the fix. --- .../Plugins/SymbolFile/DWARF/DWARFBaseDIE.h | 2 + .../llvm/DebugInfo/DWARF/DWARFTypePrinter.h | 31 ++-- llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp | 156 ++++++++++++++++++ 3 files changed, 179 insertions(+), 10 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h index d92de658a49e8..9736b80ac375f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h @@ -123,6 +123,8 @@ class DWARFBaseDIE { // LLVM libraries. dw_tag_t getTag() const { return Tag(); } + dw_offset_t getOffset() const { return GetOffset(); } + const char *getShortName() const { return GetName(); } protected: diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h index 9986aaabf6ed4..3ca0767574670 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h @@ -9,10 +9,12 @@ #ifndef LLVM_DEBUGINFO_DWARF_DWARFTYPEPRINTER_H #define LLVM_DEBUGINFO_DWARF_DWARFTYPEPRINTER_H +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Support/Error.h" +#include "llvm/Support/WithColor.h" #include @@ -169,20 +171,29 @@ const char *toString(std::optional F) { } /// Resolve the DW_AT_type of \c D until we reach a DIE that is not a -/// DW_TAG_typedef. +/// DW_TAG_typedef. Gives up if a cycle is detected in malformed DWARF. +/// In this case, returns the typedef DIE where the cycle is formed. 
template DieType unwrapReferencedTypedefType(DieType D) { - auto TypeAttr = D.find(dwarf::DW_AT_type); - if (!TypeAttr) - return DieType(); + SmallSet Visited; + while (true) { + auto TypeAttr = D.find(dwarf::DW_AT_type); + if (!TypeAttr) + return DieType(); - auto Unwrapped = detail::resolveReferencedType(D, *TypeAttr); - if (!Unwrapped) - return DieType(); + auto Unwrapped = detail::resolveReferencedType(D, *TypeAttr); + if (!Unwrapped || Unwrapped.getTag() != dwarf::DW_TAG_typedef) + return Unwrapped; - if (Unwrapped.getTag() == dwarf::DW_TAG_typedef) - return unwrapReferencedTypedefType(Unwrapped); + if (!Visited.insert(Unwrapped.getOffset()).second) { + WithColor::warning() + << "typedef cycle detected: DW_TAG_typedef at offset 0x" + << utohexstr(Unwrapped.getOffset()) + << " references itself through DW_TAG_typedef chain\n"; + return Unwrapped; + } - return Unwrapped; + D = Unwrapped; + } } } // namespace detail diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp index 299e19e3c896e..b7abdd94ce645 100644 --- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp +++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp @@ -9,6 +9,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" #include "llvm/DebugInfo/GSYM/DwarfTransformer.h" #include "llvm/DebugInfo/GSYM/ExtractRanges.h" #include "llvm/DebugInfo/GSYM/FileEntry.h" @@ -5687,3 +5688,158 @@ TEST(GSYMTest, TestMergedFunctionsInfoLargeOffsets) { EXPECT_EQ(DecResult->MergedFunctions[0].Name, LargeName1); EXPECT_EQ(DecResult->MergedFunctions[1].Name, LargeName2); } + +TEST(GSYMTest, TestDWARFTypedefCycleDoesNotCrash) { + // Test that a self-referencing typedef cycle in DWARF does not cause + // infinite recursion in DWARFTypePrinter::unwrapReferencedTypedefType(). 
+ // This can happen when dsymutil's classic linker incorrectly deduplicates + // typedefs with the same name but different underlying types (e.g. from + // preferred_name), creating a typedef that points to itself. + // + // The crash path: DWARFTypePrinter::appendUnqualifiedNameBefore sees a + // DW_AT_name with the _STN| prefix (simplified template name), calls + // appendTemplateParameters, which for each DW_TAG_template_type_parameter + // calls unwrapReferencedTypedefType. With a cyclic typedef, this recurses + // infinitely. + // + // debug_info layout (DWARF32, AddrSize=8): + // 0x00: unit_length (4 bytes) + // 0x04: version=4 (2), abbrev_offset (4), addr_size=8 (1) = 7 bytes + // 0x0B: CU DIE (abbrev 1): strp(4) + addr(8) + addr(8) + data2(2) = 23 + // 0x22: Subprogram DIE (abbrev 2): strp(4) + addr(8) + addr(8) = 21 + // 0x37: Template param DIE (abbrev 3): strp(4) + ref4(4) = 9 + // 0x40: null terminator (1 byte, end of subprogram children) + // 0x41: Typedef DIE (abbrev 4): strp(4) + ref4(4) = 9 + // 0x4A: null terminator (1 byte, end of CU children) + // + // Template param's DW_AT_type -> 0x41 (typedef) + // Typedef's DW_AT_type -> 0x41 (self-referencing cycle) + + // String table: "" (0x00), "/tmp/main.cpp" (0x01), "_STN|foo|" + // (0x0F), "T" (0x21), "MyType" (0x23) + StringRef yamldata = R"( + debug_str: + - '' + - /tmp/main.cpp + - '_STN|foo|' + - T + - MyType + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Code: 0x00000003 + Tag: 
DW_TAG_template_type_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x00000004 + Tag: DW_TAG_typedef + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - Value: 0x0000000000000004 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000F + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000003 + Values: + - Value: 0x0000000000000021 + - Value: 0x0000000000000041 + - AbbrCode: 0x00000000 + - AbbrCode: 0x00000004 + Values: + - Value: 0x0000000000000023 + - Value: 0x0000000000000041 + - AbbrCode: 0x00000000 + )"; + auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); + ASSERT_THAT_EXPECTED(ErrOrSections, Succeeded()); + std::unique_ptr DwarfContext = + DWARFContext::create(*ErrOrSections, 8); + ASSERT_TRUE(DwarfContext.get() != nullptr); + + // Verify the typedef DIE is at offset 0x41 and self-references. + auto &CUDie = *DwarfContext->compile_units().begin(); + DWARFDie CURoot = CUDie->getUnitDIE(false); + ASSERT_TRUE(CURoot.isValid()); + // Walk children to find the typedef and verify the cycle. + bool FoundTypedef = false; + for (DWARFDie Child : CURoot.children()) { + if (Child.getTag() == dwarf::DW_TAG_typedef) { + EXPECT_EQ(Child.getOffset(), 0x41u); + auto TypeAttr = Child.find(dwarf::DW_AT_type); + ASSERT_TRUE(TypeAttr.has_value()); + auto RefDie = Child.getAttributeValueAsReferencedDie(*TypeAttr); + EXPECT_EQ(RefDie.getOffset(), Child.getOffset()); + FoundTypedef = true; + } + } + ASSERT_TRUE(FoundTypedef); + + // Exercise DWARFTypePrinter on the subprogram with the _STN| name. 
+ // appendUnqualifiedName -> appendTemplateParameters -> + // unwrapReferencedTypedefType must not infinitely recurse. + for (DWARFDie Child : CURoot.children()) { + if (Child.getTag() == dwarf::DW_TAG_subprogram) { + std::string Result; + raw_string_ostream StrOS(Result); + DWARFTypePrinter(StrOS).appendUnqualifiedName(Child); + EXPECT_FALSE(Result.empty()); + } + } + + // Also verify DwarfTransformer::convert() succeeds. + auto &OS = llvm::nulls(); + OutputAggregator OSAgg(&OS); + GsymCreatorV1 GC; + DwarfTransformer DT(*DwarfContext, GC); + ASSERT_THAT_ERROR(DT.convert(1, OSAgg), Succeeded()); + ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded()); + SmallString<512> Str; + raw_svector_ostream OutStrm(Str); + FileWriter FW(OutStrm, llvm::endianness::native); + FW.setStringOffsetSize(GC.getStringOffsetSize()); + ASSERT_THAT_ERROR(GC.encode(FW), Succeeded()); + auto GROrErr = GsymReader::copyBuffer(OutStrm.str()); + ASSERT_THAT_EXPECTED(GROrErr, Succeeded()); + const std::unique_ptr &GR = *GROrErr; + EXPECT_EQ(GR->getNumAddresses(), 1u); +} From d3832dea18dcd73d89634e3a9c63f138192b5f9e Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Mon, 11 May 2026 11:09:44 -0700 Subject: [PATCH 324/538] [lldb] Fix TestPtrauthBRAADiagnostic.py (#196406) The test does actually detect the pointer authentication issue and should check for that. 
--- .../functionalities/ptrauth_diagnostics/BRAA_error/braa.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/test/API/functionalities/ptrauth_diagnostics/BRAA_error/braa.c b/lldb/test/API/functionalities/ptrauth_diagnostics/BRAA_error/braa.c index c53f4725164e6..c1da569d59bc6 100644 --- a/lldb/test/API/functionalities/ptrauth_diagnostics/BRAA_error/braa.c +++ b/lldb/test/API/functionalities/ptrauth_diagnostics/BRAA_error/braa.c @@ -3,9 +3,7 @@ void foo() {} int main() { //% self.filecheck("c", "braa.c") // CHECK: stop reason = EXC_BAD_ACCESS - // - // TODO: We need call site info support for indirect calls to make this work. - // CHECK-NOT: pointer authentication failure + // CHECK-NEXT: pointer authentication failure asm volatile ( "mov x9, #0xbad \n" "braa %[target], x9 \n" @@ -19,6 +17,7 @@ int main() { // Expected codegen and exception message without ptrauth diagnostics: // * thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x2000000100007f9c) +// Note: Possible pointer authentication failure detected. // frame #0: 0x0000000100007f9c braa`foo // braa`foo: // 0x100007f9c <+0>: ret From 309205054157f36a5461355c682c323567fd0147 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Mon, 11 May 2026 11:10:24 -0700 Subject: [PATCH 325/538] [lldb] Confine TestDlopenOtherExecutable.py to Darwin-{x86_64,arm64} (#196405) First, I remove all the `skipIf` and `expectedFailure` in favor of `skipUnlessDarwin` because that appears to be the only supported platform here. Next, I limit the architectures to x86_64 and arm64. Opening other executables is a hack that works in limited circumstances. arm64e is not supported. 
--- .../dlopen_other_executable/TestDlopenOtherExecutable.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lldb/test/API/functionalities/dlopen_other_executable/TestDlopenOtherExecutable.py b/lldb/test/API/functionalities/dlopen_other_executable/TestDlopenOtherExecutable.py index 523ca7989fd8a..a845fc71f2378 100644 --- a/lldb/test/API/functionalities/dlopen_other_executable/TestDlopenOtherExecutable.py +++ b/lldb/test/API/functionalities/dlopen_other_executable/TestDlopenOtherExecutable.py @@ -6,14 +6,11 @@ class TestCase(TestBase): @skipIfRemote - @skipIfWindows + @no_debug_info_test + @skipIf(archs=no_match(["x86_64", "arm64$"])) # glibc's dlopen doesn't support opening executables. # https://sourceware.org/bugzilla/show_bug.cgi?id=11754 - @skipIfLinux - # freebsd's dlopen ditto - @expectedFailureAll(oslist=["freebsd"]) - @expectedFailureNetBSD - @no_debug_info_test + @skipUnlessDarwin def test(self): self.build() # Launch and stop before the dlopen call. From 53974557dc502a687c226203d3c2f491c4408a1d Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 14:23:57 -0400 Subject: [PATCH 326/538] [libc] Fix -Wshadow warning in atanf.h (#196850) x_d already exists in an outer scope and isn't modified if `x_abs < 0x3980'0000`, so no need to have a shadowing variable. No behavior change. 
--- libc/src/__support/math/atanf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/src/__support/math/atanf.h b/libc/src/__support/math/atanf.h index 633a95b02d5ff..d583572ea4727 100644 --- a/libc/src/__support/math/atanf.h +++ b/libc/src/__support/math/atanf.h @@ -67,7 +67,6 @@ LIBC_INLINE constexpr float atanf(float x) { #if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(x, -0x1.0p-25f, x); #else - double x_d = static_cast(x); return static_cast(fputil::multiply_add(x_d, -0x1.0p-25, x_d)); #endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } From ffa3b895132c092658b2be330d0935e30b7b948a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 14:24:16 -0400 Subject: [PATCH 327/538] [libc] Fix -Wshadow warning in raw_rwlock.h (#196852) --- libc/src/__support/threads/raw_rwlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/threads/raw_rwlock.h b/libc/src/__support/threads/raw_rwlock.h index 2c14645a2e5ce..9bc1b88cabd69 100644 --- a/libc/src/__support/threads/raw_rwlock.h +++ b/libc/src/__support/threads/raw_rwlock.h @@ -399,8 +399,8 @@ class RawRwLock { // reached. bool timeout_flag = false; if (!old.can_acquire(get_preference())) { - auto result = queue.wait(serial_number, timeout, is_pshared); - timeout_flag = (!result.has_value() && timeout.has_value()); + auto wait_result = queue.wait(serial_number, timeout, is_pshared); + timeout_flag = (!wait_result.has_value() && timeout.has_value()); } // Phase 7: unregister ourselves as a pending reader/writer. 
From 0100b524b5e3665f933d44cc983b9fe47a3d35c3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 14:24:31 -0400 Subject: [PATCH 328/538] [libc] Fix -Wshadow warnings in getopt.cpp (#196853) --- libc/src/unistd/getopt.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/unistd/getopt.cpp b/libc/src/unistd/getopt.cpp index 6da47e73926cc..84b962d0cdda8 100644 --- a/libc/src/unistd/getopt.cpp +++ b/libc/src/unistd/getopt.cpp @@ -191,9 +191,9 @@ static GetoptContext ctx{&impl::optarg, &impl::optind, &impl::optopt, #ifndef LIBC_COPT_PUBLIC_PACKAGING // This is used exclusively in tests. -void set_getopt_state(char **optarg, int *optind, int *optopt, unsigned *optpos, - int *opterr, FILE *errstream) { - ctx = {optarg, optind, optopt, optpos, opterr, errstream}; +void set_getopt_state(char **optarg_in, int *optind_in, int *optopt_in, + unsigned *optpos_in, int *opterr_in, FILE *errstream) { + ctx = {optarg_in, optind_in, optopt_in, optpos_in, opterr_in, errstream}; } #endif From aeea9650b355c27ff38d841468571d2f2e602221 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 14:24:45 -0400 Subject: [PATCH 329/538] [libc] Fix -Wshadow warnings in inline_strlen.h (#196855) --- libc/src/string/memory_utils/generic/inline_strlen.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libc/src/string/memory_utils/generic/inline_strlen.h b/libc/src/string/memory_utils/generic/inline_strlen.h index e9f1542f41424..833ff9fbcbf80 100644 --- a/libc/src/string/memory_utils/generic/inline_strlen.h +++ b/libc/src/string/memory_utils/generic/inline_strlen.h @@ -39,9 +39,8 @@ LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) { return cpp::find_first_set(shift_mask(mask, offset)); for (;;) { - cpp::simd chars = cpp::load>(++aligned, - /*aligned=*/true); - cpp::simd_mask mask = chars == null_byte; + chars = cpp::load>(++aligned, /*aligned=*/true); + mask = chars == null_byte; if 
(cpp::any_of(mask)) return (reinterpret_cast(aligned) - src) + cpp::find_first_set(mask); @@ -78,8 +77,8 @@ find_first_character(const unsigned char *s, unsigned char c, size_t n) { for (size_t bytes_checked = sizeof(Vector) - offset; bytes_checked < n; bytes_checked += sizeof(Vector)) { aligned++; - Vector chars = cpp::load(aligned, /*aligned=*/true); - Mask cmp_v = chars == c_byte; + chars = cpp::load(aligned, /*aligned=*/true); + cmp_v = chars == c_byte; if (cpp::any_of(cmp_v)) return calculate_find_first_character_return( reinterpret_cast(aligned), cmp_v, n - bytes_checked); From 53ff447b64186ff51181cd6050a9613983ff80be Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 11 May 2026 19:28:29 +0100 Subject: [PATCH 330/538] [GitHub] Support issue_comment workflows in issue_write (#192205) This is split off from #190010. We want to add a new workflow triggered whenever a comment is added to an issue (workflow_run.event == 'issue_comment'), that also writes a comment back via the issue_write workflow. However for issue_comment workflows, the head branch for the workflow won't be the head of the PR, but the default branch of the repository. So trying to fetch the PR based on the branch will fail. 
GitHub docs seem to recommend that the PR number is explicitly passed via an artifact in these cases: https://docs.github.com/en/actions/reference/workflows-and-actions/events-that-trigger-workflows#using-data-from-the-triggering-workflow This PR adds support for this so we can eventually leave comments from the test-suite.yml workflow --- .github/workflows/issue-write.yml | 98 ++++++++++++++++--------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index 7cd6918752148..fd092477bd9e0 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -22,7 +22,8 @@ jobs: permissions: pull-requests: write if: > - github.event.workflow_run.event == 'pull_request' && + (github.event.workflow_run.event == 'pull_request' || + github.event.workflow_run.event == 'issue_comment') && ( github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' @@ -50,10 +51,14 @@ jobs: script: | var fs = require('fs'); var comments = [] + var pr_number = 0 for (local_file of fs.readdirSync('.')) { if (local_file.startsWith("comments")) { comments.push(...JSON.parse(fs.readFileSync(local_file))) } + if (local_file.startsWith("pr_number")) { + pr_number = parseInt(fs.readFileSync(local_file), 10) + } } if (!comments || comments.length == 0) { return; @@ -68,60 +73,61 @@ jobs: console.log(runInfo); - // Query to find the number of the pull request that triggered this job. - // The associated pull requests are based off of the branch name, so if - // you create a pull request for a branch, close it, and then create - // another pull request with the same branch, then this query will return - // two associated pull requests. This is why we have to fetch all the - // associated pull requests and then iterate through them to find the - // one that is open. 
- const gql_query = ` - query($repo_owner : String!, $repo_name : String!, $branch: String!) { - repository(owner: $repo_owner, name: $repo_name) { - ref (qualifiedName: $branch) { - associatedPullRequests(first: 100) { - nodes { - baseRepository { - owner { - login + if (!pr_number) { + // Query to find the number of the pull request that triggered this job. + // The associated pull requests are based off of the branch name, so if + // you create a pull request for a branch, close it, and then create + // another pull request with the same branch, then this query will return + // two associated pull requests. This is why we have to fetch all the + // associated pull requests and then iterate through them to find the + // one that is open. + const gql_query = ` + query($repo_owner : String!, $repo_name : String!, $branch: String!) { + repository(owner: $repo_owner, name: $repo_name) { + ref (qualifiedName: $branch) { + associatedPullRequests(first: 100) { + nodes { + baseRepository { + owner { + login + } } + number + state } - number - state } } } } + ` + const gql_variables = { + repo_owner: runInfo.data.head_repository.owner.login, + repo_name: runInfo.data.head_repository.name, + branch: runInfo.data.head_branch } - ` - const gql_variables = { - repo_owner: runInfo.data.head_repository.owner.login, - repo_name: runInfo.data.head_repository.name, - branch: runInfo.data.head_branch - } - const gql_result = await github.graphql(gql_query, gql_variables); - console.log(gql_result); - // If the branch for the PR was deleted before this job has a chance - // to run, then the ref will be null. This can happen if someone: - // 1. Rebase the PR, which triggers some workflow. - // 2. Immediately merges the PR and deletes the branch. - // 3. The workflow finishes and triggers this job. 
- if (!gql_result.repository.ref) { - console.log("Ref has been deleted"); - return; - } - console.log(gql_result.repository.ref.associatedPullRequests.nodes); + const gql_result = await github.graphql(gql_query, gql_variables); + console.log(gql_result); + // If the branch for the PR was deleted before this job has a chance + // to run, then the ref will be null. This can happen if someone: + // 1. Rebase the PR, which triggers some workflow. + // 2. Immediately merges the PR and deletes the branch. + // 3. The workflow finishes and triggers this job. + if (!gql_result.repository.ref) { + console.log("Ref has been deleted"); + return; + } + console.log(gql_result.repository.ref.associatedPullRequests.nodes); - var pr_number = 0; - gql_result.repository.ref.associatedPullRequests.nodes.forEach((pr) => { + gql_result.repository.ref.associatedPullRequests.nodes.forEach((pr) => { - // The largest PR number is the one we care about. The only way - // to have more than one associated pull requests is if all the - // old pull requests are in the closed state. - if (pr.baseRepository.owner.login = context.repo.owner && pr.number > pr_number) { - pr_number = pr.number; - } - }); + // The largest PR number is the one we care about. The only way + // to have more than one associated pull requests is if all the + // old pull requests are in the closed state. + if (pr.baseRepository.owner.login = context.repo.owner && pr.number > pr_number) { + pr_number = pr.number; + } + }); + } if (pr_number == 0) { console.log("Error retrieving pull request number"); return; From 88f0e3a5807195a7b43158b14a2140f02b150866 Mon Sep 17 00:00:00 2001 From: Will Hawkins Date: Mon, 11 May 2026 14:34:14 -0400 Subject: [PATCH 331/538] [lldb] Support list-ing Source Embedded in DWARF (#191802) Compiled programs that embed their source code into their debugging information (using, e.g., DW_LNCT_LLVM_source) should display that code during debugging sessions. 
Fixes #191801 --- lldb/source/Core/SourceManager.cpp | 7 ++++- .../Commands/command-source-embedded.test | 29 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 lldb/test/Shell/Commands/command-source-embedded.test diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index 4635d5e5eee4e..76bc9687eb45b 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -582,7 +582,12 @@ void SourceManager::File::CommonInitializerImpl(SupportFileNSP support_file_nsp, } void SourceManager::File::SetSupportFile(SupportFileNSP support_file_nsp) { - FileSpec file_spec = support_file_nsp->GetSpecOnly(); + // Use Materialize here to allow for the possibility of support files + // that may have special semantics for "generating" a file spec from + // a support file (e.g., DWARF with embedded source through + // DW_LNCT_LLVM_source). + FileSpec file_spec = support_file_nsp->Materialize(); + resolve_tilde(file_spec); m_support_file_nsp = std::make_shared(file_spec, support_file_nsp->GetChecksum()); diff --git a/lldb/test/Shell/Commands/command-source-embedded.test b/lldb/test/Shell/Commands/command-source-embedded.test new file mode 100644 index 0000000000000..183447d1aa0c5 --- /dev/null +++ b/lldb/test/Shell/Commands/command-source-embedded.test @@ -0,0 +1,29 @@ +# Test automatically listing source on break for binary with embedded source. + +# When a program with embedded source being debugged reaches a breakpoint, its +# source code should be listed. This test prevents a regression identified in #191801. 
+ +# RUN: split-file %s %t +# RUN: %clang_host -g -gdwarf -gembed-source %t/main.c -o %t.out +# RUN: %lldb -x -b -s %t/commands %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s + +#--- main.c +int main() { + return 0; +} + +#--- commands +break set -n main +# CHECK-LABEL: break set -n main +# CHECK-NEXT: Breakpoint 1: where = {{.*}}`main + +run +# CHECK-LABEL: run +# CHECK-NEXT: Process [[PID:.*]] launched +# CHECK-NEXT: Process [[PID]] stopped +# CHECK-NEXT: name = {{.*}}, stop reason = breakpoint 1.1 +# CHECK-NEXT: frame #0: {{.*}}`main at main.c:2:3 +# CHECK-NEXT: 1 int main() { +# CHECK-NEXT: -> 2 return 0; +# CHECK-NEXT 3 } From f86f5480cb4e2c6d7f40a648f9ae0a27359c18c6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 11 May 2026 11:54:49 -0700 Subject: [PATCH 332/538] [RISCV] Check for null LIS before trying to move AVL in canMutatePriorConfig. (#196673) If LIS is null then the VN info are null and we don't know if we need to move the AVL. Fixes an assertion like RegAllocFast.cpp:729: void (anonymous namespace)::RegAllocFastImpl::reloadAtBegin(MachineBasicBlock &): Assertion `(&MBB != &MBB.getParent()->front() || IgnoreMissingDefs) && "no reload in start block. Missing vreg def?"' failed. --- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 4 ++ .../CodeGen/RISCV/rvv/vsetvli-insert-O0.ll | 40 ++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index d21bd98d5dcb5..81ea9a8e62f1b 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -776,6 +776,10 @@ bool RISCVInsertVSETVLI::canMutatePriorConfig( VNInfo *VNI = getVNInfoFromReg(AVL.getReg(), MI, LIS); VNInfo *PrevVNI = getVNInfoFromReg(AVL.getReg(), PrevMI, LIS); if (!VNI || !PrevVNI || VNI != PrevVNI) { + // If LIS is null, we were not able to get the VNInfo so we don't know + // if the AVL def needs to be moved. 
+ if (!LIS) + return false; // If the AVL is defined by a load immediate instruction (ADDI x0, imm), // it can be moved earlier since it has no register dependencies. if (!AVL.getReg().isVirtual()) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll index 71710432b7e42..9e8a5f331172e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll @@ -7,8 +7,8 @@ define <2 x double> @fixed_length(<2 x double> %a, <2 x double> %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: # kill: def $v11 killed $v10 -; CHECK-NEXT: # kill: def $v9 killed $v8 +; CHECK-NEXT: # kill: def $v11 killed $v10 killed $vtype +; CHECK-NEXT: # kill: def $v9 killed $v8 killed $vtype ; CHECK-NEXT: # implicit-def: $v9 ; CHECK-NEXT: vfadd.vv v9, v8, v10 ; CHECK-NEXT: # implicit-def: $v8 @@ -147,3 +147,39 @@ entry: i64 7, i64 %2) ret %3 } + +define <64 x i8> @coalesce_avl_move_crash(<16 x i8> %0) { +; CHECK-LABEL: coalesce_avl_move_crash: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: # implicit-def: $v8m2 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: # implicit-def: $v12 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: # implicit-def: $v10m2 +; CHECK-NEXT: vmv1r.v v10, v12 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vmv1r.v v11, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 16 +; CHECK-NEXT: # implicit-def: $v12m4 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: # implicit-def: $v16m2 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: # implicit-def: $v8m4 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: vmv2r.v v8, v16 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma 
+; CHECK-NEXT: vmv2r.v v14, v16 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i8> %0, <16 x i8> zeroinitializer, <32 x i32> + %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %1, <64 x i32> + ret <64 x i8> %2 +} From 48c864a0bdd315f20acb1dbaf8e8bb37aab707e5 Mon Sep 17 00:00:00 2001 From: savchart Date: Mon, 11 May 2026 20:56:42 +0200 Subject: [PATCH 333/538] [FileCheck] Handle directives at EOF without a trailing newline (#196576) FileCheck could assert when a check directive ended at EOF without a trailing newline. After parsing the directive suffix, EOF can be a valid continuation point, so parsing now continues directly from `AfterSuffix`. Fixes #101582 --- llvm/lib/FileCheck/FileCheck.cpp | 12 ++++++------ llvm/test/FileCheck/check-empty-tag.txt | 12 ++++++++++++ llvm/test/FileCheck/check-eof-no-pattern.txt | 7 +++++++ 3 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 llvm/test/FileCheck/check-eof-no-pattern.txt diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index 4d3ac87b9c724..38c61cd60174d 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1881,18 +1881,18 @@ bool FileCheck::readCheckFile( assert(UsedPrefix.data() == Buffer.data() && "Failed to move Buffer's start forward, or pointed prefix outside " "of the buffer!"); + + const char *BufferEnd = Buffer.data() + Buffer.size(); assert(AfterSuffix.data() >= Buffer.data() && - AfterSuffix.data() < Buffer.data() + Buffer.size() && + AfterSuffix.data() <= BufferEnd && "Parsing after suffix doesn't start inside of buffer!"); + // Skip the buffer to the end of the parsed directive suffix. + Buffer = AfterSuffix; + // Location to use for error messages. const char *UsedPrefixStart = UsedPrefix.data(); - // Skip the buffer to the end of parsed suffix (or just prefix, if no good - // suffix was processed). 
- Buffer = AfterSuffix.empty() ? Buffer.drop_front(UsedPrefix.size()) - : AfterSuffix; - // Complain about misspelled directives. if (CheckTy == Check::CheckMisspelled) { StringRef UsedDirective(UsedPrefix.data(), diff --git a/llvm/test/FileCheck/check-empty-tag.txt b/llvm/test/FileCheck/check-empty-tag.txt index c398bcf596fcf..462eb127e69dc 100644 --- a/llvm/test/FileCheck/check-empty-tag.txt +++ b/llvm/test/FileCheck/check-empty-tag.txt @@ -31,6 +31,18 @@ CHECK3B: found non-empty check string for empty check with prefix 'CHECK3A:' CHECK4A-EMPTY: CHECK4B: found 'CHECK4A-EMPTY' without previous 'CHECK4A: line +; CHECK-EMPTY at EOF without trailing newline cannot be the first check. +; RUN: printf "# CHECK-EMPTY:" > %t.no-newline-fail.chk +; RUN: %ProtectFileCheckOutput \ +; RUN: not FileCheck --allow-empty --input-file=/dev/null %t.no-newline-fail.chk 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK4C +CHECK4C: found 'CHECK-EMPTY' without previous 'CHECK: line + +; CHECK-EMPTY at EOF without trailing newline still works after a prior check. +; RUN: printf "foo\n\n" > %t.no-newline-success.in +; RUN: printf "CHECK: foo\nCHECK-EMPTY:" > %t.no-newline-success.chk +; RUN: FileCheck --input-file=%t.no-newline-success.in %t.no-newline-success.chk + ; CHECK-EMPTY-NOT and CHECK-NOT-EMPTY rejected ; RUN: %ProtectFileCheckOutput \ ; RUN: not FileCheck %s --input-file %s --check-prefixes=CHECK5A 2>&1 \ diff --git a/llvm/test/FileCheck/check-eof-no-pattern.txt b/llvm/test/FileCheck/check-eof-no-pattern.txt new file mode 100644 index 0000000000000..b037285fcf5b8 --- /dev/null +++ b/llvm/test/FileCheck/check-eof-no-pattern.txt @@ -0,0 +1,7 @@ +; CHECK requires a pattern, so the no-newline EOF case must fail here. 
+; RUN: printf "CHECK:" > %t.no-pattern.chk +; RUN: %ProtectFileCheckOutput \ +; RUN: not FileCheck --allow-empty --input-file=/dev/null %t.no-pattern.chk 2>&1 \ +; RUN: | FileCheck %s --check-prefix=ERR-EMPTY-CHECK + +ERR-EMPTY-CHECK: error: found empty check string From 78d124eb16aa62e02a465b0fd6c3c2cab0a26dd8 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 11 May 2026 13:58:02 -0500 Subject: [PATCH 334/538] [lldb] Assert that CommandObject::DoExecute sets a return status (#196589) Change the default value of CommandReturnObject::m_status from eReturnStatusStarted to eReturnStatusInvalid, and add a debug-only RAII check in CommandObjectParsed::Execute and CommandObjectRaw::Execute that asserts the status is no longer Invalid after DoExecute returns. This catches commands that forget to call SetStatus on a success or failure path. Succeeded() still returns true when the status is Invalid (0 sorts below eReturnStatusSuccessContinuingResult), so helpers that read result.Succeeded() as a precondition before any explicit SetStatus (e.g. StopProcessIfNecessary) continue to work. 
rdar://176506732 --- .../lldb/Interpreter/CommandReturnObject.h | 5 ++- lldb/source/Commands/CommandObjectProcess.cpp | 2 + .../Commands/CommandObjectProtocolServer.cpp | 3 +- lldb/source/Interpreter/CommandObject.cpp | 25 +++++++++++- .../Interpreter/CommandReturnObject.cpp | 2 +- .../command/script/TestCommandScript.py | 4 +- lldb/unittests/Interpreter/CMakeLists.txt | 1 + .../Interpreter/TestCommandReturnObject.cpp | 38 +++++++++++++++++++ 8 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 lldb/unittests/Interpreter/TestCommandReturnObject.cpp diff --git a/lldb/include/lldb/Interpreter/CommandReturnObject.h b/lldb/include/lldb/Interpreter/CommandReturnObject.h index 743bd94f5d73e..dccb98cd2be90 100644 --- a/lldb/include/lldb/Interpreter/CommandReturnObject.h +++ b/lldb/include/lldb/Interpreter/CommandReturnObject.h @@ -189,7 +189,10 @@ class CommandReturnObject { std::vector m_diagnostics; std::optional m_diagnostic_indent; - lldb::ReturnStatus m_status = lldb::eReturnStatusStarted; + /// The command's return status indicating success or failure. The default + /// value indicate no status has been set, which is enforced by an assert in + /// the CommandInterpreter. + lldb::ReturnStatus m_status = lldb::eReturnStatusInvalid; /// An optionally empty list of values produced by this command. 
ValueObjectList m_value_objects; diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 551f98566a9a5..d7d6a9152e377 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -927,7 +927,9 @@ class CommandObjectProcessConnect : public CommandObjectParsed { error); if (error.Fail() || process_sp == nullptr) { result.AppendError(error.AsCString("Error connecting to the process")); + return; } + result.SetStatus(eReturnStatusSuccessFinishResult); } CommandOptions m_options; diff --git a/lldb/source/Commands/CommandObjectProtocolServer.cpp b/lldb/source/Commands/CommandObjectProtocolServer.cpp index 1a950899ea1c0..73f717660e47d 100644 --- a/lldb/source/Commands/CommandObjectProtocolServer.cpp +++ b/lldb/source/Commands/CommandObjectProtocolServer.cpp @@ -92,8 +92,8 @@ class CommandObjectProtocolServerStart : public CommandObjectParsed { result.AppendMessageWithFormatv( "{0} server started with connection listeners: {1}", protocol, address); - result.SetStatus(eReturnStatusSuccessFinishNoResult); } + result.SetStatus(eReturnStatusSuccessFinishNoResult); } }; @@ -128,6 +128,7 @@ class CommandObjectProtocolServerStop : public CommandObjectParsed { result.AppendErrorWithFormatv("{0}", llvm::fmt_consume(std::move(error))); return; } + result.SetStatus(eReturnStatusSuccessFinishNoResult); } }; diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp index 3b3b2d7a302d9..dbdfc66204660 100644 --- a/lldb/source/Interpreter/CommandObject.cpp +++ b/lldb/source/Interpreter/CommandObject.cpp @@ -37,6 +37,26 @@ using namespace lldb; using namespace lldb_private; +namespace { +/// RAII scope that resets the result's status to eReturnStatusInvalid on entry +/// and asserts on exit that DoExecute changed it (directly via SetStatus, or +/// indirectly via AppendError/SetError, which call SetStatus internally). 
+class DoExecuteStatusCheck { +public: + explicit DoExecuteStatusCheck(CommandReturnObject &result) + : m_result(result) { + m_result.SetStatus(eReturnStatusInvalid); + } + ~DoExecuteStatusCheck() { + assert(m_result.GetStatus() != eReturnStatusInvalid && + "DoExecute did not set a status on the CommandReturnObject"); + } + +private: + CommandReturnObject &m_result; +}; +} // namespace + // CommandObject CommandObject::CommandObject(CommandInterpreter &interpreter, @@ -825,6 +845,7 @@ void CommandObjectParsed::Execute(const char *args_string, return; } m_interpreter.IncreaseCommandUsage(*this); + DoExecuteStatusCheck check(result); DoExecute(cmd_args, result); } } @@ -845,8 +866,10 @@ void CommandObjectRaw::Execute(const char *args_string, handled = InvokeOverrideCallback(argv, result); } if (!handled) { - if (CheckRequirements(result)) + if (CheckRequirements(result)) { + DoExecuteStatusCheck check(result); DoExecute(args_string, result); + } Cleanup(); } diff --git a/lldb/source/Interpreter/CommandReturnObject.cpp b/lldb/source/Interpreter/CommandReturnObject.cpp index a6b0d56c66cca..c21b9b3a92c05 100644 --- a/lldb/source/Interpreter/CommandReturnObject.cpp +++ b/lldb/source/Interpreter/CommandReturnObject.cpp @@ -181,7 +181,7 @@ void CommandReturnObject::Clear() { if (stream_sp) static_cast(stream_sp.get())->Clear(); m_diagnostics.clear(); - m_status = eReturnStatusStarted; + m_status = eReturnStatusInvalid; m_did_change_process_state = false; m_suppress_immediate_output = false; m_interactive = true; diff --git a/lldb/test/API/commands/command/script/TestCommandScript.py b/lldb/test/API/commands/command/script/TestCommandScript.py index fdd5216a1c6cc..eb1584c64c90d 100644 --- a/lldb/test/API/commands/command/script/TestCommandScript.py +++ b/lldb/test/API/commands/command/script/TestCommandScript.py @@ -214,8 +214,8 @@ def test_persistence(self): # valid. 
self.expect("script str(persistence.debugger_copy)", substrs=[str(self.dbg)]) # The result object will be replaced by an empty result object (in the - # "Started" state). - self.expect("script str(persistence.result_copy)", substrs=["Started"]) + # default "Invalid" state). + self.expect("script str(persistence.result_copy)", substrs=["Invalid"]) def test_interactive(self): """ diff --git a/lldb/unittests/Interpreter/CMakeLists.txt b/lldb/unittests/Interpreter/CMakeLists.txt index d4ba5b3d58334..7eec76105aad2 100644 --- a/lldb/unittests/Interpreter/CMakeLists.txt +++ b/lldb/unittests/Interpreter/CMakeLists.txt @@ -1,5 +1,6 @@ add_lldb_unittest(InterpreterTests TestCommandPaths.cpp + TestCommandReturnObject.cpp TestCompletion.cpp TestOptionArgParser.cpp TestOptions.cpp diff --git a/lldb/unittests/Interpreter/TestCommandReturnObject.cpp b/lldb/unittests/Interpreter/TestCommandReturnObject.cpp new file mode 100644 index 0000000000000..a8c66958e5d06 --- /dev/null +++ b/lldb/unittests/Interpreter/TestCommandReturnObject.cpp @@ -0,0 +1,38 @@ +//===-- TestCommandReturnObject.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Interpreter/CommandReturnObject.h" +#include "gtest/gtest.h" + +using namespace lldb; +using namespace lldb_private; + +TEST(CommandReturnObjectTest, DefaultStatusIsInvalid) { + CommandReturnObject result(/*colors=*/false); + EXPECT_EQ(result.GetStatus(), eReturnStatusInvalid); +} + +TEST(CommandReturnObjectTest, SetStatusUpdatesStatus) { + CommandReturnObject result(false); + result.SetStatus(eReturnStatusSuccessFinishResult); + EXPECT_EQ(result.GetStatus(), eReturnStatusSuccessFinishResult); +} + +TEST(CommandReturnObjectTest, AppendErrorSetsFailed) { + CommandReturnObject result(false); + result.AppendError("boom"); + EXPECT_EQ(result.GetStatus(), eReturnStatusFailed); +} + +TEST(CommandReturnObjectTest, ClearResetsToInvalid) { + CommandReturnObject result(false); + result.SetStatus(eReturnStatusSuccessFinishResult); + ASSERT_EQ(result.GetStatus(), eReturnStatusSuccessFinishResult); + result.Clear(); + EXPECT_EQ(result.GetStatus(), eReturnStatusInvalid); +} From af2d117d3bbe64997c6655d1256ca5104bd113a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 11 May 2026 11:59:43 -0700 Subject: [PATCH 335/538] [flang-rt] Return default allocator directly in device rt (#197009) --- flang-rt/lib/runtime/allocator-registry.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/flang-rt/lib/runtime/allocator-registry.cpp b/flang-rt/lib/runtime/allocator-registry.cpp index f8a8daaf8e748..d1e85c23549fc 100644 --- a/flang-rt/lib/runtime/allocator-registry.cpp +++ b/flang-rt/lib/runtime/allocator-registry.cpp @@ -27,6 +27,11 @@ RT_API_ATTRS void AllocatorRegistry::Register(int pos, Allocator_t allocator) { RT_API_ATTRS AllocFct 
AllocatorRegistry::GetAllocator(int pos) { INTERNAL_CHECK(pos >= 0 && pos < MAX_ALLOCATOR); +#ifdef RT_DEVICE_COMPILATION + if (pos == kDefaultAllocator) { + return &MallocWrapper; + } +#endif AllocFct f{allocators[pos].alloc}; INTERNAL_CHECK(f != nullptr); return f; @@ -34,6 +39,11 @@ RT_API_ATTRS AllocFct AllocatorRegistry::GetAllocator(int pos) { RT_API_ATTRS FreeFct AllocatorRegistry::GetDeallocator(int pos) { INTERNAL_CHECK(pos >= 0 && pos < MAX_ALLOCATOR); +#ifdef RT_DEVICE_COMPILATION + if (pos == kDefaultAllocator) { + return &FreeWrapper; + } +#endif FreeFct f{allocators[pos].free}; INTERNAL_CHECK(f != nullptr); return f; From 223a9455fd36f076ea564fb5c13517d47fdd8aa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Spaits?= Date: Mon, 11 May 2026 21:11:33 +0200 Subject: [PATCH 336/538] [GlobalISel][AArch64] Add lowering for G_SMULFIX (#196757) Adding lowering for G_SMULFIX G_OP. It is needed to compile `libc/src/stdfix/expk.cpp` with `-O3`. --- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 1 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 26 ++ .../AArch64/GISel/AArch64LegalizerInfo.cpp | 2 + .../AArch64/GlobalISel/legalize-smulfix.mir | 181 +++++++++++ .../GlobalISel/legalizer-info-validation.mir | 4 +- llvm/test/CodeGen/AArch64/smul_fix.ll | 295 +++++++++++++----- 6 files changed, 428 insertions(+), 81 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 8bcae5d15ad2c..9858f5e92de7a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -576,6 +576,7 @@ class LegalizerHelper { LLVM_ABI LegalizeResult lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0); LLVM_ABI LegalizeResult lowerVAArg(MachineInstr &MI); + LLVM_ABI LegalizeResult lowerSmulfix(MachineInstr &MI); }; } // End namespace llvm. 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 909decfb015b5..070d7ec4b963e 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4968,6 +4968,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { MI.eraseFromParent(); return Legalized; } + case G_SMULFIX: + return lowerSmulfix(MI); } } @@ -10654,6 +10656,30 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) { return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerSmulfix(MachineInstr &MI) { + auto [Dst, LHS, RHS] = MI.getFirst3Regs(); + LLT Ty = MRI.getType(Dst); + unsigned Scale = MI.getOperand(3).getImm(); + + if (Scale == 0) { + MIRBuilder.buildMul(Dst, LHS, RHS); + MI.eraseFromParent(); + return Legalized; + } + + LLT WideTy = Ty.changeElementSize(Ty.getScalarSizeInBits() * 2); + auto SExtLHS = MIRBuilder.buildSExt(WideTy, LHS); + auto SExtRHS = MIRBuilder.buildSExt(WideTy, RHS); + auto Mul = MIRBuilder.buildMul(WideTy, SExtLHS, SExtRHS); + auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Scale); + auto Shifted = MIRBuilder.buildAShr(WideTy, Mul, ShiftAmt); + MIRBuilder.buildTrunc(Dst, Shifted); + + MI.eraseFromParent(); + return Legalized; +} + static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { // On Darwin, -Os means optimize for size without hurting performance, so // only really optimize for size when -Oz (MinSize) is used. 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 14a3f7547fb09..4c7abbfb871af 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -297,6 +297,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor({i64, v16i8, v8i16, v4i32}) .lower(); + getActionDefinitionsBuilder(G_SMULFIX).lower(); + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({v8i8, v16i8, v4i16, v8i16, v2i32, v4i32}) .legalFor(HasCSSC, {i32, i64}) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir new file mode 100644 index 0000000000000..2b660e5064b03 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-smulfix.mir @@ -0,0 +1,181 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s +--- +name: smulfix_i32_scale_0 +body: | + bb.1: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: smulfix_i32_scale_0 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(i32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(i32) = COPY $w1 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(i32) = G_MUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: $w0 = COPY [[MUL]](i32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(i32) = COPY $w0 + %1:_(i32) = COPY $w1 + %2:_(i32) = G_SMULFIX %0, %1, 0 + $w0 = COPY %2(i32) + RET_ReallyLR implicit $w0 +... 
+--- +name: smulfix_i32 +body: | + bb.1: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: smulfix_i32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(i32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(i32) = COPY $w1 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(i64) = G_SEXT [[COPY]](i32) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(i64) = G_SEXT [[COPY1]](i32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(i64) = G_MUL [[SEXT]], [[SEXT1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(i64) = G_ASHR [[MUL]], [[C]](i64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(i32) = G_TRUNC [[ASHR]](i64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](i32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(i32) = COPY $w0 + %1:_(i32) = COPY $w1 + %2:_(i32) = G_SMULFIX %0, %1, 15 + $w0 = COPY %2(i32) + RET_ReallyLR implicit $w0 +... +--- +name: smulfix_i64 +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: smulfix_i64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(i64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(i64) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 63 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(i64) = G_ASHR [[COPY]], [[C]](i64) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(i64) = G_ASHR [[COPY1]], [[C]](i64) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(i64) = G_MUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(i64) = G_MUL [[ASHR]], [[COPY1]] + ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(i64) = G_MUL [[COPY]], [[ASHR1]] + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(i64) = G_UMULH [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(i64) = G_ADD [[MUL1]], [[MUL2]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(i64) = G_ADD [[ADD]], [[UMULH]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(i64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(i64) = G_LSHR [[MUL]], [[C1]](i64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(i64) = G_CONSTANT i64 49 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(i64) = G_SHL [[ADD1]], [[C2]](i64) + ; CHECK-NEXT: 
[[OR:%[0-9]+]]:_(i64) = G_OR [[LSHR]], [[SHL]] + ; CHECK-NEXT: $x0 = COPY [[OR]](i64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(i64) = COPY $x0 + %1:_(i64) = COPY $x1 + %2:_(i64) = G_SMULFIX %0, %1, 15 + $x0 = COPY %2(i64) + RET_ReallyLR implicit $x0 +... +--- +name: smulfix_4xi32 +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: smulfix_4xi32 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x i32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x i32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x i32>), [[UV1:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY]](<4 x i32>) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV]](<2 x i32>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV1]](<2 x i32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x i32>), [[UV3:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY1]](<4 x i32>) + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV2]](<2 x i32>) + ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV3]](<2 x i32>) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT]], [[SEXT2]] + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT1]], [[SEXT3]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x i64>) = G_BUILD_VECTOR [[C]](i64), [[C]](i64) + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL]], [[BUILD_VECTOR]](<2 x i64>) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL1]], [[BUILD_VECTOR]](<2 x i64>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR]](<2 x i64>) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR1]](<2 x i64>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x i32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x i32>), [[TRUNC1]](<2 x i32>) + ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x i32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x i32>) = COPY $q0 + %1:_(<4 x i32>) = COPY $q1 + %2:_(<4 x i32>) = 
G_SMULFIX %0, %1, 2 + $q0 = COPY %2(<4 x i32>) + RET_ReallyLR implicit $q0 +... +--- +name: smulfix_4xi32_15 +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: smulfix_4xi32_15 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x i32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x i32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x i32>), [[UV1:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY]](<4 x i32>) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV]](<2 x i32>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV1]](<2 x i32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x i32>), [[UV3:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY1]](<4 x i32>) + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV2]](<2 x i32>) + ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV3]](<2 x i32>) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT]], [[SEXT2]] + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT1]], [[SEXT3]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x i64>) = G_BUILD_VECTOR [[C]](i64), [[C]](i64) + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL]], [[BUILD_VECTOR]](<2 x i64>) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL1]], [[BUILD_VECTOR]](<2 x i64>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR]](<2 x i64>) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR1]](<2 x i64>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x i32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x i32>), [[TRUNC1]](<2 x i32>) + ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x i32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x i32>) = COPY $q0 + %1:_(<4 x i32>) = COPY $q1 + %2:_(<4 x i32>) = G_SMULFIX %0, %1, 15 + $q0 = COPY %2(<4 x i32>) + RET_ReallyLR implicit $q0 +... 
+--- +name: smulfix_4xi32_31 +body: | + bb.1: + liveins: $q0, $q1 + ; CHECK-LABEL: name: smulfix_4xi32_31 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x i32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x i32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x i32>), [[UV1:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY]](<4 x i32>) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV]](<2 x i32>) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV1]](<2 x i32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x i32>), [[UV3:%[0-9]+]]:_(<2 x i32>) = G_UNMERGE_VALUES [[COPY1]](<4 x i32>) + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV2]](<2 x i32>) + ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(<2 x i64>) = G_SEXT [[UV3]](<2 x i32>) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT]], [[SEXT2]] + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(<2 x i64>) = G_MUL [[SEXT1]], [[SEXT3]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(i64) = G_CONSTANT i64 31 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x i64>) = G_BUILD_VECTOR [[C]](i64), [[C]](i64) + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL]], [[BUILD_VECTOR]](<2 x i64>) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x i64>) = G_ASHR [[MUL1]], [[BUILD_VECTOR]](<2 x i64>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR]](<2 x i64>) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x i32>) = G_TRUNC [[ASHR1]](<2 x i64>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x i32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x i32>), [[TRUNC1]](<2 x i32>) + ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x i32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x i32>) = COPY $q0 + %1:_(<4 x i32>) = COPY $q1 + %2:_(<4 x i32>) = G_SMULFIX %0, %1, 31 + $q0 = COPY %2(<4 x i32>) + RET_ReallyLR implicit $q0 +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 2e0b781785785..70dbeb7d49f65 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -492,8 +492,8 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_SMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_UMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined diff --git a/llvm/test/CodeGen/AArch64/smul_fix.ll b/llvm/test/CodeGen/AArch64/smul_fix.ll index dacce720a7319..f99d20a436a8d 100644 --- a/llvm/test/CodeGen/AArch64/smul_fix.ll +++ b/llvm/test/CodeGen/AArch64/smul_fix.ll @@ -1,37 +1,65 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i32 @func(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: func: -; CHECK: // %bb.0: -; CHECK-NEXT: smull x8, w0, w1 -; CHECK-NEXT: lsr x9, x8, #32 -; CHECK-NEXT: extr w0, w9, w8, #2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: func: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smull x8, w0, w1 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: extr w0, w9, w8, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: func: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smull x8, w0, w1 +; CHECK-GI-NEXT: asr x0, x8, #2 +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-GI-NEXT: ret %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2) ret i32 %tmp } define i64 @func2(i64 %x, i64 %y) { -; CHECK-LABEL: func2: -; CHECK: // %bb.0: -; CHECK-NEXT: mul x8, x0, x1 -; CHECK-NEXT: smulh x9, x0, x1 -; CHECK-NEXT: extr x0, x9, x8, #2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: func2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mul x8, x0, x1 +; CHECK-SD-NEXT: smulh x9, x0, x1 +; CHECK-SD-NEXT: extr x0, x9, x8, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: func2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr x8, x1, #63 +; CHECK-GI-NEXT: asr x9, x0, #63 +; CHECK-GI-NEXT: umulh x10, x0, x1 +; CHECK-GI-NEXT: mul x8, x0, x8 +; CHECK-GI-NEXT: madd x8, x9, x1, x8 +; CHECK-GI-NEXT: mul x9, x0, x1 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: extr x0, x8, x9, #2 +; 
CHECK-GI-NEXT: ret %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2) ret i64 %tmp } define i4 @func3(i4 %x, i4 %y) nounwind { -; CHECK-LABEL: func3: -; CHECK: // %bb.0: -; CHECK-NEXT: sbfx w8, w1, #0, #4 -; CHECK-NEXT: sbfx w9, w0, #0, #4 -; CHECK-NEXT: smull x8, w9, w8 -; CHECK-NEXT: lsr x9, x8, #32 -; CHECK-NEXT: extr w0, w9, w8, #2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: func3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sbfx w8, w1, #0, #4 +; CHECK-SD-NEXT: sbfx w9, w0, #0, #4 +; CHECK-SD-NEXT: smull x8, w9, w8 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: extr w0, w9, w8, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: func3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #0, #4 +; CHECK-GI-NEXT: sbfx w9, w1, #0, #4 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sbfx w0, w8, #2, #6 +; CHECK-GI-NEXT: ret %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp } @@ -56,40 +84,69 @@ define i64 @func5(i64 %x, i64 %y) { } define i4 @func6(i4 %x, i4 %y) nounwind { -; CHECK-LABEL: func6: -; CHECK: // %bb.0: -; CHECK-NEXT: sbfx w8, w1, #0, #4 -; CHECK-NEXT: sbfx w9, w0, #0, #4 -; CHECK-NEXT: mul w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: func6: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sbfx w8, w1, #0, #4 +; CHECK-SD-NEXT: sbfx w9, w0, #0, #4 +; CHECK-SD-NEXT: mul w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: func6: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mul w0, w0, w1 +; CHECK-GI-NEXT: ret %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0) ret i4 %tmp } define i64 @func7(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: func7: -; CHECK: // %bb.0: -; CHECK-NEXT: mul x8, x0, x1 -; CHECK-NEXT: smulh x9, x0, x1 -; CHECK-NEXT: extr x0, x9, x8, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: func7: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mul x8, x0, x1 +; CHECK-SD-NEXT: smulh x9, x0, x1 +; CHECK-SD-NEXT: extr x0, x9, x8, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: func7: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr x8, x1, #63 +; 
CHECK-GI-NEXT: asr x9, x0, #63 +; CHECK-GI-NEXT: umulh x10, x0, x1 +; CHECK-GI-NEXT: mul x8, x0, x8 +; CHECK-GI-NEXT: madd x8, x9, x1, x8 +; CHECK-GI-NEXT: mul x9, x0, x1 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: extr x0, x8, x9, #32 +; CHECK-GI-NEXT: ret %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 32) ret i64 %tmp } define i64 @func8(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: func8: -; CHECK: // %bb.0: -; CHECK-NEXT: mul x8, x0, x1 -; CHECK-NEXT: smulh x9, x0, x1 -; CHECK-NEXT: extr x0, x9, x8, #63 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: func8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mul x8, x0, x1 +; CHECK-SD-NEXT: smulh x9, x0, x1 +; CHECK-SD-NEXT: extr x0, x9, x8, #63 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: func8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr x8, x1, #63 +; CHECK-GI-NEXT: asr x9, x0, #63 +; CHECK-GI-NEXT: umulh x10, x0, x1 +; CHECK-GI-NEXT: mul x8, x0, x8 +; CHECK-GI-NEXT: madd x8, x9, x1, x8 +; CHECK-GI-NEXT: mul x9, x0, x1 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: extr x0, x8, x9, #63 +; CHECK-GI-NEXT: ret %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 63) ret i64 %tmp } -define <2 x i32> @vec(<2 x i32> %x, <2 x i32> %y) nounwind { -; CHECK-LABEL: vec: +define <2 x i32> @smulfix_2xi32_0(<2 x i32> %x, <2 x i32> %y) nounwind { +; CHECK-LABEL: smulfix_2xi32_0: ; CHECK: // %bb.0: ; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret @@ -97,8 +154,8 @@ define <2 x i32> @vec(<2 x i32> %x, <2 x i32> %y) nounwind { ret <2 x i32> %tmp } -define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec2: +define <4 x i32> @smulfix_4xi32_0(<4 x i32> %x, <4 x i32> %y) nounwind { +; CHECK-LABEL: smulfix_4xi32_0: ; CHECK: // %bb.0: ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -106,47 +163,127 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ret <4 x i32> %tmp } -define <4 x i64> @vec3(<4 x i64> %x, <4 x i64> %y) nounwind { -; CHECK-LABEL: vec3: -; CHECK: // %bb.0: -; 
CHECK-NEXT: mov x8, v2.d[1] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov x14, v3.d[1] -; CHECK-NEXT: mov x15, v1.d[1] -; CHECK-NEXT: mul x12, x11, x10 -; CHECK-NEXT: mul x13, x9, x8 -; CHECK-NEXT: smulh x8, x9, x8 -; CHECK-NEXT: smulh x9, x11, x10 -; CHECK-NEXT: fmov x10, d3 -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: mul x16, x11, x10 -; CHECK-NEXT: extr x8, x8, x13, #32 -; CHECK-NEXT: smulh x10, x11, x10 -; CHECK-NEXT: extr x9, x9, x12, #32 -; CHECK-NEXT: mul x11, x15, x14 -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: smulh x14, x15, x14 -; CHECK-NEXT: extr x10, x10, x16, #32 -; CHECK-NEXT: mov v0.d[1], x8 -; CHECK-NEXT: fmov d1, x10 -; CHECK-NEXT: extr x11, x14, x11, #32 -; CHECK-NEXT: mov v1.d[1], x11 -; CHECK-NEXT: ret +define <4 x i32> @smulfix_4xi32(<4 x i32> %1, <4 x i32> %2) { +; CHECK-SD-LABEL: smulfix_4xi32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: smull v3.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp2 v2.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: shl v0.4s, v2.4s, #17 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smulfix_4xi32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.2s, v2.2d, #15 +; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #15 +; CHECK-GI-NEXT: ret + %m = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %1, <4 x i32> %2, i32 15) + ret <4 x i32> %m +} + +define <4 x i64> @smulfix_4xi64(<4 x i64> %x, <4 x i64> %y) nounwind { +; CHECK-SD-LABEL: smulfix_4xi64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov x8, v2.d[1] +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: fmov x10, d2 +; CHECK-SD-NEXT: fmov x11, d0 +; CHECK-SD-NEXT: mov x14, v3.d[1] +; CHECK-SD-NEXT: mov x15, v1.d[1] +; CHECK-SD-NEXT: mul x12, x11, x10 +; CHECK-SD-NEXT: mul x13, x9, x8 +; CHECK-SD-NEXT: smulh x8, x9, x8 +; CHECK-SD-NEXT: 
smulh x9, x11, x10 +; CHECK-SD-NEXT: fmov x10, d3 +; CHECK-SD-NEXT: fmov x11, d1 +; CHECK-SD-NEXT: mul x16, x11, x10 +; CHECK-SD-NEXT: extr x8, x8, x13, #32 +; CHECK-SD-NEXT: smulh x10, x11, x10 +; CHECK-SD-NEXT: extr x9, x9, x12, #32 +; CHECK-SD-NEXT: mul x11, x15, x14 +; CHECK-SD-NEXT: fmov d0, x9 +; CHECK-SD-NEXT: smulh x14, x15, x14 +; CHECK-SD-NEXT: extr x10, x10, x16, #32 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: fmov d1, x10 +; CHECK-SD-NEXT: extr x11, x14, x11, #32 +; CHECK-SD-NEXT: mov v1.d[1], x11 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smulfix_4xi64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov d2, v2.d[1] +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: asr x10, x9, #63 +; CHECK-GI-NEXT: asr x12, x8, #63 +; CHECK-GI-NEXT: mul x11, x8, x9 +; CHECK-GI-NEXT: mul x10, x8, x10 +; CHECK-GI-NEXT: umulh x8, x8, x9 +; CHECK-GI-NEXT: madd x9, x12, x9, x10 +; CHECK-GI-NEXT: fmov x12, d2 +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov d0, v3.d[1] +; CHECK-GI-NEXT: asr x13, x12, #63 +; CHECK-GI-NEXT: asr x15, x10, #63 +; CHECK-GI-NEXT: mul x14, x10, x12 +; CHECK-GI-NEXT: mul x13, x10, x13 +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: add x8, x9, x8 +; CHECK-GI-NEXT: extr x8, x8, x11, #32 +; CHECK-GI-NEXT: umulh x10, x10, x12 +; CHECK-GI-NEXT: asr x1, x0, #63 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: madd x12, x15, x12, x13 +; CHECK-GI-NEXT: fmov x15, d3 +; CHECK-GI-NEXT: fmov x13, d1 +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: asr x16, x15, #63 +; CHECK-GI-NEXT: asr x18, x13, #63 +; CHECK-GI-NEXT: mul x17, x13, x15 +; CHECK-GI-NEXT: mul x16, x13, x16 +; CHECK-GI-NEXT: add x9, x12, x10 +; CHECK-GI-NEXT: extr x9, x9, x14, #32 +; CHECK-GI-NEXT: umulh x13, x13, x15 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: madd x15, x18, x15, x16 +; CHECK-GI-NEXT: fmov x16, d1 +; CHECK-GI-NEXT: mul x18, x16, x1 +; CHECK-GI-NEXT: asr x1, x16, #63 +; CHECK-GI-NEXT: umulh x2, x16, x0 
+; CHECK-GI-NEXT: add x10, x15, x13 +; CHECK-GI-NEXT: extr x10, x10, x17, #32 +; CHECK-GI-NEXT: madd x18, x1, x0, x18 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mul x16, x16, x0 +; CHECK-GI-NEXT: add x12, x18, x2 +; CHECK-GI-NEXT: extr x11, x12, x16, #32 +; CHECK-GI-NEXT: mov v1.d[1], x11 +; CHECK-GI-NEXT: ret %tmp = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> %x, <4 x i64> %y, i32 32) ret <4 x i64> %tmp } define <4 x i16> @widemul(<4 x i16> %x, <4 x i16> %y) nounwind { -; CHECK-LABEL: widemul: -; CHECK: // %bb.0: -; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: shrn v1.4h, v0.4s, #16 -; CHECK-NEXT: xtn v2.4h, v0.4s -; CHECK-NEXT: shl v0.4h, v1.4h, #14 -; CHECK-NEXT: usra v0.4h, v2.4h, #2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: widemul: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: shrn v1.4h, v0.4s, #16 +; CHECK-SD-NEXT: xtn v2.4h, v0.4s +; CHECK-SD-NEXT: shl v0.4h, v1.4h, #14 +; CHECK-SD-NEXT: usra v0.4h, v2.4h, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: widemul: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #2 +; CHECK-GI-NEXT: ret %tmp = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> %x, <4 x i16> %y, i32 2) ret <4 x i16> %tmp } From 875e6226aad6c771631ce97297d5d3f09d13a176 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Mon, 11 May 2026 12:12:29 -0700 Subject: [PATCH 337/538] [lld][WebAssembly] Allow defining of arbitrary symbols types in LTO objects (#196552) Bitcode files don't contains precise symbol type information so we always allow the post-LTO defined symbols (from the LTO object file) to overwrite bitcode symbols. We don't want to be reporting type mismatches in these cases. 
Fixes: #195311 --- lld/test/wasm/lto/inline-asm-symbols.ll | 24 ++++++++++++++++++++++++ lld/test/wasm/lto/signature-mismatch.ll | 6 +++--- lld/wasm/SymbolTable.cpp | 20 ++++++++++++++++++-- 3 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 lld/test/wasm/lto/inline-asm-symbols.ll diff --git a/lld/test/wasm/lto/inline-asm-symbols.ll b/lld/test/wasm/lto/inline-asm-symbols.ll new file mode 100644 index 0000000000000..8b2f6270d7e2b --- /dev/null +++ b/lld/test/wasm/lto/inline-asm-symbols.ll @@ -0,0 +1,24 @@ +;; Test that a bitcode symbol defined in inline assembly (which wasm-ld +;; initially guesses is a FUNCTION) can be replaced by the LTO-generated +;; object symbol (which is correctly identified as a TAG) without error. + +; RUN: llvm-as %s -o %t.o +; RUN: wasm-ld --export=foo %t.o -o %t.wasm +; RUN: obj2yaml %t.wasm | FileCheck %s + +; CHECK: - Type: TAG +; CHECK: TagTypes: [ 1 ] +; CHECK: - Name: foo +; CHECK: Kind: TAG +; CHECK: Index: 0 + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-unknown" + +module asm ".globl foo" +module asm ".tagtype foo i32" +module asm "foo:" + +define void @_start() { + ret void +} diff --git a/lld/test/wasm/lto/signature-mismatch.ll b/lld/test/wasm/lto/signature-mismatch.ll index 6580c8cf71b33..58ec045a77981 100644 --- a/lld/test/wasm/lto/signature-mismatch.ll +++ b/lld/test/wasm/lto/signature-mismatch.ll @@ -1,6 +1,6 @@ ; RUN: llc -filetype=obj -o %t.o %s -; RUN: llvm-as %S/Inputs/archive.ll -o %t1.o -; RUN: not wasm-ld --fatal-warnings %t.o %t1.o -o %t.wasm 2>&1 | FileCheck %s +; RUN: llvm-as %S/Inputs/archive.ll -o %t.archive.o +; RUN: not wasm-ld --fatal-warnings %t.o %t.archive.o -o %t.wasm 2>&1 | FileCheck %s ; Test that functions defined in bitcode correctly report signature ; mismatches with existing undefined sybmols in normal objects. 
@@ -16,5 +16,5 @@ define void @_start() { } ; CHECK: error: function signature mismatch: f -; CHECK: >>> defined as (i32) -> void in {{.*}}signature-mismatch.ll.tmp1.o +; CHECK: >>> defined as (i32) -> void in {{.*}}signature-mismatch.ll.tmp.archive.o ; CHECK: >>> defined as () -> void in {{.*}}signature-mismatch.ll.tmp.wasm.lto.o diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index cda192323f067..88ac54302c286 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -165,6 +165,11 @@ std::pair SymbolTable::insert(StringRef name, return {s, wasInserted}; } +static bool isBitcodeSymbol(const Symbol *symbol) { + return symbol->getFile() && + symbol->getFile()->kind() == InputFile::BitcodeKind; +} + static void reportTypeError(const Symbol *existing, const InputFile *file, llvm::wasm::WasmSymbolType type) { error("symbol type mismatch: " + toString(*existing) + "\n>>> defined as " + @@ -192,6 +197,8 @@ static bool signatureMatches(FunctionSymbol *existing, static void checkGlobalType(const Symbol *existing, const InputFile *file, const WasmGlobalType *newType) { if (!isa(existing)) { + if (isBitcodeSymbol(existing)) + return; reportTypeError(existing, file, WASM_SYMBOL_TYPE_GLOBAL); return; } @@ -206,12 +213,15 @@ static void checkGlobalType(const Symbol *existing, const InputFile *file, static void checkTagType(const Symbol *existing, const InputFile *file, const WasmSignature *newSig) { - const auto *existingTag = dyn_cast(existing); if (!isa(existing)) { + if (isBitcodeSymbol(existing)) + return; reportTypeError(existing, file, WASM_SYMBOL_TYPE_TAG); return; } + const auto *existingTag = cast(existing); + const WasmSignature *oldSig = existingTag->signature; if (*newSig != *oldSig) warn("Tag signature mismatch: " + existing->getName() + @@ -223,6 +233,8 @@ static void checkTagType(const Symbol *existing, const InputFile *file, static void checkTableType(const Symbol *existing, const InputFile *file, const WasmTableType *newType) { 
if (!isa(existing)) { + if (isBitcodeSymbol(existing)) + return; reportTypeError(existing, file, WASM_SYMBOL_TYPE_TABLE); return; } @@ -237,7 +249,7 @@ static void checkTableType(const Symbol *existing, const InputFile *file, } static void checkDataType(const Symbol *existing, const InputFile *file) { - if (!isa(existing)) + if (!isa(existing) && !isBitcodeSymbol(existing)) reportTypeError(existing, file, WASM_SYMBOL_TYPE_DATA); } @@ -511,6 +523,10 @@ Symbol *SymbolTable::addDefinedFunction(StringRef name, uint32_t flags, auto existingFunction = dyn_cast(s); if (!existingFunction) { + if (isBitcodeSymbol(s)) { + replaceSym(s); + return s; + } reportTypeError(s, file, WASM_SYMBOL_TYPE_FUNCTION); return s; } From 943a7da07a07ac5b2b735d5eee4edc1eeb8bf6f1 Mon Sep 17 00:00:00 2001 From: argothiel Date: Mon, 11 May 2026 21:54:24 +0200 Subject: [PATCH 338/538] [clangd] Fix out-of-bounds read in `packedLookup` (#197021) The `packedLookup` function doesn't work correctly with byte values over 0x7F (e.g. UTF-8 high bytes) on platforms where `char` is signed (like x86-64). The character is treated as a negative `char`, which gets converted to a negative `int`, which makes `I >> 2` negative, which gives a negative index, and thus an out-of-bounds read. The fix is to change the `int` parameter type to `unsigned char`, to always get the value in the 0x00..0xFF range. The issue has been discovered by the sanitizer buildbot after the #187623 merge, which first introduced tests with non-ASCII source content into the code-completion path. 
--- clang-tools-extra/clangd/FuzzyMatch.cpp | 3 ++- clang-tools-extra/clangd/unittests/FuzzyMatchTests.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/FuzzyMatch.cpp b/clang-tools-extra/clangd/FuzzyMatch.cpp index cf5182bc1b2d7..2df939594bfcf 100644 --- a/clang-tools-extra/clangd/FuzzyMatch.cpp +++ b/clang-tools-extra/clangd/FuzzyMatch.cpp @@ -145,7 +145,8 @@ constexpr static uint8_t CharRoles[] = { // clang-format on }; -template static T packedLookup(const uint8_t *Data, int I) { +template +static T packedLookup(const uint8_t *Data, unsigned char I) { return static_cast((Data[I >> 2] >> ((I & 3) * 2)) & 3); } CharTypeSet calculateRoles(llvm::StringRef Text, diff --git a/clang-tools-extra/clangd/unittests/FuzzyMatchTests.cpp b/clang-tools-extra/clangd/unittests/FuzzyMatchTests.cpp index 9cb668cb7cb16..5efbfcd8d3e93 100644 --- a/clang-tools-extra/clangd/unittests/FuzzyMatchTests.cpp +++ b/clang-tools-extra/clangd/unittests/FuzzyMatchTests.cpp @@ -305,6 +305,8 @@ TEST(FuzzyMatch, Segmentation) { returns("+--+---+------")); EXPECT_THAT(segment("t3h PeNgU1N oF d00m!!!!!!!!"), // returns("+-- +-+-+-+ ++ +--- ")); + EXPECT_THAT(segment("ab🙂cd"), // + returns("+-------")); } } // namespace From ac3ca83b03835fa7681aaf13a0a41aa299bb40ce Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 11 May 2026 12:54:38 -0700 Subject: [PATCH 339/538] [FileCheck] Fix -Wunused-variable in 48c864a (#197022) Variable is only used in an assertion. Mark it maybe_unused rather than inlining as the variable name makes it a bit more readable, even if it is a common idiom. 
--- llvm/lib/FileCheck/FileCheck.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index 38c61cd60174d..ae25a078713e7 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1882,7 +1882,7 @@ bool FileCheck::readCheckFile( "Failed to move Buffer's start forward, or pointed prefix outside " "of the buffer!"); - const char *BufferEnd = Buffer.data() + Buffer.size(); + [[maybe_unused]] const char *BufferEnd = Buffer.data() + Buffer.size(); assert(AfterSuffix.data() >= Buffer.data() && AfterSuffix.data() <= BufferEnd && "Parsing after suffix doesn't start inside of buffer!"); From b1a4e08997bb4cb39316847109cfdd9778108ce4 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 11 May 2026 15:55:38 -0400 Subject: [PATCH 340/538] [AMDGPU] Add `.amdgpu.info` section for per-function metadata (#192384) AMDGPU object linking requires the linker to propagate resource usage (registers, stack, LDS) across translation units. To support this, the compiler must emit per-function metadata and call graph edges in the relocatable object so the linker can compute whole-program resource requirements. This PR introduces a `.amdgpu.info` ELF section using a tagged, length-prefixed binary format: each entry is encoded as: ``` [kind: u8] [len: u8] [payload: bytes] ``` A function scope is opened by an `INFO_FUNC` entry (containing a symbol reference), followed by per-function attributes (register counts, flags, private segment size) and relational edges (direct calls, LDS uses, indirect call signatures). String data such as function type signatures is stored in a companion `.amdgpu.strtab` section. The format is forward-compatible: a consumer that encounters an unknown kind can skip it by reading the length byte, allowing new entry kinds to be added without breaking existing toolchains. 
--- llvm/docs/AMDGPUUsage.rst | 106 ++++++++ .../llvm/Support/AMDGPUObjLinkingInfo.h | 74 +++++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 160 ++++++++++- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 10 + llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 3 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 126 +++++++++ .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 179 ++++++++++++ .../MCTargetDesc/AMDGPUTargetStreamer.h | 30 ++ .../AMDGPU/lds-link-time-codegen-agpr.ll | 23 ++ .../AMDGPU/lds-link-time-codegen-callgraph.ll | 62 +++++ .../lds-link-time-codegen-named-barrier.ll | 29 +- .../AMDGPU/lds-link-time-codegen-typeid.ll | 257 ++++++++++++++++++ .../CodeGen/AMDGPU/lds-link-time-codegen.ll | 47 +++- llvm/test/MC/AMDGPU/amdgpu-info-err.s | 43 +++ llvm/test/MC/AMDGPU/amdgpu-info-roundtrip.s | 126 +++++++++ 15 files changed, 1261 insertions(+), 14 deletions(-) create mode 100644 llvm/include/llvm/Support/AMDGPUObjLinkingInfo.h create mode 100644 llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-agpr.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-callgraph.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-typeid.ll create mode 100644 llvm/test/MC/AMDGPU/amdgpu-info-err.s create mode 100644 llvm/test/MC/AMDGPU/amdgpu-info-roundtrip.s diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 4c6cd12582a62..4ac8cc9197515 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2881,6 +2881,8 @@ An AMDGPU target ELF code object has the standard ELF sections which include: ``.strtab`` ``SHT_STRTAB`` *none* ``.symtab`` ``SHT_SYMTAB`` *none* ``.text`` ``SHT_PROGBITS`` ``SHF_ALLOC`` + ``SHF_EXECINSTR`` + ``.amdgpu.info`` ``SHT_PROGBITS`` ``SHF_EXCLUDE`` + ``.amdgpu.strtab`` ``SHT_STRTAB`` ``SHF_EXCLUDE`` ================== ================ ================================= These sections have their standard meanings (see [ELF]_) and are only generated @@ -2916,6 +2918,67 @@ if needed. 
``.amdgpu.kernel.runtime.handle`` Symbols used for device enqueue. +.. _amdgpu-info-section: + +``.amdgpu.info`` + Per-function metadata for AMDGPU object linking, emitted only in relocatable + code objects when object linking is enabled + (``-amdgpu-enable-object-linking``). The linker uses this section to + propagate resource usage (registers, stack, LDS) and resolve call graph + dependencies across translation units. + + Each entry uses a tagged, length-prefixed binary encoding: + + .. code-block:: none + + [kind: u8] [len: u8] [payload: bytes] + + A function scope is opened by an ``INFO_FUNC`` entry whose payload is an + 8-byte relocated symbol reference. All subsequent entries until the next + ``INFO_FUNC`` or end of section belong to that scope. The format is + forward-compatible: unknown kinds can be skipped by reading the length byte. + + .. table:: AMDGPU Info Entry Kinds + :name: amdgpu-info-entry-kinds-table + + ===== ============================== ========================================== + Value Name Payload + ===== ============================== ========================================== + 1 ``INFO_FUNC`` 8B symbol ref; opens function scope + 2 ``INFO_FLAGS`` u32; ``FuncInfoFlags`` bitfield + 3 ``INFO_NUM_SGPR`` u32; SGPRs explicitly used + 4 ``INFO_NUM_VGPR`` u32; architectural VGPRs used + 5 ``INFO_NUM_AGPR`` u32; accumulator VGPRs (AGPRs) used + 6 ``INFO_PRIVATE_SEGMENT_SIZE`` u32; private (scratch) segment bytes + 7 ``INFO_USE`` 8B symbol ref; resource dependency edge + 8 ``INFO_CALL`` 8B symbol ref; direct call edge + 9 ``INFO_INDIRECT_CALL`` u32 strtab offset; indirect call type-ID + 10 ``INFO_TYPEID`` u32 strtab offset; function type-ID + ===== ============================== ========================================== + + .. 
table:: AMDGPU Info Function Flags (``INFO_FLAGS``) + :name: amdgpu-info-flags-table + + ===== =========================== ========================================== + Bit Name Description + ===== =========================== ========================================== + 0x1 ``FUNC_USES_VCC`` Function uses the VCC register + 0x2 ``FUNC_USES_FLAT_SCRATCH`` Function uses flat scratch addressing + 0x4 ``FUNC_HAS_DYN_STACK`` Function has dynamic stack allocation + ===== =========================== ========================================== + + Symbol references (``INFO_FUNC``, ``INFO_USE``, ``INFO_CALL``) generate + ``R_AMDGPU_ABS64`` relocations in ``.rela.amdgpu.info``. String payloads + (``INFO_INDIRECT_CALL``, ``INFO_TYPEID``) store a ``u32`` offset into + the companion ``.amdgpu.strtab`` section. + + See :ref:`amdgpu-assembler-directive-amdgpu-info` for the assembly syntax. + +``.amdgpu.strtab`` + Null-terminated string pool for the ``.amdgpu.info`` section. Contains + type-ID strings referenced by ``INFO_INDIRECT_CALL`` and ``INFO_TYPEID`` + entries. Only present when ``.amdgpu.info`` requires string data. + .. _amdgpu-note-records: Note Records @@ -21402,6 +21465,49 @@ semantics described in :ref:`amdgpu-amdhsa-code-object-metadata-v3`, This directive is terminated by an ``.end_amdgpu_metadata`` directive. +.. _amdgpu-assembler-directive-amdgpu-info: + +.amdgpu_info ++++++++++++++++++++++ + +Begins a per-function metadata block for ```` in the ``.amdgpu.info`` +section (see :ref:`amdgpu-info-section`). Only valid when the OS is ``amdhsa``. +The block is terminated by an ``.end_amdgpu_info`` directive. + +The following sub-directives may appear inside the block: + + .. 
table:: .amdgpu_info Sub-Directives + :name: amdgpu-info-sub-directives-table + + ====================================== ========================================== + Directive Description + ====================================== ========================================== + ``.amdgpu_flags`` *value* ``FuncInfoFlags`` bitfield (u32) + ``.amdgpu_num_sgpr`` *value* SGPRs explicitly used (u32) + ``.amdgpu_num_vgpr`` *value* Architectural VGPRs used (u32) + ``.amdgpu_num_agpr`` *value* Accumulator VGPRs used (u32) + ``.amdgpu_private_segment_size`` *n* Private segment size in bytes (u32) + ``.amdgpu_use`` *symbol* Resource dependency (LDS or barrier) + ``.amdgpu_call`` *symbol* Direct call edge to *symbol* + ``.amdgpu_indirect_call`` *"type-id"* Indirect call with given type-ID string + ``.amdgpu_typeid`` *"type-id"* Type-ID for an address-taken function + ====================================== ========================================== + +Example: + +.. code-block:: nasm + + .amdgpu_info my_kernel + .amdgpu_flags 7 + .amdgpu_num_sgpr 33 + .amdgpu_num_vgpr 32 + .amdgpu_num_agpr 0 + .amdgpu_private_segment_size 0 + .amdgpu_use lds_var + .amdgpu_call helper + .amdgpu_indirect_call "vi" + .end_amdgpu_info + .. _amdgpu-amdhsa-assembler-example-v3-onwards: Code Object V3 and Above Example Source Code diff --git a/llvm/include/llvm/Support/AMDGPUObjLinkingInfo.h b/llvm/include/llvm/Support/AMDGPUObjLinkingInfo.h new file mode 100644 index 0000000000000..e65161e6545fc --- /dev/null +++ b/llvm/include/llvm/Support/AMDGPUObjLinkingInfo.h @@ -0,0 +1,74 @@ +//===--- AMDGPUObjLinkingInfo.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Enums shared between the AMDGPU backend (LLVM) and the ELF linker (LLD) +/// for the `.amdgpu.info` object-linking metadata section. +/// +/// Binary layout of each entry: [kind: u8] [len: u8] [payload: bytes]. +/// Unknown kinds are forward-compatible: a consumer skips them by reading len. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AMDGPUOBJECTLINKINGINFO_H +#define LLVM_SUPPORT_AMDGPUOBJECTLINKINGINFO_H + +#include "llvm/ADT/BitmaskEnum.h" + +#include + +namespace llvm { +namespace AMDGPU { + +/// Entry kind values for the `.amdgpu.info` section. +/// +/// Entries that appear between an INFO_FUNC and the next INFO_FUNC (or end of +/// section) belong to the function scope opened by that INFO_FUNC. +enum class InfoKind : uint8_t { + /// Opens a new function scope. Payload is an 8-byte symbol reference + /// (relocated) identifying the function. All subsequent entries until the + /// next INFO_FUNC belong to this function. + INFO_FUNC = 1, + /// Bitfield of FuncInfoFlags properties for the function. [u32] + INFO_FLAGS = 2, + /// Number of SGPRs explicitly used by the function. [u32] + INFO_NUM_SGPR = 3, + /// Number of architectural VGPRs used by the function. [u32] + INFO_NUM_VGPR = 4, + /// Number of accumulator VGPRs (AGPRs) used by the function. [u32] + INFO_NUM_AGPR = 5, + /// Private (scratch) memory size in bytes required by the function. [u32] + INFO_PRIVATE_SEGMENT_SIZE = 6, + /// Dependency edge: the function uses the resource identified by the + /// 8-byte relocated symbol (e.g. an LDS variable or named barrier). + INFO_USE = 7, + /// Direct call edge: the function calls the callee identified by the + /// 8-byte relocated symbol. 
+ INFO_CALL = 8, + /// Indirect call edge: the function contains an indirect call whose + /// callee is expected to match the type-ID string at the given + /// `.amdgpu.strtab` offset. [u32] + INFO_INDIRECT_CALL = 9, + /// Function type ID: tags an address-taken function with a type-ID + /// string (at the given `.amdgpu.strtab` offset) so the linker can match + /// it against INFO_INDIRECT_CALL entries. [u32] + INFO_TYPEID = 10, +}; + +/// Per-function flags packed into INFO_FLAGS entries. +enum class FuncInfoFlags : uint32_t { + FUNC_USES_VCC = 1U << 0, + FUNC_USES_FLAT_SCRATCH = 1U << 1, + FUNC_HAS_DYN_STACK = 1U << 2, + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FUNC_HAS_DYN_STACK), +}; + +} // namespace AMDGPU +} // namespace llvm + +#endif // LLVM_SUPPORT_AMDGPUOBJECTLINKINGINFO_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 390d68cca1174..ad61d8d084c7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -32,6 +32,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" #include "Utils/SIDefinesUtils.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinterHandler.h" @@ -537,6 +538,136 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { } } +static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL, + bool IsReturnType) { + if (Ty->isVoidTy()) { + Enc += 'v'; + return; + } + unsigned Bits = DL.getTypeSizeInBits(Ty); + // Zero-sized non-void types (e.g. `{}` or `[0 x i8]`) consume no ABI + // registers. For returns, emit the same no-result marker as void so the + // parameter encoding still has an explicit return-type prefix. 
+ if (Bits == 0) { + if (IsReturnType) + Enc += 'v'; + return; + } + if (Bits <= 32) + Enc += 'i'; + else if (Bits <= 64) + Enc += 'l'; + else + Enc.append(divideCeil(Bits, 32), 'i'); +} + +static std::string computeTypeId(const FunctionType *FTy, + const DataLayout &DL) { + std::string Enc; + appendTypeEncoding(Enc, FTy->getReturnType(), DL, /*IsReturnType=*/true); + for (Type *ParamTy : FTy->params()) + appendTypeEncoding(Enc, ParamTy, DL, /*IsReturnType=*/false); + return Enc; +} + +void AMDGPUAsmPrinter::collectCallEdge(const MachineInstr &MI) { + if (!AMDGPUTargetMachine::EnableObjectLinking) + return; + const SIInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const MachineOperand *Callee = + TII->getNamedOperand(MI, AMDGPU::OpName::callee); + if (!Callee || !Callee->isGlobal()) + return; + DirectCallEdges.insert( + {getSymbol(&MF->getFunction()), getSymbol(Callee->getGlobal())}); +} + +void AMDGPUAsmPrinter::emitAMDGPUInfo(Module &M) { + if (!AMDGPUTargetMachine::EnableObjectLinking) + return; + + const NamedMDNode *LDSMD = M.getNamedMetadata("amdgpu.lds.uses"); + bool HasLDSUses = LDSMD && LDSMD->getNumOperands() > 0; + + const NamedMDNode *BarMD = M.getNamedMetadata("amdgpu.named_barrier.uses"); + bool HasNamedBarriers = BarMD && BarMD->getNumOperands() > 0; + + // Collect address-taken functions (with type IDs) and indirect call sites. 
+ DenseMap AddrTakenTypeIds; + using IndirectCallInfo = std::pair; + SmallVector IndirectCalls; + + for (const Function &F : M) { + bool IsKernel = AMDGPU::isKernel(F.getCallingConv()); + + if (!IsKernel && F.hasAddressTaken(/*PutOffender=*/nullptr, + /*IgnoreCallbackUses=*/false, + /*IgnoreAssumeLikeCalls=*/true, + /*IgnoreLLVMUsed=*/true)) { + AddrTakenTypeIds[&F] = + computeTypeId(F.getFunctionType(), M.getDataLayout()); + } + + if (F.isDeclaration()) + continue; + + StringSet<> SeenTypeIds; + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + const auto *CB = dyn_cast(&I); + if (!CB || !CB->isIndirectCall()) + continue; + std::string TId = + computeTypeId(CB->getFunctionType(), M.getDataLayout()); + if (SeenTypeIds.insert(TId).second) + IndirectCalls.push_back({&F, std::move(TId)}); + } + } + } + + if (FunctionInfos.empty() && DirectCallEdges.empty() && !HasLDSUses && + !HasNamedBarriers && AddrTakenTypeIds.empty() && IndirectCalls.empty()) + return; + + AMDGPU::InfoSectionData Data; + Data.Funcs = std::move(FunctionInfos); + + for (auto &[F, TypeId] : AddrTakenTypeIds) { + MCSymbol *Sym = getSymbol(F); + Data.TypeIds.push_back({Sym, TypeId}); + } + + for (auto &[CallerSym, CalleeSym] : DirectCallEdges) + Data.Calls.push_back({CallerSym, CalleeSym}); + DirectCallEdges.clear(); + + if (HasLDSUses) { + for (const MDNode *N : LDSMD->operands()) { + auto *Func = mdconst::extract(N->getOperand(0)); + auto *LdsVar = mdconst::extract(N->getOperand(1)); + Data.Uses.push_back({getSymbol(Func), getSymbol(LdsVar)}); + } + } + + if (HasNamedBarriers) { + for (const MDNode *N : BarMD->operands()) { + auto *BarVar = mdconst::extract(N->getOperand(0)); + MCSymbol *BarSym = getSymbol(BarVar); + for (unsigned I = 1, E = N->getNumOperands(); I < E; ++I) { + auto *Func = mdconst::extract(N->getOperand(I)); + Data.Uses.push_back({getSymbol(Func), BarSym}); + } + } + } + + for (auto &[Caller, Enc] : IndirectCalls) { + MCSymbol *CallerSym = getSymbol(Caller); + 
Data.IndirectCalls.push_back({CallerSym, Enc}); + } + + getTargetStreamer()->emitAMDGPUInfo(Data); +} + bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Pad with s_code_end to help tools and guard against instruction prefetch // causing stale data in caches. Arguably this should be done by the linker, @@ -553,6 +684,10 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { } } + // Emit the unified .amdgpu.info section (per-function resources, call graph, + // LDS/named-barrier use edges, indirect calls, and address-taken type IDs). + emitAMDGPUInfo(M); + // Assign expressions which can only be resolved when all other functions are // known. RI.finalize(OutContext); @@ -567,8 +702,15 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext)); OutStreamer->popSection(); - for (Function &F : M.functions()) - validateMCResourceInfo(F); + // In the object-linking pipeline per-function resource MCExprs reference + // external callee symbols that cannot be evaluated here, so cross-TU limit + // checks would silently no-op for every non-leaf function. Defer resource + // sanity checking to the linker, which re-validates against the aggregated + // call graph in the combined .amdgpu.info metadata. 
+ if (!AMDGPUTargetMachine::EnableObjectLinking) { + for (Function &F : M.functions()) + validateMCResourceInfo(F); + } RI.reset(); @@ -729,6 +871,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { RI.gatherResourceInfo(MF, *ResourceUsage, OutContext); + if (AMDGPUTargetMachine::EnableObjectLinking) { + const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &RU = + *ResourceUsage; + FunctionInfos.push_back( + {/*NumSGPR=*/static_cast(RU.NumExplicitSGPR), + /*NumArchVGPR=*/static_cast(RU.NumVGPR), + /*NumAccVGPR=*/static_cast(RU.NumAGPR), + /*PrivateSegmentSize=*/static_cast(RU.PrivateSegmentSize), + /*UsesVCC=*/RU.UsesVCC, + /*UsesFlatScratch=*/RU.UsesFlatScratch, + /*HasDynStack=*/RU.HasDynamicallySizedStack, + /*Sym=*/getSymbol(&MF.getFunction())}); + } + if (MFI->isModuleEntryFunction()) { getSIProgramInfo(CurrentProgramInfo, MF); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 31d10fe92ca26..9066b2d419f89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -15,7 +15,10 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #include "AMDGPUMCResourceInfo.h" +#include "AMDGPUResourceUsageAnalysis.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIProgramInfo.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/AsmPrinter.h" namespace llvm { @@ -86,6 +89,13 @@ class AMDGPUAsmPrinter final : public AsmPrinter { void initTargetStreamer(Module &M); + void emitAMDGPUInfo(Module &M); + void collectCallEdge(const MachineInstr &MI); + + SetVector> DirectCallEdges; + + SmallVector FunctionInfos; + SmallString<128> getMCExprStr(const MCExpr *Value); /// Attempts to replace the validation that is missed in getSIProgramInfo due diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index cdc0b0a371e45..2863f263fcf94 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -320,6 +320,9 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII, } void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) + collectCallEdge(*MI); + // FIXME: Enable feature predicate checks once all the test pass. // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(), // getSubtargetInfo().getFeatureBits()); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5bacb6628474d..59bdfc177d1d7 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -38,6 +38,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/AMDGPUObjLinkingInfo.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" @@ -1387,6 +1388,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { return getRegBitWidth(RCID) / 8; } + std::optional InfoData; + private: void createConstantSymbol(StringRef Id, int64_t Val); @@ -1427,6 +1430,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ParseDirectivePALMetadataBegin(); bool ParseDirectivePALMetadata(); bool ParseDirectiveAMDGPULDS(); + bool ParseDirectiveAMDGPUInfo(); /// Common code to parse out a block of text (typically YAML) between start and /// end directives. 
@@ -1681,6 +1685,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool ParseDirective(AsmToken DirectiveID) override; + void onEndOfFile() override; ParseStatus parseOperand(OperandVector &Operands, StringRef Mnemonic, OperandMode Mode = OperandMode_Default); StringRef parseMnemonicSuffix(StringRef Name); @@ -6751,6 +6756,124 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { return false; } +bool AMDGPUAsmParser::ParseDirectiveAMDGPUInfo() { + if (getParser().checkForValidSection()) + return true; + + StringRef FuncName; + if (getParser().parseIdentifier(FuncName)) + return TokError("expected symbol name after .amdgpu_info"); + + MCSymbol *FuncSym = getContext().getOrCreateSymbol(FuncName); + AMDGPU::InfoSectionData ParsedInfoData; + AMDGPU::FuncInfo FI; + FI.Sym = FuncSym; + bool HasScalarAttrs = false; + + while (true) { + while (trySkipToken(AsmToken::EndOfStatement)) + ; + + StringRef ID; + SMLoc IDLoc = getLoc(); + if (!parseId(ID, "expected directive or .end_amdgpu_info")) + return true; + + if (ID == ".end_amdgpu_info") + break; + + // Every per-entry directive shares the `.amdgpu_` namespace prefix; strip + // it once and dispatch on the distinguishing suffix below. The unstripped + // ID is preserved for diagnostics. 
+ StringRef Dir = ID; + if (!Dir.consume_front(".amdgpu_")) + return Error(IDLoc, "unknown .amdgpu_info directive '" + ID + "'"); + + if (Dir == "flags") { + int64_t Val; + if (getParser().parseAbsoluteExpression(Val)) + return true; + auto Flags = static_cast(Val); + FI.UsesVCC = !!(Flags & AMDGPU::FuncInfoFlags::FUNC_USES_VCC); + FI.UsesFlatScratch = + !!(Flags & AMDGPU::FuncInfoFlags::FUNC_USES_FLAT_SCRATCH); + FI.HasDynStack = !!(Flags & AMDGPU::FuncInfoFlags::FUNC_HAS_DYN_STACK); + HasScalarAttrs = true; + } else if (Dir == "num_sgpr") { + int64_t Val; + if (getParser().parseAbsoluteExpression(Val)) + return true; + FI.NumSGPR = static_cast(Val); + HasScalarAttrs = true; + } else if (Dir == "num_vgpr") { + int64_t Val; + if (getParser().parseAbsoluteExpression(Val)) + return true; + FI.NumArchVGPR = static_cast(Val); + HasScalarAttrs = true; + } else if (Dir == "num_agpr") { + int64_t Val; + if (getParser().parseAbsoluteExpression(Val)) + return true; + FI.NumAccVGPR = static_cast(Val); + HasScalarAttrs = true; + } else if (Dir == "private_segment_size") { + int64_t Val; + if (getParser().parseAbsoluteExpression(Val)) + return true; + FI.PrivateSegmentSize = static_cast(Val); + HasScalarAttrs = true; + } else if (Dir == "use") { + StringRef ResName; + if (getParser().parseIdentifier(ResName)) + return TokError("expected resource symbol for .amdgpu_use"); + ParsedInfoData.Uses.push_back( + {FuncSym, getContext().getOrCreateSymbol(ResName)}); + } else if (Dir == "call") { + StringRef DstName; + if (getParser().parseIdentifier(DstName)) + return TokError("expected callee symbol for .amdgpu_call"); + ParsedInfoData.Calls.push_back( + {FuncSym, getContext().getOrCreateSymbol(DstName)}); + } else if (Dir == "indirect_call") { + std::string TypeId; + if (getParser().parseEscapedString(TypeId)) + return TokError("expected type ID string for .amdgpu_indirect_call"); + ParsedInfoData.IndirectCalls.push_back({FuncSym, std::move(TypeId)}); + } else if (Dir == "typeid") { 
+ std::string TypeId; + if (getParser().parseEscapedString(TypeId)) + return TokError("expected type ID string for .amdgpu_typeid"); + ParsedInfoData.TypeIds.push_back({FuncSym, std::move(TypeId)}); + } else { + return Error(IDLoc, "unknown .amdgpu_info directive '" + ID + "'"); + } + } + + if (HasScalarAttrs) + ParsedInfoData.Funcs.push_back(std::move(FI)); + + AMDGPU::InfoSectionData &Data = InfoData ? *InfoData : InfoData.emplace(); + for (AMDGPU::FuncInfo &Func : ParsedInfoData.Funcs) + Data.Funcs.push_back(std::move(Func)); + for (std::pair &Use : ParsedInfoData.Uses) + Data.Uses.push_back(Use); + for (std::pair &Call : ParsedInfoData.Calls) + Data.Calls.push_back(Call); + for (std::pair &IndirectCall : + ParsedInfoData.IndirectCalls) + Data.IndirectCalls.push_back(std::move(IndirectCall)); + for (std::pair &TypeId : ParsedInfoData.TypeIds) + Data.TypeIds.push_back(std::move(TypeId)); + + return false; +} + +void AMDGPUAsmParser::onEndOfFile() { + if (InfoData) + getTargetStreamer().emitAMDGPUInfo(*InfoData); +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -6788,6 +6911,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdgpu_lds") return ParseDirectiveAMDGPULDS(); + if (IDVal == ".amdgpu_info") + return ParseDirectiveAMDGPUInfo(); + if (IDVal == PALMD::AssemblerDirectiveBegin) return ParseDirectivePALMetadataBegin(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 47733494d421b..53c21837a11dd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -25,7 +25,9 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/AMDGPUMetadata.h" +#include 
"llvm/Support/AMDGPUObjLinkingInfo.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" @@ -664,6 +666,103 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( OS << "\t.end_amdhsa_kernel\n"; } +namespace { +/// Callback type invoked by \c forEachInfoScope for each function scope in +/// the canonical iteration order. The scope is emitted exactly once per +/// unique \p Sym regardless of how many flat entries reference it. +using InfoScopeEmitter = function_ref Uses, + ArrayRef Calls, ArrayRef IndirectCallTypeIds, + ArrayRef TypeIds)>; + +/// Group the flat edge lists in \p Data by source function symbol and drive +/// per-scope emission. A scope is opened for every function with attached +/// info and for every function that appears only as an edge source; each +/// scope is emitted exactly once. Both the asm and ELF streamers share this +/// iteration logic and only differ in the per-scope emission callback. 
+static void forEachInfoScope(const AMDGPU::InfoSectionData &Data, + InfoScopeEmitter Emit) { + DenseMap> FuncUses; + DenseMap> FuncCalls; + DenseMap> FuncIndirectCalls; + DenseMap> FuncTypeIds; + for (const auto &[Func, Res] : Data.Uses) + FuncUses[Func].push_back(Res); + for (const auto &[Src, Dst] : Data.Calls) + FuncCalls[Src].push_back(Dst); + for (const auto &[Func, TypeId] : Data.IndirectCalls) + FuncIndirectCalls[Func].push_back(TypeId); + for (const auto &[Sym, TypeId] : Data.TypeIds) + FuncTypeIds[Sym].push_back(TypeId); + + DenseSet Emitted; + auto EmitIfNew = [&](MCSymbol *Sym, const AMDGPU::FuncInfo *Info) { + if (!Emitted.insert(Sym).second) + return; + ArrayRef Uses, Calls; + ArrayRef IndirectCallTypeIds, TypeIds; + if (auto It = FuncUses.find(Sym); It != FuncUses.end()) + Uses = It->second; + if (auto It = FuncCalls.find(Sym); It != FuncCalls.end()) + Calls = It->second; + if (auto It = FuncIndirectCalls.find(Sym); It != FuncIndirectCalls.end()) + IndirectCallTypeIds = It->second; + if (auto It = FuncTypeIds.find(Sym); It != FuncTypeIds.end()) + TypeIds = It->second; + Emit(Sym, Info, Uses, Calls, IndirectCallTypeIds, TypeIds); + }; + + for (const AMDGPU::FuncInfo &Func : Data.Funcs) + EmitIfNew(Func.Sym, &Func); + // Emit scopes for functions that only appear as edge sources (e.g. typeid + // tags on address-taken declarations, or callers of external functions). 
+ for (const auto &[Sym, TypeId] : Data.TypeIds) + EmitIfNew(Sym, nullptr); + for (const auto &[Sym, Res] : Data.Uses) + EmitIfNew(Sym, nullptr); + for (const auto &[Sym, Dst] : Data.Calls) + EmitIfNew(Sym, nullptr); + for (const auto &[Sym, TypeId] : Data.IndirectCalls) + EmitIfNew(Sym, nullptr); +} +} // namespace + +void AMDGPUTargetAsmStreamer::emitAMDGPUInfo( + const AMDGPU::InfoSectionData &Data) { + forEachInfoScope(Data, [&](MCSymbol *Sym, const AMDGPU::FuncInfo *Info, + ArrayRef Uses, + ArrayRef Calls, + ArrayRef IndirectCallTypeIds, + ArrayRef TypeIds) { + OS << "\t.amdgpu_info " << Sym->getName() << '\n'; + if (Info) { + AMDGPU::FuncInfoFlags Flags{}; + if (Info->UsesVCC) + Flags |= AMDGPU::FuncInfoFlags::FUNC_USES_VCC; + if (Info->UsesFlatScratch) + Flags |= AMDGPU::FuncInfoFlags::FUNC_USES_FLAT_SCRATCH; + if (Info->HasDynStack) + Flags |= AMDGPU::FuncInfoFlags::FUNC_HAS_DYN_STACK; + OS << "\t\t.amdgpu_flags " << llvm::to_underlying(Flags) << '\n'; + OS << "\t\t.amdgpu_num_sgpr " << Info->NumSGPR << '\n'; + OS << "\t\t.amdgpu_num_vgpr " << Info->NumArchVGPR << '\n'; + if (Info->NumAccVGPR) + OS << "\t\t.amdgpu_num_agpr " << Info->NumAccVGPR << '\n'; + OS << "\t\t.amdgpu_private_segment_size " << Info->PrivateSegmentSize + << '\n'; + } + for (MCSymbol *Res : Uses) + OS << "\t\t.amdgpu_use " << Res->getName() << '\n'; + for (MCSymbol *Dst : Calls) + OS << "\t\t.amdgpu_call " << Dst->getName() << '\n'; + for (StringRef TypeId : IndirectCallTypeIds) + OS << "\t\t.amdgpu_indirect_call \"" << TypeId << "\"\n"; + for (StringRef TypeId : TypeIds) + OS << "\t\t.amdgpu_typeid \"" << TypeId << "\"\n"; + OS << "\t.end_amdgpu_info\n\n"; + }); +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -1065,3 +1164,83 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( for (uint32_t i = 0; i < 
sizeof(amdhsa::kernel_descriptor_t::reserved3); ++i) Streamer.emitInt8(0u); } + +void AMDGPUTargetELFStreamer::emitAMDGPUInfo( + const AMDGPU::InfoSectionData &Data) { + MCELFStreamer &S = getStreamer(); + MCContext &Context = S.getContext(); + + StringTableBuilder StrTab(StringTableBuilder::ELF); + auto getOrAddString = [&](StringRef Str) -> uint32_t { + if (Str.empty()) + return UINT32_MAX; + return StrTab.add(Str); + }; + + auto EmitU32Entry = [&](AMDGPU::InfoKind Kind, uint32_t Val) { + S.emitInt8(static_cast(Kind)); + S.emitInt8(4); + S.emitInt32(Val); + }; + auto EmitSymEntry = [&](AMDGPU::InfoKind Kind, MCSymbol *Sym) { + S.emitInt8(static_cast(Kind)); + S.emitInt8(8); + S.emitValue(MCSymbolRefExpr::create(Sym, Context), 8); + }; + + S.pushSection(); + MCSectionELF *InfoSec = Context.getELFSection( + ".amdgpu.info", ELF::SHT_PROGBITS, ELF::SHF_EXCLUDE); + S.switchSection(InfoSec); + + forEachInfoScope(Data, [&](MCSymbol *Sym, const AMDGPU::FuncInfo *Info, + ArrayRef Uses, + ArrayRef Calls, + ArrayRef IndirectCallTypeIds, + ArrayRef TypeIds) { + EmitSymEntry(AMDGPU::InfoKind::INFO_FUNC, Sym); + + if (Info) { + AMDGPU::FuncInfoFlags Flags{}; + if (Info->UsesVCC) + Flags |= AMDGPU::FuncInfoFlags::FUNC_USES_VCC; + if (Info->UsesFlatScratch) + Flags |= AMDGPU::FuncInfoFlags::FUNC_USES_FLAT_SCRATCH; + if (Info->HasDynStack) + Flags |= AMDGPU::FuncInfoFlags::FUNC_HAS_DYN_STACK; + EmitU32Entry(AMDGPU::InfoKind::INFO_FLAGS, llvm::to_underlying(Flags)); + EmitU32Entry(AMDGPU::InfoKind::INFO_NUM_SGPR, Info->NumSGPR); + EmitU32Entry(AMDGPU::InfoKind::INFO_NUM_VGPR, Info->NumArchVGPR); + // INFO_NUM_AGPR is only emitted when the function actually uses AGPRs, + // since AGPRs are not available on all architectures. 
+ if (Info->NumAccVGPR) + EmitU32Entry(AMDGPU::InfoKind::INFO_NUM_AGPR, Info->NumAccVGPR); + EmitU32Entry(AMDGPU::InfoKind::INFO_PRIVATE_SEGMENT_SIZE, + Info->PrivateSegmentSize); + } + + for (MCSymbol *Res : Uses) + EmitSymEntry(AMDGPU::InfoKind::INFO_USE, Res); + for (MCSymbol *Dst : Calls) + EmitSymEntry(AMDGPU::InfoKind::INFO_CALL, Dst); + for (StringRef TypeId : IndirectCallTypeIds) { + EmitU32Entry(AMDGPU::InfoKind::INFO_INDIRECT_CALL, + getOrAddString(TypeId)); + } + for (StringRef TypeId : TypeIds) + EmitU32Entry(AMDGPU::InfoKind::INFO_TYPEID, getOrAddString(TypeId)); + }); + + if (!StrTab.empty()) { + StrTab.finalizeInOrder(); + MCSectionELF *Sec = Context.getELFSection(".amdgpu.strtab", ELF::SHT_STRTAB, + ELF::SHF_EXCLUDE); + S.switchSection(Sec); + SmallString<128> Buf; + raw_svector_ostream OS(Buf); + StrTab.write(OS); + S.emitBytes(Buf); + } + + S.popSection(); +} diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 3a0d8dcd2d27c..ca1fe3ccf3da1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -11,7 +11,10 @@ #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUPALMetadata.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCStreamer.h" +#include +#include namespace llvm { @@ -26,6 +29,27 @@ struct MCKernelDescriptor; namespace HSAMD { struct Metadata; } + +struct FuncInfo { + uint32_t NumSGPR = 0; + uint32_t NumArchVGPR = 0; + uint32_t NumAccVGPR = 0; + uint32_t PrivateSegmentSize = 0; + bool UsesVCC = false; + bool UsesFlatScratch = false; + bool HasDynStack = false; + + MCSymbol *Sym = nullptr; +}; + +struct InfoSectionData { + SmallVector Funcs; + SmallVector, 4> Uses; + SmallVector, 8> Calls; + SmallVector, 4> IndirectCalls; + SmallVector, 4> TypeIds; +}; + } // namespace AMDGPU class AMDGPUTargetStreamer : public MCTargetStreamer { @@ -104,6 +128,8 @@ class 
AMDGPUTargetStreamer : public MCTargetStreamer { const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr) {} + virtual void emitAMDGPUInfo(const AMDGPU::InfoSectionData &Data) {} + static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -168,6 +194,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr) override; + + void emitAMDGPUInfo(const AMDGPU::InfoSectionData &Data) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -221,6 +249,8 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr) override; + + void emitAMDGPUInfo(const AMDGPU::InfoSectionData &Data) override; }; } #endif diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-agpr.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-agpr.ll new file mode 100644 index 0000000000000..6442d6f6501c9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-agpr.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-enable-object-linking < %s | FileCheck %s + +; Verify that .amdgpu_num_agpr IS emitted when AGPRs are used on a target +; that supports them (gfx908 has a separate AGPR file). 
+ +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) + +define void @func_with_agpr(float %a, float %b, ptr addrspace(1) %out) { + %result = call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> zeroinitializer, i32 0, i32 0, i32 0) + store <4 x float> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @kern(float %a, float %b, ptr addrspace(1) %out) { + call void @func_with_agpr(float %a, float %b, ptr addrspace(1) %out) + ret void +} + +; CHECK: .amdgpu_info func_with_agpr +; CHECK: .amdgpu_num_agpr {{[1-9][0-9]*}} +; CHECK: .end_amdgpu_info +; CHECK: .amdgpu_info kern +; CHECK: .end_amdgpu_info diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-callgraph.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-callgraph.ll new file mode 100644 index 0000000000000..0297a2a6e049d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-callgraph.ll @@ -0,0 +1,62 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=obj < %s | llvm-readobj -r --sections - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=asm < %s | FileCheck %s --check-prefix=ASM --implicit-check-not=.amdgpu_num_agpr + +; Test that the unified .amdgpu.info section (.amdgpu_info blocks in assembly) is +; emitted with correct relocations when object linking is enabled. + +declare void @extern_func() +declare void @tail_extern() + +; The .amdgpu.info section should exist as SHT_PROGBITS with SHF_EXCLUDE. +; CHECK: Section { +; CHECK: Name: .amdgpu.info +; CHECK: Type: SHT_PROGBITS +; CHECK: Flags [ +; CHECK: SHF_EXCLUDE +; CHECK: ] + +; Symbol references in the binary resource metadata still use R_AMDGPU_ABS64 relocations. 
+; CHECK-DAG: R_AMDGPU_ABS64 my_kernel +; CHECK-DAG: R_AMDGPU_ABS64 helper +; CHECK-DAG: R_AMDGPU_ABS64 extern_func +; COM: Tail-call callee must still be recorded as an INFO_CALL edge. +; CHECK-DAG: R_AMDGPU_ABS64 tail_helper +; CHECK-DAG: R_AMDGPU_ABS64 tail_extern + +; COM: Assembly: per-function .amdgpu_info blocks (target flags derived from +; COM: e_flags). +; ASM-DAG: .amdgpu_info helper +; ASM-DAG: .amdgpu_flags {{[0-9]+}} +; ASM-DAG: .amdgpu_num_vgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_num_sgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_private_segment_size {{[0-9]+}} +; ASM-DAG: .amdgpu_call extern_func +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info my_kernel +; ASM-DAG: .amdgpu_flags {{[0-9]+}} +; ASM-DAG: .amdgpu_num_vgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_num_sgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_private_segment_size {{[0-9]+}} +; ASM-DAG: .amdgpu_call helper +; ASM-DAG: .end_amdgpu_info + +; COM: A tail call is lowered to SI_TCRETURN (isCall = 1). Verify that the +; COM: callee edge is still captured in the .amdgpu_info block of the caller. 
+; ASM-DAG: .amdgpu_info tail_helper +; ASM-DAG: .amdgpu_call tail_extern +; ASM-DAG: .end_amdgpu_info + +define void @helper() { + call void @extern_func() + ret void +} + +define amdgpu_kernel void @my_kernel() { + call void @helper() + ret void +} + +define void @tail_helper() { + tail call void @tail_extern() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll index 6bb9064ca1ac6..f573a3180c067 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll @@ -1,9 +1,11 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-enable-object-linking < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-enable-object-linking < %s | FileCheck %s --implicit-check-not=.amdgpu_num_agpr +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-enable-object-linking -filetype=obj < %s | llvm-readobj -r --sections - | FileCheck %s --check-prefix=ELF ; Verify object linking codegen for named barriers on GFX1250: ; 1. Barrier instructions use M0-based forms with relocation references -; 2. group_segment_fixed_size = 0 (linker patches it) -; 3. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds) +; 2. .amdgpu.info section records the barrier as an LDS use edge +; 3. group_segment_fixed_size = 0 (linker patches it) +; 4. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds) @bar = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison @@ -13,12 +15,29 @@ ; CHECK: s_barrier_signal m0 ; CHECK: s_barrier_wait 1 -; KD: group_segment_fixed_size = 0 (linker will patch). 
; CHECK: .amdhsa_group_segment_fixed_size 0 -; LDS symbol declaration +; CHECK: .amdgpu_info kernel +; CHECK: .amdgpu_flags {{[0-9]+}} +; CHECK: .amdgpu_num_sgpr {{[0-9]+}} +; CHECK: .amdgpu_num_vgpr {{[0-9]+}} +; CHECK: .amdgpu_private_segment_size {{[0-9]+}} +; CHECK: .amdgpu_use __amdgpu_named_barrier.bar{{[^ ,]*}} +; CHECK: .amdgpu_call helper +; CHECK: .end_amdgpu_info + ; CHECK: .amdgpu_lds __amdgpu_named_barrier.bar{{[^ ,]*}}, 32, 4 +; ELF: Section { +; ELF: Name: .amdgpu.info +; ELF: Type: SHT_PROGBITS +; ELF: Flags [ +; ELF: SHF_EXCLUDE + +; ELF-DAG: R_AMDGPU_ABS64 kernel +; ELF-DAG: R_AMDGPU_ABS64 __amdgpu_named_barrier.bar{{[^ ]*}} +; ELF-DAG: R_AMDGPU_ABS64 helper + define amdgpu_kernel void @kernel() { call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3) diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-typeid.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-typeid.ll new file mode 100644 index 0000000000000..0cf45c3bc81a3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-typeid.ll @@ -0,0 +1,257 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=obj < %s | llvm-readobj -r - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=asm < %s | FileCheck %s --check-prefix=ASM --implicit-check-not=.amdgpu_num_agpr + +; Test ABI register-size type ID generation for various function types. +; The type ID encodes each parameter/return by bit width: v=void, i=<=32-bit, +; l=33-64-bit, and >64-bit types widen to ceil(bits/32) x "i". Types with the +; same register footprint share an encoding (e.g. float(float) and i32(i32) +; both produce "ii"). 
Coverage here spans scalars, vectors (whose size is the +; total element bit width), pointers across address spaces (AS 1 is 64-bit on +; amdhsa; AS 3 / AS 5 are 32-bit), and small integer types (i1/i8/i16) that +; are not natively passed as function arguments but ABI-promoted to i32 slots +; -- they still encode as "i", matching an i32 parameter. +; +; Cross-TU coverage: an address-taken declaration (defined in another TU) still +; gets an .amdgpu_info scope with .amdgpu_typeid, but no per-function resource +; counts since its body isn't available here. +declare void @extern_decl(i32) + +; Zero-sized aggregate types are valid IR function argument/return types. They +; consume no ABI registers, so arguments are omitted from the type ID and returns +; use the same no-result marker as void. +declare void @empty_struct_arg({}) +declare void @empty_array_i32_arg([0 x i8], i32) +declare void @empty_array_i64_i32_arg([0 x i8], i64, i32) +declare {} @empty_struct_ret() + +define void @void_void() { + ret void +} + +define i32 @i32_i32(i32 %x) { + ret i32 %x +} + +define void @void_ptr_i32(ptr %p, i32 %x) { + ret void +} + +define i64 @i64_i64_i64(i64 %a, i64 %b) { + ret i64 %a +} + +define float @float_float(float %x) { + ret float %x +} + +; Address-space pointer widths: AS 1 (global) is 64-bit -> "l"; AS 3 (LDS) and +; AS 5 (private) are 32-bit -> "i". +define void @ptr_addrspaces(ptr addrspace(1) %g, ptr addrspace(3) %l, ptr addrspace(5) %p) { + ret void +} + +; Vector types: encoded by total bit width. <2 x i32> = 64 bits -> "l"; +; <4 x i32>/<4 x float>/<2 x i64> = 128 bits -> "iiii" each. +define <4 x i32> @vectors(<2 x i32> %a, <4 x float> %b, <2 x i64> %c) { + ret <4 x i32> zeroinitializer +} + +; Small integer types (i1/i8/i16) are ABI-promoted to i32 register slots on +; AMDGPU. They all collapse to "i" under the bit-width scheme, matching an i32 +; parameter, so callers declared as void(i32, ...) remain compatible with +; callees taking void(i8, ...). 
signext/zeroext attributes describe the +; promotion mode and do not affect the encoding. +define void @promoted_small_ints(i8 signext %a, i16 zeroext %b, i1 %c) { + ret void +} + +; Wider non-vector scalars: double is "l" (64 bits); i128 widens to 4 x "i" +; (ceil(128/32)). +define double @wide_scalars(double %a, i128 %b) { + ret double %a +} + +%Struct16 = type { i32, i32, i32, i32 } + +; byval / byref struct pointer parameters encode as a single pointer register +; slot, the same as a plain pointer in the same address space. byval describes +; a caller-side stack copy and byref describes a pointer handed through +; unchanged; neither changes the callee's register footprint (one pointer), +; so both collapse to "i" (for 32-bit AS) or "l" (for 64-bit AS). Compare +; with the plain-pointer encodings in @ptr_addrspaces. +define void @byval_struct_private(ptr addrspace(5) byval(%Struct16) %p) { + ret void +} + +define void @byref_struct_constant(ptr addrspace(4) byref(%Struct16) %p) { + ret void +} + +; Indirect-call type IDs are derived from the call instruction's FunctionType +; using the same rules, so they match the .amdgpu_typeid of an ABI-compatible +; address-taken callee. Duplicate signatures within one function are +; deduplicated (the second void() call below shares "v" with the first and +; yields only one .amdgpu_indirect_call entry; likewise a plain +; ptr addrspace(5) call and a ptr addrspace(5) byval(...) call both encode as +; "vi" and collapse to one entry). Zero-sized arguments consume no ABI registers, +; so the empty-array/i64/i32 call encodes as "vli". 
+define void @icaller(ptr %f_void, ptr %f_ptrs, ptr %f_vec, ptr %f_small, ptr %f_wide, ptr %f_dup, ptr %f_priv, ptr %f_priv_byval, ptr %f_const, ptr %f_const_byref, ptr %f_empty_struct, ptr %f_empty_array_i32, ptr %f_empty_array_i64_i32, ptr %f_empty_ret) { + call void %f_void() + call void %f_ptrs(ptr addrspace(1) null, ptr addrspace(3) null, ptr addrspace(5) null) + %v = call <4 x i32> %f_vec(<2 x i32> zeroinitializer, <4 x float> zeroinitializer, <2 x i64> zeroinitializer) + call void %f_small(i8 signext 0, i16 zeroext 0, i1 false) + %d = call double %f_wide(double 0.0, i128 0) + call void %f_dup() + call void %f_priv(ptr addrspace(5) null) + call void %f_priv_byval(ptr addrspace(5) byval(%Struct16) null) + call void %f_const(ptr addrspace(4) null) + call void %f_const_byref(ptr addrspace(4) byref(%Struct16) null) + call void %f_empty_struct({} zeroinitializer) + call void %f_empty_array_i32([0 x i8] zeroinitializer, i32 0) + call void %f_empty_array_i64_i32([0 x i8] zeroinitializer, i64 0, i32 0) + %empty_ret = call {} %f_empty_ret() + ret void +} + +; Take the address of each function so they appear as resource nodes. 
+define void @taker() { + %p0 = alloca ptr, addrspace(5) + store volatile ptr @void_void, ptr addrspace(5) %p0 + store volatile ptr @i32_i32, ptr addrspace(5) %p0 + store volatile ptr @void_ptr_i32, ptr addrspace(5) %p0 + store volatile ptr @i64_i64_i64, ptr addrspace(5) %p0 + store volatile ptr @float_float, ptr addrspace(5) %p0 + store volatile ptr @ptr_addrspaces, ptr addrspace(5) %p0 + store volatile ptr @vectors, ptr addrspace(5) %p0 + store volatile ptr @promoted_small_ints, ptr addrspace(5) %p0 + store volatile ptr @wide_scalars, ptr addrspace(5) %p0 + store volatile ptr @byval_struct_private, ptr addrspace(5) %p0 + store volatile ptr @byref_struct_constant, ptr addrspace(5) %p0 + store volatile ptr @extern_decl, ptr addrspace(5) %p0 + store volatile ptr @empty_struct_arg, ptr addrspace(5) %p0 + store volatile ptr @empty_array_i32_arg, ptr addrspace(5) %p0 + store volatile ptr @empty_array_i64_i32_arg, ptr addrspace(5) %p0 + store volatile ptr @empty_struct_ret, ptr addrspace(5) %p0 + ret void +} + +define amdgpu_kernel void @kern() { + call void @taker() + call void @icaller(ptr @void_void, ptr @ptr_addrspaces, ptr @vectors, + ptr @promoted_small_ints, ptr @wide_scalars, + ptr @void_void, + ptr @byval_struct_private, ptr @byval_struct_private, + ptr @byref_struct_constant, ptr @byref_struct_constant, + ptr @empty_struct_arg, ptr @empty_array_i32_arg, + ptr @empty_array_i64_i32_arg, ptr @empty_struct_ret) + ret void +} + +; CHECK-DAG: R_AMDGPU_ABS64 void_void +; CHECK-DAG: R_AMDGPU_ABS64 i32_i32 +; CHECK-DAG: R_AMDGPU_ABS64 void_ptr_i32 +; CHECK-DAG: R_AMDGPU_ABS64 i64_i64_i64 +; CHECK-DAG: R_AMDGPU_ABS64 float_float +; CHECK-DAG: R_AMDGPU_ABS64 ptr_addrspaces +; CHECK-DAG: R_AMDGPU_ABS64 vectors +; CHECK-DAG: R_AMDGPU_ABS64 promoted_small_ints +; CHECK-DAG: R_AMDGPU_ABS64 wide_scalars +; CHECK-DAG: R_AMDGPU_ABS64 byval_struct_private +; CHECK-DAG: R_AMDGPU_ABS64 byref_struct_constant +; CHECK-DAG: R_AMDGPU_ABS64 extern_decl +; CHECK-DAG: R_AMDGPU_ABS64 
empty_struct_arg +; CHECK-DAG: R_AMDGPU_ABS64 empty_array_i32_arg +; CHECK-DAG: R_AMDGPU_ABS64 empty_array_i64_i32_arg +; CHECK-DAG: R_AMDGPU_ABS64 empty_struct_ret +; CHECK-DAG: R_AMDGPU_ABS64 icaller +; CHECK-DAG: R_AMDGPU_ABS64 taker +; CHECK-DAG: R_AMDGPU_ABS64 kern + +; ASM-DAG: .amdgpu_info void_void +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "v" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info i32_i32 +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "ii" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info void_ptr_i32 +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "vli" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info i64_i64_i64 +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "lll" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info float_float +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "ii" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info ptr_addrspaces +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "vlii" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info vectors +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "iiiiliiiiiiii" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info promoted_small_ints +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "viii" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info wide_scalars +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "lliiii" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info byval_struct_private +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "vi" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info byref_struct_constant +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_typeid "vl" +; ASM-DAG: .end_amdgpu_info +; COM: Address-taken declaration: only the type-ID appears in its scope, with +; COM: no per-function resource counts (the body lives in another TU). 
+; ASM-DAG: .amdgpu_info extern_decl +; ASM-DAG: .amdgpu_typeid "vi" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info empty_struct_arg +; ASM-DAG: .amdgpu_typeid "v" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info empty_array_i32_arg +; ASM-DAG: .amdgpu_typeid "vi" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info empty_array_i64_i32_arg +; ASM-DAG: .amdgpu_typeid "vli" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info empty_struct_ret +; ASM-DAG: .amdgpu_typeid "v" +; ASM-DAG: .end_amdgpu_info +; COM: @icaller's indirect call type IDs mirror the @void_void / @ptr_addrspaces / +; COM: @vectors / @promoted_small_ints / @wide_scalars .amdgpu_typeid encodings +; COM: above, proving ABI compatibility. The duplicate void() indirect call is +; COM: deduplicated, so "v" appears only once. The byval/byref pairs dedupe +; COM: with their plain-pointer counterparts: "vi" (AS 5) and "vl" (AS 4) each +; COM: appear once despite two call sites apiece. +; ASM-DAG: .amdgpu_info icaller +; ASM-DAG: .amdgpu_flags 1 +; ASM-DAG: .amdgpu_indirect_call "v" +; ASM-DAG: .amdgpu_indirect_call "vlii" +; ASM-DAG: .amdgpu_indirect_call "iiiiliiiiiiii" +; ASM-DAG: .amdgpu_indirect_call "viii" +; ASM-DAG: .amdgpu_indirect_call "lliiii" +; ASM-DAG: .amdgpu_indirect_call "vi" +; ASM-DAG: .amdgpu_indirect_call "vl" +; ASM-DAG: .amdgpu_indirect_call "vli" +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info taker +; ASM-DAG: .amdgpu_flags 0 +; ASM-DAG: .amdgpu_num_vgpr {{[0-9]+}} +; ASM-DAG: .end_amdgpu_info +; COM: The kernel scope is present but carries no type IDs of its own (kernels +; COM: aren't indirect-call targets). Direct-call edges from the kernel body are +; COM: exercised separately in the callgraph test. 
+; ASM-DAG: .amdgpu_info kern +; ASM-DAG: .amdgpu_flags {{[0-9]+}} +; ASM-DAG: .end_amdgpu_info diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen.ll index 878f3abf7ccfc..0020c2272d23e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen.ll @@ -1,17 +1,18 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking < %s | FileCheck -check-prefixes=ASM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=obj < %s | llvm-readobj -r --syms - | FileCheck -check-prefixes=ELF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking < %s | FileCheck -check-prefixes=ASM %s --implicit-check-not=.amdgpu_num_agpr +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=obj < %s | llvm-readobj -r --syms --sections - | FileCheck -check-prefixes=ELF %s ; Test that with object linking enabled, external LDS declarations produce -; @abs32@lo relocations, SHN_AMDGPU_LDS symbols, and .amdgpu_lds directives. -; Covers multiple LDS variables with different sizes and alignments (including -; zero-sized dynamic LDS), usage from both kernels and device functions, and +; @abs32@lo relocations, SHN_AMDGPU_LDS symbols, .amdgpu_lds directives, +; and .amdgpu_use edges in the .amdgpu.info section. Covers multiple LDS +; variables with different sizes and alignments (including zero-sized dynamic +; LDS), usage from both kernels and device functions, and ; group_segment_fixed_size = 0 (linker patches via binary patching). @lds_large = external addrspace(3) global [256 x i8], align 16 @lds_small = external addrspace(3) global [128 x i8], align 4 @lds_dynamic = external addrspace(3) global [0 x i8], align 8 -; --- Assembly checks --- +; Instruction-level relocation checks. 
; ASM-LABEL: {{^}}device_func: ; ASM: v_add_u32_e32 v{{[0-9]+}}, lds_large@abs32@lo, v{{[0-9]+}} @@ -19,17 +20,49 @@ ; ASM-DAG: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, lds_small@abs32@lo ; ASM-DAG: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, lds_dynamic@abs32@lo +; .amdgpu.info section with LDS use edges. +; ASM-DAG: .amdgpu_info device_func +; ASM-DAG: .amdgpu_flags {{[0-9]+}} +; ASM-DAG: .amdgpu_num_vgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_num_sgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_private_segment_size {{[0-9]+}} +; ASM-DAG: .amdgpu_use lds_large +; ASM-DAG: .end_amdgpu_info +; ASM-DAG: .amdgpu_info test_kernel +; ASM-DAG: .amdgpu_flags {{[0-9]+}} +; ASM-DAG: .amdgpu_num_vgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_num_sgpr {{[0-9]+}} +; ASM-DAG: .amdgpu_private_segment_size {{[0-9]+}} +; ASM-DAG: .amdgpu_use lds_dynamic +; ASM-DAG: .amdgpu_use lds_small +; ASM-DAG: .amdgpu_call device_func +; ASM-DAG: .end_amdgpu_info + +; SHN_AMDGPU_LDS directives. ; ASM-DAG: .amdgpu_lds lds_large, 256, 16 ; ASM-DAG: .amdgpu_lds lds_small, 128, 4 ; ASM-DAG: .amdgpu_lds lds_dynamic, 0, 8 ; ASM: .group_segment_fixed_size: 0 -; --- ELF checks --- +; .amdgpu.info section exists. +; ELF: Section { +; ELF: Name: .amdgpu.info +; ELF: Type: SHT_PROGBITS +; ELF: Flags [ +; ELF: SHF_EXCLUDE + +; Relocations. ; ELF-DAG: R_AMDGPU_ABS32_LO lds_large ; ELF-DAG: R_AMDGPU_ABS32_LO lds_small ; ELF-DAG: R_AMDGPU_ABS32_LO lds_dynamic +; ELF-DAG: R_AMDGPU_ABS64 device_func +; ELF-DAG: R_AMDGPU_ABS64 test_kernel +; ELF-DAG: R_AMDGPU_ABS64 lds_large +; ELF-DAG: R_AMDGPU_ABS64 lds_small +; ELF-DAG: R_AMDGPU_ABS64 lds_dynamic +; SHN_AMDGPU_LDS symbols. 
; ELF-DAG: Name: lds_large ; ELF-DAG: Name: lds_small ; ELF-DAG: Name: lds_dynamic diff --git a/llvm/test/MC/AMDGPU/amdgpu-info-err.s b/llvm/test/MC/AMDGPU/amdgpu-info-err.s new file mode 100644 index 0000000000000..22e6d2e29f47d --- /dev/null +++ b/llvm/test/MC/AMDGPU/amdgpu-info-err.s @@ -0,0 +1,43 @@ +// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 %s -filetype=null 2>&1 | FileCheck %s + +// Each error case aborts parsing of its enclosing .amdgpu_info block: the +// parser returns on the failing directive, which implicitly exits the block +// (there is no block-open state tracked at the top level), and the next +// test case starts fresh at top level. `.end_amdgpu_info` terminators are +// therefore intentionally omitted -- adding them here would themselves +// become "unknown directive" errors, since `.end_amdgpu_info` is only +// recognised inside the block. + +// Missing function symbol after .amdgpu_info. +.amdgpu_info +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected symbol name after .amdgpu_info + +// Unknown directive inside a .amdgpu_info block. +.amdgpu_info f_unknown_dir + .amdgpu_bogus 1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: unknown .amdgpu_info directive '.amdgpu_bogus' + +// .amdgpu_use with no resource symbol. +.amdgpu_info f_use_missing + .amdgpu_use +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected resource symbol for .amdgpu_use + +// .amdgpu_call with no callee symbol. +.amdgpu_info f_call_missing + .amdgpu_call +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected callee symbol for .amdgpu_call + +// .amdgpu_indirect_call with no type-ID string. +.amdgpu_info f_icall_missing + .amdgpu_indirect_call +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected type ID string for .amdgpu_indirect_call + +// .amdgpu_typeid with no type-ID string. 
+.amdgpu_info f_typeid_missing + .amdgpu_typeid +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected type ID string for .amdgpu_typeid + +// Non-identifier token where a directive or .end_amdgpu_info is expected. +.amdgpu_info f_bad_token + 123 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected directive or .end_amdgpu_info diff --git a/llvm/test/MC/AMDGPU/amdgpu-info-roundtrip.s b/llvm/test/MC/AMDGPU/amdgpu-info-roundtrip.s new file mode 100644 index 0000000000000..d49890eb05174 --- /dev/null +++ b/llvm/test/MC/AMDGPU/amdgpu-info-roundtrip.s @@ -0,0 +1,126 @@ +// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj %s | llvm-readobj -r --sections --section-data --string-dump=.amdgpu.strtab - | FileCheck --check-prefix=OBJ %s + +// Test that .amdgpu_info directives round-trip through the assembler (asm and +// object emission) and produce the correct TLV-encoded .amdgpu.info section. + + .text + .globl my_kernel + .p2align 8 + .type my_kernel,@function +my_kernel: + s_endpgm +.Lfunc_end0: + .size my_kernel, .Lfunc_end0-my_kernel + + .globl helper + .p2align 2 + .type helper,@function +helper: + s_setpc_b64 s[30:31] +.Lfunc_end1: + .size helper, .Lfunc_end1-helper + + .globl addr_taken_func + .p2align 2 + .type addr_taken_func,@function +addr_taken_func: + s_setpc_b64 s[30:31] +.Lfunc_end2: + .size addr_taken_func, .Lfunc_end2-addr_taken_func + + .globl extern_func + +// COM: Kernel: flags=7 (KERNEL|VCC|FLAT_SCRATCH), resources, call edge, use +// COM: edge, indirect call, and type ID. Non-zero AGPR to verify conditional +// COM: emission. + .amdgpu_info my_kernel + .amdgpu_flags 7 + .amdgpu_num_sgpr 33 + .amdgpu_num_vgpr 32 + .amdgpu_num_agpr 4 + .amdgpu_private_segment_size 0 + .amdgpu_use lds_var + .amdgpu_call helper + .amdgpu_indirect_call "vi" + .end_amdgpu_info + +// COM: Device function: flags=2 (VCC), call edge to external. 
Zero AGPR values +// COM: are omitted from the input; the parser defaults them to 0 and the +// COM: emitter skips them. + .amdgpu_info helper + .amdgpu_flags 2 + .amdgpu_num_sgpr 8 + .amdgpu_num_vgpr 10 + .amdgpu_private_segment_size 16 + .amdgpu_call extern_func + .end_amdgpu_info + +// Address-taken function with type ID. Zero AGPR omitted. + .amdgpu_info addr_taken_func + .amdgpu_flags 0 + .amdgpu_num_sgpr 2 + .amdgpu_num_vgpr 4 + .amdgpu_private_segment_size 0 + .amdgpu_typeid "vi" + .end_amdgpu_info + +// ASM: .amdgpu_info my_kernel +// ASM: .amdgpu_flags 7 +// ASM: .amdgpu_num_sgpr 33 +// ASM: .amdgpu_num_vgpr 32 +// ASM: .amdgpu_num_agpr 4 +// ASM: .amdgpu_private_segment_size 0 +// ASM: .amdgpu_use lds_var +// ASM: .amdgpu_call helper +// ASM: .amdgpu_indirect_call "vi" +// ASM: .end_amdgpu_info + +// ASM: .amdgpu_info helper +// ASM: .amdgpu_flags 2 +// ASM: .amdgpu_num_sgpr 8 +// ASM: .amdgpu_num_vgpr 10 +// ASM-NOT: .amdgpu_num_agpr +// ASM: .amdgpu_private_segment_size 16 +// ASM: .amdgpu_call extern_func +// ASM: .end_amdgpu_info + +// ASM: .amdgpu_info addr_taken_func +// ASM: .amdgpu_flags 0 +// ASM: .amdgpu_num_sgpr 2 +// ASM: .amdgpu_num_vgpr 4 +// ASM-NOT: .amdgpu_num_agpr +// ASM: .amdgpu_private_segment_size 0 +// ASM: .amdgpu_typeid "vi" +// ASM: .end_amdgpu_info + +// OBJ: Section { +// OBJ: Name: .amdgpu.info +// OBJ: Type: SHT_PROGBITS +// OBJ: Flags [ +// OBJ: SHF_EXCLUDE +// OBJ: ] +// OBJ: } + +// The string pool backs INFO_INDIRECT_CALL / INFO_TYPEID payloads. It is an +// ELF-convention SHT_STRTAB with a leading null byte at offset 0 and string +// deduplication -- both directives above reference the same "vi" TypeID, so +// it must appear exactly once starting at offset 1. +// OBJ: Section { +// OBJ: Name: .amdgpu.strtab +// OBJ: Type: SHT_STRTAB +// OBJ: Flags [ +// OBJ: SHF_EXCLUDE +// OBJ: ] +// OBJ: } + +// Relocations in .amdgpu.info should reference defined and external symbols. 
+// OBJ-DAG: R_AMDGPU_ABS64 my_kernel +// OBJ-DAG: R_AMDGPU_ABS64 helper +// OBJ-DAG: R_AMDGPU_ABS64 addr_taken_func +// OBJ-DAG: R_AMDGPU_ABS64 extern_func +// OBJ-DAG: R_AMDGPU_ABS64 lds_var + +// OBJ: String dump of section '.amdgpu.strtab': +// OBJ-NEXT: [{{ +}}1] vi +// OBJ-NOT: ] vi From 95f5a88c250b9d0175b5478f90b1cee584489384 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 11 May 2026 20:58:04 +0100 Subject: [PATCH 341/538] [AArch64][GlobalISel] More test updates for vector select. NFC (#197023) --- llvm/test/CodeGen/AArch64/arm64-mul.ll | 15 ++ llvm/test/CodeGen/AArch64/icmp.ll | 87 +++++++- llvm/test/CodeGen/AArch64/minmax.ll | 170 ++++++++++----- llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 102 ++++++++- .../AArch64/vec-combine-trunc-dup-ext.ll | 206 ++++++++++++++---- 5 files changed, 474 insertions(+), 106 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-mul.ll b/llvm/test/CodeGen/AArch64/arm64-mul.ll index 20b1c9f409e81..0ffd36b4aad21 100644 --- a/llvm/test/CodeGen/AArch64/arm64-mul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-mul.ll @@ -210,6 +210,21 @@ entry: ret i64 %tmp4 } +define i64 @t13b(i32 %a, i64 %b) nounwind { +; CHECK-LABEL: t13b: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-24910 // =0xffffffffffff9eb2 +; CHECK-NEXT: mov w9, w0 +; CHECK-NEXT: movk x8, #65347, lsl #16 +; CHECK-NEXT: madd x0, x9, x8, x1 +; CHECK-NEXT: ret +entry: + %tmp1 = zext i32 %a to i64 + %tmp3 = mul i64 %tmp1, -12345678 + %tmp4 = add i64 %b, %tmp3 + ret i64 %tmp4 +} + define i64 @t14(i32 %a, i64 %b) nounwind { ; CHECK-SD-LABEL: t14: ; CHECK-SD: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 3c49374d81f40..b0fdf1820d7eb 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | 
FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for v2p0_p0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3p0_p0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4p0_p0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for icmp_eq_v2p0_Zero_RHS +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for icmp_eq_v2p0_Zero_LHS define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) { ; CHECK-LABEL: i64_i64: @@ -1421,6 +1427,65 @@ entry: ret <2 x i128> %s } +define <2 x ptr> @v2p0_p0(<2 x ptr> %a, <2 x ptr> %b, <2 x ptr> %d, <2 x ptr> %e) { +; CHECK-LABEL: v2p0_p0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d +; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b +; CHECK-NEXT: ret +entry: + %c = icmp ne <2 x ptr> %a, %b + %s = select <2 x i1> %c, <2 x ptr> %d, <2 x ptr> %e + ret <2 x ptr> %s +} + +define <3 x ptr> @v3p0_p0(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %d, <3 x ptr> %e) { +; CHECK-LABEL: v3p0_p0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d6 killed $d6 def $q6 +; CHECK-NEXT: // kill: def $d7 killed $d7 def $q7 +; CHECK-NEXT: // kill: def $d5 killed $d5 def $q5 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: ldr d16, [sp, #24] +; CHECK-NEXT: ldr d17, [sp] +; CHECK-NEXT: mov v3.d[1], v4.d[0] +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v6.d[1], v7.d[0] +; CHECK-NEXT: ldp d1, d4, [sp, #8] +; CHECK-NEXT: mov 
v1.d[1], v4.d[0] +; CHECK-NEXT: cmgt v0.2d, v3.2d, v0.2d +; CHECK-NEXT: bsl v0.16b, v6.16b, v1.16b +; CHECK-NEXT: cmgt v1.2d, v5.2d, v2.2d +; CHECK-NEXT: mov v2.16b, v1.16b +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: bsl v2.16b, v17.16b, v16.16b +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: ret +entry: + %c = icmp slt <3 x ptr> %a, %b + %s = select <3 x i1> %c, <3 x ptr> %d, <3 x ptr> %e + ret <3 x ptr> %s +} + +define <4 x ptr> @v4p0_p0(<4 x ptr> %a, <4 x ptr> %b, <4 x ptr> %d, <4 x ptr> %e) { +; CHECK-LABEL: v4p0_p0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmgt v0.2d, v2.2d, v0.2d +; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b +; CHECK-NEXT: bsl v0.16b, v4.16b, v6.16b +; CHECK-NEXT: ret +entry: + %c = icmp slt <4 x ptr> %a, %b + %s = select <4 x i1> %c, <4 x ptr> %d, <4 x ptr> %e + ret <4 x ptr> %s +} + ; ===== ICMP Zero RHS ===== define <8 x i1> @icmp_eq_v8i8_Zero_RHS(<8 x i8> %a) { @@ -1489,6 +1554,16 @@ define <2 x i1> @icmp_eq_v2i64_Zero_RHS(<2 x i64> %a) { ret <2 x i1> %c } +define <2 x i1> @icmp_eq_v2p0_Zero_RHS(<2 x ptr> %a) { +; CHECK-LABEL: icmp_eq_v2p0_Zero_RHS: +; CHECK: // %bb.0: +; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %c = icmp eq <2 x ptr> %a, splat (ptr null) + ret <2 x i1> %c +} + define <8 x i1> @icmp_sge_v8i8_Zero_RHS(<8 x i8> %a) { ; CHECK-LABEL: icmp_sge_v8i8_Zero_RHS: ; CHECK: // %bb.0: @@ -1821,6 +1896,16 @@ define <2 x i1> @icmp_eq_v2i64_Zero_LHS(<2 x i64> %a) { ret <2 x i1> %c } +define <2 x i1> @icmp_eq_v2p0_Zero_LHS(<2 x ptr> %a) { +; CHECK-LABEL: icmp_eq_v2p0_Zero_LHS: +; CHECK: // %bb.0: +; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %c = icmp eq <2 x ptr> splat (ptr null), %a + ret <2 x i1> %c +} + define <8 x i1> @icmp_sge_v8i8_Zero_LHS(<8 x i8> %a) { ; CHECK-LABEL: icmp_sge_v8i8_Zero_LHS: ; CHECK: // %bb.0: diff --git 
a/llvm/test/CodeGen/AArch64/minmax.ll b/llvm/test/CodeGen/AArch64/minmax.ll index db9bd236319c6..6a81ee46faf76 100644 --- a/llvm/test/CodeGen/AArch64/minmax.ll +++ b/llvm/test/CodeGen/AArch64/minmax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <4 x i32> @t1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: t1: @@ -42,51 +43,77 @@ define <8 x i8> @t4(<8 x i8> %a, <8 x i8> %b) { } define <4 x i16> @t5(<4 x i16> %a, <4 x i16> %b) { -; CHECK-LABEL: t5: -; CHECK: // %bb.0: -; CHECK-NEXT: smin v0.4h, v1.4h, v0.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t5: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smin v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t5: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smin v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret %t1 = icmp sgt <4 x i16> %b, %a %t2 = select <4 x i1> %t1, <4 x i16> %a, <4 x i16> %b ret <4 x i16> %t2 } define <2 x i32> @t6(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: t6: -; CHECK: // %bb.0: -; CHECK-NEXT: smax v0.2s, v1.2s, v0.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t6: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smax v0.2s, v1.2s, v0.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t6: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %t1 = icmp slt <2 x i32> %b, %a %t2 = select <2 x i1> %t1, <2 x i32> %a, <2 x i32> %b ret <2 x i32> %t2 } define <16 x i8> @t7(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: t7: -; CHECK: // %bb.0: -; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t7: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umin v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t7: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umin v0.16b, 
v0.16b, v1.16b +; CHECK-GI-NEXT: ret %t1 = icmp ugt <16 x i8> %b, %a %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b ret <16 x i8> %t2 } define <8 x i16> @t8(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: t8: -; CHECK: // %bb.0: -; CHECK-NEXT: umax v0.8h, v1.8h, v0.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umax v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umax v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret %t1 = icmp ult <8 x i16> %b, %a %t2 = select <8 x i1> %t1, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %t2 } define <4 x i32> @t9(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: t9: -; CHECK: // %bb.0: -; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s -; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t9: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umin v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: smax v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t9: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: smax v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: ret %t1 = icmp ugt <4 x i32> %b, %a %t2 = select <4 x i1> %t1, <4 x i32> %a, <4 x i32> %b %t3 = icmp sge <4 x i32> %t2, %c @@ -95,24 +122,38 @@ define <4 x i32> @t9(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { } define <8 x i32> @t10(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: t10: -; CHECK: // %bb.0: -; CHECK-NEXT: smax v1.4s, v1.4s, v3.4s -; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t10: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smax v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: smax v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t10: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smax v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: smax v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret %t1 = icmp sgt <8 x i32> %a, %b %t2 = select <8 x i1> %t1, <8 x i32> %a, <8 x i32> %b ret <8 x i32> %t2 } define <16 x i32> @t11(<16 x i32> %a, <16 
x i32> %b) { -; CHECK-LABEL: t11: -; CHECK: // %bb.0: -; CHECK-NEXT: smin v2.4s, v2.4s, v6.4s -; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s -; CHECK-NEXT: smin v1.4s, v1.4s, v5.4s -; CHECK-NEXT: smin v3.4s, v3.4s, v7.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t11: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smin v2.4s, v2.4s, v6.4s +; CHECK-SD-NEXT: smin v0.4s, v0.4s, v4.4s +; CHECK-SD-NEXT: smin v1.4s, v1.4s, v5.4s +; CHECK-SD-NEXT: smin v3.4s, v3.4s, v7.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t11: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smin v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: smin v1.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: smin v2.4s, v2.4s, v6.4s +; CHECK-GI-NEXT: smin v3.4s, v3.4s, v7.4s +; CHECK-GI-NEXT: ret %t1 = icmp sle <16 x i32> %a, %b %t2 = select <16 x i1> %t1, <16 x i32> %a, <16 x i32> %b ret <16 x i32> %t2 @@ -120,12 +161,21 @@ define <16 x i32> @t11(<16 x i32> %a, <16 x i32> %b) { ; The icmp is used by two instructions, so don't produce a umin node. define <16 x i8> @t12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: t12: -; CHECK: // %bb.0: -; CHECK-NEXT: cmhi v2.16b, v1.16b, v0.16b -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t12: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmhi v2.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: sub v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t12: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.16b, #1 +; CHECK-GI-NEXT: cmhi v3.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: and v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: ret %t1 = icmp ugt <16 x i8> %b, %a %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b %t3 = zext <16 x i1> %t1 to <16 x i8> @@ -134,11 +184,19 @@ define <16 x i8> @t12(<16 x i8> %a, <16 x i8> %b) { } define <1 x i64> @t13(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: t13: -; CHECK: // %bb.0: 
-; CHECK-NEXT: cmhi d2, d1, d0 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t13: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmhi d2, d1, d0 +; CHECK-SD-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t13: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: cmp x8, x9 +; CHECK-GI-NEXT: fcsel d0, d0, d1, lo +; CHECK-GI-NEXT: ret %t1 = icmp ult <1 x i64> %a, %b %t2 = select <1 x i1> %t1, <1 x i64> %a, <1 x i64> %b ret <1 x i64> %t2 @@ -156,13 +214,21 @@ define <2 x i64> @t14(<2 x i64> %a, <2 x i64> %b) { } define <4 x i64> @t15(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: t15: -; CHECK: // %bb.0: -; CHECK-NEXT: cmhi v4.2d, v3.2d, v1.2d -; CHECK-NEXT: cmhi v5.2d, v2.2d, v0.2d -; CHECK-NEXT: bif v1.16b, v3.16b, v4.16b -; CHECK-NEXT: bif v0.16b, v2.16b, v5.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: t15: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmhi v4.2d, v3.2d, v1.2d +; CHECK-SD-NEXT: cmhi v5.2d, v2.2d, v0.2d +; CHECK-SD-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-SD-NEXT: bif v0.16b, v2.16b, v5.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: t15: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmhi v4.2d, v2.2d, v0.2d +; CHECK-GI-NEXT: cmhi v5.2d, v3.2d, v1.2d +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: bif v1.16b, v3.16b, v5.16b +; CHECK-GI-NEXT: ret %t1 = icmp ule <4 x i64> %a, %b %t2 = select <4 x i1> %t1, <4 x i64> %a, <4 x i64> %b ret <4 x i64> %t2 diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll index 7385f62881bec..48f9417c9345b 100644 --- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll +++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc %s -o - | FileCheck %s -target triple = "aarch64-linux-gnu" +; RUN: llc -mtriple aarch64-linux-gnu %s -o - | FileCheck %s 
--check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple aarch64-linux-gnu %s -o - -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for any_of_select_vf2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for any_of_select_vf32 ;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an ;; integer of a bitwidth equal to the number of lanes being reduced, then @@ -10,15 +13,40 @@ target triple = "aarch64-linux-gnu" ;; don't want to end up with scalarization. define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: any_of_select_vf4: -; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 -; CHECK-NEXT: movi d3, #0000000000000000 -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: cmeq v0.2d, v0.2d, v3.2d -; CHECK-NEXT: dup v0.2d, v0.d[0] -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: any_of_select_vf4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: movi d3, #0000000000000000 +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: cmeq v0.2d, v0.2d, v3.2d +; CHECK-SD-NEXT: dup v0.2d, v0.d[0] +; CHECK-SD-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: any_of_select_vf4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov w10, v0.s[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w8, w8, #0xff +; CHECK-GI-NEXT: tst w8, #0xf +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: 
sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: dup v0.4s, w8 +; CHECK-GI-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp = icmp slt <4 x i32> %mask, zeroinitializer %cmp.bc = bitcast <4 x i1> %cmp to i4 %cmp.bc.not = icmp eq i4 %cmp.bc, 0 @@ -26,6 +54,58 @@ define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) ret <4 x i32> %res } +define <4 x i8> @any_of_select_v4i8(<4 x i32> %mask, <4 x i8> %a, <4 x i8> %b) { +; CHECK-SD-LABEL: any_of_select_v4i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: movi d3, #0000000000000000 +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: cmeq d0, d0, d3 +; CHECK-SD-NEXT: bsl v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: any_of_select_v4i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov w10, v0.s[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: mov w9, #255 // =0xff +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w8, w8, #0xff +; CHECK-GI-NEXT: tst w8, #0xf +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: mov v3.h[1], w9 +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: mov v3.h[3], w9 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: eor v3.8b, v0.8b, v3.8b +; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: and v1.8b, v2.8b, v3.8b +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: add sp, sp, #16 
+; CHECK-GI-NEXT: ret + %cmp = icmp slt <4 x i32> %mask, zeroinitializer + %cmp.bc = bitcast <4 x i1> %cmp to i4 + %cmp.bc.not = icmp eq i4 %cmp.bc, 0 + %res = select i1 %cmp.bc.not, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %res +} + define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: any_of_select_vf2: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/vec-combine-trunc-dup-ext.ll b/llvm/test/CodeGen/AArch64/vec-combine-trunc-dup-ext.ll index 66a0e3ef6df98..01e545eb6bc7b 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-trunc-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-trunc-dup-ext.ll @@ -1,13 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <4 x i32> @dup_trunc_sext_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: dup_trunc_sext_v4i32: -; CHECK: // %bb.0: -; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: dup v0.4s, v0.s[2] -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_sext_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: dup v0.4s, v0.s[2] +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_sext_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov w10, v0.s[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v0.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[3], w10 +; CHECK-GI-NEXT: dup v0.8b, v0.b[2] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], 
w9 +; CHECK-GI-NEXT: umov w9, v0.b[3] +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 +; CHECK-GI-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: ret %cmp = icmp slt <4 x i32> %a, %b %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y @@ -15,12 +39,29 @@ define <4 x i32> @dup_trunc_sext_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, } define <2 x i64> @dup_trunc_sext_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %x, <2 x i64> %y) { -; CHECK-LABEL: dup_trunc_sext_v2i64: -; CHECK: // %bb.0: -; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d -; CHECK-NEXT: dup v0.2d, v0.d[0] -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_sext_v2i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: dup v0.2d, v0.d[0] +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_sext_v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-GI-NEXT: fmov x9, d0 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #63 +; CHECK-GI-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-GI-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: ret %cmp = icmp slt <2 x i64> %a, %b %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> %sel = select <2 x i1> %splat, <2 x i64> %x, <2 x i64> %y @@ -28,12 +69,35 @@ define <2 x i64> @dup_trunc_sext_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %x, } define <4 x float> @dup_trunc_sext_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x, <4 x float> %y) { -; CHECK-LABEL: dup_trunc_sext_v4f32: -; CHECK: // %bb.0: -; 
CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-NEXT: dup v0.4s, v0.s[3] -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_sext_v4f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: dup v0.4s, v0.s[3] +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_sext_v4f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov w10, v0.s[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v0.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[3], w10 +; CHECK-GI-NEXT: dup v0.8b, v0.b[3] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: umov w9, v0.b[3] +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 +; CHECK-GI-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: ret %cmp = fcmp ole <4 x float> %a, %b %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> %sel = select <4 x i1> %splat, <4 x float> %x, <4 x float> %y @@ -41,12 +105,29 @@ define <4 x float> @dup_trunc_sext_v4f32(<4 x float> %a, <4 x float> %b, <4 x fl } define <2 x double> @dup_trunc_sext_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %x, <2 x double> %y) { -; CHECK-LABEL: dup_trunc_sext_v2f64: -; CHECK: // %bb.0: -; CHECK-NEXT: fcmge v0.2d, v1.2d, v0.2d -; CHECK-NEXT: dup v0.2d, v0.d[1] -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_sext_v2f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcmge v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: dup v0.2d, v0.d[1] +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_sext_v2f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcmge v0.2d, 
v1.2d, v0.2d +; CHECK-GI-NEXT: fmov x9, d0 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: dup v0.8b, v0.b[1] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #63 +; CHECK-GI-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-GI-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: ret %cmp = fcmp ole <2 x double> %a, %b %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> %sel = select <2 x i1> %splat, <2 x double> %x, <2 x double> %y @@ -54,12 +135,35 @@ define <2 x double> @dup_trunc_sext_v2f64(<2 x double> %a, <2 x double> %b, <2 x } define <4 x i32> @dup_trunc_sext_v4i32_idx0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: dup_trunc_sext_v4i32_idx0: -; CHECK: // %bb.0: -; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: dup v0.4s, v0.s[0] -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_sext_v4i32_idx0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: dup v0.4s, v0.s[0] +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_sext_v4i32_idx0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: mov w10, v0.s[3] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v0.b[2], w9 +; CHECK-GI-NEXT: mov v0.b[3], w10 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: umov w9, v0.b[3] +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 +; CHECK-GI-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: bsl v0.16b, v2.16b, v3.16b 
+; CHECK-GI-NEXT: ret %cmp = icmp slt <4 x i32> %a, %b %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> zeroinitializer %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y @@ -67,12 +171,23 @@ define <4 x i32> @dup_trunc_sext_v4i32_idx0(<4 x i32> %a, <4 x i32> %b, <4 x i32 } define <8 x i16> @dup_trunc_sext_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %x, <8 x i16> %y) { -; CHECK-LABEL: dup_trunc_sext_v8i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h -; CHECK-NEXT: dup v0.8h, v0.h[2] -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_sext_v8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: dup v0.8h, v0.h[2] +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_sext_v8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: dup v0.8b, v0.b[2] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: shl v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-GI-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: ret %cmp = icmp slt <8 x i16> %a, %b %splat = shufflevector <8 x i1> %cmp, <8 x i1> poison, <8 x i32> %sel = select <8 x i1> %splat, <8 x i16> %x, <8 x i16> %y @@ -93,11 +208,18 @@ define <2 x i64> @negative_sext_i32(<2 x i64> %mask) { } define <2 x i64> @dup_trunc_zext_i32(<2 x i64> %a) { -; CHECK-LABEL: dup_trunc_zext_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.2d, v0.2d, #32 -; CHECK-NEXT: dup v0.2d, v0.d[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_trunc_zext_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushr v0.2d, v0.2d, #32 +; CHECK-SD-NEXT: dup v0.2d, v0.d[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_trunc_zext_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: dup v0.2s, v0.s[1] +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-GI-NEXT: ret %shifted = lshr <2 x i64> 
%a, %trunc = trunc <2 x i64> %shifted to <2 x i32> %splat = shufflevector <2 x i32> %trunc, <2 x i32> poison, <2 x i32> From 1ac50831246de95162bae252a794fad16b3e68c6 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 16:06:01 -0400 Subject: [PATCH 342/538] [libc] Fix -Wshadow warning in HashTable/table.h (#196857) --- libc/src/__support/HashTable/table.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h index f42eead2efcf2..578f01e0e2c55 100644 --- a/libc/src/__support/HashTable/table.h +++ b/libc/src/__support/HashTable/table.h @@ -208,10 +208,10 @@ struct HashTable { LIBC_INLINE HashTable *grow() const { size_t hint = full_capacity() + 1; - HashState state = this->state; + HashState new_state = state; // migrate to a new random state - state.update(&hint, sizeof(hint)); - HashTable *new_table = allocate(hint, state.finish()); + new_state.update(&hint, sizeof(hint)); + HashTable *new_table = allocate(hint, new_state.finish()); // It is safe to call unsafe_insert() because we know that: // - the new table has enough capacity to hold all the entries // - there is no duplicate key in the old table From 64a8f3645cab2ce09c454067c1b3c02eddb78cab Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 11 May 2026 15:06:05 -0500 Subject: [PATCH 343/538] [lldb] Fix missing status in CommandObjectBreakpointCommand (#197024) This should fix Breakpoint/breakpoint-command.test after #196589. 
--- .../Commands/CommandObjectBreakpointCommand.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp index 03ca1e3e87748..1ce10441e3ada 100644 --- a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp +++ b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp @@ -389,16 +389,25 @@ are no syntax errors may indicate that a function was declared but never called. } else { script_interp->CollectDataForBreakpointCommandCallback( m_bp_options_vec, result); + // Still gathering input; the IOHandler will set the final status. + result.SetStatus(eReturnStatusStarted); + return; } if (!error.Success()) result.SetError(std::move(error)); + else + result.SetStatus(eReturnStatusSuccessFinishNoResult); } else { // Special handling for one-liner specified inline. - if (m_options.m_use_one_liner) + if (m_options.m_use_one_liner) { SetBreakpointCommandCallback(m_bp_options_vec, m_options.m_one_liner.c_str()); - else + result.SetStatus(eReturnStatusSuccessFinishNoResult); + } else { CollectDataForBreakpointCommandCallback(m_bp_options_vec, result); + // Still gathering input; the IOHandler will set the final status. 
+ result.SetStatus(eReturnStatusStarted); + } } } } From c4e932f7663b44d867f96cb8ea66cabc80dddb86 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 16:06:20 -0400 Subject: [PATCH 344/538] [libc] Fix -Wshadow warning in IntegrationTest/test.cpp (#196858) --- libc/test/IntegrationTest/test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/IntegrationTest/test.cpp b/libc/test/IntegrationTest/test.cpp index 19eb255e57b04..0a9728f021e90 100644 --- a/libc/test/IntegrationTest/test.cpp +++ b/libc/test/IntegrationTest/test.cpp @@ -66,12 +66,12 @@ int atexit(void (*func)(void)) { return LIBC_NAMESPACE::atexit(func); } static constexpr uint64_t ALIGNMENT = alignof(double); static constexpr uint64_t MEMORY_SIZE = 256 * 1024 /* 256 KiB */; alignas(ALIGNMENT) static uint8_t memory[MEMORY_SIZE]; -static size_t ptr = 0; +static size_t global_ptr = 0; extern "C" { void *malloc(size_t size) { - LIBC_NAMESPACE::cpp::AtomicRef ref(ptr); + LIBC_NAMESPACE::cpp::AtomicRef ref(global_ptr); size = (size + ALIGNMENT - 1) & ~(ALIGNMENT - 1); size_t old_ptr = ref.fetch_add(size, LIBC_NAMESPACE::cpp::MemoryOrder::RELAXED); From a761e2e8297ed2295a5b6cf0619b69f6f52c1129 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 16:06:45 -0400 Subject: [PATCH 345/538] [libc] Fix -Wshadow warning in sqrtf128.h (#196851) sqrtf128() contained both `using namespace sqrtf128_internal;` and `using FPBits = fputil::FPBits;`, but sqrtf128_internal also had a `using FPBits = fputil::FPBits;`. The outer `using` wasn't actually used, so remove that one. 
--- libc/src/__support/math/sqrtf128.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/libc/src/__support/math/sqrtf128.h b/libc/src/__support/math/sqrtf128.h index d5b54b724c259..14dcbec1acbb6 100644 --- a/libc/src/__support/math/sqrtf128.h +++ b/libc/src/__support/math/sqrtf128.h @@ -60,8 +60,6 @@ namespace math { namespace sqrtf128_internal { -using FPBits = fputil::FPBits; - template LIBC_INLINE constexpr T prod_hi(T, U); // Get high part of integer multiplications. From 7ddee0b619f658cef905a69427ef9531fd1d229d Mon Sep 17 00:00:00 2001 From: Adel Ejjeh Date: Mon, 11 May 2026 15:08:20 -0500 Subject: [PATCH 346/538] [AMDGPU] Account for inline asm size in inst_pref_size calculation (#192306) `SIProgramInfo::getFunctionCodeSize()` with `IsLowerBound=true` was completely skipping inline assembly instructions, treating them as zero bytes. This caused `amdhsa_inst_pref_size` to be severely underestimated for kernels containing inline asm, defeating instruction prefetch on gfx11+. Use MCExpr label subtraction (`.Lfunc_end - func_sym`) to compute exact function code size, resolved at assembly time. This avoids inline asm string parsing which cannot reliably estimate code size and risks overestimation (which causes prefetch of unmapped memory and a fatal segfault). Add a new `AMDGPUMCExpr` variant (`AGVK_InstPrefSize`) to compute `min(divideCeil(codeSize, cacheLineSize), maxFieldVal)` as a custom MCExpr, following the same pattern as `AGVK_Occupancy` and `AGVK_AlignTo`. The cache line size and field width are derived from the subtarget via `IsaInfo::getInstCacheLineSize` and feature-bit checks (`isGFX12Plus`). Compute `inst_pref_size` in `AMDGPUAsmPrinter::endFunction()` where `.Lfunc_end` has already been emitted in the correct position (after the fix in #191526). 
The `inst_pref_size` MCExpr is stored as a separate field in `MCKernelDescriptor` rather than being OR'd into `compute_pgm_rsrc3`, because the label subtraction is unresolvable in text assembly mode and would corrupt the printing of other fields (e.g. `named_barrier_count`) that share the same register. The ELF streamer OR's `inst_pref_size` into `compute_pgm_rsrc3` when emitting the binary kernel descriptor bytes. Remove the `IsLowerBound` parameter from `getFunctionCodeSize()` as it is no longer needed for `inst_pref_size` calculation. Update existing lit tests, and add a new one. Both lit tests verify the correct `inst_pref_size` value in the object file by checking the `COMPUTE_PGM_RSRC3` bytes in the kernel descriptor via `llvm-objdump`. *This PR was developed with AI assistance (GitHub Copilot).* --------- Co-authored-by: Claude Opus 4 (1M context) Co-authored-by: Copilot --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 83 +++++----- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 18 ++ .../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 45 +++++ .../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 9 + llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 17 +- llvm/lib/Target/AMDGPU/SIProgramInfo.h | 5 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 8 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 + .../test/CodeGen/AMDGPU/inst-prefetch-hint.ll | 50 +++++- .../AMDGPU/inst-prefetch-inline-asm.ll | 154 ++++++++++++++++++ 10 files changed, 324 insertions(+), 68 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index ad61d8d084c7b..3a2738d9fc498 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -234,6 +234,18 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() { HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } +/// Set bits in a kernel descriptor MCExpr field: +/// return ((Dst & ~Mask) 
| (Value << Shift)) +static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value, + uint32_t Mask, uint32_t Shift, MCContext &Ctx) { + const auto *Shft = MCConstantExpr::create(Shift, Ctx); + const auto *Msk = MCConstantExpr::create(Mask, Ctx); + Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); + Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), + Ctx); + return Dst; +} + void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { const SIMachineFunctionInfo &MFI = *MF->getInfo(); if (!MFI.isEntryFunction()) @@ -241,6 +253,29 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { assert(TM.getTargetTriple().getOS() == Triple::AMDHSA); + const GCNSubtarget &STM = MF->getSubtarget(); + MCContext &Ctx = MF->getContext(); + + AMDGPU::MCKernelDescriptor KD = + getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo); + + // Compute inst_pref_size using MCExpr label subtraction for exact code + // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter) + // right after the function code, so (Lfunc_end - func_sym) gives the + // exact function code size in bytes. 
+ if (STM.hasInstPrefSize()) { + const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(getFunctionEnd(), OutContext), + MCSymbolRefExpr::create(CurrentFnSym, OutContext), OutContext); + + uint32_t Mask, Shift, Width, CacheLineSize; + STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize); + const MCExpr *InstPrefSize = + AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx); + KD.compute_pgm_rsrc3 = + setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx); + } + auto &Streamer = getTargetStreamer()->getStreamer(); auto &Context = Streamer.getContext(); auto &ObjectFileInfo = *Context.getObjectFileInfo(); @@ -254,13 +289,10 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { Streamer.emitValueToAlignment(Align(64), 0, 1, 0); ReadOnlySection.ensureMinAlignment(Align(64)); - const GCNSubtarget &STM = MF->getSubtarget(); - SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), - CurrentProgramInfo.NumVGPRsForWavesPerEU, + STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU, MCBinaryExpr::createSub( CurrentProgramInfo.NumSGPRsForWavesPerEU, AMDGPUMCExpr::createExtraSGPRs( @@ -1438,33 +1470,22 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.LdsSize = STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks; ProgInfo.EXCPEnable = 0; - // return ((Dst & ~Mask) | (Value << Shift)) - auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, - uint32_t Shift) { - const auto *Shft = MCConstantExpr::create(Shift, Ctx); - const auto *Msk = MCConstantExpr::create(Mask, Ctx); - Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); - Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), - Ctx); - return Dst; - }; - if (STM.hasGFX90AInsts()) { ProgInfo.ComputePGMRSrc3 = - SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset, + setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx); ProgInfo.ComputePGMRSrc3 = - SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit), + setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit), amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx); } if (STM.hasGFX1250Insts()) ProgInfo.ComputePGMRSrc3 = - SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, + setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, - amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx); ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, @@ -1483,26 +1504,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ", final occupancy is " + Twine(Occupancy)); F.getContext().diagnose(Diag); } - - if (isGFX11Plus(STM)) { - uint32_t CodeSizeInBytes = (uint32_t)std::min( - ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */), - (uint64_t)std::numeric_limits::max()); - uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128); - uint32_t Field, Shift, 
Width; - if (isGFX11(STM)) { - Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE; - Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT; - Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; - } else { - Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE; - Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT; - Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; - } - uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1); - ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3, - CreateExpr(InstPrefSize), Field, Shift); - } } static unsigned getRsrcReg(CallingConv::ID CallConv) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 5f580ac0577d5..758e9b445d6dd 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -21,6 +21,7 @@ #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/ErrorHandling.h" #define GET_SUBTARGETINFO_HEADER @@ -425,6 +426,23 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPrefetch() const { return HasGFX12Insts; } + bool hasInstPrefSize() const { return isGFX11Plus(); } + + void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, + uint32_t &CacheLineSize) const { + assert(isGFX11Plus()); + CacheLineSize = getInstCacheLineSize(); + if (getGeneration() == GFX11) { + Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE; + Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT; + Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; + } else { + Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE; + Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT; + Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; + } + } + // Has s_cmpk_* instructions. 
bool hasSCmpK() const { return getGeneration() < GFX12; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index fd0a2a6a77d7e..4563803ad6577 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -12,9 +12,12 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -74,6 +77,9 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { case AGVK_Occupancy: OS << "occupancy("; break; + case AGVK_InstPrefSize: + OS << "instprefsize("; + break; case AGVK_Lit: OS << "lit("; break; @@ -182,6 +188,27 @@ bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, return true; } +/// Get the inst_pref_size field width for the given subtarget. 
+static unsigned getInstPrefSizeFieldWidth(const MCSubtargetInfo &STI) { + if (AMDGPU::isGFX12Plus(STI)) + return amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; + return amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; +} + +bool AMDGPUMCExpr::evaluateInstPrefSize(MCValue &Res, + const MCAssembler *Asm) const { + uint64_t CodeSizeInBytes = 0; + if (!evaluateMCExprs(Args, Asm, {CodeSizeInBytes})) + return false; + const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); + unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI); + unsigned CacheLineSize = AMDGPU::IsaInfo::getInstCacheLineSize(STI); + uint64_t CodeSizeInLines = divideCeil(CodeSizeInBytes, CacheLineSize); + uint64_t MaxVal = (1u << FieldWidth) - 1; + Res = MCValue::get(std::min(CodeSizeInLines, MaxVal)); + return true; +} + bool AMDGPUMCExpr::isSymbolUsedInExpression(const MCSymbol *Sym, const MCExpr *E) { switch (E->getKind()) { @@ -227,6 +254,8 @@ bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res, return evaluateTotalNumVGPR(Res, Asm); case AGVK_Occupancy: return evaluateOccupancy(Res, Asm); + case AGVK_InstPrefSize: + return evaluateInstPrefSize(Res, Asm); case AGVK_Lit: case AGVK_Lit64: return Args[0]->evaluateAsRelocatable(Res, Asm); @@ -279,6 +308,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } +const AMDGPUMCExpr * +AMDGPUMCExpr::createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx) { + return create(AGVK_InstPrefSize, {CodeSizeBytes}, Ctx); +} + const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); @@ -469,6 +503,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, case AMDGPUMCExpr::VariantKind::AGVK_TotalNumVGPRs: case AMDGPUMCExpr::VariantKind::AGVK_AlignTo: case AMDGPUMCExpr::VariantKind::AGVK_Occupancy: + case 
AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize: case AMDGPUMCExpr::VariantKind::AGVK_Lit: case AMDGPUMCExpr::VariantKind::AGVK_Lit64: { int64_t Val; @@ -477,6 +512,16 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, KBM[Expr] = KnownBits::makeConstant(APValue); return; } + if (AGVK->getKind() == AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize) { + // The result is clamped to (1 << FieldWidth) - 1, so upper bits are + // known zero. FieldWidth is derived from the subtarget. + const MCSubtargetInfo *STI = AGVK->getCtx().getSubtargetInfo(); + unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI); + KnownBits KB(BitWidth); + KB.Zero.setBitsFrom(FieldWidth); + KBM[Expr] = KB; + return; + } KBM[Expr] = KnownBits(BitWidth); return; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 96bd8f4cf3c13..4b1aa0c591a80 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -38,6 +38,7 @@ class AMDGPUMCExpr : public MCTargetExpr { AGVK_TotalNumVGPRs, AGVK_AlignTo, AGVK_Occupancy, + AGVK_InstPrefSize, AGVK_Lit, AGVK_Lit64, }; @@ -69,6 +70,7 @@ class AMDGPUMCExpr : public MCTargetExpr { bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; bool evaluateAlignTo(MCValue &Res, const MCAssembler *Asm) const; bool evaluateOccupancy(MCValue &Res, const MCAssembler *Asm) const; + bool evaluateInstPrefSize(MCValue &Res, const MCAssembler *Asm) const; public: static const AMDGPUMCExpr * @@ -97,11 +99,18 @@ class AMDGPUMCExpr : public MCTargetExpr { return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } + /// Create an expression for instruction prefetch size computation: + /// min(divideCeil(CodeSizeBytes, CacheLineSize), (1 << FieldWidth) - 1) + /// FieldWidth and CacheLineSize are derived from the subtarget. 
+ static const AMDGPUMCExpr *createInstPrefSize(const MCExpr *CodeSizeBytes, + MCContext &Ctx); + static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); ArrayRef getArgs() const { return Args; } VariantKind getKind() const { return Kind; } + MCContext &getCtx() const { return Ctx; } const MCExpr *getSubExpr(size_t Index) const; void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 99255e4060886..27cef7a1b9158 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -215,9 +215,8 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } -uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF, - bool IsLowerBound) { - if (!IsLowerBound && CodeSizeInBytes.has_value()) +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (CodeSizeInBytes.has_value()) return *CodeSizeInBytes; const GCNSubtarget &STM = MF.getSubtarget(); @@ -226,12 +225,7 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF, uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { - // The amount of padding to align code can be both underestimated and - // overestimated. In case of inline asm used getInstSizeInBytes() will - // return a maximum size of a single instruction, where the real size may - // differ. At this point CodeSize may be already off. - if (!IsLowerBound) - CodeSize = alignTo(CodeSize, MBB.getAlignment()); + CodeSize = alignTo(CodeSize, MBB.getAlignment()); for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. @@ -239,11 +233,6 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF, if (MI.isMetaInstruction()) continue; - // We cannot properly estimate inline asm size. 
It can be as small as zero - // if that is just a comment. - if (IsLowerBound && MI.isInlineAsm()) - continue; - CodeSize += TII->getInstSizeInBytes(MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 947b473142a1f..fb56ebf88c96f 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -105,10 +105,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { void reset(const MachineFunction &MF); // Get function code size and cache the value. - // If \p IsLowerBound is set it returns a minimal code size which is safe - // to address. - uint64_t getFunctionCodeSize(const MachineFunction &MF, - bool IsLowerBound = false); + uint64_t getFunctionCodeSize(const MachineFunction &MF); /// Compute the value of the ComputePGMRsrc1 register. const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b13aed2432602..dd67e77d0d9ed 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1182,6 +1182,14 @@ std::string AMDGPUTargetID::toString() const { return Str; } +unsigned getInstCacheLineSize(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureInstCacheLineSize128)) + return 128; + if (STI->getFeatureBits().test(FeatureInstCacheLineSize64)) + return 64; + return 64; +} + unsigned getWavefrontSize(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureWavefrontSize16)) return 16; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 49373f09ee460..e1b36f0996331 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -233,6 +233,9 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } +/// \returns Instruction cache line size in bytes for given subtarget \p 
STI. +unsigned getInstCacheLineSize(const MCSubtargetInfo *STI); + /// \returns Wavefront size for given subtarget \p STI. unsigned getWavefrontSize(const MCSubtargetInfo *STI); diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll index 580167076e1f0..b76ef7eac11c4 100644 --- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll +++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll @@ -1,11 +1,31 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s +;; Verify that inst_pref_size resolves to the correct value in the object file. +;; COMPUTE_PGM_RSRC3 is at offset 0x2C in each 64-byte kernel descriptor. +;; inst_pref_size is bits [9:4] on GFX11 (6-bit) and bits [11:4] on GFX12+ (8-bit). +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx11.o +; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx12.o +; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s + +; The inst_pref_size is computed via MCExpr label subtraction, resolved at +; assembly/link time. 
In text output it appears as: +; ((instprefsize()<>Shift +; where: +; = .Lfunc_endN - func_sym (exact function code size in bytes) +; instprefsize = min(divideCeil(code_size, cache_line_size), (1 << field_width) - 1) +; field_width and cache_line_size are derived from the subtarget + ; GCN-LABEL: .amdhsa_kernel large -; GFX11: .amdhsa_inst_pref_size 3 -; GFX11: codeLenInByte = 3{{[0-9][0-9]$}} -; GFX12: .amdhsa_inst_pref_size 4 -; GFX12: codeLenInByte = 4{{[0-9][0-9]$}} +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&1008)>>4 +; GFX11: codeLenInByte = {{[0-9]+}} +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&4080)>>4 +; GFX12: codeLenInByte = {{[0-9]+}} +;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C: +;; gfx11 pref=3 (0x30), gfx12 pref=4 (0x40) +; OBJ-GFX11: 0020 {{.*}}30000000 +; OBJ-GFX12: 0020 {{.*}}40000000 define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) { bb: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false) @@ -13,18 +33,30 @@ bb: } ; GCN-LABEL: .amdhsa_kernel small -; GCN: .amdhsa_inst_pref_size 1 -; GCN: codeLenInByte = {{[0-9]$}} +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&1008)>>4 +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&4080)>>4 +; GCN: codeLenInByte = {{[0-9]+}} +;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C: +;; pref=1 (0x10) for both +; OBJ-GFX11: 0060 {{.*}}10000000 +; OBJ-GFX12: 0060 {{.*}}10000000 define amdgpu_kernel void @small() { bb: ret void } -; Ignore inline asm in size calculation +; Inline asm is accounted for via MCExpr label subtraction (exact code size). +; The MCExpr resolves to the correct inst_pref_size at assembly time. 
; GCN-LABEL: .amdhsa_kernel inline_asm -; GCN: .amdhsa_inst_pref_size 1 -; GCN: codeLenInByte = {{[0-9]$}} +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&1008)>>4 +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&4080)>>4 +; GCN: codeLenInByte = {{[0-9]+}} +;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC: +;; pref=9 (0x90) for both +;; (.fill 256, 4, 0 = 1024 bytes + 4 s_endpgm = 1028 -> divideCeil(1028,128) = 9) +; OBJ-GFX11: 00a0 {{.*}}90000000 +; OBJ-GFX12: 00a0 {{.*}}90000000 define amdgpu_kernel void @inline_asm() { bb: call void asm sideeffect ".fill 256, 4, 0", ""() diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll new file mode 100644 index 0000000000000..287a30032230b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll @@ -0,0 +1,154 @@ +;; Verify that inline assembly is correctly accounted for in the +;; inst_pref_size calculation. The inst_pref_size is computed via MCExpr +;; label subtraction (.Lfunc_end - func_sym), giving exact code size. +;; See inst-prefetch-hint.ll for explanation of the instprefsize expression. 
+ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s -o %t.gfx11.o +; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s -o %t.gfx12.o +; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s + +;; --- .fill directive: .fill 256, 4, 0 => 1024 bytes + 4 (s_endpgm) = 1028 --- +;; pref_size = divideCeil(1028, 128) = 9 + +; GFX11-LABEL: .amdhsa_kernel test_fill +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_fill +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C: +;; pref_size=9 -> 9<<4 = 0x90 +; OBJ-GFX11: 0020 {{.*}}90000000 +; OBJ-GFX12: 0020 {{.*}}90000000 + +define amdgpu_kernel void @test_fill() { + call void asm sideeffect ".fill 256, 4, 0", ""() + ret void +} + +;; --- .space directive: .space 1024 => 1024 bytes + 4 = 1028 --- +;; pref_size = 9 + +; GFX11-LABEL: .amdhsa_kernel test_space +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_space +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C: +;; pref_size=9 -> 9<<4 = 0x90 +; OBJ-GFX11: 0060 {{.*}}90000000 +; OBJ-GFX12: 0060 {{.*}}90000000 + +define amdgpu_kernel void @test_space() { + call void asm sideeffect ".space 1024", ""() + ret void +} + +;; --- Instructions: 32 x s_nop (4 bytes each) = 128 + 4 = 132 --- +;; pref_size = divideCeil(132, 128) = 2 + +; GFX11-LABEL: .amdhsa_kernel test_instructions +; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-test_instructions)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_instructions +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC: +;; pref_size=2 -> 2<<4 = 0x20 +; OBJ-GFX11: 00a0 {{.*}}20000000 +; OBJ-GFX12: 00a0 {{.*}}20000000 + +define amdgpu_kernel void @test_instructions() { + call void asm sideeffect "s_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0", ""() + ret void +} + +;; --- Comments emit no bytes: only s_endpgm = 4 bytes --- +;; pref_size = 1 + +; GFX11-LABEL: .amdhsa_kernel test_comments +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_comments +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&4080)>>4 +;; Object: kernel descriptor at 0xC0, COMPUTE_PGM_RSRC3 at 0xEC: +;; pref_size=1 -> 1<<4 = 0x10 +; OBJ-GFX11: 00e0 {{.*}}10000000 +; OBJ-GFX12: 00e0 {{.*}}10000000 + +define amdgpu_kernel void @test_comments() { + call void asm sideeffect "; comment 1\0A; comment 2\0A; comment 3", ""() + ret void +} + +;; --- Empty inline asm: only s_endpgm = 4 bytes --- +;; pref_size = 1 + +; GFX11-LABEL: .amdhsa_kernel test_empty_asm +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_empty_asm +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x100, COMPUTE_PGM_RSRC3 at 0x12C: +;; pref_size=1 -> 1<<4 = 0x10 +; OBJ-GFX11: 0120 {{.*}}10000000 +; OBJ-GFX12: 0120 {{.*}}10000000 + +define amdgpu_kernel void @test_empty_asm() { + 
call void asm sideeffect "", ""() + ret void +} + +;; --- Multiple inline asm blocks: .fill (512) + .space (512) + s_endpgm (4) = 1028 --- +;; pref_size = divideCeil(1028, 128) = 9 + +; GFX11-LABEL: .amdhsa_kernel test_multiple_asm +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_multiple_asm +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x140, COMPUTE_PGM_RSRC3 at 0x16C: +;; pref_size=9 -> 9<<4 = 0x90 +; OBJ-GFX11: 0160 {{.*}}90000000 +; OBJ-GFX12: 0160 {{.*}}90000000 + +define amdgpu_kernel void @test_multiple_asm() { + call void asm sideeffect ".fill 128, 4, 0", ""() + call void asm sideeffect ".space 512", ""() + ret void +} + +;; --- Large function that exceeds GFX11 6-bit field max (63) --- +;; .fill 2048, 4, 0 = 8192 bytes + 4 = 8196 bytes +;; divideCeil(8196, 128) = 65, but GFX11 max = (1<<6)-1 = 63 +;; pref_size should clamp to 63 + +; GFX11-LABEL: .amdhsa_kernel test_clamping +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel test_clamping +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x180, COMPUTE_PGM_RSRC3 at 0x1AC: +;; gfx11: clamped to 63 -> 63<<4 = 0x3F0 +;; gfx12: no clamping, 65 -> 65<<4 = 0x410 +; OBJ-GFX11: 01a0 {{.*}}f0030000 +; OBJ-GFX12: 01a0 {{.*}}10040000 + +define amdgpu_kernel void @test_clamping() { + call void asm sideeffect ".fill 2048, 4, 0", ""() + ret void +} + +;; --- Large function that exceeds both GFX11 and GFX12 field max --- +;; .fill 8192, 4, 0 = 32768 bytes + 4 = 32772 bytes +;; divideCeil(32772, 128) = 257 +;; GFX11 max = 63, GFX12 max = 255 -> both clamp + +; GFX11-LABEL: .amdhsa_kernel test_clamping_both +; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&1008)>>4 +; GFX12-LABEL: .amdhsa_kernel 
test_clamping_both +; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&4080)>>4 +;; Object: kernel descriptor at 0x1C0, COMPUTE_PGM_RSRC3 at 0x1EC: +;; gfx11: clamped to 63 -> 63<<4 = 0x3F0 +;; gfx12: clamped to 255 -> 255<<4 = 0xFF0 +; OBJ-GFX11: 01e0 {{.*}}f0030000 +; OBJ-GFX12: 01e0 {{.*}}f00f0000 + +define amdgpu_kernel void @test_clamping_both() { + call void asm sideeffect ".fill 8192, 4, 0", ""() + ret void +} From 8ac875471bc30dd3ecb7b74a4646468718cb835e Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 16:19:15 -0400 Subject: [PATCH 347/538] [libc] Remove global printf_core StorageType declarations in float_inf_nan_converter.h (#196859) fixed_converter.h and float_hex_converter.h have local declarations with the same name shadowing it, causing -Wshadow warnings. The using declaration is used in only one function, so just make it local. --- libc/src/stdio/printf_core/float_inf_nan_converter.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libc/src/stdio/printf_core/float_inf_nan_converter.h b/libc/src/stdio/printf_core/float_inf_nan_converter.h index 379b7c6d3dcd6..973199d55094b 100644 --- a/libc/src/stdio/printf_core/float_inf_nan_converter.h +++ b/libc/src/stdio/printf_core/float_inf_nan_converter.h @@ -22,15 +22,15 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { +template +LIBC_INLINE int convert_inf_nan(Writer *writer, + const FormatSection &to_conv) { #ifdef LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE -using StorageType = UInt128; + using StorageType = UInt128; #else -using StorageType = fputil::FPBits::StorageType; + using StorageType = fputil::FPBits::StorageType; #endif // LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE -template -LIBC_INLINE int convert_inf_nan(Writer *writer, - const FormatSection &to_conv) { // All of the letters will be defined relative to variable a, which will be // the appropriate case based on the case of the conversion. 
bool is_negative; From 916445b32092912cd6900d8065c0accd908fe5be Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 11 May 2026 16:22:02 -0400 Subject: [PATCH 348/538] [MLIR][MemRef] Extend narrow-type emulation for dynamic offsets (#196945) This patch adds three related extensions to the MemRef narrow-type emulation patterns. * `ConvertMemRefSubview` now accepts a dynamic innermost offset. * `ConvertMemRefReinterpretCast` is generalized from the previous static-rank-1, static-offset shape to accept any rank and dynamic offsets, with the same alignment contract as the subview pattern. * A new `ConvertMemRefCast` pattern handles `memref.cast` between equivalent narrow-typed memref types so that emulation does not get blocked by trivial casts. --- .../Dialect/MemRef/Transforms/Transforms.h | 8 +- .../mlir/Dialect/MemRef/Utils/MemRefUtils.h | 27 ++- .../MemRef/Transforms/EmulateNarrowType.cpp | 198 +++++++++++++----- mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 14 +- ...emulate-narrow-type-no-assume-aligned.mlir | 23 ++ .../Dialect/MemRef/emulate-narrow-type.mlir | 104 ++++++++- .../Dialect/MemRef/TestEmulateNarrowType.cpp | 4 +- 7 files changed, 314 insertions(+), 64 deletions(-) create mode 100644 mlir/test/Dialect/MemRef/emulate-narrow-type-no-assume-aligned.mlir diff --git a/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h b/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h index 62745f8fa1dfa..4d6c54d74d2a9 100644 --- a/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/MemRef/Transforms/Transforms.h @@ -89,9 +89,15 @@ void populateMemRefWideIntEmulationConversions( /// over wider types. /// When `disableAtomicRMW` is true, the store patterns generate non-atomic /// read-modify-write sequences instead of atomic operations. 
+/// When `assumeAligned` is true, `memref.subview` and +/// `memref.reinterpret_cast` patterns accept dynamic offsets under the +/// alignment contract that the caller guarantees those offsets are a multiple +/// of `dstBits / srcBits`. When false (the default), dynamic offsets are +/// rejected to preserve soundness for callers that cannot prove divisibility. void populateMemRefNarrowTypeEmulationPatterns( const arith::NarrowTypeEmulationConverter &typeConverter, - RewritePatternSet &patterns, bool disableAtomicRMW = false); + RewritePatternSet &patterns, bool disableAtomicRMW = false, + bool assumeAligned = false); /// Appends type conversions for emulating memref operations over narrow types /// with ops over wider types. diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h index 9af0f301d763c..f58b776138def 100644 --- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h +++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h @@ -31,6 +31,18 @@ namespace memref { /// contiguous chunk of memory. bool isStaticShapeAndContiguousRowMajor(MemRefType type); +/// Controls how the per-dimension contribution to `linearizedSize` is divided +/// by `dstBits / srcBits` when scaling down to the emulated type. The offset +/// and intra-data offset are unaffected; they always use floor division and +/// remainder respectively. +/// - `Floor`: round each `stride * size / scaler` down. Suitable for indexing +/// computations where a partial trailing byte is not included. +/// - `Ceil`: round up, matching the result-shape size used by narrow-type +/// memref type conversion (see `getLinearizedShape`). Use this when the +/// caller needs the linearized size to cover all source elements, e.g. when +/// building the size attribute of a converted `memref.reinterpret_cast`. 
+enum class LinearizedDivKind { Floor, Ceil }; + /// For a `memref` with `offset`, `sizes` and `strides`, returns the /// offset, size, and potentially the size padded at the front to use for the /// linearized `memref`. @@ -47,6 +59,8 @@ bool isStaticShapeAndContiguousRowMajor(MemRefType type); /// load/store, the memory region emulated is larger than the actual memory /// region needed. `intraDataOffset` returns the element offset of the data /// relevant at the beginning. +/// - `sizeDivKind` selects floor vs ceil rounding for the `linearizedSize` +/// contribution from each dimension (see `LinearizedDivKind`). struct LinearizedMemRefInfo { OpFoldResult linearizedOffset; OpFoldResult linearizedSize; @@ -55,7 +69,8 @@ struct LinearizedMemRefInfo { std::pair getLinearizedMemRefOffsetAndSize( OpBuilder &builder, Location loc, int srcBits, int dstBits, OpFoldResult offset, ArrayRef sizes, - ArrayRef strides, ArrayRef indices = {}); + ArrayRef strides, ArrayRef indices = {}, + LinearizedDivKind sizeDivKind = LinearizedDivKind::Floor); /// For a `memref` with `offset` and `sizes`, returns the /// offset and size to use for the linearized `memref`, assuming that @@ -64,10 +79,12 @@ std::pair getLinearizedMemRefOffsetAndSize( /// element type with bitwidth `srcBits` using element type with /// bitwidth `dstBits`, the linearized offset and size are /// scaled down by `dstBits`/`srcBits`. -LinearizedMemRefInfo -getLinearizedMemRefOffsetAndSize(OpBuilder &builder, Location loc, int srcBits, - int dstBits, OpFoldResult offset, - ArrayRef sizes); +/// - `sizeDivKind` selects floor vs ceil rounding for the `linearizedSize` +/// contribution from each dimension (see `LinearizedDivKind`). +LinearizedMemRefInfo getLinearizedMemRefOffsetAndSize( + OpBuilder &builder, Location loc, int srcBits, int dstBits, + OpFoldResult offset, ArrayRef sizes, + LinearizedDivKind sizeDivKind = LinearizedDivKind::Floor); /// Track temporary allocations that are never read from. 
If this is the case /// it means both the allocations and associated stores can be removed. diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp index 8686f22c9e3c2..a11e14faa5475 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp @@ -32,15 +32,24 @@ using namespace mlir; //===----------------------------------------------------------------------===// /// Converts a memref::ReinterpretCastOp to the converted type. The result -/// MemRefType of the old op must have a rank and stride of 1, with static -/// offset and size. The number of bits in the offset must evenly divide the -/// bitwidth of the new converted type. +/// memref is linearized to a rank-1 byte view (or rank-0 if the source is +/// rank-0). When `assumeAligned` is true, dynamic offsets are accepted under +/// the alignment contract that the caller guarantees the offset is a multiple +/// of `dstBits / srcBits`; statically-provable misalignment is rejected. +/// When `assumeAligned` is false, dynamic offsets are rejected outright since +/// divisibility cannot be proven from the IR alone. 
static LogicalResult convertCastingOp(ConversionPatternRewriter &rewriter, memref::ReinterpretCastOp::Adaptor adaptor, - memref::ReinterpretCastOp op, MemRefType newTy) { - auto convertedElementType = newTy.getElementType(); - auto oldElementType = op.getType().getElementType(); + memref::ReinterpretCastOp op, MemRefType newTy, + bool assumeAligned) { + if (newTy == op.getType()) { + return rewriter.notifyMatchFailure( + op, "result type was not converted by narrow-type emulation"); + } + + Type convertedElementType = newTy.getElementType(); + Type oldElementType = op.getType().getElementType(); int srcBits = oldElementType.getIntOrFloatBitWidth(); int dstBits = convertedElementType.getIntOrFloatBitWidth(); if (dstBits % srcBits != 0) { @@ -48,35 +57,70 @@ convertCastingOp(ConversionPatternRewriter &rewriter, "only dstBits % srcBits == 0 supported"); } - // Only support stride of 1. - if (llvm::any_of(op.getStaticStrides(), - [](int64_t stride) { return stride != 1; })) { - return rewriter.notifyMatchFailure(op->getLoc(), - "stride != 1 is not supported"); + ArrayRef staticStrides = op.getStaticStrides(); + if (!staticStrides.empty() && staticStrides.back() != 1) { + return rewriter.notifyMatchFailure( + op->getLoc(), "innermost stride != 1 is not supported"); + } + + // TODO: support dynamic sizes. Requires a divisibility analysis or a + // stronger alignment contract; tracked as follow-up work. + if (llvm::is_contained(op.getStaticSizes(), ShapedType::kDynamic)) { + return rewriter.notifyMatchFailure(op, "dynamic sizes are not supported"); } - auto sizes = op.getStaticSizes(); - int64_t offset = op.getStaticOffset(0); - // Only support static sizes and offsets. 
- if (llvm::is_contained(sizes, ShapedType::kDynamic) || - offset == ShapedType::kDynamic) { + if (!memref::isStaticShapeAndContiguousRowMajor(op.getType())) { return rewriter.notifyMatchFailure( - op, "dynamic size or offset is not supported"); + op, "result memref is not row-major contiguous"); } - int elementsPerByte = dstBits / srcBits; - if (offset % elementsPerByte != 0) { + // Reject dynamic offsets unless the caller has opted into the alignment + // contract via `assumeAligned`. Without it we cannot prove the offset is a + // multiple of `dstBits / srcBits`. + if (!assumeAligned && + llvm::is_contained(op.getStaticOffsets(), ShapedType::kDynamic)) { return rewriter.notifyMatchFailure( - op, "offset not multiple of elementsPerByte is not supported"); + op, "dynamic offsets require assumeAligned=true to ensure the offset " + "is a multiple of dstBits / srcBits"); } - SmallVector size; - if (!sizes.empty()) - size.push_back(llvm::divideCeilSigned(sizes[0], elementsPerByte)); - offset = offset / elementsPerByte; + Location loc = op.getLoc(); + SmallVector mixedSizes = op.getMixedSizes(); + OpFoldResult origOffset = op.getMixedOffsets()[0]; + + SmallVector newSizes; + SmallVector newStrides; + OpFoldResult newOffset; + OpFoldResult intraOffset; + if (mixedSizes.empty()) { + int64_t elementsPerByte = dstBits / srcBits; + AffineExpr s0; + bindSymbols(rewriter.getContext(), s0); + newOffset = affine::makeComposedFoldedAffineApply( + rewriter, loc, s0.floorDiv(elementsPerByte), {origOffset}); + intraOffset = affine::makeComposedFoldedAffineApply( + rewriter, loc, s0 % elementsPerByte, {origOffset}); + } else { + // Use ceil division so the produced linearized size matches the converted + // result memref shape (see `getLinearizedShape` in the type converter), + // which also rounds up to fit all source elements. 
+ memref::LinearizedMemRefInfo info = + memref::getLinearizedMemRefOffsetAndSize( + rewriter, loc, srcBits, dstBits, origOffset, mixedSizes, + memref::LinearizedDivKind::Ceil); + newOffset = info.linearizedOffset; + intraOffset = info.intraDataOffset; + newSizes.push_back(info.linearizedSize); + newStrides.push_back(rewriter.getIndexAttr(1)); + } + + if (auto cst = getConstantIntValue(intraOffset); cst && *cst != 0) { + return rewriter.notifyMatchFailure( + op, "offset is provably not a multiple of dstBits / srcBits"); + } rewriter.replaceOpWithNewOp( - op, newTy, adaptor.getSource(), offset, size, op.getStaticStrides()); + op, newTy, adaptor.getSource(), newOffset, newSizes, newStrides); return success(); } @@ -349,6 +393,32 @@ struct ConvertMemRefLoad final : OpConversionPattern { } }; +//===----------------------------------------------------------------------===// +// ConvertMemRefCast +//===----------------------------------------------------------------------===// + +/// `memref.cast` between two narrow-typed memrefs forwards through the type +/// converter to a cast between the converted byte-typed memrefs. 
+struct ConvertMemRefCast final : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(memref::CastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type newTy = getTypeConverter()->convertType(op.getType()); + if (!newTy) { + return rewriter.notifyMatchFailure( + op->getLoc(), + llvm::formatv("failed to convert memref type: {0}", op.getType())); + } + if (newTy == op.getType()) + return failure(); + + rewriter.replaceOpWithNewOp(op, newTy, adaptor.getSource()); + return success(); + } +}; + //===----------------------------------------------------------------------===// // ConvertMemRefMemorySpaceCast //===----------------------------------------------------------------------===// @@ -377,11 +447,15 @@ struct ConvertMemRefMemorySpaceCast final // ConvertMemRefReinterpretCast //===----------------------------------------------------------------------===// -/// Output types should be at most one dimensional, so only the 0 or 1 -/// dimensional cases are supported. +/// Forwards to `convertCastingOp`, which enforces all preconditions. +/// `assumeAligned` is propagated from the populate entry point and controls +/// acceptance of dynamic offsets. struct ConvertMemRefReinterpretCast final : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + ConvertMemRefReinterpretCast(const TypeConverter &typeConverter, + MLIRContext *context, bool assumeAligned) + : OpConversionPattern(typeConverter, context), + assumeAligned(assumeAligned) {} LogicalResult matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor, @@ -394,14 +468,11 @@ struct ConvertMemRefReinterpretCast final llvm::formatv("failed to convert memref type: {0}", op.getType())); } - // Only support for 0 or 1 dimensional cases. 
- if (op.getType().getRank() > 1) { - return rewriter.notifyMatchFailure( - op->getLoc(), "subview with rank > 1 is not supported"); - } - - return convertCastingOp(rewriter, adaptor, op, newTy); + return convertCastingOp(rewriter, adaptor, op, newTy, assumeAligned); } + +private: + bool assumeAligned; }; //===----------------------------------------------------------------------===// @@ -503,11 +574,17 @@ struct ConvertMemrefStore final : OpConversionPattern { //===----------------------------------------------------------------------===// /// Emulating narrow ints on subview have limited support, supporting only -/// static offset and size and stride of 1. Ideally, the subview should be +/// static sizes and stride of 1. When `assumeAligned` is true, dynamic +/// offsets are accepted under the alignment contract that the caller +/// guarantees the offset is a multiple of `dstBits / srcBits`. Without that +/// opt-in, dynamic offsets are rejected. Ideally, the subview should be /// folded away before running narrow type emulation, and this pattern should /// only run for cases that can't be folded. struct ConvertMemRefSubview final : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + ConvertMemRefSubview(const TypeConverter &typeConverter, MLIRContext *context, + bool assumeAligned) + : OpConversionPattern(typeConverter, context), + assumeAligned(assumeAligned) {} LogicalResult matchAndRewrite(memref::SubViewOp subViewOp, OpAdaptor adaptor, @@ -543,12 +620,21 @@ struct ConvertMemRefSubview final : OpConversionPattern { } auto sizes = subViewOp.getStaticSizes(); - int64_t lastOffset = subViewOp.getStaticOffsets().back(); - // Only support static sizes and offsets. - if (llvm::is_contained(sizes, ShapedType::kDynamic) || - lastOffset == ShapedType::kDynamic) { + // TODO: support dynamic sizes. Requires a divisibility analysis or a + // stronger alignment contract; tracked as follow-up work. 
+ if (llvm::is_contained(sizes, ShapedType::kDynamic)) { + return rewriter.notifyMatchFailure(subViewOp->getLoc(), + "dynamic size is not supported"); + } + + // Reject dynamic offsets unless the caller has opted into the alignment + // contract via `assumeAligned`. + if (!assumeAligned && llvm::is_contained(subViewOp.getStaticOffsets(), + ShapedType::kDynamic)) { return rewriter.notifyMatchFailure( - subViewOp->getLoc(), "dynamic size or offset is not supported"); + subViewOp, + "dynamic offsets require assumeAligned=true to ensure the offset " + "is a multiple of dstBits / srcBits"); } // Transform the offsets, sizes and strides according to the emulation. @@ -566,11 +652,21 @@ struct ConvertMemRefSubview final : OpConversionPattern { getMixedValues(adaptor.getStaticOffsets(), adaptor.getOffsets(), rewriter)); + if (auto cst = getConstantIntValue(linearizedInfo.intraDataOffset); + cst && *cst != 0) { + return rewriter.notifyMatchFailure( + subViewOp, + "subview offset is provably not a multiple of dstBits / srcBits"); + } + rewriter.replaceOpWithNewOp( subViewOp, newTy, adaptor.getSource(), linearizedIndices, linearizedInfo.linearizedSize, strides.back()); return success(); } + +private: + bool assumeAligned; }; //===----------------------------------------------------------------------===// @@ -630,16 +726,18 @@ struct ConvertMemRefExpandShape final void memref::populateMemRefNarrowTypeEmulationPatterns( const arith::NarrowTypeEmulationConverter &typeConverter, - RewritePatternSet &patterns, bool disableAtomicRMW) { + RewritePatternSet &patterns, bool disableAtomicRMW, bool assumeAligned) { // Populate `memref.*` conversion patterns. 
- patterns.add, - ConvertMemRefAllocation, ConvertMemRefCopy, - ConvertMemRefDealloc, ConvertMemRefCollapseShape, - ConvertMemRefExpandShape, ConvertMemRefLoad, - ConvertMemRefAssumeAlignment, ConvertMemRefMemorySpaceCast, - ConvertMemRefSubview, ConvertMemRefReinterpretCast>( - typeConverter, patterns.getContext()); + patterns + .add, + ConvertMemRefAllocation, ConvertMemRefCast, + ConvertMemRefCopy, ConvertMemRefDealloc, ConvertMemRefCollapseShape, + ConvertMemRefExpandShape, ConvertMemRefLoad, + ConvertMemRefAssumeAlignment, ConvertMemRefMemorySpaceCast>( + typeConverter, patterns.getContext()); + patterns.add( + typeConverter, patterns.getContext(), assumeAligned); patterns.insert(typeConverter, patterns.getContext(), disableAtomicRMW); memref::populateResolveExtractStridedMetadataPatterns(patterns); diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp index cf126cd85ddce..0899b1a9faeb4 100644 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp @@ -51,7 +51,8 @@ bool isStaticShapeAndContiguousRowMajor(MemRefType type) { std::pair getLinearizedMemRefOffsetAndSize( OpBuilder &builder, Location loc, int srcBits, int dstBits, OpFoldResult offset, ArrayRef sizes, - ArrayRef strides, ArrayRef indices) { + ArrayRef strides, ArrayRef indices, + LinearizedDivKind sizeDivKind) { unsigned sourceRank = sizes.size(); assert(sizes.size() == strides.size() && "expected as many sizes as strides for a memref"); @@ -88,7 +89,10 @@ std::pair getLinearizedMemRefOffsetAndSize( AffineExpr sizeExpr = symbols[symbolIndex++]; values.push_back(sizes[i]); - productExpressions.push_back((strideExpr * sizeExpr).floorDiv(scaler)); + AffineExpr product = strideExpr * sizeExpr; + productExpressions.push_back(sizeDivKind == LinearizedDivKind::Ceil + ? 
product.ceilDiv(scaler) + : product.floorDiv(scaler)); } AffineMap maxMap = AffineMap::get( /*dimCount=*/0, /*symbolCount=*/symbolIndex, productExpressions, @@ -112,7 +116,8 @@ std::pair getLinearizedMemRefOffsetAndSize( LinearizedMemRefInfo getLinearizedMemRefOffsetAndSize(OpBuilder &builder, Location loc, int srcBits, int dstBits, OpFoldResult offset, - ArrayRef sizes) { + ArrayRef sizes, + LinearizedDivKind sizeDivKind) { SmallVector strides(sizes.size()); if (!sizes.empty()) { strides.back() = builder.getIndexAttr(1); @@ -128,7 +133,8 @@ getLinearizedMemRefOffsetAndSize(OpBuilder &builder, Location loc, int srcBits, LinearizedMemRefInfo linearizedMemRefInfo; std::tie(linearizedMemRefInfo, std::ignore) = getLinearizedMemRefOffsetAndSize(builder, loc, srcBits, dstBits, offset, - sizes, strides); + sizes, strides, /*indices=*/{}, + sizeDivKind); return linearizedMemRefInfo; } diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type-no-assume-aligned.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type-no-assume-aligned.mlir new file mode 100644 index 0000000000000..3625f91cedbd6 --- /dev/null +++ b/mlir/test/Dialect/MemRef/emulate-narrow-type-no-assume-aligned.mlir @@ -0,0 +1,23 @@ +// RUN: mlir-opt --test-emulate-narrow-int="memref-load-bitwidth=8" --cse --verify-diagnostics --split-input-file %s + +// Without `assume-aligned=true`, dynamic offsets in `memref.subview` and +// `memref.reinterpret_cast` cannot be proven to be multiples of +// `dstBits / srcBits`. The patterns must reject them so partial conversion +// fails to legalize the op. 
+ +func.func @negative_subview_dynamic_inner_offset_i4(%off: index) -> i4 { + %c0 = arith.constant 0 : index + %arr = memref.alloc() : memref<128xi4> + // expected-error @+1 {{failed to legalize operation 'memref.subview' that was explicitly marked illegal}} + %subview = memref.subview %arr[%off] [32] [1] : memref<128xi4> to memref<32xi4, strided<[1], offset: ?>> + %ld = memref.load %subview[%c0] : memref<32xi4, strided<[1], offset: ?>> + return %ld : i4 +} + +// ----- + +func.func @negative_reinterpret_cast_memref_rank3_dynamic_offset_i4(%arg0: memref<2x4x8xi4>, %off: index) -> memref<4x4x8xi4, strided<[32, 8, 1], offset: ?>> { + // expected-error @+1 {{failed to legalize operation 'memref.reinterpret_cast' that was explicitly marked illegal}} + %r = memref.reinterpret_cast %arg0 to offset: [%off], sizes: [4, 4, 8], strides: [32, 8, 1] : memref<2x4x8xi4> to memref<4x4x8xi4, strided<[32, 8, 1], offset: ?>> + return %r : memref<4x4x8xi4, strided<[32, 8, 1], offset: ?>> +} diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir index dd64ecc98721a..adc8fe3b36096 100644 --- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir +++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt --test-emulate-narrow-int="memref-load-bitwidth=8" --cse --verify-diagnostics --split-input-file %s | FileCheck %s -// RUN: mlir-opt --test-emulate-narrow-int="memref-load-bitwidth=32" --cse --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=CHECK32 +// RUN: mlir-opt --test-emulate-narrow-int="memref-load-bitwidth=8 assume-aligned=true" --cse --verify-diagnostics --split-input-file %s | FileCheck %s +// RUN: mlir-opt --test-emulate-narrow-int="memref-load-bitwidth=32 assume-aligned=true" --cse --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=CHECK32 // Expect no conversions. 
func.func @memref_i8() -> i8 { @@ -238,6 +238,51 @@ func.func @memref_subview_dynamic_offset_i4(%idx : index) -> i4 { // ----- +func.func @memref_subview_dynamic_inner_offset_i4(%off: index) -> i4 { + %c0 = arith.constant 0 : index + %arr = memref.alloc() : memref<128xi4> + %subview = memref.subview %arr[%off] [32] [1] : memref<128xi4> to memref<32xi4, strided<[1], offset: ?>> + %ld = memref.load %subview[%c0] : memref<32xi4, strided<[1], offset: ?>> + return %ld : i4 +} + +// CHECK-LABEL: func.func @memref_subview_dynamic_inner_offset_i4( +// CHECK-SAME: %[[OFF:[a-zA-Z0-9_]+]]: index +// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<64xi8> +// CHECK: %[[IDX:.+]] = affine.apply {{.*}}%[[OFF]] +// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][%[[IDX]]] [16] [1] : memref<64xi8> to memref<16xi8, strided<[1], offset: ?>> +// CHECK: memref.load %[[SUBVIEW]] + +// CHECK32-LABEL: func.func @memref_subview_dynamic_inner_offset_i4( +// CHECK32-SAME: %[[OFF:[a-zA-Z0-9_]+]]: index +// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<16xi32> +// CHECK32: %[[IDX:.+]] = affine.apply {{.*}}%[[OFF]] +// CHECK32: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][%[[IDX]]] [4] [1] : memref<16xi32> to memref<4xi32, strided<[1], offset: ?>> +// CHECK32: memref.load %[[SUBVIEW]] + +// ----- + +// Dynamic innermost offset that is provably aligned (multiple of +// `dstBits / srcBits`). The affine simplifier folds the `floordiv` away. 
+ +func.func @memref_subview_aligned_dynamic_inner_offset_i4(%x: index) -> i4 { + %c0 = arith.constant 0 : index + %off = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%x] + %arr = memref.alloc() : memref<128xi4> + %subview = memref.subview %arr[%off] [32] [1] : memref<128xi4> to memref<32xi4, strided<[1], offset: ?>> + %ld = memref.load %subview[%c0] : memref<32xi4, strided<[1], offset: ?>> + return %ld : i4 +} + +// CHECK-LABEL: func.func @memref_subview_aligned_dynamic_inner_offset_i4( +// CHECK-SAME: %[[X:[a-zA-Z0-9_]+]]: index +// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<64xi8> +// CHECK-NOT: affine.apply +// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][%[[X]]] [16] [1] : memref<64xi8> to memref<16xi8, strided<[1], offset: ?>> +// CHECK: memref.load %[[SUBVIEW]] + +// ----- + func.func @negative_memref_subview_non_contiguous(%idx : index) -> i4 { %c0 = arith.constant 0 : index %arr = memref.alloc() : memref<40x40xi4> @@ -249,6 +294,61 @@ func.func @negative_memref_subview_non_contiguous(%idx : index) -> i4 { // ----- +// Rank-3 reinterpret_cast on a sub-byte (i4) memref with a static, aligned +// offset. 
+ +func.func @reinterpret_cast_memref_rank3_static_offset_i4(%arg0: memref<2x4x8xi4>) -> memref<4x4x8xi4, strided<[32, 8, 1]>> { + %r = memref.reinterpret_cast %arg0 to offset: [0], sizes: [4, 4, 8], strides: [32, 8, 1] : memref<2x4x8xi4> to memref<4x4x8xi4, strided<[32, 8, 1]>> + return %r : memref<4x4x8xi4, strided<[32, 8, 1]>> +} + +// CHECK-LABEL: func @reinterpret_cast_memref_rank3_static_offset_i4( +// CHECK-SAME: %[[ARG0:.+]]: memref<32xi8> +// CHECK: %[[R:.+]] = memref.reinterpret_cast %[[ARG0]] to offset: [0], sizes: [64], strides: [1] : memref<32xi8> to memref<64xi8> +// CHECK: return %[[R]] + +// CHECK32-LABEL: func @reinterpret_cast_memref_rank3_static_offset_i4( +// CHECK32-SAME: %[[ARG0:.+]]: memref<8xi32> +// CHECK32: %[[R:.+]] = memref.reinterpret_cast %[[ARG0]] to offset: [0], sizes: [16], strides: [1] : memref<8xi32> to memref<16xi32> +// CHECK32: return %[[R]] + +// ----- + +// Rank-3 reinterpret_cast with a dynamic offset accepted under the alignment +// contract. + +func.func @reinterpret_cast_memref_rank3_dynamic_offset_i4(%arg0: memref<2x4x8xi4>, %off: index) -> memref<4x4x8xi4, strided<[32, 8, 1], offset: ?>> { + %r = memref.reinterpret_cast %arg0 to offset: [%off], sizes: [4, 4, 8], strides: [32, 8, 1] : memref<2x4x8xi4> to memref<4x4x8xi4, strided<[32, 8, 1], offset: ?>> + return %r : memref<4x4x8xi4, strided<[32, 8, 1], offset: ?>> +} + +// CHECK-LABEL: func @reinterpret_cast_memref_rank3_dynamic_offset_i4( +// CHECK-SAME: %[[ARG0:.+]]: memref<32xi8>, +// CHECK-SAME: %[[OFF:.+]]: index +// CHECK: %[[NEWOFF:.+]] = affine.apply {{.*}}%[[OFF]] +// CHECK: %[[R:.+]] = memref.reinterpret_cast %[[ARG0]] to offset: {{\[}}%[[NEWOFF]]{{\]}}, sizes: [64], strides: [1] : memref<32xi8> to memref<64xi8, strided<[1], offset: ?>> +// CHECK: return %[[R]] + +// CHECK32-LABEL: func @reinterpret_cast_memref_rank3_dynamic_offset_i4( +// CHECK32-SAME: %[[ARG0:.+]]: memref<8xi32>, +// CHECK32-SAME: %[[OFF:.+]]: index +// CHECK32: %[[NEWOFF:.+]] = affine.apply 
{{.*}}%[[OFF]] +// CHECK32: %[[R:.+]] = memref.reinterpret_cast %[[ARG0]] to offset: {{\[}}%[[NEWOFF]]{{\]}}, sizes: [16], strides: [1] : memref<8xi32> to memref<16xi32, strided<[1], offset: ?>> +// CHECK32: return %[[R]] + +// ----- + +// Provably-misaligned static offset (1 is not a multiple of i4 -> i8 ratio +// of 2). Lowering must fail. + +func.func @negative_reinterpret_cast_memref_misaligned_static_offset_i4(%arg0: memref<2x4x8xi4>) -> memref<4x4x8xi4, strided<[32, 8, 1], offset: 1>> { + // expected-error @+1 {{failed to legalize operation 'memref.reinterpret_cast' that was explicitly marked illegal}} + %r = memref.reinterpret_cast %arg0 to offset: [1], sizes: [4, 4, 8], strides: [32, 8, 1] : memref<2x4x8xi4> to memref<4x4x8xi4, strided<[32, 8, 1], offset: 1>> + return %r : memref<4x4x8xi4, strided<[32, 8, 1], offset: 1>> +} + +// ----- + func.func @reinterpret_cast_memref_load_0D() -> i4 { %0 = memref.alloc() : memref<5xi4> %reinterpret_cast_0 = memref.reinterpret_cast %0 to offset: [0], sizes: [], strides: [] : memref<5xi4> to memref diff --git a/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp b/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp index 9313a0945d86b..bec83a8dcbef9 100644 --- a/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp +++ b/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp @@ -99,8 +99,8 @@ struct TestEmulateNarrowTypePass RewritePatternSet patterns(ctx); arith::populateArithNarrowTypeEmulationPatterns(typeConverter, patterns); - memref::populateMemRefNarrowTypeEmulationPatterns(typeConverter, patterns, - disableAtomicRMW); + memref::populateMemRefNarrowTypeEmulationPatterns( + typeConverter, patterns, disableAtomicRMW, assumeAligned); vector::populateVectorNarrowTypeEmulationPatterns( typeConverter, patterns, disableAtomicRMW, assumeAligned); From 23e647e6437d790d43200a3e4ea515fd7a280d90 Mon Sep 17 00:00:00 2001 From: YongKang Zhu Date: Mon, 11 May 2026 13:32:27 -0700 Subject: [PATCH 349/538] [BOLT] Fix EH data 
encoding checks in relocateEHFrameSection (#196777) Previously committed in 7ab26d7c3a16 (#195691) and later reverted in bc654b438ffe (#196672) due to failures extended bolt-tests. The problem was that the mask should be `0x70` instead of `0xf0`, so to allow `DW_EH_PE_indirect` to pass through. The `DW_EH_PE_*rel` constants are not defined as values that each have only one distinctive bit set, so we rewrote the conditions to check encoding scheme explicitly. --- bolt/lib/Rewrite/RewriteInstance.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 43d4421e06928..b1fa65390c5e7 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2162,13 +2162,11 @@ void RewriteInstance::relocateEHFrameSection() { return; // Only fix references that are relative to other locations. - if (!(DwarfType & dwarf::DW_EH_PE_pcrel) && - !(DwarfType & dwarf::DW_EH_PE_textrel) && - !(DwarfType & dwarf::DW_EH_PE_funcrel) && - !(DwarfType & dwarf::DW_EH_PE_datarel)) - return; - - if (!(DwarfType & dwarf::DW_EH_PE_sdata4)) + const uint64_t Mask = 0xf0 & ~dwarf::DW_EH_PE_indirect; + if ((DwarfType & Mask) != dwarf::DW_EH_PE_pcrel && + (DwarfType & Mask) != dwarf::DW_EH_PE_textrel && + (DwarfType & Mask) != dwarf::DW_EH_PE_funcrel && + (DwarfType & Mask) != dwarf::DW_EH_PE_datarel) return; uint32_t RelType; From 541a8c51eb58beaa822430d249aa049f0c955059 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Mon, 11 May 2026 13:39:10 -0700 Subject: [PATCH 350/538] [lldb] Add completion support for direct ivars (#195187) Fixes the current shortcoming where `v m_na` will not complete the member `m_name` on `this`. This implements tab completion to complement direct ivar access support in `frame variable`. 
Assisted-by: claude --------- Co-authored-by: Jonas Devlieghere --- lldb/source/Symbol/Variable.cpp | 37 +++++++++++++++++++ .../completion/TestCompletion.py | 8 ++++ 2 files changed, 45 insertions(+) diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index eb4d740fa7efa..3f280d9cec2c6 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -21,6 +21,7 @@ #include "lldb/Symbol/TypeSystem.h" #include "lldb/Symbol/VariableList.h" #include "lldb/Target/ABI.h" +#include "lldb/Target/Language.h" #include "lldb/Target/Process.h" #include "lldb/Target/RegisterContext.h" #include "lldb/Target/StackFrame.h" @@ -475,6 +476,28 @@ static void PrivateAutoComplete( &prefix_path, // Anything that has been resolved already will be in here const CompilerType &compiler_type, CompletionRequest &request); +/// Get the CompilerType of the current instance (this/self) for direct ivar +/// completion. Returns an invalid CompilerType if the frame is not for an +/// instance method. 
+static CompilerType GetInstanceType(StackFrame &frame, + VariableList &variable_list) { + SymbolContext sc = + frame.GetSymbolContext(eSymbolContextFunction | eSymbolContextBlock); + llvm::StringRef instance_name = sc.GetInstanceName(); + if (instance_name.empty()) + return {}; + VariableSP var_sp = variable_list.FindVariable(ConstString(instance_name)); + if (!var_sp) + return {}; + Type *var_type = var_sp->GetType(); + if (!var_type) + return {}; + CompilerType compiler_type = var_type->GetForwardCompilerType(); + if (compiler_type.IsPointerType()) + compiler_type = compiler_type.GetPointeeType(); + return compiler_type.GetCanonicalType(); +} + static void PrivateAutoCompleteMembers( StackFrame *frame, const std::string &partial_member_name, llvm::StringRef partial_path, @@ -598,6 +621,13 @@ static void PrivateAutoComplete( if (variable_list) { for (const VariableSP &var_sp : *variable_list) request.AddCompletion(var_sp->GetName()); + + // Offer members of this/self so that direct ivar access can be + // completed (eg "frame variable member" for "this->member"). + CompilerType instance_type = GetInstanceType(*frame, *variable_list); + if (instance_type.IsValid()) + PrivateAutoCompleteMembers(frame, "", "", "", instance_type, + request); } } } @@ -720,6 +750,13 @@ static void PrivateAutoComplete( } } } + + // Try also completing the token as a member of this/self (direct ivar + // access). 
+ CompilerType instance_type = GetInstanceType(*frame, *variable_list); + if (instance_type.IsValid()) + PrivateAutoCompleteMembers(frame, token, remaining_partial_path, + prefix_path, instance_type, request); } } break; diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py index 8a972da9cc902..04253b15c041d 100644 --- a/lldb/test/API/functionalities/completion/TestCompletion.py +++ b/lldb/test/API/functionalities/completion/TestCompletion.py @@ -82,6 +82,14 @@ def do_test_variable_completion(self, command): f"{command} ptr_container->Mem", f"{command} ptr_container->MemberVar" ) + def test_frame_variable_direct_ivar(self): + """Test that 'frame variable' completes members of 'this' directly.""" + self.build() + lldbutil.run_to_name_breakpoint(self, "Bar") + self.completions_contain("frame variable ", ["t", "temp"]) + self.complete_from_to("frame variable te", "frame variable temp") + self.complete_from_to("frame variable t.", "frame variable t.x") + def test_process_attach_dash_dash_con(self): """Test that 'process attach --con' completes to 'process attach --continue '.""" self.complete_from_to("process attach --con", "process attach --continue ") From 839647d8cfb5a1c2f1256d3d800700cd296fe5ce Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 11 May 2026 15:40:50 -0500 Subject: [PATCH 351/538] [DWARFLinker] Preserve children of DW_TAG_GNU_template_parameter_pack (#196439) Pack children were not getting ordered synthetic keys, so TypePool deduplicated them by name and TypesComparator sorted the survivors alphabetically. Register the two missing tags with SyntheticTypeNameBuilder. 
--- .../Parallel/SyntheticTypeNameBuilder.cpp | 4 +- .../dwarflinker-template-parameter-pack.map | 7 + .../dwarflinker-template-parameter-pack.test | 181 ++++++++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/dsymutil/X86/Inputs/dwarflinker-template-parameter-pack.map create mode 100644 llvm/test/tools/dsymutil/X86/dwarflinker-template-parameter-pack.test diff --git a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp index ca918f6e17b38..b5be5d498ed3e 100644 --- a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp +++ b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp @@ -671,7 +671,8 @@ OrderedChildrenIndexAssigner::OrderedChildrenIndexAssigner( case dwarf::DW_TAG_subroutine_type: case dwarf::DW_TAG_union_type: case dwarf::DW_TAG_GNU_template_template_param: - case dwarf::DW_TAG_GNU_formal_parameter_pack: { + case dwarf::DW_TAG_GNU_formal_parameter_pack: + case dwarf::DW_TAG_GNU_template_parameter_pack: { NeedCountChildren = true; } break; case dwarf::DW_TAG_enumeration_type: { @@ -724,6 +725,7 @@ std::optional OrderedChildrenIndexAssigner::tagToArrayIndex( return 0; case dwarf::DW_TAG_template_value_parameter: case dwarf::DW_TAG_template_type_parameter: + case dwarf::DW_TAG_GNU_template_template_param: return 1; case dwarf::DW_TAG_enumeration_type: if (std::optional ParentIdx = DieEntry->getParentIdx()) { diff --git a/llvm/test/tools/dsymutil/X86/Inputs/dwarflinker-template-parameter-pack.map b/llvm/test/tools/dsymutil/X86/Inputs/dwarflinker-template-parameter-pack.map new file mode 100644 index 0000000000000..697b3803bee92 --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/Inputs/dwarflinker-template-parameter-pack.map @@ -0,0 +1,7 @@ +--- +triple: 'x86_64-apple-darwin' +objects: + - filename: pack.o + symbols: + - { sym: __Z3foov, objAddr: 0x0, binAddr: 0x10000, size: 0x10 } +... 
diff --git a/llvm/test/tools/dsymutil/X86/dwarflinker-template-parameter-pack.test b/llvm/test/tools/dsymutil/X86/dwarflinker-template-parameter-pack.test new file mode 100644 index 0000000000000..c34393553847d --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/dwarflinker-template-parameter-pack.test @@ -0,0 +1,181 @@ +# RUN: rm -rf %t.dir && mkdir -p %t.dir +# RUN: yaml2obj %s -o %t.dir/pack.o +# RUN: dsymutil --linker classic -y %p/Inputs/dwarflinker-template-parameter-pack.map -oso-prepend-path=%t.dir -f -o - | llvm-dwarfdump -a - | FileCheck %s +# RUN: dsymutil --linker parallel -y %p/Inputs/dwarflinker-template-parameter-pack.map -oso-prepend-path=%t.dir -f -o - | llvm-dwarfdump -a - | FileCheck %s + +## A DW_TAG_GNU_template_parameter_pack with three identical +## DW_TAG_template_type_parameter children (all referring to the same +## int base type) must be preserved as three separate child DIEs after +## linking. Before the parallel linker registered the pack tag with +## SyntheticTypeNameBuilder, pack children received no ordered synthetic +## key and TypePool deduplicated the three siblings down to one. 
+ +# CHECK: DW_TAG_structure_type +# CHECK: DW_AT_name {{.*}}"Pack" +# CHECK: DW_TAG_GNU_template_parameter_pack +# CHECK: DW_AT_name {{.*}}"Ts" +# CHECK: DW_TAG_template_type_parameter +# CHECK: DW_AT_type {{.*}}"int" +# CHECK: DW_TAG_template_type_parameter +# CHECK: DW_AT_type {{.*}}"int" +# CHECK: DW_TAG_template_type_parameter +# CHECK: DW_AT_type {{.*}}"int" +# CHECK-NOT: DW_TAG_template_type_parameter + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 376 + flags: 0x00002000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0x00 + vmsize: 0x300 + fileoff: 0x300 + filesize: 0x300 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __debug_abbrev + segname: __DWARF + addr: 0x000000000000000F + size: 0x37 + offset: 0x00000380 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x02000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_info + segname: __DWARF + addr: 0x000000000000100 + size: 0x44 + offset: 0x000003B7 + align: 0 + reloff: 0x00000600 + nreloc: 1 + flags: 0x02000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + relocations: + - address: 0x000001FC + symbolnum: 1 + pcrel: true + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 0x700 + nsyms: 1 + stroff: 0x710 + strsize: 10 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - '__Z3foov' + - '' +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: 
DW_TAG_GNU_template_parameter_pack + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_template_type_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref_addr + - Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Attribute: DW_AT_const_value + Form: DW_FORM_data4 + - Attribute: DW_AT_type + Form: DW_FORM_ref_addr + debug_info: + - Version: 4 + Entries: + ## 0x0b: DW_TAG_compile_unit + - AbbrCode: 1 + Values: + - CStr: by_hand + - Value: 0x04 + ## 0x16: DW_TAG_structure_type "Pack" + - AbbrCode: 2 + Values: + - CStr: Pack + ## 0x1c: DW_TAG_GNU_template_parameter_pack "Ts" + - AbbrCode: 3 + Values: + - CStr: Ts + ## 0x20: DW_TAG_template_type_parameter -> int + - AbbrCode: 4 + Values: + - Value: 0x00000031 + ## 0x25: DW_TAG_template_type_parameter -> int + - AbbrCode: 4 + Values: + - Value: 0x00000031 + ## 0x2a: DW_TAG_template_type_parameter -> int + - AbbrCode: 4 + Values: + - Value: 0x00000031 + ## 0x2f: NULL (close pack) + - AbbrCode: 0 + ## 0x30: NULL (close structure) + - AbbrCode: 0 + ## 0x31: DW_TAG_base_type "int" + - AbbrCode: 5 + Values: + - CStr: int + ## 0x36: DW_TAG_variable "var" -> Pack + - AbbrCode: 6 + Values: + - CStr: var + - Value: 0x000000ff + - Value: 0x00000016 + ## 0x43: NULL (close CU) + - AbbrCode: 0 +... 
From 5645d72e54fdd73e17494b764b3057623a99622b Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Mon, 11 May 2026 22:42:48 +0200 Subject: [PATCH 352/538] [flang] Correct MIN/MAX bug with DO CONCURRENT and REDUCE (#196708) --- .../lib/Lower/Support/ReductionProcessor.cpp | 18 ++++++++- flang/test/Lower/loops3.f90 | 37 ++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index d5387f7a59118..b3a27736d1616 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -194,9 +194,23 @@ ReductionProcessor::getReductionName(ReductionIdentifier redId, case ReductionIdentifier::NEQV: reductionName = "neqv_reduction"; break; - default: - reductionName = "other_reduction"; + case ReductionIdentifier::MAX: + reductionName = "max_reduction"; + break; + case ReductionIdentifier::MIN: + reductionName = "min_reduction"; + break; + case ReductionIdentifier::IAND: + reductionName = "iand_reduction"; break; + case ReductionIdentifier::IOR: + reductionName = "ior_reduction"; + break; + case ReductionIdentifier::IEOR: + reductionName = "ieor_reduction"; + break; + default: + llvm_unreachable("unsupported reduction identifier"); } return getReductionName(reductionName, kindMap, ty, isByRef); diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 5df3c4fd93703..c80e6fcca4aed 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -12,7 +12,7 @@ subroutine loop_test ! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFloop_testEm"} ! CHECK: %[[SUM:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFloop_testEsum"} - ! 
CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) local(@_QFloop_testEtmp_private_i32 %{{.*}} -> %{{.*}} : !fir.ref) reduce(@add_reduction_i32 #fir.reduce_attr %[[SUM]]#0 -> %{{.*}}, @other_reduction_f32 #fir.reduce_attr %[[M]]#0 -> %{{.*}} : !fir.ref, !fir.ref) { + ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) local(@_QFloop_testEtmp_private_i32 %{{.*}} -> %{{.*}} : !fir.ref) reduce(@add_reduction_i32 #fir.reduce_attr %[[SUM]]#0 -> %{{.*}}, @max_reduction_f32 #fir.reduce_attr %[[M]]#0 -> %{{.*}} : !fir.ref, !fir.ref) { ! CHECK: %[[TMP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFloop_testEtmp"} ! CHECK: %[[SUM_INNER:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFloop_testEsum"} ! CHECK: %[[M_INNER:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFloop_testEm"} @@ -28,3 +28,38 @@ subroutine loop_test m = max(m, sum) enddo end subroutine loop_test + +! CHECK-LABEL: func.func @_QPloop_min_max_test +subroutine loop_min_max_test + integer :: i + real :: lo, hi + lo = huge(0.0) + hi = 0.0 + + ! CHECK: fir.do_concurrent.loop + ! CHECK-SAME: @min_reduction_f32 #fir.reduce_attr + ! CHECK-SAME: @max_reduction_f32 #fir.reduce_attr + do concurrent (i=1:10) reduce(min:lo) reduce(max:hi) + lo = min(lo, real(i)) + hi = max(hi, real(i)) + enddo +end subroutine loop_min_max_test + +! CHECK-LABEL: func.func @_QPloop_bitwise_test +subroutine loop_bitwise_test + integer :: i + integer :: a, o, x + a = -1 + o = 0 + x = 0 + + ! CHECK: fir.do_concurrent.loop + ! CHECK-SAME: @iand_reduction_i32 #fir.reduce_attr + ! CHECK-SAME: @ior_reduction_i32 #fir.reduce_attr + ! 
CHECK-SAME: @ieor_reduction_i32 #fir.reduce_attr + do concurrent (i=1:10) reduce(iand:a) reduce(ior:o) reduce(ieor:x) + a = iand(a, i) + o = ior(o, i) + x = ieor(x, i) + enddo +end subroutine loop_bitwise_test From 185305d8ca0a61524e46ccb87f0b09c91a1dfb0b Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 11 May 2026 13:51:00 -0700 Subject: [PATCH 353/538] [AMDGPU] Prevent prefetch and load reordering (#197025) Mark prefetches as having side effects, otherwise scheduler reorders these with loads. --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 1 + llvm/lib/Target/AMDGPU/SMInstructions.td | 1 + .../AMDGPU/llvm.amdgcn.flat.prefetch.ll | 19 +++++++++++++++++++ .../AMDGPU/llvm.amdgcn.global.prefetch.ll | 19 +++++++++++++++++++ .../AMDGPU/llvm.amdgcn.s.prefetch.data.ll | 14 ++++++++++++++ .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 14 +++++++------- 6 files changed, 61 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index a0963fcd5ce55..0f30ab24521cb 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -530,6 +530,7 @@ class FLAT_Prefetch_Pseudo { diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index ee8d29c77708b..343d7660cf250 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -251,6 +251,7 @@ class SM_Prefetch_Pseudo let ScalarStore = 0; let has_offset = 1; let has_soffset = 1; + let hasSideEffects = 1; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll index bace9a1d8a7d1..022eda6627a9d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll @@ -107,3 +107,22 @@ entry: tail call void 
@llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27) ret void } + +define amdgpu_ps float @flat_prefetch_and_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_prefetch_and_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 2, s[0:1] +; GCN-NEXT: flat_prefetch_b8 v[2:3] +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %ptr = getelementptr inbounds float, ptr %p, i64 %idxprom + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0) + %ret = load float, ptr %ptr, align 4 + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll index 2c5ad1e36cc9e..8039c6dedf532 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll @@ -107,3 +107,22 @@ entry: tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 27) ret void } + +define amdgpu_ps float @global_prefetch_and_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_prefetch_and_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 2, s[0:1] +; GCN-NEXT: global_prefetch_b8 v[2:3], off +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %ptr = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + 
tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0) + %ret = load float, ptr addrspace(1) %ptr, align 4 + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll index 26d64aa1cdf4c..3ee0a9ad8ac40 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll @@ -144,6 +144,20 @@ entry: ret void } +define amdgpu_ps float @prefetch_and_load_b32(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: prefetch_and_load_b32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %p, i32 0) + %ret = load float, ptr addrspace(4) %p, align 4 + ret float %ret +} + declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len) declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len) declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len) diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 29ff00607bd66..199b79932402f 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -113,8 +113,8 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX1250-NEXT: .LBB0_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: flat_load_b128 v[2:5], v0, s[2:3] offset:-176 ; GFX1250-NEXT: flat_prefetch_b8 v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: flat_load_b128 v[2:5], v0, s[2:3] offset:-176 ; GFX1250-NEXT: s_add_co_i32 s6, s6, -1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 
16 @@ -183,8 +183,8 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB1_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 @@ -209,9 +209,9 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12ES2-SPREFETCH-NEXT: .LBB1_2: ; %for.body ; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 ; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0) depctr_vm_vsrc(0) ; GFX12ES2-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 -; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 ; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 @@ -236,8 +236,8 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX1250-NEXT: .LBB1_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176 ; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176 ; GFX1250-NEXT: s_add_co_i32 s6, s6, -1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 @@ -306,8 +306,8 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; 
GFX12-SPREFETCH-NEXT: .LBB2_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 -; GFX12-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 +; GFX12-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 @@ -333,8 +333,8 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; GFX12ES2-SPREFETCH-NEXT: .LBB2_2: ; %for.body ; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0 -; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 +; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 @@ -843,8 +843,8 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB5_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 ; GFX1250-NEXT: global_prefetch_b8 v[2:3], off scope:SCOPE_SE +; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3] ; GFX1250-NEXT: s_add_co_i32 s0, s0, -1 From 14cc641b026ce9622cd47d9bc2cc9f73ee3780a5 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 11 May 2026 13:57:03 -0700 Subject: [PATCH 354/538] [msan] Strengthen LLVM/NEON floating-point<->int propagation (#196875) This generalizes handleNEONVectorConvertIntrinsic() to apply it to LLVM cross-platform floating point<->int conversion intrinsics. 
The handler uses an all-or-nothing approach: if any bit of an input element is uninitialized, the corresponding output element is fully uninitialized. This approximates how a single bit flip in an integer can affect multiple bits of the equivalent floating-point (likewise for FP to int). This implements the future work suggested in https://github.com/llvm/llvm-project/pull/196429. --- .../Instrumentation/MemorySanitizer.cpp | 104 +-- .../MemorySanitizer/AArch64/arm64-vcvt.ll | 32 +- .../AArch64/arm64-vcvt_f32_su32.ll | 16 +- .../X86/avx512-intrinsics-upgrade.ll | 637 +++++++++--------- .../MemorySanitizer/X86/avx512-intrinsics.ll | 24 +- .../X86/avx512fp16-intrinsics.ll | 6 +- .../Instrumentation/MemorySanitizer/ftrunc.ll | 92 ++- 7 files changed, 511 insertions(+), 400 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index f128ff5cfaab6..c45ec68f3cd07 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2616,10 +2616,63 @@ struct MemorySanitizerVisitor : public InstVisitor { setOrigin(&I, getOrigin(&I, 0)); } - void visitFPToSIInst(CastInst &I) { handleShadowOr(I); } - void visitFPToUIInst(CastInst &I) { handleShadowOr(I); } - void visitSIToFPInst(CastInst &I) { handleShadowOr(I); } - void visitUIToFPInst(CastInst &I) { handleShadowOr(I); } + /// Handle LLVM and NEON vector convert intrinsics. + /// + /// e.g., <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) + /// i32 @llvm.aarch64.neon.fcvtms.i32.f64 (double) + /// <2 x i32> @fptoui (<2 x float>) + /// i64 @llvm.fptosi.sat.i64.f64(double) + /// + /// Note that the size of input/output elements can differ e.g., + /// double @sitofp(i32) + /// but the number of elements must be the same. 
+ /// + /// For conversions to or from fixed-point, there is a trailing argument to + /// indicate the fixed-point precision: + /// - <4 x float> llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) + /// - <4 x i32> llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) + /// + /// For x86 SSE vector convert intrinsics, see + /// handleSSEVectorConvertIntrinsic(). + void handleGenericVectorConvertIntrinsic(Instruction &I, bool FixedPoint) { + [[maybe_unused]] unsigned NumArgs = I.getNumOperands(); + if (auto *CI = dyn_cast(&I)) + NumArgs = CI->arg_size(); + + if (FixedPoint) { + assert(NumArgs == 2); + Value *Precision = I.getOperand(1); + insertCheckShadowOf(Precision, &I); + } else { + assert(NumArgs == 1); + } + + IRBuilder<> IRB(&I); + Value *S0 = getShadow(&I, 0); + + /// For scalars: + /// Since they are converting from floating-point to integer, the output is + /// - fully uninitialized if *any* bit of the input is uninitialized + /// - fully initialized if all bits of the input are initialized + /// We apply the same principle on a per-field basis for vectors. + Value *OutShadow = IRB.CreateSExt(IRB.CreateICmpNE(S0, getCleanShadow(S0)), + getShadowTy(&I)); + setShadow(&I, OutShadow); + setOriginForNaryOp(I); + } + + void visitFPToSIInst(CastInst &I) { + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); + } + void visitFPToUIInst(CastInst &I) { + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); + } + void visitSIToFPInst(CastInst &I) { + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); + } + void visitUIToFPInst(CastInst &I) { + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); + } void visitFPExtInst(CastInst &I) { handleShadowOr(I); } void visitFPTruncInst(CastInst &I) { handleShadowOr(I); } @@ -3561,43 +3614,6 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - /// Handle Arm NEON vector convert intrinsics.
- /// - /// e.g., <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) - /// i32 @llvm.aarch64.neon.fcvtms.i32.f64 (double) - /// - /// For conversions to or from fixed-point, there is a trailing argument to - /// indicate the fixed-point precision: - /// - <4 x float> llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) - /// - <4 x i32> llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) - /// - /// For x86 SSE vector convert intrinsics, see - /// handleSSEVectorConvertIntrinsic(). - void handleNEONVectorConvertIntrinsic(IntrinsicInst &I, bool FixedPoint) { - if (FixedPoint) - assert(I.arg_size() == 2); - else - assert(I.arg_size() == 1); - - IRBuilder<> IRB(&I); - Value *S0 = getShadow(&I, 0); - - if (FixedPoint) { - Value *Precision = I.getOperand(1); - insertCheckShadowOf(Precision, &I); - } - - /// For scalars: - /// Since they are converting from floating-point to integer, the output is - /// - fully uninitialized if *any* bit of the input is uninitialized - /// - fully ininitialized if all bits of the input are ininitialized - /// We apply the same principle on a per-field basis for vectors. - Value *OutShadow = IRB.CreateSExt(IRB.CreateICmpNE(S0, getCleanShadow(S0)), - getShadowTy(&I)); - setShadow(&I, OutShadow); - setOriginForNaryOp(I); - } - /// Some instructions have additional zero-elements in the return type /// e.g., <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, ...) /// @@ -5870,7 +5886,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // lowered to these cross-platform intrinsics. 
case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: - handleShadowOr(I); + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); break; default: @@ -7173,7 +7189,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // Vector Conversions Between Half-Precision and Single-Precision case Intrinsic::aarch64_neon_vcvthf2fp: case Intrinsic::aarch64_neon_vcvtfp2hf: - handleNEONVectorConvertIntrinsic(I, /*FixedPoint=*/false); + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); break; // Vector Conversions Between Fixed-Point and Floating-Point @@ -7181,7 +7197,7 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::aarch64_neon_vcvtfp2fxs: case Intrinsic::aarch64_neon_vcvtfxu2fp: case Intrinsic::aarch64_neon_vcvtfp2fxu: - handleNEONVectorConvertIntrinsic(I, /*FixedPoint=*/true); + handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/true); break; // TODO: bfloat conversions diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll index e8d9fd2862d29..9efed0338cb24 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll @@ -533,8 +533,10 @@ define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptosi <2 x float> [[A]] to <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = fptosi <2 x float> %A to <2 x i32> @@ -546,8 +548,10 @@ define <4 x i32> @fcvtzs_4s(<4 x 
float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptosi <4 x float> [[A]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = fptosi <4 x float> %A to <4 x i32> @@ -559,8 +563,10 @@ define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptosi <2 x double> [[A]] to <2 x i64> -; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = fptosi <2 x double> %A to <2 x i64> @@ -572,8 +578,10 @@ define <1 x i64> @fcvtzs_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptosi <1 x double> [[A]] to <1 x i64> -; CHECK-NEXT: store <1 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = fptosi 
<1 x double> %A to <1 x i64> @@ -650,8 +658,10 @@ define <2 x i32> @fcvtzu_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptoui <2 x float> [[A]] to <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = fptoui <2 x float> %A to <2 x i32> @@ -663,8 +673,10 @@ define <4 x i32> @fcvtzu_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptoui <4 x float> [[A]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = fptoui <4 x float> %A to <4 x i32> @@ -676,8 +688,10 @@ define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptoui <2 x double> [[A]] to <2 x i64> -; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = fptoui <2 x double> %A to <2 x i64> @@ -689,8 +703,10 @@ define <1 x i64> @fcvtzu_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = fptoui <1 x double> [[A]] to <1 x i64> -; CHECK-NEXT: store <1 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = fptoui <1 x double> %A to <1 x i64> diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll index 01af715652102..802c3aa4784ee 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll @@ -15,8 +15,10 @@ define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp sanitize_memory { ; CHECK-SAME: <2 x i32> [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float> -; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x float> [[VCVT_I]] ; %vcvt.i = uitofp <2 x i32> %a to <2 x float> @@ -28,8 +30,10 @@ define <2 x float> @scvt(<2 x i32> %a) nounwind readnone ssp sanitize_memory { ; CHECK-SAME: 
<2 x i32> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[A]] to <2 x float> -; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x float> [[VCVT_I]] ; %vcvt.i = sitofp <2 x i32> %a to <2 x float> @@ -41,8 +45,10 @@ define <4 x float> @ucvtq(<4 x i32> %a) nounwind readnone ssp sanitize_memory { ; CHECK-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[VCVT_I]] ; %vcvt.i = uitofp <4 x i32> %a to <4 x float> @@ -54,8 +60,10 @@ define <4 x float> @scvtq(<4 x i32> %a) nounwind readnone ssp sanitize_memory { ; CHECK-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float> -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[VCVT_I]] ; %vcvt.i = sitofp <4 x i32> %a to 
<4 x float> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index f0a1791068d9e..a870fdd945b6f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -739,14 +739,14 @@ define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr align 1 [[PTR]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -783,14 +783,14 @@ define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr align 1 [[PTR]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], 
label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -827,14 +827,14 @@ define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr align 64 [[PTR]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -871,14 +871,14 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr align 64 [[PTR]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -915,14 +915,14 @@ define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr align 1 [[PTR1]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -959,14 +959,14 @@ define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i3 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr align 1 [[PTR1]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = 
ptrtoint ptr [[PTR2:%.*]] to i64 @@ -1003,14 +1003,14 @@ define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr align 64 [[PTR1]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -1047,14 +1047,14 @@ define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr align 64 [[PTR1]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 @@ -1080,7 +1080,7 @@ define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, 
ptr %ptr, i16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 64 @@ -1100,7 +1100,7 @@ define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP10]], <16 x float> [[TMP5]]) @@ -1116,7 +1116,7 @@ define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 24: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 25: ; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP19]], <16 x float> zeroinitializer) @@ -1143,7 +1143,7 @@ define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: 
[[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 1 @@ -1163,7 +1163,7 @@ define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 [[PTR]], <16 x i1> [[TMP10]], <16 x float> [[TMP5]]) @@ -1179,7 +1179,7 @@ define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 24: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 25: ; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 [[PTR]], <16 x i1> [[TMP19]], <16 x float> zeroinitializer) @@ -1206,7 +1206,7 @@ define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 64 @@ -1226,7 +1226,7 @@ define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: -; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP10]], <8 x double> [[TMP5]]) @@ -1242,7 +1242,7 @@ define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 24: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 25: ; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP19]], <8 x double> zeroinitializer) @@ -1269,7 +1269,7 @@ define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 1 @@ -1289,7 +1289,7 @@ define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 [[PTR]], <8 x i1> [[TMP10]], <8 x double> [[TMP5]]) @@ -1305,7 +1305,7 @@ define <8 x double> @test_mask_load_unaligned_pd(<8 x 
double> %data, ptr %ptr, i ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 24: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 25: ; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 [[PTR]], <8 x i1> [[TMP19]], <8 x double> zeroinitializer) @@ -1335,7 +1335,7 @@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 1 @@ -1355,7 +1355,7 @@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] ; CHECK: 16: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 17: ; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[PTR2]], <16 x i1> [[TMP11]], <16 x i32> [[TMP6]]) @@ -1371,7 +1371,7 @@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]] ; CHECK: 25: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 26: ; 
CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[PTR]], <16 x i1> [[TMP20]], <16 x i32> zeroinitializer) @@ -1399,7 +1399,7 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 1 @@ -1419,7 +1419,7 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] ; CHECK: 16: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 17: ; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[PTR2]], <8 x i1> [[TMP11]], <8 x i64> [[TMP6]]) @@ -1435,7 +1435,7 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]] ; CHECK: 25: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 26: ; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[PTR]], <8 x i1> [[TMP20]], <8 x i64> zeroinitializer) @@ -1462,7 +1462,7 @@ define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mas ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label 
[[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 64 @@ -1482,7 +1482,7 @@ define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mas ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP10]], <16 x i32> [[TMP5]]) @@ -1498,7 +1498,7 @@ define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mas ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 24: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 25: ; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) @@ -1525,7 +1525,7 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64 @@ -1545,7 +1545,7 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, 
ptr %ptr, i8 %mask) ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP10]], <8 x i64> [[TMP5]]) @@ -1561,7 +1561,7 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) ; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] ; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 24: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 25: ; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) @@ -2770,7 +2770,7 @@ define void@test_storent_q_512(<8 x i64> %data, ptr %ptr) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 @@ -2795,7 +2795,7 @@ define void @test_storent_pd_512(<8 x double> %data, ptr %ptr) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 @@ 
-2820,7 +2820,7 @@ define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 @@ -3192,7 +3192,7 @@ define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -3221,7 +3221,7 @@ define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -3257,7 +3257,7 @@ define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -3294,7 +3294,7 @@ define <16 x i32> 
@test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <16 x i32> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -3331,7 +3331,7 @@ define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -3375,7 +3375,7 @@ define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mas ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -3481,7 +3481,7 @@ define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -3510,7 +3510,7 @@ define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, 
<16 x i32> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -3546,7 +3546,7 @@ define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -3583,7 +3583,7 @@ define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <16 x i32> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -3620,7 +3620,7 @@ define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -3663,7 +3663,7 @@ define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mas ; CHECK-NEXT: [[_MSCMP:%.*]] 
= icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -3769,7 +3769,7 @@ define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 @@ -3798,7 +3798,7 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %p ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 @@ -3834,7 +3834,7 @@ define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 @@ -3871,7 +3871,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b, <8 x i64> %e ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 
[[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -3908,7 +3908,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -3952,7 +3952,7 @@ define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -4058,7 +4058,7 @@ define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 @@ -4087,7 +4087,7 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %p ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof 
[[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 @@ -4123,7 +4123,7 @@ define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 @@ -4160,7 +4160,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b, <8 x i64> %e ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -4197,7 +4197,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -4241,7 +4241,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call 
void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -4347,7 +4347,7 @@ define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -4376,7 +4376,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -4412,7 +4412,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -4449,7 +4449,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -4486,7 +4486,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -4530,7 +4530,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 @@ -7853,7 +7853,7 @@ define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, ptr %ptr) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64 @@ -7880,7 +7880,8 @@ define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double ; CHECK-LABEL: @test_int_x86_avx512_cvt_dq2pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: 
[[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> ; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> ; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x double> [[CVT]] @@ -7896,7 +7897,8 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x d ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP14]] to <8 x i64> ; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> @@ -7921,7 +7923,8 @@ define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x doubl ; CHECK-LABEL: @test_int_x86_avx512_cvt_udq2pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double> ; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x double> [[CVT]] @@ -7937,7 +7940,8 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr 
(i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP14]] to <8 x i64> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> @@ -7964,7 +7968,7 @@ define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) @@ -7983,7 +7987,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) @@ -8010,7 +8014,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] 
; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> [[A1:%.*]], i16 [[MASK:%.*]], i32 4) @@ -8033,7 +8037,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 8) @@ -8056,7 +8060,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) #0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 4) @@ -8151,7 +8155,7 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X3:%.*]], <8 x i64> [[X2]]) @@ -8178,7 +8182,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x 
double> %x0, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X5:%.*]], <8 x i64> [[X4]]) @@ -8214,7 +8218,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP7]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X4:%.*]], <8 x i64> [[X2]]) @@ -8249,7 +8253,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X3:%.*]], <16 x i32> [[X2]]) @@ -8276,7 +8280,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: 
[[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X5:%.*]], <16 x i32> [[X4]]) @@ -8313,7 +8317,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X4:%.*]], <16 x i32> [[X2]]) @@ -8507,7 +8511,7 @@ define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -8552,7 +8556,7 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -8604,7 +8608,7 @@ define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -8657,7 +8661,7 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -8713,7 +8717,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -8776,7 +8780,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP35:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -8857,7 +8861,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; 
CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -8919,7 +8923,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_ ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP34:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -9110,7 +9114,7 @@ define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -9155,7 +9159,7 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -9207,7 +9211,7 @@ define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: 
[[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -9260,7 +9264,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -9316,7 +9320,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -9379,7 +9383,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -9830,7 +9834,7 @@ define <8 x i64> @test_x86_avx512_movntdqa(ptr %a0) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] ; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 3: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[A0:%.*]], align 64, !nontemporal 
[[META2]] @@ -11122,7 +11126,7 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(ptr %x0ptr, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[X0:%.*]] = load <4 x float>, ptr [[X0PTR:%.*]], align 16 @@ -11225,7 +11229,7 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[X0:%.*]] = load <4 x double>, ptr [[X0PTR:%.*]], align 32 @@ -11312,7 +11316,7 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(ptr %x0ptr, < ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[X0:%.*]] = load <4 x i32>, ptr [[X0PTR:%.*]], align 16 @@ -11410,7 +11414,7 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[X0:%.*]] = load <4 x i64>, ptr [[X0PTR:%.*]], align 32 @@ 
-12127,7 +12131,7 @@ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) @@ -12152,7 +12156,7 @@ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) @@ -12289,7 +12293,7 @@ define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -12334,7 +12338,7 @@ define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passT ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -12388,7 +12392,7 @@ define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -12442,7 +12446,7 @@ define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP26:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -12497,7 +12501,7 @@ define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -12561,7 +12565,7 @@ define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP32:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: 
unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -12740,7 +12744,7 @@ define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -12785,7 +12789,7 @@ define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passT ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -12839,7 +12843,7 @@ define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 @@ -12893,7 +12897,7 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP26:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: 
[[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -12948,7 +12952,7 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -13012,7 +13016,7 @@ define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP32:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 @@ -13073,7 +13077,8 @@ define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b) ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[B:%.*]] to double ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP4]], i64 0 @@ -13094,7 +13099,7 @@ define <16 x float> @test_x86_vbroadcast_ss_512(ptr %a0) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] ; CHECK: 2: -; CHECK-NEXT: 
call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 3: ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A0:%.*]], align 4 @@ -13150,7 +13155,7 @@ define <8 x double> @test_x86_vbroadcast_sd_512(ptr %a0) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] ; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 3: ; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[A0:%.*]], align 8 @@ -13196,7 +13201,7 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i6 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) @@ -13222,7 +13227,7 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) @@ -13257,7 +13262,7 @@ define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, < ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label 
[[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) @@ -13356,7 +13361,7 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) @@ -13382,7 +13387,7 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) @@ -13417,7 +13422,7 @@ define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, < ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x 
i32> [[X1:%.*]]) @@ -13520,7 +13525,7 @@ define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) @@ -13549,7 +13554,7 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) @@ -13588,7 +13593,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) @@ -13625,7 +13630,7 @@ define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1 ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label 
[[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) @@ -13654,7 +13659,7 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64 ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) @@ -13693,7 +13698,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) @@ -13724,7 +13729,7 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr 
[[X2P:%.*]], align 64 @@ -13738,7 +13743,7 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP11]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X3]], <16 x i32> [[X4:%.*]]) @@ -13761,7 +13766,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 @@ -13775,7 +13780,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]]) @@ -13812,7 +13817,7 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP12]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]]) @@ -13841,7 +13846,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP21]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]]) @@ -13880,7 +13885,7 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]]) @@ -13909,7 +13914,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP21]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]]) @@ 
-13945,7 +13950,7 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X3]], <8 x i64> [[X2:%.*]]) @@ -13970,7 +13975,7 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]]) @@ -14002,7 +14007,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 @@ -14016,7 +14021,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] ; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 14: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X4]], <16 x i32> [[X2]]) @@ -14050,7 +14055,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 @@ -14071,7 +14076,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i24 [[TMP25]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] ; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 18: ; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X4]], <8 x double> [[X2]]) @@ -14113,7 +14118,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] ; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 12: ; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X4]], <16 x float> [[X2:%.*]]) @@ -14150,7 +14155,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i24 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X4]], <8 x i64> [[X2:%.*]]) @@ -14183,7 +14188,7 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X3]], <16 x i32> [[X2:%.*]]) @@ -14208,7 +14213,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X4]], <16 x i32> [[X2:%.*]]) @@ -14243,7 +14248,7 @@ define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; 
CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -14267,7 +14272,7 @@ define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -14291,7 +14296,7 @@ define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -14315,7 +14320,7 @@ define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -14339,7 +14344,7 @@ define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -14363,7 +14368,7 @@ define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -14387,7 +14392,7 @@ define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -14411,7 +14416,7 @@ define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> 
[[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -14438,7 +14443,7 @@ define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -14473,7 +14478,7 @@ define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -14508,7 +14513,7 @@ define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -14543,7 +14548,7 @@ define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -14580,7 +14585,7 @@ define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -14617,7 +14622,7 @@ define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -14654,7 +14659,7 @@ define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -14691,7 +14696,7 @@ 
define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -14728,7 +14733,7 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8) @@ -14763,7 +14768,7 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9) @@ -14798,7 +14803,7 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: 
call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10) @@ -14833,7 +14838,7 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11) @@ -14868,7 +14873,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -14901,7 +14906,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -14934,7 +14939,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, 
<16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -14968,7 +14973,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15003,7 +15008,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -15038,7 +15043,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; 
CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15073,7 +15078,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -15108,7 +15113,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -15144,7 +15149,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15181,7 +15186,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -15215,7 +15220,7 @@ define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15237,7 +15242,7 @@ define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -15259,7 +15264,7 @@ define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> 
@llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -15282,7 +15287,7 @@ define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15305,7 +15310,7 @@ define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -15332,7 +15337,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15367,7 +15372,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label 
[[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -15402,7 +15407,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -15438,7 +15443,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15475,7 +15480,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> 
[[A1:%.*]], i32 4) @@ -15508,7 +15513,7 @@ define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15530,7 +15535,7 @@ define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -15552,7 +15557,7 @@ define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -15575,7 +15580,7 @@ define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15598,7 +15603,7 @@ define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -15623,7 +15628,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15656,7 +15661,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -15689,7 +15694,7 @@ define <16 x float> 
@test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -15723,7 +15728,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15758,7 +15763,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -15793,7 +15798,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15828,7 +15833,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -15863,7 +15868,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -15899,7 +15904,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -15936,7 +15941,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 ; 
CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -15970,7 +15975,7 @@ define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) @@ -15992,7 +15997,7 @@ define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) @@ -16014,7 +16019,7 @@ define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 
6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) @@ -16037,7 +16042,7 @@ define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x floa ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) @@ -16060,7 +16065,7 @@ define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) @@ -16091,7 +16096,7 @@ define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 % ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]]) @@ -16116,7 +16121,7 @@ define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label 
[[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true)) @@ -16145,7 +16150,7 @@ define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]]) @@ -16170,7 +16175,7 @@ define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true)) @@ -16199,7 +16204,7 @@ define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]]) @@ 
-16224,7 +16229,7 @@ define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true)) @@ -16253,7 +16258,7 @@ define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %ma ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]]) @@ -16278,7 +16283,7 @@ define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true)) @@ -16307,7 +16312,7 @@ define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]]) @@ -16336,7 +16341,7 @@ define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer) @@ -16362,7 +16367,7 @@ define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]]) @@ -16386,7 +16391,7 @@ define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> % ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]]) @@ -16416,7 +16421,7 @@ define <16 x float> @test_mask_expand_load_ps_512(ptr 
%addr, <16 x float> %data, ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]]) @@ -16445,7 +16450,7 @@ define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer) @@ -16471,7 +16476,7 @@ define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]]) @@ -16501,7 +16506,7 @@ define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %ma ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]]) @@ -16530,7 +16535,7 @@ define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) @@ -16556,7 +16561,7 @@ define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]]) @@ -16586,7 +16591,7 @@ define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]]) @@ -16615,7 +16620,7 @@ define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask) #0 { ; 
CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) @@ -16641,7 +16646,7 @@ define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]]) @@ -16995,7 +17000,7 @@ define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %extr ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) @@ -17016,7 +17021,7 @@ define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: 
[[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) @@ -17046,7 +17051,7 @@ define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) # ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) @@ -17132,7 +17137,7 @@ define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) @@ -17153,7 +17158,7 @@ define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) @@ -17183,7 +17188,7 @@ define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 5: ; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) @@ -17754,7 +17759,7 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] ; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] ; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 24: ; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) @@ -17773,7 +17778,7 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] ; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] ; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 31: ; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) @@ -17852,7 +17857,7 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] ; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] ; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 24: ; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) @@ -17871,7 +17876,7 @@ define <4 x 
float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] ; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] ; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 31: ; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) @@ -17949,7 +17954,7 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]] ; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] ; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 23: ; CHECK-NEXT: [[TMP24:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]], i32 11) @@ -18022,7 +18027,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]] ; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] ; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 23: ; CHECK-NEXT: [[TMP24:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]], i32 11) @@ -18095,7 +18100,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] ; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] ; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 24: ; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) @@ -18114,7 +18119,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] ; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] ; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 31: ; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) @@ -18193,7 +18198,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] ; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] ; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 24: ; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) @@ -18212,7 +18217,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] ; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] ; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 31: ; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) @@ -18256,7 +18261,7 @@ define void @fmadd_ss_mask_memfold(ptr %a, 
ptr %b, i8 %c, <4 x float> %extra_par ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 @@ -18275,7 +18280,7 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_par ; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP10:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 @@ -18319,7 +18324,7 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_par ; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP30:%.*]], label [[TMP34:%.*]], !prof [[PROF1]] ; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 31: ; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 @@ -18359,7 +18364,7 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_pa ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 @@ -18378,7 +18383,7 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c, <4 x float> 
%extra_pa ; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP10:%.*]], label [[TMP28:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 @@ -18421,7 +18426,7 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_pa ; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] ; CHECK: 29: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 30: ; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 @@ -18461,7 +18466,7 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_pa ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 @@ -18476,7 +18481,7 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_pa ; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP10:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 @@ -18516,7 +18521,7 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_pa ; CHECK-NEXT: 
[[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP30:%.*]], label [[TMP34:%.*]], !prof [[PROF1]] ; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 31: ; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 @@ -18552,7 +18557,7 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_p ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 @@ -18567,7 +18572,7 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_p ; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP10:%.*]], label [[TMP28:%.*]], !prof [[PROF1]] ; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 11: ; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 @@ -18606,7 +18611,7 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_p ; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] ; CHECK: 29: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 30: ; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 @@ -18681,7 +18686,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], 
[[_MSCMP23]] ; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] ; CHECK: 26: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 27: ; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) @@ -18703,7 +18708,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] ; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] ; CHECK: 35: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 36: ; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) @@ -18788,7 +18793,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] ; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] ; CHECK: 26: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 27: ; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) @@ -18810,7 +18815,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] ; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] ; CHECK: 35: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 36: ; CHECK-NEXT: 
[[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) @@ -18897,7 +18902,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] ; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] ; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 29: ; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) @@ -18920,7 +18925,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] ; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] ; CHECK: 38: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 39: ; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) @@ -19007,7 +19012,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] ; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] ; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 29: ; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) @@ -19030,7 +19035,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] ; CHECK-NEXT: br 
i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] ; CHECK: 38: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 39: ; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) @@ -19077,7 +19082,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 @@ -19131,7 +19136,7 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] ; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 7: ; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 @@ -19184,7 +19189,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 @@ -19292,24 +19297,26 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = sext <16 x i1> [[TMP14]] to <16 x i32> ; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP15]], <16 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP15]] ; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 16: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK: 17: ; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) ; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer ; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] @@ -19331,24 +19338,26 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = sext <16 x i1> [[TMP14]] to <16 x i32> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP15]], <16 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP15]] ; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: br i1 [[_MSCMP]], label 
[[TMP18:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 16: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK: 17: ; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer ; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] @@ -19380,7 +19389,7 @@ define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) @@ -19406,7 +19415,7 @@ define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) # ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) @@ -19429,7 +19438,7 @@ define <8 x double> @test_compress_pd_512(<8 x double> %data, <8 x double> %extr ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[EXTRA_PARAM:%.*]], <8 x i1> splat (i1 true)) @@ -19461,7 +19470,7 @@ define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) @@ -19487,7 +19496,7 @@ define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) @@ -19510,7 +19519,7 @@ define <16 x float> @test_compress_ps_512(<16 x float> %data, <16 x float> %extr ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> 
[[EXTRA_PARAM:%.*]], <16 x i1> splat (i1 true)) @@ -19542,7 +19551,7 @@ define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) @@ -19568,7 +19577,7 @@ define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) @@ -19591,7 +19600,7 @@ define <8 x i64> @test_compress_q_512(<8 x i64> %data, <8 x i64> %extra_param) ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[EXTRA_PARAM:%.*]], <8 x i1> splat (i1 true)) @@ -19623,7 +19632,7 @@ define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passth ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label 
[[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) @@ -19649,7 +19658,7 @@ define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) @@ -19672,7 +19681,7 @@ define <16 x i32> @test_compress_d_512(<16 x i32> %data, <16 x i32> %extra_param ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[EXTRA_PARAM:%.*]], <16 x i1> splat (i1 true)) @@ -19697,7 +19706,7 @@ define <8 x double> @test_expand_pd_512(<8 x double> %data, <8 x double> %extra_ ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; 
CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[EXTRA_PARAM:%.*]], <8 x i1> splat (i1 true)) @@ -19727,7 +19736,7 @@ define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %p ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) @@ -19753,7 +19762,7 @@ define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) @@ -19778,7 +19787,7 @@ define <16 x float> @test_expand_ps_512(<16 x float> %data, <16 x float> %extra_ ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[EXTRA_PARAM:%.*]], <16 x i1> splat (i1 true)) @@ -19808,7 +19817,7 @@ define <16 x float> 
@test_mask_expand_ps_512(<16 x float> %data, <16 x float> %p ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) @@ -19834,7 +19843,7 @@ define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) @@ -19859,7 +19868,7 @@ define <8 x i64> @test_expand_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[EXTRA_PARAM:%.*]], <8 x i1> splat (i1 true)) @@ -19889,7 +19898,7 @@ define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) @@ -19915,7 +19924,7 @@ define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) @@ -19940,7 +19949,7 @@ define <16 x i32> @test_expand_d_512(<16 x i32> %data, <16 x i32> %extra_param) ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] ; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 6: ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[EXTRA_PARAM:%.*]], <16 x i1> splat (i1 true)) @@ -19970,7 +19979,7 @@ define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] ; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 10: ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> 
[[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) @@ -19996,7 +20005,7 @@ define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) @@ -20026,7 +20035,7 @@ define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 1, <16 x i1> splat (i1 true), i32 8) @@ -20037,14 +20046,14 @@ define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> ; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] ; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 13: ; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[C:%.*]], <16 x float> [[D:%.*]], i32 1, <16 x i1> splat (i1 true), i32 4) ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP5]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] ; CHECK: 15: 
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR9]] ; CHECK-NEXT: unreachable ; CHECK: 16: ; CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr [[P:%.*]], align 64 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll index 7126f9457f530..d84cd899bd4a5 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -7222,24 +7222,26 @@ define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = sext <16 x i1> [[TMP14]] to <16 x i32> ; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP15]], <16 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP15]] ; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] ; 
CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 16: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK: 17: ; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer ; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] @@ -7548,24 +7550,26 @@ define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = sext <16 x i1> [[TMP14]] to <16 x i32> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP15]], <16 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; 
CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP15]] ; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 16: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK: 17: ; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer ; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index c5e61d8ea07d7..49c1d28cc224e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -2114,7 +2114,8 @@ define <16 x half> @sint_to_fp_16i32_to_16f16(<16 x i32> %x) #0 { ; CHECK-SAME: <16 x i32> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16> ; CHECK-NEXT: [[RES:%.*]] = sitofp <16 x i32> [[X]] to <16 x half> ; CHECK-NEXT: store <16 x i16> [[TMP2]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x half> [[RES]] @@ -2216,7 +2217,8 @@ define <16 x half> @uint_to_fp_16i32_to_16f16(<16 x i32> %x) #0 { ; CHECK-SAME: <16 x i32> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16> ; CHECK-NEXT: [[RES:%.*]] = uitofp <16 x i32> [[X]] to <16 x half> ; CHECK-NEXT: store <16 x i16> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x half> [[RES]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/ftrunc.ll b/llvm/test/Instrumentation/MemorySanitizer/ftrunc.ll index 878bdd24eed73..e636bb0350c7e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/ftrunc.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/ftrunc.ll @@ -20,9 +20,13 @@ define float @trunc_unsigned_f32(float %x) #0 { ; CHECK-SAME: float [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 ; CHECK-NEXT: [[I:%.*]] = fptoui float [[X]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP4]] to i32 ; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[I]] to float -; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret float [[R]] ; %i = fptoui float %x to i32 @@ -35,9 +39,13 @@ define double @trunc_unsigned_f64(double %x) #0 { ; CHECK-SAME: double [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne 
i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 ; CHECK-NEXT: [[I:%.*]] = fptoui double [[X]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP4]] to i64 ; CHECK-NEXT: [[R:%.*]] = uitofp i64 [[I]] to double -; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret double [[R]] ; %i = fptoui double %x to i64 @@ -50,9 +58,13 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 { ; CHECK-SAME: <4 x float> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[I:%.*]] = fptoui <4 x float> [[X]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[R:%.*]] = uitofp <4 x i32> [[I]] to <4 x float> -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[R]] ; %i = fptoui <4 x float> %x to <4 x i32> @@ -65,9 +77,13 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 { ; CHECK-SAME: <2 x double> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[I:%.*]] = fptoui <2 x double> [[X]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64> ; CHECK-NEXT: [[R:%.*]] = uitofp <2 x 
i64> [[I]] to <2 x double> -; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[R]] ; %i = fptoui <2 x double> %x to <2 x i64> @@ -80,9 +96,13 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 { ; CHECK-SAME: <4 x double> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64> ; CHECK-NEXT: [[I:%.*]] = fptoui <4 x double> [[X]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i64> ; CHECK-NEXT: [[R:%.*]] = uitofp <4 x i64> [[I]] to <4 x double> -; CHECK-NEXT: store <4 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[R]] ; %i = fptoui <4 x double> %x to <4 x i64> @@ -95,9 +115,13 @@ define float @trunc_signed_f32_nsz(float %x) #0 { ; CHECK-SAME: float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 ; CHECK-NEXT: [[I:%.*]] = fptosi float [[X]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP4]] to i32 ; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[I]] to float -; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret float [[R]] ; %i = fptosi float %x to i32 @@ -110,9 +134,11 @@ define double @trunc_signed32_f64_no_fast_math(double %x) #0 { ; CHECK-SAME: double [[X:%.*]]) 
#[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP2]] to i32 ; CHECK-NEXT: [[I:%.*]] = fptosi double [[X]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP4]] to i64 ; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[I]] to double ; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret double [[R]] @@ -127,9 +153,11 @@ define double @trunc_signed32_f64_nsz(double %x) #0 { ; CHECK-SAME: double [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP2]] to i32 ; CHECK-NEXT: [[I:%.*]] = fptosi double [[X]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP4]] to i64 ; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[I]] to double ; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret double [[R]] @@ -144,8 +172,11 @@ define double @trunc_f32_signed32_f64_nsz(float %x) #0 { ; CHECK-SAME: float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP5]] to i32 ; CHECK-NEXT: [[I:%.*]] = fptosi float [[X]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext i1 [[TMP4]] to i64 ; CHECK-NEXT: [[R:%.*]] = 
sitofp i32 [[I]] to double ; CHECK-NEXT: store i64 [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret double [[R]] @@ -160,8 +191,11 @@ define float @trunc_f64_signed32_f32_nsz(double %x) #0 { ; CHECK-SAME: double [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP5]] to i32 ; CHECK-NEXT: [[I:%.*]] = fptosi double [[X]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext i1 [[TMP4]] to i32 ; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[I]] to float ; CHECK-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret float [[R]] @@ -176,9 +210,13 @@ define double @trunc_signed_f64_nsz(double %x) #0 { ; CHECK-SAME: double [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 ; CHECK-NEXT: [[I:%.*]] = fptosi double [[X]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP4]] to i64 ; CHECK-NEXT: [[R:%.*]] = sitofp i64 [[I]] to double -; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret double [[R]] ; %i = fptosi double %x to i64 @@ -191,9 +229,13 @@ define <4 x float> @trunc_signed_v4f32_nsz(<4 x float> %x) #0 { ; CHECK-SAME: <4 x float> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[I:%.*]] = 
fptosi <4 x float> [[X]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[R:%.*]] = sitofp <4 x i32> [[I]] to <4 x float> -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[R]] ; %i = fptosi <4 x float> %x to <4 x i32> @@ -206,9 +248,13 @@ define <2 x double> @trunc_signed_v2f64_nsz(<2 x double> %x) #0 { ; CHECK-SAME: <2 x double> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[I:%.*]] = fptosi <2 x double> [[X]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64> ; CHECK-NEXT: [[R:%.*]] = sitofp <2 x i64> [[I]] to <2 x double> -; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[R]] ; %i = fptosi <2 x double> %x to <2 x i64> @@ -221,9 +267,13 @@ define <4 x double> @trunc_signed_v4f64_nsz(<4 x double> %x) #0 { ; CHECK-SAME: <4 x double> [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64> ; CHECK-NEXT: [[I:%.*]] = fptosi <4 x double> [[X]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i64> ; CHECK-NEXT: [[R:%.*]] = sitofp <4 x 
i64> [[I]] to <4 x double> -; CHECK-NEXT: store <4 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[R]] ; %i = fptosi <4 x double> %x to <4 x i64> @@ -236,8 +286,11 @@ define float @trunc_unsigned_f32_disable_via_intrinsic(float %x) #0 { ; CHECK-SAME: float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 ; CHECK-NEXT: [[I:%.*]] = call i32 @llvm.fptoui.sat.i32.f32(float [[X]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = sext i1 [[TMP4]] to i32 ; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[I]] to float ; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret float [[R]] @@ -252,8 +305,11 @@ define double @trunc_signed_f64_disable_via_intrinsic(double %x) #0 { ; CHECK-SAME: double [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 ; CHECK-NEXT: [[I:%.*]] = call i64 @llvm.fptosi.sat.i64.f64(double [[X]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = sext i1 [[TMP4]] to i64 ; CHECK-NEXT: [[R:%.*]] = sitofp i64 [[I]] to double ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret double [[R]] From bbe046b17ef866db251b06aa361cf7861a212ed1 Mon Sep 17 00:00:00 2001 From: Demetrius Kanios Date: Mon, 11 May 2026 14:02:00 -0700 Subject: [PATCH 355/538] [WebAssembly][GlobalISel] Fix ordering of operands for calls and other issues (#196898) Fixes a few 
of issues with `WebAssemblyCallLowering::lowerCall`. - Fixes the ordering of operands on the call instruction. Defs (so call returns) must come before uses (call target and args). - Prevents the tail-call bail out from null derefing when the call base is empty (e.g. for libcalls). - Ensures that the reg class is always set for the return registers of the call instruction (before, if the regs didn't need splitting, it wouldn't assign a reg-class to the existing reg, causing failures later down the pipeline). --- .../GISel/WebAssemblyCallLowering.cpp | 29 +++++---- .../GlobalISel/irtranslator/call-basics.ll | 59 +++++++++++-------- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/GISel/WebAssemblyCallLowering.cpp b/llvm/lib/Target/WebAssembly/GISel/WebAssemblyCallLowering.cpp index f69030b6443d6..9f3a1d1ba7fa2 100644 --- a/llvm/lib/Target/WebAssembly/GISel/WebAssemblyCallLowering.cpp +++ b/llvm/lib/Target/WebAssembly/GISel/WebAssemblyCallLowering.cpp @@ -411,7 +411,7 @@ bool WebAssemblyCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; // TODO: tail calls - if (Info.CB->isMustTailCall()) + if (Info.IsMustTailCall) return false; // TODO: varargs @@ -430,14 +430,6 @@ bool WebAssemblyCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallInst = MIRBuilder.buildInstrNoInsert(WebAssembly::CALL); - if (Info.Callee.isGlobal()) { - CallInst.addGlobalAddress(Info.Callee.getGlobal()); - } else if (Info.Callee.isSymbol()) { - CallInst.addExternalSymbol(Info.Callee.getSymbolName()); - } else { - return false; - } - SmallVector SplitArgs; for (const ArgInfo &Arg : Info.OrigArgs) { @@ -457,6 +449,8 @@ bool WebAssemblyCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, splitToValueTypes(Arg, SplitArgs, DL, CallConv); } + SmallVector CallUseRegs; + for (ArgInfo &Arg : SplitArgs) { const EVT OrigVT = TLI.getValueType(DL, Arg.Ty); const MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, OrigVT); @@ -499,14 
+493,14 @@ bool WebAssemblyCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, for (unsigned Part = 0; Part < NumParts; ++Part) { Register NewReg = MRI.createVirtualRegister(&NewRegClass); MRI.setType(NewReg, NewLLT); - CallInst.addUse(NewReg); + CallUseRegs.push_back(NewReg); Arg.Regs[Part] = NewReg; } buildCopyToRegs(MIRBuilder, Arg.Regs, Arg.OrigRegs[0], OrigLLT, NewLLT, extendOpFromFlags(Arg.Flags[0])); } else { - CallInst.addUse(Arg.Regs[0]); + CallUseRegs.push_back(Arg.Regs[0]); } } @@ -603,11 +597,24 @@ bool WebAssemblyCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, buildCopyFromRegs(MIRBuilder, Ret.OrigRegs, Ret.Regs, OrigLLT, NewLLT, Ret.Flags[0]); } else { + MRI.setRegClass(Ret.Regs[0], &NewRegClass); CallInst.addDef(Ret.Regs[0]); } } } + if (Info.Callee.isGlobal()) { + CallInst.addGlobalAddress(Info.Callee.getGlobal()); + } else if (Info.Callee.isSymbol()) { + CallInst.addExternalSymbol(Info.Callee.getSymbolName()); + } else { + return false; + } + + for (Register Reg : CallUseRegs) { + CallInst.addUse(Reg); + } + if (!Info.CanLowerReturn) insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs, Info.DemoteRegister, Info.DemoteStackIndex); diff --git a/llvm/test/CodeGen/WebAssembly/GlobalISel/irtranslator/call-basics.ll b/llvm/test/CodeGen/WebAssembly/GlobalISel/irtranslator/call-basics.ll index 8acbf587192e4..d767ed5104dae 100644 --- a/llvm/test/CodeGen/WebAssembly/GlobalISel/irtranslator/call-basics.ll +++ b/llvm/test/CodeGen/WebAssembly/GlobalISel/irtranslator/call-basics.ll @@ -26,8 +26,8 @@ define i32 @call_ret_i32_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_i32_args_none, def %0(i32), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(i32), implicit-def $arguments + ; CHECK-NEXT: [[CALL:%[0-9]+]]:i32(i32) = CALL @ret_i32_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: RETURN [[CALL]](i32), 
implicit-def $arguments %ret = call i32 @ret_i32_args_none() ret i32 %ret } @@ -38,8 +38,8 @@ define i64 @call_ret_i64_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_i64_args_none, def %0(i64), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(i64), implicit-def $arguments + ; CHECK-NEXT: [[CALL:%[0-9]+]]:i64(i64) = CALL @ret_i64_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: RETURN [[CALL]](i64), implicit-def $arguments %ret = call i64 @ret_i64_args_none() ret i64 %ret } @@ -50,8 +50,8 @@ define float @call_ret_f32_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_f32_args_none, def %0(f32), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(f32), implicit-def $arguments + ; CHECK-NEXT: [[CALL:%[0-9]+]]:f32(f32) = CALL @ret_f32_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: RETURN [[CALL]](f32), implicit-def $arguments %ret = call float @ret_f32_args_none() ret float %ret } @@ -62,20 +62,27 @@ define double @call_ret_f64_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_f64_args_none, def %0(f64), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(f64), implicit-def $arguments + ; CHECK-NEXT: [[CALL:%[0-9]+]]:f64(f64) = CALL @ret_f64_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: RETURN [[CALL]](f64), implicit-def $arguments %ret = call double @ret_f64_args_none() ret double %ret } declare ptr @ret_ptr_args_none() define ptr @call_ret_ptr_args_none() { - ; CHECK-LABEL: name: call_ret_ptr_args_none - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $arguments - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_ptr_args_none, def %0(p0), 
implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(p0), implicit-def $arguments + ; WASM32-LABEL: name: call_ret_ptr_args_none + ; WASM32: bb.1 (%ir-block.0): + ; WASM32-NEXT: liveins: $arguments + ; WASM32-NEXT: {{ $}} + ; WASM32-NEXT: [[CALL:%[0-9]+]]:i32(p0) = CALL @ret_ptr_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; WASM32-NEXT: RETURN [[CALL]](p0), implicit-def $arguments + ; + ; WASM64-LABEL: name: call_ret_ptr_args_none + ; WASM64: bb.1 (%ir-block.0): + ; WASM64-NEXT: liveins: $arguments + ; WASM64-NEXT: {{ $}} + ; WASM64-NEXT: [[CALL:%[0-9]+]]:i64(p0) = CALL @ret_ptr_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; WASM64-NEXT: RETURN [[CALL]](p0), implicit-def $arguments %ret = call ptr @ret_ptr_args_none() ret ptr %ret } @@ -86,8 +93,8 @@ define %externref @call_ret_externref_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_externref_args_none, def %0(p10), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(p10), implicit-def $arguments + ; CHECK-NEXT: [[CALL:%[0-9]+]]:externref(p10) = CALL @ret_externref_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: RETURN [[CALL]](p10), implicit-def $arguments %ret = call %externref @ret_externref_args_none() ret %externref %ret } @@ -98,8 +105,8 @@ define %funcref @call_ret_funcref_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_funcref_args_none, def %0(p20), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: RETURN %0(p20), implicit-def $arguments + ; CHECK-NEXT: [[CALL:%[0-9]+]]:funcref(p20) = CALL @ret_funcref_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: RETURN [[CALL]](p20), implicit-def $arguments %ret = call %funcref @ret_funcref_args_none() ret %funcref %ret } 
@@ -132,8 +139,8 @@ define i128 @call_ret_i128_args_none() { ; MULTIVAL-SIMD: bb.1 (%ir-block.0): ; MULTIVAL-SIMD-NEXT: liveins: $arguments ; MULTIVAL-SIMD-NEXT: {{ $}} - ; MULTIVAL-SIMD-NEXT: CALL @ret_i128_args_none, def %1(i64), def %2(i64), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; MULTIVAL-SIMD-NEXT: [[MV:%[0-9]+]]:_(i128) = G_MERGE_VALUES %1(i64), %2(i64) + ; MULTIVAL-SIMD-NEXT: [[CALL:%[0-9]+]]:i64(i64), [[CALL1:%[0-9]+]]:i64(i64) = CALL @ret_i128_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; MULTIVAL-SIMD-NEXT: [[MV:%[0-9]+]]:_(i128) = G_MERGE_VALUES [[CALL]](i64), [[CALL1]](i64) ; MULTIVAL-SIMD-NEXT: [[UV:%[0-9]+]]:i64(i64), [[UV1:%[0-9]+]]:i64(i64) = G_UNMERGE_VALUES [[MV]](i128) ; MULTIVAL-SIMD-NEXT: RETURN [[UV]](i64), [[UV1]](i64), implicit-def $arguments %ret = call i128 @ret_i128_args_none() @@ -146,8 +153,8 @@ define half @call_ret_f16_args_none() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $arguments ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: CALL @ret_f16_args_none, def %1(i32), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(f16) = G_TRUNC %1(i32) + ; CHECK-NEXT: [[CALL:%[0-9]+]]:i32(i32) = CALL @ret_f16_args_none, implicit-def $arguments, implicit $sp32, implicit $sp64 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(f16) = G_TRUNC [[CALL]](i32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:i32(i32) = G_ANYEXT [[TRUNC]](f16) ; CHECK-NEXT: RETURN [[ANYEXT]](i32), implicit-def $arguments %ret = call half @ret_f16_args_none() @@ -394,8 +401,8 @@ define %complexret @call_complex(<8 x i16> %a, [3 x ptr] %b, i1 %c) { ; WASM32-MULTIVAL-SIMD-NEXT: [[ARGUMENT_i32_3:%[0-9]+]]:i32(i32) = ARGUMENT_i32 4, implicit $arguments ; WASM32-MULTIVAL-SIMD-NEXT: [[TRUNC:%[0-9]+]]:_(i1) = G_TRUNC [[ARGUMENT_i32_3]](i32) ; WASM32-MULTIVAL-SIMD-NEXT: [[ANYEXT:%[0-9]+]]:i32(i32) = G_ANYEXT [[TRUNC]](i1) - ; WASM32-MULTIVAL-SIMD-NEXT: CALL @complex, [[ARGUMENT_v8i16_]](<8 x i16>), [[ARGUMENT_i32_]](p0), 
[[ARGUMENT_i32_1]](p0), [[ARGUMENT_i32_2]](p0), [[ANYEXT]](i32), def %6(i32), def %7(p0), def %8(<4 x f32>), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; WASM32-MULTIVAL-SIMD-NEXT: RETURN %6(i32), %7(p0), %8(<4 x f32>), implicit-def $arguments + ; WASM32-MULTIVAL-SIMD-NEXT: [[CALL:%[0-9]+]]:i32(i32), [[CALL1:%[0-9]+]]:i32(p0), [[CALL2:%[0-9]+]]:v128(<4 x f32>) = CALL @complex, [[ARGUMENT_v8i16_]](<8 x i16>), [[ARGUMENT_i32_]](p0), [[ARGUMENT_i32_1]](p0), [[ARGUMENT_i32_2]](p0), [[ANYEXT]](i32), implicit-def $arguments, implicit $sp32, implicit $sp64 + ; WASM32-MULTIVAL-SIMD-NEXT: RETURN [[CALL]](i32), [[CALL1]](p0), [[CALL2]](<4 x f32>), implicit-def $arguments ; ; WASM64-MULTIVAL-SIMD-LABEL: name: call_complex ; WASM64-MULTIVAL-SIMD: bb.1 (%ir-block.0): @@ -408,8 +415,8 @@ define %complexret @call_complex(<8 x i16> %a, [3 x ptr] %b, i1 %c) { ; WASM64-MULTIVAL-SIMD-NEXT: [[ARGUMENT_i32_:%[0-9]+]]:i32(i32) = ARGUMENT_i32 4, implicit $arguments ; WASM64-MULTIVAL-SIMD-NEXT: [[TRUNC:%[0-9]+]]:_(i1) = G_TRUNC [[ARGUMENT_i32_]](i32) ; WASM64-MULTIVAL-SIMD-NEXT: [[ANYEXT:%[0-9]+]]:i32(i32) = G_ANYEXT [[TRUNC]](i1) - ; WASM64-MULTIVAL-SIMD-NEXT: CALL @complex, [[ARGUMENT_v8i16_]](<8 x i16>), [[ARGUMENT_i64_]](p0), [[ARGUMENT_i64_1]](p0), [[ARGUMENT_i64_2]](p0), [[ANYEXT]](i32), def %6(i32), def %7(p0), def %8(<4 x f32>), implicit-def $arguments, implicit $sp32, implicit $sp64 - ; WASM64-MULTIVAL-SIMD-NEXT: RETURN %6(i32), %7(p0), %8(<4 x f32>), implicit-def $arguments + ; WASM64-MULTIVAL-SIMD-NEXT: [[CALL:%[0-9]+]]:i32(i32), [[CALL1:%[0-9]+]]:i64(p0), [[CALL2:%[0-9]+]]:v128(<4 x f32>) = CALL @complex, [[ARGUMENT_v8i16_]](<8 x i16>), [[ARGUMENT_i64_]](p0), [[ARGUMENT_i64_1]](p0), [[ARGUMENT_i64_2]](p0), [[ANYEXT]](i32), implicit-def $arguments, implicit $sp32, implicit $sp64 + ; WASM64-MULTIVAL-SIMD-NEXT: RETURN [[CALL]](i32), [[CALL1]](p0), [[CALL2]](<4 x f32>), implicit-def $arguments %ret = call %complexret @complex(<8 x i16> %a, [3 x ptr] %b, i1 %c) ret 
%complexret %ret } From 437803e4af2046801a0144404698796d4b0e55d2 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 11 May 2026 14:11:00 -0700 Subject: [PATCH 356/538] [compiler-rt][common] Only unmap stacks the runtime has actually mapped (#179000) When the sanitizer hasn't mapped the alternate signal stack, but the host program has (like LLVM), the runtime still tries to unilaterally unmap the alternate stack. Instead, the runtime should just check if it's actually mmaped the alternate stack, and only unmap it if it has. --------- Co-authored-by: Vitaly Buka --- compiler-rt/lib/asan/asan_thread.cpp | 4 ++-- compiler-rt/lib/asan/asan_thread.h | 1 + .../lib/sanitizer_common/sanitizer_common.h | 4 ++-- .../sanitizer_common/sanitizer_fuchsia.cpp | 4 ++-- .../sanitizer_posix_libcdep.cpp | 12 ++++++---- .../lib/sanitizer_common/sanitizer_win.cpp | 5 ++-- .../TestCases/Posix/multiple_sigaltstack.cpp | 24 +++++++++++++++++++ 7 files changed, 42 insertions(+), 12 deletions(-) create mode 100644 compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp diff --git a/compiler-rt/lib/asan/asan_thread.cpp b/compiler-rt/lib/asan/asan_thread.cpp index 32ab723e89001..ec96ec9b8082b 100644 --- a/compiler-rt/lib/asan/asan_thread.cpp +++ b/compiler-rt/lib/asan/asan_thread.cpp @@ -132,7 +132,7 @@ void AsanThread::Destroy() { CHECK_EQ(this, thread); malloc_storage().CommitBack(); if (common_flags()->use_sigaltstack) - UnsetAlternateSignalStack(); + UnsetAlternateSignalStack(altstack_base_); FlushToDeadThreadStats(&stats_); // We also clear the shadow on thread destruction because // some code may still be executing in later TSD destructors @@ -288,7 +288,7 @@ void AsanThread::ThreadStart(ThreadID os_id) { asanThreadRegistry().StartThread(tid(), os_id, ThreadType::Regular, nullptr); if (common_flags()->use_sigaltstack) - SetAlternateSignalStack(); + altstack_base_ = SetAlternateSignalStack(); } AsanThread *CreateMainThread() { diff --git a/compiler-rt/lib/asan/asan_thread.h 
b/compiler-rt/lib/asan/asan_thread.h index e9ca6b6a59016..6dbac7f3d8162 100644 --- a/compiler-rt/lib/asan/asan_thread.h +++ b/compiler-rt/lib/asan/asan_thread.h @@ -190,6 +190,7 @@ class AsanThread { AsanStats stats_; bool unwinding_; uptr extra_spill_area_; + void* altstack_base_ = nullptr; char start_data_[]; }; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index 4dd2187df2272..2aacec1cd8994 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -387,8 +387,8 @@ void ReportDeadlySignal(const SignalContext &sig, u32 tid, const void *unwind_context); // Alternative signal stack (POSIX-only). -void SetAlternateSignalStack(); -void UnsetAlternateSignalStack(); +void* SetAlternateSignalStack(); +void UnsetAlternateSignalStack(void* altstack_base); bool IsSignalHandlerFromSanitizer(int signum); bool SetSignalHandlerFromSanitizer(int signum, bool new_state); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp index 3c61b60802996..4dc31f4e51efa 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp @@ -93,8 +93,8 @@ void CheckMPROTECT() {} void PlatformPrepareForSandboxing(void *args) {} void DisableCoreDumperIfNecessary() {} void InstallDeadlySignalHandlers(SignalHandlerType handler) {} -void SetAlternateSignalStack() {} -void UnsetAlternateSignalStack() {} +void* SetAlternateSignalStack() { return nullptr; } +void UnsetAlternateSignalStack(void* altstack_base) {} bool SignalContext::IsStackOverflow() const { return false; } void SignalContext::DumpAllRegisters(void *context) { UNIMPLEMENTED(); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 35b596de30fff..056eb677f0441 100644 --- 
a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -188,12 +188,13 @@ static uptr GetAltStackSize() { return SIGSTKSZ * 4; } -void SetAlternateSignalStack() { +void* SetAlternateSignalStack() { stack_t altstack, oldstack; CHECK_EQ(0, sigaltstack(nullptr, &oldstack)); // If the alternate stack is already in place, do nothing. // Android always sets an alternate stack, but it's too small for us. - if (!SANITIZER_ANDROID && !(oldstack.ss_flags & SS_DISABLE)) return; + if (!SANITIZER_ANDROID && !(oldstack.ss_flags & SS_DISABLE)) + return nullptr; // TODO(glider): the mapped stack should have the MAP_STACK flag in the // future. It is not required by man 2 sigaltstack now (they're using // malloc()). @@ -201,15 +202,18 @@ void SetAlternateSignalStack() { altstack.ss_sp = (char *)MmapOrDie(altstack.ss_size, __func__); altstack.ss_flags = 0; CHECK_EQ(0, sigaltstack(&altstack, nullptr)); + return altstack.ss_sp; } -void UnsetAlternateSignalStack() { +void UnsetAlternateSignalStack(void* altstack_base) { stack_t altstack, oldstack; altstack.ss_sp = nullptr; altstack.ss_flags = SS_DISABLE; altstack.ss_size = GetAltStackSize(); // Some sane value required on Darwin. CHECK_EQ(0, sigaltstack(&altstack, &oldstack)); - UnmapOrDie(oldstack.ss_sp, oldstack.ss_size); + if (altstack_base && altstack_base == oldstack.ss_sp) { + UnmapOrDie(oldstack.ss_sp, oldstack.ss_size); + } } bool IsSignalHandlerFromSanitizer(int signum) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp index ed4f60deeffc8..70f4a936c1329 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp @@ -913,11 +913,12 @@ void ReportFile::Write(const char *buffer, uptr length) { } } -void SetAlternateSignalStack() { +void* SetAlternateSignalStack() { // FIXME: Decide what to do on Windows. 
+ return nullptr; } -void UnsetAlternateSignalStack() { +void UnsetAlternateSignalStack(void* altstack_base) { // FIXME: Decide what to do on Windows. } diff --git a/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp b/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp new file mode 100644 index 0000000000000..a7cf4b3a43b91 --- /dev/null +++ b/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp @@ -0,0 +1,24 @@ +// RUN: %clangxx_asan %s -o %t && %env_asan_opts=use_sigaltstack=1 %run %t + +#include +#include +#include +#include + +char global_alt_stack[4096 * 4]; + +int main() { + stack_t altstack; + altstack.ss_sp = global_alt_stack; + altstack.ss_size = sizeof(global_alt_stack); + altstack.ss_flags = 0; + if (sigaltstack(&altstack, nullptr) != 0) { + perror("sigaltstack"); + exit(1); + } + + // UnsetAlternateSignalStack will get called when the thread exists. If we + // don't *only* unmap a signal stack the runtime owns, we'll get a fault on + // the munmap operation, since that memory isn't mmaped. + return 0; +} From 8f854d8b0011b9e282edd18f24bd3cb1a2c14657 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 11 May 2026 22:12:38 +0100 Subject: [PATCH 357/538] [lldb] Add specific error message for "process plugin" with no plugin loaded (#196933) Fixes #196535. The error was: > command is not implemented Which is incorrect. It is now: > no process plugin commands are currently registered Which is not very helpful either but it's not wrong at least. We could expand it but I'm not sure what would help anyone here, given how rare it is that anyone encounters this anyway. 
--- lldb/source/Commands/CommandObjectProcess.cpp | 4 ++++ .../test/Shell/Commands/command-process-plugin-no-plugin.test | 3 +++ 2 files changed, 7 insertions(+) create mode 100644 lldb/test/Shell/Commands/command-process-plugin-no-plugin.test diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index d7d6a9152e377..366a44e1c6639 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -954,6 +954,10 @@ class CommandObjectProcessPlugin : public CommandObjectProxy { return process->GetPluginCommandObject(); return nullptr; } + + llvm::StringRef GetUnsupportedError() override { + return "no process plugin commands are currently registered"; + } }; // CommandObjectProcessLoad diff --git a/lldb/test/Shell/Commands/command-process-plugin-no-plugin.test b/lldb/test/Shell/Commands/command-process-plugin-no-plugin.test new file mode 100644 index 0000000000000..b582bda04954a --- /dev/null +++ b/lldb/test/Shell/Commands/command-process-plugin-no-plugin.test @@ -0,0 +1,3 @@ +# RUN: not %lldb -b -o "process plugin" 2>&1 | FileCheck %s + +# CHECK: error: no process plugin commands are currently registered From b32f9825fc5f32a99f4cb9c7cb52be9cc3bd1225 Mon Sep 17 00:00:00 2001 From: Roy Shi Date: Mon, 11 May 2026 14:14:25 -0700 Subject: [PATCH 358/538] [gsymutil] Fix build error in 196448 and remove warning message (#197028) **Problem:** #196448 broke the linux build of a test `DebugInfoGSYMTests`. See this [buildbot](https://lab.llvm.org/buildbot/#/builders/10/builds/28337). **Root cause:** The `BinaryFormat` is a dependency that is required when the build is done with `-DBUILD_SHARED_LIBS=ON`. This explains why some of the linux builds passes, while the above buildbot fails. **Fix:** This patch fixes this by adding that dependency. 
This patch also removes the warning message that was added by the same patch, which should be added in a different way, as pointed out by this [comment](https://github.com/llvm/llvm-project/pull/196448#discussion_r3221162626). **Tests:** ``` cmake -G Ninja -DLLVM_ENABLE_PROJECTS="clang;lldb" -DCMAKE_BUILD_TYPE=Debug -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" -DBUILD_SHARED_LIBS=ON ~/public_llvm/llvm-project/llvm ninja -C ~/public_llvm/build DebugInfoGSYMTests DebugInfoDWARFTests LD_LIBRARY_PATH=~/public_llvm/build/lib ~/public_llvm/build/unittests/DebugInfo/GSYM/DebugInfoGSYMTests LD_LIBRARY_PATH=~/public_llvm/build/lib ~/public_llvm/build/unittests/DebugInfo/DWARF/DebugInfoDWARFTests ``` Co-authored-by: royshi --- llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h | 7 +------ llvm/unittests/DebugInfo/GSYM/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h index 3ca0767574670..81a5e54a71145 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h @@ -184,13 +184,8 @@ template DieType unwrapReferencedTypedefType(DieType D) { if (!Unwrapped || Unwrapped.getTag() != dwarf::DW_TAG_typedef) return Unwrapped; - if (!Visited.insert(Unwrapped.getOffset()).second) { - WithColor::warning() - << "typedef cycle detected: DW_TAG_typedef at offset 0x" - << utohexstr(Unwrapped.getOffset()) - << " references itself through DW_TAG_typedef chain\n"; + if (!Visited.insert(Unwrapped.getOffset()).second) return Unwrapped; - } D = Unwrapped; } diff --git a/llvm/unittests/DebugInfo/GSYM/CMakeLists.txt b/llvm/unittests/DebugInfo/GSYM/CMakeLists.txt index 3b1482fc1b1e8..a421cf168ce05 100644 --- a/llvm/unittests/DebugInfo/GSYM/CMakeLists.txt +++ b/llvm/unittests/DebugInfo/GSYM/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + BinaryFormat DebugInfoDWARF DebugInfoGSYM 
MC From 7be448e6e090fb19785350edb1d121f6deb6cca2 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Mon, 11 May 2026 14:32:27 -0700 Subject: [PATCH 359/538] [flang][cuda][openacc] Don't apply CUDA Fortran COMMON/EQUIVALENCE rule to internal UseDevice marker (#197036) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `CUDADataAttr::UseDevice` is not user-spellable; the symbol that actually lives in COMMON/EQUIVALENCE carries no CUDA attribute. The CUDA Fortran restriction (CUDA Fortran Programming Guide §3.2) does not apply to it. Exclude `UseDevice` from the COMMON/EQUIVALENCE check alongside the existing `Pinned` exclusion, and add a Semantics regression test. --- flang/lib/Semantics/check-declarations.cpp | 9 ++++- .../OpenACC/acc-host-data-common.f90 | 36 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 flang/test/Semantics/OpenACC/acc-host-data-common.f90 diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index a11cfb818818c..d1475176215d3 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -1243,7 +1243,14 @@ void CheckHelper::CheckObjectEntity( case common::CUDADataAttr::UseDevice: break; } - if (attr != common::CUDADataAttr::Pinned) { + // CUDADataAttr::UseDevice is not user-spellable; it is set internally on + // construct-scoped symbol copies created for OpenACC `host_data + // use_device(...)` operands so that later passes can resolve them to the + // device address. The original symbol that actually lives in COMMON or an + // equivalence group carries no CUDA attribute, so the CUDA Fortran + // restrictions on user-written ATTRIBUTES(...) do not apply to it. 
+ if (attr != common::CUDADataAttr::Pinned && + attr != common::CUDADataAttr::UseDevice) { if (details.commonBlock()) { messages_.Say( "Object '%s' with ATTRIBUTES(%s) may not be in COMMON"_err_en_US, diff --git a/flang/test/Semantics/OpenACC/acc-host-data-common.f90 b/flang/test/Semantics/OpenACC/acc-host-data-common.f90 new file mode 100644 index 0000000000000..25662fb47edcb --- /dev/null +++ b/flang/test/Semantics/OpenACC/acc-host-data-common.f90 @@ -0,0 +1,36 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenacc + +! Regression test: a variable that lives in a COMMON block must not be +! rejected when it appears in an OpenACC `host_data use_device(...)` +! clause. The CUDA Fortran restriction "object with ATTRIBUTES(USEDEVICE) +! may not be in COMMON" applies only to user-spelled +! `attributes(usedevice)` dummy arguments, not to the internal +! CUDADataAttr::UseDevice marker that name resolution attaches to +! construct-scoped copies of `use_device` operands. + +subroutine vadd(a, b, c, n) + real(8) :: a(*), b(*), c(*) + integer :: n, i + do i = 1, n - 1 + c(i) = a(i) + b(i) + end do +end subroutine + +program acc_host_data_common + integer, parameter :: N = 100 + real(8) :: a(2:N), b(2:N), c0(2:N), c1(2:N) + common /arrays/ a, b, c0, c1 + integer :: i + + !$acc data copy(a, b, c0) + !$acc parallel loop + do i = 2, N + a(i) = i + b(i) = 2.0_8 * i + end do + + !$acc host_data use_device(a, b, c0) + call vadd(a, b, c0, N) + !$acc end host_data + !$acc end data +end program From 0562d1749fc3266698180a86fb67c5bb8d0868dc Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 11 May 2026 14:41:13 -0700 Subject: [PATCH 360/538] [CIR] Implement copy construction of EH catch values (#196419) This change implements handling of exception variables that require copy construction (on Itanium targets) before they can be used in a catch handler, using the cir.contruct_catch_param operation. Some targets, such as MSABI, do not need to perform an explicit copy. 
The construct_catch_param operation is effectively a noop for those cases and will be lowered as such when the EHABI lowering is implemented for those targets. Assisted-by: Cursor / claude-opus-4.7-thinking-xhigh --- clang/lib/CIR/CodeGen/CIRGenException.cpp | 105 +++- .../CodeGen/try-catch-non-trivial-copy.cpp | 541 ++++++++++++++++++ 2 files changed, 643 insertions(+), 3 deletions(-) create mode 100644 clang/test/CIR/CodeGen/try-catch-non-trivial-copy.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenException.cpp b/clang/lib/CIR/CodeGen/CIRGenException.cpp index 3c956801f6b52..054a6003ae762 100644 --- a/clang/lib/CIR/CodeGen/CIRGenException.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenException.cpp @@ -298,6 +298,87 @@ static mlir::Value callBeginCatch(CIRGenFunction &cgf, mlir::Value ehToken, return beginCatch.getExnPtr(); } +/// Get or create the catch-init copy thunk for \p catchParam. +/// +/// The copy thunk has signature `void(T*, T*)` (where `T` is the catch +/// parameter type) and contains the normal aggregate emission of the catch +/// parameter's init expression. +/// +/// The thunk name is keyed off the catch parameter's canonical type mangled +/// name, so a single translation unit emits at most one thunk per catch type. 
+static cir::FuncOp getOrCreateCopyThunk(CIRGenFunction &cgf, + const VarDecl &catchParam, + cir::PointerType paramAddrType, + mlir::Location loc) { + CIRGenModule &cgm = cgf.cgm; + CIRGenBuilderTy &builder = cgm.getBuilder(); + mlir::ModuleOp mod = cgm.getModule(); + + const Expr *copyExpr = catchParam.getInit(); + assert(copyExpr && "non-trivial copy expects a copy expression"); + + llvm::SmallString<128> thunkName; + llvm::raw_svector_ostream thunkNameStream(thunkName); + thunkNameStream << "__clang_cir_catch_copy_"; + cgm.getCXXABI().getMangleContext().mangleCanonicalTypeName( + catchParam.getType(), thunkNameStream); + + if (cir::FuncOp existing = cgm.lookupFuncOp(thunkName)) + return existing; + + mlir::Type voidTy = cir::VoidType::get(builder.getContext()); + auto thunkTy = cir::FuncType::get({paramAddrType, paramAddrType}, voidTy, + /*isVarArg=*/false); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(mod.getBody()); + cir::FuncOp thunk = cir::FuncOp::create(builder, loc, thunkName, thunkTy); + cgm.insertGlobalSymbol(thunk); + thunk.setLinkage(cir::GlobalLinkageKind::LinkOnceODRLinkage); + thunk.setGlobalVisibility(cir::VisibilityKind::Hidden); + thunk->setAttr(cir::CIRDialect::getCatchCopyThunkAttrName(), + builder.getUnitAttr()); + + mlir::Block *entry = thunk.addEntryBlock(); + builder.setInsertionPointToStart(entry); + + // Use a fresh CIRGenFunction to drive the body emission. We need just enough + // state for emitAggExpr / emitCXXConstructorCall to compute the call-site + // argument attributes; the helper has no AST decl, no exception scopes, and + // no return value, so we bypass the full startFunction/finishFunction + // machinery. + CIRGenFunction subCgf(cgm, builder); + subCgf.curFn = thunk; + + // Some emission paths (e.g. materializing temporaries for default args via + // emitAnyExprToTemp) need both a current source location and a lexical + // scope to anchor allocas. 
Since we bypass startFunction, install both + // explicitly for the lifetime of the thunk's body emission. + CIRGenFunction::SourceLocRAIIObject thunkLoc(subCgf, loc); + CIRGenFunction::LexicalScope thunkScope(subCgf, loc, entry); + + // Bind the OpaqueValueExpr at the source position of the catch parameter's + // copy expression to an LValue at the thunk's `src` block argument. + LValue srcLV = subCgf.makeNaturalAlignAddrLValue(entry->getArgument(1), + catchParam.getType()); + CIRGenFunction::OpaqueValueMapping opaqueValue( + subCgf, OpaqueValueExpr::findInCopyConstruct(copyExpr), srcLV); + + // Drive the construction into the helper's `dest` block argument via the + // normal aggregate-emission machinery so that `ExprWithCleanups`, + // converting/inheriting constructors, and any future copy-construction + // shapes flow through unchanged. + Address destAddr = subCgf.makeNaturalAddressForPointer( + entry->getArgument(0), catchParam.getType(), clang::CharUnits::Zero()); + subCgf.emitAggExpr( + copyExpr, AggValueSlot::forAddr( + destAddr, Qualifiers(), AggValueSlot::IsNotDestructed, + AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap)); + + cir::ReturnOp::create(builder, loc); + return thunk; +} + /// A "special initializer" callback for initializing a catch /// parameter during catch initialization. static void initCatchParam(CIRGenFunction &cgf, CIRGenBuilderTy &builder, @@ -339,10 +420,28 @@ static void initCatchParam(CIRGenFunction &cgf, CIRGenBuilderTy &builder, } } - mlir::Value exnPtr = callBeginCatch(cgf, ehToken, builder.getVoidPtrTy()); CIRGenFunction::AutoVarEmission var = cgf.emitAutoVarAlloca(catchParam); - cir::InitCatchParamOp::create(builder, cgf.getLoc(loc), exnPtr, - var.getAllocatedAddress().getPointer(), kind); + Address paramAddr = var.getAllocatedAddress(); + mlir::Location mloc = cgf.getLoc(loc); + + if (kind == cir::InitCatchKind::NonTrivialCopy) { + // Sanitizer-checked construction (UBSan vptr/derived-class checks, etc.) 
+ // would require additional adornments that cir.construct_catch_param does + // not yet carry. + assert(!cir::MissingFeatures::sanitizers()); + + auto paramAddrType = + mlir::cast(paramAddr.getPointer().getType()); + cir::FuncOp thunk = + getOrCreateCopyThunk(cgf, catchParam, paramAddrType, mloc); + cir::ConstructCatchParamOp::create(builder, mloc, ehToken, + paramAddr.getPointer(), kind, + thunk.getSymName()); + } + + mlir::Value exnPtr = callBeginCatch(cgf, ehToken, builder.getVoidPtrTy()); + cir::InitCatchParamOp::create(builder, mloc, exnPtr, paramAddr.getPointer(), + kind); cgf.emitAutoVarCleanups(var); } diff --git a/clang/test/CIR/CodeGen/try-catch-non-trivial-copy.cpp b/clang/test/CIR/CodeGen/try-catch-non-trivial-copy.cpp new file mode 100644 index 0000000000000..af458bdb0f73d --- /dev/null +++ b/clang/test/CIR/CodeGen/try-catch-non-trivial-copy.cpp @@ -0,0 +1,541 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: cir-opt -cir-hoist-allocas -cir-flatten-cfg %t.cir -o %t.flat.cir +// RUN: FileCheck --input-file=%t.flat.cir %s -check-prefix=CIR-FLAT +// RUN: cir-opt -cir-hoist-allocas -cir-flatten-cfg -cir-eh-abi-lowering %t.cir -o %t.eh.cir +// RUN: FileCheck --input-file=%t.eh.cir %s -check-prefix=CIR-AFTER-EHABI +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void mayThrow(); + +//===----------------------------------------------------------------------===// +// Case 1: Plain non-trivial copy constructor `T(const T &)`. 
+//===----------------------------------------------------------------------===// + +struct MyException { + MyException(); + MyException(const MyException &); + ~MyException(); + int get(); +}; + +int test_non_trivial_exception_copy() { + int rv = 0; + try { + mayThrow(); + } catch (MyException e) { + rv = e.get(); + } + return rv; +} + +// --- CIR (out of CodeGen, before any pre-lowering passes) --- +// +// The catch-binding step is `cir.construct_catch_param non_trivial_copy`, +// which references a CIRGen-synthesized helper thunk via `copy_fn`. The +// helper is keyed off the catch type's mangled typeinfo name, has +// `linkonce_odr` linkage, hidden visibility, and the +// `cir.eh.catch_copy_thunk` attribute so EHABI lowering can find or remove +// it later. + +// CIR-LABEL: cir.func {{.*}} @_Z31test_non_trivial_exception_copyv() +// CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[RV:.*]] = cir.alloca !s32i, !cir.ptr, ["rv", init] +// CIR: cir.scope { +// CIR: %[[E:.*]] = cir.alloca !rec_MyException, !cir.ptr, ["e"] +// CIR: cir.try { +// CIR: cir.call @_Z8mayThrowv() : () -> () +// CIR: } catch [type #cir.global_view<@_ZTI11MyException> : !cir.ptr] (%[[EH_TOKEN:.*]]: !cir.eh_token {{.*}}) { +// CIR: cir.construct_catch_param non_trivial_copy %[[EH_TOKEN]] to %[[E]] using @__clang_cir_catch_copy__ZTS11MyException : !cir.ptr +// CIR: %[[CATCH_TOKEN:.*]], %[[EXN_PTR:.*]] = cir.begin_catch %[[EH_TOKEN]] : !cir.eh_token -> (!cir.catch_token, !cir.ptr) +// CIR: cir.cleanup.scope { +// CIR: cir.init_catch_param non_trivial_copy %[[EXN_PTR]] to %[[E]] : !cir.ptr, !cir.ptr +// CIR: cir.cleanup.scope { +// CIR: %[[GET:.*]] = cir.call @_ZN11MyException3getEv(%[[E]]) : (!cir.ptr {{.*}}) -> (!s32i {{.*}}) +// CIR: cir.store{{.*}} %[[GET]], %[[RV]] +// CIR: cir.yield +// CIR: } cleanup all { +// CIR: cir.call @_ZN11MyExceptionD1Ev(%[[E]]) nothrow +// CIR: cir.yield +// CIR: } +// CIR: cir.yield +// CIR: } cleanup all { +// CIR: cir.end_catch 
%[[CATCH_TOKEN]] : !cir.catch_token +// CIR: cir.yield +// CIR: } +// CIR: } unwind (%[[EH_TOKEN:.*]]: !cir.eh_token {{.*}}) { +// CIR: cir.resume %[[EH_TOKEN]] : !cir.eh_token +// CIR: } +// CIR: } +// CIR: cir.return + +// CIR-LABEL: cir.func linkonce_odr hidden @__clang_cir_catch_copy__ZTS11MyException +// CIR-SAME: attributes {cir.eh.catch_copy_thunk} +// CIR: cir.call @_ZN11MyExceptionC1ERKS_(%{{.*}}, %{{.*}}) +// CIR: cir.return + +// --- CIR-FLAT (after hoist-allocas + flatten-cfg) --- +// +// Regions are inlined into a flat CFG. The catch-binding op survives +// flattening unchanged; its `copy_fn` symbol reference is preserved. +// +// Pure-fallthrough blocks (just a `cir.br` to the next block) are skipped in +// the checks; we only verify branch terminators with non-trivial successors +// and the labels of blocks that carry real work. + +// CIR-FLAT-LABEL: cir.func {{.*}} @_Z31test_non_trivial_exception_copyv() +// CIR-FLAT: cir.try_call @_Z8mayThrowv() ^[[T1F_CONT:bb[0-9]+]], ^[[T1F_LPAD:bb[0-9]+]] : () -> () + +// On the unwind edge: initiate the in-flight exception and feed it to the +// dispatch block. +// CIR-FLAT: ^[[T1F_LPAD]]: +// CIR-FLAT: %[[T1F_TOK:.*]] = cir.eh.initiate : !cir.eh_token +// CIR-FLAT: cir.br ^[[T1F_DISP:bb[0-9]+]](%[[T1F_TOK]] : !cir.eh_token) + +// Dispatch matches the in-flight exception against `MyException` and falls +// through to the resume block on a miss. +// CIR-FLAT: ^[[T1F_DISP]](%{{.*}}: !cir.eh_token): +// CIR-FLAT: cir.eh.dispatch %{{.*}} : !cir.eh_token [ +// CIR-FLAT: catch(#cir.global_view<@_ZTI11MyException> : !cir.ptr) : ^[[T1F_CATCH:bb[0-9]+]], +// CIR-FLAT: unwind : ^[[T1F_RESUME:bb[0-9]+]] +// CIR-FLAT: ] + +// Catch handler: bind exception via the helper thunk, begin the catch, +// initialize the catch parameter, run the body. 
+// CIR-FLAT: ^[[T1F_CATCH]](%{{.*}}: !cir.eh_token): +// CIR-FLAT: cir.construct_catch_param non_trivial_copy %{{.*}} to %{{.*}} using @__clang_cir_catch_copy__ZTS11MyException : !cir.ptr +// CIR-FLAT: cir.begin_catch %{{.*}} : !cir.eh_token -> (!cir.catch_token, !cir.ptr) +// CIR-FLAT: cir.init_catch_param non_trivial_copy %{{.*}} to %{{.*}} : !cir.ptr, !cir.ptr +// CIR-FLAT: cir.try_call @_ZN11MyException3getEv(%{{.*}}) ^[[T1F_GET_OK:bb[0-9]+]], ^[[T1F_GET_LPAD:bb[0-9]+]] + +// Normal path of `e.get()`: store result and run `e`'s dtor. +// CIR-FLAT: ^[[T1F_GET_OK]]: +// CIR-FLAT: cir.store align(4) %{{.*}} +// CIR-FLAT: cir.call @_ZN11MyExceptionD1Ev(%{{.*}}) nothrow + +// Unwind path of `e.get()`: initiate cleanup, run `e`'s dtor, end cleanup. +// CIR-FLAT: ^[[T1F_GET_LPAD]]: +// CIR-FLAT: %[[T1F_CL_TOK:.*]] = cir.eh.initiate cleanup : !cir.eh_token +// CIR-FLAT: cir.br ^[[T1F_CL_DTOR:bb[0-9]+]](%[[T1F_CL_TOK]] : !cir.eh_token) + +// CIR-FLAT: ^[[T1F_CL_DTOR]](%[[T1F_CL_TOK2:.*]]: !cir.eh_token): +// CIR-FLAT: cir.begin_cleanup %[[T1F_CL_TOK2]] : !cir.eh_token -> !cir.cleanup_token +// CIR-FLAT: cir.call @_ZN11MyExceptionD1Ev(%{{.*}}) nothrow +// CIR-FLAT: cir.end_cleanup %{{.*}} : !cir.cleanup_token +// CIR-FLAT: cir.br ^[[T1F_CL_END:bb[0-9]+]](%[[T1F_CL_TOK2]] : !cir.eh_token) + +// Normal-path end_catch (after the catch body completed). +// CIR-FLAT: cir.end_catch + +// End of the catch's unwind path: end_catch then resume. +// CIR-FLAT: ^[[T1F_CL_END]](%[[T1F_RES_TOK:.*]]: !cir.eh_token): +// CIR-FLAT: cir.begin_cleanup +// CIR-FLAT: cir.end_catch +// CIR-FLAT: cir.end_cleanup +// CIR-FLAT: cir.resume %[[T1F_RES_TOK]] : !cir.eh_token + +// The outer dispatch unwind block (unmatched type). +// CIR-FLAT: ^[[T1F_RESUME]](%[[T1F_OUT_TOK:.*]]: !cir.eh_token): +// CIR-FLAT: cir.resume %[[T1F_OUT_TOK]] : !cir.eh_token + +// Final return. 
+// CIR-FLAT: cir.return %{{.*}} : !s32i + +// CIR-AFTER-EHABI-LABEL: cir.func {{.*}} @_Z31test_non_trivial_exception_copyv() +// CIR-AFTER-EHABI: cir.try_call @_Z8mayThrowv() ^[[T1E_CONT:bb[0-9]+]], ^[[T1E_LPAD:bb[0-9]+]] + +// CIR-AFTER-EHABI: ^[[T1E_LPAD]]: +// CIR-AFTER-EHABI: %[[T1E_EXN:exception_ptr]], %[[T1E_TID:type_id]] = cir.eh.inflight_exception [@_ZTI11MyException] +// CIR-AFTER-EHABI: cir.br ^{{bb[0-9]+}}(%[[T1E_EXN]], %[[T1E_TID]] : !cir.ptr, !u32i) + +// At the dispatch: compare type_ids and branch to either the catch or resume. +// CIR-AFTER-EHABI: %[[T1E_TYPEID:.*]] = cir.eh.typeid @_ZTI11MyException +// CIR-AFTER-EHABI: %[[T1E_EQ:.*]] = cir.cmp eq %{{.*}}, %[[T1E_TYPEID]] : !u32i +// CIR-AFTER-EHABI: cir.brcond %[[T1E_EQ]] ^[[T1E_CATCH:bb[0-9]+]](%{{.*}}, %{{.*}} : !cir.ptr, !u32i), ^[[T1E_RESUME:bb[0-9]+]](%{{.*}}, %{{.*}} : !cir.ptr, !u32i) + +// In the catch block: get the adjusted exception pointer, then invoke the +// inlined copy ctor directly. The normal-edge target is a pure-fallthrough +// block that we don't pin; the unwind-edge target is the terminate handler. +// CIR-AFTER-EHABI: ^[[T1E_CATCH]](%[[T1E_RAW:.*]]: !cir.ptr, %{{.*}}: !u32i): +// CIR-AFTER-EHABI: %[[T1E_ADJ:.*]] = cir.call @__cxa_get_exception_ptr(%[[T1E_RAW]]) nothrow : (!cir.ptr) -> !cir.ptr +// CIR-AFTER-EHABI: %[[T1E_ADJT:.*]] = cir.cast bitcast %[[T1E_ADJ]] : !cir.ptr -> !cir.ptr +// CIR-AFTER-EHABI: cir.try_call @_ZN11MyExceptionC1ERKS_(%{{.*}}, %[[T1E_ADJT]]) ^{{bb[0-9]+}}, ^[[T1E_TERM:bb[0-9]+]] + +// On the inlined copy's normal-edge fallthrough chain: __cxa_begin_catch and +// run the catch body. +// CIR-AFTER-EHABI: cir.call @__cxa_begin_catch(%[[T1E_RAW]]) : (!cir.ptr) -> !cir.ptr +// CIR-AFTER-EHABI: cir.try_call @_ZN11MyException3getEv(%{{.*}}) ^[[T1E_GET_OK:bb[0-9]+]], ^[[T1E_GET_LPAD:bb[0-9]+]] + +// Normal path: store get's result, dtor, end_catch, fall through. 
+// CIR-AFTER-EHABI: ^[[T1E_GET_OK]]: +// CIR-AFTER-EHABI: cir.store align(4) %{{.*}} +// CIR-AFTER-EHABI: cir.call @_ZN11MyExceptionD1Ev(%{{.*}}) nothrow + +// Unwind path: inflight_exception cleanup, dtor, end_catch, resume. +// CIR-AFTER-EHABI: ^[[T1E_GET_LPAD]]: +// CIR-AFTER-EHABI: %{{.*}}, %{{.*}} = cir.eh.inflight_exception cleanup +// CIR-AFTER-EHABI: cir.call @_ZN11MyExceptionD1Ev(%{{.*}}) nothrow +// CIR-AFTER-EHABI: cir.call @__cxa_end_catch() : () -> () +// CIR-AFTER-EHABI: cir.call @__cxa_end_catch() : () -> () +// CIR-AFTER-EHABI: cir.resume.flat %{{.*}} + +// Outer dispatch's resume. +// CIR-AFTER-EHABI: ^[[T1E_RESUME]](%{{.*}}: !cir.ptr, %{{.*}}: !u32i): +// CIR-AFTER-EHABI: cir.resume.flat %{{.*}}, %{{.*}} + +// CIR-AFTER-EHABI: cir.return %{{.*}} : !s32i + +// Terminate landing pad. +// CIR-AFTER-EHABI: ^[[T1E_TERM]]: +// CIR-AFTER-EHABI: %{{.*}}, %{{.*}} = cir.eh.inflight_exception catch_all +// CIR-AFTER-EHABI: cir.call @__clang_call_terminate(%{{.*}}) nothrow {noreturn} : (!cir.ptr) -> () +// CIR-AFTER-EHABI: cir.unreachable + + +// The catch-copy thunk is fully inlined and removed by the post-lowering +// sweep, so it is no longer present at this stage. +// CIR-AFTER-EHABI-NOT: @__clang_cir_catch_copy__ZTS11MyException + +// --- LLVM (CIR -> LLVM IR via the full pipeline) --- + +// LLVM-LABEL: define dso_local noundef i32 @_Z31test_non_trivial_exception_copyv() #{{[0-9]+}} personality ptr @__gxx_personality_v0 +// LLVM: invoke void @_Z8mayThrowv() +// LLVM: to label %[[T1L_CONT:.*]] unwind label %[[T1L_LPAD:.*]] + +// LLVM: [[T1L_LPAD]]: +// LLVM: landingpad { ptr, i32 } +// LLVM: catch ptr @_ZTI11MyException + +// Type-id dispatch. 
+// LLVM: %[[T1L_TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI11MyException) +// LLVM: %[[T1L_MATCH:.*]] = icmp eq i32 %{{.*}}, %[[T1L_TYPEID]] +// LLVM: br i1 %[[T1L_MATCH]], label %[[T1L_CATCH:.*]], label %[[T1L_RESUME:.*]] + +// Catch handler: __cxa_get_exception_ptr followed by the inlined copy ctor +// invoke (the catch-copy thunk has been inlined away, so the ctor is called +// directly with a terminate landing pad). The normal-edge target is a +// pure-fallthrough block that we don't pin. +// LLVM: [[T1L_CATCH]]: +// LLVM: %[[T1L_ADJ:.*]] = call ptr @__cxa_get_exception_ptr( +// LLVM: invoke void @_ZN11MyExceptionC1ERKS_(ptr {{.*}}, ptr{{.*}} %[[T1L_ADJ]]) +// LLVM: to label %{{.*}} unwind label %[[T1L_TERM:.*]] + +// On the inlined copy's normal-edge fallthrough chain: __cxa_begin_catch. +// LLVM: call ptr @__cxa_begin_catch( +// LLVM: invoke noundef i32 @_ZN11MyException3getEv( +// LLVM: to label %[[T1L_GET_OK:.*]] unwind label %[[T1L_GET_LPAD:.*]] + +// LLVM: [[T1L_GET_OK]]: +// LLVM: store i32 %{{.*}} +// LLVM: call void @_ZN11MyExceptionD1Ev( + +// LLVM: [[T1L_GET_LPAD]]: +// LLVM: landingpad { ptr, i32 } +// LLVM: cleanup +// LLVM: call void @_ZN11MyExceptionD1Ev( +// LLVM: call void @__cxa_end_catch() +// LLVM: call void @__cxa_end_catch() +// LLVM: resume { ptr, i32 } + +// LLVM: [[T1L_RESUME]]: +// LLVM: resume { ptr, i32 } +// LLVM: ret i32 + +// LLVM: [[T1L_TERM]]: +// LLVM: landingpad { ptr, i32 } +// LLVM: catch ptr null +// LLVM: call void @__clang_call_terminate( +// LLVM: unreachable + +// The catch-copy thunk is inlined away during EHABI lowering, so it should +// not appear in the LLVM IR. +// LLVM-NOT: @__clang_cir_catch_copy__ZTS11MyException + +// --- OGCG (classic CodeGen reference) --- +// +// Classic CodeGen does not synthesize a catch-init thunk; it inlines the +// `__cxa_get_exception_ptr` + copy ctor invocation at the catch site, with a +// terminate landing pad on the copy's unwind edge. 
+ +// OGCG-LABEL: define {{.*}} i32 @_Z31test_non_trivial_exception_copyv() +// OGCG: entry: +// OGCG: %[[O1_RV:.*]] = alloca i32 +// OGCG: %[[O1_E:.*]] = alloca %struct.MyException +// OGCG: store i32 0, ptr %[[O1_RV]] +// OGCG: invoke void @_Z8mayThrowv() +// OGCG: to label %[[O1_CONT:.*]] unwind label %[[O1_LPAD:.*]] + +// OGCG: [[O1_LPAD]]: +// OGCG: landingpad { ptr, i32 } +// OGCG: catch ptr @_ZTI11MyException + +// OGCG: catch.dispatch: +// OGCG: %[[O1_TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI11MyException) +// OGCG: %[[O1_MATCH:.*]] = icmp eq i32 %{{.*}}, %[[O1_TYPEID]] +// OGCG: br i1 %[[O1_MATCH]], label %[[O1_CATCH:.*]], label %[[EH_RESUME:.*]] + +// OGCG: [[O1_CATCH]]: +// OGCG: %[[O1_ADJ:.*]] = call ptr @__cxa_get_exception_ptr(ptr %{{.*}}) +// OGCG: invoke void @_ZN11MyExceptionC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) %[[O1_E]], ptr noundef nonnull align 1 dereferenceable(1) %[[O1_ADJ]]) +// OGCG: to label %[[O1_INVOKE_CONT1:.*]] unwind label %[[O1_TERM:.*]] + +// OGCG: [[O1_INVOKE_CONT1]]: +// OGCG: call ptr @__cxa_begin_catch( +// OGCG: invoke noundef i32 @_ZN11MyException3getEv(ptr noundef nonnull align 1 dereferenceable(1) %[[O1_E]]) +// OGCG: to label %[[O1_INVOKE_CONT3:.*]] unwind label %[[O1_LPAD2:.*]] + +// OGCG: [[O1_INVOKE_CONT3]]: +// OGCG: store i32 %{{.*}}, ptr %[[O1_RV]] +// OGCG: call void @_ZN11MyExceptionD1Ev(ptr noundef nonnull align 1 dereferenceable(1) %[[O1_E]]) +// OGCG: call void @__cxa_end_catch() +// OGCG: br label %[[O1_TRY_CONT:.*]] + +// OGCG: [[O1_TRY_CONT]]: +// OGCG: load i32, ptr %[[O1_RV]] +// OGCG: ret i32 + +// OGCG: [[O1_LPAD2]]: +// OGCG: landingpad { ptr, i32 } +// OGCG: cleanup +// OGCG: call void @_ZN11MyExceptionD1Ev(ptr noundef nonnull align 1 dereferenceable(1) %[[O1_E]]) +// OGCG: invoke void @__cxa_end_catch() +// OGCG: to label %{{.*}} unwind label %[[O1_TERM]] + +// OGCG: [[EH_RESUME]]: +// OGCG: resume { ptr, i32 } + +// OGCG: [[O1_TERM]]: +// OGCG: landingpad { ptr, i32 } +// 
OGCG: catch ptr null +// OGCG: call void @__clang_call_terminate( +// OGCG: unreachable + +//===----------------------------------------------------------------------===// +// Case 2: Copy constructor with extra (default) arguments. +// +// `WithDefault(const WithDefault &, int = 42)` is a valid copy constructor +// per [class.copy.ctor]/1 since the only non-`const T &` parameter has a +// default argument. emitAggExpr lets the CXXDefaultArgExpr fill-in flow +// through naturally; the thunk body materializes the `42` constant before +// calling the underlying ctor. +//===----------------------------------------------------------------------===// + +struct WithDefault { + WithDefault(); + WithDefault(const WithDefault &, int = 42); + ~WithDefault(); +}; + +int test_copy_ctor_extra_args() { + try { + mayThrow(); + } catch (WithDefault w) { + return 0; + } + return -1; +} + +// --- CIR --- + +// CIR-LABEL: cir.func {{.*}} @_Z25test_copy_ctor_extra_argsv() +// CIR: cir.construct_catch_param non_trivial_copy %{{.*}} to %{{.*}} using @__clang_cir_catch_copy__ZTS11WithDefault : !cir.ptr + +// CIR-LABEL: cir.func linkonce_odr hidden @__clang_cir_catch_copy__ZTS11WithDefault( +// CIR-SAME: attributes {cir.eh.catch_copy_thunk} +// CIR: %[[FORTYTWO:.*]] = cir.const #cir.int<42> : !s32i +// CIR: cir.call @_ZN11WithDefaultC1ERKS_i({{.*}}, {{.*}}, %[[FORTYTWO]]) +// CIR: cir.return + +// --- CIR-FLAT --- +// +// Because the catch handler does `return 0`, the function uses a +// `__cleanup_dest_slot` to dispatch through the catch end_catch and into +// either the inner-`return 0` or the outer-`return -1`. We don't pin the +// exact switch shape here, but we do verify the EH-relevant control flow. 
+ +// CIR-FLAT-LABEL: cir.func {{.*}} @_Z25test_copy_ctor_extra_argsv() +// CIR-FLAT: %{{.*}} = cir.alloca !s32i, !cir.ptr, ["__cleanup_dest_slot", cleanup_dest_slot] +// CIR-FLAT: cir.try_call @_Z8mayThrowv() ^[[T2F_CONT:bb[0-9]+]], ^[[T2F_LPAD:bb[0-9]+]] + +// CIR-FLAT: ^[[T2F_LPAD]]: +// CIR-FLAT: %[[T2F_TOK:.*]] = cir.eh.initiate : !cir.eh_token +// CIR-FLAT: cir.br ^[[T2F_DISP:bb[0-9]+]](%[[T2F_TOK]] : !cir.eh_token) + +// CIR-FLAT: ^[[T2F_DISP]](%{{.*}}: !cir.eh_token): +// CIR-FLAT: cir.eh.dispatch %{{.*}} : !cir.eh_token [ +// CIR-FLAT: catch(#cir.global_view<@_ZTI11WithDefault> : !cir.ptr) : ^[[T2F_CATCH:bb[0-9]+]], +// CIR-FLAT: unwind : ^[[T2F_RESUME:bb[0-9]+]] +// CIR-FLAT: ] + +// CIR-FLAT: ^[[T2F_CATCH]](%{{.*}}: !cir.eh_token): +// CIR-FLAT: cir.construct_catch_param non_trivial_copy %{{.*}} to %{{.*}} using @__clang_cir_catch_copy__ZTS11WithDefault : !cir.ptr +// CIR-FLAT: cir.begin_catch %{{.*}} : !cir.eh_token -> (!cir.catch_token, !cir.ptr) +// CIR-FLAT: cir.init_catch_param non_trivial_copy %{{.*}} to %{{.*}} + +// Catch body sets retval=0 and cleanup_dest=1. +// CIR-FLAT: cir.store %{{.*}} +// CIR-FLAT: cir.store %{{.*}} + +// `w`'s destructor. +// CIR-FLAT: cir.call @_ZN11WithDefaultD1Ev(%{{.*}}) nothrow + +// Cleanup-dest dispatch and end_catch. +// CIR-FLAT: cir.switch.flat +// CIR-FLAT: cir.end_catch + +// Return-or-fallthrough switch on cleanup_dest_slot. 
+// CIR-FLAT: cir.switch.flat +// CIR-FLAT: cir.load %{{.*}} : !cir.ptr, !s32i +// CIR-FLAT: cir.return %{{.*}} : !s32i + +// CIR-FLAT: ^[[T2F_RESUME]](%[[T2F_OUT_TOK:.*]]: !cir.eh_token): +// CIR-FLAT: cir.resume %[[T2F_OUT_TOK]] : !cir.eh_token + +// Tail of the function (when the try doesn't catch): +// CIR-FLAT: cir.return %{{.*}} : !s32i + +// --- CIR-AFTER-EHABI --- + +// CIR-AFTER-EHABI-LABEL: cir.func {{.*}} @_Z25test_copy_ctor_extra_argsv() +// CIR-AFTER-EHABI: cir.try_call @_Z8mayThrowv() ^[[T2E_CONT:bb[0-9]+]], ^[[T2E_LPAD:bb[0-9]+]] + +// CIR-AFTER-EHABI: ^[[T2E_LPAD]]: +// CIR-AFTER-EHABI: %{{.*}}, %{{.*}} = cir.eh.inflight_exception [@_ZTI11WithDefault] + +// CIR-AFTER-EHABI: %[[T2E_TYPEID:.*]] = cir.eh.typeid @_ZTI11WithDefault +// CIR-AFTER-EHABI: %[[T2E_EQ:.*]] = cir.cmp eq %{{.*}}, %[[T2E_TYPEID]] +// CIR-AFTER-EHABI: cir.brcond %[[T2E_EQ]] ^[[T2E_CATCH:bb[0-9]+]](%{{.*}}, %{{.*}} : !cir.ptr, !u32i), ^[[T2E_RESUME:bb[0-9]+]](%{{.*}}, %{{.*}} : !cir.ptr, !u32i) + +// In the catch block: get the adjusted exception pointer, materialize the +// `42` default argument, and invoke the inlined copy ctor directly. The +// thunk has been inlined into the catch handler, so the constant comes from +// the cloned thunk body. The normal-edge target is a pure-fallthrough block +// that we don't pin; the unwind-edge target is the terminate handler. +// CIR-AFTER-EHABI: ^[[T2E_CATCH]](%[[T2E_RAW:.*]]: !cir.ptr, %{{.*}}: !u32i): +// CIR-AFTER-EHABI: %[[T2E_ADJ:.*]] = cir.call @__cxa_get_exception_ptr(%[[T2E_RAW]]) nothrow +// CIR-AFTER-EHABI: %[[T2E_ADJT:.*]] = cir.cast bitcast %[[T2E_ADJ]] : !cir.ptr -> !cir.ptr +// CIR-AFTER-EHABI: %[[T2E_FORTYTWO:.*]] = cir.const #cir.int<42> : !s32i +// CIR-AFTER-EHABI: cir.try_call @_ZN11WithDefaultC1ERKS_i(%{{.*}}, %[[T2E_ADJT]], %[[T2E_FORTYTWO]]) ^{{bb[0-9]+}}, ^[[T2E_TERM:bb[0-9]+]] + +// On the inlined copy's normal-edge fallthrough chain: __cxa_begin_catch. 
+// CIR-AFTER-EHABI: cir.call @__cxa_begin_catch(%[[T2E_RAW]]) + +// Catch body and dispatch out via cleanup_dest_slot. +// CIR-AFTER-EHABI: cir.call @_ZN11WithDefaultD1Ev(%{{.*}}) nothrow +// CIR-AFTER-EHABI: cir.switch.flat +// CIR-AFTER-EHABI: cir.call @__cxa_end_catch() +// CIR-AFTER-EHABI: cir.switch.flat +// CIR-AFTER-EHABI: cir.return %{{.*}} : !s32i + +// CIR-AFTER-EHABI: ^[[T2E_RESUME]](%{{.*}}: !cir.ptr, %{{.*}}: !u32i): +// CIR-AFTER-EHABI: cir.resume.flat %{{.*}}, %{{.*}} + +// CIR-AFTER-EHABI: cir.return %{{.*}} : !s32i + +// Terminate landing pad. +// CIR-AFTER-EHABI: ^[[T2E_TERM]]: +// CIR-AFTER-EHABI: cir.eh.inflight_exception catch_all +// CIR-AFTER-EHABI: cir.call @__clang_call_terminate( + +// The catch-copy thunk is fully inlined and removed. +// CIR-AFTER-EHABI-NOT: @__clang_cir_catch_copy__ZTS11WithDefault + +// --- LLVM --- + +// LLVM-LABEL: define dso_local noundef i32 @_Z25test_copy_ctor_extra_argsv() +// LLVM: invoke void @_Z8mayThrowv() +// LLVM: to label %[[T2L_CONT:.*]] unwind label %[[T2L_LPAD:.*]] + +// LLVM: [[T2L_LPAD]]: +// LLVM: landingpad { ptr, i32 } +// LLVM: catch ptr @_ZTI11WithDefault + +// LLVM: %[[T2L_TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI11WithDefault) +// LLVM: icmp eq i32 %{{.*}}, %[[T2L_TYPEID]] +// LLVM: br i1 %{{.*}}, label %[[T2L_CATCH:.*]], label %[[T2L_RESUME:.*]] + +// LLVM: [[T2L_CATCH]]: +// LLVM: %[[T2L_ADJ:.*]] = call ptr @__cxa_get_exception_ptr( +// LLVM: invoke void @_ZN11WithDefaultC1ERKS_i(ptr {{.*}}, ptr {{.*}} %[[T2L_ADJ]], i32 {{.*}} 42) +// LLVM: to label %{{.*}} unwind label %[[T2L_TERM:.*]] + +// On the inlined copy's normal-edge fallthrough chain: __cxa_begin_catch. 
+// LLVM: call ptr @__cxa_begin_catch( + +// LLVM: call void @_ZN11WithDefaultD1Ev( +// LLVM: switch i32 %{{.*}} +// LLVM: call void @__cxa_end_catch() +// LLVM: switch i32 %{{.*}} +// LLVM: ret i32 %{{.*}} + +// LLVM: [[T2L_RESUME]]: +// LLVM: resume { ptr, i32 } +// LLVM: ret i32 + +// LLVM: [[T2L_TERM]]: +// LLVM: landingpad { ptr, i32 } +// LLVM: catch ptr null +// LLVM: call void @__clang_call_terminate( +// LLVM: unreachable + +// The catch-copy thunk is inlined away; no separate function definition. +// LLVM-NOT: @__clang_cir_catch_copy__ZTS11WithDefault + +// --- OGCG --- + +// OGCG-LABEL: define {{.*}} i32 @_Z25test_copy_ctor_extra_argsv() +// OGCG: entry: +// OGCG: invoke void @_Z8mayThrowv() +// OGCG: to label %[[O2_CONT:.*]] unwind label %[[O2_LPAD:.*]] + +// OGCG: [[O2_LPAD]]: +// OGCG: landingpad { ptr, i32 } +// OGCG: catch ptr @_ZTI11WithDefault + +// OGCG: catch.dispatch: +// OGCG: %[[O2_TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI11WithDefault) +// OGCG: icmp eq i32 %{{.*}}, %[[O2_TYPEID]] +// OGCG: br i1 %{{.*}}, label %[[O2_CATCH:.*]], label %[[EH_RESUME:.*]] + +// OGCG: [[O2_CATCH]]: +// OGCG: %[[O2_ADJ:.*]] = call ptr @__cxa_get_exception_ptr(ptr %{{.*}}) +// OGCG: invoke void @_ZN11WithDefaultC1ERKS_i(ptr {{.*}} %{{.*}}, ptr {{.*}} %[[O2_ADJ]], i32 {{.*}} 42) +// OGCG: to label %[[O2_INVOKE_CONT1:.*]] unwind label %[[O2_TERM:.*]] + +// OGCG: [[O2_INVOKE_CONT1]]: +// OGCG: call ptr @__cxa_begin_catch( +// OGCG: store i32 0, ptr %{{.*}} +// OGCG: call void @_ZN11WithDefaultD1Ev( +// OGCG: call void @__cxa_end_catch() +// OGCG: br label %[[RETURN:.*]] + +// OGCG: [[TRY_CONT:.*]]: +// OGCG: store i32 -1, ptr %{{.*}} +// OGCG: br label %[[RETURN:.*]] + +// OGCG: [[RETURN]]: +// OGCG: load i32, ptr %{{.*}} +// OGCG: ret i32 + +// OGCG: eh.resume: +// OGCG: resume { ptr, i32 } + +// OGCG: [[O2_TERM]]: +// OGCG: landingpad { ptr, i32 } +// OGCG: catch ptr null +// OGCG: call void @__clang_call_terminate( + 
+//===----------------------------------------------------------------------===// +// Module-level checks (runtime helpers materialized by EHABI lowering). +// +// These come last so they aren't partitioned by any per-function LABEL. +//===----------------------------------------------------------------------===// + +// CIR-AFTER-EHABI: cir.func private @__cxa_begin_catch( +// CIR-AFTER-EHABI: cir.func private @__cxa_end_catch() +// CIR-AFTER-EHABI: cir.func private @__cxa_get_exception_ptr( +// CIR-AFTER-EHABI: cir.func linkonce_odr hidden @__clang_call_terminate( +// CIR-AFTER-EHABI: cir.func private @_ZSt9terminatev() From 3ea7398eb4888e58fdd18e097faf9e515890c597 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 11 May 2026 14:42:22 -0700 Subject: [PATCH 361/538] [CIR] Implement implicit value init for aggregates (#197029) This implements the AggExprEmitter::VisitImplicitValueInitExpr function for CIR. The code to emit a zero-initializer was already present. We just needed to hook it up to the visitor. 
--- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 7 ++- .../CIR/CodeGen/implicit-value-init-expr.cpp | 49 +++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 9c9c88c325844..b561d4abeceda 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -515,8 +515,11 @@ class AggExprEmitter : public StmtVisitor { } void VisitImplicitValueInitExpr(ImplicitValueInitExpr *e) { - cgf.cgm.errorNYI(e->getSourceRange(), - "AggExprEmitter: VisitImplicitValueInitExpr"); + QualType ty = e->getType(); + mlir::Location loc = cgf.getLoc(e->getSourceRange()); + AggValueSlot slot = ensureSlot(loc, ty); + emitNullInitializationToLValue(loc, + cgf.makeAddrLValue(slot.getAddress(), ty)); } void VisitNoInitExpr(NoInitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr"); diff --git a/clang/test/CIR/CodeGen/implicit-value-init-expr.cpp b/clang/test/CIR/CodeGen/implicit-value-init-expr.cpp index 3babc7fb6e763..9dc1d1e76dbd4 100644 --- a/clang/test/CIR/CodeGen/implicit-value-init-expr.cpp +++ b/clang/test/CIR/CodeGen/implicit-value-init-expr.cpp @@ -58,3 +58,52 @@ void test_complex(void *p) { new (p) int _Complex(); } // OGCG: %[[P_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[TMP_P]], i32 0, i32 1 // OGCG: store i32 0, ptr %[[P_REAL_PTR]], align 4 // OGCG: store i32 0, ptr %[[P_IMAG_PTR]], align 4 + +struct S { + double filler; +}; + +struct Foo { + Foo() : bar_(), dbar_(), sbar_() {} + + int bar_[5]; + double dbar_[5]; + S sbar_[5]; +}; + +void test_aggregate() { + Foo a; +} + +// CIR: cir.func {{.*}} @_ZN3FooC2Ev( +// CIR: %[[THIS:.*]] = cir.load %{{.*}} +// CIR: %[[BAR:.*]] = cir.get_member %[[THIS]][0] {name = "bar_"} : !cir.ptr -> !cir.ptr> +// CIR: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array +// CIR: cir.store{{.*}} %[[ZERO]], %[[BAR]] : !cir.array, 
!cir.ptr> +// CIR: %[[DBAR:.*]] = cir.get_member %[[THIS]][1] {name = "dbar_"} : !cir.ptr -> !cir.ptr> +// CIR: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array +// CIR: cir.store{{.*}} %[[ZERO]], %[[DBAR]] : !cir.array, !cir.ptr> +// CIR: %[[SBAR:.*]] = cir.get_member %[[THIS]][2] {name = "sbar_"} : !cir.ptr -> !cir.ptr> +// CIR: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array +// CIR: cir.store{{.*}} %[[ZERO]], %[[SBAR]] : !cir.array, !cir.ptr> +// CIR: cir.return + +// LLVM: define {{.*}} void @_ZN3FooC2Ev( +// LLVM: %[[THIS:.*]] = load ptr, ptr +// LLVM: %[[BAR:.*]] = getelementptr inbounds nuw %struct.Foo, ptr %[[THIS]], i32 0, i32 0 +// LLVM: store [5 x i32] zeroinitializer, ptr %[[BAR]] +// LLVM: %[[DBAR:.*]] = getelementptr inbounds nuw %struct.Foo, ptr %[[THIS]], i32 0, i32 1 +// LLVM: store [5 x double] zeroinitializer, ptr %[[DBAR]] +// LLVM: %[[SBAR:.*]] = getelementptr inbounds nuw %struct.Foo, ptr %[[THIS]], i32 0, i32 2 +// LLVM: store [5 x %struct.S] zeroinitializer, ptr %[[SBAR]] +// LLVM: ret void + +// OGCG: define{{.*}} void @_ZN3FooC2Ev( +// OGCG: %[[THIS:.*]] = load ptr, ptr +// OGCG: %[[BAR:.*]] = getelementptr inbounds nuw %struct.Foo, ptr %[[THIS]], i32 0, i32 0 +// OGCG: call void @llvm.memset.p0.i64(ptr {{.*}}%[[BAR]], i8 0, i64 20, i1 false) +// OGCG: %[[DBAR:.*]] = getelementptr inbounds nuw %struct.Foo, ptr %[[THIS]], i32 0, i32 1 +// OGCG: call void @llvm.memset.p0.i64(ptr {{.*}}%[[DBAR]], i8 0, i64 40, i1 false) +// OGCG: %[[SBAR:.*]] = getelementptr inbounds nuw %struct.Foo, ptr %[[THIS]], i32 0, i32 2 +// OGCG: call void @llvm.memset.p0.i64(ptr {{.*}}%[[SBAR]], i8 0, i64 40, i1 false) +// OGCG: ret void From cddc09bb7bcd130f7c5c09456377b3cbcd57f0e6 Mon Sep 17 00:00:00 2001 From: AbdallahRashed <63146988+AbdallahRashed@users.noreply.github.com> Date: Mon, 11 May 2026 23:53:12 +0200 Subject: [PATCH 362/538] WIR [CIR][CodeGen] Remove dead srcAS code in emitCastLValue address spacecast (#197016) The srcAS variable was computed but never 
used since upstream's performAddrSpaceCast only takes (value, destType). Remove the dead code and its errorNYI for non-target address spaces. Fixes part of #192314 --- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index b8cfb74fc81e4..34a7e4d655610 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1676,16 +1676,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { LValue lv = emitLValue(e->getSubExpr()); QualType destTy = getContext().getPointerType(e->getType()); - clang::LangAS srcLangAS = e->getSubExpr()->getType().getAddressSpace(); - mlir::ptr::MemorySpaceAttrInterface srcAS; - if (clang::isTargetAddressSpace(srcLangAS)) - srcAS = cir::toCIRAddressSpaceAttr(getMLIRContext(), srcLangAS); - else - cgm.errorNYI( - e->getSourceRange(), - "emitCastLValue: address space conversion from unknown address " - "space"); - mlir::Value v = performAddrSpaceCast(lv.getPointer(), convertType(destTy)); return makeAddrLValue(Address(v, convertTypeForMem(e->getType()), From afb1c2e6e42f1c6ddbf079e976ad8414a3a0a30f Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 11 May 2026 15:21:13 -0700 Subject: [PATCH 363/538] workflows/release-doxygen: Add some security checks and input validation (#196769) We now ensure the job was started by a release manager before granting the contents: write permissions and we also validate the input to ensure it is a proper release string and not something malicious. 
--- .github/workflows/release-doxygen.yml | 32 +++++++++++++++++++ .../validate-release-version/action.yml | 15 +++++++++ 2 files changed, 47 insertions(+) create mode 100644 .github/workflows/validate-release-version/action.yml diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml index 246d7a6304095..7bf5e5e94f905 100644 --- a/.github/workflows/release-doxygen.yml +++ b/.github/workflows/release-doxygen.yml @@ -34,6 +34,36 @@ on: required: true jobs: + # This job checks permissions and validates inputs to prevent potential + # malicious actions. Since the release-doxygen job has contents: write + # permissions we need to be extra careful about who can run the job and + # what inputs can be provided. + release-doxygen-validate-input: + name: Release Doxygen Validate Input + runs-on: ubuntu-24.04 + environment: + name: release + deployment: false + permissions: + contents: read + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + sparse-checkout: | + .github/workflows/ + + - name: Check Permissions + uses: ./.github/workflows/require-release-manager + with: + LLVM_TOKEN_GENERATOR_CLIENT_ID: ${{ secrets.LLVM_TOKEN_GENERATOR_CLIENT_ID }} + LLVM_TOKEN_GENERATOR_PRIVATE_KEY: ${{ secrets.LLVM_TOKEN_GENERATOR_PRIVATE_KEY }} + + - name: Validate Input + uses: ./.github/workflows/validate-release-version + with: + release-version: ${{ inputs.release-version }} + release-doxygen: name: Build and Upload Release Doxygen runs-on: ubuntu-24.04 @@ -42,6 +72,8 @@ jobs: deployment: false permissions: contents: write + needs: + - release-doxygen-validate-input env: upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }} steps: diff --git a/.github/workflows/validate-release-version/action.yml b/.github/workflows/validate-release-version/action.yml new file mode 100644 index 0000000000000..1a10ad67cb549 --- /dev/null +++ 
b/.github/workflows/validate-release-version/action.yml @@ -0,0 +1,15 @@ +name: Validate Release String +description: >- + This checks to make sure that the given release-version string is well formed. +inputs: + release-version: + required: true + +runs: + using: "composite" + steps: + - env: + RELEASE_VERSION: ${{ inputs.release-version }} + shell: bash + run: | + grep -e '^[0-9]\+\.[0-9]\+\.[0-9]\+\(-rc[0-9]\+\)\?$' <<< "$RELEASE_VERSION" From 20e4a52446000af0789a84e4f4e40d3467480778 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Mon, 11 May 2026 15:28:22 -0700 Subject: [PATCH 364/538] [Clang][CodeGen][NFC] Refactor EmitAsmStmt method (#196885) Split up massive function into smaller, easier-to-digest chunks. This places the data into a single structure to limit the amount of parameters needed per function. --- clang/lib/CodeGen/CGStmt.cpp | 616 +++++++++++++++------------- clang/lib/CodeGen/CodeGenFunction.h | 80 ++++ 2 files changed, 414 insertions(+), 282 deletions(-) diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 7b6035a6968b1..71f88cdf58954 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -2605,32 +2605,29 @@ static llvm::MDNode *getAsmSrcLocInfo(const StringLiteral *Str, return llvm::MDNode::get(CGF.getLLVMContext(), Locs); } -static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect, - bool HasUnwindClobber, bool ReadOnly, - bool ReadNone, bool NoMerge, bool NoConvergent, - const AsmStmt &S, - const std::vector &ResultRegTypes, - const std::vector &ArgElemTypes, - CodeGenFunction &CGF, - std::vector &RegResults) { +void CodeGenFunction::UpdateAsmCallInst( + const AsmStmt &S, llvm::CallBase &Result, const AsmConstraintsInfo &AsmInfo, + bool HasSideEffect, bool HasUnwindClobber, bool NoMerge, bool NoConvergent, + std::vector &RegResults) { if (!HasUnwindClobber) Result.addFnAttr(llvm::Attribute::NoUnwind); if (NoMerge) Result.addFnAttr(llvm::Attribute::NoMerge); + // Attach readnone and 
readonly attributes. if (!HasSideEffect) { - if (ReadNone) + if (AsmInfo.ReadNone) Result.setDoesNotAccessMemory(); - else if (ReadOnly) + else if (AsmInfo.ReadOnly) Result.setOnlyReadsMemory(); } // Add elementtype attribute for indirect constraints. - for (auto Pair : llvm::enumerate(ArgElemTypes)) { + for (auto Pair : llvm::enumerate(AsmInfo.ArgElemTypes)) { if (Pair.value()) { auto Attr = llvm::Attribute::get( - CGF.getLLVMContext(), llvm::Attribute::ElementType, Pair.value()); + getLLVMContext(), llvm::Attribute::ElementType, Pair.value()); Result.addParamAttr(Pair.index(), Attr); } } @@ -2641,79 +2638,75 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect, if (const auto *gccAsmStmt = dyn_cast(&S); gccAsmStmt && (SL = dyn_cast(gccAsmStmt->getAsmStringExpr()))) { - Result.setMetadata("srcloc", getAsmSrcLocInfo(SL, CGF)); + Result.setMetadata("srcloc", getAsmSrcLocInfo(SL, *this)); } else { // At least put the line number on MS inline asm blobs and GCC asm constexpr // strings. llvm::Constant *Loc = - llvm::ConstantInt::get(CGF.Int64Ty, S.getAsmLoc().getRawEncoding()); + llvm::ConstantInt::get(Int64Ty, S.getAsmLoc().getRawEncoding()); Result.setMetadata("srcloc", - llvm::MDNode::get(CGF.getLLVMContext(), + llvm::MDNode::get(getLLVMContext(), llvm::ConstantAsMetadata::get(Loc))); } // Make inline-asm calls Key for the debug info feature Key Instructions. - CGF.addInstToNewSourceAtom(&Result, nullptr); + addInstToNewSourceAtom(&Result, nullptr); - if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent()) + if (!NoConvergent && getLangOpts().assumeFunctionsAreConvergent()) // Conservatively, mark all inline asm blocks in CUDA or OpenCL as // convergent (meaning, they may call an intrinsically convergent op, such // as bar.sync, and so can't have certain optimizations applied around // them) unless it's explicitly marked 'noconvergent'. 
Result.addFnAttr(llvm::Attribute::Convergent); // Extract all of the register value results from the asm. - if (ResultRegTypes.size() == 1) { + if (AsmInfo.ResultRegTypes.size() == 1) { RegResults.push_back(&Result); } else { - for (unsigned i = 0, e = ResultRegTypes.size(); i != e; ++i) { - llvm::Value *Tmp = CGF.Builder.CreateExtractValue(&Result, i, "asmresult"); + for (unsigned i = 0, e = AsmInfo.ResultRegTypes.size(); i != e; ++i) { + llvm::Value *Tmp = Builder.CreateExtractValue(&Result, i, "asmresult"); RegResults.push_back(Tmp); } } } -static void -EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S, - const llvm::ArrayRef RegResults, - const llvm::ArrayRef ResultRegTypes, - const llvm::ArrayRef ResultTruncRegTypes, - const llvm::ArrayRef ResultRegDests, - const llvm::ArrayRef ResultRegQualTys, - const llvm::BitVector &ResultTypeRequiresCast, - const std::vector>> - &ResultBounds) { - CGBuilderTy &Builder = CGF.Builder; - CodeGenModule &CGM = CGF.CGM; - llvm::LLVMContext &CTX = CGF.getLLVMContext(); - - assert(RegResults.size() == ResultRegTypes.size()); - assert(RegResults.size() == ResultTruncRegTypes.size()); - assert(RegResults.size() == ResultRegDests.size()); - // ResultRegDests can be also populated by addReturnRegisterOutputs() above, +void CodeGenFunction::EmitAsmStores( + const AsmStmt &S, const llvm::ArrayRef RegResults, + const AsmConstraintsInfo &AsmInfo) { + llvm::LLVMContext &CTX = getLLVMContext(); + + assert(RegResults.size() == AsmInfo.ResultRegTypes.size()); + assert(RegResults.size() == AsmInfo.ResultTruncRegTypes.size()); + assert(RegResults.size() == AsmInfo.ResultRegDests.size()); + + // ResultRegDests can also be populated by addReturnRegisterOutputs() above, // in which case its size may grow. 
- assert(ResultTypeRequiresCast.size() <= ResultRegDests.size()); - assert(ResultBounds.size() <= ResultRegDests.size()); + assert(AsmInfo.ResultTypeRequiresCast.size() <= + AsmInfo.ResultRegDests.size()); + assert(AsmInfo.ResultBounds.size() <= AsmInfo.ResultRegDests.size()); for (unsigned i = 0, e = RegResults.size(); i != e; ++i) { llvm::Value *Tmp = RegResults[i]; - llvm::Type *TruncTy = ResultTruncRegTypes[i]; + llvm::Type *TruncTy = AsmInfo.ResultTruncRegTypes[i]; + + if (i < AsmInfo.ResultBounds.size() && + AsmInfo.ResultBounds[i].has_value()) { + const auto [LowerBound, UpperBound] = AsmInfo.ResultBounds[i].value(); - if ((i < ResultBounds.size()) && ResultBounds[i].has_value()) { - const auto [LowerBound, UpperBound] = ResultBounds[i].value(); // FIXME: Support for nonzero lower bounds not yet implemented. assert(LowerBound == 0 && "Output operand lower bound is not zero."); + llvm::Constant *UpperBoundConst = llvm::ConstantInt::get(Tmp->getType(), UpperBound); llvm::Value *IsBooleanValue = Builder.CreateCmp(llvm::CmpInst::ICMP_ULT, Tmp, UpperBoundConst); llvm::Function *FnAssume = CGM.getIntrinsic(llvm::Intrinsic::assume); + Builder.CreateCall(FnAssume, IsBooleanValue); } // If the result type of the LLVM IR asm doesn't match the result type of // the expression, do the conversion. - if (ResultRegTypes[i] != TruncTy) { - + if (AsmInfo.ResultRegTypes[i] != TruncTy) { // Truncate the integer result to the right size, note that TruncTy can be // a pointer. if (TruncTy->isFloatingPointTy()) @@ -2736,30 +2729,34 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S, } } - ApplyAtomGroup Grp(CGF.getDebugInfo()); - LValue Dest = ResultRegDests[i]; + ApplyAtomGroup Grp(getDebugInfo()); + LValue Dest = AsmInfo.ResultRegDests[i]; + // ResultTypeRequiresCast elements correspond to the first // ResultTypeRequiresCast.size() elements of RegResults. 
- if ((i < ResultTypeRequiresCast.size()) && ResultTypeRequiresCast[i]) { - unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]); - Address A = Dest.getAddress().withElementType(ResultRegTypes[i]); - if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) { + if (i < AsmInfo.ResultTypeRequiresCast.size() && + AsmInfo.ResultTypeRequiresCast[i]) { + unsigned Size = getContext().getTypeSize(AsmInfo.ResultRegQualTys[i]); + Address A = Dest.getAddress().withElementType(AsmInfo.ResultRegTypes[i]); + + if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) { llvm::StoreInst *S = Builder.CreateStore(Tmp, A); - CGF.addInstToCurrentSourceAtom(S, S->getValueOperand()); + addInstToCurrentSourceAtom(S, S->getValueOperand()); continue; } - QualType Ty = - CGF.getContext().getIntTypeForBitwidth(Size, /*Signed=*/false); + QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed=*/false); if (Ty.isNull()) { const Expr *OutExpr = S.getOutputExpr(i); CGM.getDiags().Report(OutExpr->getExprLoc(), diag::err_store_value_to_reg); return; } - Dest = CGF.MakeAddrLValue(A, Ty); + + Dest = MakeAddrLValue(A, Ty); } - CGF.EmitStoreThroughLValue(RValue::get(Tmp), Dest); + + EmitStoreThroughLValue(RValue::get(Tmp), Dest); } } @@ -2772,7 +2769,6 @@ static void EmitHipStdParUnsupportedAsm(CodeGenFunction *CGF, Asm = GCCAsm->getAsmString(); auto &Ctx = CGF->CGM.getLLVMContext(); - auto StrTy = llvm::ConstantDataArray::getString(Ctx, Asm); auto FnTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), {StrTy->getType()}, false); @@ -2781,90 +2777,69 @@ static void EmitHipStdParUnsupportedAsm(CodeGenFunction *CGF, CGF->Builder.CreateCall(UBF, {StrTy}); } -void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { - // Pop all cleanup blocks at the end of the asm statement. - CodeGenFunction::RunCleanupsScope Cleanups(*this); - - // Assemble the final asm string. 
- std::string AsmString = S.generateAsmString(getContext()); - - // Get all the output and input constraints together. - SmallVector OutputConstraintInfos; - SmallVector InputConstraintInfos; - - bool IsHipStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice; +/// Gather and validate the output and input constraints for the given inline +/// assembly statement. This ensures that the constraints are valid for the +/// target and prepares them for further processing. +bool CodeGenFunction::GetOutputAndInputConstraints( + const AsmStmt &S, + SmallVectorImpl &OutputConstraintInfos, + SmallVectorImpl &InputConstraintInfos) { bool IsValidTargetAsm = true; - for (unsigned i = 0, e = S.getNumOutputs(); i != e && IsValidTargetAsm; i++) { + bool IsHipStdPar = getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice; + for (unsigned I = 0, E = S.getNumOutputs(); I != E && IsValidTargetAsm; I++) { StringRef Name; if (const GCCAsmStmt *GAS = dyn_cast(&S)) - Name = GAS->getOutputName(i); - TargetInfo::ConstraintInfo Info(S.getOutputConstraint(i), Name); - bool IsValid = getTarget().validateOutputConstraint(Info); (void)IsValid; + Name = GAS->getOutputName(I); + + TargetInfo::ConstraintInfo Info(S.getOutputConstraint(I), Name); + + bool IsValid = getTarget().validateOutputConstraint(Info); if (IsHipStdPar && !IsValid) IsValidTargetAsm = false; else assert(IsValid && "Failed to parse output constraint"); + OutputConstraintInfos.push_back(Info); } - for (unsigned i = 0, e = S.getNumInputs(); i != e && IsValidTargetAsm; i++) { + for (unsigned I = 0, E = S.getNumInputs(); I != E && IsValidTargetAsm; I++) { StringRef Name; if (const GCCAsmStmt *GAS = dyn_cast(&S)) - Name = GAS->getInputName(i); - TargetInfo::ConstraintInfo Info(S.getInputConstraint(i), Name); + Name = GAS->getInputName(I); + + TargetInfo::ConstraintInfo Info(S.getInputConstraint(I), Name); + bool IsValid = - getTarget().validateInputConstraint(OutputConstraintInfos, Info); + 
getTarget().validateInputConstraint(OutputConstraintInfos, Info); if (IsHipStdPar && !IsValid) IsValidTargetAsm = false; else assert(IsValid && "Failed to parse input constraint"); + InputConstraintInfos.push_back(Info); } - if (!IsValidTargetAsm) - return EmitHipStdParUnsupportedAsm(this, S); - - std::string Constraints; - - std::vector ResultRegDests; - std::vector ResultRegQualTys; - std::vector ResultRegTypes; - std::vector ResultTruncRegTypes; - std::vector ArgTypes; - std::vector ArgElemTypes; - std::vector Args; - llvm::BitVector ResultTypeRequiresCast; - std::vector>> ResultBounds; - - // Keep track of inout constraints. - std::string InOutConstraints; - std::vector InOutArgs; - std::vector InOutArgTypes; - std::vector InOutArgElemTypes; - - // Keep track of out constraints for tied input operand. - std::vector OutputConstraints; + return IsValidTargetAsm; +} +/// Process the output constraints of an inline assembly statement. This method +/// handles the complexity of determining whether an output should be a +/// register or memory operand, manages tied operands, and prepares the +/// necessary arguments for the LLVM inline asm call. +void CodeGenFunction::HandleOutputConstraints(const AsmStmt &S, + AsmConstraintsInfo &AsmInfo) { // Keep track of defined physregs. llvm::SmallSet PhysRegOutputs; - // An inline asm can be marked readonly if it meets the following conditions: - // - it doesn't have any sideeffects - // - it doesn't clobber memory - // - it doesn't return a value by-reference - // It can be marked readnone if it doesn't have any input memory constraints - // in addition to meeting the conditions listed above. - bool ReadOnly = true, ReadNone = true; - - for (unsigned i = 0, e = S.getNumOutputs(); i != e; i++) { - TargetInfo::ConstraintInfo &Info = OutputConstraintInfos[i]; + for (unsigned I = 0, E = S.getNumOutputs(); I != E; I++) { + TargetInfo::ConstraintInfo &Info = AsmInfo.OutputConstraintInfos[I]; // Simplify the output constraint. 
- std::string OutputConstraint(S.getOutputConstraint(i)); + std::string OutputConstraint(S.getOutputConstraint(I)); OutputConstraint = getTarget().simplifyConstraint( - StringRef(OutputConstraint).substr(1), &OutputConstraintInfos); + StringRef(OutputConstraint).substr(1), &AsmInfo.OutputConstraintInfos); - const Expr *OutExpr = S.getOutputExpr(i); + const Expr *OutExpr = S.getOutputExpr(I); OutExpr = OutExpr->IgnoreParenNoopCasts(getContext()); std::string GCCReg; @@ -2874,52 +2849,56 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { CGM.ErrorUnsupported(UnspStmt, Msg); }, &GCCReg); + // Give an error on multiple outputs to same physreg. if (!GCCReg.empty() && !PhysRegOutputs.insert(GCCReg).second) CGM.Error(S.getAsmLoc(), "multiple outputs to hard register: " + GCCReg); - OutputConstraints.push_back(OutputConstraint); + AsmInfo.OutputConstraints.push_back(OutputConstraint); LValue Dest = EmitLValue(OutExpr); - if (!Constraints.empty()) - Constraints += ','; + if (!AsmInfo.Constraints.empty()) + AsmInfo.Constraints += ','; // If this is a register output, then make the inline asm return it // by-value. If this is a memory result, return the value by-reference. 
QualType QTy = OutExpr->getType(); - const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) || - hasAggregateEvaluationKind(QTy); - if (!Info.allowsMemory() && IsScalarOrAggregate) { + const bool IsScalarOrAggregate = + hasScalarEvaluationKind(QTy) || hasAggregateEvaluationKind(QTy); - Constraints += "=" + OutputConstraint; - ResultRegQualTys.push_back(QTy); - ResultRegDests.push_back(Dest); + if (!Info.allowsMemory() && IsScalarOrAggregate) { + AsmInfo.Constraints += "=" + OutputConstraint; + AsmInfo.ResultRegQualTys.push_back(QTy); + AsmInfo.ResultRegDests.push_back(Dest); - ResultBounds.emplace_back(Info.getOutputOperandBounds()); + AsmInfo.ResultBounds.emplace_back(Info.getOutputOperandBounds()); llvm::Type *Ty = ConvertTypeForMem(QTy); - const bool RequiresCast = Info.allowsRegister() && + const bool RequiresCast = + Info.allowsRegister() && (getTargetHooks().isScalarizableAsmOperand(*this, Ty) || Ty->isAggregateType()); - ResultTruncRegTypes.push_back(Ty); - ResultTypeRequiresCast.push_back(RequiresCast); + AsmInfo.ResultTruncRegTypes.push_back(Ty); + AsmInfo.ResultTypeRequiresCast.push_back(RequiresCast); if (RequiresCast) { - unsigned Size = getContext().getTypeSize(QTy); - if (Size) + if (unsigned Size = getContext().getTypeSize(QTy)) Ty = llvm::IntegerType::get(getLLVMContext(), Size); else CGM.Error(OutExpr->getExprLoc(), "output size should not be zero"); } - ResultRegTypes.push_back(Ty); + + AsmInfo.ResultRegTypes.push_back(Ty); + // If this output is tied to an input, and if the input is larger, then // we need to set the actual result type of the inline asm node to be the // same as the input type. 
if (Info.hasMatchingInput()) { unsigned InputNo; for (InputNo = 0; InputNo != S.getNumInputs(); ++InputNo) { - TargetInfo::ConstraintInfo &Input = InputConstraintInfos[InputNo]; - if (Input.hasTiedOperand() && Input.getTiedOperand() == i) + TargetInfo::ConstraintInfo &Input = + AsmInfo.InputConstraintInfos[InputNo]; + if (Input.hasTiedOperand() && Input.getTiedOperand() == I) break; } assert(InputNo != S.getNumInputs() && "Didn't find matching input!"); @@ -2928,28 +2907,27 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { QualType OutputType = OutExpr->getType(); uint64_t InputSize = getContext().getTypeSize(InputTy); - if (getContext().getTypeSize(OutputType) < InputSize) { + if (getContext().getTypeSize(OutputType) < InputSize) // Form the asm to return the value as a larger integer or fp type. - ResultRegTypes.back() = ConvertType(InputTy); - } + AsmInfo.ResultRegTypes.back() = ConvertType(InputTy); } - if (llvm::Type* AdjTy = - getTargetHooks().adjustInlineAsmType(*this, OutputConstraint, - ResultRegTypes.back())) - ResultRegTypes.back() = AdjTy; - else { + + if (llvm::Type *AdjTy = getTargetHooks().adjustInlineAsmType( + *this, OutputConstraint, AsmInfo.ResultRegTypes.back())) + AsmInfo.ResultRegTypes.back() = AdjTy; + else CGM.getDiags().Report(S.getAsmLoc(), diag::err_asm_invalid_type_in_input) << OutExpr->getType() << OutputConstraint; - } // Update largest vector width for any vector types. - if (auto *VT = dyn_cast(ResultRegTypes.back())) + if (auto *VT = dyn_cast(AsmInfo.ResultRegTypes.back())) LargestVectorWidth = std::max((uint64_t)LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinValue()); } else { Address DestAddr = Dest.getAddress(); + // Matrix types in memory are represented by arrays, but accessed through // vector pointers, with the alignment specified on the access operation. // For inline assembly, update pointer arguments to use vector pointers. 
@@ -2958,87 +2936,105 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { if (isa(OutExpr->getType().getCanonicalType())) DestAddr = DestAddr.withElementType(ConvertType(OutExpr->getType())); - ArgTypes.push_back(DestAddr.getType()); - ArgElemTypes.push_back(DestAddr.getElementType()); - Args.push_back(DestAddr.emitRawPointer(*this)); - Constraints += "=*"; - Constraints += OutputConstraint; - ReadOnly = ReadNone = false; + AsmInfo.ArgTypes.push_back(DestAddr.getType()); + AsmInfo.ArgElemTypes.push_back(DestAddr.getElementType()); + AsmInfo.Args.push_back(DestAddr.emitRawPointer(*this)); + + AsmInfo.Constraints += "=*" + OutputConstraint; + AsmInfo.ReadOnly = false; + AsmInfo.ReadNone = false; } - if (Info.isReadWrite()) { - InOutConstraints += ','; + if (!Info.isReadWrite()) + continue; - const Expr *InputExpr = S.getOutputExpr(i); - llvm::Value *Arg; - llvm::Type *ArgElemType; - std::tie(Arg, ArgElemType) = EmitAsmInputLValue( - Info, Dest, InputExpr->getType(), InOutConstraints, - InputExpr->getExprLoc()); + AsmInfo.InOutConstraints += ','; - if (llvm::Type* AdjTy = - getTargetHooks().adjustInlineAsmType(*this, OutputConstraint, - Arg->getType())) - Arg = Builder.CreateBitCast(Arg, AdjTy); + const Expr *InputExpr = S.getOutputExpr(I); + llvm::Value *Arg; + llvm::Type *ArgElemType; + std::tie(Arg, ArgElemType) = + EmitAsmInputLValue(Info, Dest, InputExpr->getType(), + AsmInfo.InOutConstraints, InputExpr->getExprLoc()); - // Update largest vector width for any vector types. - if (auto *VT = dyn_cast(Arg->getType())) - LargestVectorWidth = - std::max((uint64_t)LargestVectorWidth, - VT->getPrimitiveSizeInBits().getKnownMinValue()); - // Only tie earlyclobber physregs. 
- if (Info.allowsRegister() && (GCCReg.empty() || Info.earlyClobber())) - InOutConstraints += llvm::utostr(i); - else - InOutConstraints += OutputConstraint; + if (llvm::Type *AdjTy = getTargetHooks().adjustInlineAsmType( + *this, OutputConstraint, Arg->getType())) + Arg = Builder.CreateBitCast(Arg, AdjTy); - InOutArgTypes.push_back(Arg->getType()); - InOutArgElemTypes.push_back(ArgElemType); - InOutArgs.push_back(Arg); - } - } + // Update largest vector width for any vector types. + if (auto *VT = dyn_cast(Arg->getType())) + LargestVectorWidth = + std::max((uint64_t)LargestVectorWidth, + VT->getPrimitiveSizeInBits().getKnownMinValue()); - // If this is a Microsoft-style asm blob, store the return registers (EAX:EDX) - // to the return value slot. Only do this when returning in registers. - if (isa(&S)) { - const ABIArgInfo &RetAI = CurFnInfo->getReturnInfo(); - if (RetAI.isDirect() || RetAI.isExtend()) { - // Make a fake lvalue for the return value slot. - LValue ReturnSlot = MakeAddrLValueWithoutTBAA(ReturnValue, FnRetTy); - CGM.getTargetCodeGenInfo().addReturnRegisterOutputs( - *this, ReturnSlot, Constraints, ResultRegTypes, ResultTruncRegTypes, - ResultRegDests, AsmString, S.getNumOutputs()); - SawAsmBlock = true; - } + // Only tie earlyclobber physregs. + if (Info.allowsRegister() && (GCCReg.empty() || Info.earlyClobber())) + AsmInfo.InOutConstraints += llvm::utostr(I); + else + AsmInfo.InOutConstraints += OutputConstraint; + + AsmInfo.InOutArgTypes.push_back(Arg->getType()); + AsmInfo.InOutArgElemTypes.push_back(ArgElemType); + AsmInfo.InOutArgs.push_back(Arg); } +} - for (unsigned i = 0, e = S.getNumInputs(); i != e; i++) { - const Expr *InputExpr = S.getInputExpr(i); +/// Special handling for Microsoft-style inline assembly blocks. This ensures +/// that return registers (like EAX:EDX) are correctly mapped to the function's +/// return value slot when necessary. 
+void CodeGenFunction::HandleMSStyleAsmBlob(const AsmStmt &S, + std::string &AsmString, + AsmConstraintsInfo &AsmInfo) { + if (!isa(&S)) + return; - TargetInfo::ConstraintInfo &Info = InputConstraintInfos[i]; + const ABIArgInfo &RetAI = CurFnInfo->getReturnInfo(); + if (!RetAI.isDirect() && !RetAI.isExtend()) + return; + + // Make a fake lvalue for the return value slot. + LValue ReturnSlot = MakeAddrLValueWithoutTBAA(ReturnValue, FnRetTy); + CGM.getTargetCodeGenInfo().addReturnRegisterOutputs( + *this, ReturnSlot, AsmInfo.Constraints, AsmInfo.ResultRegTypes, + AsmInfo.ResultTruncRegTypes, AsmInfo.ResultRegDests, AsmString, + S.getNumOutputs()); + SawAsmBlock = true; +} + +/// Process the input constraints of an inline assembly statement. It handles +/// type conversions, extensions for tied operands, and collects the necessary +/// LLVM values to be passed to the inline assembly call. +void CodeGenFunction::HandleInputConstraints(const AsmStmt &S, + AsmConstraintsInfo &AsmInfo) { + ASTContext &Ctx = getContext(); + + for (unsigned I = 0, E = S.getNumInputs(); I != E; I++) { + TargetInfo::ConstraintInfo &Info = AsmInfo.InputConstraintInfos[I]; + const Expr *InputExpr = S.getInputExpr(I); if (Info.allowsMemory()) - ReadNone = false; + AsmInfo.ReadNone = false; - if (!Constraints.empty()) - Constraints += ','; + if (!AsmInfo.Constraints.empty()) + AsmInfo.Constraints += ','; // Simplify the input constraint. 
- std::string InputConstraint(S.getInputConstraint(i)); - InputConstraint = - getTarget().simplifyConstraint(InputConstraint, &OutputConstraintInfos); + std::string InputConstraint(S.getInputConstraint(I)); + InputConstraint = getTarget().simplifyConstraint( + InputConstraint, &AsmInfo.OutputConstraintInfos); InputConstraint = S.addVariableConstraints( - InputConstraint, *InputExpr->IgnoreParenNoopCasts(getContext()), - getTarget(), false /* No EarlyClobber */, + InputConstraint, *InputExpr->IgnoreParenNoopCasts(Ctx), getTarget(), + false /* No EarlyClobber */, [&](const Stmt *UnspStmt, std::string_view Msg) { CGM.ErrorUnsupported(UnspStmt, Msg); }); - std::string ReplaceConstraint (InputConstraint); + std::string ReplaceConstraint(InputConstraint); llvm::Value *Arg; llvm::Type *ArgElemType; - std::tie(Arg, ArgElemType) = EmitAsmInput(Info, InputExpr, Constraints); + std::tie(Arg, ArgElemType) = + EmitAsmInput(Info, InputExpr, AsmInfo.Constraints); // If this input argument is tied to a larger output result, extend the // input to be the same size as the output. The LLVM backend wants to see @@ -3050,11 +3046,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { QualType OutputType = S.getOutputExpr(Output)->getType(); QualType InputTy = InputExpr->getType(); - if (getContext().getTypeSize(OutputType) > - getContext().getTypeSize(InputTy)) { + if (Ctx.getTypeSize(OutputType) > Ctx.getTypeSize(InputTy)) { // Use ptrtoint as appropriate so that we can do our extension. if (isa(Arg->getType())) Arg = Builder.CreatePtrToInt(Arg, IntPtrTy); + llvm::Type *OutputTy = ConvertType(OutputType); if (isa(OutputTy)) Arg = Builder.CreateZExt(Arg, OutputTy); @@ -3063,12 +3059,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { else if (OutputTy->isFloatingPointTy()) Arg = Builder.CreateFPExt(Arg, OutputTy); } + // Deal with the tied operands' constraint code in adjustInlineAsmType. 
- ReplaceConstraint = OutputConstraints[Output]; + ReplaceConstraint = AsmInfo.OutputConstraints[Output]; } - if (llvm::Type* AdjTy = - getTargetHooks().adjustInlineAsmType(*this, ReplaceConstraint, - Arg->getType())) + + if (llvm::Type *AdjTy = getTargetHooks().adjustInlineAsmType( + *this, ReplaceConstraint, Arg->getType())) Arg = Builder.CreateBitCast(Arg, AdjTy); else CGM.getDiags().Report(S.getAsmLoc(), diag::err_asm_invalid_type_in_input) @@ -3080,49 +3077,65 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { std::max((uint64_t)LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinValue()); - ArgTypes.push_back(Arg->getType()); - ArgElemTypes.push_back(ArgElemType); - Args.push_back(Arg); - Constraints += InputConstraint; - } - - // Append the "input" part of inout constraints. - for (unsigned i = 0, e = InOutArgs.size(); i != e; i++) { - ArgTypes.push_back(InOutArgTypes[i]); - ArgElemTypes.push_back(InOutArgElemTypes[i]); - Args.push_back(InOutArgs[i]); - } - Constraints += InOutConstraints; - - // Labels - SmallVector Transfer; - llvm::BasicBlock *Fallthrough = nullptr; - bool IsGCCAsmGoto = false; - if (const auto *GS = dyn_cast(&S)) { - IsGCCAsmGoto = GS->isAsmGoto(); - if (IsGCCAsmGoto) { - for (const auto *E : GS->labels()) { - JumpDest Dest = getJumpDestForLabel(E->getLabel()); - Transfer.push_back(Dest.getBlock()); - if (!Constraints.empty()) - Constraints += ','; - Constraints += "!i"; - } - Fallthrough = createBasicBlock("asm.fallthrough"); + AsmInfo.ArgTypes.push_back(Arg->getType()); + AsmInfo.ArgElemTypes.push_back(ArgElemType); + AsmInfo.Args.push_back(Arg); + + AsmInfo.Constraints += InputConstraint; + } + + // Append the "input" part of in/out constraints. 
+ for (unsigned I = 0, E = AsmInfo.InOutArgs.size(); I != E; I++) { + AsmInfo.ArgTypes.push_back(AsmInfo.InOutArgTypes[I]); + AsmInfo.ArgElemTypes.push_back(AsmInfo.InOutArgElemTypes[I]); + AsmInfo.Args.push_back(AsmInfo.InOutArgs[I]); + } + + AsmInfo.Constraints += AsmInfo.InOutConstraints; +} + +/// Handle labels in an 'asm goto' statement. This method resolves the symbolic +/// labels to LLVM basic blocks and updates the constraint string to reflect +/// the indirect jump targets. +bool CodeGenFunction::HandleLabels(const AsmStmt &S, + AsmConstraintsInfo &AsmInfo) { + if (const auto *GS = dyn_cast(&S); GS && GS->isAsmGoto()) { + for (const auto *E : GS->labels()) { + CodeGenFunction::JumpDest Dest = getJumpDestForLabel(E->getLabel()); + AsmInfo.IndirectDests.push_back(Dest.getBlock()); + + if (!AsmInfo.Constraints.empty()) + AsmInfo.Constraints += ','; + + AsmInfo.Constraints += "!i"; } + + AsmInfo.DefaultDest = createBasicBlock("asm.fallthrough"); + return true; } - bool HasUnwindClobber = false; + return false; +} - // Clobbers - for (unsigned i = 0, e = S.getNumClobbers(); i != e; i++) { - std::string Clobber = S.getClobber(i); +/// Process clobber constraints for an inline assembly statement. This +/// identifies which registers or system state (like "memory" or "cc") are +/// modified by the assembly block, which is crucial for correct optimization +/// and side-effect modeling. 
+bool CodeGenFunction::HandleClobbers(const AsmStmt &S, + AsmConstraintsInfo &AsmInfo) { + std::string &Constraints = AsmInfo.Constraints; + bool HasUnwindClobber = false; + for (unsigned I = 0, E = S.getNumClobbers(); I != E; I++) { + std::string Clobber = S.getClobber(I); - if (Clobber == "memory") - ReadOnly = ReadNone = false; - else if (Clobber == "unwind") { + if (Clobber == "unwind") { HasUnwindClobber = true; continue; + } + + if (Clobber == "memory") { + AsmInfo.ReadOnly = false; + AsmInfo.ReadNone = false; } else if (Clobber != "cc") { Clobber = getTarget().getNormalizedGCCRegisterName(Clobber); if (CGM.getCodeGenOpts().StackClashProtector && @@ -3136,12 +3149,14 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { if (Clobber == "eax" || Clobber == "edx") { if (Constraints.find("=&A") != std::string::npos) continue; + std::string::size_type position1 = Constraints.find("={" + Clobber + "}"); if (position1 != std::string::npos) { Constraints.insert(position1 + 1, "&"); continue; } + std::string::size_type position2 = Constraints.find("=A"); if (position2 != std::string::npos) { Constraints.insert(position2 + 1, "&"); @@ -3149,35 +3164,62 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { } } } + if (!Constraints.empty()) Constraints += ','; - Constraints += "~{"; - Constraints += Clobber; - Constraints += '}'; + Constraints += "~{" + Clobber + '}'; } + return HasUnwindClobber; +} + +void CodeGenFunction::EmitAsmStmt( + const AsmStmt &S, + SmallVectorImpl &OutputConstraintInfos, + SmallVectorImpl &InputConstraintInfos) { + // Assemble the final asm string. + std::string AsmString = S.generateAsmString(getContext()); + + AsmConstraintsInfo AsmInfo(OutputConstraintInfos, InputConstraintInfos); + + // Handle output constraints. + HandleOutputConstraints(S, AsmInfo); + + // If this is a Microsoft-style asm blob, store the return registers (EAX:EDX) + // to the return value slot. Only do this when returning in registers. 
+ HandleMSStyleAsmBlob(S, AsmString, AsmInfo); + + // Handle input constraints. + HandleInputConstraints(S, AsmInfo); + + // Handle 'asm goto' labels. + bool IsGCCAsmGoto = HandleLabels(S, AsmInfo); + + // Handle any clobbers. + bool HasUnwindClobber = HandleClobbers(S, AsmInfo); assert(!(HasUnwindClobber && IsGCCAsmGoto) && "unwind clobber can't be used with asm goto"); // Add machine specific clobbers std::string_view MachineClobbers = getTarget().getClobbers(); if (!MachineClobbers.empty()) { - if (!Constraints.empty()) - Constraints += ','; - Constraints += MachineClobbers; + if (!AsmInfo.Constraints.empty()) + AsmInfo.Constraints += ','; + AsmInfo.Constraints += MachineClobbers; } llvm::Type *ResultType; - if (ResultRegTypes.empty()) + if (AsmInfo.ResultRegTypes.empty()) ResultType = VoidTy; - else if (ResultRegTypes.size() == 1) - ResultType = ResultRegTypes[0]; + else if (AsmInfo.ResultRegTypes.size() == 1) + ResultType = AsmInfo.ResultRegTypes[0]; else - ResultType = llvm::StructType::get(getLLVMContext(), ResultRegTypes); + ResultType = + llvm::StructType::get(getLLVMContext(), AsmInfo.ResultRegTypes); llvm::FunctionType *FTy = - llvm::FunctionType::get(ResultType, ArgTypes, false); + llvm::FunctionType::get(ResultType, AsmInfo.ArgTypes, false); bool HasSideEffect = S.isVolatile() || S.getNumOutputs() == 0; @@ -3185,66 +3227,64 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { CGM.getCodeGenOpts().getInlineAsmDialect() == CodeGenOptions::IAD_ATT ? llvm::InlineAsm::AD_ATT : llvm::InlineAsm::AD_Intel; - llvm::InlineAsm::AsmDialect AsmDialect = isa(&S) ? - llvm::InlineAsm::AD_Intel : GnuAsmDialect; + llvm::InlineAsm::AsmDialect AsmDialect = + isa(&S) ? 
llvm::InlineAsm::AD_Intel : GnuAsmDialect; llvm::InlineAsm *IA = llvm::InlineAsm::get( - FTy, AsmString, Constraints, HasSideEffect, + FTy, AsmString, AsmInfo.Constraints, HasSideEffect, /* IsAlignStack */ false, AsmDialect, HasUnwindClobber); std::vector RegResults; llvm::CallBrInst *CBR; llvm::DenseMap> CBRRegResults; + if (IsGCCAsmGoto) { - CBR = Builder.CreateCallBr(IA, Fallthrough, Transfer, Args); - EmitBlock(Fallthrough); - UpdateAsmCallInst(*CBR, HasSideEffect, /*HasUnwindClobber=*/false, ReadOnly, - ReadNone, InNoMergeAttributedStmt, - InNoConvergentAttributedStmt, S, ResultRegTypes, - ArgElemTypes, *this, RegResults); + CBR = Builder.CreateCallBr(IA, AsmInfo.DefaultDest, AsmInfo.IndirectDests, + AsmInfo.Args); + EmitBlock(AsmInfo.DefaultDest); + UpdateAsmCallInst(S, *CBR, AsmInfo, HasSideEffect, + /*HasUnwindClobber=*/false, InNoMergeAttributedStmt, + InNoConvergentAttributedStmt, RegResults); + // Because we are emitting code top to bottom, we don't have enough // information at this point to know precisely whether we have a critical // edge. If we have outputs, split all indirect destinations. 
if (!RegResults.empty()) { - unsigned i = 0; + unsigned I = 0; for (llvm::BasicBlock *Dest : CBR->getIndirectDests()) { llvm::Twine SynthName = Dest->getName() + ".split"; llvm::BasicBlock *SynthBB = createBasicBlock(SynthName); llvm::IRBuilderBase::InsertPointGuard IPG(Builder); Builder.SetInsertPoint(SynthBB); - if (ResultRegTypes.size() == 1) { + if (AsmInfo.ResultRegTypes.size() == 1) { CBRRegResults[SynthBB].push_back(CBR); } else { - for (unsigned j = 0, e = ResultRegTypes.size(); j != e; ++j) { - llvm::Value *Tmp = Builder.CreateExtractValue(CBR, j, "asmresult"); + for (unsigned J = 0, E = AsmInfo.ResultRegTypes.size(); J != E; ++J) { + llvm::Value *Tmp = Builder.CreateExtractValue(CBR, J, "asmresult"); CBRRegResults[SynthBB].push_back(Tmp); } } EmitBranch(Dest); EmitBlock(SynthBB); - CBR->setIndirectDest(i++, SynthBB); + CBR->setIndirectDest(I++, SynthBB); } } } else if (HasUnwindClobber) { - llvm::CallBase *Result = EmitCallOrInvoke(IA, Args, ""); - UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/true, - ReadOnly, ReadNone, InNoMergeAttributedStmt, - InNoConvergentAttributedStmt, S, ResultRegTypes, - ArgElemTypes, *this, RegResults); + llvm::CallBase *Result = EmitCallOrInvoke(IA, AsmInfo.Args, ""); + UpdateAsmCallInst(S, *Result, AsmInfo, HasSideEffect, + /*HasUnwindClobber=*/true, InNoMergeAttributedStmt, + InNoConvergentAttributedStmt, RegResults); } else { llvm::CallInst *Result = - Builder.CreateCall(IA, Args, getBundlesForFunclet(IA)); - UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/false, - ReadOnly, ReadNone, InNoMergeAttributedStmt, - InNoConvergentAttributedStmt, S, ResultRegTypes, - ArgElemTypes, *this, RegResults); + Builder.CreateCall(IA, AsmInfo.Args, getBundlesForFunclet(IA)); + UpdateAsmCallInst(S, *Result, AsmInfo, HasSideEffect, + /*HasUnwindClobber=*/false, InNoMergeAttributedStmt, + InNoConvergentAttributedStmt, RegResults); } - EmitAsmStores(*this, S, RegResults, ResultRegTypes, ResultTruncRegTypes, - 
ResultRegDests, ResultRegQualTys, ResultTypeRequiresCast, - ResultBounds); + EmitAsmStores(S, RegResults, AsmInfo); // If this is an asm goto with outputs, repeat EmitAsmStores, but with a // different insertion point; one for each indirect destination and with @@ -3253,13 +3293,25 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { for (llvm::BasicBlock *Succ : CBR->getIndirectDests()) { llvm::IRBuilderBase::InsertPointGuard IPG(Builder); Builder.SetInsertPoint(Succ, --(Succ->end())); - EmitAsmStores(*this, S, CBRRegResults[Succ], ResultRegTypes, - ResultTruncRegTypes, ResultRegDests, ResultRegQualTys, - ResultTypeRequiresCast, ResultBounds); + EmitAsmStores(S, CBRRegResults[Succ], AsmInfo); } } } +void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { + // Pop all cleanup blocks at the end of the asm statement. + CodeGenFunction::RunCleanupsScope Cleanups(*this); + + // Get all the output and input constraints together. + SmallVector OutputConstraintInfos; + SmallVector InputConstraintInfos; + if (!GetOutputAndInputConstraints(S, OutputConstraintInfos, + InputConstraintInfos)) + return EmitHipStdParUnsupportedAsm(this, S); + + EmitAsmStmt(S, OutputConstraintInfos, InputConstraintInfos); +} + LValue CodeGenFunction::InitCapturedStruct(const CapturedStmt &S) { const RecordDecl *RD = S.getCapturedRecordDecl(); CanQualType RecordTy = getContext().getCanonicalTagType(RD); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 29b87a0616992..464bdeb801a29 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -5489,6 +5489,86 @@ class CodeGenFunction : public CodeGenTypeCache { QualType InputType, std::string &ConstraintStr, SourceLocation Loc); + /// This structure holds the information gathered about the constraints for an + /// inline assembly statement. It helps in separating the constraint + /// processing from the code generation. 
+ struct AsmConstraintsInfo { + // The output and input constraints. + SmallVectorImpl &OutputConstraintInfos; + SmallVectorImpl &InputConstraintInfos; + + // Constraint strings. + std::string Constraints; + std::string InOutConstraints; + + // Keep track of out constraints for tied input operand. + std::vector OutputConstraints; + + // Keep track of argument types. + std::vector Args; + std::vector ArgTypes; + std::vector ArgElemTypes; + + // Keep track of result register constraints. + std::vector ResultRegDests; + std::vector ResultRegQualTys; + std::vector ResultRegTypes; + std::vector ResultTruncRegTypes; + + llvm::BitVector ResultTypeRequiresCast; + + // Keep track of in/out constraints. + std::vector InOutArgs; + std::vector InOutArgTypes; + std::vector InOutArgElemTypes; + + // Destination blocks for 'asm gotos'. + llvm::BasicBlock *DefaultDest = nullptr; + SmallVector IndirectDests; + + std::vector>> ResultBounds; + + // An inline asm can be marked readonly if it meets the following + // conditions: + // + // - it doesn't have any sideeffects + // - it doesn't clobber memory + // - it doesn't return a value by-reference + // + // It can be marked readnone if it doesn't have any input memory + // constraints in addition to meeting the conditions listed above. 
+ bool ReadOnly = true; + bool ReadNone = true; + + AsmConstraintsInfo( + SmallVectorImpl &OutputConstraintInfos, + SmallVectorImpl &InputConstraintInfos) + : OutputConstraintInfos(OutputConstraintInfos), + InputConstraintInfos(InputConstraintInfos) {} + }; + + void EmitAsmStmt( + const AsmStmt &S, + SmallVectorImpl &OutputConstraintInfos, + SmallVectorImpl &InputConstraintInfos); + void EmitAsmStores(const AsmStmt &S, + const llvm::ArrayRef RegResults, + const AsmConstraintsInfo &AsmInfo); + void UpdateAsmCallInst(const AsmStmt &S, llvm::CallBase &Result, + const AsmConstraintsInfo &AsmInfo, bool HasSideEffect, + bool HasUnwindClobber, bool NoMerge, bool NoConvergent, + std::vector &RegResults); + bool GetOutputAndInputConstraints( + const AsmStmt &S, + SmallVectorImpl &OutputConstraintInfos, + SmallVectorImpl &InputConstraintInfos); + void HandleOutputConstraints(const AsmStmt &S, AsmConstraintsInfo &AsmInfo); + void HandleMSStyleAsmBlob(const AsmStmt &S, std::string &AsmString, + AsmConstraintsInfo &AsmInfo); + void HandleInputConstraints(const AsmStmt &S, AsmConstraintsInfo &AsmInfo); + bool HandleLabels(const AsmStmt &S, AsmConstraintsInfo &AsmInfo); + bool HandleClobbers(const AsmStmt &S, AsmConstraintsInfo &AsmInfo); + /// Attempts to statically evaluate the object size of E. If that /// fails, emits code to figure the size of E out for us. This is /// pass_object_size aware. From 0d552b1ffb5ac80bfaadd7d43afef04185888a00 Mon Sep 17 00:00:00 2001 From: Sadaf Ebrahimi Date: Mon, 11 May 2026 15:33:45 -0700 Subject: [PATCH 365/538] [scudo] Add unit tests for common functions This patch adds unit tests for isPowerOfTwo, computePercentage, and isAlignedSlow in common_test.cpp. These additions increase the test coverage for common.h to 100%. 
--- .../scudo/standalone/tests/common_test.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp index b46ceb558b699..d1d978212e06d 100644 --- a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp @@ -116,4 +116,36 @@ TEST(ScudoCommonTest, Zeros) { MemMap.unmap(); } +TEST(ScudoCommonTest, IsPowerOfTwo) { + EXPECT_FALSE(isPowerOfTwo(0)); + EXPECT_TRUE(isPowerOfTwo(1)); + EXPECT_TRUE(isPowerOfTwo(2)); + EXPECT_TRUE(isPowerOfTwo(4)); + EXPECT_FALSE(isPowerOfTwo(3)); +} + +TEST(ScudoCommonTest, ComputePercentage) { + uptr Integral, Fractional; + computePercentage(50, 100, &Integral, &Fractional); + EXPECT_EQ(Integral, 50U); + EXPECT_EQ(Fractional, 0U); + + computePercentage(1, 3, &Integral, &Fractional); + EXPECT_EQ(Integral, 33U); + EXPECT_EQ(Fractional, 33U); + + computePercentage(2, 3, &Integral, &Fractional); + EXPECT_EQ(Integral, 66U); + EXPECT_EQ(Fractional, 67U); + + computePercentage(0, 0, &Integral, &Fractional); + EXPECT_EQ(Integral, 100U); + EXPECT_EQ(Fractional, 0U); +} + +TEST(ScudoCommonTest, IsAlignedSlow) { + EXPECT_TRUE(isAlignedSlow(64, 16)); + EXPECT_FALSE(isAlignedSlow(65, 16)); +} + } // namespace scudo From abf90bf9ae43a7b9d97f394320606428c0570683 Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 11 May 2026 15:41:01 -0700 Subject: [PATCH 366/538] Move lldb-python.h to be the first included as its comment demands. (#197048) The comment was preserved even though the header was almost never actually placed first. For the most part that didn't matter, but you can through bad luck uncover the conflict between Python typedefs (in this case of pid_t) and ones from include/lldb/Host/windows/PosixAPI.h. So this patch just moves this header to the front in all its appearances. 
--- .../Interfaces/OperatingSystemPythonInterface.cpp | 10 +++++----- .../Interfaces/ScriptedBreakpointPythonInterface.cpp | 8 +++++--- .../ScriptedFrameProviderPythonInterface.cpp | 8 +++++--- .../Python/Interfaces/ScriptedFramePythonInterface.cpp | 8 +++++--- .../Python/Interfaces/ScriptedHookPythonInterface.cpp | 6 +++++- .../Interfaces/ScriptedPlatformPythonInterface.cpp | 10 +++++----- .../Interfaces/ScriptedProcessPythonInterface.cpp | 9 +++++---- .../Python/Interfaces/ScriptedPythonInterface.cpp | 8 +++++--- .../Interfaces/ScriptedStopHookPythonInterface.cpp | 10 +++++----- .../Interfaces/ScriptedThreadPlanPythonInterface.cpp | 8 ++++---- .../Interfaces/ScriptedThreadPythonInterface.cpp | 8 +++++--- .../ScriptInterpreter/Python/PythonDataObjects.h | 2 ++ .../ScriptInterpreter/Python/SWIGPythonBridge.h | 2 ++ .../Python/ScriptInterpreterPython.cpp | 2 ++ .../Python/ScriptInterpreterPythonImpl.h | 3 +++ 15 files changed, 63 insertions(+), 39 deletions(-) diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp index d293406069395..01f4d3a7ca05b 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp @@ -6,16 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "lldb/Core/PluginManager.h" -#include "lldb/Target/ExecutionContext.h" -#include "lldb/Utility/Log.h" -#include "lldb/lldb-enumerations.h" - // clang-format off // LLDB Python header must be included first #include "../lldb-python.h" //clang-format on +#include "lldb/Core/PluginManager.h" +#include "lldb/Target/ExecutionContext.h" +#include "lldb/Utility/Log.h" +#include "lldb/lldb-enumerations.h" + #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" 
#include "OperatingSystemPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp index 3243ffdad7c00..077a4481e0c63 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp @@ -6,15 +6,17 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Core/PluginManager.h" #include "lldb/Symbol/SymbolContext.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" -// LLDB Python header must be included first -#include "../lldb-python.h" - #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedBreakpointPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp index d4d7a63ce4010..d95b382e6365e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Core/PluginManager.h" #include "lldb/Target/Thread.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" -// LLDB Python header must be included first -#include "../lldb-python.h" - #include 
"../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedFrameProviderPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFramePythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFramePythonInterface.cpp index b8613578bdbd1..57d0bc6fa9759 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFramePythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFramePythonInterface.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Host/Config.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" -// LLDB Python header must be included first -#include "../lldb-python.h" - #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedFramePythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedHookPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedHookPythonInterface.cpp index 5963a1093a3d6..b85f6c3fc359d 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedHookPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedHookPythonInterface.cpp @@ -6,6 +6,11 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Core/PluginManager.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" @@ -13,7 +18,6 @@ #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" -#include "../lldb-python.h" #include 
"ScriptedHookPythonInterface.h" using namespace lldb; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp index 8b7f84c5af037..e3c64115d04d1 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp @@ -6,17 +6,17 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Core/PluginManager.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/Status.h" #include "lldb/lldb-enumerations.h" -// clang-format off -// LLDB Python header must be included first -#include "../lldb-python.h" -//clang-format on - #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedPlatformPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp index b99bf9f627310..17ec036b09080 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp @@ -6,15 +6,16 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Core/PluginManager.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/Status.h" #include "lldb/lldb-enumerations.h" -// clang-format off -// LLDB Python header must be included first -#include 
"../lldb-python.h" - #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedThreadPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp index 68aea7d223f7b..5e134042c6ffb 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp @@ -6,13 +6,15 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Host/Config.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" -// LLDB Python header must be included first -#include "../lldb-python.h" - #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedPythonInterface.h" #include "lldb/Symbol/SymbolContext.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedStopHookPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedStopHookPythonInterface.cpp index d10f7f07bd33e..9936547f6c9fc 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedStopHookPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedStopHookPythonInterface.cpp @@ -6,16 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "lldb/Core/PluginManager.h" -#include "lldb/Target/ExecutionContext.h" -#include "lldb/Utility/Log.h" -#include "lldb/lldb-enumerations.h" - // clang-format off // LLDB Python header must be included first #include "../lldb-python.h" //clang-format on +#include "lldb/Core/PluginManager.h" +#include "lldb/Target/ExecutionContext.h" +#include "lldb/Utility/Log.h" +#include 
"lldb/lldb-enumerations.h" + #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedStopHookPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp index b27fcd64d3892..f7186e9764eff 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp @@ -6,15 +6,15 @@ // //===----------------------------------------------------------------------===// -#include "lldb/Core/PluginManager.h" -#include "lldb/Utility/Log.h" -#include "lldb/lldb-enumerations.h" - // clang-format off // LLDB Python header must be included first #include "../lldb-python.h" //clang-format on +#include "lldb/Core/PluginManager.h" +#include "lldb/Utility/Log.h" +#include "lldb/lldb-enumerations.h" + #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedThreadPlanPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp index 7181e039b640a..2b12e67b31098 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// +// clang-format off +// LLDB Python header must be included first +#include "../lldb-python.h" +//clang-format on + #include "lldb/Host/Config.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" -// LLDB Python header must be included first -#include 
"../lldb-python.h" - #include "../SWIGPythonBridge.h" #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedThreadPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h index 919c5d57ceb33..de19ddf9c2e76 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h @@ -47,8 +47,10 @@ #ifndef LLDB_SOURCE_PLUGINS_SCRIPTINTERPRETER_PYTHON_PYTHONDATAOBJECTS_H #define LLDB_SOURCE_PLUGINS_SCRIPTINTERPRETER_PYTHON_PYTHONDATAOBJECTS_H +// clang-format off // LLDB Python header must be included first #include "lldb-python.h" +//clang-format on #include "lldb/Host/File.h" #include "lldb/Utility/StructuredData.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 16ad895ee9f26..953240449be5b 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -12,8 +12,10 @@ #include #include +// clang-format off // LLDB Python header must be included first #include "lldb-python.h" +//clang-format on #include "Plugins/ScriptInterpreter/Python/PythonDataObjects.h" #include "lldb/lldb-forward.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 7a0b1d97962af..d48fa040c20e5 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -6,8 +6,10 @@ // //===----------------------------------------------------------------------===// +// clang-format off // LLDB Python header must be included first #include "lldb-python.h" +//clang-format on #include 
"Interfaces/ScriptInterpreterPythonInterfaces.h" #include "PythonDataObjects.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index 863cf27785824..75455bb3b0a0e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -9,7 +9,10 @@ #ifndef LLDB_SOURCE_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHONIMPL_H #define LLDB_SOURCE_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHONIMPL_H +// clang-format off +// LLDB Python header must be included first #include "lldb-python.h" +//clang-format on #include "PythonDataObjects.h" #include "ScriptInterpreterPython.h" From 457380cee1979b98eee641f241fe3bf9d3f49815 Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 11 May 2026 15:55:34 -0700 Subject: [PATCH 367/538] Add a breakpoint override resolver feature to lldb (#195392) This is a general mechanism that allows a custom breakpoint resolver to "override" the resolver lldb would have used. It is implemented by making a scripted breakpoint resolver class, implementing an extra "overrides_resolver" method. When lldb has decided what resolver it is going to use for the breakpoint, if there are any override resolvers registered, it will pass the serialized form of the resolver to the `overrides_resolver` method of the registered resolvers. If any of the override resolvers return true from the overrides_resolver method, that resolver will be used in place of the one lldb would have used. The original motivation for this feature is to make the support of "debugging a scripting language from the implementation side". We added the notion of "facade breakpoints" so that a location given in the context of the scripting language can be handled by the emulation code. 
But when you are using lldb in some IDE, when "stopped" in the scripting language, the IDE will show the scripting language source. It is natural in that case for the user to expect that the UI gesture to set a file and line breakpoint should work the same way. To implement that, the implementor would make their facade breakpoint resolver also an override resolver that checks to see if the incoming resolver was a file and line breakpoint for a source file of the scripting language type. If it is, it returns true, and lldb will use that resolver instead. Note, this will also work when breakpoints are set on the command line, so you can transparently support "scripting language" breakpoints from the command line as well. --- lldb/bindings/python/python-wrapper.swig | 12 + .../tutorials/creating-custom-breakpoints.md | 26 ++ lldb/include/lldb/API/SBStructuredData.h | 4 + lldb/include/lldb/API/SBTarget.h | 10 + .../lldb/Breakpoint/BreakpointResolver.h | 8 + .../Breakpoint/BreakpointResolverScripted.h | 9 + .../ScriptedBreakpointOverrideResolver.h | 46 ++++ .../Interfaces/ScriptedBreakpointInterface.h | 8 + .../lldb/Interpreter/ScriptInterpreter.h | 2 + lldb/include/lldb/Target/Target.h | 80 ++++++ lldb/source/API/SBStructuredData.cpp | 4 + lldb/source/API/SBTarget.cpp | 44 ++++ .../Breakpoint/BreakpointResolverScripted.cpp | 49 +++- lldb/source/Breakpoint/CMakeLists.txt | 1 + .../ScriptedBreakpointOverrideResolver.cpp | 28 ++ .../Commands/CommandObjectBreakpoint.cpp | 239 ++++++++++++++++++ lldb/source/Commands/Options.td | 4 + lldb/source/Interpreter/ScriptInterpreter.cpp | 5 + .../ScriptedBreakpointPythonInterface.cpp | 31 +++ .../ScriptedBreakpointPythonInterface.h | 6 +- .../Interfaces/ScriptedPythonInterface.cpp | 15 ++ .../Interfaces/ScriptedPythonInterface.h | 5 + .../Python/SWIGPythonBridge.h | 1 + lldb/source/Target/Target.cpp | 64 +++++ .../scripted_bkpt/overrides_resolver/Makefile | 4 + .../TestOverridesResolver.py | 163 ++++++++++++
.../overrides_resolver/bkpt_resolver.py | 67 +++++ .../scripted_bkpt/overrides_resolver/main.c | 20 ++ .../Python/PythonTestSuite.cpp | 5 + 29 files changed, 954 insertions(+), 6 deletions(-) create mode 100644 lldb/include/lldb/Breakpoint/ScriptedBreakpointOverrideResolver.h create mode 100644 lldb/source/Breakpoint/ScriptedBreakpointOverrideResolver.cpp create mode 100644 lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/Makefile create mode 100644 lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/TestOverridesResolver.py create mode 100644 lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/bkpt_resolver.py create mode 100644 lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/main.c diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 72f90f1b23c29..dea4f6b4c7f7c 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -595,6 +595,18 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *d return sb_ptr; } +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBTarget(PyObject * + data) { + lldb::SBTarget *sb_ptr = NULL; + + int valid_cast = SWIG_ConvertPtr(data, (void **)&sb_ptr, + SWIGTYPE_p_lldb__SBTarget, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallCommand( const char *python_function_name, const char *session_dictionary_name, lldb::DebuggerSP debugger, const char *args, diff --git a/lldb/docs/use/tutorials/creating-custom-breakpoints.md b/lldb/docs/use/tutorials/creating-custom-breakpoints.md index b31a600a7505a..f12655d9471af 100644 --- a/lldb/docs/use/tutorials/creating-custom-breakpoints.md +++ b/lldb/docs/use/tutorials/creating-custom-breakpoints.md @@ -170,3 +170,29 @@ The Facade location adds these optional affordances to the Resolver class: |`was_hit`| 
`frame`:`lldb.SBFrame` `bp_loc`:`lldb.SBBreakpointLocation` | This will get called when one of the "real" locations set by your resolver is hit. `frame` is the stack frame that hit this location. `bp_loc` is the real location that was hit. Return either the facade location that you want to consider hit on this stop, or None if you don't consider any of your facade locations to have been hit. | | `get_location_description` | `bp_loc`:`lldb.SBBreakpointLocation` `desc_level`:`lldb.DescriptionLevel` `bp_loc` is the facade location to describe.| Use this to provide a helpful description of each facade location. ``desc_level`` is the level of description requested. The Brief description is printed when the location is hit. Full is printed for `break list` and Verbose for `break list -v`.| +## Override breakpoint resolvers + +If a breakpoint resolver can provide a better action for some subset of +breakpoints that lldb would normally set using its own resolvers, it can +register itself as an "Override Resolver" using: + +``` +SBTarget.AddBreakpointOverrideResolver(classname, description, extra_args) +``` + +description is what will show in the command: + +``` +(lldb) breakpoint override resolver list +``` + +And extra_args is an SBStructuredData that will get passed to the constructor +of your breakpoint resolver when we make an instance to check for overrides. + +The overrides resolver requires the following to be implemented: + +| Name | Arguments | Description| +|-------|-----------|------------| +|`overrides_resolver`| `orig_resolver_data`:`lldb.SBStructuredData` | This will get called when lldb has determined the resolver it would normally use for the breakpoint. `orig_resolver_data` is the serialized form of the breakpoint resolver lldb would have used. If you return True from the API, then lldb will use this instance of your resolver instead of the one it would have used.
| + + diff --git a/lldb/include/lldb/API/SBStructuredData.h b/lldb/include/lldb/API/SBStructuredData.h index 5f688664c570b..0e3a3150fddc1 100644 --- a/lldb/include/lldb/API/SBStructuredData.h +++ b/lldb/include/lldb/API/SBStructuredData.h @@ -138,6 +138,10 @@ class SBStructuredData { /// value. void SetGenericValue(SBScriptObject value); +#ifndef SWIG + void CopyImpl(lldb_private::StructuredDataImpl &new_impl); +#endif + protected: friend class SBAttachInfo; friend class SBCommandReturnObject; diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index 93b3aab578f42..d598c44dd7332 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -638,6 +638,14 @@ class LLDB_API SBTarget { size_t ReadMemory(const SBAddress addr, void *buf, size_t size, lldb::SBError &error); + /// Adds a breakpoint override implemented by class_name. Returns the ID + /// of the new override or LLDB_INVALID_INDEX64 on error. + uint64_t AddBreakpointOverride(const char *class_name, + const char *description, + SBStructuredData &args_data, SBError &status); + + bool RemoveBreakpointOverride(uint64_t id); + lldb::SBBreakpoint BreakpointCreateByLocation(const char *file, uint32_t line); @@ -1050,6 +1058,8 @@ class LLDB_API SBTarget { friend class SBVariablesOptions; friend class lldb_private::python::SWIGBridge; + friend class lldb_private::lua::SWIGBridge; + friend class lldb_private::ScriptInterpreter; // Constructors are private, use static Target::Create function to create an // instance of this class. 
diff --git a/lldb/include/lldb/Breakpoint/BreakpointResolver.h b/lldb/include/lldb/Breakpoint/BreakpointResolver.h index 7be6b7a94c618..064e370d08a40 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointResolver.h +++ b/lldb/include/lldb/Breakpoint/BreakpointResolver.h @@ -113,6 +113,14 @@ class BreakpointResolver : public Searcher { return StructuredData::ObjectSP(); } + /// The resolver_sp won't have had its breakpoint set by the time we are + /// checking the Override, but it might need to access the Target, so we pass + /// that in here. + virtual bool OverridesResolver(Target &target, + lldb::BreakpointResolverSP resolver_sp) { + return false; + } + static const char *GetSerializationKey() { return "BKPTResolver"; } static const char *GetSerializationSubclassKey() { return "Type"; } diff --git a/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h b/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h index c3c1c80f49043..739eaabf4ad29 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h +++ b/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h @@ -63,10 +63,18 @@ class BreakpointResolverScripted : public BreakpointResolver { lldb::BreakpointResolverSP CopyForBreakpoint(lldb::BreakpointSP &breakpoint) override; + // OverridesResolver will get called before this resolver has been assigned a + // breakpoint. You should only need to see the resolver to know whether you + // want to override it, but you may need to check something about the target, + // which you would normally get to from the breakpoint, so we pass it in here. 
+ bool OverridesResolver(Target &target, + lldb::BreakpointResolverSP original_sp) override; + protected: void NotifyBreakpointSet() override; private: void CreateImplementationIfNeeded(lldb::BreakpointSP bkpt); + void CreateImplementationIfNeeded(Target &target, lldb::BreakpointSP bkpt); ScriptInterpreter *GetScriptInterpreter(); std::string m_class_name; @@ -74,6 +82,7 @@ class BreakpointResolverScripted : public BreakpointResolver { StructuredDataImpl m_args; Status m_error; lldb::ScriptedBreakpointInterfaceSP m_interface_sp; + bool m_breakpoint_sent = false; BreakpointResolverScripted(const BreakpointResolverScripted &) = delete; const BreakpointResolverScripted & diff --git a/lldb/include/lldb/Breakpoint/ScriptedBreakpointOverrideResolver.h b/lldb/include/lldb/Breakpoint/ScriptedBreakpointOverrideResolver.h new file mode 100644 index 0000000000000..2800fafeec8b1 --- /dev/null +++ b/lldb/include/lldb/Breakpoint/ScriptedBreakpointOverrideResolver.h @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_BREAKPOINT_SCRIPTEDBREAKPOINTOVERRIDERESOLVER_H +#define LLDB_BREAKPOINT_SCRIPTEDBREAKPOINTOVERRIDERESOLVER_H + +#include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/Target.h" +#include "lldb/lldb-private.h" + +namespace lldb_private { + +class ScriptedBreakpointResolverOverride + : public Target::BreakpointResolverOverride { +public: + ScriptedBreakpointResolverOverride(Target &target, + const std::string &description, + const std::string &class_name, + StructuredDataImpl &args_data) + : Target::BreakpointResolverOverride(target, description), + m_args_data(args_data), m_class_name(class_name) {} + + Target::BreakpointResolverOverrideUP + CopyIntoNewTarget(Target &target) override { + return Target::BreakpointResolverOverrideUP( + new ScriptedBreakpointResolverOverride(target, m_desc, m_class_name, + m_args_data)); + } + + lldb::BreakpointResolverSP + CheckForOverride(Target &target, + lldb::BreakpointResolverSP initial_sp) override; + + llvm::Error Validate() override; + +private: + StructuredDataImpl m_args_data; + std::string m_class_name; +}; +} // namespace lldb_private +#endif // LLDB_BREAKPOINT_SCRIPTEDBREAKPOINTOVERRIDERESOLVER_H diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h index d6d2ee786788e..a169432d2759a 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h @@ -11,6 +11,7 @@ #include "ScriptedInterface.h" #include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/Target.h" #include "lldb/lldb-private.h" namespace lldb_private { @@ -36,6 +37,13 @@ class ScriptedBreakpointInterface : public ScriptedInterface { lldb::DescriptionLevel level) { return {}; } + + virtual 
void SetBreakpoint(lldb::BreakpointSP break_sp) {} + + virtual bool OverridesResolver(Target &target, + StructuredDataImpl &original_resolver) { + return false; + } }; } // namespace lldb_private diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 0c37c119540f6..58af82fb48390 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -661,6 +661,8 @@ class ScriptInterpreter : public PluginInterface { lldb::ValueObjectSP GetOpaqueTypeFromSBValue(const lldb::SBValue &value) const; + lldb::TargetSP GetOpaqueTypeFromSBTarget(const lldb::SBTarget &target) const; + protected: Debugger &m_debugger; lldb::ScriptLanguage m_script_lang; diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 87b5c4f9591f1..7a88184849d6e 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -38,6 +38,7 @@ #include "lldb/Utility/Broadcaster.h" #include "lldb/Utility/LLDBAssert.h" #include "lldb/Utility/RealpathPrefixes.h" +#include "lldb/Utility/Stream.h" #include "lldb/Utility/StructuredData.h" #include "lldb/Utility/Timeout.h" #include "lldb/lldb-public.h" @@ -998,6 +999,79 @@ class Target : public std::enable_shared_from_this, /// Resets the hit count of all breakpoints. void ResetBreakpointHitCounts(); + // This callout implements the "Resolver Override". When we have determined + // the Resolver for a given breakpoint, we pass each of the registered + // overrides the "natural" resolver, and then we will use whatever resolver + // we get back from it if it is non-null. + // We keep a list of overrides ordered by ID - and we search through the list + // by ID order, and the first override that returns a non-null Resolver will + // be the one we use. If no overrides return an override resolver, we'll use + // the original one. + + // This is the abstract version of the override. 
/// Describe the breakpoint overrides. If idxs is empty, list all.
Otherwise + /// list the overrides whose ids match the ones given in idxs. The matched + /// elements are removed from the list, so any elements remaining in idxs are + /// indexes that are not breakpoint override indexes. + void DescribeBreakpointOverrides(Stream &stream, + std::vector &idxs); + // The flag 'end_to_end', default to true, signifies that the operation is // performed end to end, for both the debugger and the debuggee. @@ -2020,6 +2094,12 @@ class Target : public std::enable_shared_from_this, std::map>; BreakpointNameList m_breakpoint_names; + std::map + m_breakpoint_overrides; + /// This is the ID that will be handed out for the next added breakpoint + /// override resolver for this target. + lldb::user_id_t m_override_id = 0; + lldb::BreakpointSP m_last_created_breakpoint; WatchpointList m_watchpoint_list; lldb::WatchpointSP m_last_created_watchpoint; diff --git a/lldb/source/API/SBStructuredData.cpp b/lldb/source/API/SBStructuredData.cpp index 8e2c18ed42b47..971e079723ba4 100644 --- a/lldb/source/API/SBStructuredData.cpp +++ b/lldb/source/API/SBStructuredData.cpp @@ -77,6 +77,10 @@ operator=(const lldb::SBStructuredData &rhs) { return *this; } +void SBStructuredData::CopyImpl(lldb_private::StructuredDataImpl &new_impl) { + new_impl.SetObjectSP(m_impl_up->GetObjectSP()); +} + lldb::SBError SBStructuredData::SetFromJSON(lldb::SBStream &stream) { LLDB_INSTRUMENT_VA(this, stream); diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index a3b500f9011c3..431dded5b1193 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -29,6 +29,7 @@ #include "lldb/Breakpoint/BreakpointIDList.h" #include "lldb/Breakpoint/BreakpointList.h" #include "lldb/Breakpoint/BreakpointLocation.h" +#include "lldb/Breakpoint/ScriptedBreakpointOverrideResolver.h" #include "lldb/Core/Address.h" #include "lldb/Core/AddressResolver.h" #include "lldb/Core/Debugger.h" @@ -40,6 +41,7 @@ #include "lldb/Core/Section.h" #include 
"lldb/Core/StructuredDataImpl.h" #include "lldb/Host/Host.h" +#include "lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Symbol/DeclVendor.h" #include "lldb/Symbol/ObjectFile.h" @@ -679,6 +681,48 @@ size_t SBTarget::ReadMemory(const SBAddress addr, void *buf, size_t size, return bytes_read; } +uint64_t SBTarget::AddBreakpointOverride(const char *class_name, + const char *description, + SBStructuredData &args_data, + SBError &error) { + if (!class_name || class_name[0] == '\0') { + error.SetErrorString("empty class name"); + return LLDB_INVALID_INDEX64; + } + + if (TargetSP target_sp = GetSP()) { + StructuredDataImpl impl; + args_data.CopyImpl(impl); + StructuredData::ObjectSP object_sp = impl.GetObjectSP(); + StructuredData::DictionarySP args_dict( + new StructuredData::Dictionary(object_sp)); + if (!args_dict->IsValid()) { + error.SetErrorString("args data is not a dictionary"); + return LLDB_INVALID_INDEX64; + } + + llvm::Expected id_or_err = + target_sp->AddBreakpointResolverOverride( + class_name, args_dict, + description ? 
description : ""); + if (id_or_err) + return *id_or_err; + error.SetErrorString(llvm::toString(id_or_err.takeError()).c_str()); + return LLDB_INVALID_INDEX64; + + } else { + error.SetErrorString("invalid SBTarget."); + return LLDB_INVALID_INDEX64; + } +} + +bool SBTarget::RemoveBreakpointOverride(uint64_t id) { + if (TargetSP target_sp = GetSP()) { + return target_sp->RemoveBreakpointResolverOverride(id); + } + return false; +} + SBBreakpoint SBTarget::BreakpointCreateByLocation(const char *file, uint32_t line) { LLDB_INSTRUMENT_VA(this, file, line); diff --git a/lldb/source/Breakpoint/BreakpointResolverScripted.cpp b/lldb/source/Breakpoint/BreakpointResolverScripted.cpp index 373bd74d24b67..84d918029faf8 100644 --- a/lldb/source/Breakpoint/BreakpointResolverScripted.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverScripted.cpp @@ -30,13 +30,23 @@ BreakpointResolverScripted::BreakpointResolverScripted( lldb::SearchDepth depth, const StructuredDataImpl &args_data) : BreakpointResolver(bkpt, BreakpointResolver::PythonResolver), m_class_name(std::string(class_name)), m_depth(depth), m_args(args_data) { - CreateImplementationIfNeeded(bkpt); + if (bkpt) + CreateImplementationIfNeeded(bkpt); } void BreakpointResolverScripted::CreateImplementationIfNeeded( BreakpointSP breakpoint_sp) { - if (m_interface_sp) + // This version has to be called with a valid breakpoint_sp + // But the interface might have been made before we sent the breakpoint to + // the interface. 
If so, do that here: + assert(breakpoint_sp); + if (m_interface_sp) { + if (!m_breakpoint_sent) { + m_interface_sp->SetBreakpoint(breakpoint_sp); + m_breakpoint_sent = true; + } return; + } if (m_class_name.empty()) return; @@ -45,8 +55,22 @@ void BreakpointResolverScripted::CreateImplementationIfNeeded( return; TargetSP target_sp = breakpoint_sp->GetTargetSP(); - ScriptInterpreter *script_interp = target_sp->GetDebugger() - .GetScriptInterpreter(); + if (target_sp) + CreateImplementationIfNeeded(*target_sp.get(), breakpoint_sp); +} + +void BreakpointResolverScripted::CreateImplementationIfNeeded( + Target &target, BreakpointSP breakpoint_sp) { + if (m_interface_sp) { + if (!m_breakpoint_sent && breakpoint_sp) { + m_interface_sp->SetBreakpoint(breakpoint_sp); + m_breakpoint_sent = true; + } + return; + } + + ScriptInterpreter *script_interp = + target.GetDebugger().GetScriptInterpreter(); if (!script_interp) return; @@ -67,13 +91,28 @@ void BreakpointResolverScripted::CreateImplementationIfNeeded( m_error = Status::FromError(obj_or_err.takeError()); return; } - StructuredData::ObjectSP object_sp = *obj_or_err; if (!object_sp || !object_sp->IsValid()) { m_error = Status::FromErrorStringWithFormat( "ScriptedBreakpoint::%s () - ERROR: %s", __FUNCTION__, "Failed to create valid script object"); } + if (breakpoint_sp) + m_breakpoint_sent = true; +} + +bool BreakpointResolverScripted::OverridesResolver( + Target &target, BreakpointResolverSP original_sp) { + // At this point neither resolver has been assigned a breakpoint, so pass + // in an empty one. 
+ CreateImplementationIfNeeded(target, {}); + if (!m_interface_sp) + return false; + + StructuredData::ObjectSP serialized_sp = + original_sp->SerializeToStructuredData(); + StructuredDataImpl impl(serialized_sp); + return m_interface_sp->OverridesResolver(target, impl); } void BreakpointResolverScripted::NotifyBreakpointSet() { diff --git a/lldb/source/Breakpoint/CMakeLists.txt b/lldb/source/Breakpoint/CMakeLists.txt index de23c8737b324..a0ab1e1834403 100644 --- a/lldb/source/Breakpoint/CMakeLists.txt +++ b/lldb/source/Breakpoint/CMakeLists.txt @@ -16,6 +16,7 @@ add_lldb_library(lldbBreakpoint NO_PLUGIN_DEPENDENCIES BreakpointResolverName.cpp BreakpointResolverScripted.cpp BreakpointSite.cpp + ScriptedBreakpointOverrideResolver.cpp Stoppoint.cpp StoppointCallbackContext.cpp StoppointSite.cpp diff --git a/lldb/source/Breakpoint/ScriptedBreakpointOverrideResolver.cpp b/lldb/source/Breakpoint/ScriptedBreakpointOverrideResolver.cpp new file mode 100644 index 0000000000000..6adab02330770 --- /dev/null +++ b/lldb/source/Breakpoint/ScriptedBreakpointOverrideResolver.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
// FIXME: we should make sure the module and class exist, though that will
+  // have to happen in a scripting language specific function.
+ m_all_options.Append(&m_python_class_options, + LLDB_OPT_SET_1 | LLDB_OPT_SET_2, LLDB_OPT_SET_1); + m_all_options.Append(&m_dummy_options, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); + m_all_options.Append(&m_options, llvm::ArrayRef()); + m_all_options.Finalize(); + } + + ~CommandObjectBreakpointOverrideAdd() override = default; + + class CommandOptions : public OptionGroup { + public: + CommandOptions() = default; + + ~CommandOptions() override = default; + + Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg, + ExecutionContext *execution_context) override { + Status error; + const int short_option = GetDefinitions()[option_idx].short_option; + + switch (short_option) { + case 'd': + m_description.assign(std::string(option_arg)); + break; + default: + llvm_unreachable("Unimplemented option"); + } + + return error; + } + + void OptionParsingStarting(ExecutionContext *execution_context) override { + m_description.clear(); + } + + llvm::ArrayRef GetDefinitions() override { + return llvm::ArrayRef(g_breakpoint_override_add_options); + } + + // Instance variables to hold the values for command options. + + std::string m_description; + }; + Options *GetOptions() override { return &m_all_options; } + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target &target = + m_dummy_options.m_use_dummy ? 
GetDummyTarget() : GetTarget(); + llvm::Expected id = target.AddBreakpointResolverOverride( + m_python_class_options.GetName(), + m_python_class_options.GetStructuredData(), m_options.m_description); + if (id) { + result.AppendMessageWithFormatv("{0}", *id); + result.SetStatus(eReturnStatusSuccessFinishResult); + } else { + result.AppendErrorWithFormatv("could not add resolver: {0}.", + llvm::toString(id.takeError())); + } + } + +private: + BreakpointDummyOptionGroup m_dummy_options; + OptionGroupPythonClassWithDict m_python_class_options; + CommandOptions m_options; + OptionGroupOptions m_all_options; +}; + +class CommandObjectBreakpointOverrideDelete : public CommandObjectParsed { +public: + CommandObjectBreakpointOverrideDelete(CommandInterpreter &interpreter) + : CommandObjectParsed(interpreter, "breakpoint override delete", + "Delete a scripted breakpoint override resolver.", + nullptr) { + AddSimpleArgumentList(eArgTypeIndex, eArgRepeatOptional); + m_all_options.Append(&m_dummy_options, LLDB_OPT_SET_1, LLDB_OPT_SET_1); + m_all_options.Finalize(); + } + + ~CommandObjectBreakpointOverrideDelete() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target &target = + m_dummy_options.m_use_dummy ? 
GetDummyTarget() : GetTarget(); + + const size_t argc = command.GetArgumentCount(); + if (argc == 0) { + if (m_interpreter.Confirm("Delete all breakpoint overrides?", false)) { + target.ClearBreakpointResolverOverrides(); + } + result.SetStatus(eReturnStatusSuccessFinishNoResult); + return; + } + + for (auto &entry : command.entries()) { + uint64_t id; + bool success; + if (!entry.ref().getAsInteger(0, id)) + success = target.RemoveBreakpointResolverOverride(id); + else { + result.AppendErrorWithFormatv("Index not an integer: {0}", entry.ref()); + result.SetStatus(eReturnStatusFailed); + return; + } + if (!success) { + result.AppendErrorWithFormatv("Cannot delete override: {0}", id); + result.SetStatus(eReturnStatusFailed); + return; + } + } + result.SetStatus(eReturnStatusSuccessFinishNoResult); + } + +private: + BreakpointDummyOptionGroup m_dummy_options; + OptionGroupOptions m_all_options; +}; + +class CommandObjectBreakpointOverrideList : public CommandObjectParsed { +public: + CommandObjectBreakpointOverrideList(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "breakpoint override list", + "List the current scripted breakpoint override resolvers.", + nullptr) { + AddSimpleArgumentList(eArgTypeIndex, eArgRepeatOptional); + m_all_options.Append(&m_dummy_options, LLDB_OPT_SET_1, LLDB_OPT_SET_1); + m_all_options.Finalize(); + } + + ~CommandObjectBreakpointOverrideList() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target &target = + m_dummy_options.m_use_dummy ? 
  (lldb) breakpoint override add -P class_name
+ +List the currently added override resolvers using: + + (lldb) breakpoint override list + +Delete an added resolver using: + + (lldb) breakpoint override delete + +)"); + CommandObjectSP add_command_object( + new CommandObjectBreakpointOverrideAdd(interpreter)); + CommandObjectSP delete_command_object( + new CommandObjectBreakpointOverrideDelete(interpreter)); + CommandObjectSP list_command_object( + new CommandObjectBreakpointOverrideList(interpreter)); + + LoadSubCommand("add", add_command_object); + LoadSubCommand("delete", delete_command_object); + LoadSubCommand("list", list_command_object); + } + + ~CommandObjectBreakpointOverride() override = default; +}; + // CommandObjectMultiwordBreakpoint #pragma mark MultiwordBreakpoint @@ -3618,6 +3853,8 @@ CommandObjectMultiwordBreakpoint::CommandObjectMultiwordBreakpoint( new CommandObjectBreakpointWrite(interpreter)); CommandObjectSP read_command_object( new CommandObjectBreakpointRead(interpreter)); + CommandObjectSP override_command_object( + new CommandObjectBreakpointOverride(interpreter)); list_command_object->SetCommandName("breakpoint list"); enable_command_object->SetCommandName("breakpoint enable"); @@ -3631,6 +3868,7 @@ CommandObjectMultiwordBreakpoint::CommandObjectMultiwordBreakpoint( name_command_object->SetCommandName("breakpoint name"); write_command_object->SetCommandName("breakpoint write"); read_command_object->SetCommandName("breakpoint read"); + override_command_object->SetCommandName("breakpoint override"); LoadSubCommand("list", list_command_object); LoadSubCommand("enable", enable_command_object); @@ -3644,6 +3882,7 @@ CommandObjectMultiwordBreakpoint::CommandObjectMultiwordBreakpoint( LoadSubCommand("name", name_command_object); LoadSubCommand("write", write_command_object); LoadSubCommand("read", read_command_object); + LoadSubCommand("override", override_command_object); } CommandObjectMultiwordBreakpoint::~CommandObjectMultiwordBreakpoint() = default; diff --git 
a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index 123ba7bdb257e..1f525c07f852a 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -485,6 +485,10 @@ let Command = "breakpoint add scripted" in { "repeat this option multiple times to specify multiple shared libraries.">; } +let Command = "breakpoint override add" in { + def breakpoint_override_add_description : Option<"description", "d">, + Desc<"Description string for this override resolver">, Arg<"HelpText">; +} let Command = "breakpoint clear" in { def breakpoint_clear_file : Option<"file", "f">, diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 448c3714a7a05..8dafb7f823e70 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -163,6 +163,11 @@ lldb::StackFrameListSP ScriptInterpreter::GetOpaqueTypeFromSBFrameList( return frame_list.m_opaque_sp; } +lldb::TargetSP ScriptInterpreter::GetOpaqueTypeFromSBTarget( + const lldb::SBTarget &target) const { + return target.m_opaque_sp; +} + lldb::ValueObjectSP ScriptInterpreter::GetOpaqueTypeFromSBValue(const lldb::SBValue &value) const { if (!value.m_opaque_sp) diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp index 077a4481e0c63..300d856e28d60 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp @@ -11,6 +11,8 @@ #include "../lldb-python.h" //clang-format on +#include "lldb/Breakpoint/Breakpoint.h" +#include "lldb/Breakpoint/BreakpointResolverScripted.h" #include "lldb/Core/PluginManager.h" #include "lldb/Symbol/SymbolContext.h" #include "lldb/Target/ExecutionContext.h" @@ -37,6 +39,35 @@ 
ScriptedBreakpointPythonInterface::CreatePluginObject( break_sp, args_sp); } +bool ScriptedBreakpointPythonInterface::OverridesResolver( + Target &target, StructuredDataImpl &resolver_data) { + Status error; + + TargetSP target_sp = target.shared_from_this(); + + StructuredData::ObjectSP obj = + Dispatch("overrides_resolver", error, target_sp, resolver_data); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) { + Log *log = GetLog(LLDBLog::Script); + LLDB_LOG(log, "Error calling overrides_resolver method: {0}", error); + return false; + } + return obj->GetBooleanValue(); +} + +void ScriptedBreakpointPythonInterface::SetBreakpoint( + lldb::BreakpointSP break_sp) { + Status error; + StructuredData::ObjectSP obj = Dispatch("set_breakpoint", error, break_sp); + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) { + Log *log = GetLog(LLDBLog::Script); + LLDB_LOG(log, "Error calling set_breakpoint method: {0}", error); + } +} + bool ScriptedBreakpointPythonInterface::ResolverCallback( SymbolContext sym_ctx) { Status error; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h index 34e63792fb257..56b80fd863939 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h @@ -13,8 +13,8 @@ #include "lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h" #include "ScriptedPythonInterface.h" - namespace lldb_private { + class ScriptedBreakpointPythonInterface : public ScriptedBreakpointInterface, public ScriptedPythonInterface, public PluginInterface { @@ -30,6 +30,10 @@ class ScriptedBreakpointPythonInterface : public ScriptedBreakpointInterface, return llvm::SmallVector({{"__callback__", 2}}); } + bool 
OverridesResolver(Target &target, + StructuredDataImpl &resolver_data) override; + void SetBreakpoint(lldb::BreakpointSP break_sp) override; + bool ResolverCallback(SymbolContext sym_ctx) override; lldb::SearchDepth GetDepth() override; std::optional GetShortHelp() override; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp index 5e134042c6ffb..510cbccb20ce1 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp @@ -289,6 +289,21 @@ ScriptedPythonInterface::ExtractValueFromPythonObject( return m_interpreter.GetOpaqueTypeFromSBValue(*sb_value); } +template <> +lldb::TargetSP +ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error) { + lldb::SBTarget *sb_target = reinterpret_cast( + python::LLDBSWIGPython_CastPyObjectToSBTarget(p.get())); + if (!sb_target) { + error = Status::FromErrorStringWithFormat( + "couldn't cast lldb::SBTarget to lldb::TargetSP"); + return {}; + } + + return m_interpreter.GetOpaqueTypeFromSBTarget(*sb_target); +} + template <> lldb::ValueObjectListSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index 37597d059ca90..637ca1b2ab1f9 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -818,6 +818,11 @@ lldb::ValueObjectSP ScriptedPythonInterface::ExtractValueFromPythonObject( python::PythonObject &p, Status &error); +template <> +lldb::TargetSP +ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, 
Status &error); + template <> lldb::ValueObjectListSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 953240449be5b..4d20703eb2e5c 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -271,6 +271,7 @@ void *LLDBSWIGPython_CastPyObjectToSBValueList(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data); +void *LLDBSWIGPython_CastPyObjectToSBTarget(PyObject *data); } // namespace python } // namespace lldb_private diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 86ce97c37dad3..427765de6d221 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -16,6 +16,7 @@ #include "lldb/Breakpoint/BreakpointResolverFileRegex.h" #include "lldb/Breakpoint/BreakpointResolverName.h" #include "lldb/Breakpoint/BreakpointResolverScripted.h" +#include "lldb/Breakpoint/ScriptedBreakpointOverrideResolver.h" #include "lldb/Breakpoint/Watchpoint.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" @@ -240,6 +241,13 @@ void Target::PrimeFromDummyTarget(Target &target) { AddBreakpointName(std::make_unique(*bp_name_entry.second)); } + for (auto const &elem : target.m_breakpoint_overrides) { + BreakpointResolverOverrideUP new_override_up = + elem.second->CopyIntoNewTarget(*this); + if (new_override_up->Validate()) + AddBreakpointResolverOverride(std::move(new_override_up)); + } + m_frame_recognizer_manager_up = std::make_unique( *target.m_frame_recognizer_manager_up); @@ -800,6 +808,15 @@ BreakpointSP Target::CreateBreakpoint(SearchFilterSP &filter_sp, bool resolve_indirect_symbols) { BreakpointSP bp_sp; if 
// if there are, see if one of them wants to handle this request instead.
# Add our trivial one first so we test more than one list element:
interp = self.dbg.GetCommandInterpreter() + error = lldb.SBError() + + script_name = os.path.join(self.getSourceDir(), "bkpt_resolver.py") + + command = "command script import " + script_name + self.runCmd(command) + + def add_override(self, use_cmd, help_text, class_name, key, value): + if use_cmd: + result = lldb.SBCommandReturnObject() + self.ci.HandleCommand( + f"breakpoint override add -P {class_name} -k {key} -v {value} -d '{help_text}'", + result, + ) + self.assertCommandReturn(result, "breakpoint override worked") + override_id = int(result.GetOutput()) + else: + extra_args = lldb.SBStructuredData() + json_str = '{"' + key + '":"' + value + '"}' + extra_args.SetFromJSON(json_str) + error = lldb.SBError() + override_id = target.AddBreakpointOverride( + class_name, help_text, extra_args, error + ) + self.assertError(error, "Made the override successfully") + + # Check the override listing, make sure our new entry is present: + self.expect("breakpoint override list", substrs=[str(override_id), help_text]) + + return override_id + + def do_test(self, use_cmd): + """This reads in a python file and sets a breakpoint using it.""" + alternate_location = "stop_here_instead" + target = self.make_target_and_import() + # Add our trivial one first so we test more than one list element: + + trivial_help = "Trivial help text" + trivial_id = self.add_override( + use_cmd, + trivial_help, + "bkpt_resolver.TrivialExample", + "test_key", + "test_value", + ) + + useful_help = "SOME HELP TEXT" + useful_id = self.add_override( + use_cmd, + useful_help, + "bkpt_resolver.OverrideExample", + "symbol", + "stop_here_instead", + ) + + # Now exercise the list command by id: + self.expect( + f"breakpoint override list {trivial_id}", + substrs=[str(useful_id), useful_help], + matching=False, + ) + self.expect( + f"breakpoint override list {trivial_id}", + substrs=[str(trivial_id), trivial_help], + ) + self.expect( + f"breakpoint override list {useful_id}", + substrs=[str(trivial_id), 
trivial_help], + matching=False, + ) + self.expect( + f"breakpoint override list {useful_id}", + substrs=[str(useful_id), useful_help], + ) + + # Now make a breakpoint by file and line: + # FIXME: Use source_line to find this line number: + bkpt = target.BreakpointCreateByLocation( + "main.c", line_number("main.c", "I am in the stop symbol") + ) + self.assertEqual(bkpt.GetNumLocations(), 1, "We make one location") + # Now continue and we'll hit this breakpoint but not in the + # right place: + (target, process, thread, bkpt) = lldbutil.run_to_breakpoint_do_run( + self, target, bkpt + ) + # This location should be bkpt_no.1: + self.assertEqual( + thread.stop_reason_data[0], bkpt.GetID(), "Hit the right breakpoint" + ) + self.assertEqual(thread.stop_reason_data[1], 1, "First location hit is 1") + func_name = thread.frames[0].name + self.assertEqual( + func_name, alternate_location, "Stopped at overridden location" + ) + + # Now set a source name breakpoint, that should not get overridden, and + # when we continue we should hit it: + name_bkpt = target.BreakpointCreateByName("change_him") + self.assertGreater(name_bkpt.GetNumLocations(), 0, "Found locations") + threads = lldbutil.continue_to_breakpoint(process, name_bkpt) + self.assertEqual(len(threads), 1, "Hit our name breakpoint") + func_name = threads[0].frames[0].name + self.assertEqual(func_name, "change_him", "Stopped in the right place") + + # Now delete the override and make sure we hit newly set + # source breakpoints: + if use_cmd: + self.runCmd(f"breakpoint override delete {useful_id}") + else: + self.assertTrue( + target.DeleteBreakpointOverride(useful_id), "Delete the right one" + ) + + # Make sure it's gone from the listings: + self.expect( + "breakpoint override list", + substrs=[str(useful_id), useful_help], + matching=False, + ) + # And that listing it is an error: + self.expect(f"breakpoint override list {useful_id}", error=True) + + new_bkpt = target.BreakpointCreateByLocation( + "main.c", 
line_number("main.c", "return 0") + ) + self.assertEqual(new_bkpt.num_locations, 1, "Made breakpoint") + threads = lldbutil.continue_to_breakpoint(process, new_bkpt) + self.assertEqual(len(threads), 1, "Hit our new breakpoint") + func_name = threads[0].frames[0].name + self.assertEqual(func_name, "main", "Stopped in unchanged location") diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/bkpt_resolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/bkpt_resolver.py new file mode 100644 index 0000000000000..181e2100444f8 --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/bkpt_resolver.py @@ -0,0 +1,67 @@ +import lldb + + +class OverrideExample: + def __init__( + self, bkpt: lldb.SBBreakpoint, extra_args: lldb.SBStructuredData, dict + ): + self.bkpt = bkpt + self.extra_args = extra_args + self.set_bkpt = False + symbol_value = extra_args.GetValueForKey("symbol") + self.alternate_loc = symbol_value.GetStringValue(1000) + + def __callback__(self, sym_ctx: lldb.SBSymbolContext): + """This callback only sets a breakpoint in one place, + no matter what file and line you ask for""" + if self.set_bkpt == True: + return + # FIXME: Do this better... + alternate_sym_list = sym_ctx.module.FindFunctions(self.alternate_loc) + if len(alternate_sym_list.symbols) == 0: + return + alternate_sym = alternate_sym_list.symbols[0] + start_addr = alternate_sym.addr + self.bkpt.AddLocation(start_addr) + self.set_bkpt = True + + def get_short_help(self): + return f"I am an override resolver, resolving to {self.alternate_loc}." 
+ + def set_breakpoint(self, bkpt: lldb.SBBreakpoint): + self.bkpt = bkpt + + def overrides_resolver( + self, target: lldb.SBTarget, initial_resolver: lldb.SBStructuredData + ): + strm = lldb.SBStream() + initial_resolver.GetAsJSON(strm) + type = initial_resolver.GetValueForKey("Type").GetStringValue(1000) + if type == "FileAndLine": + return True + return False + + +class TrivialExample: + def __init__( + self, bkpt: lldb.SBBreakpoint, extra_args: lldb.SBStructuredData, dict + ): + self.bkpt = bkpt + self.extra_args = extra_args + self.set_bkpt = False + + def __callback__(self, sym_ctx: lldb.SBSymbolContext): + """This one's trivial, it does nothing""" + return + + def get_short_help(self): + return f"I am a trivial resolver, doing nothing." + + def set_breakpoint(self, bkpt: lldb.SBBreakpoint): + self.bkpt = bkpt + + def overrides_resolver( + self, target: lldb.SBTarget, initial_resolver: lldb.SBStructuredData + ): + """Trivial - overrides nothing""" + return False diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/main.c b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/main.c new file mode 100644 index 0000000000000..b9fc619f46ff1 --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/main.c @@ -0,0 +1,20 @@ +#include <stdio.h> + +int g_change_me = 0; + +int change_him() { return ++g_change_me; } + +void stop_here_instead() { printf("Stopped here instead?\n"); } + +int stop_symbol() { + static int s_cnt = 0; + printf("I am in the stop symbol: %d\n", s_cnt++); + stop_here_instead(); + return s_cnt; +} + +int main() { + stop_symbol(); + change_him(); + return 0; +} diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 5694aeeff3e5b..1ed6bee384a84 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ 
-171,6 +171,11 @@ lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) { return nullptr; } +void * +lldb_private::python::LLDBSWIGPython_CastPyObjectToSBTarget(PyObject *data) { + return nullptr; +} + lldb::ValueObjectSP lldb_private::python::SWIGBridge::LLDBSWIGPython_GetValueObjectSPFromSBValue( void *data) { From 91b9a5d4feb4438ddb04c2283ed9bcb93846876f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:02:13 -0400 Subject: [PATCH 368/538] [gn] port dbf927a6e3ff4b (llvm-extract-bundle-entry) (#197053) --- llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn index bba53d357b13f..56760661a9c61 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn @@ -24,10 +24,16 @@ tablegen("StripOpts") { args = [ "-gen-opt-parser-defs" ] } +tablegen("ExtractBundleEntryOpts") { + visibility = [ ":llvm-objcopy" ] + args = [ "-gen-opt-parser-defs" ] +} + symlinks = [ "llvm-bitcode-strip", "llvm-install-name-tool", "llvm-strip", + "llvm-extract-bundle-entry", ] if (llvm_install_binutils_symlinks) { symlinks += [ @@ -63,6 +69,7 @@ driver_executable("llvm-objcopy") { ":InstallNameToolOpts", ":ObjcopyOpts", ":StripOpts", + ":ExtractBundleEntryOpts", "//llvm/lib/MC", "//llvm/lib/ObjCopy", "//llvm/lib/Object", From 6f10d3fe22a95e378a858a7380766c0dbf5dc156 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:02:35 -0400 Subject: [PATCH 369/538] [gn build] Port 2162c1692413 (#197054) --- .../secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + .../gn/secondary/clang-tools-extra/clang-tidy/hicpp/BUILD.gn | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn 
b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 3503da0ab7985..a1c602a158139 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -114,6 +114,7 @@ static_library("bugprone") { "UncheckedStringToNumberConversionCheck.cpp", "UndefinedMemoryManipulationCheck.cpp", "UndelegatedConstructorCheck.cpp", + "UnhandledCodePathsCheck.cpp", "UnhandledExceptionAtNewCheck.cpp", "UnhandledSelfAssignmentCheck.cpp", "UnintendedCharOstreamOutputCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/hicpp/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/hicpp/BUILD.gn index 33389108716d9..dbb869655af57 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/hicpp/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/hicpp/BUILD.gn @@ -18,8 +18,5 @@ static_library("hicpp") { "//clang/lib/Serialization", "//llvm/lib/Support", ] - sources = [ - "HICPPTidyModule.cpp", - "MultiwayPathsCoveredCheck.cpp", - ] + sources = [ "HICPPTidyModule.cpp" ] } From fb66f2d7086764bfd91e3f32e36e5112f53c98c1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:03:09 -0400 Subject: [PATCH 370/538] [gn build] Port 2f4c387147f1 (#197055) --- llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 1 - llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 80dce9234a1bb..2141d86340216 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -97,7 +97,6 @@ static_library("Support") { "FormattedStream.cpp", "GlobPattern.cpp", "GraphWriter.cpp", - "Hash.cpp", "HexagonAttributeParser.cpp", "HexagonAttributes.cpp", "InitLLVM.cpp", diff --git 
a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn index 1ea1b37e9c87e..a419a5bed69fb 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn @@ -48,6 +48,7 @@ static_library("Utils") { "InstructionNamer.cpp", "Instrumentation.cpp", "IntegerDivision.cpp", + "KCFIHash.cpp", "LCSSA.cpp", "LibCallsShrinkWrap.cpp", "Local.cpp", From 12e8aebd6f61542690249f3079aae473ff1a2bea Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:03:38 -0400 Subject: [PATCH 371/538] [gn build] Port 3a7c0eba9bf8 (#197056) --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 9b02e64161ba2..f1d7b37ae6d88 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -173,6 +173,7 @@ copy("Headers") { "amxmovrsintrin.h", "amxtf32intrin.h", "andes_vector.h", + "arm64_neon.h", "arm64intr.h", "arm_acle.h", "arm_cmse.h", From c6ddfb9c9d9da5b89a88e42a6d9edfca7aa8c5da Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:04:57 -0400 Subject: [PATCH 372/538] [gn build] Port 457380cee197 (#197057) --- llvm/utils/gn/secondary/lldb/source/Breakpoint/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/source/Breakpoint/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Breakpoint/BUILD.gn index 24a1657759c51..62bc88edd6fde 100644 --- a/llvm/utils/gn/secondary/lldb/source/Breakpoint/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Breakpoint/BUILD.gn @@ -30,6 +30,7 @@ static_library("Breakpoint") { "BreakpointResolverName.cpp", "BreakpointResolverScripted.cpp", "BreakpointSite.cpp", + "ScriptedBreakpointOverrideResolver.cpp", "StopPointSiteList.cpp", "Stoppoint.cpp", 
"StoppointCallbackContext.cpp", From 5b2ef822228fb7e45e368cf6f810904084c18e9c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:05:18 -0400 Subject: [PATCH 373/538] [gn build] Port 5022a168a7a9 (#197058) --- .../gn/secondary/clang-tools-extra/clang-tidy/google/BUILD.gn | 1 - .../gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/google/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/google/BUILD.gn index 6b82610bdc1ac..e352834c53f98 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/google/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/google/BUILD.gn @@ -16,7 +16,6 @@ static_library("google") { "AvoidThrowingObjCExceptionCheck.cpp", "AvoidUnderscoreInGoogletestNameCheck.cpp", "DefaultArgumentsCheck.cpp", - "ExplicitConstructorCheck.cpp", "ExplicitMakePairCheck.cpp", "FloatTypesCheck.cpp", "FunctionNamingCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn index b63e63b41b1db..7b48e0d459665 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn @@ -37,6 +37,7 @@ static_library("misc") { "ConstCorrectnessCheck.cpp", "CoroutineHostileRAIICheck.cpp", "DefinitionsInHeadersCheck.cpp", + "ExplicitConstructorCheck.cpp", "HeaderIncludeCycleCheck.cpp", "IncludeCleanerCheck.cpp", "MiscTidyModule.cpp", From 8a89800cfc675473a3a4d9b3659d22cbd852db08 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:05:36 -0400 Subject: [PATCH 374/538] [gn build] Port 899663966c7e (#197059) --- llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn 
b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn index 4e2787b68089e..f824c486b0a26 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -131,6 +131,7 @@ static_library("LLVMAArch64CodeGen") { "AArch64BranchTargets.cpp", "AArch64CallingConvention.cpp", "AArch64CleanupLocalDynamicTLSPass.cpp", + "AArch64CodeLayoutOpt.cpp", "AArch64CollectLOH.cpp", "AArch64CompressJumpTables.cpp", "AArch64CondBrTuning.cpp", From 65c1ad22c8af7fdfe0a445587af3ba154e644be1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:05:55 -0400 Subject: [PATCH 375/538] [gn build] Port e361f28b7589 (#197060) --- llvm/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn index 0708f2a1897f5..e25050ff662c0 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ObjectYAML/BUILD.gn @@ -10,6 +10,7 @@ static_library("ObjectYAML") { sources = [ "ArchiveEmitter.cpp", "ArchiveYAML.cpp", + "BBAddrMapYAML.cpp", "COFFEmitter.cpp", "COFFYAML.cpp", "CodeViewYAMLDebugSections.cpp", From 066ebe48e9a79f79307793d83932a1e01d899f12 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:12:59 -0400 Subject: [PATCH 376/538] [gn] port 7c7f5be3560d4 (#197063) --- .../utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn index 1e2d0bef08e4d..dc6208965eaf1 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn @@ -14,7 +14,10 @@ unittest("AMDGPUTests") { "//llvm/lib/Target/AMDGPU/Utils", "//llvm/lib/TargetParser", ] - 
include_dirs = [ "//llvm/lib/Target/AMDGPU" ] + include_dirs = [ + "//llvm/lib/Target/AMDGPU", + "//llvm/unittests/CodeGen", + ] sources = [ "AMDGPUUnitTests.cpp", "CSETest.cpp", From 83965bb82fdd15b3b552870f430acd890207f71b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 19:24:35 -0400 Subject: [PATCH 377/538] [gn] port c5bc0a2cdb4dd5 (clang-ssaf-analyzer) (#197065) --- llvm/utils/gn/secondary/clang/test/BUILD.gn | 1 + .../clang/tools/clang-ssaf-analyzer/BUILD.gn | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 llvm/utils/gn/secondary/clang/tools/clang-ssaf-analyzer/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index d820a23f8f7a1..73f3bd020a1e0 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn @@ -164,6 +164,7 @@ group("test") { "//clang/tools/clang-refactor", "//clang/tools/clang-repl", "//clang/tools/clang-scan-deps", + "//clang/tools/clang-ssaf-analyzer", "//clang/tools/clang-ssaf-format", "//clang/tools/clang-ssaf-linker", "//clang/tools/clang-sycl-linker", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-ssaf-analyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-ssaf-analyzer/BUILD.gn new file mode 100644 index 0000000000000..289f2fd73b58d --- /dev/null +++ b/llvm/utils/gn/secondary/clang/tools/clang-ssaf-analyzer/BUILD.gn @@ -0,0 +1,12 @@ +executable("clang-ssaf-analyzer") { + configs += [ "//llvm/utils/gn/build:clang_code" ] + deps = [ + "//clang/lib/Basic", + "//clang/lib/ScalableStaticAnalysisFramework/Analyses", + "//clang/lib/ScalableStaticAnalysisFramework/Core", + "//clang/lib/ScalableStaticAnalysisFramework/Tool", + "//llvm/lib/Option", + "//llvm/lib/Support", + ] + sources = [ "SSAFAnalyzer.cpp" ] +} From 2850716954ade2e8dcfe5571b484f53f8deac374 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 11 May 2026 16:30:45 -0700 Subject: [PATCH 378/538] [CIR] Force deferred 
conditional cleanup emission (#197042) We had a bug in CIR where we were pushing cleanups on the deferredConditionalCleanupStack and never popping them. This was because we weren't wrapping the full expressions that produced them with the correct RAII object to force these cleanups to be emitted at the end of the expression in some cases. This change adds the proper enclosing RAII object and adds the code to correctly spill and reload values when that is needed to avoid dominance problems. Assisted-by: Cursor / claude-opus-4.7-thinking-xhigh --- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 14 +- .../test/CIR/CodeGen/cleanup-conditional.cpp | 320 ++++++++++++++++++ 3 files changed, 334 insertions(+), 4 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 6a26b2c987f3e..08a7def1022d3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -200,11 +200,11 @@ class ComplexExprEmitter : public StmtVisitor { return Visit(die->getExpr()); } mlir::Value VisitExprWithCleanups(ExprWithCleanups *e) { - CIRGenFunction::RunCleanupsScope scope(cgf); + CIRGenFunction::FullExprCleanupScope scope(cgf, e->getSubExpr()); mlir::Value complexVal = Visit(e->getSubExpr()); // Defend against dominance problems caused by jumps out of expression // evaluation through the shared cleanup block. 
- scope.forceCleanup({&complexVal}); + scope.exit({&complexVal}); return complexVal; } mlir::Value VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 3e38f678a7474..cda37513a4feb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -1191,9 +1191,19 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitCallExprLValue(cast(e)); case Expr::ExprWithCleanupsClass: { const auto *cleanups = cast(e); - RunCleanupsScope scope(*this); + FullExprCleanupScope scope(*this, cleanups->getSubExpr()); LValue lv = emitLValue(cleanups->getSubExpr()); - assert(!cir::MissingFeatures::cleanupWithPreservedValues()); + if (lv.isSimple()) { + // Defend against branches out of gnu statement expressions surrounded by + // cleanups. + Address addr = lv.getAddress(); + mlir::Value v = addr.getPointer(); + scope.exit({&v}); + return LValue::makeAddr(addr.withPointer(v), lv.getType(), + lv.getBaseInfo()); + } + // FIXME: Is it possible to create an ExprWithCleanups that produces a + // bitfield lvalue or some other non-simple lvalue? return lv; } case Expr::CXXDefaultArgExprClass: { diff --git a/clang/test/CIR/CodeGen/cleanup-conditional.cpp b/clang/test/CIR/CodeGen/cleanup-conditional.cpp index 9b38e8278d459..7028b1990b873 100644 --- a/clang/test/CIR/CodeGen/cleanup-conditional.cpp +++ b/clang/test/CIR/CodeGen/cleanup-conditional.cpp @@ -554,3 +554,323 @@ void test_nested_ewc(bool c1, bool c2) { // OGCG: [[OUTER_MERGE2]]: // OGCG: call void @_ZN1TD1Ev({{.*}} %[[REF_TMP]]) // OGCG: call void @_ZN1TD1Ev({{.*}} %[[RESULT]]) + +// The result of the ternary is bound to an lvalue (the parameter of +// operator=), so the enclosing ExprWithCleanups is lowered through the +// LValue emission path. 
The lvalue path must still open a +// FullExprCleanupScope so that conditional cleanups deferred by the +// ternary (here, the D temporary created by the default argument inside +// each branch) are consumed before the full-expression boundary. +struct U { + U(); + ~U(); +}; + +struct V { + V(int, const U & = U()); + ~V(); + V &operator=(const V &); +}; + +void test_lvalue_ternary_cleanup(bool c, V &y) { + y = c ? V(1) : V(2); +} +// CIR-LABEL: @_Z27test_lvalue_ternary_cleanupbR1V +// CIR: %[[REFTMP:.*]] = cir.alloca !rec_V, !cir.ptr, ["ref.tmp0"] +// CIR: %[[UTRUE:.*]] = cir.alloca !rec_U, !cir.ptr, ["ref.tmp1"] +// CIR: %[[ACTTRUE:.*]] = cir.alloca !cir.bool, !cir.ptr, ["cleanup.cond"] +// CIR: %[[UFALSE:.*]] = cir.alloca !rec_U, !cir.ptr, ["ref.tmp2"] +// CIR: %[[ACTFALSE:.*]] = cir.alloca !cir.bool, !cir.ptr, ["cleanup.cond"] +// The outer cleanup scope wraps the full expression containing the ternary +// and the operator= call. +// CIR: cir.cleanup.scope { +// Both cleanup flags initialized to false before the ternary. +// CIR: cir.store {{.*}}, %[[ACTTRUE]] : !cir.bool, !cir.ptr +// CIR: cir.store {{.*}}, %[[ACTFALSE]] : !cir.bool, !cir.ptr +// CIR: cir.if %{{.*}} { +// CIR: cir.call @_ZN1UC1Ev(%[[UTRUE]]) +// CIR: cir.store {{.*}}, %[[ACTTRUE]] : !cir.bool, !cir.ptr +// CIR: cir.call @_ZN1VC1EiRK1U(%[[REFTMP]], %{{.*}}, %[[UTRUE]]) +// CIR: } else { +// CIR: cir.call @_ZN1UC1Ev(%[[UFALSE]]) +// CIR: cir.store {{.*}}, %[[ACTFALSE]] : !cir.bool, !cir.ptr +// CIR: cir.call @_ZN1VC1EiRK1U(%[[REFTMP]], %{{.*}}, %[[UFALSE]]) +// CIR: } +// Inner cleanup scope for the operator= call destroys the ref.tmp. +// CIR: cir.cleanup.scope { +// CIR: cir.call @_ZN1VaSERKS_(%{{.*}}, %[[REFTMP]]) +// CIR: cir.yield +// CIR: } cleanup normal { +// CIR: cir.call @_ZN1VD1Ev(%[[REFTMP]]) +// CIR: cir.yield +// CIR: } +// CIR: cir.yield +// Outer cleanup region: conditionally destroy U temporaries by active flag. 
+// CIR: } cleanup normal { +// CIR: %[[F2:.*]] = cir.load {{.*}} %[[ACTFALSE]] +// CIR: cir.if %[[F2]] { +// CIR: cir.call @_ZN1UD1Ev(%[[UFALSE]]) +// CIR: } +// CIR: %[[F1:.*]] = cir.load {{.*}} %[[ACTTRUE]] +// CIR: cir.if %[[F1]] { +// CIR: cir.call @_ZN1UD1Ev(%[[UTRUE]]) +// CIR: } +// CIR: cir.yield +// CIR: } + +// LLVM-LABEL: define dso_local void @_Z27test_lvalue_ternary_cleanupbR1V( +// LLVM: %[[REFTMP:.*]] = alloca %struct.V +// LLVM: %[[UTRUE:.*]] = alloca %struct.U +// LLVM: %[[ACTTRUE:.*]] = alloca i8 +// LLVM: %[[UFALSE:.*]] = alloca %struct.U +// LLVM: %[[ACTFALSE:.*]] = alloca i8 +// LLVM: store i8 0, ptr %[[ACTTRUE]] +// LLVM: store i8 0, ptr %[[ACTFALSE]] +// LLVM: br i1 %{{.*}}, label %[[CONS_TRUE:.*]], label %[[CONS_FALSE:.*]] +// LLVM: [[CONS_TRUE]]: +// LLVM: call void @_ZN1UC1Ev({{.*}} %[[UTRUE]]) +// LLVM: store i8 1, ptr %[[ACTTRUE]] +// LLVM: call void @_ZN1VC1EiRK1U({{.*}} %[[REFTMP]], i32 {{.*}} 1, {{.*}} %[[UTRUE]]) +// LLVM: br label %[[MERGE:.*]] +// LLVM: [[CONS_FALSE]]: +// LLVM: call void @_ZN1UC1Ev({{.*}} %[[UFALSE]]) +// LLVM: store i8 1, ptr %[[ACTFALSE]] +// LLVM: call void @_ZN1VC1EiRK1U({{.*}} %[[REFTMP]], i32 {{.*}} 2, {{.*}} %[[UFALSE]]) +// LLVM: br label %[[MERGE]] +// LLVM: [[MERGE]]: +// LLVM: call {{.*}} ptr @_ZN1VaSERKS_({{.*}}, {{.*}} %[[REFTMP]]) +// LLVM: call void @_ZN1VD1Ev({{.*}} %[[REFTMP]]) +// LLVM: %[[F2_BYTE:.*]] = load i8, ptr %[[ACTFALSE]] +// LLVM: %[[F2:.*]] = trunc i8 %[[F2_BYTE]] to i1 +// LLVM: br i1 %[[F2]], label %[[DTOR_F:.*]], label %[[SKIP_F:.*]] +// LLVM: [[DTOR_F]]: +// LLVM: call void @_ZN1UD1Ev({{.*}} %[[UFALSE]]) +// LLVM: br label %[[SKIP_F]] +// LLVM: [[SKIP_F]]: +// LLVM: %[[F1_BYTE:.*]] = load i8, ptr %[[ACTTRUE]] +// LLVM: %[[F1:.*]] = trunc i8 %[[F1_BYTE]] to i1 +// LLVM: br i1 %[[F1]], label %[[DTOR_T:.*]], label %[[SKIP_T:.*]] +// LLVM: [[DTOR_T]]: +// LLVM: call void @_ZN1UD1Ev({{.*}} %[[UTRUE]]) +// LLVM: br label %[[SKIP_T]] + +// OGCG-LABEL: define dso_local void 
@_Z27test_lvalue_ternary_cleanupbR1V( +// OGCG: store i1 false, ptr %[[ACTTRUE:.*]] +// OGCG: store i1 false, ptr %[[ACTFALSE:.*]] +// OGCG: br i1 %{{.*}}, label %[[CTRUE:.*]], label %[[CFALSE:.*]] +// OGCG: [[CTRUE]]: +// OGCG: call void @_ZN1UC1Ev({{.*}} %[[UTRUE:.*]]) +// OGCG: store i1 true, ptr %[[ACTTRUE]] +// OGCG: call void @_ZN1VC1EiRK1U({{.*}} %[[REFTMP:.*]], i32 {{.*}} 1, {{.*}} %[[UTRUE]]) +// OGCG: br label %[[MERGE:.*]] +// OGCG: [[CFALSE]]: +// OGCG: call void @_ZN1UC1Ev({{.*}} %[[UFALSE:.*]]) +// OGCG: store i1 true, ptr %[[ACTFALSE]] +// OGCG: call void @_ZN1VC1EiRK1U({{.*}} %[[REFTMP]], i32 {{.*}} 2, {{.*}} %[[UFALSE]]) +// OGCG: br label %[[MERGE]] +// OGCG: [[MERGE]]: +// OGCG: call {{.*}} ptr @_ZN1VaSERKS_({{.*}}, {{.*}} %[[REFTMP]]) +// OGCG: call void @_ZN1VD1Ev({{.*}} %[[REFTMP]]) +// OGCG: br i1 %{{.*}}, label %[[DTOR_F:.*]], label %[[AFTER_F:.*]] +// OGCG: [[DTOR_F]]: +// OGCG: call void @_ZN1UD1Ev({{.*}} %[[UFALSE]]) +// OGCG: br label %[[AFTER_F]] +// OGCG: [[AFTER_F]]: +// OGCG: br i1 %{{.*}}, label %[[DTOR_T:.*]], label %[[AFTER_T:.*]] +// OGCG: [[DTOR_T]]: +// OGCG: call void @_ZN1UD1Ev({{.*}} %[[UTRUE]]) +// OGCG: br label %[[AFTER_T]] + +// When an ExprWithCleanups produces an lvalue whose base pointer is computed +// *inside* the FullExprCleanupScope (here, via the `.field` GEP on the +// conditional's lvalue result), the lvalue path must spill the base pointer +// before the scope closes and reload it afterward so that uses outside the +// scope (the reference binding to `r`) see a dominating SSA value. +struct R { + R(int); + R(const R &); + ~R(); + int field; +}; + +R &pickR(const R &x); + +int *sink; + +void test_lvalue_reload(bool c) { + int &r = (c ? 
pickR(R(1)) : pickR(R(2))).field; + sink = &r; +} +// CIR-LABEL: @_Z18test_lvalue_reloadb +// CIR: %[[R_REF:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] +// CIR: %[[TMP0:.*]] = cir.alloca !rec_R, !cir.ptr, ["ref.tmp0"] +// CIR: %[[ACT0:.*]] = cir.alloca !cir.bool, !cir.ptr, ["cleanup.cond"] +// CIR: %[[TMP1:.*]] = cir.alloca !rec_R, !cir.ptr, ["ref.tmp1"] +// CIR: %[[ACT1:.*]] = cir.alloca !cir.bool, !cir.ptr, ["cleanup.cond"] +// The spill slot for the lvalue's base pointer. +// CIR: %[[SPILL:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["tmp.exprcleanup"] +// CIR: cir.cleanup.scope { +// CIR: cir.store {{.*}}, %[[ACT0]] : !cir.bool, !cir.ptr +// CIR: cir.store {{.*}}, %[[ACT1]] : !cir.bool, !cir.ptr +// The ternary returns an !cir.ptr lvalue from pickR(). +// CIR: %[[TERN:.*]] = cir.ternary({{.*}}, true { +// CIR: cir.call @_ZN1RC1Ei(%[[TMP0]], %{{.*}}) +// CIR: cir.store {{.*}}, %[[ACT0]] +// CIR: %[[CALL_T:.*]] = cir.call @_Z5pickRRK1R(%[[TMP0]]) +// CIR: cir.yield %[[CALL_T]] : !cir.ptr +// CIR: }, false { +// CIR: cir.call @_ZN1RC1Ei(%[[TMP1]], %{{.*}}) +// CIR: cir.store {{.*}}, %[[ACT1]] +// CIR: %[[CALL_F:.*]] = cir.call @_Z5pickRRK1R(%[[TMP1]]) +// CIR: cir.yield %[[CALL_F]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// `.field` GEP and its spill happen inside the cleanup scope body. +// CIR: %[[GEP:.*]] = cir.get_member %[[TERN]][0] {name = "field"} : !cir.ptr -> !cir.ptr +// CIR: cir.store {{.*}} %[[GEP]], %[[SPILL]] +// CIR: cir.yield +// CIR: } cleanup normal { +// CIR: %[[F1:.*]] = cir.load {{.*}} %[[ACT1]] +// CIR: cir.if %[[F1]] { +// CIR: cir.call @_ZN1RD1Ev(%[[TMP1]]) +// CIR: } +// CIR: %[[F0:.*]] = cir.load {{.*}} %[[ACT0]] +// CIR: cir.if %[[F0]] { +// CIR: cir.call @_ZN1RD1Ev(%[[TMP0]]) +// CIR: } +// CIR: cir.yield +// CIR: } +// Reload happens after the cleanup scope; the reloaded pointer initializes r. 
+// CIR: %[[RELOAD:.*]] = cir.load {{.*}} %[[SPILL]] : !cir.ptr>, !cir.ptr +// CIR: cir.store {{.*}} %[[RELOAD]], %[[R_REF]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define dso_local void @_Z18test_lvalue_reloadb( +// LLVM: %[[R_REF:.*]] = alloca ptr +// LLVM: %[[TMP0:.*]] = alloca %struct.R +// LLVM: %[[ACT0:.*]] = alloca i8 +// LLVM: %[[TMP1:.*]] = alloca %struct.R +// LLVM: %[[ACT1:.*]] = alloca i8 +// LLVM: %[[SPILL:.*]] = alloca ptr +// LLVM: br i1 %{{.*}}, label %[[BR_T:.*]], label %[[BR_F:.*]] +// LLVM: [[BR_T]]: +// LLVM: call void @_ZN1RC1Ei({{.*}} %[[TMP0]], i32 {{.*}} 1) +// LLVM: store i8 1, ptr %[[ACT0]] +// LLVM: %[[CALL_T:.*]] = call {{.*}} ptr @_Z5pickRRK1R({{.*}} %[[TMP0]]) +// LLVM: br label %[[MERGE:.*]] +// LLVM: [[BR_F]]: +// LLVM: call void @_ZN1RC1Ei({{.*}} %[[TMP1]], i32 {{.*}} 2) +// LLVM: store i8 1, ptr %[[ACT1]] +// LLVM: %[[CALL_F:.*]] = call {{.*}} ptr @_Z5pickRRK1R({{.*}} %[[TMP1]]) +// LLVM: br label %[[MERGE]] +// LLVM: [[MERGE]]: +// LLVM: %[[PHI:.*]] = phi ptr [ %[[CALL_F]], %[[BR_F]] ], [ %[[CALL_T]], %[[BR_T]] ] +// LLVM: %[[GEP:.*]] = getelementptr {{.*}} %struct.R, ptr %[[PHI]] +// LLVM: store ptr %[[GEP]], ptr %[[SPILL]] +// Cleanup checks happen between the spill and the reload. 
+// LLVM: load i8, ptr %[[ACT1]] +// LLVM: load i8, ptr %[[ACT0]] +// LLVM: %[[RELOAD:.*]] = load ptr, ptr %[[SPILL]] +// LLVM: store ptr %[[RELOAD]], ptr %[[R_REF]] + +// OGCG-LABEL: define dso_local void @_Z18test_lvalue_reloadb( +// OGCG: %[[R_REF:.*]] = alloca ptr +// OGCG: br i1 %{{.*}}, label %[[BR_T:.*]], label %[[BR_F:.*]] +// OGCG: [[BR_T]]: +// OGCG: call void @_ZN1RC1Ei({{.*}} %[[TMP0:.*]], i32 {{.*}} 1) +// OGCG: %[[CALL_T:.*]] = call {{.*}} ptr @_Z5pickRRK1R({{.*}} %[[TMP0]]) +// OGCG: br label %[[MERGE:.*]] +// OGCG: [[BR_F]]: +// OGCG: call void @_ZN1RC1Ei({{.*}} %[[TMP1:.*]], i32 {{.*}} 2) +// OGCG: %[[CALL_F:.*]] = call {{.*}} ptr @_Z5pickRRK1R({{.*}} %[[TMP1]]) +// OGCG: br label %[[MERGE]] +// OGCG: [[MERGE]]: +// OGCG: %[[PHI:.*]] = phi ptr [ %[[CALL_T]], %[[BR_T]] ], [ %[[CALL_F]], %[[BR_F]] ] +// OGCG: %[[GEP:.*]] = getelementptr {{.*}} %struct.R, ptr %[[PHI]] +// Classic codegen uses the phi-merged pointer directly; the cleanups run, and +// then the lvalue address is stored into r. +// OGCG: call void @_ZN1RD1Ev({{.*}} %[[TMP1]]) +// OGCG: call void @_ZN1RD1Ev({{.*}} %[[TMP0]]) +// OGCG: store ptr %[[GEP]], ptr %[[R_REF]] + +// When the result of an ExprWithCleanups is a _Complex value, the complex +// emitter must use FullExprCleanupScope so that conditional cleanups deferred +// by the inner conditional operator are consumed at the full-expression +// boundary. Without this, the destructor cleanup for the temporary `D` in the +// true branch would remain on the deferredConditionalCleanupStack and trip +// the assertion in finishFunction. +struct CplxD { + CplxD(); + ~CplxD(); + _Complex float get(); +}; + +_Complex float test_complex_cond_cleanup(bool b, _Complex float x) { + return b ? 
CplxD().get() : x; +} +// CIR-LABEL: @_Z25test_complex_cond_cleanupbCf +// CIR: %[[TMP:.*]] = cir.alloca !rec_CplxD, !cir.ptr, ["ref.tmp0"] +// CIR: %[[ACTIVE:.*]] = cir.alloca !cir.bool, !cir.ptr, ["cleanup.cond"] +// The full expression is wrapped in a single cleanup scope. +// CIR: cir.cleanup.scope { +// CIR: %[[COND:.*]] = cir.load {{.*}} : !cir.ptr, !cir.bool +// Active flag is initialized to false before the ternary so the dtor only runs +// when the true branch was actually taken. +// CIR: %[[FALSE:.*]] = cir.const #false +// CIR: cir.store %[[FALSE]], %[[ACTIVE]] : !cir.bool, !cir.ptr +// CIR: %{{.*}} = cir.ternary(%[[COND]], true { +// CIR: cir.call @_ZN5CplxDC1Ev(%[[TMP]]) +// CIR: %[[SET_TRUE:.*]] = cir.const #true +// CIR: cir.store %[[SET_TRUE]], %[[ACTIVE]] : !cir.bool, !cir.ptr +// CIR: %[[CALL:.*]] = cir.call @_ZN5CplxD3getEv(%[[TMP]]) +// CIR: cir.yield %[[CALL]] : !cir.complex +// CIR: }, false { +// CIR: %[[XV:.*]] = cir.load {{.*}} : !cir.ptr>, !cir.complex +// CIR: cir.yield %[[XV]] : !cir.complex +// CIR: }) : (!cir.bool) -> !cir.complex +// CIR: cir.yield +// CIR: } cleanup normal { +// CIR: %[[IS_ACTIVE:.*]] = cir.load {{.*}} %[[ACTIVE]] +// CIR: cir.if %[[IS_ACTIVE]] { +// CIR: cir.call @_ZN5CplxDD1Ev(%[[TMP]]) +// CIR: } +// CIR: cir.yield +// CIR: } + +// LLVM-LABEL: define dso_local {{.*}} { float, float } @_Z25test_complex_cond_cleanupbCf( +// LLVM: %[[TMP:.*]] = alloca %struct.CplxD +// LLVM: %[[ACTIVE:.*]] = alloca i8 +// LLVM: br i1 %{{.*}}, label %[[TRUE_BR:.*]], label %[[FALSE_BR:.*]] +// LLVM: [[TRUE_BR]]: +// LLVM: call void @_ZN5CplxDC1Ev(ptr {{.*}} %[[TMP]]) +// LLVM: store i8 1, ptr %[[ACTIVE]] +// LLVM: %[[CALL:.*]] = call {{.*}} { float, float } @_ZN5CplxD3getEv(ptr {{.*}} %[[TMP]]) +// LLVM: br label %[[MERGE:.*]] +// LLVM: [[FALSE_BR]]: +// LLVM: %[[XV:.*]] = load { float, float }, ptr %{{.*}} +// LLVM: br label %[[MERGE]] +// LLVM: [[MERGE]]: +// LLVM: %{{.*}} = phi { float, float } [ %[[XV]], %[[FALSE_BR]] ], [ 
%[[CALL]], %[[TRUE_BR]] ] +// LLVM: %[[ACT:.*]] = load i8, ptr %[[ACTIVE]] +// LLVM: %[[ACT_B:.*]] = trunc i8 %[[ACT]] to i1 +// LLVM: br i1 %[[ACT_B]], label %[[DTOR:.*]], label %[[SKIP:.*]] +// LLVM: [[DTOR]]: +// LLVM: call void @_ZN5CplxDD1Ev(ptr {{.*}} %[[TMP]]) +// LLVM: br label %[[SKIP]] + +// OGCG-LABEL: define dso_local {{.*}} <2 x float> @_Z25test_complex_cond_cleanupbCf( +// OGCG: %[[TMP:.*]] = alloca %struct.CplxD +// OGCG: %[[ACTIVE:.*]] = alloca i1 +// OGCG: store i1 false, ptr %[[ACTIVE]] +// OGCG: br i1 %{{.*}}, label %[[CTRUE:.*]], label %[[CFALSE:.*]] +// OGCG: [[CTRUE]]: +// OGCG: call void @_ZN5CplxDC1Ev(ptr {{.*}} %[[TMP]]) +// OGCG: store i1 true, ptr %[[ACTIVE]] +// OGCG: %[[CALL:.*]] = call {{.*}} <2 x float> @_ZN5CplxD3getEv(ptr {{.*}} %[[TMP]]) +// OGCG: br label %[[MERGE:.*]] +// OGCG: [[CFALSE]]: +// OGCG: br label %[[MERGE]] +// OGCG: [[MERGE]]: +// OGCG: %[[ACT:.*]] = load i1, ptr %[[ACTIVE]] +// OGCG: br i1 %[[ACT]], label %[[DTOR:.*]], label %[[DONE:.*]] +// OGCG: [[DTOR]]: +// OGCG: call void @_ZN5CplxDD1Ev(ptr {{.*}} %[[TMP]]) +// OGCG: br label %[[DONE]] From ba467b630ababad2ae75e0036717d8973c39edaa Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 11 May 2026 16:32:55 -0700 Subject: [PATCH 379/538] [CIR] Handle usual delete alignment argument (#197046) This implements the handling to add the alignment argument to operator delete calls when the usual delete parameters require it. 
--- clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 12 ++- clang/test/CIR/CodeGen/delete-aligned.cpp | 96 +++++++++++++++++++++++ 2 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 clang/test/CIR/CodeGen/delete-aligned.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index c7b11aa3f0009..de866f286a0e1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -1816,9 +1816,15 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD, } // Pass the alignment if the delete function has an align_val_t parameter. - if (isAlignedAllocation(params.Alignment)) - cgm.errorNYI(deleteFD->getSourceRange(), - "emitDeleteCall: aligned allocation"); + if (isAlignedAllocation(params.Alignment)) { + QualType alignValType = *paramTypeIter++; + CharUnits deleteTypeAlign = + getContext().toCharUnitsFromBits(getContext().getTypeAlignIfKnown( + deleteTy, /*NeedsPreferredAlignment=*/true)); + cir::ConstantOp align = builder.getConstInt( + *currSrcLoc, convertType(alignValType), deleteTypeAlign.getQuantity()); + deleteArgs.add(RValue::get(align), alignValType); + } assert(paramTypeIter == deleteFTy->param_type_end() && "unknown parameter to usual delete function"); diff --git a/clang/test/CIR/CodeGen/delete-aligned.cpp b/clang/test/CIR/CodeGen/delete-aligned.cpp new file mode 100644 index 0000000000000..be18ea46c5eca --- /dev/null +++ b/clang/test/CIR/CodeGen/delete-aligned.cpp @@ -0,0 +1,96 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -mconstructor-aliases -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -mconstructor-aliases -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -mconstructor-aliases -emit-llvm %s -o %t.ll +// RUN: 
FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +// Check that `delete p` calls the aligned (de)allocation function when the +// object's type is over-aligned and the program uses C++17 aligned allocation. + +typedef decltype(sizeof(0)) size_t; +namespace std { enum class align_val_t : size_t {}; } + +#define OVERALIGNED alignas(__STDCPP_DEFAULT_NEW_ALIGNMENT__ * 2) + +// Global aligned operator delete. +// =============================== +struct OVERALIGNED A { A(); int n[128]; }; + +void a2(A *p) { delete p; } + +// CIR: cir.func {{.*}} @_Z2a2P1A +// CIR: %[[P:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr>, !cir.ptr +// CIR: %[[NULL:.*]] = cir.const #cir.ptr +// CIR: %[[NOT_NULL:.*]] = cir.cmp ne %[[P]], %[[NULL]] +// CIR: cir.if %[[NOT_NULL]] { +// CIR: cir.cleanup.scope { +// CIR: cir.yield +// CIR: } cleanup normal { +// CIR: %[[P_CAST:.*]] = cir.cast bitcast %[[P]] : !cir.ptr -> !cir.ptr +// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<512> : !u64i +// CIR: %[[ALIGNMENT:.*]] = cir.const #cir.int<32> : !u64i +// CIR: cir.call @_ZdlPvmSt11align_val_t(%[[P_CAST]], %[[OBJ_SIZE]], %[[ALIGNMENT]]) +// CIR: cir.yield +// CIR: } +// CIR: } + +// LLVM: define {{.*}} void @_Z2a2P1A +// LLVM: %[[P:.*]] = load ptr, ptr %{{.*}} +// LLVM: %[[NOT_NULL:.*]] = icmp ne ptr %[[P]], null +// LLVM: br i1 %[[NOT_NULL]], label %[[DELETE_NOTNULL:.*]], label %{{.*}} +// LLVM: [[DELETE_NOTNULL]]: +// LLVM: call void @_ZdlPvmSt11align_val_t(ptr noundef %[[P]], i64 noundef 512, i64 noundef 32) + +// OGCG: define {{.*}} void @_Z2a2P1A( +// OGCG: %[[P:.*]] = load ptr, ptr %{{.*}} +// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[P]], null +// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]] +// OGCG: [[DELETE_NOTNULL]]: +// OGCG: call void @_ZdlPvmSt11align_val_t(ptr noundef %[[P]], i64 noundef 512, i64 noundef 32) + + +// Class-specific aligned operator delete. +// ======================================= +struct OVERALIGNED B { + B(); + // These are just a distraction. 
We should ignore them. + void *operator new(size_t); + void operator delete(void*, size_t); + + void *operator new(size_t, std::align_val_t); + void operator delete(void*, std::align_val_t); + + int n[128]; +}; + +void b2(B *p) { delete p; } + +// CIR: cir.func {{.*}} @_Z2b2P1B +// CIR: %[[P:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr>, !cir.ptr +// CIR: %[[NULL:.*]] = cir.const #cir.ptr +// CIR: %[[NOT_NULL:.*]] = cir.cmp ne %[[P]], %[[NULL]] +// CIR: cir.if %[[NOT_NULL]] { +// CIR: cir.cleanup.scope { +// CIR: cir.yield +// CIR: } cleanup normal { +// CIR: %[[P_CAST:.*]] = cir.cast bitcast %[[P]] : !cir.ptr -> !cir.ptr +// CIR: %[[ALIGNMENT:.*]] = cir.const #cir.int<32> : !u64i +// CIR: cir.call @_ZN1BdlEPvSt11align_val_t(%[[P_CAST]], %[[ALIGNMENT]]) +// CIR: cir.yield +// CIR: } +// CIR: } + +// LLVM: define {{.*}} void @_Z2b2P1B +// LLVM: %[[P:.*]] = load ptr, ptr %{{.*}} +// LLVM: %[[NOT_NULL:.*]] = icmp ne ptr %[[P]], null +// LLVM: br i1 %[[NOT_NULL]], label %[[DELETE_NOTNULL:.*]], label %{{.*}} +// LLVM: [[DELETE_NOTNULL]]: +// LLVM: call void @_ZN1BdlEPvSt11align_val_t(ptr noundef %[[P]], i64 noundef 32) + +// OGCG: define {{.*}} void @_Z2b2P1B( +// OGCG: %[[P:.*]] = load ptr, ptr %{{.*}} +// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[P]], null +// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]] +// OGCG: [[DELETE_NOTNULL]]: +// OGCG: call void @_ZN1BdlEPvSt11align_val_t(ptr noundef %[[P]], i64 noundef 32) From dc58013db904c6576da201ab2f05458ea5228b57 Mon Sep 17 00:00:00 2001 From: Chen Li Date: Mon, 11 May 2026 16:54:38 -0700 Subject: [PATCH 380/538] [llvm-gsymutil] Replace truncated DWARF names with mangled names from symbol table (#184221) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - During `GsymCreator::finalize()`, when deduplicating entries with the same address range, check if the DWARF entry's name is a truncated version of the symbol table's mangled name - If the DWARF name is 
a substring of the demangled symbol table name, replace it with the full mangled name before discarding the symbol table entry - This allows downstream tools to properly demangle and display full function signatures ## Test plan ### Unit tests - `TestMangledNameReplacement`: Verifies DWARF name `make_ftype` is replaced with `_Z10make_ftypePci` and line table is preserved - `TestMangledNameReplacementNegative`: Verifies no replacement when both names are mangled, or when names are unrelated - All 51 GSYM unit tests pass ### Lit test - `elf-mangled-name-replacement.yaml`: End-to-end test creating an ELF with DWARF + symbol table, converting to GSYM, and verifying the output - All 9/9 applicable GSYM lit tests pass (6 unsupported are ARM/macOS tests on x86_64 Linux) ### Manual end-to-end testing Created ELF binaries with `yaml2obj` containing both DWARF debug info and symbol table entries for the same function, then converted to GSYM with `llvm-gsymutil --convert` and verified the output with `llvm-gsymutil` dump. 
**Test 1: Name replacement happens when DWARF name is truncated** - DWARF has function named `make_ftype` with line table at `0x401000` - Symbol table has `_Z10make_ftypePci` (demangles to `make_ftype(char*, int)`) at same address - After conversion, GSYM output shows: `"_Z10make_ftypePci"` with line table preserved ✅ **Test 2: No replacement when names are unrelated** - DWARF has function named `unrelated_func` with line table at `0x401000` - Symbol table has `_Z10make_ftypePci` at same address - After conversion, GSYM output shows: `"unrelated_func"` — name unchanged ✅ **Test 3: Replacement works with namespaced functions** - DWARF has function named `make_ftype` with line table at `0x401000` - Symbol table has `_ZN12_GLOBAL__N_110make_ftypeEPci` (demangles to `(anonymous namespace)::make_ftype(char*, int)`) at same address - After conversion, GSYM output shows: `"_ZN12_GLOBAL__N_110make_ftypeEPci"` with line table preserved ✅ Co-authored-by: Chen Li --- .../llvm/DebugInfo/GSYM/CallSiteInfo.h | 7 + .../llvm/DebugInfo/GSYM/FunctionInfo.h | 17 ++- llvm/lib/DebugInfo/GSYM/GsymCreator.cpp | 59 ++++++-- .../X86/elf-mangled-name-replacement.yaml | 128 ++++++++++++++++ .../elf-swift-mangled-name-replacement.yaml | 128 ++++++++++++++++ llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp | 139 ++++++++++++++++++ 6 files changed, 460 insertions(+), 18 deletions(-) create mode 100644 llvm/test/tools/llvm-gsymutil/X86/elf-mangled-name-replacement.yaml create mode 100644 llvm/test/tools/llvm-gsymutil/X86/elf-swift-mangled-name-replacement.yaml diff --git a/llvm/include/llvm/DebugInfo/GSYM/CallSiteInfo.h b/llvm/include/llvm/DebugInfo/GSYM/CallSiteInfo.h index 2b72d901f2330..96e4c9adf99ad 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/CallSiteInfo.h +++ b/llvm/include/llvm/DebugInfo/GSYM/CallSiteInfo.h @@ -78,6 +78,13 @@ struct CallSiteInfo { struct CallSiteInfoCollection { std::vector CallSites; + bool operator==(const CallSiteInfoCollection &RHS) const { + return CallSites == 
RHS.CallSites; + } + bool operator!=(const CallSiteInfoCollection &RHS) const { + return !(*this == RHS); + } + /// Decode a CallSiteInfoCollection object from a binary data stream. /// /// \param Data The binary stream to read the data from. diff --git a/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h b/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h index bd836e6783a97..0713ff27966d9 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h +++ b/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h @@ -217,7 +217,8 @@ struct FunctionInfo { inline bool operator==(const FunctionInfo &LHS, const FunctionInfo &RHS) { return LHS.Range == RHS.Range && LHS.Name == RHS.Name && - LHS.OptLineTable == RHS.OptLineTable && LHS.Inline == RHS.Inline; + LHS.OptLineTable == RHS.OptLineTable && LHS.Inline == RHS.Inline && + LHS.CallSites == RHS.CallSites; } inline bool operator!=(const FunctionInfo &LHS, const FunctionInfo &RHS) { return !(LHS == RHS); @@ -233,13 +234,17 @@ inline bool operator!=(const FunctionInfo &LHS, const FunctionInfo &RHS) { /// inline information with the most entries will appeear last. If the inline /// information match, either by both function infos not having any or both /// being exactly the same, we will then compare line tables. Comparing line -/// tables allows the entry with the most line entries to appear last. This -/// ensures we are able to save the FunctionInfo with the most debug info into -/// the GSYM file. +/// tables allows the entry with the most line entries to appear last. As a +/// final tiebreaker, an entry that has call site information sorts after one +/// that does not, so that within a single address range the entry with the +/// most debug info always appears last. This ensures we are able to save the +/// FunctionInfo with the most debug info into the GSYM file. 
inline bool operator<(const FunctionInfo &LHS, const FunctionInfo &RHS) {
   // First sort by address range
-  return std::tie(LHS.Range, LHS.Inline, LHS.OptLineTable) <
-         std::tie(RHS.Range, RHS.Inline, RHS.OptLineTable);
+  const bool LHSHasCallSites = LHS.CallSites.has_value();
+  const bool RHSHasCallSites = RHS.CallSites.has_value();
+  return std::tie(LHS.Range, LHS.Inline, LHS.OptLineTable, LHSHasCallSites) <
+         std::tie(RHS.Range, RHS.Inline, RHS.OptLineTable, RHSHasCallSites);
 }
 
 LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const FunctionInfo &R);
diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
index 4b3929a532489..d6e8795fdec7d 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
@@ -6,6 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/GSYM/GsymCreator.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/DebugInfo/GSYM/FileWriter.h"
 #include "llvm/DebugInfo/GSYM/Header.h"
 #include "llvm/DebugInfo/GSYM/LineTable.h"
@@ -21,6 +22,34 @@
 using namespace llvm;
 using namespace gsym;
 
+// Keep this matching cheap: Itanium and Swift both encode identifiers as
+// <decimal length><name> in the raw mangled name. Look for that token instead of
+// demangling during finalize().
+static bool isSupportedMangledPrefix(StringRef Name) {
+  return Name.starts_with("_Z") || Name.starts_with("$s") ||
+         Name.starts_with("$S");
+}
+
+static bool shouldReplaceWithMangledName(StringRef AlternateName,
+                                         StringRef CurrentName) {
+  // Any name is better than no name.
+  if (CurrentName.empty() && !AlternateName.empty())
+    return true;
+
+  // Keep the current name if it's already mangled, or if the alternate name
+  // is not a supported mangled name.
+  if (isSupportedMangledPrefix(CurrentName) ||
+      !isSupportedMangledPrefix(AlternateName))
+    return false;
+
+  // Confirm the alternate mangled name actually contains the current name as
+  // an Itanium/Swift identifier token (<length><name>).
+  SmallString<64> LengthAndName;
+  raw_svector_ostream OS(LengthAndName);
+  OS << CurrentName.size() << CurrentName;
+  return AlternateName.contains(StringRef(LengthAndName));
+}
+
 GsymCreator::GsymCreator(bool Quiet)
     : StrTab(StringTableBuilder::ELF), Quiet(Quiet) {
   insertFile(StringRef());
@@ -180,14 +209,24 @@ llvm::Error GsymCreator::finalize(OutputAggregator &Out) {
         if (ranges_equal || Prev.Range.intersects(Curr.Range)) {
           // Overlapping ranges or empty identical ranges.
           if (ranges_equal) {
-            // Same address range. Check if one is from debug
-            // info and the other is from a symbol table. If
-            // so, then keep the one with debug info. Our
-            // sorting guarantees that entries with matching
-            // address ranges that have debug info are last in
-            // the sort.
-            if (!(Prev == Curr)) {
-              if (Prev.hasRichInfo() && Curr.hasRichInfo())
+            // Same address range. The sort orders entries with more debug info
+            // last, so when exactly one entry has rich info, Prev is the
+            // non-rich (typically symbol-table) entry and Curr is the rich
+            // (typically DWARF) one. DWARF often truncates a function's
+            // linkage name to its short form, so before dropping the non-rich
+            // entry check whether its name is a more complete mangled
+            // (Itanium or Swift) form of the rich entry's name and, if so,
+            // copy it onto the rich entry. This lets downstream tools
+            // demangle the full signature.
+ const bool PrevRich = Prev.hasRichInfo(); + const bool CurrRich = Curr.hasRichInfo(); + if (PrevRich != CurrRich) { + if (shouldReplaceWithMangledName(getString(Prev.Name), + getString(Curr.Name))) + Curr.Name = Prev.Name; + std::swap(Prev, Curr); + } else if (Prev != Curr) { + if (PrevRich) Out.Report( "Duplicate address ranges with different debug info.", [&](raw_ostream &OS) { @@ -197,10 +236,6 @@ llvm::Error GsymCreator::finalize(OutputAggregator &Out) { << Prev << "\nIn favor of this one:\n" << Curr << "\n"; }); - - // We want to swap the current entry with the previous since - // later entries with the same range always have more debug info - // or different debug info. std::swap(Prev, Curr); } } else { diff --git a/llvm/test/tools/llvm-gsymutil/X86/elf-mangled-name-replacement.yaml b/llvm/test/tools/llvm-gsymutil/X86/elf-mangled-name-replacement.yaml new file mode 100644 index 0000000000000..0ad50e2e3e218 --- /dev/null +++ b/llvm/test/tools/llvm-gsymutil/X86/elf-mangled-name-replacement.yaml @@ -0,0 +1,128 @@ +## Test that same-range dedup keeps the DWARF line table while replacing a +## shortened DWARF function name with the full Itanium symbol-table name. + +# RUN: yaml2obj %s -o %t +# RUN: llvm-gsymutil --convert %t -o %t.gsym 2>&1 | FileCheck %s --check-prefix=CONVERT +# RUN: llvm-gsymutil %t.gsym 2>&1 | FileCheck %s --check-prefix=DUMP + +# CONVERT: Loaded 1 functions from DWARF. +# CONVERT: Loaded 1 functions from symbol table. 
+# CONVERT: Pruned 1 functions, ended with 1 total + +# DUMP: "_Z10make_ftypePci" +# DUMP: LineTable: +# DUMP: main.cpp:10 +# DUMP: main.cpp:11 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x10 + Content: 554889E531C05DC3554889E531C05DC3 +DWARF: + debug_str: + - '' + - main.cpp + - make_ftype + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Length: 0x27 + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x1 + - Value: 0x2 + - Value: 0x0 + - AbbrCode: 0x2 + Values: + - Value: 0xA + - Value: 0x401000 + - Value: 0x401010 + - AbbrCode: 0x0 + debug_line: + - Length: 61 + Version: 2 + PrologueLength: 31 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + Files: + - Name: main.cpp + DirIdx: 0 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4198400 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 8 + - Opcode: DW_LNS_advance_line + SData: 1 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 8 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 
+ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + Align: 0x1000 + FirstSec: .text + LastSec: .text +Symbols: + - Name: _Z10make_ftypePci + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 + Size: 0x0000000000000010 +... diff --git a/llvm/test/tools/llvm-gsymutil/X86/elf-swift-mangled-name-replacement.yaml b/llvm/test/tools/llvm-gsymutil/X86/elf-swift-mangled-name-replacement.yaml new file mode 100644 index 0000000000000..b706f890c6c35 --- /dev/null +++ b/llvm/test/tools/llvm-gsymutil/X86/elf-swift-mangled-name-replacement.yaml @@ -0,0 +1,128 @@ +## Test that same-range dedup keeps the DWARF line table while replacing a +## shortened Swift DWARF function name with the full Swift symbol-table name. + +# RUN: yaml2obj %s -o %t +# RUN: llvm-gsymutil --convert %t -o %t.gsym 2>&1 | FileCheck %s --check-prefix=CONVERT +# RUN: llvm-gsymutil %t.gsym 2>&1 | FileCheck %s --check-prefix=DUMP + +# CONVERT: Loaded 1 functions from DWARF. +# CONVERT: Loaded 1 functions from symbol table. 
+# CONVERT: Pruned 1 functions, ended with 1 total + +# DUMP: "$s4main10make_ftypeyyF" +# DUMP: LineTable: +# DUMP: main.swift:10 +# DUMP: main.swift:11 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x10 + Content: 554889E531C05DC3554889E531C05DC3 +DWARF: + debug_str: + - '' + - main.swift + - make_ftype + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Length: 0x27 + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x1 + - Value: 0x1E + - Value: 0x0 + - AbbrCode: 0x2 + Values: + - Value: 0xC + - Value: 0x401000 + - Value: 0x401010 + - AbbrCode: 0x0 + debug_line: + - Length: 63 + Version: 2 + PrologueLength: 33 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + Files: + - Name: main.swift + DirIdx: 0 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4198400 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 8 + - Opcode: DW_LNS_advance_line + SData: 1 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 8 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 
+ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + Align: 0x1000 + FirstSec: .text + LastSec: .text +Symbols: + - Name: '$s4main10make_ftypeyyF' + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 + Size: 0x0000000000000010 +... diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp index b7abdd94ce645..5808ae712efed 100644 --- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp +++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp @@ -2814,6 +2814,145 @@ FinalizeEncodeAndDecode(GsymCreator &GC) { return GsymReader::copyBuffer(OutStrm.str()); } +template +static void AddDuplicateRangePair(CreatorT &GC, uint64_t Addr, + gsym_strp_t RichName, gsym_strp_t NonRichName, + bool AddRichFirst, uint32_t FileIdx, + uint32_t FirstLine) { + FunctionInfo RichFI(Addr, 0x20, RichName); + RichFI.OptLineTable = LineTable(); + RichFI.OptLineTable->push(LineEntry(Addr, FileIdx, FirstLine)); + RichFI.OptLineTable->push(LineEntry(Addr + 0x10, FileIdx, FirstLine + 1)); + + FunctionInfo NonRichFI(Addr, 0x20, NonRichName); + if (AddRichFirst) { + GC.addFunctionInfo(std::move(RichFI)); + GC.addFunctionInfo(std::move(NonRichFI)); + } else { + GC.addFunctionInfo(std::move(NonRichFI)); + GC.addFunctionInfo(std::move(RichFI)); + } +} + +static void VerifyDuplicateRangeResult(const GsymReader &GR, uint64_t Addr, + StringRef ExpectedName, + uint32_t ExpectedFileIdx, + uint32_t FirstLine) { + auto ExpFI = GR.getFunctionInfo(Addr); + ASSERT_THAT_EXPECTED(ExpFI, Succeeded()); + EXPECT_EQ(GR.getString(ExpFI->Name), ExpectedName); + ASSERT_TRUE(ExpFI->OptLineTable.has_value()); + ASSERT_EQ(ExpFI->OptLineTable->size(), 2u); + EXPECT_EQ((*ExpFI->OptLineTable)[0], + LineEntry(Addr, ExpectedFileIdx, FirstLine)); + EXPECT_EQ((*ExpFI->OptLineTable)[1], + LineEntry(Addr + 0x10, ExpectedFileIdx, FirstLine + 1)); +} + +template static void TestMangledNameReplacement() { + CreatorT GC; + const uint32_t 
FileIdx = GC.insertFile("/tmp/main.cpp"); + const gsym_strp_t ShortName = GC.insertString("make_ftype"); + const gsym_strp_t MangledName = GC.insertString("_Z10make_ftypePci"); + const gsym_strp_t SwiftMangledName = + GC.insertString("$s4main10make_ftypeyyF"); + + AddDuplicateRangePair(GC, 0x1000, ShortName, MangledName, + /*AddRichFirst=*/false, FileIdx, 10); + AddDuplicateRangePair(GC, 0x2000, ShortName, MangledName, + /*AddRichFirst=*/true, FileIdx, 20); + AddDuplicateRangePair(GC, 0x3000, ShortName, SwiftMangledName, + /*AddRichFirst=*/false, FileIdx, 30); + + auto GROrErr = FinalizeEncodeAndDecode(GC); + ASSERT_THAT_EXPECTED(GROrErr, Succeeded()); + const std::unique_ptr &GR = *GROrErr; + + EXPECT_EQ(GR->getNumAddresses(), 3u); + VerifyDuplicateRangeResult(*GR, 0x1000, "_Z10make_ftypePci", FileIdx, 10); + VerifyDuplicateRangeResult(*GR, 0x2000, "_Z10make_ftypePci", FileIdx, 20); + VerifyDuplicateRangeResult(*GR, 0x3000, "$s4main10make_ftypeyyF", FileIdx, + 30); +} + +TEST(GSYMTest, TestMangledNameReplacement) { + TestMangledNameReplacement(); +} +TEST(GSYMTest, TestMangledNameReplacementV2) { + TestMangledNameReplacement(); +} + +template static void TestMangledNameReplacementNegative() { + CreatorT GC; + const uint32_t FileIdx = GC.insertFile("/tmp/test.cpp"); + const gsym_strp_t MangledA = GC.insertString("_Z3foov"); + const gsym_strp_t MangledB = GC.insertString("_Z3barv"); + const gsym_strp_t MangledName = GC.insertString("_Z10make_ftypePci"); + const gsym_strp_t UnrelatedName = GC.insertString("some_other_func"); + const gsym_strp_t SwiftShortName = GC.insertString("foo"); + const gsym_strp_t SwiftLongerName = GC.insertString("$s5fooBaryyF"); + + AddDuplicateRangePair(GC, 0x3000, MangledB, MangledA, + /*AddRichFirst=*/false, FileIdx, 5); + AddDuplicateRangePair(GC, 0x4000, UnrelatedName, MangledName, + /*AddRichFirst=*/false, FileIdx, 15); + AddDuplicateRangePair(GC, 0x5000, SwiftShortName, SwiftLongerName, + /*AddRichFirst=*/false, FileIdx, 25); + + auto 
GROrErr = FinalizeEncodeAndDecode(GC); + ASSERT_THAT_EXPECTED(GROrErr, Succeeded()); + const std::unique_ptr &GR = *GROrErr; + + EXPECT_EQ(GR->getNumAddresses(), 3u); + VerifyDuplicateRangeResult(*GR, 0x3000, "_Z3barv", FileIdx, 5); + VerifyDuplicateRangeResult(*GR, 0x4000, "some_other_func", FileIdx, 15); + VerifyDuplicateRangeResult(*GR, 0x5000, "foo", FileIdx, 25); +} + +TEST(GSYMTest, TestMangledNameReplacementNegative) { + TestMangledNameReplacementNegative(); +} +TEST(GSYMTest, TestMangledNameReplacementNegativeV2) { + TestMangledNameReplacementNegative(); +} + +template static void TestDuplicateRangeKeepsCallSites() { + CreatorT GC; + const gsym_strp_t FuncName = GC.insertString("foo"); + const gsym_strp_t MatchRegex = GC.insertString("callee"); + + FunctionInfo NonRichFI(0x5000, 0x20, FuncName); + FunctionInfo RichFI(0x5000, 0x20, FuncName); + RichFI.CallSites = CallSiteInfoCollection(); + CallSiteInfo CSI; + CSI.ReturnOffset = 0x10; + CSI.MatchRegex.push_back(MatchRegex); + RichFI.CallSites->CallSites.push_back(CSI); + + GC.addFunctionInfo(std::move(NonRichFI)); + GC.addFunctionInfo(std::move(RichFI)); + + auto GROrErr = FinalizeEncodeAndDecode(GC); + ASSERT_THAT_EXPECTED(GROrErr, Succeeded()); + const std::unique_ptr &GR = *GROrErr; + + auto ExpFI = GR->getFunctionInfo(0x5000); + ASSERT_THAT_EXPECTED(ExpFI, Succeeded()); + ASSERT_TRUE(ExpFI->CallSites.has_value()); + ASSERT_EQ(ExpFI->CallSites->CallSites.size(), 1u); + EXPECT_EQ(ExpFI->CallSites->CallSites[0].ReturnOffset, 0x10u); + ASSERT_EQ(ExpFI->CallSites->CallSites[0].MatchRegex.size(), 1u); + EXPECT_EQ(GR->getString(ExpFI->CallSites->CallSites[0].MatchRegex[0]), + "callee"); +} + +TEST(GSYMTest, TestDuplicateRangeKeepsCallSites) { + TestDuplicateRangeKeepsCallSites(); +} +TEST(GSYMTest, TestDuplicateRangeKeepsCallSitesV2) { + TestDuplicateRangeKeepsCallSites(); +} + template static void TestGsymSegmenting(uint64_t SegmentSize) { // Test creating a GSYM file with function infos and segment the 
information. From 0c101370f58aa7e3d2a6f199bffc5b585cdeab69 Mon Sep 17 00:00:00 2001 From: Ziqing Luo Date: Mon, 11 May 2026 17:00:07 -0700 Subject: [PATCH 381/538] [SSAF][WPA] Add PointerFlowReachableAnalysis (#193097) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PointerFlowReachableAnalysis uses PointerFlow and UnsafeBufferUsage summaries. It computes reachable nodes in the PointerFlow graph from unsafe buffer nodes in the UnsafeBufferUsage summary. rdar://174874942 --------- Co-authored-by: Balázs Benics Co-authored-by: Jan Korous --- .../PointerFlow/PointerFlowAnalysis.h | 34 +- .../PointerFlow/PointerFlowAnalysis.cpp | 125 +++++- .../PointerFlow/PointerFlowExtractor.cpp | 1 - .../CMakeLists.txt | 1 + .../UnsafeBufferReachableAnalysisTest.cpp | 378 ++++++++++++++++++ 5 files changed, 528 insertions(+), 11 deletions(-) create mode 100644 clang/unittests/ScalableStaticAnalysisFramework/WholeProgramAnalysis/UnsafeBufferReachableAnalysisTest.cpp diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.h b/clang/include/clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.h index 3b4da10bdc65d..5b2e534368f98 100644 --- a/clang/include/clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.h +++ b/clang/include/clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.h @@ -1,4 +1,4 @@ -//===- PointerFlowAnalysis.h ------------------------------------*- C++ -*-===// +//===- PointerFlowAnalysis.h ------------------------------------*- C++- *-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,14 +6,19 @@ // //===----------------------------------------------------------------------===// // -// Defines PointerFlowAnalysisResult, the whole-program analysis result type -// for PointerFlowAnalysis. 
+// Defines +// - PointerFlowAnalysisResult +// - the plain PointerFlow info collected from the whole program. +// - PointerFlowReachableAnalysisResult +// - the set of reachable pointers in the pointer flow graph from a provided +// starting set. // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_ANALYSES_POINTERFLOW_POINTERFLOWANALYSIS_H #define LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_ANALYSES_POINTERFLOW_POINTERFLOWANALYSIS_H +#include "clang/ScalableStaticAnalysisFramework/Analyses/EntityPointerLevel/EntityPointerLevel.h" #include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlow.h" #include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityId.h" #include "clang/ScalableStaticAnalysisFramework/Core/WholeProgramAnalysis/AnalysisName.h" @@ -23,18 +28,37 @@ namespace clang::ssaf { -inline constexpr llvm::StringLiteral PointerFlowAnalysisResultName = +constexpr llvm::StringLiteral PointerFlowAnalysisResultName = "PointerFlowAnalysisResult"; +constexpr llvm::StringLiteral UnsafeBufferReachableAnalysisResultName = + "UnsafeBufferReachableAnalysisResult"; +/// A PointerFlowAnalysisResult is a set of pointer-flow edges, i.e., +/// a pointer-flow graph. A directed edge src -> dest corresponds to an +/// assignment (of any of various kinds, e.g., assignment operator or +/// argument-passing) of pointer dest to pointer src in the source code. +/// The edge's direction is the opposite of how pointer values flow. This +/// is because PointerFlowAnalysisResult is used for analyzing property +/// propagation between pointers. For an assignment `src = dest`, the +/// propagation works such that if `src` has a property, `dest` must also +/// have that property; otherwise, the property would not be preserved +/// across the assignment. 
struct PointerFlowAnalysisResult final : AnalysisResult { static AnalysisName analysisName() { return AnalysisName(PointerFlowAnalysisResultName.str()); } - /// Whole-program map from EntityIds to their EdgeSets. std::map Edges; }; +struct UnsafeBufferReachableAnalysisResult final : AnalysisResult { + static AnalysisName analysisName() { + return AnalysisName(UnsafeBufferReachableAnalysisResultName.str()); + } + + std::map Reachables; +}; + } // namespace clang::ssaf #endif // LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_ANALYSES_POINTERFLOW_POINTERFLOWANALYSIS_H diff --git a/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.cpp b/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.cpp index e8392137c851b..c8dcc6ec24825 100644 --- a/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.cpp +++ b/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.cpp @@ -5,19 +5,21 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// PointerFlowAnalysis is a noop analysis. -// -// PointerFlowAnalysisResult is a map from EntityIds to -// EdgeSets. 
-//===----------------------------------------------------------------------===// #include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.h" #include "SSAFAnalysesCommon.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/EntityPointerLevel/EntityPointerLevel.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/EntityPointerLevel/EntityPointerLevelFormat.h" #include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlow.h" #include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowFormat.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/UnsafeBufferUsage/UnsafeBufferUsageAnalysis.h" +#include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityId.h" #include "clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h" #include "clang/ScalableStaticAnalysisFramework/Core/WholeProgramAnalysis/AnalysisRegistry.h" +#include "clang/ScalableStaticAnalysisFramework/Core/WholeProgramAnalysis/DerivedAnalysis.h" #include "clang/ScalableStaticAnalysisFramework/Core/WholeProgramAnalysis/SummaryAnalysis.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Error.h" #include "llvm/Support/JSON.h" #include @@ -27,6 +29,11 @@ using namespace llvm; namespace { +//===----------------------------------------------------------------------===// +// PointerFlowAnalysis---converts PointerFlowEntitySummary(s) in an LUSummary to +// a PointerFlowAnalysisResult +//===----------------------------------------------------------------------===// + // Serialized as a flat array of alternating [EntityId, EdgesArray, ...] pairs. 
json::Object serializePointerFlowAnalysisResult(const PointerFlowAnalysisResult &R, @@ -109,6 +116,114 @@ class PointerFlowAnalysis final AnalysisRegistry::Add RegisterPointerFlowAnalysis("Whole-program pointer flow analysis"); +//===----------------------------------------------------------------------===// +// PointerFlowReachableAnalysis---computes reachable node +//===----------------------------------------------------------------------===// + +json::Object serializePointerFlowReachableAnalysisResult( + const UnsafeBufferReachableAnalysisResult &R, + JSONFormat::EntityIdToJSONFn IdToJSON) { + json::Object Result; + + Result[UnsafeBufferReachableAnalysisResultName] = + entityPointerLevelMapToJSON(R.Reachables, IdToJSON); + return Result; +} + +Expected> +deserializePointerFlowReachableAnalysisResult( + const json::Object &Obj, JSONFormat::EntityIdFromJSONFn IdFromJSON) { + const json::Array *Content = + Obj.getArray(UnsafeBufferReachableAnalysisResultName); + + if (!Content) + return makeSawButExpectedError( + Obj, "an object with a key %s", + UnsafeBufferReachableAnalysisResultName.data()); + + auto Reachables = entityPointerLevelMapFromJSON(*Content, IdFromJSON); + + if (!Reachables) + return Reachables.takeError(); + + auto Ret = std::make_unique(); + + Ret->Reachables = std::move(*Reachables); + return Ret; +} + +JSONFormat::AnalysisResultRegistry::Add + RegisterPointerFlowReachableResultForJSON( + serializePointerFlowReachableAnalysisResult, + deserializePointerFlowReachableAnalysisResult); + +/// Computes all the reachable "nodes" (pointers) in a pointer flow graph from a +/// provided starter node set. Specifically, the starter set is the unsafe +/// pointers found by `UnsafeBufferUsageAnalysis`. +class UnsafeBufferReachableAnalysis + : public DerivedAnalysis { + using GraphT = std::map; + const GraphT *Graph = nullptr; + + // Use pointers for efficiency. Both `Graph` and `Reachables` in the result + // are tree-based containers that only grow. 
So pointers to them are stable. + using EPLPtr = const EntityPointerLevel *; + + // Find all outgoing edges from `EPL` in the `Graph`, insert their + // destination nodes into `Reachables`, and add newly discovered nodes to + // `Worklist`: + void updateReachablesWithOutgoings(EPLPtr EPL, + std::vector &WorkList) { + for (auto &[Id, SubGraph] : *Graph) { + auto I = SubGraph.find(*EPL); + EntityPointerLevelSet &ReachablesOfId = getResult().Reachables[Id]; + + if (I != SubGraph.end()) { + for (const auto &EPL : I->second) { + auto [Ignored, Inserted] = ReachablesOfId.insert(EPL); + if (Inserted) + WorkList.push_back(&EPL); + } + } + } + } + +public: + llvm::Error + initialize(const PointerFlowAnalysisResult &Graph, + const UnsafeBufferUsageAnalysisResult &Starter) override { + this->Graph = &Graph.Edges; + assert(getResult().Reachables.empty()); + getResult().Reachables.insert(Starter.begin(), Starter.end()); + return llvm::Error::success(); + } + + llvm::Expected step() override { + auto &Reachables = getResult().Reachables; + // Simple DFS: + std::vector Worklist; + + for (auto &[Id, EPLs] : Reachables) + for (auto &EPL : EPLs) + Worklist.push_back(&EPL); + + while (!Worklist.empty()) { + EPLPtr Node = Worklist.back(); + Worklist.pop_back(); + + updateReachablesWithOutgoings(Node, Worklist); + } + // This is not an iterative algorithm so stop iteration by retruning false: + return false; + } +}; + +AnalysisRegistry::Add + RegisterPointerFlowReachableAnalysis( + "Reachable pointers from unsafe buffer usage in pointer flow graph"); + } // namespace // NOLINTNEXTLINE(misc-use-internal-linkage) diff --git a/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowExtractor.cpp b/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowExtractor.cpp index 8f79c987cc290..e1130a2c52e4c 100644 --- a/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowExtractor.cpp +++ 
b/clang/lib/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowExtractor.cpp @@ -17,7 +17,6 @@ #include "clang/AST/TypeBase.h" #include "clang/ScalableStaticAnalysisFramework/Analyses/EntityPointerLevel/EntityPointerLevel.h" #include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlow.h" -#include "clang/ScalableStaticAnalysisFramework/Core/ASTEntityMapping.h" #include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityId.h" #include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityName.h" #include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h" diff --git a/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt b/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt index 0f8854d471fd1..e852d99d34781 100644 --- a/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt +++ b/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt @@ -29,6 +29,7 @@ add_distinct_clang_unittest(ClangScalableAnalysisTests TestFixture.cpp TUSummaryBuilderTest.cpp WholeProgramAnalysis/AnalysisDriverTest.cpp + WholeProgramAnalysis/UnsafeBufferReachableAnalysisTest.cpp CLANG_LIBS clangAST diff --git a/clang/unittests/ScalableStaticAnalysisFramework/WholeProgramAnalysis/UnsafeBufferReachableAnalysisTest.cpp b/clang/unittests/ScalableStaticAnalysisFramework/WholeProgramAnalysis/UnsafeBufferReachableAnalysisTest.cpp new file mode 100644 index 0000000000000..c3a98b6b8e87d --- /dev/null +++ b/clang/unittests/ScalableStaticAnalysisFramework/WholeProgramAnalysis/UnsafeBufferReachableAnalysisTest.cpp @@ -0,0 +1,378 @@ +//===- UnsafeBufferReachableAnalysisTest.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../TestFixture.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/EntityPointerLevel/EntityPointerLevel.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlow.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/PointerFlow/PointerFlowAnalysis.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/UnsafeBufferUsage/UnsafeBufferUsage.h" +#include "clang/ScalableStaticAnalysisFramework/Analyses/UnsafeBufferUsage/UnsafeBufferUsageAnalysis.h" +#include "clang/ScalableStaticAnalysisFramework/Core/EntityLinker/LUSummary.h" +#include "clang/ScalableStaticAnalysisFramework/Core/Model/BuildNamespace.h" +#include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityId.h" +#include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityLinkage.h" +#include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityName.h" +#include "clang/ScalableStaticAnalysisFramework/Core/WholeProgramAnalysis/AnalysisDriver.h" +#include "clang/ScalableStaticAnalysisFramework/Core/WholeProgramAnalysis/WPASuite.h" +#include "llvm/ADT/Sequence.h" +#include "gtest/gtest.h" +#include +#include +#include + +using namespace clang; +using namespace ssaf; + +namespace clang::ssaf { +extern PointerFlowEntitySummary buildPointerFlowEntitySummary(EdgeSet Edges); +extern UnsafeBufferUsageEntitySummary + buildUnsafeBufferUsageEntitySummary(EntityPointerLevelSet); +} // namespace clang::ssaf + +namespace { + +class UnsafeBufferReachableAnalysisTest : public TestFixture { +protected: + using EPLEdge = std::pair; + + static constexpr EntityLinkage ExternalLinkage = + EntityLinkage(EntityLinkageType::External); + + std::unique_ptr makeLUSummary() { + NestedBuildNamespace NS( + {BuildNamespace(BuildNamespaceKind::LinkUnit, "TestLU")}); + return std::make_unique(std::move(NS)); + } + + EntityId 
addEntity(LUSummary &LU, llvm::StringRef USR) { + NestedBuildNamespace NS( + {BuildNamespace(BuildNamespaceKind::LinkUnit, "TestLU")}); + EntityName Name(USR.str(), "", NS); + EntityId Id = getIdTable(LU).getId(Name); + getLinkageTable(LU).insert({Id, ExternalLinkage}); + return Id; + } + + /// Insert a PointerFlowEntitySummary for an entity. + void insertPointerFlowSummary(LUSummary &LU, EntityId Id, EdgeSet Edges) { + getData(LU)[PointerFlowEntitySummary::summaryName()][Id] = + std::make_unique( + buildPointerFlowEntitySummary(std::move(Edges))); + } + + /// Insert an UnsafeBufferUsageEntitySummary for an entity. + void insertUnsafeBufferUsageSummary(LUSummary &LU, EntityId Id, + EntityPointerLevelSet UnsafeBuffers) { + getData(LU)[UnsafeBufferUsageEntitySummary::summaryName()][Id] = + std::make_unique( + buildUnsafeBufferUsageEntitySummary(std::move(UnsafeBuffers))); + } + + /// Create \p N entities in \p LU and return their EntityIds. + std::vector createEntities(LUSummary &LU, unsigned N) { + std::vector Ids; + for (unsigned I = 0; I < N; ++I) + Ids.push_back(addEntity(LU, ("E" + llvm::Twine(I)).str())); + return Ids; + } + + /// Create \p N EPLs, one per entity. + std::vector + createEPLs(llvm::ArrayRef Entities) { + std::vector EPLs; + for (const auto &Id : Entities) + EPLs.push_back(buildEntityPointerLevel(Id, 1)); + return EPLs; + } + + /// Insert both PointerFlow and UnsafeBufferUsage summaries for an entity + /// from a list of edges and a list of starter EPLs. + void insertSummaries(LUSummary &LU, EntityId Id, + llvm::ArrayRef EdgeList, + llvm::ArrayRef StarterList) { + EdgeSet Edges; + for (const auto &[From, To] : EdgeList) + Edges[From].insert(To); + insertPointerFlowSummary(LU, Id, std::move(Edges)); + + EntityPointerLevelSet Starters; + for (const auto &EPL : StarterList) + Starters.insert(EPL); + insertUnsafeBufferUsageSummary(LU, Id, std::move(Starters)); + } + + /// Run the driver and return the flattened reachable EPL set. 
+ std::optional + computeReachables(std::unique_ptr LU, unsigned Line) { + AnalysisDriver Driver(std::move(LU)); + auto WPAOrErr = + Driver.run(); + if (!WPAOrErr) { + ADD_FAILURE_AT(__FILE__, Line) << llvm::toString(WPAOrErr.takeError()); + return std::nullopt; + } + auto ROrErr = WPAOrErr->get(); + if (!ROrErr) { + ADD_FAILURE_AT(__FILE__, Line) << llvm::toString(ROrErr.takeError()); + return std::nullopt; + } + EntityPointerLevelSet Result; + for (const auto &[Id, EPLs] : ROrErr->Reachables) + Result.insert(EPLs.begin(), EPLs.end()); + return Result; + } + + // FIXME: When we use more advanced search algorithms, it may involve + // a divide-and-conquer approach on sub-graphs organized by contributors. + // In that case, we may want to enumerate all possible partitions of + // how edges are distributed among contributors. For now we use + // `singlePartition`. + + /// Compute reachables from \p StarterLayout in the graph defined by \p + /// EdgeLayout. Edges and starters are all belong to Entity 0. 
+ std::optional> + singlePartition(unsigned NumEnt, + llvm::ArrayRef> EdgeLayout, + llvm::ArrayRef StarterLayout, unsigned Line) { + auto LU = makeLUSummary(); + auto Entities = createEntities(*LU, NumEnt); + auto N = createEPLs(Entities); + + std::vector Edges; + for (const auto &[F, T] : EdgeLayout) + Edges.push_back({N[F], N[T]}); + + std::vector Starters; + for (unsigned Idx : StarterLayout) + Starters.push_back(N[Idx]); + + insertSummaries(*LU, Entities[0], Edges, Starters); + for (unsigned Idx = 1; Idx < NumEnt; ++Idx) + insertSummaries(*LU, Entities[Idx], {}, {}); + + auto Reachables = computeReachables(std::move(LU), Line); + if (!Reachables.has_value()) + return std::nullopt; + + std::set ReachableIndices; + for (unsigned I : llvm::seq(0U, NumEnt)) + if (Reachables->count(N[I])) + ReachableIndices.insert(I); + + return ReachableIndices; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Tests below are written in a manner that focuses on pointer flow graph +// topology and the starter set, where numbers are used to represent distinct +// nodes (pointers). +// For example, `LinearChain` tests a graph forming a +// linear chain with 4 distinct nodes: 0 -> 1 -> 2 -> 3 with a starter set {0}, +// where, for example, 0 -> 1 represents an edge where node 0 is the source and +// node 1 is the destination. Thus, {0, 1, 2, 3} is the expected reachable set. +//////////////////////////////////////////////////////////////////////////////// + +// Linear chain: 0 -> 1 -> 2 -> 3. +// Start from {0} => {0, 1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, LinearChain) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {1, 2}, {2, 3}}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 4u); +} + +// Linear chain: 0 -> 1 -> 2 -> 3. 
+// Start from mid-chain {2} => {2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, LinearChainFromMiddle) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {1, 2}, {2, 3}}, + /* StarterLayout */ {2}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 2u); + EXPECT_TRUE(Reachables->count(2)); + EXPECT_TRUE(Reachables->count(3)); +} + +// Diamond: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3. +// Start from {0} => {0, 1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, Diamond) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {0, 2}, {1, 3}, {2, 3}}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 4u); +} + +// Diamond: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3. +// Start from one branch {1} => {1, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, DiamondFromBranch) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {0, 2}, {1, 3}, {2, 3}}, + /* StarterLayout */ {1}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 2u); + EXPECT_TRUE(Reachables->count(1)); + EXPECT_TRUE(Reachables->count(3)); +} + +// Disconnected subgraphs: 0 -> 1, 2 -> 3. +// Start from {0} => {0, 1} +TEST_F(UnsafeBufferReachableAnalysisTest, DisconnectedSubgraphs) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {2, 3}}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 2u); + EXPECT_TRUE(Reachables->count(0)); + EXPECT_TRUE(Reachables->count(1)); +} + +// Disconnected subgraphs: 0 -> 1, 2 -> 3. 
+// Start from tail {1} => {1} +TEST_F(UnsafeBufferReachableAnalysisTest, DisconnectedSubgraphsFromTail) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {2, 3}}, + /* StarterLayout */ {1}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 1u); + EXPECT_TRUE(Reachables->count(1)); +} + +// Cycle: 0 -> 1 -> 2 -> 3 -> 0. +// Start from {2} => {0, 1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, Cycle) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {1, 2}, {2, 3}, {3, 0}}, + /* StarterLayout */ {2}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 4u); + EXPECT_TRUE(Reachables->count(0)); + EXPECT_TRUE(Reachables->count(1)); + EXPECT_TRUE(Reachables->count(2)); + EXPECT_TRUE(Reachables->count(3)); +} + +// Empty graph: no edges, start from {0} => {0} +TEST_F(UnsafeBufferReachableAnalysisTest, EmptyGraph) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 1u); + EXPECT_TRUE(Reachables->count(0)); +} + +// Star: 0 -> 1, 0 -> 2, 0 -> 3. +// Start from {0} => {0, 1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, StarFromHub) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {0, 2}, {0, 3}}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 4u); +} + +// Star: 0 -> 1, 0 -> 2, 0 -> 3. +// Start from leaf {2} => {2} +TEST_F(UnsafeBufferReachableAnalysisTest, StarFromLeaf) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {0, 2}, {0, 3}}, + /* StarterLayout */ {2}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 1u); + EXPECT_TRUE(Reachables->count(2)); +} + +// Reverse star: 0 -> 3, 1 -> 3, 2 -> 3. 
+// Start from {0} => {0, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, ReverseStarFromSource) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 3}, {1, 3}, {2, 3}}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 2u); + EXPECT_TRUE(Reachables->count(0)); + EXPECT_TRUE(Reachables->count(3)); +} + +// Reverse star: 0 -> 3, 1 -> 3, 2 -> 3. +// Start from sink {3} => {3} +TEST_F(UnsafeBufferReachableAnalysisTest, ReverseStarFromSink) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 3}, {1, 3}, {2, 3}}, + /* StarterLayout */ {3}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 1u); + EXPECT_TRUE(Reachables->count(3)); +} + +// Self-loop: 0 -> 1, 1 -> 1, 1 -> 2, 2 -> 3. +// Start from {0} => {0, 1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, SelfLoopFromRoot) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {1, 1}, {1, 2}, {2, 3}}, + /* StarterLayout */ {0}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 4u); +} + +// Self-loop: 0 -> 1, 1 -> 1, 1 -> 2, 2 -> 3. +// Start from {1} => {1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, SelfLoopFromLoopNode) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {1, 1}, {1, 2}, {2, 3}}, + /* StarterLayout */ {1}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 3u); + EXPECT_TRUE(Reachables->count(1)); + EXPECT_TRUE(Reachables->count(2)); + EXPECT_TRUE(Reachables->count(3)); +} + +// Multiple starters: 0 -> 1, 2 -> 3 (disconnected). 
+// Start from {0, 2} => {0, 1, 2, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, MultipleStartersBothChains) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {2, 3}}, + /* StarterLayout */ {0, 2}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 4u); +} + +// Multiple starters: 0 -> 1, 2 -> 3 (disconnected). +// Start from leaves {1, 3} => {1, 3} +TEST_F(UnsafeBufferReachableAnalysisTest, MultipleStartersLeaves) { + auto Reachables = singlePartition( + /* NumEnt */ 4, + /* EdgeLayout */ {{0, 1}, {2, 3}}, + /* StarterLayout */ {1, 3}, __LINE__); + ASSERT_TRUE(Reachables.has_value()); + EXPECT_EQ(Reachables->size(), 2u); + EXPECT_TRUE(Reachables->count(1)); + EXPECT_TRUE(Reachables->count(3)); +} + +} // namespace From 70a70e0ed664148e5701b830e7d758bcb9a51daa Mon Sep 17 00:00:00 2001 From: theRonShark Date: Mon, 11 May 2026 20:42:18 -0400 Subject: [PATCH 382/538] Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation" (#197070) Reverts llvm/llvm-project#192306 breaking other roundtrip test --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 83 +++++----- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 18 -- .../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 45 ----- .../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 9 - llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 17 +- llvm/lib/Target/AMDGPU/SIProgramInfo.h | 5 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 8 - llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 - .../test/CodeGen/AMDGPU/inst-prefetch-hint.ll | 50 +----- .../AMDGPU/inst-prefetch-inline-asm.ll | 154 ------------------ 10 files changed, 68 insertions(+), 324 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 3a2738d9fc498..ad61d8d084c7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ 
-234,18 +234,6 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() { HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } -/// Set bits in a kernel descriptor MCExpr field: -/// return ((Dst & ~Mask) | (Value << Shift)) -static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value, - uint32_t Mask, uint32_t Shift, MCContext &Ctx) { - const auto *Shft = MCConstantExpr::create(Shift, Ctx); - const auto *Msk = MCConstantExpr::create(Mask, Ctx); - Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); - Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), - Ctx); - return Dst; -} - void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { const SIMachineFunctionInfo &MFI = *MF->getInfo(); if (!MFI.isEntryFunction()) @@ -253,29 +241,6 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { assert(TM.getTargetTriple().getOS() == Triple::AMDHSA); - const GCNSubtarget &STM = MF->getSubtarget(); - MCContext &Ctx = MF->getContext(); - - AMDGPU::MCKernelDescriptor KD = - getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo); - - // Compute inst_pref_size using MCExpr label subtraction for exact code - // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter) - // right after the function code, so (Lfunc_end - func_sym) gives the - // exact function code size in bytes. 
- if (STM.hasInstPrefSize()) { - const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(getFunctionEnd(), OutContext), - MCSymbolRefExpr::create(CurrentFnSym, OutContext), OutContext); - - uint32_t Mask, Shift, Width, CacheLineSize; - STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize); - const MCExpr *InstPrefSize = - AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx); - KD.compute_pgm_rsrc3 = - setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx); - } - auto &Streamer = getTargetStreamer()->getStreamer(); auto &Context = Streamer.getContext(); auto &ObjectFileInfo = *Context.getObjectFileInfo(); @@ -289,10 +254,13 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { Streamer.emitValueToAlignment(Align(64), 0, 1, 0); ReadOnlySection.ensureMinAlignment(Align(64)); + const GCNSubtarget &STM = MF->getSubtarget(); + SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU, + STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + CurrentProgramInfo.NumVGPRsForWavesPerEU, MCBinaryExpr::createSub( CurrentProgramInfo.NumSGPRsForWavesPerEU, AMDGPUMCExpr::createExtraSGPRs( @@ -1470,22 +1438,33 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.LdsSize = STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks; ProgInfo.EXCPEnable = 0; + // return ((Dst & ~Mask) | (Value << Shift)) + auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, + uint32_t Shift) { + const auto *Shft = MCConstantExpr::create(Shift, Ctx); + const auto *Msk = MCConstantExpr::create(Mask, Ctx); + Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); + Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), + Ctx); + return Dst; + }; + if (STM.hasGFX90AInsts()) { ProgInfo.ComputePGMRSrc3 = - setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset, + SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx); + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); ProgInfo.ComputePGMRSrc3 = - setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit), + SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit), amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx); + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } if (STM.hasGFX1250Insts()) ProgInfo.ComputePGMRSrc3 = - setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, + SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, - amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx); + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, @@ -1504,6 +1483,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ", final occupancy is " + Twine(Occupancy)); F.getContext().diagnose(Diag); } + + if (isGFX11Plus(STM)) { + uint32_t CodeSizeInBytes = (uint32_t)std::min( + ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */), + (uint64_t)std::numeric_limits::max()); + uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128); + uint32_t Field, Shift, 
Width; + if (isGFX11(STM)) { + Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE; + Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT; + Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; + } else { + Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE; + Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT; + Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; + } + uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1); + ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3, + CreateExpr(InstPrefSize), Field, Shift); + } } static unsigned getRsrcReg(CallingConv::ID CallConv) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 758e9b445d6dd..5f580ac0577d5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -21,7 +21,6 @@ #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/ErrorHandling.h" #define GET_SUBTARGETINFO_HEADER @@ -426,23 +425,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPrefetch() const { return HasGFX12Insts; } - bool hasInstPrefSize() const { return isGFX11Plus(); } - - void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, - uint32_t &CacheLineSize) const { - assert(isGFX11Plus()); - CacheLineSize = getInstCacheLineSize(); - if (getGeneration() == GFX11) { - Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE; - Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT; - Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; - } else { - Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE; - Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT; - Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; - } - } - // Has s_cmpk_* instructions. 
bool hasSCmpK() const { return getGeneration() < GFX12; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 4563803ad6577..fd0a2a6a77d7e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -12,12 +12,9 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -77,9 +74,6 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { case AGVK_Occupancy: OS << "occupancy("; break; - case AGVK_InstPrefSize: - OS << "instprefsize("; - break; case AGVK_Lit: OS << "lit("; break; @@ -188,27 +182,6 @@ bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, return true; } -/// Get the inst_pref_size field width for the given subtarget. 
-static unsigned getInstPrefSizeFieldWidth(const MCSubtargetInfo &STI) { - if (AMDGPU::isGFX12Plus(STI)) - return amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; - return amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; -} - -bool AMDGPUMCExpr::evaluateInstPrefSize(MCValue &Res, - const MCAssembler *Asm) const { - uint64_t CodeSizeInBytes = 0; - if (!evaluateMCExprs(Args, Asm, {CodeSizeInBytes})) - return false; - const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); - unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI); - unsigned CacheLineSize = AMDGPU::IsaInfo::getInstCacheLineSize(STI); - uint64_t CodeSizeInLines = divideCeil(CodeSizeInBytes, CacheLineSize); - uint64_t MaxVal = (1u << FieldWidth) - 1; - Res = MCValue::get(std::min(CodeSizeInLines, MaxVal)); - return true; -} - bool AMDGPUMCExpr::isSymbolUsedInExpression(const MCSymbol *Sym, const MCExpr *E) { switch (E->getKind()) { @@ -254,8 +227,6 @@ bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res, return evaluateTotalNumVGPR(Res, Asm); case AGVK_Occupancy: return evaluateOccupancy(Res, Asm); - case AGVK_InstPrefSize: - return evaluateInstPrefSize(Res, Asm); case AGVK_Lit: case AGVK_Lit64: return Args[0]->evaluateAsRelocatable(Res, Asm); @@ -308,11 +279,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } -const AMDGPUMCExpr * -AMDGPUMCExpr::createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx) { - return create(AGVK_InstPrefSize, {CodeSizeBytes}, Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); @@ -503,7 +469,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, case AMDGPUMCExpr::VariantKind::AGVK_TotalNumVGPRs: case AMDGPUMCExpr::VariantKind::AGVK_AlignTo: case AMDGPUMCExpr::VariantKind::AGVK_Occupancy: - case 
AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize: case AMDGPUMCExpr::VariantKind::AGVK_Lit: case AMDGPUMCExpr::VariantKind::AGVK_Lit64: { int64_t Val; @@ -512,16 +477,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, KBM[Expr] = KnownBits::makeConstant(APValue); return; } - if (AGVK->getKind() == AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize) { - // The result is clamped to (1 << FieldWidth) - 1, so upper bits are - // known zero. FieldWidth is derived from the subtarget. - const MCSubtargetInfo *STI = AGVK->getCtx().getSubtargetInfo(); - unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI); - KnownBits KB(BitWidth); - KB.Zero.setBitsFrom(FieldWidth); - KBM[Expr] = KB; - return; - } KBM[Expr] = KnownBits(BitWidth); return; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 4b1aa0c591a80..96bd8f4cf3c13 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -38,7 +38,6 @@ class AMDGPUMCExpr : public MCTargetExpr { AGVK_TotalNumVGPRs, AGVK_AlignTo, AGVK_Occupancy, - AGVK_InstPrefSize, AGVK_Lit, AGVK_Lit64, }; @@ -70,7 +69,6 @@ class AMDGPUMCExpr : public MCTargetExpr { bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; bool evaluateAlignTo(MCValue &Res, const MCAssembler *Asm) const; bool evaluateOccupancy(MCValue &Res, const MCAssembler *Asm) const; - bool evaluateInstPrefSize(MCValue &Res, const MCAssembler *Asm) const; public: static const AMDGPUMCExpr * @@ -99,18 +97,11 @@ class AMDGPUMCExpr : public MCTargetExpr { return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - /// Create an expression for instruction prefetch size computation: - /// min(divideCeil(CodeSizeBytes, CacheLineSize), (1 << FieldWidth) - 1) - /// FieldWidth and CacheLineSize are derived from the subtarget. 
- static const AMDGPUMCExpr *createInstPrefSize(const MCExpr *CodeSizeBytes, - MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); ArrayRef getArgs() const { return Args; } VariantKind getKind() const { return Kind; } - MCContext &getCtx() const { return Ctx; } const MCExpr *getSubExpr(size_t Index) const; void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 27cef7a1b9158..99255e4060886 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -215,8 +215,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } -uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { - if (CodeSizeInBytes.has_value()) +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF, + bool IsLowerBound) { + if (!IsLowerBound && CodeSizeInBytes.has_value()) return *CodeSizeInBytes; const GCNSubtarget &STM = MF.getSubtarget(); @@ -225,7 +226,12 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { - CodeSize = alignTo(CodeSize, MBB.getAlignment()); + // The amount of padding to align code can be both underestimated and + // overestimated. In case of inline asm used getInstSizeInBytes() will + // return a maximum size of a single instruction, where the real size may + // differ. At this point CodeSize may be already off. + if (!IsLowerBound) + CodeSize = alignTo(CodeSize, MBB.getAlignment()); for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. @@ -233,6 +239,11 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { if (MI.isMetaInstruction()) continue; + // We cannot properly estimate inline asm size. 
It can be as small as zero + // if that is just a comment. + if (IsLowerBound && MI.isInlineAsm()) + continue; + CodeSize += TII->getInstSizeInBytes(MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index fb56ebf88c96f..947b473142a1f 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -105,7 +105,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { void reset(const MachineFunction &MF); // Get function code size and cache the value. - uint64_t getFunctionCodeSize(const MachineFunction &MF); + // If \p IsLowerBound is set it returns a minimal code size which is safe + // to address. + uint64_t getFunctionCodeSize(const MachineFunction &MF, + bool IsLowerBound = false); /// Compute the value of the ComputePGMRsrc1 register. const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index dd67e77d0d9ed..b13aed2432602 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1182,14 +1182,6 @@ std::string AMDGPUTargetID::toString() const { return Str; } -unsigned getInstCacheLineSize(const MCSubtargetInfo *STI) { - if (STI->getFeatureBits().test(FeatureInstCacheLineSize128)) - return 128; - if (STI->getFeatureBits().test(FeatureInstCacheLineSize64)) - return 64; - return 64; -} - unsigned getWavefrontSize(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureWavefrontSize16)) return 16; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index e1b36f0996331..49373f09ee460 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -233,9 +233,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } -/// \returns Instruction cache line size in bytes for given subtarget \p 
STI. -unsigned getInstCacheLineSize(const MCSubtargetInfo *STI); - /// \returns Wavefront size for given subtarget \p STI. unsigned getWavefrontSize(const MCSubtargetInfo *STI); diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll index b76ef7eac11c4..580167076e1f0 100644 --- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll +++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll @@ -1,31 +1,11 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s -;; Verify that inst_pref_size resolves to the correct value in the object file. -;; COMPUTE_PGM_RSRC3 is at offset 0x2C in each 64-byte kernel descriptor. -;; inst_pref_size is bits [9:4] on GFX11 (6-bit) and bits [11:4] on GFX12+ (8-bit). -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx11.o -; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx12.o -; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s - -; The inst_pref_size is computed via MCExpr label subtraction, resolved at -; assembly/link time. 
In text output it appears as: -; ((instprefsize()<>Shift -; where: -; = .Lfunc_endN - func_sym (exact function code size in bytes) -; instprefsize = min(divideCeil(code_size, cache_line_size), (1 << field_width) - 1) -; field_width and cache_line_size are derived from the subtarget - ; GCN-LABEL: .amdhsa_kernel large -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&1008)>>4 -; GFX11: codeLenInByte = {{[0-9]+}} -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&4080)>>4 -; GFX12: codeLenInByte = {{[0-9]+}} -;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C: -;; gfx11 pref=3 (0x30), gfx12 pref=4 (0x40) -; OBJ-GFX11: 0020 {{.*}}30000000 -; OBJ-GFX12: 0020 {{.*}}40000000 +; GFX11: .amdhsa_inst_pref_size 3 +; GFX11: codeLenInByte = 3{{[0-9][0-9]$}} +; GFX12: .amdhsa_inst_pref_size 4 +; GFX12: codeLenInByte = 4{{[0-9][0-9]$}} define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) { bb: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false) @@ -33,30 +13,18 @@ bb: } ; GCN-LABEL: .amdhsa_kernel small -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&1008)>>4 -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&4080)>>4 -; GCN: codeLenInByte = {{[0-9]+}} -;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C: -;; pref=1 (0x10) for both -; OBJ-GFX11: 0060 {{.*}}10000000 -; OBJ-GFX12: 0060 {{.*}}10000000 +; GCN: .amdhsa_inst_pref_size 1 +; GCN: codeLenInByte = {{[0-9]$}} define amdgpu_kernel void @small() { bb: ret void } -; Inline asm is accounted for via MCExpr label subtraction (exact code size). -; The MCExpr resolves to the correct inst_pref_size at assembly time. 
+; Ignore inline asm in size calculation ; GCN-LABEL: .amdhsa_kernel inline_asm -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&1008)>>4 -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&4080)>>4 -; GCN: codeLenInByte = {{[0-9]+}} -;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC: -;; pref=9 (0x90) for both -;; (.fill 256, 4, 0 = 1024 bytes + 4 s_endpgm = 1028 -> divideCeil(1028,128) = 9) -; OBJ-GFX11: 00a0 {{.*}}90000000 -; OBJ-GFX12: 00a0 {{.*}}90000000 +; GCN: .amdhsa_inst_pref_size 1 +; GCN: codeLenInByte = {{[0-9]$}} define amdgpu_kernel void @inline_asm() { bb: call void asm sideeffect ".fill 256, 4, 0", ""() diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll deleted file mode 100644 index 287a30032230b..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll +++ /dev/null @@ -1,154 +0,0 @@ -;; Verify that inline assembly is correctly accounted for in the -;; inst_pref_size calculation. The inst_pref_size is computed via MCExpr -;; label subtraction (.Lfunc_end - func_sym), giving exact code size. -;; See inst-prefetch-hint.ll for explanation of the instprefsize expression. 
- -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s -o %t.gfx11.o -; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s -o %t.gfx12.o -; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s - -;; --- .fill directive: .fill 256, 4, 0 => 1024 bytes + 4 (s_endpgm) = 1028 --- -;; pref_size = divideCeil(1028, 128) = 9 - -; GFX11-LABEL: .amdhsa_kernel test_fill -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_fill -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C: -;; pref_size=9 -> 9<<4 = 0x90 -; OBJ-GFX11: 0020 {{.*}}90000000 -; OBJ-GFX12: 0020 {{.*}}90000000 - -define amdgpu_kernel void @test_fill() { - call void asm sideeffect ".fill 256, 4, 0", ""() - ret void -} - -;; --- .space directive: .space 1024 => 1024 bytes + 4 = 1028 --- -;; pref_size = 9 - -; GFX11-LABEL: .amdhsa_kernel test_space -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_space -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C: -;; pref_size=9 -> 9<<4 = 0x90 -; OBJ-GFX11: 0060 {{.*}}90000000 -; OBJ-GFX12: 0060 {{.*}}90000000 - -define amdgpu_kernel void @test_space() { - call void asm sideeffect ".space 1024", ""() - ret void -} - -;; --- Instructions: 32 x s_nop (4 bytes each) = 128 + 4 = 132 --- -;; pref_size = divideCeil(132, 128) = 2 - -; GFX11-LABEL: .amdhsa_kernel test_instructions -; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-test_instructions)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_instructions -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC: -;; pref_size=2 -> 2<<4 = 0x20 -; OBJ-GFX11: 00a0 {{.*}}20000000 -; OBJ-GFX12: 00a0 {{.*}}20000000 - -define amdgpu_kernel void @test_instructions() { - call void asm sideeffect "s_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0", ""() - ret void -} - -;; --- Comments emit no bytes: only s_endpgm = 4 bytes --- -;; pref_size = 1 - -; GFX11-LABEL: .amdhsa_kernel test_comments -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_comments -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&4080)>>4 -;; Object: kernel descriptor at 0xC0, COMPUTE_PGM_RSRC3 at 0xEC: -;; pref_size=1 -> 1<<4 = 0x10 -; OBJ-GFX11: 00e0 {{.*}}10000000 -; OBJ-GFX12: 00e0 {{.*}}10000000 - -define amdgpu_kernel void @test_comments() { - call void asm sideeffect "; comment 1\0A; comment 2\0A; comment 3", ""() - ret void -} - -;; --- Empty inline asm: only s_endpgm = 4 bytes --- -;; pref_size = 1 - -; GFX11-LABEL: .amdhsa_kernel test_empty_asm -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_empty_asm -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x100, COMPUTE_PGM_RSRC3 at 0x12C: -;; pref_size=1 -> 1<<4 = 0x10 -; OBJ-GFX11: 0120 {{.*}}10000000 -; OBJ-GFX12: 0120 {{.*}}10000000 - -define amdgpu_kernel void @test_empty_asm() { - 
call void asm sideeffect "", ""() - ret void -} - -;; --- Multiple inline asm blocks: .fill (512) + .space (512) + s_endpgm (4) = 1028 --- -;; pref_size = divideCeil(1028, 128) = 9 - -; GFX11-LABEL: .amdhsa_kernel test_multiple_asm -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_multiple_asm -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x140, COMPUTE_PGM_RSRC3 at 0x16C: -;; pref_size=9 -> 9<<4 = 0x90 -; OBJ-GFX11: 0160 {{.*}}90000000 -; OBJ-GFX12: 0160 {{.*}}90000000 - -define amdgpu_kernel void @test_multiple_asm() { - call void asm sideeffect ".fill 128, 4, 0", ""() - call void asm sideeffect ".space 512", ""() - ret void -} - -;; --- Large function that exceeds GFX11 6-bit field max (63) --- -;; .fill 2048, 4, 0 = 8192 bytes + 4 = 8196 bytes -;; divideCeil(8196, 128) = 65, but GFX11 max = (1<<6)-1 = 63 -;; pref_size should clamp to 63 - -; GFX11-LABEL: .amdhsa_kernel test_clamping -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_clamping -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x180, COMPUTE_PGM_RSRC3 at 0x1AC: -;; gfx11: clamped to 63 -> 63<<4 = 0x3F0 -;; gfx12: no clamping, 65 -> 65<<4 = 0x410 -; OBJ-GFX11: 01a0 {{.*}}f0030000 -; OBJ-GFX12: 01a0 {{.*}}10040000 - -define amdgpu_kernel void @test_clamping() { - call void asm sideeffect ".fill 2048, 4, 0", ""() - ret void -} - -;; --- Large function that exceeds both GFX11 and GFX12 field max --- -;; .fill 8192, 4, 0 = 32768 bytes + 4 = 32772 bytes -;; divideCeil(32772, 128) = 257 -;; GFX11 max = 63, GFX12 max = 255 -> both clamp - -; GFX11-LABEL: .amdhsa_kernel test_clamping_both -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel 
test_clamping_both -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x1C0, COMPUTE_PGM_RSRC3 at 0x1EC: -;; gfx11: clamped to 63 -> 63<<4 = 0x3F0 -;; gfx12: clamped to 255 -> 255<<4 = 0xFF0 -; OBJ-GFX11: 01e0 {{.*}}f0030000 -; OBJ-GFX12: 01e0 {{.*}}f00f0000 - -define amdgpu_kernel void @test_clamping_both() { - call void asm sideeffect ".fill 8192, 4, 0", ""() - ret void -} From b48b110a49f7833e4e91258a2ca08672b1b13de4 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 12 May 2026 08:51:32 +0800 Subject: [PATCH 383/538] [LV][RISCV] Simplify strided-accesses test checks by ignoring loop metadata. nfc (#196026) Use --replace-value-regex to ignore specific !llvm.loop metadata numbers since the metadata IDs are not important for this test. --- .../LoopVectorize/RISCV/strided-accesses.ll | 606 ++++++------------ 1 file changed, 212 insertions(+), 394 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 8d074d1c30c4e..16715f15a20dd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --replace-value-regex "!llvm.loop ![0-9]+" --version 6 ; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S | FileCheck --check-prefixes=COMMON,CHECK,NOSTRIDED %s ; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-interleave=2 -S | FileCheck --check-prefixes=COMMON,CHECK-UF2,NOSTRIDED-UF2 %s ; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -lv-strided-pointer-ivs=true -laa-speculate-unit-stride=false -S | FileCheck 
--check-prefixes=COMMON,STRIDED-COMMON,CHECK,STRIDED %s @@ -28,7 +28,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -70,7 +70,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-UF2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-UF2-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF2-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK-UF2: [[MIDDLE_BLOCK]]: ; CHECK-UF2-NEXT: br label %[[SCALAR_PH]] ; CHECK-UF2: [[SCALAR_PH]]: @@ -85,7 +85,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-UF2-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; CHECK-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; CHECK-UF2: [[EXIT]]: ; CHECK-UF2-NEXT: ret void ; @@ -131,7 +131,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; 
CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -172,7 +172,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[TMP6]] ; CHECK-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK-UF2: [[MIDDLE_BLOCK]]: ; CHECK-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -190,7 +190,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-UF2-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], 64 ; CHECK-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; CHECK-UF2: [[EXIT]]: ; CHECK-UF2-NEXT: ret void ; @@ -236,7 +236,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP9]], 3 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label 
%[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -286,7 +286,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-UF2-NEXT: [[TMP18:%.*]] = shl i64 [[TMP4]], 3 ; CHECK-UF2-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP18]] ; CHECK-UF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-UF2-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-UF2-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK-UF2: [[MIDDLE_BLOCK]]: ; CHECK-UF2-NEXT: br label %[[SCALAR_PH]] ; CHECK-UF2: [[SCALAR_PH]]: @@ -302,7 +302,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-UF2-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 ; CHECK-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; CHECK-UF2: [[EXIT]]: ; CHECK-UF2-NEXT: ret void ; @@ -347,7 +347,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[MIDDLE_BLOCK]]: ; NOSTRIDED-NEXT: br label %[[EXIT:.*]] ; NOSTRIDED: [[SCALAR_PH]]: @@ -361,7 +361,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; 
NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[EXIT]]: ; NOSTRIDED-NEXT: ret void ; @@ -394,7 +394,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[MIDDLE_BLOCK]]: ; NOSTRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -410,7 +410,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-UF2-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; NOSTRIDED-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[EXIT]]: ; NOSTRIDED-UF2-NEXT: ret void ; @@ -471,7 +471,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[MIDDLE_BLOCK]]: ; 
NOSTRIDED-NEXT: br label %[[EXIT:.*]] ; NOSTRIDED: [[SCALAR_PH]]: @@ -486,7 +486,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[EXIT]]: ; NOSTRIDED-NEXT: ret void ; @@ -519,7 +519,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[MIDDLE_BLOCK]]: ; NOSTRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -536,7 +536,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[EXIT]]: ; NOSTRIDED-UF2-NEXT: ret void ; @@ -646,7 +646,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw 
i64 [[AVL]], [[TMP13]] ; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[MIDDLE_BLOCK]]: ; NOSTRIDED-NEXT: br label %[[EXIT:.*]] ; NOSTRIDED: [[SCALAR_PH]]: @@ -661,7 +661,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[EXIT]]: ; NOSTRIDED-NEXT: ret void ; @@ -706,7 +706,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: store [[TMP13]], ptr [[TMP17]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; NOSTRIDED-UF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[MIDDLE_BLOCK]]: ; NOSTRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -723,7 +723,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; NOSTRIDED-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; 
NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[EXIT]]: ; NOSTRIDED-UF2-NEXT: ret void ; @@ -799,7 +799,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; STRIDED: [[MIDDLE_BLOCK]]: ; STRIDED-NEXT: br label %[[EXIT:.*]] ; STRIDED: [[SCALAR_PH]]: @@ -814,7 +814,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; STRIDED: [[EXIT]]: ; STRIDED-NEXT: ret void ; @@ -904,7 +904,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP30]] ; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] ; STRIDED-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; STRIDED-UF2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; STRIDED-UF2: [[MIDDLE_BLOCK]]: ; STRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; STRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -921,7 +921,7 @@ define void 
@double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; STRIDED-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; STRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; STRIDED-UF2: [[EXIT]]: ; STRIDED-UF2-NEXT: ret void ; @@ -966,7 +966,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[MIDDLE_BLOCK]]: ; NOSTRIDED-NEXT: br label %[[EXIT:.*]] ; NOSTRIDED: [[SCALAR_PH]]: @@ -982,7 +982,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED: [[EXIT]]: ; NOSTRIDED-NEXT: ret void ; @@ -1015,7 +1015,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: store [[TMP9]], ptr [[TMP7]], align 4 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP14:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[MIDDLE_BLOCK]]: ; NOSTRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -1033,7 +1033,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; NOSTRIDED-UF2: [[EXIT]]: ; NOSTRIDED-UF2-NEXT: ret void ; @@ -1159,7 +1159,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]] ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]] ; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; STRIDED-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; STRIDED: [[MIDDLE_BLOCK]]: ; STRIDED-NEXT: br label %[[EXIT:.*]] ; STRIDED: [[SCALAR_PH]]: @@ -1175,7 +1175,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]] ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; STRIDED: [[EXIT]]: ; STRIDED-NEXT: 
ret void ; @@ -1243,7 +1243,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP21]] ; STRIDED-UF2-NEXT: [[PTR_IND15]] = getelementptr i8, ptr [[POINTER_PHI9]], i64 [[TMP21]] ; STRIDED-UF2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-UF2-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; STRIDED-UF2-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; STRIDED-UF2: [[MIDDLE_BLOCK]]: ; STRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; STRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -1263,7 +1263,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]] ; STRIDED-UF2-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; STRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; STRIDED-UF2: [[EXIT]]: ; STRIDED-UF2-NEXT: ret void ; @@ -1299,165 +1299,85 @@ exit: ; } ; define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { -; NOSTRIDED-LABEL: define void @constant_stride_reinterpret( -; NOSTRIDED-SAME: ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]]) #[[ATTR0]] { -; NOSTRIDED-NEXT: [[ENTRY:.*:]] -; NOSTRIDED-NEXT: br label %[[VECTOR_PH:.*]] -; NOSTRIDED: [[VECTOR_PH]]: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() -; NOSTRIDED-NEXT: br label %[[VECTOR_BODY:.*]] -; NOSTRIDED: [[VECTOR_BODY]]: -; NOSTRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], %[[VECTOR_PH]] 
], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; NOSTRIDED-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NOSTRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[VEC_IND]] -; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP4]], splat (i1 true), i32 [[TMP2]]) -; NOSTRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[EVL_BASED_IV]] -; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) -; NOSTRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP3]], [[EVL_BASED_IV]] -; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] -; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] -; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] -; NOSTRIDED: [[MIDDLE_BLOCK]]: -; NOSTRIDED-NEXT: br label %[[EXIT:.*]] -; NOSTRIDED: [[EXIT]]: -; NOSTRIDED-NEXT: ret void -; -; NOSTRIDED-UF2-LABEL: define void @constant_stride_reinterpret( -; NOSTRIDED-UF2-SAME: ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]]) #[[ATTR0]] { -; NOSTRIDED-UF2-NEXT: [[ENTRY:.*]]: -; NOSTRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; NOSTRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; NOSTRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] -; NOSTRIDED-UF2: [[VECTOR_PH]]: -; NOSTRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1 -; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NOSTRIDED-UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 -; NOSTRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; NOSTRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; NOSTRIDED-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; NOSTRIDED-UF2: [[VECTOR_BODY]]: -; NOSTRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add nuw [[VEC_IND]], [[BROADCAST_SPLAT]] -; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[VEC_IND]] -; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[STEP_ADD]] -; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( align 8 [[TMP7]], splat (i1 true), poison) -; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( align 8 [[TMP8]], splat (i1 true), poison) -; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[INDEX]] -; NOSTRIDED-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP9]], i64 [[TMP3]] -; NOSTRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8 -; NOSTRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8 -; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; NOSTRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw 
[[STEP_ADD]], [[BROADCAST_SPLAT]] -; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] -; NOSTRIDED-UF2: [[MIDDLE_BLOCK]]: -; NOSTRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; NOSTRIDED-UF2: [[SCALAR_PH]]: -; NOSTRIDED-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; NOSTRIDED-UF2-NEXT: br label %[[LOOP:.*]] -; NOSTRIDED-UF2: [[LOOP]]: -; NOSTRIDED-UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; NOSTRIDED-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], i64 [[IV]] -; NOSTRIDED-UF2-NEXT: [[TMP14:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; NOSTRIDED-UF2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[IV]] -; NOSTRIDED-UF2-NEXT: store i64 [[TMP14]], ptr [[ARRAYIDX2]], align 8 -; NOSTRIDED-UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; NOSTRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] -; NOSTRIDED-UF2: [[EXIT]]: -; NOSTRIDED-UF2-NEXT: ret void -; -; STRIDED-LABEL: define void @constant_stride_reinterpret( -; STRIDED-SAME: ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]]) #[[ATTR0]] { -; STRIDED-NEXT: [[ENTRY:.*:]] -; STRIDED-NEXT: br label %[[VECTOR_PH:.*]] -; STRIDED: [[VECTOR_PH]]: -; STRIDED-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() -; STRIDED-NEXT: br label %[[VECTOR_BODY:.*]] -; STRIDED: [[VECTOR_BODY]]: -; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; 
STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; STRIDED-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[VEC_IND]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP4]], splat (i1 true), i32 [[TMP2]]) -; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[EVL_BASED_IV]] -; STRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) -; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP3]], [[EVL_BASED_IV]] -; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] -; STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] -; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] -; STRIDED: [[MIDDLE_BLOCK]]: -; STRIDED-NEXT: br label %[[EXIT:.*]] -; STRIDED: [[EXIT]]: -; STRIDED-NEXT: ret void +; CHECK-LABEL: define void @constant_stride_reinterpret( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP3]], splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP4]], splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[CURRENT_ITERATION_NEXT]] = add nuw i64 [[TMP2]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP2]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; -; STRIDED-UF2-LABEL: define void @constant_stride_reinterpret( -; STRIDED-UF2-SAME: ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]]) #[[ATTR0]] { -; STRIDED-UF2-NEXT: [[ENTRY:.*]]: -; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; STRIDED-UF2: [[VECTOR_PH]]: -; STRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-UF2-NEXT: 
[[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1 -; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; STRIDED-UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 -; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; STRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; STRIDED-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; STRIDED-UF2: [[VECTOR_BODY]]: -; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add nuw [[VEC_IND]], [[BROADCAST_SPLAT]] -; STRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[VEC_IND]] -; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[STEP_ADD]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( align 8 [[TMP7]], splat (i1 true), poison) -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( align 8 [[TMP8]], splat (i1 true), poison) -; STRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[INDEX]] -; STRIDED-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP9]], i64 [[TMP3]] -; STRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8 -; STRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8 -; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] -; STRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-UF2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; STRIDED-UF2: [[MIDDLE_BLOCK]]: -; STRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; STRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; STRIDED-UF2: [[SCALAR_PH]]: -; STRIDED-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; STRIDED-UF2-NEXT: br label %[[LOOP:.*]] -; STRIDED-UF2: [[LOOP]]: -; STRIDED-UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; STRIDED-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], i64 [[IV]] -; STRIDED-UF2-NEXT: [[TMP14:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; STRIDED-UF2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[IV]] -; STRIDED-UF2-NEXT: store i64 [[TMP14]], ptr [[ARRAYIDX2]], align 8 -; STRIDED-UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; STRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; STRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] -; STRIDED-UF2: [[EXIT]]: -; STRIDED-UF2-NEXT: ret void +; CHECK-UF2-LABEL: define void @constant_stride_reinterpret( +; CHECK-UF2-SAME: ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-UF2-NEXT: [[ENTRY:.*]]: +; CHECK-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-UF2: [[VECTOR_PH]]: +; CHECK-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF2-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1 +; CHECK-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 +; CHECK-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-UF2-NEXT: 
[[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; CHECK-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; CHECK-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-UF2: [[VECTOR_BODY]]: +; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add nuw [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-UF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[VEC_IND]] +; CHECK-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[STEP_ADD]] +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( align 8 [[TMP6]], splat (i1 true), poison) +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( align 8 [[TMP7]], splat (i1 true), poison) +; CHECK-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[INDEX]] +; CHECK-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP8]], i64 [[TMP3]] +; CHECK-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP8]], align 8 +; CHECK-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP9]], align 8 +; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-UF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; CHECK-UF2: [[MIDDLE_BLOCK]]: +; CHECK-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-UF2: [[SCALAR_PH]]: +; CHECK-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-UF2-NEXT: br label %[[LOOP:.*]] +; CHECK-UF2: [[LOOP]]: +; CHECK-UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], i64 [[IV]] +; CHECK-UF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-UF2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[IV]] +; CHECK-UF2-NEXT: store i64 [[TMP11]], ptr [[ARRAYIDX2]], align 8 +; CHECK-UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} +; CHECK-UF2: [[EXIT]]: +; CHECK-UF2-NEXT: ret void ; entry: br label %loop @@ -1480,209 +1400,107 @@ exit: ; strided access at the same time is vectorized as an interleaved load rather ; than a strided load. define void @interleaved_load_instead_of_strided(ptr %a) { -; NOSTRIDED-LABEL: define void @interleaved_load_instead_of_strided( -; NOSTRIDED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { -; NOSTRIDED-NEXT: [[ENTRY:.*:]] -; NOSTRIDED-NEXT: br label %[[VECTOR_PH:.*]] -; NOSTRIDED: [[VECTOR_PH]]: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv4i64() -; NOSTRIDED-NEXT: br label %[[VECTOR_BODY:.*]] -; NOSTRIDED: [[VECTOR_BODY]]: -; NOSTRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; NOSTRIDED-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP2]], i64 0 -; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; 
NOSTRIDED-NEXT: [[TMP3:%.*]] = getelementptr [4 x i32], ptr [[A]], [[VEC_IND]] -; NOSTRIDED-NEXT: [[TMP4:%.*]] = extractelement [[TMP3]], i64 0 -; NOSTRIDED-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP1]], 4 -; NOSTRIDED-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv16i32.p0(ptr align 4 [[TMP4]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) -; NOSTRIDED-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VP_LOAD]]) -; NOSTRIDED-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; NOSTRIDED-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; NOSTRIDED-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; NOSTRIDED-NEXT: [[TMP8:%.*]] = add [[TMP5]], [[TMP6]] -; NOSTRIDED-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[TMP7]] -; NOSTRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP9]], align 4 [[TMP3]], splat (i1 true), i32 [[TMP1]]) -; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP2]] -; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] -; NOSTRIDED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; NOSTRIDED-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; NOSTRIDED: [[MIDDLE_BLOCK]]: -; NOSTRIDED-NEXT: br label %[[EXIT:.*]] -; NOSTRIDED: [[EXIT]]: -; NOSTRIDED-NEXT: ret void -; -; NOSTRIDED-UF2-LABEL: define void @interleaved_load_instead_of_strided( -; NOSTRIDED-UF2-SAME: ptr [[A:%.*]]) #[[ATTR0]] { -; NOSTRIDED-UF2-NEXT: [[ENTRY:.*]]: -; NOSTRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; NOSTRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; NOSTRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; NOSTRIDED-UF2: [[VECTOR_PH]]: -; NOSTRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-UF2-NEXT: [[TMP3:%.*]] = shl nuw i64 
[[TMP2]], 2 -; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NOSTRIDED-UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 -; NOSTRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; NOSTRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i64() -; NOSTRIDED-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; NOSTRIDED-UF2: [[VECTOR_BODY]]: -; NOSTRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NOSTRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add nuw [[VEC_IND]], [[BROADCAST_SPLAT]] -; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = getelementptr [4 x i32], ptr [[A]], [[VEC_IND]] -; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = extractelement [[TMP6]], i64 0 -; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr [4 x i32], ptr [[A]], [[STEP_ADD]] -; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = extractelement [[TMP8]], i64 0 -; NOSTRIDED-UF2-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 4 -; NOSTRIDED-UF2-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC]]) -; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; NOSTRIDED-UF2-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; NOSTRIDED-UF2-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP9]], align 4 -; NOSTRIDED-UF2-NEXT: [[STRIDED_VEC2:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC1]]) -; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 0 -; NOSTRIDED-UF2-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 1 -; NOSTRIDED-UF2-NEXT: 
[[TMP15:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 3 -; NOSTRIDED-UF2-NEXT: [[TMP16:%.*]] = add [[TMP10]], [[TMP11]] -; NOSTRIDED-UF2-NEXT: [[TMP17:%.*]] = add [[TMP13]], [[TMP14]] -; NOSTRIDED-UF2-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP12]] -; NOSTRIDED-UF2-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP15]] -; NOSTRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], align 4 [[TMP6]], splat (i1 true)) -; NOSTRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP19]], align 4 [[TMP8]], splat (i1 true)) -; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; NOSTRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] -; NOSTRIDED-UF2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; NOSTRIDED-UF2: [[MIDDLE_BLOCK]]: -; NOSTRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; NOSTRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; NOSTRIDED-UF2: [[SCALAR_PH]]: -; NOSTRIDED-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; NOSTRIDED-UF2-NEXT: br label %[[LOOP:.*]] -; NOSTRIDED-UF2: [[LOOP]]: -; NOSTRIDED-UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; NOSTRIDED-UF2-NEXT: [[BASE:%.*]] = getelementptr [4 x i32], ptr [[A]], i64 [[IV]] -; NOSTRIDED-UF2-NEXT: [[V0:%.*]] = load i32, ptr [[BASE]], align 4 -; NOSTRIDED-UF2-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 4 -; NOSTRIDED-UF2-NEXT: [[V1:%.*]] = load i32, ptr [[P1]], align 4 -; NOSTRIDED-UF2-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[BASE]], i64 12 -; NOSTRIDED-UF2-NEXT: [[V3:%.*]] = load i32, ptr [[P3]], align 4 -; NOSTRIDED-UF2-NEXT: [[ADD0:%.*]] = add i32 [[V0]], [[V1]] -; NOSTRIDED-UF2-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], [[V3]] -; NOSTRIDED-UF2-NEXT: 
store i32 [[ADD1]], ptr [[BASE]], align 4 -; NOSTRIDED-UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; NOSTRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; NOSTRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] -; NOSTRIDED-UF2: [[EXIT]]: -; NOSTRIDED-UF2-NEXT: ret void -; -; STRIDED-LABEL: define void @interleaved_load_instead_of_strided( -; STRIDED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { -; STRIDED-NEXT: [[ENTRY:.*:]] -; STRIDED-NEXT: br label %[[VECTOR_PH:.*]] -; STRIDED: [[VECTOR_PH]]: -; STRIDED-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv4i64() -; STRIDED-NEXT: br label %[[VECTOR_BODY:.*]] -; STRIDED: [[VECTOR_BODY]]: -; STRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; STRIDED-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP2]], i64 0 -; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; STRIDED-NEXT: [[TMP3:%.*]] = getelementptr [4 x i32], ptr [[A]], [[VEC_IND]] -; STRIDED-NEXT: [[TMP4:%.*]] = extractelement [[TMP3]], i64 0 -; STRIDED-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP1]], 4 -; STRIDED-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv16i32.p0(ptr align 4 [[TMP4]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) -; STRIDED-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VP_LOAD]]) -; STRIDED-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; STRIDED-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; STRIDED-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; STRIDED-NEXT: [[TMP8:%.*]] = add [[TMP5]], 
[[TMP6]] -; STRIDED-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[TMP7]] -; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP9]], align 4 [[TMP3]], splat (i1 true), i32 [[TMP1]]) -; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP2]] -; STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] -; STRIDED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; STRIDED-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] -; STRIDED: [[MIDDLE_BLOCK]]: -; STRIDED-NEXT: br label %[[EXIT:.*]] -; STRIDED: [[EXIT]]: -; STRIDED-NEXT: ret void +; CHECK-LABEL: define void @interleaved_load_instead_of_strided( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [4 x i32], ptr [[A]], [[VEC_IND]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement [[TMP3]], i64 0 +; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP1]], 4 +; CHECK-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv16i32.p0(ptr align 4 [[TMP4]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VP_LOAD]]) +; CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[TMP7]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP9]], align 4 [[TMP3]], splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP2]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; -; STRIDED-UF2-LABEL: define void @interleaved_load_instead_of_strided( -; STRIDED-UF2-SAME: ptr [[A:%.*]]) #[[ATTR0]] { -; STRIDED-UF2-NEXT: [[ENTRY:.*]]: -; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; STRIDED-UF2: [[VECTOR_PH]]: -; STRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-UF2-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; STRIDED-UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 -; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; STRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i64() -; STRIDED-UF2-NEXT: br label %[[VECTOR_BODY:.*]] -; STRIDED-UF2: [[VECTOR_BODY]]: -; 
STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add nuw [[VEC_IND]], [[BROADCAST_SPLAT]] -; STRIDED-UF2-NEXT: [[TMP6:%.*]] = getelementptr [4 x i32], ptr [[A]], [[VEC_IND]] -; STRIDED-UF2-NEXT: [[TMP7:%.*]] = extractelement [[TMP6]], i64 0 -; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr [4 x i32], ptr [[A]], [[STEP_ADD]] -; STRIDED-UF2-NEXT: [[TMP9:%.*]] = extractelement [[TMP8]], i64 0 -; STRIDED-UF2-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 4 -; STRIDED-UF2-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC]]) -; STRIDED-UF2-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; STRIDED-UF2-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; STRIDED-UF2-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; STRIDED-UF2-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP9]], align 4 -; STRIDED-UF2-NEXT: [[STRIDED_VEC2:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC1]]) -; STRIDED-UF2-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 0 -; STRIDED-UF2-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 1 -; STRIDED-UF2-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 3 -; STRIDED-UF2-NEXT: [[TMP16:%.*]] = add [[TMP10]], [[TMP11]] -; STRIDED-UF2-NEXT: [[TMP17:%.*]] = add [[TMP13]], [[TMP14]] -; STRIDED-UF2-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP12]] -; STRIDED-UF2-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP15]] -; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], align 4 [[TMP6]], splat (i1 true)) -; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP19]], align 4 [[TMP8]], splat (i1 true)) -; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; 
STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] -; STRIDED-UF2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-UF2-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] -; STRIDED-UF2: [[MIDDLE_BLOCK]]: -; STRIDED-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; STRIDED-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; STRIDED-UF2: [[SCALAR_PH]]: -; STRIDED-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; STRIDED-UF2-NEXT: br label %[[LOOP:.*]] -; STRIDED-UF2: [[LOOP]]: -; STRIDED-UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; STRIDED-UF2-NEXT: [[BASE:%.*]] = getelementptr [4 x i32], ptr [[A]], i64 [[IV]] -; STRIDED-UF2-NEXT: [[V0:%.*]] = load i32, ptr [[BASE]], align 4 -; STRIDED-UF2-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 4 -; STRIDED-UF2-NEXT: [[V1:%.*]] = load i32, ptr [[P1]], align 4 -; STRIDED-UF2-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[BASE]], i64 12 -; STRIDED-UF2-NEXT: [[V3:%.*]] = load i32, ptr [[P3]], align 4 -; STRIDED-UF2-NEXT: [[ADD0:%.*]] = add i32 [[V0]], [[V1]] -; STRIDED-UF2-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], [[V3]] -; STRIDED-UF2-NEXT: store i32 [[ADD1]], ptr [[BASE]], align 4 -; STRIDED-UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; STRIDED-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; STRIDED-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] -; STRIDED-UF2: [[EXIT]]: -; STRIDED-UF2-NEXT: ret void +; CHECK-UF2-LABEL: define void @interleaved_load_instead_of_strided( +; CHECK-UF2-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-UF2-NEXT: [[ENTRY:.*]]: +; CHECK-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, 
[[TMP1]] +; CHECK-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-UF2: [[VECTOR_PH]]: +; CHECK-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF2-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 +; CHECK-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; CHECK-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; CHECK-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-UF2: [[VECTOR_BODY]]: +; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add nuw [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-UF2-NEXT: [[TMP6:%.*]] = getelementptr [4 x i32], ptr [[A]], [[VEC_IND]] +; CHECK-UF2-NEXT: [[TMP7:%.*]] = extractelement [[TMP6]], i64 0 +; CHECK-UF2-NEXT: [[TMP8:%.*]] = getelementptr [4 x i32], ptr [[A]], [[STEP_ADD]] +; CHECK-UF2-NEXT: [[TMP9:%.*]] = extractelement [[TMP8]], i64 0 +; CHECK-UF2-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 4 +; CHECK-UF2-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC]]) +; CHECK-UF2-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; CHECK-UF2-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; CHECK-UF2-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; CHECK-UF2-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-UF2-NEXT: [[STRIDED_VEC2:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC1]]) +; CHECK-UF2-NEXT: [[TMP13:%.*]] = 
extractvalue { , , , } [[STRIDED_VEC2]], 0 +; CHECK-UF2-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 1 +; CHECK-UF2-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 3 +; CHECK-UF2-NEXT: [[TMP16:%.*]] = add [[TMP10]], [[TMP11]] +; CHECK-UF2-NEXT: [[TMP17:%.*]] = add [[TMP13]], [[TMP14]] +; CHECK-UF2-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP12]] +; CHECK-UF2-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP15]] +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], align 4 [[TMP6]], splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP19]], align 4 [[TMP8]], splat (i1 true)) +; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-UF2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-UF2-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; CHECK-UF2: [[MIDDLE_BLOCK]]: +; CHECK-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-UF2: [[SCALAR_PH]]: +; CHECK-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-UF2-NEXT: br label %[[LOOP:.*]] +; CHECK-UF2: [[LOOP]]: +; CHECK-UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-UF2-NEXT: [[BASE:%.*]] = getelementptr [4 x i32], ptr [[A]], i64 [[IV]] +; CHECK-UF2-NEXT: [[V0:%.*]] = load i32, ptr [[BASE]], align 4 +; CHECK-UF2-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 4 +; CHECK-UF2-NEXT: [[V1:%.*]] = load i32, ptr [[P1]], align 4 +; CHECK-UF2-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[BASE]], i64 12 +; CHECK-UF2-NEXT: [[V3:%.*]] = load i32, ptr [[P3]], align 4 +; CHECK-UF2-NEXT: [[ADD0:%.*]] = add i32 [[V0]], [[V1]] +; CHECK-UF2-NEXT: [[ADD1:%.*]] = add i32 [[ADD0]], 
[[V3]] +; CHECK-UF2-NEXT: store i32 [[ADD1]], ptr [[BASE]], align 4 +; CHECK-UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-UF2-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-UF2-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} +; CHECK-UF2: [[EXIT]]: +; CHECK-UF2-NEXT: ret void ; entry: br label %loop From 08c77d0990c1f19d3cf419b899091958af1eb782 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 11 May 2026 21:03:13 -0400 Subject: [PATCH 384/538] [gn build] Port 0c101370f58a (#197073) --- .../clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn index 0db7b474b0137..95b7e1bedd146 100644 --- a/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn @@ -48,5 +48,6 @@ unittest("ClangScalableAnalysisTests") { "TUSummaryBuilderTest.cpp", "TestFixture.cpp", "WholeProgramAnalysis/AnalysisDriverTest.cpp", + "WholeProgramAnalysis/UnsafeBufferReachableAnalysisTest.cpp", ] } From 6709926d72a1cd5e1013bbe190c0cf1866f055e3 Mon Sep 17 00:00:00 2001 From: Demo_mod Date: Tue, 12 May 2026 07:14:46 +0530 Subject: [PATCH 385/538] [clang-tidy][docs] Remove outdated Phabricator reference (#196997) Removed the old Phabricator mention from the clang-tidy contributing guide. Since LLVM uses GitHub for code review now, this updates the wording to match the current contribution workflow. 
--- clang-tools-extra/docs/clang-tidy/Contributing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index 463cfa1bd417c..79551f60e9b1c 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -131,8 +131,8 @@ So you have an idea of a useful check for :program:`clang-tidy`. First, if you're not familiar with LLVM development, read through the `Getting Started with the LLVM System`_ document for instructions on setting up your workflow and the `LLVM Coding Standards`_ document to familiarize yourself -with the coding style used in the project. For code reviews, we currently -use `LLVM Github`_, though historically we used Phabricator. +with the coding style used in the project. Code reviews for clang-tidy +contributions are done through `LLVM Github`_. .. _Getting Started with the LLVM System: https://llvm.org/docs/GettingStarted.html .. _LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html From 05c0db7f0aa10f04dc545eef7b3768528523ca98 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Mon, 11 May 2026 19:26:50 -0700 Subject: [PATCH 386/538] [TableGen] Emit the primary input file in -d depfile output (#197061) This fixes a bug where old, but still supported, versions of CMake and ninja perpetually consider zero-include tablegen files to be out of date. It also matches what Clang and GCC do for regular C compilations. When a .td input has no `include` directives, the depfile produced by `-d` contains only `:` followed by zero dependencies. My version (3.27) of CMake's `cmake_transform_depfile` step then writes a 0-byte file, which old versions of ninja treat as a missing depfile and re-run the rule on every incremental build (e.g. Attributes.td, ValueTypes.td). 
Here's the effect on Attributes.inc.d: ``` $ cat ./build/include/llvm/IR/Attributes.inc.d Attributes.inc: # switch branches and rebuild... $ cat ./build/include/llvm/IR/Attributes.inc.d Attributes.inc: /work/llvm-project/llvm/include/llvm/IR/Attributes.td ``` An LLM was used to help create this change. --- llvm/lib/TableGen/Main.cpp | 10 ++++++++++ llvm/test/TableGen/depfile.td | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 llvm/test/TableGen/depfile.td diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index cac70165b4131..b86f5f48707b0 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -97,6 +97,16 @@ static int createDependencyFile(const TGParser &Parser, const char *argv0) { return reportError(argv0, "error opening " + DependFilename + ":" + EC.message() + "\n"); DepOut.os() << OutputFilename << ":"; + + // Emit the primary input file as a dependency. This matches C compilers like + // Clang and GCC. Without it, a .td file with no `include` directives would + // produce a depfile listing zero dependencies. CMake's + // `cmake_transform_depfile` then collapses that to a 0-byte file, which Ninja + // treats as a missing depfile and re-runs the rule on every incremental + // build. + if (InputFilename != "-") + DepOut.os() << ' ' << InputFilename; + for (const auto &Dep : Parser.getDependencies()) { DepOut.os() << ' ' << Dep; } diff --git a/llvm/test/TableGen/depfile.td b/llvm/test/TableGen/depfile.td new file mode 100644 index 0000000000000..cb3e0644de867 --- /dev/null +++ b/llvm/test/TableGen/depfile.td @@ -0,0 +1,19 @@ +// Verify that `-d` emits a non-empty depfile that always names the primary +// input file, even when the input has no `include` directives. A depfile +// containing only `:` (no deps) round-trips through CMake's +// `cmake_transform_depfile` step to a 0-byte file, which Ninja treats as a +// missing depfile and unconditionally re-runs the edge. 
+ +// RUN: llvm-tblgen -print-records %s -o %t.out -d %t.d +// RUN: FileCheck %s --input-file=%t.d + +// CHECK: {{.*}}depfile.td.tmp.out: {{.*}}depfile.td + +// When the input is stdin (`-`), there is no input path to emit, so the +// depfile lists only the output target with no dependencies. +// RUN: llvm-tblgen -print-records - -o %t.stdin.out -d %t.stdin.d < %s +// RUN: FileCheck %s --check-prefix=STDIN --input-file=%t.stdin.d + +// STDIN: {{.*}}depfile.td.tmp.stdin.out:{{$}} + +def Empty; From b4f7c93e7d1325f1c1f00b47c10d2923e8259369 Mon Sep 17 00:00:00 2001 From: Yashwant Singh Date: Tue, 12 May 2026 08:19:56 +0530 Subject: [PATCH 387/538] [AArch64][Isel] For fixed length vectors use sve for bitreverse when available (#196025) Lowering bitreverse via the SVE path seems to be giving significant performance improvements for fixed width vectors Speedups after the patch uint8x8_t 1.01x uint16x4_t 2.02x uint32x2_t 2.01x uint64x1_t 2.02x uint8x16_t 1.00x uint16x8_t 2.03x uint32x4_t 2.03x uint64x2_t 2.02x --- .../Target/AArch64/AArch64ISelLowering.cpp | 14 +- llvm/test/CodeGen/AArch64/bitreverse.ll | 246 ++++++++++++++---- .../AArch64/sve-fixed-length-bit-counting.ll | 8 +- .../CodeGen/AArch64/sve-fixed-length-rev.ll | 10 +- 4 files changed, 210 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8608bfe0a8205..d8ba83dd1d584 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1932,6 +1932,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHU, VT, Custom); } + for (auto VT : {MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v1i64, + MVT::v2i64}) + setOperationAction(ISD::BITREVERSE, VT, Custom); + // NEON doesn't support 64-bit vector integer muls, but SVE does. 
setOperationAction(ISD::MUL, MVT::v1i64, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); @@ -1970,7 +1974,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, VT, Custom); // These operations are not supported on NEON but SVE can do them. - setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); @@ -1988,9 +1991,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); // Int operations with no NEON support. - for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, - MVT::v2i32, MVT::v4i32, MVT::v2i64}) { - setOperationAction(ISD::BITREVERSE, VT, Custom); + for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, + MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); @@ -11825,9 +11827,7 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) + if (Subtarget->isSVEorStreamingSVEAvailable()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU); SDLoc DL(Op); diff --git a/llvm/test/CodeGen/AArch64/bitreverse.ll b/llvm/test/CodeGen/AArch64/bitreverse.ll index 61f67328be38e..d3ea6916f0832 100644 --- a/llvm/test/CodeGen/AArch64/bitreverse.ll +++ b/llvm/test/CodeGen/AArch64/bitreverse.ll @@ -1,5 +1,6 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck 
%s --check-prefixes=CHECK,SDAG +; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK,SVE ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=1 %s -o - | FileCheck %s --check-prefixes=CHECK,GISEL ; These tests just check that the plumbing is in place for @llvm.bitreverse. @@ -14,6 +15,14 @@ define <2 x i16> @f(<2 x i16> %a) { ; SDAG-NEXT: ushr v0.2s, v0.2s, #16 ; SDAG-NEXT: ret ; +; SVE-LABEL: f: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s, vl2 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: rbit z0.s, p0/m, z0.s +; SVE-NEXT: ushr v0.2s, v0.2s, #16 +; SVE-NEXT: ret +; ; GISEL-LABEL: f: ; GISEL: // %bb.0: ; GISEL-NEXT: uzp1 v0.4h, v0.4h, v0.4h @@ -152,6 +161,14 @@ define <4 x i8> @g_vec_4x8(<4 x i8> %a) { ; SDAG-NEXT: ushr v0.4h, v0.4h, #8 ; SDAG-NEXT: ret ; +; SVE-LABEL: g_vec_4x8: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h, vl4 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: rbit z0.h, p0/m, z0.h +; SVE-NEXT: ushr v0.4h, v0.4h, #8 +; SVE-NEXT: ret +; ; GISEL-LABEL: g_vec_4x8: ; GISEL: // %bb.0: ; GISEL-NEXT: uzp1 v0.8b, v0.8b, v0.8b @@ -176,11 +193,25 @@ define <9 x i8> @g_vec_9x8(<9 x i8> %a) { declare <4 x i16> @llvm.bitreverse.v4i16(<4 x i16>) readnone define <4 x i16> @g_vec_4x16(<4 x i16> %a) { -; CHECK-LABEL: g_vec_4x16: -; CHECK: // %bb.0: -; CHECK-NEXT: rev16 v0.8b, v0.8b -; CHECK-NEXT: rbit v0.8b, v0.8b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_4x16: +; SDAG: // %bb.0: +; SDAG-NEXT: rev16 v0.8b, v0.8b +; SDAG-NEXT: rbit v0.8b, v0.8b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_4x16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h, vl4 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: rbit z0.h, p0/m, z0.h +; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_4x16: +; GISEL: // %bb.0: +; GISEL-NEXT: rev16 v0.8b, v0.8b +; GISEL-NEXT: rbit v0.8b, v0.8b +; GISEL-NEXT: ret %b = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %a) ret <4 x i16> %b } @@ 
-188,11 +219,25 @@ define <4 x i16> @g_vec_4x16(<4 x i16> %a) { declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone define <8 x i16> @g_vec_8x16(<8 x i16> %a) { -; CHECK-LABEL: g_vec_8x16: -; CHECK: // %bb.0: -; CHECK-NEXT: rev16 v0.16b, v0.16b -; CHECK-NEXT: rbit v0.16b, v0.16b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_8x16: +; SDAG: // %bb.0: +; SDAG-NEXT: rev16 v0.16b, v0.16b +; SDAG-NEXT: rbit v0.16b, v0.16b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_8x16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: rbit z0.h, p0/m, z0.h +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_8x16: +; GISEL: // %bb.0: +; GISEL-NEXT: rev16 v0.16b, v0.16b +; GISEL-NEXT: rbit v0.16b, v0.16b +; GISEL-NEXT: ret %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ret <8 x i16> %b } @@ -200,13 +245,32 @@ define <8 x i16> @g_vec_8x16(<8 x i16> %a) { declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone define <16 x i16> @g_vec_16x16(<16 x i16> %a) { -; CHECK-LABEL: g_vec_16x16: -; CHECK: // %bb.0: -; CHECK-NEXT: rev16 v0.16b, v0.16b -; CHECK-NEXT: rev16 v1.16b, v1.16b -; CHECK-NEXT: rbit v0.16b, v0.16b -; CHECK-NEXT: rbit v1.16b, v1.16b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_16x16: +; SDAG: // %bb.0: +; SDAG-NEXT: rev16 v0.16b, v0.16b +; SDAG-NEXT: rev16 v1.16b, v1.16b +; SDAG-NEXT: rbit v0.16b, v0.16b +; SDAG-NEXT: rbit v1.16b, v1.16b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_16x16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: rbit z0.h, p0/m, z0.h +; SVE-NEXT: rbit z1.h, p0/m, z1.h +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_16x16: +; GISEL: // %bb.0: +; GISEL-NEXT: rev16 v0.16b, v0.16b +; GISEL-NEXT: rev16 v1.16b, v1.16b +; GISEL-NEXT: rbit v0.16b, 
v0.16b +; GISEL-NEXT: rbit v1.16b, v1.16b +; GISEL-NEXT: ret %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ret <16 x i16> %b } @@ -214,11 +278,25 @@ define <16 x i16> @g_vec_16x16(<16 x i16> %a) { declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) readnone define <2 x i32> @g_vec_2x32(<2 x i32> %a) { -; CHECK-LABEL: g_vec_2x32: -; CHECK: // %bb.0: -; CHECK-NEXT: rev32 v0.8b, v0.8b -; CHECK-NEXT: rbit v0.8b, v0.8b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_2x32: +; SDAG: // %bb.0: +; SDAG-NEXT: rev32 v0.8b, v0.8b +; SDAG-NEXT: rbit v0.8b, v0.8b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_2x32: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s, vl2 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: rbit z0.s, p0/m, z0.s +; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_2x32: +; GISEL: // %bb.0: +; GISEL-NEXT: rev32 v0.8b, v0.8b +; GISEL-NEXT: rbit v0.8b, v0.8b +; GISEL-NEXT: ret %b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a) ret <2 x i32> %b } @@ -226,11 +304,25 @@ define <2 x i32> @g_vec_2x32(<2 x i32> %a) { declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone define <4 x i32> @g_vec_4x32(<4 x i32> %a) { -; CHECK-LABEL: g_vec_4x32: -; CHECK: // %bb.0: -; CHECK-NEXT: rev32 v0.16b, v0.16b -; CHECK-NEXT: rbit v0.16b, v0.16b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_4x32: +; SDAG: // %bb.0: +; SDAG-NEXT: rev32 v0.16b, v0.16b +; SDAG-NEXT: rbit v0.16b, v0.16b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_4x32: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: rbit z0.s, p0/m, z0.s +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_4x32: +; GISEL: // %bb.0: +; GISEL-NEXT: rev32 v0.16b, v0.16b +; GISEL-NEXT: rbit v0.16b, v0.16b +; GISEL-NEXT: ret %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ret <4 x i32> %b } @@ -238,13 +330,32 @@ define <4 x i32> @g_vec_4x32(<4 x i32> %a) { declare <8 x i32> 
@llvm.bitreverse.v8i32(<8 x i32>) readnone define <8 x i32> @g_vec_8x32(<8 x i32> %a) { -; CHECK-LABEL: g_vec_8x32: -; CHECK: // %bb.0: -; CHECK-NEXT: rev32 v0.16b, v0.16b -; CHECK-NEXT: rev32 v1.16b, v1.16b -; CHECK-NEXT: rbit v0.16b, v0.16b -; CHECK-NEXT: rbit v1.16b, v1.16b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_8x32: +; SDAG: // %bb.0: +; SDAG-NEXT: rev32 v0.16b, v0.16b +; SDAG-NEXT: rev32 v1.16b, v1.16b +; SDAG-NEXT: rbit v0.16b, v0.16b +; SDAG-NEXT: rbit v1.16b, v1.16b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_8x32: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: rbit z0.s, p0/m, z0.s +; SVE-NEXT: rbit z1.s, p0/m, z1.s +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_8x32: +; GISEL: // %bb.0: +; GISEL-NEXT: rev32 v0.16b, v0.16b +; GISEL-NEXT: rev32 v1.16b, v1.16b +; GISEL-NEXT: rbit v0.16b, v0.16b +; GISEL-NEXT: rbit v1.16b, v1.16b +; GISEL-NEXT: ret %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ret <8 x i32> %b } @@ -258,6 +369,14 @@ define <1 x i64> @g_vec_1x64(<1 x i64> %a) { ; SDAG-NEXT: rbit v0.8b, v0.8b ; SDAG-NEXT: ret ; +; SVE-LABEL: g_vec_1x64: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.d, vl1 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: rbit z0.d, p0/m, z0.d +; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE-NEXT: ret +; ; GISEL-LABEL: g_vec_1x64: ; GISEL: // %bb.0: ; GISEL-NEXT: fmov x8, d0 @@ -271,11 +390,25 @@ define <1 x i64> @g_vec_1x64(<1 x i64> %a) { declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone define <2 x i64> @g_vec_2x64(<2 x i64> %a) { -; CHECK-LABEL: g_vec_2x64: -; CHECK: // %bb.0: -; CHECK-NEXT: rev64 v0.16b, v0.16b -; CHECK-NEXT: rbit v0.16b, v0.16b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_2x64: +; SDAG: // %bb.0: +; SDAG-NEXT: rev64 v0.16b, v0.16b +; SDAG-NEXT: rbit v0.16b, v0.16b +; 
SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_2x64: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: rbit z0.d, p0/m, z0.d +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_2x64: +; GISEL: // %bb.0: +; GISEL-NEXT: rev64 v0.16b, v0.16b +; GISEL-NEXT: rbit v0.16b, v0.16b +; GISEL-NEXT: ret %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ret <2 x i64> %b } @@ -283,13 +416,32 @@ define <2 x i64> @g_vec_2x64(<2 x i64> %a) { declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone define <4 x i64> @g_vec_4x64(<4 x i64> %a) { -; CHECK-LABEL: g_vec_4x64: -; CHECK: // %bb.0: -; CHECK-NEXT: rev64 v0.16b, v0.16b -; CHECK-NEXT: rev64 v1.16b, v1.16b -; CHECK-NEXT: rbit v0.16b, v0.16b -; CHECK-NEXT: rbit v1.16b, v1.16b -; CHECK-NEXT: ret +; SDAG-LABEL: g_vec_4x64: +; SDAG: // %bb.0: +; SDAG-NEXT: rev64 v0.16b, v0.16b +; SDAG-NEXT: rev64 v1.16b, v1.16b +; SDAG-NEXT: rbit v0.16b, v0.16b +; SDAG-NEXT: rbit v1.16b, v1.16b +; SDAG-NEXT: ret +; +; SVE-LABEL: g_vec_4x64: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: rbit z0.d, p0/m, z0.d +; SVE-NEXT: rbit z1.d, p0/m, z1.d +; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 +; SVE-NEXT: ret +; +; GISEL-LABEL: g_vec_4x64: +; GISEL: // %bb.0: +; GISEL-NEXT: rev64 v0.16b, v0.16b +; GISEL-NEXT: rev64 v1.16b, v1.16b +; GISEL-NEXT: rbit v0.16b, v0.16b +; GISEL-NEXT: rbit v1.16b, v1.16b +; GISEL-NEXT: ret %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ret <4 x i64> %b } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll index b62b850434469..2de1976322db0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll @@ -740,9 +740,7 @@ define void @ctpop_v32i64(ptr %a) vscale_range(16,0) #0 { define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: cttz_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b +; CHECK-NEXT: rbit v0.8b, v0.8b ; CHECK-NEXT: clz v0.8b, v0.8b ; CHECK-NEXT: ret %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op) @@ -752,9 +750,7 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: cttz_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b +; CHECK-NEXT: rbit v0.16b, v0.16b ; CHECK-NEXT: clz v0.16b, v0.16b ; CHECK-NEXT: ret %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll index 82d350f6e28f8..7cf981d63f674 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll @@ -12,10 +12,7 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: bitreverse_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: rbit v0.8b, v0.8b ; CHECK-NEXT: ret %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -24,10 +21,7 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: bitreverse_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: // kill: def $q0 killed $q0 
def $z0 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: rbit v0.16b, v0.16b ; CHECK-NEXT: ret %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op) ret <16 x i8> %res From c0d51dd0b6d6759f6157968668f03c772ad54a6f Mon Sep 17 00:00:00 2001 From: Yashwant Singh Date: Tue, 12 May 2026 09:18:57 +0530 Subject: [PATCH 388/538] [AArch64] Improve fixed vector lowering for cttz/ctlz when sve (#192427) When available we should be able to lower fixed size 64/128 bit cttz/ctlz vector instructions using sve. Below are the performance uplifts +--------------+--------------+--------------+ | Function | CTTZ Speedup | CTLZ Speedup | +--------------+--------------+--------------+ | u8x8 | 1.33x | 1.00x | | u16x4 | 1.26x | 1.00x | | u32x2 | 1.26x | 1.00x | | u64x1 | 1.00x | 7.00x | +--------------+--------------+--------------+ | u8x16 | 1.67x | 1.00x | | u16x8 | 1.51x | 1.00x | | u32x4 | 1.51x | 1.00x | | u64x2 | 1.26x | 7.00x | +--------------+--------------+--------------+ --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++++---- .../CodeGen/AArch64/sve-fixed-length-bit-counting.ll | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d8ba83dd1d584..782b928f4b841 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1948,6 +1948,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); } + for (auto VT : {MVT::v1i64, MVT::v2i64}) { + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + } + // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. 
if (Subtarget->useSVEForFixedLengthVectors()) { @@ -1974,9 +1979,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, VT, Custom); // These operations are not supported on NEON but SVE can do them. - setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); - setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); - setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); setOperationAction(ISD::SMAX, MVT::v1i64, Custom); setOperationAction(ISD::SMAX, MVT::v2i64, Custom); setOperationAction(ISD::SMIN, MVT::v1i64, Custom); @@ -1993,7 +1995,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Int operations with no NEON support. for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}) { - setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll index 2de1976322db0..8b333ea342447 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll @@ -273,7 +273,7 @@ define void @ctlz_v64i32(ptr %a) vscale_range(16,0) #0 { ret void } -define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { +define <1 x i64> @ctlz_v1i64(<1 x i64> %op) #0 { ; CHECK-LABEL: ctlz_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl1 @@ -285,7 +285,7 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { ret <1 x i64> %res } -define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { +define <2 x i64> @ctlz_v2i64(<2 x i64> %op) #0 { ; CHECK-LABEL: ctlz_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl2 @@ -1029,7 +1029,7 @@ define void @cttz_v64i32(ptr %a) vscale_range(16,0) #0 { ret void } -define <1 x i64> 
@cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { +define <1 x i64> @cttz_v1i64(<1 x i64> %op) #0 { ; CHECK-LABEL: cttz_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl1 @@ -1042,7 +1042,7 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { ret <1 x i64> %res } -define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { +define <2 x i64> @cttz_v2i64(<2 x i64> %op) #0 { ; CHECK-LABEL: cttz_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl2 From c583e3280ac3f9cd715f9c1d76c492cfd6c69bc2 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Tue, 12 May 2026 12:01:35 +0800 Subject: [PATCH 389/538] [RISCV] Rename dota4 to dot4a (#196921) I don't know how this slipped our eyes... --- clang/include/clang/Basic/riscv_vector.td | 16 +- .../{vdota4_vv.c => vdot4a_vv.c} | 80 +++--- .../{vdota4_vx.c => vdot4a_vx.c} | 80 +++--- .../{vdota4su_vv.c => vdot4asu_vv.c} | 80 +++--- .../{vdota4su_vx.c => vdot4asu_vx.c} | 80 +++--- .../{vdota4u_vv.c => vdot4au_vv.c} | 80 +++--- .../{vdota4u_vx.c => vdot4au_vx.c} | 80 +++--- .../{vdota4us_vx.c => vdot4aus_vx.c} | 80 +++--- .../overloaded/{vdota4_vv.c => vdot4a_vv.c} | 80 +++--- .../overloaded/{vdota4_vx.c => vdot4a_vx.c} | 80 +++--- .../{vdota4su_vv.c => vdot4asu_vv.c} | 80 +++--- .../{vdota4su_vx.c => vdot4asu_vx.c} | 80 +++--- .../overloaded/{vdota4u_vv.c => vdot4au_vv.c} | 80 +++--- .../overloaded/{vdota4u_vx.c => vdot4au_vx.c} | 80 +++--- .../{vdota4us_vx.c => vdot4aus_vx.c} | 80 +++--- .../{vdota4_vv.c => vdot4a_vv.c} | 160 +++++------ .../{vdota4_vx.c => vdot4a_vx.c} | 160 +++++------ .../{vdota4su_vv.c => vdot4asu_vv.c} | 160 +++++------ .../{vdota4su_vx.c => vdot4asu_vx.c} | 160 +++++------ .../{vdota4u_vv.c => vdot4au_vv.c} | 160 +++++------ .../{vdota4u_vx.c => vdot4au_vx.c} | 160 +++++------ .../{vdota4us_vx.c => vdot4aus_vx.c} | 160 +++++------ .../overloaded/{vdota4_vv.c => vdot4a_vv.c} | 160 +++++------ .../overloaded/{vdota4_vx.c => vdot4a_vx.c} | 160 +++++------ .../{vdota4su_vv.c => 
vdot4asu_vv.c} | 160 +++++------ .../{vdota4su_vx.c => vdot4asu_vx.c} | 160 +++++------ .../overloaded/{vdota4u_vv.c => vdot4au_vv.c} | 160 +++++------ .../overloaded/{vdota4u_vx.c => vdot4au_vx.c} | 160 +++++------ .../{vdota4us_vx.c => vdot4aus_vx.c} | 160 +++++------ llvm/include/llvm/IR/IntrinsicsRISCV.td | 18 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 34 +-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 20 +- .../Target/RISCV/RISCVInstrInfoZvdot4a8i.td | 52 ++-- .../Target/RISCV/RISCVSelectionDAGInfo.cpp | 6 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 4 +- .../CodeGen/RISCV/rvv/commutable-zvdot4a8i.ll | 58 ++-- .../RISCV/rvv/fixed-vectors-zvdot4a8i.ll | 258 +++++++++--------- .../RISCV/rvv/{vdota4.ll => vdot4a.ll} | 160 +++++------ .../RISCV/rvv/{vdota4su.ll => vdot4asu.ll} | 160 +++++------ .../RISCV/rvv/{vdota4u.ll => vdot4au.ll} | 160 +++++------ .../RISCV/rvv/{vdota4us.ll => vdot4aus.ll} | 80 +++--- .../CodeGen/RISCV/rvv/zvdot4a8i-sdnode.ll | 144 +++++----- llvm/test/MC/RISCV/rvv/zvdot4a8i-invalid.s | 8 +- llvm/test/MC/RISCV/rvv/zvdot4a8i.s | 56 ++-- .../RISCV/partial-reduce-dot-product.ll | 48 ++-- 45 files changed, 2321 insertions(+), 2321 deletions(-) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4_vv.c => vdot4a_vv.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4_vx.c => vdot4a_vx.c} (73%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4su_vv.c => vdot4asu_vv.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4su_vx.c => vdot4asu_vx.c} (73%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4u_vv.c => vdot4au_vv.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4u_vx.c => vdot4au_vx.c} (73%) 
rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/{vdota4us_vx.c => vdot4aus_vx.c} (73%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4_vv.c => vdot4a_vv.c} (75%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4_vx.c => vdot4a_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4su_vv.c => vdot4asu_vv.c} (75%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4su_vx.c => vdot4asu_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4u_vv.c => vdot4au_vv.c} (75%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4u_vx.c => vdot4au_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/{vdota4us_vx.c => vdot4aus_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4_vv.c => vdot4a_vv.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4_vx.c => vdot4a_vx.c} (73%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4su_vv.c => vdot4asu_vv.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4su_vx.c => vdot4asu_vx.c} (73%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4u_vv.c => vdot4au_vv.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4u_vx.c => vdot4au_vx.c} (73%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/{vdota4us_vx.c => vdot4aus_vx.c} (73%) rename 
clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4_vv.c => vdot4a_vv.c} (75%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4_vx.c => vdot4a_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4su_vv.c => vdot4asu_vv.c} (75%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4su_vx.c => vdot4asu_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4u_vv.c => vdot4au_vv.c} (75%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4u_vx.c => vdot4au_vx.c} (74%) rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/{vdota4us_vx.c => vdot4aus_vx.c} (74%) rename llvm/test/CodeGen/RISCV/rvv/{vdota4.ll => vdot4a.ll} (62%) rename llvm/test/CodeGen/RISCV/rvv/{vdota4su.ll => vdot4asu.ll} (62%) rename llvm/test/CodeGen/RISCV/rvv/{vdota4u.ll => vdot4au.ll} (62%) rename llvm/test/CodeGen/RISCV/rvv/{vdota4us.ll => vdot4aus.ll} (62%) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index 06b13dd5725f8..386290f8b4ba1 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -2116,7 +2116,7 @@ let UnMaskedPolicyScheme = HasPolicyOperand, HasMasked = false in { } // Zvdot4a8i -multiclass RVVVDOTA4QBuiltinSet> suffixes_prototypes> { +multiclass RVVVDOT4AQBuiltinSet> suffixes_prototypes> { let UnMaskedPolicyScheme = HasPolicyOperand, HasMaskedOffOperand = false, OverloadedName = NAME, @@ -2128,16 +2128,16 @@ multiclass RVVVDOTA4QBuiltinSet> suffixes_prototypes> { // Only SEW=32 is defined for zvdot4a8i so far, and since inputs are in fact // four 8-bit integer bundles, we use unsigned type to represent all of them let RequiredFeatures = ["zvdot4a8i"] in { - defm vdota4 - : 
RVVVDOTA4QBuiltinSet<[["vv", "v", "vvUvUv"], + defm vdot4a + : RVVVDOT4AQBuiltinSet<[["vv", "v", "vvUvUv"], ["vx", "v", "vvUvUe"]]>; - defm vdota4u - : RVVVDOTA4QBuiltinSet<[["vv", "Uv", "UvUvUvUv"], + defm vdot4au + : RVVVDOT4AQBuiltinSet<[["vv", "Uv", "UvUvUvUv"], ["vx", "Uv", "UvUvUvUe"]]>; - defm vdota4su - : RVVVDOTA4QBuiltinSet<[["vv", "v", "vvUvUv"], + defm vdot4asu + : RVVVDOT4AQBuiltinSet<[["vv", "v", "vvUvUv"], ["vx", "v", "vvUvUe"]]>; - defm vdota4us : RVVVDOTA4QBuiltinSet<[["vx", "v", "vvUvUe"]]>; + defm vdot4aus : RVVVDOT4AQBuiltinSet<[["vx", "v", "vvUvUe"]]>; } // Zvzip diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4a_vv.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4a_vv.c index 22f9053ce4a18..d5c1a241dbd60 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4a_vv.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, +vint32mf2_t 
test_vdot4a_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32mf2(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32mf2(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vint32m1_t test_vdot4a_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m1(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m1(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vint32m2_t test_vdot4a_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m2(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m2(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], 
[[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vint32m4_t test_vdot4a_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m4(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m4(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vint32m8_t test_vdot4a_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m8(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m8(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 
[[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32mf2_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32mf2_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m1_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m1_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, 
vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m2_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m2_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m4_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m4_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m8_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m8_m(vm, vd, vs2, vs1, vl); } diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4a_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4a_vx.c index 2045577d58ca1..be62f7fb1c644 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4a_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, +vint32mf2_t test_vdot4a_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32mf2(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32mf2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4a_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m1(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m1(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4a_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m2(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4a_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return 
__riscv_vdota4_vx_i32m4(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m4(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4a_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m8(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m8(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32mf2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32mf2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_m( // CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m1_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m1_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m4_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m4_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m8_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m8_m(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4su_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4asu_vv.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4su_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4asu_vv.c index 21290663366ba..e710753329a86 100644 
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4su_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4asu_vv.c @@ -6,113 +6,113 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4asu_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32mf2(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32mf2(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vint32m1_t test_vdot4asu_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m1(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m1(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vint32m2_t test_vdot4asu_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m2(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m2(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vint32m4_t test_vdot4asu_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m4(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m4(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vint32m8_t test_vdot4asu_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m8(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m8(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32mf2_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32mf2_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, 
+vint32m1_t test_vdot4asu_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m1_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m1_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m2_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m2_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m4_m(vm, vd, vs2, vs1, vl); + return 
__riscv_vdot4asu_vv_i32m4_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m8_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m8_m(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4su_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4asu_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4su_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4asu_vx.c index fd0095f9eec4a..3f1eb899817d6 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4su_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4asu_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, +vint32mf2_t test_vdot4asu_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32mf2(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32mf2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4asu_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m1(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m1(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] 
// -vint32m2_t test_vdota4su_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4asu_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m2(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4asu_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m4(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m4(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4asu_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m8(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m8(vd, vs2, rs1, vl); } -// 
CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32mf2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32mf2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m1_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m1_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m4_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m4_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m8_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m8_m(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4u_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4au_vv.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4u_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4au_vv.c index e203e30252426..e91e9ded51131 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4u_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4au_vv.c @@ -6,113 +6,113 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t 
test_vdot4au_vv_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32mf2(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32mf2(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1(vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vuint32m1_t test_vdot4au_vv_u32m1(vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m1(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m1(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2(vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vuint32m2_t test_vdot4au_vv_u32m2(vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m2(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m2(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4( // 
CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4(vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vuint32m4_t test_vdot4au_vv_u32m4(vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m4(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m4(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8(vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vuint32m8_t test_vdot4au_vv_u32m8(vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m8(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m8(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32mf2_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32mf2_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t test_vdot4au_vv_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m1_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m1_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_m(vbool16_t vm, vuint32m2_t vd, 
vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vv_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m2_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m2_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vv_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m4_m(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m4_m(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vv_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m8_m(vm, vd, vs2, vs1, vl); + return 
__riscv_vdot4au_vv_u32m8_m(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4u_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4au_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4u_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4au_vx.c index 44ec0e6ba7edb..d9cc4670f8349 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4u_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4au_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vx_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32mf2(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32mf2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4u.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1(vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vuint32m1_t test_vdot4au_vx_u32m1(vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m1(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m1(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2(vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vuint32m2_t test_vdot4au_vx_u32m2(vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m2(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4(vuint32m4_t vd, vuint32m4_t 
vs2, uint32_t rs1, +vuint32m4_t test_vdot4au_vx_u32m4(vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m4(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m4(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8(vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vuint32m8_t test_vdot4au_vx_u32m8(vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m8(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m8(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32mf2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32mf2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: 
define dso_local @test_vdota4u_vx_u32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t test_vdot4au_vx_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m1_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m1_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vx_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 
noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vx_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m4_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m4_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vx_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m8_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m8_m(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4us_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4aus_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4us_vx.c 
rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4aus_vx.c index 064ff0c459ccf..c7792c32b5347 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdota4us_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/non-overloaded/vdot4aus_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4aus_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32mf2(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32mf2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4aus_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - 
return __riscv_vdota4us_vx_i32m1(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m1(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4aus_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m2(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m2(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4aus_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m4(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m4(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext 
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4aus_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m8(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m8(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32mf2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32mf2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4aus_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m1_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m1_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4aus_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m2_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m2_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret 
[[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4aus_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m4_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m4_m(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4aus_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m8_m(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m8_m(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4a_vv.c similarity index 75% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4a_vv.c index b3b41292c127d..c733e71512509 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4a_vv.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: 
define dso_local @test_vdota4_vv_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, +vint32mf2_t test_vdot4a_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4(vd, vs2, vs1, vl); + return __riscv_vdot4a(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vint32m1_t test_vdot4a_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4(vd, vs2, vs1, vl); + return __riscv_vdot4a(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vint32m2_t test_vdot4a_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4(vd, vs2, vs1, vl); + return __riscv_vdot4a(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vint32m4_t test_vdot4a_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4(vd, vs2, vs1, vl); + return __riscv_vdot4a(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vint32m8_t test_vdot4a_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4(vd, vs2, vs1, vl); + return __riscv_vdot4a(vd, vs2, 
vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], 
[[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4a_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4a_vx.c index a050941275d1a..c197e75cd422d 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4a_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, +vint32mf2_t test_vdot4a_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vd, vs2, rs1, vl); + return __riscv_vdot4a(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4a_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vd, vs2, rs1, vl); + return __riscv_vdot4a(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4a_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vd, vs2, rs1, vl); + return __riscv_vdot4a(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4a.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4a_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vd, vs2, rs1, vl); + return __riscv_vdot4a(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4a_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vd, vs2, rs1, vl); + return __riscv_vdot4a(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return 
__riscv_vdota4(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_m( // CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4su_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4asu_vv.c similarity index 75% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4su_vv.c rename to 
clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4asu_vv.c index 095c7254df378..e6f15424ce213 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4su_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4asu_vv.c @@ -6,113 +6,113 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4asu_vv_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, vs1, vl); + return __riscv_vdot4asu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vint32m1_t test_vdot4asu_vv_i32m1(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, vs1, vl); + return __riscv_vdot4asu(vd, 
vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vint32m2_t test_vdot4asu_vv_i32m2(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, vs1, vl); + return __riscv_vdot4asu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vint32m4_t test_vdot4asu_vv_i32m4(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, vs1, vl); + return __riscv_vdot4asu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], 
[[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vint32m8_t test_vdot4asu_vv_i32m8(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, vs1, vl); + return __riscv_vdot4asu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, 
vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, vs1, vl); } -// 
CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4su_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4asu_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4su_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4asu_vx.c index a96cd01b3737e..93fb2741dac2a 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4su_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4asu_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 
[[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, +vint32mf2_t test_vdot4asu_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, rs1, vl); + return __riscv_vdot4asu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4asu_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, rs1, vl); + return __riscv_vdot4asu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4asu_vx_i32m2(vint32m2_t vd, 
vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, rs1, vl); + return __riscv_vdot4asu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4asu_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, rs1, vl); + return __riscv_vdot4asu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4asu_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vd, vs2, rs1, vl); + return __riscv_vdot4asu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef 
signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 
[[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t 
test_vdota4su_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4u_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4au_vv.c similarity index 75% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4u_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4au_vv.c index ccaa68c5bb298..c3fe9aa77ee9c 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4u_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4au_vv.c @@ -6,113 +6,113 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vv_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, vs1, vl); + return __riscv_vdot4au(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], 
[[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1(vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vuint32m1_t test_vdot4au_vv_u32m1(vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, vs1, vl); + return __riscv_vdot4au(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2(vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vuint32m2_t test_vdot4au_vv_u32m2(vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, vs1, vl); + return __riscv_vdot4au(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // 
-vuint32m4_t test_vdota4u_vv_u32m4(vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vuint32m4_t test_vdot4au_vv_u32m4(vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, vs1, vl); + return __riscv_vdot4au(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8(vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vuint32m8_t test_vdot4au_vv_u32m8(vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, vs1, vl); + return __riscv_vdot4au(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local 
@test_vdota4u_vv_u32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t test_vdot4au_vv_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vv_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vv_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vv_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4u_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4au_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4u_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4au_vx.c index a6ba47aec9c29..643002fa4e067 100644 --- 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4u_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4au_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vx_u32mf2(vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, rs1, vl); + return __riscv_vdot4au(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1(vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vuint32m1_t test_vdot4au_vx_u32m1(vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, rs1, vl); + return __riscv_vdot4au(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2( +// CHECK-RV64-LABEL: define dso_local 
@test_vdot4au_vx_u32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2(vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vuint32m2_t test_vdot4au_vx_u32m2(vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, rs1, vl); + return __riscv_vdot4au(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4(vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vuint32m4_t test_vdot4au_vx_u32m4(vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, rs1, vl); + return __riscv_vdot4au(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4au.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8(vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vuint32m8_t test_vdot4au_vx_u32m8(vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vd, vs2, rs1, vl); + return __riscv_vdot4au(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_m(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t 
test_vdot4au_vx_u32m1_m(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vx_u32m2_m(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vx_u32m4_m(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au(vm, vd, vs2, rs1, vl); } -// 
CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vx_u32m8_m(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4us_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4aus_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4us_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4aus_vx.c index 4c87bc2372a69..067a4bea18212 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdota4us_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/non-policy/overloaded/vdot4aus_vx.c @@ -6,112 +6,112 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv1i32.i32.i64( [[VD]], 
[[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4aus_vx_i32mf2(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vd, vs2, rs1, vl); + return __riscv_vdot4aus(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4aus_vx_i32m1(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vd, vs2, rs1, vl); + return __riscv_vdot4aus(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4aus_vx_i32m2(vint32m2_t vd, vuint32m2_t 
vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vd, vs2, rs1, vl); + return __riscv_vdot4aus(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4aus_vx_i32m4(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vd, vs2, rs1, vl); + return __riscv_vdot4aus(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4aus_vx_i32m8(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vd, vs2, rs1, vl); + return __riscv_vdot4aus(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext 
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_m(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4aus_vx_i32m1_m(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 
3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4aus_vx_i32m2_m(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4aus_vx_i32m4_m(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_m( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_m( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t 
test_vdota4us_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4aus_vx_i32m8_m(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4a_vv.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4a_vv.c index 753087c6ca811..4bdb1179fa373 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4a_vv.c @@ -6,224 +6,224 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32mf2_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32mf2_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_tu( // CHECK-RV64-SAME: 
[[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vint32m1_t test_vdot4a_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m1_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m1_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vint32m2_t test_vdot4a_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m2_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m2_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], 
[[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vint32m4_t test_vdot4a_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m4_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vint32m8_t test_vdot4a_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m8_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m8_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return 
__riscv_vdota4_vv_i32mf2_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32mf2_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m1_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m1_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m2_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m2_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_tum( +// CHECK-RV64-LABEL: 
define dso_local @test_vdot4a_vv_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m4_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m8_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m8_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32mf2_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32mf2_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m1_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m1_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m2_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m2_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m4_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_tumu(vbool4_t vm, 
vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m8_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m8_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32mf2_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32mf2_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m1_mu(vm, vd, vs2, vs1, vl); + return 
__riscv_vdot4a_vv_i32m1_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m2_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m2_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m4_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_vv_i32m8_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_vv_i32m8_mu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4a_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4a_vx.c index 3d9e954e1d067..fabf45616d570 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4a_vx.c @@ -6,222 +6,222 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // 
CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32mf2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32mf2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4a_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m1_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m1_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4a_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m2_tu(vd, 
vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4a_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m4_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4a_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m8_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m8_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32mf2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32mf2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m1_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m1_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m4_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t 
test_vdota4_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m8_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m8_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32mf2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32mf2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, 
size_t vl) { - return __riscv_vdota4_vx_i32m1_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m1_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m4_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: 
define dso_local @test_vdota4_vx_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m8_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m8_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32mf2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32mf2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 
noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m1_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m1_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( 
[[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m4_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_vx_i32m8_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_vx_i32m8_mu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4su_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4asu_vv.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4su_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4asu_vv.c index 9d643edf7b7ed..36e3185d032bc 100644 --- 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4su_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4asu_vv.c @@ -6,227 +6,227 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4asu_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32mf2_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32mf2_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m1_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m1_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_tu( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m2_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m2_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m4_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m8_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m8_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32mf2_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32mf2_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_tum(vbool32_t vm, 
vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m1_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m1_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m2_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m2_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return 
__riscv_vdota4su_vv_i32m4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m4_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m8_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m8_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32mf2_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32mf2_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local 
@test_vdota4su_vv_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4asu_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m1_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m1_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4asu_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m2_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m2_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m4_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m8_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m8_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], 
i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32mf2_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32mf2_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m1_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m1_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret 
[[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m2_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m2_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m4_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, 
vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_vv_i32m8_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_vv_i32m8_mu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4su_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4asu_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4su_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4asu_vx.c index 74f5cfc73cab7..e437161ff1207 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4su_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4asu_vx.c @@ -6,224 +6,224 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4asu_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32mf2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32mf2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4asu_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m1_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m1_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4asu_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.i32.i64( [[VD]], 
[[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4asu_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m4_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4asu_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m8_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m8_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, 
vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32mf2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32mf2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m1_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m1_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m2_tum(vm, vd, vs2, rs1, vl); + return 
__riscv_vdot4asu_vx_i32m2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m4_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m8_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m8_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_tumu( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32mf2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32mf2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4asu_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m1_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m1_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext 
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4asu_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m4_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m8_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m8_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32mf2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32mf2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m1_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m1_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t 
test_vdota4su_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m4_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_vx_i32m8_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_vx_i32m8_mu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4u_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4au_vv.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4u_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4au_vv.c index 59177c74ba70d..d018c57305618 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4u_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4au_vv.c @@ -6,233 +6,233 @@ #include -// CHECK-RV64-LABEL: define 
dso_local @test_vdota4u_vv_u32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vv_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32mf2_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32mf2_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t test_vdot4au_vv_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m1_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m1_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.nxv4i32.i64( [[VD]], 
[[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vv_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m2_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m2_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vv_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m4_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vv_u32m8_tu(vuint32m8_t vd, 
vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m8_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m8_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32mf2_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32mf2_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_tum(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vv_u32m1_tum(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m1_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m1_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local 
@test_vdota4u_vv_u32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_tum(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vv_u32m2_tum(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m2_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m2_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_tum(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vv_u32m4_tum(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m4_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_tum(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vv_u32m8_tum(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m8_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m8_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32mf2_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32mf2_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vv_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m1_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m1_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vv_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m2_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m2_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t 
test_vdota4u_vv_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vv_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m4_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vv_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m8_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m8_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return 
__riscv_vdota4u_vv_u32mf2_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32mf2_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_mu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vv_u32m1_mu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m1_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m1_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_mu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vv_u32m2_mu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m2_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m2_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_mu( +// CHECK-RV64-LABEL: define dso_local 
@test_vdot4au_vv_u32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vv_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m4_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vv_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_vv_u32m8_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_vv_u32m8_mu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4u_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4au_vx.c similarity index 73% rename from 
clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4u_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4au_vx.c index 9068dcb39218d..d33ab1b6585b5 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4u_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4au_vx.c @@ -6,225 +6,225 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vx_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32mf2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32mf2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, 
+vuint32m1_t test_vdot4au_vx_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m1_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m1_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vx_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vx_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m4_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_tu( +// CHECK-RV64-LABEL: 
define dso_local @test_vdot4au_vx_u32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vx_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m8_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m8_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32mf2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32mf2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_tum(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vx_u32m1_tum(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m1_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m1_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_tum(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vx_u32m2_tum(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_tum(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vx_u32m4_tum(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m4_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_tum(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vx_u32m8_tum(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m8_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m8_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t 
test_vdota4u_vx_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32mf2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32mf2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vx_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m1_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m1_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vx_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, 
size_t vl) { - return __riscv_vdota4u_vx_u32m2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vx_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m4_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vx_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m8_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m8_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define 
dso_local @test_vdota4u_vx_u32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32mf2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32mf2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_mu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vx_u32m1_mu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m1_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m1_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext 
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_mu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vx_u32m2_mu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vx_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m4_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], 
[[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vx_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_vx_u32m8_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_vx_u32m8_mu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4us_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4aus_vx.c similarity index 73% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4us_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4aus_vx.c index 01da3b8e04d67..56c58cb790352 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdota4us_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/non-overloaded/vdot4aus_vx.c @@ -6,225 +6,225 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4aus_vx_i32mf2_tu(vint32mf2_t vd, 
vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32mf2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32mf2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4aus_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m1_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m1_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4aus_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m2_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m2_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local 
@test_vdot4aus_vx_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4aus_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m4_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4aus_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m8_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m8_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32mf2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32mf2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4aus_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m1_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m1_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4aus_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m2_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m2_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4aus_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m4_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t 
test_vdota4us_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4aus_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m8_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m8_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32mf2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32mf2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4aus_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t 
vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m1_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m1_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4aus_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m2_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m2_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, +vint32m4_t test_vdot4aus_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m4_tumu(vm, vd, vs2, rs1, vl); } 
-// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, +vint32m8_t test_vdot4aus_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m8_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m8_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32mf2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32mf2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_mu( // CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4aus_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m1_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m1_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4aus_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m2_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m2_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4aus_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m4_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4aus_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_vx_i32m8_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_vx_i32m8_mu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4a_vv.c similarity index 75% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4a_vv.c index 
fa14df5d58c95..06e15986b5f9d 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4a_vv.c @@ -6,224 +6,224 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, +vint32m1_t test_vdot4a_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_tu( +// CHECK-RV64-LABEL: define 
dso_local @test_vdot4a_vv_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, +vint32m2_t test_vdot4a_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, +vint32m4_t test_vdot4a_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4a.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, +vint32m8_t test_vdot4a_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, vs1, vl); + return __riscv_vdot4a_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_tum(vbool32_t vm, vint32m1_t 
vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_tum( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 
[[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t 
vl) { - return __riscv_vdota4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m4_mu( // 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vv_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vv_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4a_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4_vx.c rename to 
clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4a_vx.c index b78992c17e909..bde5c14377a60 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4a_vx.c @@ -6,222 +6,222 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4a_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4a_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, rs1, vl); + return 
__riscv_vdot4a_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4a_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4a_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vdota4.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4a_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tu(vd, vs2, rs1, vl); + return __riscv_vdot4a_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // 
CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t 
rs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_tumu( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4a_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vdot4a.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4a_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4a_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, 
+vint32m2_t test_vdot4a_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4a_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4_vx_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4a_vx_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4a.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4a_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4a_mu(vm, vd, vs2, rs1, vl); } diff 
--git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4su_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4asu_vv.c similarity index 75% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4su_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4asu_vv.c index effe5a58c0efa..af485557b88b4 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4su_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4asu_vv.c @@ -6,227 +6,227 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4asu_vv_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4asu.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, vs1, vl); + return 
__riscv_vdot4asu_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, vs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t 
vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4asu_vv_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4asu_vv_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, vs1, 
vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
[[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vv_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vv_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 
1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vv_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vv_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vv_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vv_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t 
test_vdota4su_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vv_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4su_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4asu_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4su_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4asu_vx.c index 6cb2462283e9e..2323083af8c7f 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4su_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4asu_vx.c @@ -6,224 +6,224 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4asu_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_tu( // 
CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4asu_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4asu_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vdot4asu.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4asu_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4asu_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tu(vd, vs2, rs1, vl); + return __riscv_vdot4asu_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_tum(vbool64_t vm, 
vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, rs1, vl); } -// 
CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4asu_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4asu_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4su_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4asu_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4su_vx_i32m1_mu(vbool32_t 
vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4asu_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4su_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4asu_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4su_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4asu_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, 
vs2, rs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4su_vx_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4asu_vx_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4su.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4su_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4asu_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4su_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4asu_mu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4u_vv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4au_vv.c similarity index 75% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4u_vv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4au_vv.c index 26f9131e61ce7..5f9b75210a740 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4u_vv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4au_vv.c @@ -6,233 +6,233 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4u.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vv_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t test_vdot4au_vv_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t 
test_vdot4au_vv_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vv_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vv_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, vs1, vl); + return __riscv_vdot4au_tu(vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
[[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_tum(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vv_u32m1_tum(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_tum(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vv_u32m2_tum(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_tum(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vv_u32m4_tum(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_tum(vbool4_t vm, vuint32m8_t vd, 
+vuint32m8_t test_vdot4au_vv_u32m8_tum(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vv_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, vs1, vl); } -// 
CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vv_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vv_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vv_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vv_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vv_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, vuint32mf2_t vs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vv_u32m1_mu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vv_u32m1_mu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, vuint32m1_t vs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vv_u32m2_mu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vv_u32m2_mu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, vuint32m2_t vs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vv_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t 
test_vdot4au_vv_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, vuint32m4_t vs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, vs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vv_u32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vv_u32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vv_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vv_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, vuint32m8_t vs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, vs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, vs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4u_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4au_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4u_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4au_vx.c index 2ecc45e61c476..2895be49178ed 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4u_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4au_vx.c @@ -6,225 +6,225 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef 
signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, +vuint32mf2_t test_vdot4au_vx_u32mf2_tu(vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, +vuint32m1_t test_vdot4au_vx_u32m1_tu(vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 
[[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, +vuint32m2_t test_vdot4au_vx_u32m2_tu(vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vx_u32m4_tu(vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vx_u32m8_tu(vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tu(vd, vs2, rs1, vl); + return __riscv_vdot4au_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: 
define dso_local @test_vdota4u_vx_u32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_tum(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_tum(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vx_u32m1_tum(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_tum(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vx_u32m2_tum(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_tum(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vx_u32m4_tum(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_tum(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vx_u32m8_tum(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_tumu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t 
test_vdota4u_vx_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vx_u32m1_tumu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vx_u32m2_tumu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, +vuint32m4_t test_vdot4au_vx_u32m4_tumu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, 
vd, vs2, rs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, +vuint32m8_t test_vdot4au_vx_u32m8_tumu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32mf2_t test_vdota4u_vx_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, +vuint32mf2_t test_vdot4au_vx_u32mf2_mu(vbool64_t vm, vuint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m1_mu( 
// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m1_t test_vdota4u_vx_u32m1_mu(vbool32_t vm, vuint32m1_t vd, +vuint32m1_t test_vdot4au_vx_u32m1_mu(vbool32_t vm, vuint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m2_t test_vdota4u_vx_u32m2_mu(vbool16_t vm, vuint32m2_t vd, +vuint32m2_t test_vdot4au_vx_u32m2_mu(vbool16_t vm, vuint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vdota4u.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m4_t test_vdota4u_vx_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, +vuint32m4_t test_vdot4au_vx_u32m4_mu(vbool8_t vm, vuint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4u_vx_u32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4au_vx_u32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4u.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4au.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vuint32m8_t test_vdota4u_vx_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, +vuint32m8_t test_vdot4au_vx_u32m8_mu(vbool4_t vm, vuint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4u_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4au_mu(vm, vd, vs2, rs1, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4us_vx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4aus_vx.c similarity index 74% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4us_vx.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4aus_vx.c index 380962ff3499d..d3cb25269d7c5 100644 --- 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdota4us_vx.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvdot4a8i/policy/overloaded/vdot4aus_vx.c @@ -6,225 +6,225 @@ #include -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, +vint32mf2_t test_vdot4aus_vx_i32mf2_tu(vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, +vint32m1_t test_vdot4aus_vx_i32m1_tu(vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_tu( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, +vint32m2_t test_vdot4aus_vx_i32m2_tu(vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, +vint32m4_t test_vdot4aus_vx_i32m4_tu(vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_tu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_tu( // CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 
[[RS1]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, +vint32m8_t test_vdot4aus_vx_i32m8_tu(vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tu(vd, vs2, rs1, vl); + return __riscv_vdot4aus_tu(vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_tum(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t 
test_vdota4us_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4aus_vx_i32m1_tum(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4aus_vx_i32m2_tum(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4aus_vx_i32m4_tum(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return 
__riscv_vdota4us_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_tum( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_tum( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4aus_vx_i32m8_tum(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tum(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tum(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_tumu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_tumu( +// 
CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, +vint32m1_t test_vdot4aus_vx_i32m1_tumu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, +vint32m2_t test_vdot4aus_vx_i32m2_tumu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, +vint32m4_t test_vdot4aus_vx_i32m4_tumu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_tumu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_tumu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, +vint32m8_t test_vdot4aus_vx_i32m8_tumu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_tumu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_tumu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32mf2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32mf2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vdot4aus.mask.nxv1i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32mf2_t test_vdota4us_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, +vint32mf2_t test_vdot4aus_vx_i32mf2_mu(vbool64_t vm, vint32mf2_t vd, vuint32mf2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m1_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m1_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m1_t test_vdota4us_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, +vint32m1_t test_vdot4aus_vx_i32m1_mu(vbool32_t vm, vint32m1_t vd, vuint32m1_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m2_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m2_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m2_t test_vdota4us_vx_i32m2_mu(vbool16_t vm, 
vint32m2_t vd, vuint32m2_t vs2, +vint32m2_t test_vdot4aus_vx_i32m2_mu(vbool16_t vm, vint32m2_t vd, vuint32m2_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m4_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m4_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m4_t test_vdota4us_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, +vint32m4_t test_vdot4aus_vx_i32m4_mu(vbool8_t vm, vint32m4_t vd, vuint32m4_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_mu(vm, vd, vs2, rs1, vl); + return __riscv_vdot4aus_mu(vm, vd, vs2, rs1, vl); } -// CHECK-RV64-LABEL: define dso_local @test_vdota4us_vx_i32m8_mu( +// CHECK-RV64-LABEL: define dso_local @test_vdot4aus_vx_i32m8_mu( // CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i32 noundef signext [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdota4us.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vdot4aus.mask.nxv16i32.i32.i64( [[VD]], [[VS2]], i32 [[RS1]], [[VM]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // -vint32m8_t test_vdota4us_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, +vint32m8_t test_vdot4aus_vx_i32m8_mu(vbool4_t vm, vint32m8_t vd, vuint32m8_t vs2, uint32_t rs1, size_t vl) { - return __riscv_vdota4us_mu(vm, vd, vs2, 
rs1, vl); + return __riscv_vdot4aus_mu(vm, vd, vs2, rs1, vl); } diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index caf8fa6f9be81..f53f752c25c30 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -2003,7 +2003,7 @@ let TargetPrefix = "riscv" in { // We use llvm_anyvector_ty and llvm_anyint_ty for future extensibility // purpose but only EEW=32 is defined for now // Input: (vector_in, vector_in, vector_in/scalar_in, vl, policy) - class RISCVVDOTA4UnMasked + class RISCVVDOT4AUnMasked : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, !if(HasVV, llvm_any_ty, llvm_anyint_ty), @@ -2014,7 +2014,7 @@ let TargetPrefix = "riscv" in { let VLOperand = 3; } // Input: (vector_in, vector_in, vector_in/scalar_in, mask, vl, policy) - class RISCVVDOTA4Masked + class RISCVVDOT4AMasked : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, !if(HasVV, llvm_any_ty, llvm_anyint_ty), @@ -2026,15 +2026,15 @@ let TargetPrefix = "riscv" in { let VLOperand = 4; } - multiclass RISCVVDOTA4 { - def "int_riscv_" # NAME : RISCVVDOTA4UnMasked; - def "int_riscv_" # NAME # "_mask" : RISCVVDOTA4Masked; + multiclass RISCVVDOT4A { + def "int_riscv_" # NAME : RISCVVDOT4AUnMasked; + def "int_riscv_" # NAME # "_mask" : RISCVVDOT4AMasked; } - defm vdota4 : RISCVVDOTA4; - defm vdota4u : RISCVVDOTA4; - defm vdota4su : RISCVVDOTA4; - defm vdota4us : RISCVVDOTA4; + defm vdot4a : RISCVVDOT4A; + defm vdot4au : RISCVVDOT4A; + defm vdot4asu : RISCVVDOT4A; + defm vdot4aus : RISCVVDOT4A; } // TargetPrefix = "riscv" diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 93e820b4713ec..b3258898452f9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9182,7 +9182,7 @@ SDValue RISCVTargetLowering::lowerADJUST_TRAMPOLINE(SDValue Op, SDValue 
RISCVTargetLowering::lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const { - // Currently, only the vdota4 and vdota4u case (from zvdot4a8i) should be + // Currently, only the vdot4a and vdot4au case (from zvdot4a8i) should be // legal. // TODO: There are many other sub-cases we could potentially lower, are // any of them worthwhile? Ex: via vredsum, vwredsum, vwwmaccu, etc.. @@ -9215,13 +9215,13 @@ SDValue RISCVTargetLowering::lowerPARTIAL_REDUCE_MLA(SDValue Op, unsigned Opc; switch (Op.getOpcode()) { case ISD::PARTIAL_REDUCE_SMLA: - Opc = RISCVISD::VDOTA4_VL; + Opc = RISCVISD::VDOT4A_VL; break; case ISD::PARTIAL_REDUCE_UMLA: - Opc = RISCVISD::VDOTA4U_VL; + Opc = RISCVISD::VDOT4AU_VL; break; case ISD::PARTIAL_REDUCE_SUMLA: - Opc = RISCVISD::VDOTA4SU_VL; + Opc = RISCVISD::VDOT4ASU_VL; break; default: llvm_unreachable("Unexpected opcode"); @@ -19984,7 +19984,7 @@ static SDValue getZeroPaddedAdd(const SDLoc &DL, SDValue A, SDValue B, return DAG.getInsertSubvector(DL, B, Res, 0); } -static SDValue foldReduceOperandViaVDOTA4(SDValue InVec, const SDLoc &DL, +static SDValue foldReduceOperandViaVDOT4A(SDValue InVec, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const RISCVTargetLowering &TLI) { @@ -20000,8 +20000,8 @@ static SDValue foldReduceOperandViaVDOTA4(SDValue InVec, const SDLoc &DL, // form). 
SDValue A, B; if (sd_match(InVec, m_AddLike(m_Value(A), m_Value(B)))) { - SDValue AOpt = foldReduceOperandViaVDOTA4(A, DL, DAG, Subtarget, TLI); - SDValue BOpt = foldReduceOperandViaVDOTA4(B, DL, DAG, Subtarget, TLI); + SDValue AOpt = foldReduceOperandViaVDOT4A(A, DL, DAG, Subtarget, TLI); + SDValue BOpt = foldReduceOperandViaVDOT4A(B, DL, DAG, Subtarget, TLI); if (AOpt || BOpt) { if (AOpt) A = AOpt; @@ -20079,7 +20079,7 @@ static SDValue performVECREDUCECombine(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue InVec = N->getOperand(0); - if (SDValue V = foldReduceOperandViaVDOTA4(InVec, DL, DAG, Subtarget, TLI)) + if (SDValue V = foldReduceOperandViaVDOT4A(InVec, DL, DAG, Subtarget, TLI)) return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, V); return SDValue(); } @@ -20407,7 +20407,7 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opc, DL, VT, Ops); } -static SDValue combineVdota4Accum(SDNode *N, SelectionDAG &DAG, +static SDValue combineVdot4aAccum(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD); @@ -20424,21 +20424,21 @@ static SDValue combineVdota4Accum(SDNode *N, SelectionDAG &DAG, return SDValue(); } - auto IsVdota4Opc = [](unsigned Opc) { + auto IsVdot4aOpc = [](unsigned Opc) { switch (Opc) { - case RISCVISD::VDOTA4_VL: - case RISCVISD::VDOTA4U_VL: - case RISCVISD::VDOTA4SU_VL: + case RISCVISD::VDOT4A_VL: + case RISCVISD::VDOT4AU_VL: + case RISCVISD::VDOT4ASU_VL: return true; default: return false; } }; - if (!IsVdota4Opc(DotOp.getOpcode())) + if (!IsVdot4aOpc(DotOp.getOpcode())) std::swap(Addend, DotOp); - if (!IsVdota4Opc(DotOp.getOpcode())) + if (!IsVdot4aOpc(DotOp.getOpcode())) return SDValue(); auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG, @@ -21274,7 +21274,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return V; if (SDValue V = combineToVWMACC(N, DAG, Subtarget)) return V; - if 
(SDValue V = combineVdota4Accum(N, DAG, Subtarget)) + if (SDValue V = combineVdot4aAccum(N, DAG, Subtarget)) return V; return performADDCombine(N, DCI, Subtarget); } @@ -21848,7 +21848,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return V; if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget)) return V; - if (SDValue V = combineVdota4Accum(N, DAG, Subtarget)) + if (SDValue V = combineVdot4aAccum(N, DAG, Subtarget)) return V; return combineToVWMACC(N, DAG, Subtarget); case RISCVISD::VWADDU_VL: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 93512842712df..f2cfee8477883 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4193,16 +4193,16 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, case CASE_RVV_OPCODE(VAADD_VV): case CASE_RVV_OPCODE(VAADDU_VV): case CASE_RVV_OPCODE(VSMUL_VV): - case CASE_RVV_OPCODE_LMUL(VDOTA4_VV, MF2): - case CASE_RVV_OPCODE_LMUL(VDOTA4_VV, M1): - case CASE_RVV_OPCODE_LMUL(VDOTA4_VV, M2): - case CASE_RVV_OPCODE_LMUL(VDOTA4_VV, M4): - case CASE_RVV_OPCODE_LMUL(VDOTA4_VV, M8): - case CASE_RVV_OPCODE_LMUL(VDOTA4U_VV, MF2): - case CASE_RVV_OPCODE_LMUL(VDOTA4U_VV, M1): - case CASE_RVV_OPCODE_LMUL(VDOTA4U_VV, M2): - case CASE_RVV_OPCODE_LMUL(VDOTA4U_VV, M4): - case CASE_RVV_OPCODE_LMUL(VDOTA4U_VV, M8): + case CASE_RVV_OPCODE_LMUL(VDOT4A_VV, MF2): + case CASE_RVV_OPCODE_LMUL(VDOT4A_VV, M1): + case CASE_RVV_OPCODE_LMUL(VDOT4A_VV, M2): + case CASE_RVV_OPCODE_LMUL(VDOT4A_VV, M4): + case CASE_RVV_OPCODE_LMUL(VDOT4A_VV, M8): + case CASE_RVV_OPCODE_LMUL(VDOT4AU_VV, MF2): + case CASE_RVV_OPCODE_LMUL(VDOT4AU_VV, M1): + case CASE_RVV_OPCODE_LMUL(VDOT4AU_VV, M2): + case CASE_RVV_OPCODE_LMUL(VDOT4AU_VV, M4): + case CASE_RVV_OPCODE_LMUL(VDOT4AU_VV, M8): // Operands 2 and 3 are commutable. 
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); case CASE_VFMA_SPLATS(FMADD): diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvdot4a8i.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvdot4a8i.td index d8c60dc9a584c..35a98ad7eb15a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvdot4a8i.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvdot4a8i.td @@ -17,7 +17,7 @@ // Instructions //===----------------------------------------------------------------------===// -class VDOTA4VV funct6, RISCVVFormat opv, string opcodestr> +class VDOT4AVV funct6, RISCVVFormat opv, string opcodestr> : RVInstVV { @@ -27,7 +27,7 @@ class VDOTA4VV funct6, RISCVVFormat opv, string opcodestr> let Constraints = "$vd = $vd_wb"; } -class VDOTA4VX funct6, RISCVVFormat opv, string opcodestr> +class VDOT4AVX funct6, RISCVVFormat opv, string opcodestr> : RVInstVX { @@ -38,13 +38,13 @@ class VDOTA4VX funct6, RISCVVFormat opv, string opcodestr> } let Predicates = [HasStdExtZvdot4a8i] in { - def VDOTA4_VV : VDOTA4VV<0b101100, OPMVV, "vdota4.vv">; - def VDOTA4_VX : VDOTA4VX<0b101100, OPMVX, "vdota4.vx">; - def VDOTA4U_VV : VDOTA4VV<0b101000, OPMVV, "vdota4u.vv">; - def VDOTA4U_VX : VDOTA4VX<0b101000, OPMVX, "vdota4u.vx">; - def VDOTA4SU_VV : VDOTA4VV<0b101010, OPMVV, "vdota4su.vv">; - def VDOTA4SU_VX : VDOTA4VX<0b101010, OPMVX, "vdota4su.vx">; - def VDOTA4US_VX : VDOTA4VX<0b101110, OPMVX, "vdota4us.vx">; + def VDOT4A_VV : VDOT4AVV<0b101100, OPMVV, "vdot4a.vv">; + def VDOT4A_VX : VDOT4AVX<0b101100, OPMVX, "vdot4a.vx">; + def VDOT4AU_VV : VDOT4AVV<0b101000, OPMVV, "vdot4au.vv">; + def VDOT4AU_VX : VDOT4AVX<0b101000, OPMVX, "vdot4au.vx">; + def VDOT4ASU_VV : VDOT4AVV<0b101010, OPMVV, "vdot4asu.vv">; + def VDOT4ASU_VX : VDOT4AVX<0b101010, OPMVX, "vdot4asu.vx">; + def VDOT4AUS_VX : VDOT4AVX<0b101110, OPMVX, "vdot4aus.vx">; } // Predicates = [HasStdExtZvdot4a8i] //===----------------------------------------------------------------------===// @@ -52,16 +52,16 @@ let Predicates = [HasStdExtZvdot4a8i] in { 
//===----------------------------------------------------------------------===// let HasPassthruOp = true, HasMaskOp = true in { - def riscv_vdota4_vl : RVSDNode<"VDOTA4_VL", SDT_RISCVIntBinOp_VL>; - def riscv_vdota4u_vl : RVSDNode<"VDOTA4U_VL", SDT_RISCVIntBinOp_VL>; - def riscv_vdota4su_vl : RVSDNode<"VDOTA4SU_VL", SDT_RISCVIntBinOp_VL>; + def riscv_vdot4a_vl : RVSDNode<"VDOT4A_VL", SDT_RISCVIntBinOp_VL>; + def riscv_vdot4au_vl : RVSDNode<"VDOT4AU_VL", SDT_RISCVIntBinOp_VL>; + def riscv_vdot4asu_vl : RVSDNode<"VDOT4ASU_VL", SDT_RISCVIntBinOp_VL>; } // let HasPassthruOp = true, HasMaskOp = true //===----------------------------------------------------------------------===// // Pseudo Instructions for CodeGen //===----------------------------------------------------------------------===// -multiclass VPseudoVDOTA4_VV_VX { +multiclass VPseudoVDOT4A_VV_VX { foreach m = MxSet<32>.m in { defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMulAddV", "ReadVIMulAddV", "ReadVIMulAddV", m.MX, @@ -74,12 +74,12 @@ multiclass VPseudoVDOTA4_VV_VX { let Predicates = [HasStdExtZvdot4a8i], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - defm PseudoVDOTA4 : VPseudoVDOTA4_VV_VX; - defm PseudoVDOTA4U : VPseudoVDOTA4_VV_VX; - defm PseudoVDOTA4SU : VPseudoVDOTA4_VV_VX; - // VDOTA4US does not have a VV variant + defm PseudoVDOT4A : VPseudoVDOT4A_VV_VX; + defm PseudoVDOT4AU : VPseudoVDOT4A_VV_VX; + defm PseudoVDOT4ASU : VPseudoVDOT4A_VV_VX; + // VDOT4AUS does not have a VV variant foreach m = MxListVF4 in { - defm "PseudoVDOTA4US_VX" : VPseudoTernaryWithPolicy; + defm "PseudoVDOT4AUS_VX" : VPseudoTernaryWithPolicy; } } @@ -88,11 +88,11 @@ let Predicates = [HasStdExtZvdot4a8i], mayLoad = 0, mayStore = 0, //===----------------------------------------------------------------------===// defvar AllE32Vectors = [VI32MF2, VI32M1, VI32M2, VI32M4, VI32M8]; -defm : VPatBinaryVL_VV_VX; -defm : VPatBinaryVL_VV_VX; -defm : VPatBinaryVL_VV_VX; +defm : VPatBinaryVL_VV_VX; +defm : VPatBinaryVL_VV_VX; 
+defm : VPatBinaryVL_VV_VX; -// These VPat definitions are for vdota4 because they have a different operand +// These VPat definitions are for vdot4a because they have a different operand // order with other ternary instructions (i.e. vop.vx vd, vs2, rs1) multiclass VPatTernaryV_VX_AAAX vtilist, @@ -126,7 +126,7 @@ multiclass VPatTernaryV_VV_VX_AAAX, VPatTernaryV_VX_AAAX; -defm : VPatTernaryV_VV_VX_AAAX<"int_riscv_vdota4", "PseudoVDOTA4", AllE32Vectors, [HasStdExtZvdot4a8i]>; -defm : VPatTernaryV_VV_VX_AAAX<"int_riscv_vdota4u", "PseudoVDOTA4U", AllE32Vectors, [HasStdExtZvdot4a8i]>; -defm : VPatTernaryV_VV_VX_AAAX<"int_riscv_vdota4su", "PseudoVDOTA4SU", AllE32Vectors, [HasStdExtZvdot4a8i]>; -defm : VPatTernaryV_VX_AAAX<"int_riscv_vdota4us", "PseudoVDOTA4US", AllE32Vectors, [HasStdExtZvdot4a8i]>; +defm : VPatTernaryV_VV_VX_AAAX<"int_riscv_vdot4a", "PseudoVDOT4A", AllE32Vectors, [HasStdExtZvdot4a8i]>; +defm : VPatTernaryV_VV_VX_AAAX<"int_riscv_vdot4au", "PseudoVDOT4AU", AllE32Vectors, [HasStdExtZvdot4a8i]>; +defm : VPatTernaryV_VV_VX_AAAX<"int_riscv_vdot4asu", "PseudoVDOT4ASU", AllE32Vectors, [HasStdExtZvdot4a8i]>; +defm : VPatTernaryV_VX_AAAX<"int_riscv_vdot4aus", "PseudoVDOT4AUS", AllE32Vectors, [HasStdExtZvdot4a8i]>; diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp index 2315a7802f7c5..fbb102caecdc6 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp @@ -35,9 +35,9 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, assert(N->getOperand(2).getOpcode() == ISD::TargetConstant && "Expected index to be a target constant!"); break; - case RISCVISD::VDOTA4_VL: - case RISCVISD::VDOTA4U_VL: - case RISCVISD::VDOTA4SU_VL: { + case RISCVISD::VDOT4A_VL: + case RISCVISD::VDOT4AU_VL: + case RISCVISD::VDOT4ASU_VL: { EVT VT = N->getValueType(0); assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i32 && "Expected result to 
be an i32 scalable vector"); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index ca82f3e1a147b..3d2a70a826dd1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -358,9 +358,9 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost( Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4)); std::pair LT = getTypeLegalizationCost(Tp); - // Note: Asuming all vdota4* variants are equal cost + // Note: Asuming all vdot4a* variants are equal cost return LT.first * - getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind); + getRISCVInstructionCost(RISCV::VDOT4A_VV, LT.second, CostKind); } bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { diff --git a/llvm/test/CodeGen/RISCV/rvv/commutable-zvdot4a8i.ll b/llvm/test/CodeGen/RISCV/rvv/commutable-zvdot4a8i.ll index e5b3324651fce..771246a304ce2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/commutable-zvdot4a8i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/commutable-zvdot4a8i.ll @@ -4,22 +4,22 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+experimental-zvdot4a8i \ ; RUN: -verify-machineinstrs | FileCheck %s -; vdota4.vv - commutable -define @commutable_vdota4_vv( %0, %1, iXLen %2) nounwind { -; CHECK-LABEL: commutable_vdota4_vv: +; vdot4a.vv - commutable +define @commutable_vdot4a_vv( %0, %1, iXLen %2) nounwind { +; CHECK-LABEL: commutable_vdot4a_vv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vdota4.vv v8, v8, v9 +; CHECK-NEXT: vdot4a.vv v8, v8, v9 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32( poison, %0, %1, iXLen %2, iXLen 1) - %b = call @llvm.riscv.vdota4.nxv2i32.nxv2i32( + %b = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32( poison, %1, %0, 
@@ -28,21 +28,21 @@ entry: ret %ret } -define @commutable_vdota4_vv_masked( %0, %1, %mask, iXLen %2) { -; CHECK-LABEL: commutable_vdota4_vv_masked: +define @commutable_vdot4a_vv_masked( %0, %1, %mask, iXLen %2) { +; CHECK-LABEL: commutable_vdot4a_vv_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vdota4.vv v8, v8, v9, v0.t +; CHECK-NEXT: vdot4a.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: ret - %a = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32( poison, %0, %1, %mask, iXLen %2, iXLen 1) - %b = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32( + %b = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32( poison, %1, %0, @@ -52,22 +52,22 @@ define @commutable_vdota4_vv_masked( %0, %ret } -; vdota4u.vv - commutable -define @commutable_vdota4u_vv( %0, %1, iXLen %2) nounwind { -; CHECK-LABEL: commutable_vdota4u_vv: +; vdot4au.vv - commutable +define @commutable_vdot4au_vv( %0, %1, iXLen %2) nounwind { +; CHECK-LABEL: commutable_vdot4au_vv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vdota4u.vv v8, v8, v9 +; CHECK-NEXT: vdot4au.vv v8, v8, v9 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32( poison, %0, %1, iXLen %2, iXLen 1) - %b = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32( + %b = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32( poison, %1, %0, @@ -76,21 +76,21 @@ entry: ret %ret } -define @commutable_vdota4u_vv_masked( %0, %1, %mask, iXLen %2) { -; CHECK-LABEL: commutable_vdota4u_vv_masked: +define @commutable_vdot4au_vv_masked( %0, %1, %mask, iXLen %2) { +; CHECK-LABEL: commutable_vdot4au_vv_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vdota4u.vv v8, v8, v9, v0.t +; CHECK-NEXT: vdot4au.vv 
v8, v8, v9, v0.t ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: ret - %a = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32( poison, %0, %1, %mask, iXLen %2, iXLen 1) - %b = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32( + %b = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32( poison, %1, %0, @@ -100,23 +100,23 @@ define @commutable_vdota4u_vv_masked( %0, < ret %ret } -; vdota4su.vv - NOT commutable (signed x unsigned, operand order matters) -define @commutable_vdota4su_vv( %0, %1, iXLen %2) nounwind { -; CHECK-LABEL: commutable_vdota4su_vv: +; vdot4asu.vv - NOT commutable (signed x unsigned, operand order matters) +define @commutable_vdot4asu_vv( %0, %1, iXLen %2) nounwind { +; CHECK-LABEL: commutable_vdot4asu_vv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vdota4su.vv v10, v8, v9 -; CHECK-NEXT: vdota4su.vv v8, v9, v8 +; CHECK-NEXT: vdot4asu.vv v10, v8, v9 +; CHECK-NEXT: vdot4asu.vv v8, v9, v8 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4asu.nxv2i32.nxv2i32( poison, %0, %1, iXLen %2, iXLen 1) - %b = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32( + %b = call @llvm.riscv.vdot4asu.nxv2i32.nxv2i32( poison, %1, %0, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvdot4a8i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvdot4a8i.ll index f1211adcd2c09..2482ccbdde9ac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvdot4a8i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvdot4a8i.ll @@ -4,8 +4,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvdot4a8i -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT32 ; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvdot4a8i -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT64 -define i32 
@vdota4_vv(<16 x i8> %a, <16 x i8> %b) { -; NODOT-LABEL: vdota4_vv: +define i32 @vdot4a_vv(<16 x i8> %a, <16 x i8> %b) { +; NODOT-LABEL: vdot4a_vv: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -17,11 +17,11 @@ define i32 @vdota4_vv(<16 x i8> %a, <16 x i8> %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv: +; DOT-LABEL: vdot4a_vv: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -34,8 +34,8 @@ entry: ret i32 %res } -define i32 @vdota4_vx_constant(<16 x i8> %a) { -; CHECK-LABEL: vdota4_vx_constant: +define i32 @vdot4a_vx_constant(<16 x i8> %a) { +; CHECK-LABEL: vdot4a_vx_constant: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v8 @@ -53,8 +53,8 @@ entry: ret i32 %res } -define i32 @vdota4_vx_constant_swapped(<16 x i8> %a) { -; CHECK-LABEL: vdota4_vx_constant_swapped: +define i32 @vdot4a_vx_constant_swapped(<16 x i8> %a) { +; CHECK-LABEL: vdot4a_vx_constant_swapped: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v8 @@ -72,8 +72,8 @@ entry: ret i32 %res } -define i32 @vdota4u_vv(<16 x i8> %a, <16 x i8> %b) { -; NODOT-LABEL: vdota4u_vv: +define i32 @vdot4au_vv(<16 x i8> %a, <16 x i8> %b) { +; NODOT-LABEL: vdot4au_vv: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; NODOT-NEXT: vwmulu.vv v10, v8, v9 @@ -85,11 +85,11 @@ define i32 @vdota4u_vv(<16 x i8> %a, <16 x i8> %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv: +; DOT-LABEL: vdot4au_vv: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4u.vv v10, v8, v9 +; DOT-NEXT: vdot4au.vv 
v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -102,8 +102,8 @@ entry: ret i32 %res } -define i32 @vdota4u_vx_constant(<16 x i8> %a) { -; CHECK-LABEL: vdota4u_vx_constant: +define i32 @vdot4au_vx_constant(<16 x i8> %a) { +; CHECK-LABEL: vdot4au_vx_constant: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 @@ -121,8 +121,8 @@ entry: ret i32 %res } -define i32 @vdota4su_vv(<16 x i8> %a, <16 x i8> %b) { -; NODOT-LABEL: vdota4su_vv: +define i32 @vdot4asu_vv(<16 x i8> %a, <16 x i8> %b) { +; NODOT-LABEL: vdot4asu_vv: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -134,11 +134,11 @@ define i32 @vdota4su_vv(<16 x i8> %a, <16 x i8> %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv: +; DOT-LABEL: vdot4asu_vv: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4su.vv v10, v8, v9 +; DOT-NEXT: vdot4asu.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -151,8 +151,8 @@ entry: ret i32 %res } -define i32 @vdota4su_vv_swapped(<16 x i8> %a, <16 x i8> %b) { -; NODOT-LABEL: vdota4su_vv_swapped: +define i32 @vdot4asu_vv_swapped(<16 x i8> %a, <16 x i8> %b) { +; NODOT-LABEL: vdot4asu_vv_swapped: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -164,11 +164,11 @@ define i32 @vdota4su_vv_swapped(<16 x i8> %a, <16 x i8> %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_swapped: +; DOT-LABEL: vdot4asu_vv_swapped: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4su.vv v10, v8, v9 +; DOT-NEXT: vdot4asu.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, 
v8 @@ -236,7 +236,7 @@ define i32 @reduce_of_sext(<16 x i8> %a) { ; DOT-NEXT: vmv.v.i v9, 1 ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -263,7 +263,7 @@ define i32 @reduce_of_zext(<16 x i8> %a) { ; DOT-NEXT: vmv.v.i v9, 1 ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4u.vv v10, v8, v9 +; DOT-NEXT: vdot4au.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -274,8 +274,8 @@ entry: ret i32 %res } -define i32 @vdota4_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { -; NODOT-LABEL: vdota4_vv_accum: +define i32 @vdot4a_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; NODOT-LABEL: vdot4a_vv_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -287,11 +287,11 @@ define i32 @vdota4_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_accum: +; DOT-LABEL: vdot4a_vv_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vdota4.vv v16, v8, v9 +; DOT-NEXT: vdot4a.vv v16, v8, v9 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma ; DOT-NEXT: vmv.v.v v12, v16 ; DOT-NEXT: vmv.s.x v8, zero @@ -308,8 +308,8 @@ entry: ret i32 %sum } -define i32 @vdota4u_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { -; NODOT-LABEL: vdota4u_vv_accum: +define i32 @vdot4au_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; NODOT-LABEL: vdot4au_vv_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; NODOT-NEXT: vwmulu.vv v10, v8, v9 @@ -321,11 +321,11 @@ define i32 @vdota4u_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { ; NODOT-NEXT: vmv.x.s a0, v8 ; 
NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv_accum: +; DOT-LABEL: vdot4au_vv_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vdota4u.vv v16, v8, v9 +; DOT-NEXT: vdot4au.vv v16, v8, v9 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma ; DOT-NEXT: vmv.v.v v12, v16 ; DOT-NEXT: vmv.s.x v8, zero @@ -342,8 +342,8 @@ entry: ret i32 %sum } -define i32 @vdota4su_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { -; NODOT-LABEL: vdota4su_vv_accum: +define i32 @vdot4asu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; NODOT-LABEL: vdot4asu_vv_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -355,11 +355,11 @@ define i32 @vdota4su_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_accum: +; DOT-LABEL: vdot4asu_vv_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vdota4su.vv v16, v8, v9 +; DOT-NEXT: vdot4asu.vv v16, v8, v9 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma ; DOT-NEXT: vmv.v.v v12, v16 ; DOT-NEXT: vmv.s.x v8, zero @@ -376,8 +376,8 @@ entry: ret i32 %sum } -define i32 @vdota4_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { -; NODOT-LABEL: vdota4_vv_scalar_add: +define i32 @vdot4a_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { +; NODOT-LABEL: vdot4a_vv_scalar_add: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -389,11 +389,11 @@ define i32 @vdota4_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_scalar_add: +; DOT-LABEL: vdot4a_vv_scalar_add: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, a0 
; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -407,8 +407,8 @@ entry: ret i32 %add } -define i32 @vdota4u_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { -; NODOT-LABEL: vdota4u_vv_scalar_add: +define i32 @vdot4au_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { +; NODOT-LABEL: vdot4au_vv_scalar_add: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; NODOT-NEXT: vwmulu.vv v10, v8, v9 @@ -420,11 +420,11 @@ define i32 @vdota4u_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv_scalar_add: +; DOT-LABEL: vdot4au_vv_scalar_add: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4u.vv v10, v8, v9 +; DOT-NEXT: vdot4au.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, a0 ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -438,8 +438,8 @@ entry: ret i32 %add } -define i32 @vdota4su_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { -; NODOT-LABEL: vdota4su_vv_scalar_add: +define i32 @vdot4asu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { +; NODOT-LABEL: vdot4asu_vv_scalar_add: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -451,11 +451,11 @@ define i32 @vdota4su_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_scalar_add: +; DOT-LABEL: vdot4asu_vv_scalar_add: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4su.vv v10, v8, v9 +; DOT-NEXT: vdot4asu.vv v10, v8, v9 ; DOT-NEXT: vmv.s.x v8, a0 ; DOT-NEXT: vredsum.vs v8, v10, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -469,8 +469,8 @@ entry: ret i32 %add } -define i32 @vdota4_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; NODOT-LABEL: vdota4_vv_split: +define i32 @vdot4a_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x 
i8> %c, <16 x i8> %d) { +; NODOT-LABEL: vdot4a_vv_split: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -485,12 +485,12 @@ define i32 @vdota4_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_split: +; DOT-LABEL: vdot4a_vv_split: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4.vv v12, v8, v9 -; DOT-NEXT: vdota4.vv v12, v10, v11 +; DOT-NEXT: vdot4a.vv v12, v8, v9 +; DOT-NEXT: vdot4a.vv v12, v10, v11 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -507,8 +507,8 @@ entry: ret i32 %sum } -define <1 x i32> @vdota4_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { -; NODOT-LABEL: vdota4_vv_partial_reduce_v1i32_v4i8: +define <1 x i32> @vdot4a_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { +; NODOT-LABEL: vdot4a_vv_partial_reduce_v1i32_v4i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -526,11 +526,11 @@ define <1 x i32> @vdota4_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) ; NODOT-NEXT: vadd.vv v8, v9, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_partial_reduce_v1i32_v4i8: +; DOT-LABEL: vdot4a_vv_partial_reduce_v1i32_v4i8: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; DOT-NEXT: vmv.s.x v10, zero -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv1r.v v8, v10 ; DOT-NEXT: ret entry: @@ -541,8 +541,8 @@ entry: ret <1 x i32> %res } -define <1 x i32> @vdota4u_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { -; NODOT-LABEL: vdota4u_vv_partial_reduce_v1i32_v4i8: +define <1 x i32> @vdot4au_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { +; NODOT-LABEL: vdot4au_vv_partial_reduce_v1i32_v4i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 4, e8, 
mf4, ta, ma ; NODOT-NEXT: vwmulu.vv v10, v8, v9 @@ -559,11 +559,11 @@ define <1 x i32> @vdota4u_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) ; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv_partial_reduce_v1i32_v4i8: +; DOT-LABEL: vdot4au_vv_partial_reduce_v1i32_v4i8: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; DOT-NEXT: vmv.s.x v10, zero -; DOT-NEXT: vdota4u.vv v10, v8, v9 +; DOT-NEXT: vdot4au.vv v10, v8, v9 ; DOT-NEXT: vmv1r.v v8, v10 ; DOT-NEXT: ret entry: @@ -574,8 +574,8 @@ entry: ret <1 x i32> %res } -define <1 x i32> @vdota4u_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { -; NODOT-LABEL: vdota4u_vx_partial_reduce: +define <1 x i32> @vdot4au_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { +; NODOT-LABEL: vdot4au_vx_partial_reduce: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; NODOT-NEXT: vzext.vf4 v9, v8 @@ -591,7 +591,7 @@ define <1 x i32> @vdota4u_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { ; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vx_partial_reduce: +; DOT-LABEL: vdot4au_vx_partial_reduce: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; DOT-NEXT: vmv.s.x v9, zero @@ -599,7 +599,7 @@ define <1 x i32> @vdota4u_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { ; DOT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; DOT-NEXT: vmv.v.x v10, a0 ; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; DOT-NEXT: vdota4u.vv v9, v8, v10 +; DOT-NEXT: vdot4au.vv v9, v8, v10 ; DOT-NEXT: vmv1r.v v8, v9 ; DOT-NEXT: ret entry: @@ -609,8 +609,8 @@ entry: ret <1 x i32> %res } -define <1 x i32> @vdota4_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { -; NODOT-LABEL: vdota4_vx_partial_reduce: +define <1 x i32> @vdot4a_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { +; NODOT-LABEL: vdot4a_vx_partial_reduce: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; NODOT-NEXT: vsext.vf4 v9, v8 @@ -627,7 +627,7 @@ define <1 x i32> 
@vdota4_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { ; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vx_partial_reduce: +; DOT-LABEL: vdot4a_vx_partial_reduce: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; DOT-NEXT: vmv.s.x v9, zero @@ -635,7 +635,7 @@ define <1 x i32> @vdota4_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { ; DOT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; DOT-NEXT: vmv.v.x v10, a0 ; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; DOT-NEXT: vdota4.vv v9, v8, v10 +; DOT-NEXT: vdot4a.vv v9, v8, v10 ; DOT-NEXT: vmv1r.v v8, v9 ; DOT-NEXT: ret entry: @@ -645,8 +645,8 @@ entry: ret <1 x i32> %res } -define <1 x i32> @vdota4su_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { -; NODOT-LABEL: vdota4su_vv_partial_reduce_v1i32_v4i8: +define <1 x i32> @vdot4asu_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { +; NODOT-LABEL: vdot4asu_vv_partial_reduce_v1i32_v4i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -664,11 +664,11 @@ define <1 x i32> @vdota4su_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b ; NODOT-NEXT: vadd.vv v8, v9, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_partial_reduce_v1i32_v4i8: +; DOT-LABEL: vdot4asu_vv_partial_reduce_v1i32_v4i8: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; DOT-NEXT: vmv.s.x v10, zero -; DOT-NEXT: vdota4su.vv v10, v8, v9 +; DOT-NEXT: vdot4asu.vv v10, v8, v9 ; DOT-NEXT: vmv1r.v v8, v10 ; DOT-NEXT: ret entry: @@ -679,8 +679,8 @@ entry: ret <1 x i32> %res } -define <1 x i32> @vdota4su_vv_partial_reduce_swapped(<4 x i8> %a, <4 x i8> %b) { -; NODOT-LABEL: vdota4su_vv_partial_reduce_swapped: +define <1 x i32> @vdot4asu_vv_partial_reduce_swapped(<4 x i8> %a, <4 x i8> %b) { +; NODOT-LABEL: vdot4asu_vv_partial_reduce_swapped: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -698,11 +698,11 @@ define <1 
x i32> @vdota4su_vv_partial_reduce_swapped(<4 x i8> %a, <4 x i8> %b) { ; NODOT-NEXT: vadd.vv v8, v9, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_partial_reduce_swapped: +; DOT-LABEL: vdot4asu_vv_partial_reduce_swapped: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; DOT-NEXT: vmv.s.x v10, zero -; DOT-NEXT: vdota4su.vv v10, v8, v9 +; DOT-NEXT: vdot4asu.vv v10, v8, v9 ; DOT-NEXT: vmv1r.v v8, v10 ; DOT-NEXT: ret entry: @@ -713,8 +713,8 @@ entry: ret <1 x i32> %res } -define <1 x i32> @vdota4su_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { -; CHECK-LABEL: vdota4su_vx_partial_reduce: +define <1 x i32> @vdot4asu_vx_partial_reduce(<4 x i8> %a, <4 x i8> %b) { +; CHECK-LABEL: vdot4asu_vx_partial_reduce: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vsext.vf4 v9, v8 @@ -737,8 +737,8 @@ entry: } -define <2 x i32> @vdota4_vv_partial_reduce_v2i32_v8i8(<8 x i8> %a, <8 x i8> %b) { -; NODOT-LABEL: vdota4_vv_partial_reduce_v2i32_v8i8: +define <2 x i32> @vdot4a_vv_partial_reduce_v2i32_v8i8(<8 x i8> %a, <8 x i8> %b) { +; NODOT-LABEL: vdot4a_vv_partial_reduce_v2i32_v8i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -757,11 +757,11 @@ define <2 x i32> @vdota4_vv_partial_reduce_v2i32_v8i8(<8 x i8> %a, <8 x i8> %b) ; NODOT-NEXT: vadd.vv v8, v8, v12 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_partial_reduce_v2i32_v8i8: +; DOT-LABEL: vdot4a_vv_partial_reduce_v2i32_v8i8: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv1r.v v8, v10 ; DOT-NEXT: ret entry: @@ -772,8 +772,8 @@ entry: ret <2 x i32> %res } -define <2 x i32> @vdota4_vv_partial_reduce_v2i32_v64i8(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: vdota4_vv_partial_reduce_v2i32_v64i8: +define <2 x i32> @vdot4a_vv_partial_reduce_v2i32_v64i8(<64 x i8> %a, <64 x i8> %b) 
{ +; CHECK-LABEL: vdot4a_vv_partial_reduce_v2i32_v64i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -949,8 +949,8 @@ entry: ret <2 x i32> %res } -define <4 x i32> @vdota4_vv_partial_reduce_v4i32_v16i8(<16 x i8> %a, <16 x i8> %b) { -; NODOT-LABEL: vdota4_vv_partial_reduce_v4i32_v16i8: +define <4 x i32> @vdot4a_vv_partial_reduce_v4i32_v16i8(<16 x i8> %a, <16 x i8> %b) { +; NODOT-LABEL: vdot4a_vv_partial_reduce_v4i32_v16i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -969,11 +969,11 @@ define <4 x i32> @vdota4_vv_partial_reduce_v4i32_v16i8(<16 x i8> %a, <16 x i8> % ; NODOT-NEXT: vadd.vv v8, v8, v16 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_partial_reduce_v4i32_v16i8: +; DOT-LABEL: vdot4a_vv_partial_reduce_v4i32_v16i8: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv.v.v v8, v10 ; DOT-NEXT: ret entry: @@ -984,8 +984,8 @@ entry: ret <4 x i32> %res } -define <16 x i32> @vdota4_vv_partial_reduce_v16i32_v64i8(<64 x i8> %a, <64 x i8> %b) { -; NODOT-LABEL: vdota4_vv_partial_reduce_v16i32_v64i8: +define <16 x i32> @vdot4a_vv_partial_reduce_v16i32_v64i8(<64 x i8> %a, <64 x i8> %b) { +; NODOT-LABEL: vdot4a_vv_partial_reduce_v16i32_v64i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: li a0, 32 ; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma @@ -1012,11 +1012,11 @@ define <16 x i32> @vdota4_vv_partial_reduce_v16i32_v64i8(<64 x i8> %a, <64 x i8> ; NODOT-NEXT: vadd.vv v8, v8, v24 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_partial_reduce_v16i32_v64i8: +; DOT-LABEL: vdot4a_vv_partial_reduce_v16i32_v64i8: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; DOT-NEXT: vmv.v.i v16, 0 -; DOT-NEXT: vdota4.vv v16, v8, v12 +; DOT-NEXT: vdot4a.vv v16, v8, v12 ; DOT-NEXT: vmv.v.v v8, v16 ; DOT-NEXT: ret entry: @@ -1027,8 
+1027,8 @@ entry: ret <16 x i32> %res } -define <4 x i32> @vdota4_vv_partial_reduce_m1_accum(<16 x i8> %a, <16 x i8> %b, <4 x i32> %accum) { -; NODOT-LABEL: vdota4_vv_partial_reduce_m1_accum: +define <4 x i32> @vdot4a_vv_partial_reduce_m1_accum(<16 x i8> %a, <16 x i8> %b, <4 x i32> %accum) { +; NODOT-LABEL: vdot4a_vv_partial_reduce_m1_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -1049,10 +1049,10 @@ define <4 x i32> @vdota4_vv_partial_reduce_m1_accum(<16 x i8> %a, <16 x i8> %b, ; NODOT-NEXT: vadd.vv v8, v8, v16 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_partial_reduce_m1_accum: +; DOT-LABEL: vdot4a_vv_partial_reduce_m1_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; DOT-NEXT: vdota4.vv v10, v8, v9 +; DOT-NEXT: vdot4a.vv v10, v8, v9 ; DOT-NEXT: vmv.v.v v8, v10 ; DOT-NEXT: ret entry: @@ -1063,8 +1063,8 @@ entry: ret <4 x i32> %res } -define <16 x i32> @vdota4_vv_partial_reduce3(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: vdota4_vv_partial_reduce3: +define <16 x i32> @vdot4a_vv_partial_reduce3(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vdot4a_vv_partial_reduce3: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v8 @@ -1080,8 +1080,8 @@ entry: } ; Test legalization - type split -define <64 x i32> @vdota4su_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> %b) { -; NODOT-LABEL: vdota4su_vv_partial_v64i32_v256i8: +define <64 x i32> @vdot4asu_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> %b) { +; NODOT-LABEL: vdot4asu_vv_partial_v64i32_v256i8: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: addi sp, sp, -16 ; NODOT-NEXT: .cfi_def_cfa_offset 16 @@ -1298,7 +1298,7 @@ define <64 x i32> @vdota4su_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> % ; NODOT-NEXT: .cfi_def_cfa_offset 0 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_partial_v64i32_v256i8: +; DOT-LABEL: vdot4asu_vv_partial_v64i32_v256i8: ; DOT: # 
%bb.0: # %entry ; DOT-NEXT: addi sp, sp, -16 ; DOT-NEXT: .cfi_def_cfa_offset 16 @@ -1345,7 +1345,7 @@ define <64 x i32> @vdota4su_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> % ; DOT-NEXT: add a0, sp, a0 ; DOT-NEXT: addi a0, a0, 16 ; DOT-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; DOT-NEXT: vdota4su.vv v0, v16, v8 +; DOT-NEXT: vdot4asu.vv v0, v16, v8 ; DOT-NEXT: csrr a0, vlenb ; DOT-NEXT: slli a0, a0, 3 ; DOT-NEXT: mv a1, a0 @@ -1356,7 +1356,7 @@ define <64 x i32> @vdota4su_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> % ; DOT-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; DOT-NEXT: addi a0, sp, 16 ; DOT-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; DOT-NEXT: vdota4su.vv v24, v16, v8 +; DOT-NEXT: vdot4asu.vv v24, v16, v8 ; DOT-NEXT: vmv.v.v v8, v0 ; DOT-NEXT: vmv.v.v v16, v24 ; DOT-NEXT: csrr a0, vlenb @@ -1375,8 +1375,8 @@ entry: } ; Test legalization - integer promote -define <4 x i31> @vdota4su_vv_partial_v4i31_v16i7(<16 x i7> %a, <16 x i7> %b) { -; NODOT-LABEL: vdota4su_vv_partial_v4i31_v16i7: +define <4 x i31> @vdot4asu_vv_partial_v4i31_v16i7(<16 x i7> %a, <16 x i7> %b) { +; NODOT-LABEL: vdot4asu_vv_partial_v4i31_v16i7: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; NODOT-NEXT: vzext.vf4 v12, v8 @@ -1404,7 +1404,7 @@ define <4 x i31> @vdota4su_vv_partial_v4i31_v16i7(<16 x i7> %a, <16 x i7> %b) { ; NODOT-NEXT: vadd.vv v8, v8, v16 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_partial_v4i31_v16i7: +; DOT-LABEL: vdot4asu_vv_partial_v4i31_v16i7: ; DOT: # %bb.0: # %entry ; DOT-NEXT: li a0, 127 ; DOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma @@ -1413,7 +1413,7 @@ define <4 x i31> @vdota4su_vv_partial_v4i31_v16i7(<16 x i7> %a, <16 x i7> %b) { ; DOT-NEXT: vsra.vi v10, v8, 1 ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v8, 0 -; DOT-NEXT: vdota4su.vv v8, v10, v9 +; DOT-NEXT: vdot4asu.vv v8, v10, v9 ; DOT-NEXT: ret entry: %a.ext = sext <16 x i7> %a to <16 x i31> @@ -1425,8 
+1425,8 @@ entry: ; Test legalization - expand -define <1 x i32> @vdota4su_vv_partial_v1i32_v2i8(<2 x i8> %a, <2 x i8> %b) { -; CHECK-LABEL: vdota4su_vv_partial_v1i32_v2i8: +define <1 x i32> @vdot4asu_vv_partial_v1i32_v2i8(<2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: vdot4asu_vv_partial_v1i32_v2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vsext.vf2 v10, v8 @@ -1446,10 +1446,10 @@ entry: } ; TODO: This isn't legal, but we could split it into two halves, and use -; a pair of slides + two vdota4su_vv here. Or alternatively, the mul +; a pair of slides + two vdot4asu_vv here. Or alternatively, the mul ; sequence + one vredsum, or a vadd reduce tree. -define <1 x i32> @vdota4su_vv_partial_v1i32_v8i8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: vdota4su_vv_partial_v1i32_v8i8: +define <1 x i32> @vdot4asu_vv_partial_v1i32_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vdot4asu_vv_partial_v1i32_v8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsext.vf2 v10, v8 @@ -1511,7 +1511,7 @@ define <4 x i32> @partial_of_sext(<16 x i8> %a) { ; DOT-NEXT: vmv.v.i v10, 1 ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v9, 0 -; DOT-NEXT: vdota4.vv v9, v8, v10 +; DOT-NEXT: vdot4a.vv v9, v8, v10 ; DOT-NEXT: vmv.v.v v8, v9 ; DOT-NEXT: ret entry: @@ -1544,7 +1544,7 @@ define <4 x i32> @partial_of_zext(<16 x i8> %a) { ; DOT-NEXT: vmv.v.i v10, 1 ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v9, 0 -; DOT-NEXT: vdota4u.vv v9, v8, v10 +; DOT-NEXT: vdot4au.vv v9, v8, v10 ; DOT-NEXT: vmv.v.v v8, v9 ; DOT-NEXT: ret entry: @@ -1553,8 +1553,8 @@ entry: ret <4 x i32> %res } -define i32 @vdota4_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { -; NODOT-LABEL: vdota4_vv_accum_disjoint_or: +define i32 @vdot4a_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; NODOT-LABEL: vdot4a_vv_accum_disjoint_or: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: 
vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -1567,11 +1567,11 @@ define i32 @vdota4_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> % ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_accum_disjoint_or: +; DOT-LABEL: vdot4a_vv_accum_disjoint_or: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vdota4.vv v16, v8, v9 +; DOT-NEXT: vdot4a.vv v16, v8, v9 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma ; DOT-NEXT: vmv.v.v v12, v16 ; DOT-NEXT: vmv.s.x v8, zero @@ -1588,8 +1588,8 @@ entry: ret i32 %sum } -define i32 @vdota4_vv_accum_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { -; CHECK-LABEL: vdota4_vv_accum_or: +define i32 @vdot4a_vv_accum_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; CHECK-LABEL: vdot4a_vv_accum_or: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vsext.vf2 v16, v8 @@ -1610,8 +1610,8 @@ entry: ret i32 %sum } -define i32 @vdota4u_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { -; NODOT-LABEL: vdota4u_vv_accum_disjoint_or: +define i32 @vdot4au_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; NODOT-LABEL: vdot4au_vv_accum_disjoint_or: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; NODOT-NEXT: vwmulu.vv v10, v8, v9 @@ -1623,11 +1623,11 @@ define i32 @vdota4u_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv_accum_disjoint_or: +; DOT-LABEL: vdot4au_vv_accum_disjoint_or: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vdota4u.vv v16, v8, v9 +; DOT-NEXT: vdot4au.vv v16, v8, v9 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma ; DOT-NEXT: vmv.v.v v12, v16 ; DOT-NEXT: vmv.s.x v8, zero @@ -1644,8 +1644,8 @@ entry: ret i32 %sum } -define i32 @vdota4su_vv_accum_disjoint_or(<16 x i8> %a, 
<16 x i8> %b, <16 x i32> %x) { -; NODOT-LABEL: vdota4su_vv_accum_disjoint_or: +define i32 @vdot4asu_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { +; NODOT-LABEL: vdot4asu_vv_accum_disjoint_or: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -1658,11 +1658,11 @@ define i32 @vdota4su_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_accum_disjoint_or: +; DOT-LABEL: vdot4asu_vv_accum_disjoint_or: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vdota4su.vv v16, v8, v9 +; DOT-NEXT: vdot4asu.vv v16, v8, v9 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma ; DOT-NEXT: vmv.v.v v12, v16 ; DOT-NEXT: vmv.s.x v8, zero diff --git a/llvm/test/CodeGen/RISCV/rvv/vdota4.ll b/llvm/test/CodeGen/RISCV/rvv/vdot4a.ll similarity index 62% rename from llvm/test/CodeGen/RISCV/rvv/vdota4.ll rename to llvm/test/CodeGen/RISCV/rvv/vdot4a.ll index c28520b81d014..591b80d5cdb6d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdota4.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdot4a.ll @@ -4,14 +4,14 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+experimental-zvdot4a8i \ ; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK -define @intrinsic_vdota4_vv_nxv1i32_nxv1i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vv_nxv1i32_nxv1i32: +define @intrinsic_vdot4a_vv_nxv1i32_nxv1i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vv_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vdota4.vv v8, v9, v10 +; CHECK-NEXT: vdot4a.vv v8, v9, v10 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv1i32.nxv1i32( + %a = call @llvm.riscv.vdot4a.nxv1i32.nxv1i32( %0, %1, %2, @@ -20,14 +20,14 @@ entry: ret %a } -define @intrinsic_vdota4_vv_nxv2i32_nxv2i32( %0, %1, 
%2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vv_nxv2i32_nxv2i32: +define @intrinsic_vdot4a_vv_nxv2i32_nxv2i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vv_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vdota4.vv v8, v9, v10 +; CHECK-NEXT: vdot4a.vv v8, v9, v10 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4a.nxv2i32.nxv2i32( %0, %1, %2, @@ -36,14 +36,14 @@ entry: ret %a } -define @intrinsic_vdota4_vv_nxv4i32_nxv4i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vv_nxv4i32_nxv4i32: +define @intrinsic_vdot4a_vv_nxv4i32_nxv4i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vv_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vdota4.vv v8, v10, v12 +; CHECK-NEXT: vdot4a.vv v8, v10, v12 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv4i32.nxv4i32( + %a = call @llvm.riscv.vdot4a.nxv4i32.nxv4i32( %0, %1, %2, @@ -52,14 +52,14 @@ entry: ret %a } -define @intrinsic_vdota4_vv_nxv8i32_nxv8i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vv_nxv8i32_nxv8i32: +define @intrinsic_vdot4a_vv_nxv8i32_nxv8i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vv_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vdota4.vv v8, v12, v16 +; CHECK-NEXT: vdot4a.vv v8, v12, v16 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv8i32.nxv8i32( + %a = call @llvm.riscv.vdot4a.nxv8i32.nxv8i32( %0, %1, %2, @@ -68,15 +68,15 @@ entry: ret %a } -define @intrinsic_vdota4_vv_nxv16i32_nxv16i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vv_nxv16i32_nxv16i32: +define @intrinsic_vdot4a_vv_nxv16i32_nxv16i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vv_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma -; CHECK-NEXT: vdota4.vv v8, v16, v24 +; CHECK-NEXT: vdot4a.vv v8, v16, v24 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv16i32.nxv16i32( + %a = call @llvm.riscv.vdot4a.nxv16i32.nxv16i32( %0, %1, %2, @@ -85,14 +85,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vv_nxv1i32_nxv1i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vv_nxv1i32_nxv1i32: +define @intrinsic_vdot4a_mask_vv_nxv1i32_nxv1i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vv_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vdota4.vv v8, v9, v10, v0.t +; CHECK-NEXT: vdot4a.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv1i32.nxv1i32( + %a = call @llvm.riscv.vdot4a.mask.nxv1i32.nxv1i32( %0, %1, %2, @@ -102,14 +102,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vv_nxv2i32_nxv2i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vv_nxv2i32_nxv2i32: +define @intrinsic_vdot4a_mask_vv_nxv2i32_nxv2i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vv_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vdota4.vv v8, v9, v10, v0.t +; CHECK-NEXT: vdot4a.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4a.mask.nxv2i32.nxv2i32( %0, %1, %2, @@ -119,14 +119,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vv_nxv4i32_nxv4i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vv_nxv4i32_nxv4i32: +define @intrinsic_vdot4a_mask_vv_nxv4i32_nxv4i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vv_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vdota4.vv v8, v10, 
v12, v0.t +; CHECK-NEXT: vdot4a.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv4i32.nxv4i32( + %a = call @llvm.riscv.vdot4a.mask.nxv4i32.nxv4i32( %0, %1, %2, @@ -136,14 +136,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vv_nxv8i32_nxv8i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vv_nxv8i32_nxv8i32: +define @intrinsic_vdot4a_mask_vv_nxv8i32_nxv8i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vv_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vdota4.vv v8, v12, v16, v0.t +; CHECK-NEXT: vdot4a.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv8i32.nxv8i32( + %a = call @llvm.riscv.vdot4a.mask.nxv8i32.nxv8i32( %0, %1, %2, @@ -153,15 +153,15 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vv_nxv16i32_nxv16i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vv_nxv16i32_nxv16i32: +define @intrinsic_vdot4a_mask_vv_nxv16i32_nxv16i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vv_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4.vv v8, v16, v24, v0.t +; CHECK-NEXT: vdot4a.vv v8, v16, v24, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv16i32.nxv16i32( + %a = call @llvm.riscv.vdot4a.mask.nxv16i32.nxv16i32( %0, %1, %2, @@ -171,14 +171,14 @@ entry: ret %a } -define @intrinsic_vdota4_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vx_nxv1i32_i32: +define @intrinsic_vdot4a_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vdota4.vx v8, v9, a0 +; CHECK-NEXT: vdot4a.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call 
@llvm.riscv.vdota4.nxv1i32.i32( + %a = call @llvm.riscv.vdot4a.nxv1i32.i32( %0, %1, i32 %2, @@ -187,14 +187,14 @@ entry: ret %a } -define @intrinsic_vdota4_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vx_nxv2i32_i32: +define @intrinsic_vdot4a_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma -; CHECK-NEXT: vdota4.vx v8, v9, a0 +; CHECK-NEXT: vdot4a.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv2i32.i32( + %a = call @llvm.riscv.vdot4a.nxv2i32.i32( %0, %1, i32 %2, @@ -203,14 +203,14 @@ entry: ret %a } -define @intrinsic_vdota4_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vx_nxv4i32_i32: +define @intrinsic_vdot4a_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma -; CHECK-NEXT: vdota4.vx v8, v10, a0 +; CHECK-NEXT: vdot4a.vx v8, v10, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv4i32.i32( + %a = call @llvm.riscv.vdot4a.nxv4i32.i32( %0, %1, i32 %2, @@ -219,14 +219,14 @@ entry: ret %a } -define @intrinsic_vdota4_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vx_nxv8i32_i32: +define @intrinsic_vdot4a_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, ma -; CHECK-NEXT: vdota4.vx v8, v12, a0 +; CHECK-NEXT: vdot4a.vx v8, v12, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv8i32.i32( + %a = call @llvm.riscv.vdot4a.nxv8i32.i32( %0, %1, i32 %2, @@ -235,14 +235,14 @@ entry: ret %a } -define @intrinsic_vdota4_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_vx_nxv16i32_i32: +define 
@intrinsic_vdot4a_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma -; CHECK-NEXT: vdota4.vx v8, v16, a0 +; CHECK-NEXT: vdot4a.vx v8, v16, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.nxv16i32.i32( + %a = call @llvm.riscv.vdot4a.nxv16i32.i32( %0, %1, i32 %2, @@ -251,14 +251,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vx_nxv1i32_i32: +define @intrinsic_vdot4a_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vdota4.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4a.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv1i32.i32( + %a = call @llvm.riscv.vdot4a.mask.nxv1i32.i32( %0, %1, i32 %2, @@ -268,14 +268,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vx_nxv2i32_i32: +define @intrinsic_vdot4a_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vdota4.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4a.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv2i32.i32( + %a = call @llvm.riscv.vdot4a.mask.nxv2i32.i32( %0, %1, i32 %2, @@ -285,14 +285,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vx_nxv4i32_i32: +define @intrinsic_vdot4a_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu -; CHECK-NEXT: vdota4.vx v8, v10, a0, v0.t +; CHECK-NEXT: vdot4a.vx v8, v10, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv4i32.i32( + %a = call @llvm.riscv.vdot4a.mask.nxv4i32.i32( %0, %1, i32 %2, @@ -302,14 +302,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vx_nxv8i32_i32: +define @intrinsic_vdot4a_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu -; CHECK-NEXT: vdota4.vx v8, v12, a0, v0.t +; CHECK-NEXT: vdot4a.vx v8, v12, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv8i32.i32( + %a = call @llvm.riscv.vdot4a.mask.nxv8i32.i32( %0, %1, i32 %2, @@ -319,14 +319,14 @@ entry: ret %a } -define @intrinsic_vdota4_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4_mask_vx_nxv16i32_i32: +define @intrinsic_vdot4a_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4a_mask_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4.vx v8, v16, a0, v0.t +; CHECK-NEXT: vdot4a.vx v8, v16, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4.mask.nxv16i32.i32( + %a = call @llvm.riscv.vdot4a.mask.nxv16i32.i32( %0, %1, i32 %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/vdota4su.ll b/llvm/test/CodeGen/RISCV/rvv/vdot4asu.ll similarity index 62% rename from llvm/test/CodeGen/RISCV/rvv/vdota4su.ll rename to llvm/test/CodeGen/RISCV/rvv/vdot4asu.ll index a33b5ce5fa3d2..07d10f0135f18 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdota4su.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdot4asu.ll @@ -4,14 +4,14 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+experimental-zvdot4a8i \ ; RUN: 
-verify-machineinstrs | FileCheck %s --check-prefixes=CHECK -define @intrinsic_vdota4su_vv_nxv1i32_nxv1i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vv_nxv1i32_nxv1i32: +define @intrinsic_vdot4asu_vv_nxv1i32_nxv1i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vv_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vdota4su.vv v8, v9, v10 +; CHECK-NEXT: vdot4asu.vv v8, v9, v10 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv1i32.nxv1i32( + %a = call @llvm.riscv.vdot4asu.nxv1i32.nxv1i32( %0, %1, %2, @@ -20,14 +20,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vv_nxv2i32_nxv2i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vv_nxv2i32_nxv2i32: +define @intrinsic_vdot4asu_vv_nxv2i32_nxv2i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vv_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vdota4su.vv v8, v9, v10 +; CHECK-NEXT: vdot4asu.vv v8, v9, v10 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4asu.nxv2i32.nxv2i32( %0, %1, %2, @@ -36,14 +36,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vv_nxv4i32_nxv4i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vv_nxv4i32_nxv4i32: +define @intrinsic_vdot4asu_vv_nxv4i32_nxv4i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vv_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vdota4su.vv v8, v10, v12 +; CHECK-NEXT: vdot4asu.vv v8, v10, v12 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv4i32.nxv4i32( + %a = call @llvm.riscv.vdot4asu.nxv4i32.nxv4i32( %0, %1, %2, @@ -52,14 +52,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vv_nxv8i32_nxv8i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vv_nxv8i32_nxv8i32: 
+define @intrinsic_vdot4asu_vv_nxv8i32_nxv8i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vv_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vdota4su.vv v8, v12, v16 +; CHECK-NEXT: vdot4asu.vv v8, v12, v16 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv8i32.nxv8i32( + %a = call @llvm.riscv.vdot4asu.nxv8i32.nxv8i32( %0, %1, %2, @@ -68,15 +68,15 @@ entry: ret %a } -define @intrinsic_vdota4su_vv_nxv16i32_nxv16i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vv_nxv16i32_nxv16i32: +define @intrinsic_vdot4asu_vv_nxv16i32_nxv16i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vv_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma -; CHECK-NEXT: vdota4su.vv v8, v16, v24 +; CHECK-NEXT: vdot4asu.vv v8, v16, v24 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv16i32.nxv16i32( + %a = call @llvm.riscv.vdot4asu.nxv16i32.nxv16i32( %0, %1, %2, @@ -85,14 +85,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vv_nxv1i32_nxv1i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vv_nxv1i32_nxv1i32: +define @intrinsic_vdot4asu_mask_vv_nxv1i32_nxv1i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vv_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vdota4su.vv v8, v9, v10, v0.t +; CHECK-NEXT: vdot4asu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv1i32.nxv1i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv1i32.nxv1i32( %0, %1, %2, @@ -102,14 +102,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vv_nxv2i32_nxv2i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vv_nxv2i32_nxv2i32: +define @intrinsic_vdot4asu_mask_vv_nxv2i32_nxv2i32( %0, %1, %2, %m, iXLen %3) nounwind 
{ +; CHECK-LABEL: intrinsic_vdot4asu_mask_vv_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vdota4su.vv v8, v9, v10, v0.t +; CHECK-NEXT: vdot4asu.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv2i32.nxv2i32( %0, %1, %2, @@ -119,14 +119,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vv_nxv4i32_nxv4i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vv_nxv4i32_nxv4i32: +define @intrinsic_vdot4asu_mask_vv_nxv4i32_nxv4i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vv_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vdota4su.vv v8, v10, v12, v0.t +; CHECK-NEXT: vdot4asu.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv4i32.nxv4i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv4i32.nxv4i32( %0, %1, %2, @@ -136,14 +136,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vv_nxv8i32_nxv8i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vv_nxv8i32_nxv8i32: +define @intrinsic_vdot4asu_mask_vv_nxv8i32_nxv8i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vv_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vdota4su.vv v8, v12, v16, v0.t +; CHECK-NEXT: vdot4asu.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv8i32.nxv8i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv8i32.nxv8i32( %0, %1, %2, @@ -153,15 +153,15 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vv_nxv16i32_nxv16i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vv_nxv16i32_nxv16i32: +define @intrinsic_vdot4asu_mask_vv_nxv16i32_nxv16i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vdot4asu_mask_vv_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4su.vv v8, v16, v24, v0.t +; CHECK-NEXT: vdot4asu.vv v8, v16, v24, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv16i32.nxv16i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv16i32.nxv16i32( %0, %1, %2, @@ -171,14 +171,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vx_nxv1i32_i32: +define @intrinsic_vdot4asu_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vdota4su.vx v8, v9, a0 +; CHECK-NEXT: vdot4asu.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv1i32.i32( + %a = call @llvm.riscv.vdot4asu.nxv1i32.i32( %0, %1, i32 %2, @@ -187,14 +187,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vx_nxv2i32_i32: +define @intrinsic_vdot4asu_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma -; CHECK-NEXT: vdota4su.vx v8, v9, a0 +; CHECK-NEXT: vdot4asu.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv2i32.i32( + %a = call @llvm.riscv.vdot4asu.nxv2i32.i32( %0, %1, i32 %2, @@ -203,14 +203,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vx_nxv4i32_i32: +define @intrinsic_vdot4asu_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma -; CHECK-NEXT: vdota4su.vx v8, v10, a0 +; 
CHECK-NEXT: vdot4asu.vx v8, v10, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv4i32.i32( + %a = call @llvm.riscv.vdot4asu.nxv4i32.i32( %0, %1, i32 %2, @@ -219,14 +219,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vx_nxv8i32_i32: +define @intrinsic_vdot4asu_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, ma -; CHECK-NEXT: vdota4su.vx v8, v12, a0 +; CHECK-NEXT: vdot4asu.vx v8, v12, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv8i32.i32( + %a = call @llvm.riscv.vdot4asu.nxv8i32.i32( %0, %1, i32 %2, @@ -235,14 +235,14 @@ entry: ret %a } -define @intrinsic_vdota4su_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_vx_nxv16i32_i32: +define @intrinsic_vdot4asu_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma -; CHECK-NEXT: vdota4su.vx v8, v16, a0 +; CHECK-NEXT: vdot4asu.vx v8, v16, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.nxv16i32.i32( + %a = call @llvm.riscv.vdot4asu.nxv16i32.i32( %0, %1, i32 %2, @@ -251,14 +251,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vx_nxv1i32_i32: +define @intrinsic_vdot4asu_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vdota4su.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4asu.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv1i32.i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv1i32.i32( %0, %1, i32 %2, @@ -268,14 
+268,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vx_nxv2i32_i32: +define @intrinsic_vdot4asu_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vdota4su.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4asu.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv2i32.i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv2i32.i32( %0, %1, i32 %2, @@ -285,14 +285,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vx_nxv4i32_i32: +define @intrinsic_vdot4asu_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu -; CHECK-NEXT: vdota4su.vx v8, v10, a0, v0.t +; CHECK-NEXT: vdot4asu.vx v8, v10, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv4i32.i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv4i32.i32( %0, %1, i32 %2, @@ -302,14 +302,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vx_nxv8i32_i32: +define @intrinsic_vdot4asu_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu -; CHECK-NEXT: vdota4su.vx v8, v12, a0, v0.t +; CHECK-NEXT: vdot4asu.vx v8, v12, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv8i32.i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv8i32.i32( %0, %1, i32 %2, @@ -319,14 +319,14 @@ entry: ret %a } -define @intrinsic_vdota4su_mask_vx_nxv16i32_i32( %0, %1, 
i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4su_mask_vx_nxv16i32_i32: +define @intrinsic_vdot4asu_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4asu_mask_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4su.vx v8, v16, a0, v0.t +; CHECK-NEXT: vdot4asu.vx v8, v16, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4su.mask.nxv16i32.i32( + %a = call @llvm.riscv.vdot4asu.mask.nxv16i32.i32( %0, %1, i32 %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/vdota4u.ll b/llvm/test/CodeGen/RISCV/rvv/vdot4au.ll similarity index 62% rename from llvm/test/CodeGen/RISCV/rvv/vdota4u.ll rename to llvm/test/CodeGen/RISCV/rvv/vdot4au.ll index f26027eef1807..7b04ca7c0cbd2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdota4u.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdot4au.ll @@ -4,14 +4,14 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+experimental-zvdot4a8i \ ; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK -define @intrinsic_vdota4u_vv_nxv1i32_nxv1i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vv_nxv1i32_nxv1i32: +define @intrinsic_vdot4au_vv_nxv1i32_nxv1i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vv_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vdota4u.vv v8, v9, v10 +; CHECK-NEXT: vdot4au.vv v8, v9, v10 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv1i32.nxv1i32( + %a = call @llvm.riscv.vdot4au.nxv1i32.nxv1i32( %0, %1, %2, @@ -20,14 +20,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vv_nxv2i32_nxv2i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vv_nxv2i32_nxv2i32: +define @intrinsic_vdot4au_vv_nxv2i32_nxv2i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vv_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, 
tu, ma -; CHECK-NEXT: vdota4u.vv v8, v9, v10 +; CHECK-NEXT: vdot4au.vv v8, v9, v10 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4au.nxv2i32.nxv2i32( %0, %1, %2, @@ -36,14 +36,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vv_nxv4i32_nxv4i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vv_nxv4i32_nxv4i32: +define @intrinsic_vdot4au_vv_nxv4i32_nxv4i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vv_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vdota4u.vv v8, v10, v12 +; CHECK-NEXT: vdot4au.vv v8, v10, v12 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv4i32.nxv4i32( + %a = call @llvm.riscv.vdot4au.nxv4i32.nxv4i32( %0, %1, %2, @@ -52,14 +52,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vv_nxv8i32_nxv8i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vv_nxv8i32_nxv8i32: +define @intrinsic_vdot4au_vv_nxv8i32_nxv8i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vv_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vdota4u.vv v8, v12, v16 +; CHECK-NEXT: vdot4au.vv v8, v12, v16 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv8i32.nxv8i32( + %a = call @llvm.riscv.vdot4au.nxv8i32.nxv8i32( %0, %1, %2, @@ -68,15 +68,15 @@ entry: ret %a } -define @intrinsic_vdota4u_vv_nxv16i32_nxv16i32( %0, %1, %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vv_nxv16i32_nxv16i32: +define @intrinsic_vdot4au_vv_nxv16i32_nxv16i32( %0, %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vv_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma -; CHECK-NEXT: vdota4u.vv v8, v16, v24 +; CHECK-NEXT: vdot4au.vv v8, v16, v24 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv16i32.nxv16i32( + %a = call 
@llvm.riscv.vdot4au.nxv16i32.nxv16i32( %0, %1, %2, @@ -85,14 +85,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vv_nxv1i32_nxv1i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vv_nxv1i32_nxv1i32: +define @intrinsic_vdot4au_mask_vv_nxv1i32_nxv1i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vv_nxv1i32_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vdota4u.vv v8, v9, v10, v0.t +; CHECK-NEXT: vdot4au.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv1i32.nxv1i32( + %a = call @llvm.riscv.vdot4au.mask.nxv1i32.nxv1i32( %0, %1, %2, @@ -102,14 +102,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vv_nxv2i32_nxv2i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vv_nxv2i32_nxv2i32: +define @intrinsic_vdot4au_mask_vv_nxv2i32_nxv2i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vv_nxv2i32_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vdota4u.vv v8, v9, v10, v0.t +; CHECK-NEXT: vdot4au.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv2i32.nxv2i32( + %a = call @llvm.riscv.vdot4au.mask.nxv2i32.nxv2i32( %0, %1, %2, @@ -119,14 +119,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vv_nxv4i32_nxv4i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vv_nxv4i32_nxv4i32: +define @intrinsic_vdot4au_mask_vv_nxv4i32_nxv4i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vv_nxv4i32_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vdota4u.vv v8, v10, v12, v0.t +; CHECK-NEXT: vdot4au.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv4i32.nxv4i32( + %a = call @llvm.riscv.vdot4au.mask.nxv4i32.nxv4i32( %0, %1, %2, @@ -136,14 +136,14 @@ 
entry: ret %a } -define @intrinsic_vdota4u_mask_vv_nxv8i32_nxv8i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vv_nxv8i32_nxv8i32: +define @intrinsic_vdot4au_mask_vv_nxv8i32_nxv8i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vv_nxv8i32_nxv8i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vdota4u.vv v8, v12, v16, v0.t +; CHECK-NEXT: vdot4au.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv8i32.nxv8i32( + %a = call @llvm.riscv.vdot4au.mask.nxv8i32.nxv8i32( %0, %1, %2, @@ -153,15 +153,15 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vv_nxv16i32_nxv16i32( %0, %1, %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vv_nxv16i32_nxv16i32: +define @intrinsic_vdot4au_mask_vv_nxv16i32_nxv16i32( %0, %1, %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vv_nxv16i32_nxv16i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4u.vv v8, v16, v24, v0.t +; CHECK-NEXT: vdot4au.vv v8, v16, v24, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv16i32.nxv16i32( + %a = call @llvm.riscv.vdot4au.mask.nxv16i32.nxv16i32( %0, %1, %2, @@ -171,14 +171,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vx_nxv1i32_i32: +define @intrinsic_vdot4au_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vdota4u.vx v8, v9, a0 +; CHECK-NEXT: vdot4au.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv1i32.i32( + %a = call @llvm.riscv.vdot4au.nxv1i32.i32( %0, %1, i32 %2, @@ -187,14 +187,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) 
nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vx_nxv2i32_i32: +define @intrinsic_vdot4au_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma -; CHECK-NEXT: vdota4u.vx v8, v9, a0 +; CHECK-NEXT: vdot4au.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv2i32.i32( + %a = call @llvm.riscv.vdot4au.nxv2i32.i32( %0, %1, i32 %2, @@ -203,14 +203,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vx_nxv4i32_i32: +define @intrinsic_vdot4au_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma -; CHECK-NEXT: vdota4u.vx v8, v10, a0 +; CHECK-NEXT: vdot4au.vx v8, v10, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv4i32.i32( + %a = call @llvm.riscv.vdot4au.nxv4i32.i32( %0, %1, i32 %2, @@ -219,14 +219,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vx_nxv8i32_i32: +define @intrinsic_vdot4au_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, ma -; CHECK-NEXT: vdota4u.vx v8, v12, a0 +; CHECK-NEXT: vdot4au.vx v8, v12, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv8i32.i32( + %a = call @llvm.riscv.vdot4au.nxv8i32.i32( %0, %1, i32 %2, @@ -235,14 +235,14 @@ entry: ret %a } -define @intrinsic_vdota4u_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_vx_nxv16i32_i32: +define @intrinsic_vdot4au_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, 
m8, tu, ma -; CHECK-NEXT: vdota4u.vx v8, v16, a0 +; CHECK-NEXT: vdot4au.vx v8, v16, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.nxv16i32.i32( + %a = call @llvm.riscv.vdot4au.nxv16i32.i32( %0, %1, i32 %2, @@ -251,14 +251,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vx_nxv1i32_i32: +define @intrinsic_vdot4au_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vdota4u.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4au.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv1i32.i32( + %a = call @llvm.riscv.vdot4au.mask.nxv1i32.i32( %0, %1, i32 %2, @@ -268,14 +268,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vx_nxv2i32_i32: +define @intrinsic_vdot4au_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vdota4u.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4au.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv2i32.i32( + %a = call @llvm.riscv.vdot4au.mask.nxv2i32.i32( %0, %1, i32 %2, @@ -285,14 +285,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vx_nxv4i32_i32: +define @intrinsic_vdot4au_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu -; CHECK-NEXT: vdota4u.vx v8, v10, a0, v0.t +; CHECK-NEXT: vdot4au.vx v8, v10, a0, v0.t ; CHECK-NEXT: ret entry: - %a = 
call @llvm.riscv.vdota4u.mask.nxv4i32.i32( + %a = call @llvm.riscv.vdot4au.mask.nxv4i32.i32( %0, %1, i32 %2, @@ -302,14 +302,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vx_nxv8i32_i32: +define @intrinsic_vdot4au_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu -; CHECK-NEXT: vdota4u.vx v8, v12, a0, v0.t +; CHECK-NEXT: vdot4au.vx v8, v12, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv8i32.i32( + %a = call @llvm.riscv.vdot4au.mask.nxv8i32.i32( %0, %1, i32 %2, @@ -319,14 +319,14 @@ entry: ret %a } -define @intrinsic_vdota4u_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4u_mask_vx_nxv16i32_i32: +define @intrinsic_vdot4au_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4au_mask_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4u.vx v8, v16, a0, v0.t +; CHECK-NEXT: vdot4au.vx v8, v16, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4u.mask.nxv16i32.i32( + %a = call @llvm.riscv.vdot4au.mask.nxv16i32.i32( %0, %1, i32 %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/vdota4us.ll b/llvm/test/CodeGen/RISCV/rvv/vdot4aus.ll similarity index 62% rename from llvm/test/CodeGen/RISCV/rvv/vdota4us.ll rename to llvm/test/CodeGen/RISCV/rvv/vdot4aus.ll index 5b84572a790de..b52219490adda 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdota4us.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdot4aus.ll @@ -4,14 +4,14 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+experimental-zvdot4a8i \ ; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK -define @intrinsic_vdota4us_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; 
CHECK-LABEL: intrinsic_vdota4us_vx_nxv1i32_i32: +define @intrinsic_vdot4aus_vx_nxv1i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vdota4us.vx v8, v9, a0 +; CHECK-NEXT: vdot4aus.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.nxv1i32.i32( + %a = call @llvm.riscv.vdot4aus.nxv1i32.i32( %0, %1, i32 %2, @@ -20,14 +20,14 @@ entry: ret %a } -define @intrinsic_vdota4us_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_vx_nxv2i32_i32: +define @intrinsic_vdot4aus_vx_nxv2i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma -; CHECK-NEXT: vdota4us.vx v8, v9, a0 +; CHECK-NEXT: vdot4aus.vx v8, v9, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.nxv2i32.i32( + %a = call @llvm.riscv.vdot4aus.nxv2i32.i32( %0, %1, i32 %2, @@ -36,14 +36,14 @@ entry: ret %a } -define @intrinsic_vdota4us_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_vx_nxv4i32_i32: +define @intrinsic_vdot4aus_vx_nxv4i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma -; CHECK-NEXT: vdota4us.vx v8, v10, a0 +; CHECK-NEXT: vdot4aus.vx v8, v10, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.nxv4i32.i32( + %a = call @llvm.riscv.vdot4aus.nxv4i32.i32( %0, %1, i32 %2, @@ -52,14 +52,14 @@ entry: ret %a } -define @intrinsic_vdota4us_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_vx_nxv8i32_i32: +define @intrinsic_vdot4aus_vx_nxv8i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, 
m4, tu, ma -; CHECK-NEXT: vdota4us.vx v8, v12, a0 +; CHECK-NEXT: vdot4aus.vx v8, v12, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.nxv8i32.i32( + %a = call @llvm.riscv.vdot4aus.nxv8i32.i32( %0, %1, i32 %2, @@ -68,14 +68,14 @@ entry: ret %a } -define @intrinsic_vdota4us_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_vx_nxv16i32_i32: +define @intrinsic_vdot4aus_vx_nxv16i32_i32( %0, %1, i32 %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma -; CHECK-NEXT: vdota4us.vx v8, v16, a0 +; CHECK-NEXT: vdot4aus.vx v8, v16, a0 ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.nxv16i32.i32( + %a = call @llvm.riscv.vdot4aus.nxv16i32.i32( %0, %1, i32 %2, @@ -84,14 +84,14 @@ entry: ret %a } -define @intrinsic_vdota4us_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_mask_vx_nxv1i32_i32: +define @intrinsic_vdot4aus_mask_vx_nxv1i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_mask_vx_nxv1i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vdota4us.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4aus.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.mask.nxv1i32.i32( + %a = call @llvm.riscv.vdot4aus.mask.nxv1i32.i32( %0, %1, i32 %2, @@ -101,14 +101,14 @@ entry: ret %a } -define @intrinsic_vdota4us_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_mask_vx_nxv2i32_i32: +define @intrinsic_vdot4aus_mask_vx_nxv2i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_mask_vx_nxv2i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vdota4us.vx v8, v9, a0, v0.t +; CHECK-NEXT: vdot4aus.vx v8, v9, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call 
@llvm.riscv.vdota4us.mask.nxv2i32.i32( + %a = call @llvm.riscv.vdot4aus.mask.nxv2i32.i32( %0, %1, i32 %2, @@ -118,14 +118,14 @@ entry: ret %a } -define @intrinsic_vdota4us_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_mask_vx_nxv4i32_i32: +define @intrinsic_vdot4aus_mask_vx_nxv4i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_mask_vx_nxv4i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu -; CHECK-NEXT: vdota4us.vx v8, v10, a0, v0.t +; CHECK-NEXT: vdot4aus.vx v8, v10, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.mask.nxv4i32.i32( + %a = call @llvm.riscv.vdot4aus.mask.nxv4i32.i32( %0, %1, i32 %2, @@ -135,14 +135,14 @@ entry: ret %a } -define @intrinsic_vdota4us_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_mask_vx_nxv8i32_i32: +define @intrinsic_vdot4aus_mask_vx_nxv8i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_mask_vx_nxv8i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu -; CHECK-NEXT: vdota4us.vx v8, v12, a0, v0.t +; CHECK-NEXT: vdot4aus.vx v8, v12, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.mask.nxv8i32.i32( + %a = call @llvm.riscv.vdot4aus.mask.nxv8i32.i32( %0, %1, i32 %2, @@ -152,14 +152,14 @@ entry: ret %a } -define @intrinsic_vdota4us_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { -; CHECK-LABEL: intrinsic_vdota4us_mask_vx_nxv16i32_i32: +define @intrinsic_vdot4aus_mask_vx_nxv16i32_i32( %0, %1, i32 %2, %m, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vdot4aus_mask_vx_nxv16i32_i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu -; CHECK-NEXT: vdota4us.vx v8, v16, a0, v0.t +; CHECK-NEXT: vdot4aus.vx v8, v16, a0, v0.t ; CHECK-NEXT: ret entry: - %a = call @llvm.riscv.vdota4us.mask.nxv16i32.i32( + %a = call 
@llvm.riscv.vdot4aus.mask.nxv16i32.i32( %0, %1, i32 %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/zvdot4a8i-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/zvdot4a8i-sdnode.ll index 7a54cee626e4b..f12078dae5e1f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zvdot4a8i-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/zvdot4a8i-sdnode.ll @@ -4,8 +4,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvdot4a8i -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT32 ; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvdot4a8i -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT64 -define i32 @vdota4_vv( %a, %b) { -; NODOT-LABEL: vdota4_vv: +define i32 @vdot4a_vv( %a, %b) { +; NODOT-LABEL: vdot4a_vv: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -17,11 +17,11 @@ define i32 @vdota4_vv( %a, %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv: +; DOT-LABEL: vdot4a_vv: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4.vv v12, v8, v10 +; DOT-NEXT: vdot4a.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -34,8 +34,8 @@ entry: ret i32 %res } -define i32 @vdota4_vx_constant( %a) { -; CHECK-LABEL: vdota4_vx_constant: +define i32 @vdot4a_vx_constant( %a) { +; CHECK-LABEL: vdot4a_vx_constant: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vsext.vf2 v16, v8 @@ -53,8 +53,8 @@ entry: ret i32 %res } -define i32 @vdota4_vx_constant_swapped( %a) { -; CHECK-LABEL: vdota4_vx_constant_swapped: +define i32 @vdot4a_vx_constant_swapped( %a) { +; CHECK-LABEL: vdot4a_vx_constant_swapped: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vsext.vf2 v16, v8 @@ -72,8 +72,8 @@ entry: ret i32 %res } -define i32 @vdota4u_vv( %a, %b) { -; NODOT-LABEL: vdota4u_vv: +define i32 
@vdot4au_vv( %a, %b) { +; NODOT-LABEL: vdot4au_vv: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; NODOT-NEXT: vwmulu.vv v12, v8, v10 @@ -85,11 +85,11 @@ define i32 @vdota4u_vv( %a, %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv: +; DOT-LABEL: vdot4au_vv: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4u.vv v12, v8, v10 +; DOT-NEXT: vdot4au.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -102,8 +102,8 @@ entry: ret i32 %res } -define i32 @vdota4u_vx_constant( %a) { -; CHECK-LABEL: vdota4u_vx_constant: +define i32 @vdot4au_vx_constant( %a) { +; CHECK-LABEL: vdot4au_vx_constant: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vzext.vf2 v16, v8 @@ -121,8 +121,8 @@ entry: ret i32 %res } -define i32 @vdota4su_vv( %a, %b) { -; NODOT-LABEL: vdota4su_vv: +define i32 @vdot4asu_vv( %a, %b) { +; NODOT-LABEL: vdot4asu_vv: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -134,11 +134,11 @@ define i32 @vdota4su_vv( %a, %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv: +; DOT-LABEL: vdot4asu_vv: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4su.vv v12, v8, v10 +; DOT-NEXT: vdot4asu.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -151,8 +151,8 @@ entry: ret i32 %res } -define i32 @vdota4su_vv_swapped( %a, %b) { -; NODOT-LABEL: vdota4su_vv_swapped: +define i32 @vdot4asu_vv_swapped( %a, %b) { +; NODOT-LABEL: vdot4asu_vv_swapped: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -164,11 +164,11 @@ define i32 @vdota4su_vv_swapped( %a, %b) { ; NODOT-NEXT: vmv.x.s a0, v8 ; 
NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_swapped: +; DOT-LABEL: vdot4asu_vv_swapped: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4su.vv v12, v8, v10 +; DOT-NEXT: vdot4asu.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -236,7 +236,7 @@ define i32 @reduce_of_sext( %a) { ; DOT-NEXT: vmv.v.i v10, 1 ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4.vv v12, v8, v10 +; DOT-NEXT: vdot4a.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -263,7 +263,7 @@ define i32 @reduce_of_zext( %a) { ; DOT-NEXT: vmv.v.i v10, 1 ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4u.vv v12, v8, v10 +; DOT-NEXT: vdot4au.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -274,8 +274,8 @@ entry: ret i32 %res } -define i32 @vdota4_vv_accum( %a, %b, %x) { -; NODOT-LABEL: vdota4_vv_accum: +define i32 @vdot4a_vv_accum( %a, %b, %x) { +; NODOT-LABEL: vdot4a_vv_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -287,11 +287,11 @@ define i32 @vdota4_vv_accum( %a, %b, %a, %b, %x) { -; NODOT-LABEL: vdota4u_vv_accum: +define i32 @vdot4au_vv_accum( %a, %b, %x) { +; NODOT-LABEL: vdot4au_vv_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; NODOT-NEXT: vwmulu.vv v12, v8, v10 @@ -318,11 +318,11 @@ define i32 @vdota4u_vv_accum( %a, %b, %a, %b, %x) { -; NODOT-LABEL: vdota4su_vv_accum: +define i32 @vdot4asu_vv_accum( %a, %b, %x) { +; NODOT-LABEL: vdot4asu_vv_accum: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v12, v8 @@ -349,11 +349,11 @@ define i32 @vdota4su_vv_accum( %a, %b, %a, %b, i32 %x) { -; NODOT-LABEL: 
vdota4_vv_scalar_add: +define i32 @vdot4a_vv_scalar_add( %a, %b, i32 %x) { +; NODOT-LABEL: vdot4a_vv_scalar_add: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -380,11 +380,11 @@ define i32 @vdota4_vv_scalar_add( %a, %b, i ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4_vv_scalar_add: +; DOT-LABEL: vdot4a_vv_scalar_add: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4.vv v12, v8, v10 +; DOT-NEXT: vdot4a.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, a0 ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -398,8 +398,8 @@ entry: ret i32 %add } -define i32 @vdota4u_vv_scalar_add( %a, %b, i32 %x) { -; NODOT-LABEL: vdota4u_vv_scalar_add: +define i32 @vdot4au_vv_scalar_add( %a, %b, i32 %x) { +; NODOT-LABEL: vdot4au_vv_scalar_add: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; NODOT-NEXT: vwmulu.vv v12, v8, v10 @@ -411,11 +411,11 @@ define i32 @vdota4u_vv_scalar_add( %a, %b, ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4u_vv_scalar_add: +; DOT-LABEL: vdot4au_vv_scalar_add: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4u.vv v12, v8, v10 +; DOT-NEXT: vdot4au.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, a0 ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -429,8 +429,8 @@ entry: ret i32 %add } -define i32 @vdota4su_vv_scalar_add( %a, %b, i32 %x) { -; NODOT-LABEL: vdota4su_vv_scalar_add: +define i32 @vdot4asu_vv_scalar_add( %a, %b, i32 %x) { +; NODOT-LABEL: vdot4asu_vv_scalar_add: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -442,11 +442,11 @@ define i32 @vdota4su_vv_scalar_add( %a, %b, ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; -; DOT-LABEL: vdota4su_vv_scalar_add: +; DOT-LABEL: vdot4asu_vv_scalar_add: ; DOT: # %bb.0: 
# %entry ; DOT-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 -; DOT-NEXT: vdota4su.vv v12, v8, v10 +; DOT-NEXT: vdot4asu.vv v12, v8, v10 ; DOT-NEXT: vmv.s.x v8, a0 ; DOT-NEXT: vredsum.vs v8, v12, v8 ; DOT-NEXT: vmv.x.s a0, v8 @@ -460,8 +460,8 @@ entry: ret i32 %add } -define i32 @vdota4_vv_split( %a, %b, %c, %d) { -; NODOT-LABEL: vdota4_vv_split: +define i32 @vdot4a_vv_split( %a, %b, %c, %d) { +; NODOT-LABEL: vdot4a_vv_split: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; NODOT-NEXT: vsext.vf2 v16, v8 @@ -476,12 +476,12 @@ define i32 @vdota4_vv_split( %a, %b, @partial_reduce_nf2( %a, @partial_reduce_m1( %a, @partial_reduce_m2( %a, @partial_reduce_m4( %a, @partial_reduce_m8( %a, @partial_reduce_m16( %a, @partial_reduce_m16( %a, @partial_reduce_accum( %a, %res } -define @partial_reduce_vdota4u( %a, %b) { -; NODOT-LABEL: partial_reduce_vdota4u: +define @partial_reduce_vdot4au( %a, %b) { +; NODOT-LABEL: partial_reduce_vdot4au: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; NODOT-NEXT: vwmulu.vv v10, v8, v9 @@ -894,11 +894,11 @@ define @partial_reduce_vdota4u( %a, %res } -define @partial_reduce_vdota4su( %a, %b) { -; NODOT-LABEL: partial_reduce_vdota4su: +define @partial_reduce_vdot4asu( %a, %b) { +; NODOT-LABEL: partial_reduce_vdot4asu: ; NODOT: # %bb.0: # %entry ; NODOT-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; NODOT-NEXT: vsext.vf2 v10, v8 @@ -927,11 +927,11 @@ define @partial_reduce_vdota4su( %a, @partial_of_sext( %a) { ; DOT-NEXT: vmv.v.i v12, 1 ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4.vv v10, v8, v12 +; DOT-NEXT: vdot4a.vv v10, v8, v12 ; DOT-NEXT: vmv.v.v v8, v10 ; DOT-NEXT: ret entry: @@ -986,7 +986,7 @@ define @partial_of_zext( %a) { ; DOT-NEXT: vmv.v.i v12, 1 ; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; DOT-NEXT: vmv.v.i v10, 0 -; DOT-NEXT: vdota4u.vv v10, v8, v12 +; DOT-NEXT: vdot4au.vv v10, v8, v12 ; DOT-NEXT: vmv.v.v v8, v10 
; DOT-NEXT: ret entry: @@ -1018,7 +1018,7 @@ define @partial_reduce_select( %a, &1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR -vdota4.vv v0, v2, v4, v0.t +vdot4a.vv v0, v2, v4, v0.t # CHECK-ERROR: the destination vector register group cannot overlap the mask register -# CHECK-ERROR-LABEL: vdota4.vv v0, v2, v4, v0.t +# CHECK-ERROR-LABEL: vdot4a.vv v0, v2, v4, v0.t -vdota4.vx v0, v2, a0, v0.t +vdot4a.vx v0, v2, a0, v0.t # CHECK-ERROR: the destination vector register group cannot overlap the mask register -# CHECK-ERROR-LABEL: vdota4.vx v0, v2, a0, v0.t +# CHECK-ERROR-LABEL: vdot4a.vx v0, v2, a0, v0.t diff --git a/llvm/test/MC/RISCV/rvv/zvdot4a8i.s b/llvm/test/MC/RISCV/rvv/zvdot4a8i.s index 8797f621b360f..fe217bb6d72f6 100644 --- a/llvm/test/MC/RISCV/rvv/zvdot4a8i.s +++ b/llvm/test/MC/RISCV/rvv/zvdot4a8i.s @@ -6,72 +6,72 @@ # RUN: | llvm-objdump -d --mattr=+experimental-zvdot4a8i - \ # RUN: | FileCheck %s --check-prefix=CHECK-INST -vdota4.vv v8, v4, v20, v0.t -# CHECK-INST: vdota4.vv v8, v4, v20, v0.t +vdot4a.vv v8, v4, v20, v0.t +# CHECK-INST: vdot4a.vv v8, v4, v20, v0.t # CHECK-ENCODING: [0x57,0x24,0x4a,0xb0] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4.vv v8, v4, v20 -# CHECK-INST: vdota4.vv v8, v4, v20 +vdot4a.vv v8, v4, v20 +# CHECK-INST: vdot4a.vv v8, v4, v20 # CHECK-ENCODING: [0x57,0x24,0x4a,0xb2] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4u.vv v8, v4, v20, v0.t -# CHECK-INST: vdota4u.vv v8, v4, v20, v0.t +vdot4au.vv v8, v4, v20, v0.t +# CHECK-INST: vdot4au.vv v8, v4, v20, v0.t # CHECK-ENCODING: [0x57,0x24,0x4a,0xa0] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4u.vv v8, v4, v20 -# CHECK-INST: vdota4u.vv v8, v4, v20 +vdot4au.vv v8, v4, v20 +# CHECK-INST: vdot4au.vv v8, v4, v20 # CHECK-ENCODING: 
[0x57,0x24,0x4a,0xa2] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4su.vv v8, v4, v20, v0.t -# CHECK-INST: vdota4su.vv v8, v4, v20, v0.t +vdot4asu.vv v8, v4, v20, v0.t +# CHECK-INST: vdot4asu.vv v8, v4, v20, v0.t # CHECK-ENCODING: [0x57,0x24,0x4a,0xa8] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4su.vv v8, v4, v20 -# CHECK-INST: vdota4su.vv v8, v4, v20 +vdot4asu.vv v8, v4, v20 +# CHECK-INST: vdot4asu.vv v8, v4, v20 # CHECK-ENCODING: [0x57,0x24,0x4a,0xaa] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4.vx v8, v4, s4, v0.t -# CHECK-INST: vdota4.vx v8, v4, s4, v0.t +vdot4a.vx v8, v4, s4, v0.t +# CHECK-INST: vdot4a.vx v8, v4, s4, v0.t # CHECK-ENCODING: [0x57,0x64,0x4a,0xb0] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4.vx v8, v4, s4 -# CHECK-INST: vdota4.vx v8, v4, s4 +vdot4a.vx v8, v4, s4 +# CHECK-INST: vdot4a.vx v8, v4, s4 # CHECK-ENCODING: [0x57,0x64,0x4a,0xb2] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4u.vx v8, v4, s4, v0.t -# CHECK-INST: vdota4u.vx v8, v4, s4, v0.t +vdot4au.vx v8, v4, s4, v0.t +# CHECK-INST: vdot4au.vx v8, v4, s4, v0.t # CHECK-ENCODING: [0x57,0x64,0x4a,0xa0] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4u.vx v8, v4, s4 -# CHECK-INST: vdota4u.vx v8, v4, s4 +vdot4au.vx v8, v4, s4 +# CHECK-INST: vdot4au.vx v8, v4, s4 # CHECK-ENCODING: [0x57,0x64,0x4a,0xa2] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4su.vx v8, v4, s4, v0.t -# CHECK-INST: vdota4su.vx v8, v4, s4, 
v0.t +vdot4asu.vx v8, v4, s4, v0.t +# CHECK-INST: vdot4asu.vx v8, v4, s4, v0.t # CHECK-ENCODING: [0x57,0x64,0x4a,0xa8] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4su.vx v8, v4, s4 -# CHECK-INST: vdota4su.vx v8, v4, s4 +vdot4asu.vx v8, v4, s4 +# CHECK-INST: vdot4asu.vx v8, v4, s4 # CHECK-ENCODING: [0x57,0x64,0x4a,0xaa] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4us.vx v8, v4, s4, v0.t -# CHECK-INST: vdota4us.vx v8, v4, s4, v0.t +vdot4aus.vx v8, v4, s4, v0.t +# CHECK-INST: vdot4aus.vx v8, v4, s4, v0.t # CHECK-ENCODING: [0x57,0x64,0x4a,0xb8] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} -vdota4us.vx v8, v4, s4 -# CHECK-INST: vdota4us.vx v8, v4, s4 +vdot4aus.vx v8, v4, s4 +# CHECK-INST: vdot4aus.vx v8, v4, s4 # CHECK-ENCODING: [0x57,0x64,0x4a,0xba] # CHECK-ERROR: instruction requires the following: 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers){{$}} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index 49144f4ad048c..2c820e8af6ff6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -9,8 +9,8 @@ target triple = "riscv64-none-unknown-elf" -define i32 @vdota4(ptr %a, ptr %b) #0 { -; V-LABEL: define i32 @vdota4( +define i32 @vdot4a(ptr %a, ptr %b) #0 { +; V-LABEL: define i32 @vdot4a( ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -43,7 +43,7 @@ define i32 @vdota4(ptr %a, ptr %b) #0 { ; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; V: scalar.ph: ; -; ZVDOT4A8I-LABEL: define i32 
@vdota4( +; ZVDOT4A8I-LABEL: define i32 @vdot4a( ; ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; ZVDOT4A8I-NEXT: entry: ; ZVDOT4A8I-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -76,7 +76,7 @@ define i32 @vdota4(ptr %a, ptr %b) #0 { ; ZVDOT4A8I-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; ZVDOT4A8I: scalar.ph: ; -; FIXED-V-LABEL: define i32 @vdota4( +; FIXED-V-LABEL: define i32 @vdot4a( ; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; FIXED-V-NEXT: entry: ; FIXED-V-NEXT: br label [[VECTOR_PH:%.*]] @@ -112,7 +112,7 @@ define i32 @vdota4(ptr %a, ptr %b) #0 { ; FIXED-V: for.exit: ; FIXED-V-NEXT: ret i32 [[TMP15]] ; -; FIXED-ZVDOT4A8I-LABEL: define i32 @vdota4( +; FIXED-ZVDOT4A8I-LABEL: define i32 @vdot4a( ; FIXED-ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; FIXED-ZVDOT4A8I-NEXT: entry: ; FIXED-ZVDOT4A8I-NEXT: br label [[VECTOR_PH:%.*]] @@ -148,7 +148,7 @@ define i32 @vdota4(ptr %a, ptr %b) #0 { ; FIXED-ZVDOT4A8I: for.exit: ; FIXED-ZVDOT4A8I-NEXT: ret i32 [[TMP13]] ; -; TAILFOLD-LABEL: define i32 @vdota4( +; TAILFOLD-LABEL: define i32 @vdot4a( ; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; TAILFOLD-NEXT: entry: ; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] @@ -202,8 +202,8 @@ for.exit: } -define i32 @vdota4u(ptr %a, ptr %b) #0 { -; V-LABEL: define i32 @vdota4u( +define i32 @vdot4au(ptr %a, ptr %b) #0 { +; V-LABEL: define i32 @vdot4au( ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -236,7 +236,7 @@ define i32 @vdota4u(ptr %a, ptr %b) #0 { ; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; V: scalar.ph: ; -; ZVDOT4A8I-LABEL: define i32 @vdota4u( +; ZVDOT4A8I-LABEL: define i32 @vdot4au( ; ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; ZVDOT4A8I-NEXT: entry: ; ZVDOT4A8I-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -269,7 +269,7 @@ define i32 
@vdota4u(ptr %a, ptr %b) #0 { ; ZVDOT4A8I-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; ZVDOT4A8I: scalar.ph: ; -; FIXED-V-LABEL: define i32 @vdota4u( +; FIXED-V-LABEL: define i32 @vdot4au( ; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; FIXED-V-NEXT: entry: ; FIXED-V-NEXT: br label [[VECTOR_PH:%.*]] @@ -305,7 +305,7 @@ define i32 @vdota4u(ptr %a, ptr %b) #0 { ; FIXED-V: for.exit: ; FIXED-V-NEXT: ret i32 [[TMP15]] ; -; FIXED-ZVDOT4A8I-LABEL: define i32 @vdota4u( +; FIXED-ZVDOT4A8I-LABEL: define i32 @vdot4au( ; FIXED-ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; FIXED-ZVDOT4A8I-NEXT: entry: ; FIXED-ZVDOT4A8I-NEXT: br label [[VECTOR_PH:%.*]] @@ -341,7 +341,7 @@ define i32 @vdota4u(ptr %a, ptr %b) #0 { ; FIXED-ZVDOT4A8I: for.exit: ; FIXED-ZVDOT4A8I-NEXT: ret i32 [[TMP13]] ; -; TAILFOLD-LABEL: define i32 @vdota4u( +; TAILFOLD-LABEL: define i32 @vdot4au( ; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; TAILFOLD-NEXT: entry: ; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] @@ -395,8 +395,8 @@ for.exit: } -define i32 @vdota4su(ptr %a, ptr %b) #0 { -; V-LABEL: define i32 @vdota4su( +define i32 @vdot4asu(ptr %a, ptr %b) #0 { +; V-LABEL: define i32 @vdot4asu( ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -429,7 +429,7 @@ define i32 @vdota4su(ptr %a, ptr %b) #0 { ; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; V: scalar.ph: ; -; ZVDOT4A8I-LABEL: define i32 @vdota4su( +; ZVDOT4A8I-LABEL: define i32 @vdot4asu( ; ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; ZVDOT4A8I-NEXT: entry: ; ZVDOT4A8I-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -462,7 +462,7 @@ define i32 @vdota4su(ptr %a, ptr %b) #0 { ; ZVDOT4A8I-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; ZVDOT4A8I: scalar.ph: ; -; FIXED-V-LABEL: define i32 @vdota4su( +; FIXED-V-LABEL: define i32 @vdot4asu( ; FIXED-V-SAME: ptr 
[[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; FIXED-V-NEXT: entry: ; FIXED-V-NEXT: br label [[VECTOR_PH:%.*]] @@ -498,7 +498,7 @@ define i32 @vdota4su(ptr %a, ptr %b) #0 { ; FIXED-V: for.exit: ; FIXED-V-NEXT: ret i32 [[TMP15]] ; -; FIXED-ZVDOT4A8I-LABEL: define i32 @vdota4su( +; FIXED-ZVDOT4A8I-LABEL: define i32 @vdot4asu( ; FIXED-ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; FIXED-ZVDOT4A8I-NEXT: entry: ; FIXED-ZVDOT4A8I-NEXT: br label [[VECTOR_PH:%.*]] @@ -534,7 +534,7 @@ define i32 @vdota4su(ptr %a, ptr %b) #0 { ; FIXED-ZVDOT4A8I: for.exit: ; FIXED-ZVDOT4A8I-NEXT: ret i32 [[TMP13]] ; -; TAILFOLD-LABEL: define i32 @vdota4su( +; TAILFOLD-LABEL: define i32 @vdot4asu( ; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; TAILFOLD-NEXT: entry: ; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] @@ -587,8 +587,8 @@ for.exit: ret i32 %add } -define i32 @vdota4su2(ptr %a, ptr %b) #0 { -; V-LABEL: define i32 @vdota4su2( +define i32 @vdot4asu2(ptr %a, ptr %b) #0 { +; V-LABEL: define i32 @vdot4asu2( ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -621,7 +621,7 @@ define i32 @vdota4su2(ptr %a, ptr %b) #0 { ; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; V: scalar.ph: ; -; ZVDOT4A8I-LABEL: define i32 @vdota4su2( +; ZVDOT4A8I-LABEL: define i32 @vdot4asu2( ; ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; ZVDOT4A8I-NEXT: entry: ; ZVDOT4A8I-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() @@ -654,7 +654,7 @@ define i32 @vdota4su2(ptr %a, ptr %b) #0 { ; ZVDOT4A8I-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; ZVDOT4A8I: scalar.ph: ; -; FIXED-V-LABEL: define i32 @vdota4su2( +; FIXED-V-LABEL: define i32 @vdot4asu2( ; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; FIXED-V-NEXT: entry: ; FIXED-V-NEXT: br label [[VECTOR_PH:%.*]] @@ -690,7 +690,7 @@ define i32 @vdota4su2(ptr %a, ptr %b) #0 { ; FIXED-V: for.exit: ; FIXED-V-NEXT: 
ret i32 [[TMP15]] ; -; FIXED-ZVDOT4A8I-LABEL: define i32 @vdota4su2( +; FIXED-ZVDOT4A8I-LABEL: define i32 @vdot4asu2( ; FIXED-ZVDOT4A8I-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; FIXED-ZVDOT4A8I-NEXT: entry: ; FIXED-ZVDOT4A8I-NEXT: br label [[VECTOR_PH:%.*]] @@ -726,7 +726,7 @@ define i32 @vdota4su2(ptr %a, ptr %b) #0 { ; FIXED-ZVDOT4A8I: for.exit: ; FIXED-ZVDOT4A8I-NEXT: ret i32 [[TMP13]] ; -; TAILFOLD-LABEL: define i32 @vdota4su2( +; TAILFOLD-LABEL: define i32 @vdot4asu2( ; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; TAILFOLD-NEXT: entry: ; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]] From 74991888cc6ca1b1a71b1fcf4b044c2c7fe5a088 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 11 May 2026 21:08:22 -0700 Subject: [PATCH 390/538] [CodeGen,IRNormalizer,BOLT] Duplicate stable hash_16_bytes locally. NFC (#196854) llvm/ADT/Hashing.h doesn't guarantee cross-process stability. llvm/ADT/StableHashing.h provides stability for a specific compiler version. They reserve the right the right to adjust the implementation as the compiler evolves. Some callers of hashing::detail::hash_16_bytes rely on hash output embedded in persisted artifacts: * MachineBlockHashInfo serializes block hashes into BB section profile data and pins the mixer's exact output with a static_assert. * BOLT's stale profile matching records block hashes in its on-disk profile format and replays them against potentially-rebuilt binaries (possibly built from a later LLVM revision). * IRNormalizer derives basic-block names from the hash; the names land in the normalized IR text used to diff modules. Move the implementation into each caller as a file-local static constexpr helper with a comment documenting why it is frozen. 
--- bolt/lib/Profile/StaleProfileMatching.cpp | 18 ++++++++++++-- llvm/lib/CodeGen/MachineBlockHashInfo.cpp | 28 +++++++++++++++------- llvm/lib/Transforms/Utils/IRNormalizer.cpp | 23 ++++++++++++++---- 3 files changed, 53 insertions(+), 16 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index f1cffde975520..90562b63aed1f 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -42,6 +42,20 @@ using namespace llvm; #undef DEBUG_TYPE #define DEBUG_TYPE "bolt-prof" +// Frozen mixer; the block hashes computed below participate in BOLT's +// stale profile matching, so this function's exact output is part of +// the on-disk profile format. Do not change without versioning that +// format. +static constexpr uint64_t hash_16_bytes(uint64_t low, uint64_t high) { + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + uint64_t a = (low ^ high) * kMul; + a ^= (a >> 47); + uint64_t b = (high ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + namespace opts { extern cl::opt TimeRewrite; @@ -414,7 +428,7 @@ void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const { uint64_t Hash = 0; for (BinaryBasicBlock *SuccBB : BB->successors()) { uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()]; - Hash = hashing::detail::hash_16_bytes(Hash, SuccHash); + Hash = hash_16_bytes(Hash, SuccHash); } if (HashFunction == HashFunction::StdHash) { // Compatibility with old behavior. @@ -427,7 +441,7 @@ void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const { Hash = 0; for (BinaryBasicBlock *PredBB : BB->predecessors()) { uint64_t PredHash = OpcodeHashes[PredBB->getIndex()]; - Hash = hashing::detail::hash_16_bytes(Hash, PredHash); + Hash = hash_16_bytes(Hash, PredHash); } if (HashFunction == HashFunction::StdHash) { // Compatibility with old behavior. 
diff --git a/llvm/lib/CodeGen/MachineBlockHashInfo.cpp b/llvm/lib/CodeGen/MachineBlockHashInfo.cpp index 27110b014d2ec..029788f811bcb 100644 --- a/llvm/lib/CodeGen/MachineBlockHashInfo.cpp +++ b/llvm/lib/CodeGen/MachineBlockHashInfo.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineBlockHashInfo.h" -#include "llvm/ADT/Hashing.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineStableHash.h" #include "llvm/CodeGen/Passes.h" @@ -21,16 +20,28 @@ using namespace llvm; +// Frozen mixer; the block hashes computed below are serialized into BB +// section profile data, so this function's exact output is part of the +// on-disk format. Do not change without versioning that format. +static constexpr uint64_t hash_16_bytes(uint64_t low, uint64_t high) { + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + uint64_t a = (low ^ high) * kMul; + a ^= (a >> 47); + uint64_t b = (high ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + static uint64_t hashBlock(const MachineBasicBlock &MBB, bool HashOperands) { uint64_t Hash = 0; for (const MachineInstr &MI : MBB) { if (MI.isMetaInstruction() || MI.isTerminator()) continue; - Hash = hashing::detail::hash_16_bytes(Hash, MI.getOpcode()); + Hash = hash_16_bytes(Hash, MI.getOpcode()); if (HashOperands) { for (unsigned i = 0; i < MI.getNumOperands(); i++) { - Hash = hashing::detail::hash_16_bytes( - Hash, stableHashValue(MI.getOperand(i))); + Hash = hash_16_bytes(Hash, stableHashValue(MI.getOperand(i))); } } } @@ -46,10 +57,9 @@ static constexpr uint16_t fold_64_to_16(const uint64_t Value) { return Res; } -// Keep stable to serialize data. 
-static_assert(hashing::detail::hash_16_bytes(1, 2) == 9684580150926652833ull, +static_assert(hash_16_bytes(1, 2) == 9684580150926652833ull, "Hash function must be stable"); -static_assert(hashing::detail::hash_16_bytes(-1, -2) == 7819786907124864172ull, +static_assert(hash_16_bytes(-1, -2) == 7819786907124864172ull, "Hash function must be stable"); static_assert(fold_64_to_16(1) == 1, "Fold function must be stable"); static_assert(fold_64_to_16(12345678) == 25074, "Fold function must be stable"); @@ -97,12 +107,12 @@ MachineBlockHashInfoResult::MachineBlockHashInfoResult( // Append hashes of successors for (const MachineBasicBlock *SuccMBB : MBB.successors()) { uint64_t SuccHash = HashInfos[SuccMBB].OpcodeHash; - Hash = hashing::detail::hash_16_bytes(Hash, SuccHash); + Hash = hash_16_bytes(Hash, SuccHash); } // Append hashes of predecessors for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { uint64_t PredHash = HashInfos[PredMBB].OpcodeHash; - Hash = hashing::detail::hash_16_bytes(Hash, PredHash); + Hash = hash_16_bytes(Hash, PredHash); } HashInfo.NeighborHash = Hash; } diff --git a/llvm/lib/Transforms/Utils/IRNormalizer.cpp b/llvm/lib/Transforms/Utils/IRNormalizer.cpp index df8dcddc7b32e..3f59044e0ab2e 100644 --- a/llvm/lib/Transforms/Utils/IRNormalizer.cpp +++ b/llvm/lib/Transforms/Utils/IRNormalizer.cpp @@ -30,6 +30,19 @@ using namespace llvm; +// Frozen mixer; basic-block names derived from these hashes appear in +// the normalized IR text and must be deterministic across processes +// for the normalizer's "compare normalized IR" workflow to work. +static constexpr uint64_t hash_16_bytes(uint64_t low, uint64_t high) { + const uint64_t kMul = 0x9ddfea08eb382d69ULL; + uint64_t a = (low ^ high) * kMul; + a ^= (a >> 47); + uint64_t b = (high ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + namespace { /// IRNormalizer aims to transform LLVM IR into normal form. 
class IRNormalizer { @@ -138,7 +151,7 @@ void IRNormalizer::nameBasicBlocks(Function &F) const { // Hash considering output instruction opcodes. for (auto &I : B) if (isOutput(&I)) - Hash = hashing::detail::hash_16_bytes(Hash, I.getOpcode()); + Hash = hash_16_bytes(Hash, I.getOpcode()); if (Options.RenameAll || B.getName().empty()) { // Name basic block. Substring hash to make diffs more readable. @@ -215,7 +228,7 @@ void IRNormalizer::nameAsInitialInstruction(Instruction *I) const { uint64_t Hash = MagicHashConstant; // Consider instruction's opcode in the hash. - Hash = hashing::detail::hash_16_bytes(Hash, I->getOpcode()); + Hash = hash_16_bytes(Hash, I->getOpcode()); SmallPtrSet Visited; // Get output footprint for I. @@ -223,7 +236,7 @@ void IRNormalizer::nameAsInitialInstruction(Instruction *I) const { // Consider output footprint in the hash. for (const int &Output : OutputFootprint) - Hash = hashing::detail::hash_16_bytes(Hash, Output); + Hash = hash_16_bytes(Hash, Output); // Base instruction name. SmallString<256> Name; @@ -298,7 +311,7 @@ void IRNormalizer::nameAsRegularInstruction(Instruction *I) { uint64_t Hash = MagicHashConstant; // Consider instruction opcode in the hash. - Hash = hashing::detail::hash_16_bytes(Hash, I->getOpcode()); + Hash = hash_16_bytes(Hash, I->getOpcode()); // Operand opcodes for further sorting (commutative). SmallVector OperandsOpcodes; @@ -312,7 +325,7 @@ void IRNormalizer::nameAsRegularInstruction(Instruction *I) { // Consider operand opcodes in the hash. for (const int Code : OperandsOpcodes) - Hash = hashing::detail::hash_16_bytes(Hash, Code); + Hash = hash_16_bytes(Hash, Code); // Base instruction name. SmallString<512> Name; From 5f686f14a3068c4c99bb4091d3a31fce10d5f525 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Mon, 11 May 2026 21:16:08 -0700 Subject: [PATCH 391/538] [RISCV] Use getSignedTargetConstant for QC_MULIADD (#197032) The coefficient is a signed value, and `getTargetConstant` is for unsigned values. 
This was causing us assertion failures. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/test/CodeGen/RISCV/xqciac.ll | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b3258898452f9..bdee637794756 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9584,7 +9584,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, if (MulImm != INT64_MIN && isInt<12>(MulImm - 1) && Subtarget.hasVendorXqciac()) return DAG.getNode(RISCVISD::QC_MULIADD, DL, VT, CondV, CondV, - DAG.getTargetConstant(MulImm - 1, DL, VT)); + DAG.getSignedTargetConstant(MulImm - 1, DL, VT)); // (select c, (1 << ShAmount) + 1, 0) -> (c << ShAmount) + c uint64_t TrueM1 = TrueC->getZExtValue() - 1; diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll index 0abe12171f136..ed5c574f382f9 100644 --- a/llvm/test/CodeGen/RISCV/xqciac.ll +++ b/llvm/test/CodeGen/RISCV/xqciac.ll @@ -705,3 +705,23 @@ define i64 @select_int64_min(i1 zeroext %x) { %select = select i1 %x, i64 -9223372036854775808, i64 0 ret i64 %select } + +define i32 @neg_coefficient(i1 zeroext %x) nounwind { +; RV32IM-LABEL: neg_coefficient: +; RV32IM: # %bb.0: +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: andi a0, a0, -32 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: neg_coefficient: +; RV32IMXQCIAC: # %bb.0: +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a0, -33 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: neg_coefficient: +; RV32IZBAMXQCIAC: # %bb.0: +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a0, -33 +; RV32IZBAMXQCIAC-NEXT: ret + %cond = select i1 %x, i32 -32, i32 0 + ret i32 %cond +} From 3ccf17db5e0c8358e40e9e4faf5d5f35ba48b7a5 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Mon, 11 May 2026 21:18:09 -0700 Subject: [PATCH 392/538] [clang][NFC] Remove non-builtin generic bit functions (#197069) 
The non-builtin generic bit functions stdc_leading_zeros stdc_leading_ones stdc_trailing_zeros stdc_trailing_ones stdc_first_leading_zero stdc_first_leading_one stdc_first_trailing_zero stdc_first_trailing_one stdc_count_zeros stdc_count_ones stdc_has_single_bit stdc_bit_width stdc_bit_floor stdc_bit_ceil could never be matched, and the corresponding BIstdc_leading_zeros BIstdc_leading_ones BIstdc_trailing_zeros BIstdc_trailing_ones BIstdc_first_leading_zero BIstdc_first_leading_one BIstdc_first_trailing_one BIstdc_first_trailing_zero BIstdc_count_zeros BIstdc_count_ones BIstdc_has_single_bit BIstdc_bit_floor BIstdc_bit_ceil BIstdc_bit_width cases in CGBuiltin.cpp, ExprConstant.cpp, InterpBuiltin.cpp, and SemaChecking.cpp are unreachable. Followup: #185978 --- clang/include/clang/Basic/Builtins.td | 84 ------------------------ clang/lib/AST/ByteCode/InterpBuiltin.cpp | 14 ---- clang/lib/AST/ExprConstant.cpp | 28 -------- clang/lib/CodeGen/CGBuiltin.cpp | 14 ---- clang/lib/Sema/SemaChecking.cpp | 14 ---- 5 files changed, 154 deletions(-) diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 4a7eaeb3d353e..5341a8c347f74 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -886,90 +886,6 @@ def StdcBitCeil: Builtin { let Prototype = "void(...)"; } -def StdcLeadingZerosLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_leading_zeros"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcLeadingOnesLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_leading_ones"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcTrailingZerosLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_trailing_zeros"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcTrailingOnesLib: LibBuiltin<"stdbit.h", 
"C23_LANG"> { - let Spellings = ["stdc_trailing_ones"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcFirstLeadingZeroLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_first_leading_zero"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcFirstLeadingOneLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_first_leading_one"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcFirstTrailingZeroLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_first_trailing_zero"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcFirstTrailingOneLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_first_trailing_one"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcCountZerosLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_count_zeros"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcCountOnesLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_count_ones"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcHasSingleBitLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_has_single_bit"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "bool(...)"; -} - -def StdcBitWidthLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_bit_width"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcBitFloorLib: LibBuiltin<"stdbit.h", "C23_LANG"> { - let Spellings = ["stdc_bit_floor"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - -def StdcBitCeilLib: LibBuiltin<"stdbit.h", "C23_LANG"> { 
- let Spellings = ["stdc_bit_ceil"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; - let Prototype = "void(...)"; -} - // Typed variants of the C23 stdbit.h builtins (e.g. stdc_leading_zeros_uc). // IntBitUtilTemplate generates the _uc/_us/_ui/_ul/_ull spellings with // concrete prototypes so the compiler can match the declarations. diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 11ca93c251380..5ba15b7ad4f63 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4661,7 +4661,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_leading_zeros_ui: case Builtin::BIstdc_leading_zeros_ul: case Builtin::BIstdc_leading_zeros_ull: - case Builtin::BIstdc_leading_zeros: case Builtin::BI__builtin_stdc_leading_zeros: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4675,7 +4674,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_leading_ones_ui: case Builtin::BIstdc_leading_ones_ul: case Builtin::BIstdc_leading_ones_ull: - case Builtin::BIstdc_leading_ones: case Builtin::BI__builtin_stdc_leading_ones: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4689,7 +4687,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_trailing_zeros_ui: case Builtin::BIstdc_trailing_zeros_ul: case Builtin::BIstdc_trailing_zeros_ull: - case Builtin::BIstdc_trailing_zeros: case Builtin::BI__builtin_stdc_trailing_zeros: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4703,7 +4700,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_trailing_ones_ui: case 
Builtin::BIstdc_trailing_ones_ul: case Builtin::BIstdc_trailing_ones_ull: - case Builtin::BIstdc_trailing_ones: case Builtin::BI__builtin_stdc_trailing_ones: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4717,7 +4713,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_first_leading_zero_ui: case Builtin::BIstdc_first_leading_zero_ul: case Builtin::BIstdc_first_leading_zero_ull: - case Builtin::BIstdc_first_leading_zero: case Builtin::BI__builtin_stdc_first_leading_zero: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4731,7 +4726,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_first_leading_one_ui: case Builtin::BIstdc_first_leading_one_ul: case Builtin::BIstdc_first_leading_one_ull: - case Builtin::BIstdc_first_leading_one: case Builtin::BI__builtin_stdc_first_leading_one: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4745,7 +4739,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_first_trailing_zero_ui: case Builtin::BIstdc_first_trailing_zero_ul: case Builtin::BIstdc_first_trailing_zero_ull: - case Builtin::BIstdc_first_trailing_zero: case Builtin::BI__builtin_stdc_first_trailing_zero: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4759,7 +4752,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_first_trailing_one_ui: case Builtin::BIstdc_first_trailing_one_ul: case Builtin::BIstdc_first_trailing_one_ull: - case Builtin::BIstdc_first_trailing_one: case Builtin::BI__builtin_stdc_first_trailing_one: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return 
interp__builtin_elementwise_int_unaryop( @@ -4773,7 +4765,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_count_zeros_ui: case Builtin::BIstdc_count_zeros_ul: case Builtin::BIstdc_count_zeros_ull: - case Builtin::BIstdc_count_zeros: case Builtin::BI__builtin_stdc_count_zeros: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4788,7 +4779,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_count_ones_ui: case Builtin::BIstdc_count_ones_ul: case Builtin::BIstdc_count_ones_ull: - case Builtin::BIstdc_count_ones: case Builtin::BI__builtin_stdc_count_ones: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4802,7 +4792,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_has_single_bit_ui: case Builtin::BIstdc_has_single_bit_ul: case Builtin::BIstdc_has_single_bit_ull: - case Builtin::BIstdc_has_single_bit: case Builtin::BI__builtin_stdc_has_single_bit: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4816,7 +4805,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_bit_width_ui: case Builtin::BIstdc_bit_width_ul: case Builtin::BIstdc_bit_width_ull: - case Builtin::BIstdc_bit_width: case Builtin::BI__builtin_stdc_bit_width: { unsigned ResWidth = S.getASTContext().getIntWidth(Call->getType()); return interp__builtin_elementwise_int_unaryop( @@ -4831,7 +4819,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_bit_floor_ui: case Builtin::BIstdc_bit_floor_ul: case Builtin::BIstdc_bit_floor_ull: - case Builtin::BIstdc_bit_floor: case Builtin::BI__builtin_stdc_bit_floor: return interp__builtin_elementwise_int_unaryop( S, OpPC, 
Call, [](const APSInt &Val) { @@ -4847,7 +4834,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BIstdc_bit_ceil_ui: case Builtin::BIstdc_bit_ceil_ul: case Builtin::BIstdc_bit_ceil_ull: - case Builtin::BIstdc_bit_ceil: case Builtin::BI__builtin_stdc_bit_ceil: return interp__builtin_elementwise_int_unaryop( S, OpPC, Call, [](const APSInt &Val) { diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3f3a80f5b77a3..81b42ef1467c7 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -16980,20 +16980,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_bit_ceil_ui: case Builtin::BIstdc_bit_ceil_ul: case Builtin::BIstdc_bit_ceil_ull: - case Builtin::BIstdc_leading_zeros: - case Builtin::BIstdc_leading_ones: - case Builtin::BIstdc_trailing_zeros: - case Builtin::BIstdc_trailing_ones: - case Builtin::BIstdc_first_leading_zero: - case Builtin::BIstdc_first_leading_one: - case Builtin::BIstdc_first_trailing_zero: - case Builtin::BIstdc_first_trailing_one: - case Builtin::BIstdc_count_zeros: - case Builtin::BIstdc_count_ones: - case Builtin::BIstdc_has_single_bit: - case Builtin::BIstdc_bit_width: - case Builtin::BIstdc_bit_floor: - case Builtin::BIstdc_bit_ceil: case Builtin::BI__builtin_stdc_leading_zeros: case Builtin::BI__builtin_stdc_leading_ones: case Builtin::BI__builtin_stdc_trailing_zeros: @@ -17021,7 +17007,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_leading_zeros_ui: case Builtin::BIstdc_leading_zeros_ul: case Builtin::BIstdc_leading_zeros_ull: - case Builtin::BIstdc_leading_zeros: case Builtin::BI__builtin_stdc_leading_zeros: return Success(APInt(ResBitWidth, Val.countl_zero()), E); case Builtin::BIstdc_leading_ones_uc: @@ -17029,7 +17014,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_leading_ones_ui: case Builtin::BIstdc_leading_ones_ul: case 
Builtin::BIstdc_leading_ones_ull: - case Builtin::BIstdc_leading_ones: case Builtin::BI__builtin_stdc_leading_ones: return Success(APInt(ResBitWidth, Val.countl_one()), E); case Builtin::BIstdc_trailing_zeros_uc: @@ -17037,7 +17021,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_trailing_zeros_ui: case Builtin::BIstdc_trailing_zeros_ul: case Builtin::BIstdc_trailing_zeros_ull: - case Builtin::BIstdc_trailing_zeros: case Builtin::BI__builtin_stdc_trailing_zeros: return Success(APInt(ResBitWidth, Val.countr_zero()), E); case Builtin::BIstdc_trailing_ones_uc: @@ -17045,7 +17028,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_trailing_ones_ui: case Builtin::BIstdc_trailing_ones_ul: case Builtin::BIstdc_trailing_ones_ull: - case Builtin::BIstdc_trailing_ones: case Builtin::BI__builtin_stdc_trailing_ones: return Success(APInt(ResBitWidth, Val.countr_one()), E); case Builtin::BIstdc_first_leading_zero_uc: @@ -17053,7 +17035,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_first_leading_zero_ui: case Builtin::BIstdc_first_leading_zero_ul: case Builtin::BIstdc_first_leading_zero_ull: - case Builtin::BIstdc_first_leading_zero: case Builtin::BI__builtin_stdc_first_leading_zero: return Success( APInt(ResBitWidth, Val.isAllOnes() ? 0 : Val.countl_one() + 1), E); @@ -17062,7 +17043,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_first_leading_one_ui: case Builtin::BIstdc_first_leading_one_ul: case Builtin::BIstdc_first_leading_one_ull: - case Builtin::BIstdc_first_leading_one: case Builtin::BI__builtin_stdc_first_leading_one: return Success( APInt(ResBitWidth, Val.isZero() ? 
0 : Val.countl_zero() + 1), E); @@ -17071,7 +17051,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_first_trailing_zero_ui: case Builtin::BIstdc_first_trailing_zero_ul: case Builtin::BIstdc_first_trailing_zero_ull: - case Builtin::BIstdc_first_trailing_zero: case Builtin::BI__builtin_stdc_first_trailing_zero: return Success( APInt(ResBitWidth, Val.isAllOnes() ? 0 : Val.countr_one() + 1), E); @@ -17080,7 +17059,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_first_trailing_one_ui: case Builtin::BIstdc_first_trailing_one_ul: case Builtin::BIstdc_first_trailing_one_ull: - case Builtin::BIstdc_first_trailing_one: case Builtin::BI__builtin_stdc_first_trailing_one: return Success( APInt(ResBitWidth, Val.isZero() ? 0 : Val.countr_zero() + 1), E); @@ -17089,7 +17067,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_count_zeros_ui: case Builtin::BIstdc_count_zeros_ul: case Builtin::BIstdc_count_zeros_ull: - case Builtin::BIstdc_count_zeros: case Builtin::BI__builtin_stdc_count_zeros: { APInt Cnt(ResBitWidth, BitWidth - Val.popcount()); return Success(APSInt(Cnt, /*IsUnsigned*/ true), E); @@ -17099,7 +17076,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_count_ones_ui: case Builtin::BIstdc_count_ones_ul: case Builtin::BIstdc_count_ones_ull: - case Builtin::BIstdc_count_ones: case Builtin::BI__builtin_stdc_count_ones: { APInt Cnt(ResBitWidth, Val.popcount()); return Success(APSInt(Cnt, /*IsUnsigned*/ true), E); @@ -17109,7 +17085,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_has_single_bit_ui: case Builtin::BIstdc_has_single_bit_ul: case Builtin::BIstdc_has_single_bit_ull: - case Builtin::BIstdc_has_single_bit: case Builtin::BI__builtin_stdc_has_single_bit: { APInt Res(ResBitWidth, Val.popcount() == 1 ? 
1 : 0); return Success(APSInt(Res, /*IsUnsigned*/ true), E); @@ -17119,7 +17094,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_bit_width_ui: case Builtin::BIstdc_bit_width_ul: case Builtin::BIstdc_bit_width_ull: - case Builtin::BIstdc_bit_width: case Builtin::BI__builtin_stdc_bit_width: return Success(APInt(ResBitWidth, BitWidth - Val.countl_zero()), E); case Builtin::BIstdc_bit_floor_uc: @@ -17127,7 +17101,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_bit_floor_ui: case Builtin::BIstdc_bit_floor_ul: case Builtin::BIstdc_bit_floor_ull: - case Builtin::BIstdc_bit_floor: case Builtin::BI__builtin_stdc_bit_floor: { if (Val.isZero()) return Success(APInt(BitWidth, 0), E); @@ -17140,7 +17113,6 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BIstdc_bit_ceil_ui: case Builtin::BIstdc_bit_ceil_ul: case Builtin::BIstdc_bit_ceil_ull: - case Builtin::BIstdc_bit_ceil: case Builtin::BI__builtin_stdc_bit_ceil: { if (Val.ule(1)) return Success(APSInt(APInt(BitWidth, 1), /*IsUnsigned*/ true), E); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1318641159212..f29e27818d7ec 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3794,7 +3794,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_leading_zeros_ui: case Builtin::BIstdc_leading_zeros_ul: case Builtin::BIstdc_leading_zeros_ull: - case Builtin::BIstdc_leading_zeros: case Builtin::BI__builtin_stdc_leading_zeros: return emitStdcCountIntrinsic(E, Intrinsic::ctlz, /*InvertArg=*/false); case Builtin::BIstdc_leading_ones_uc: @@ -3802,7 +3801,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_leading_ones_ui: case Builtin::BIstdc_leading_ones_ul: case Builtin::BIstdc_leading_ones_ull: - case Builtin::BIstdc_leading_ones: case 
Builtin::BI__builtin_stdc_leading_ones: return emitStdcCountIntrinsic(E, Intrinsic::ctlz, /*InvertArg=*/true); case Builtin::BIstdc_trailing_zeros_uc: @@ -3810,7 +3808,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_trailing_zeros_ui: case Builtin::BIstdc_trailing_zeros_ul: case Builtin::BIstdc_trailing_zeros_ull: - case Builtin::BIstdc_trailing_zeros: case Builtin::BI__builtin_stdc_trailing_zeros: return emitStdcCountIntrinsic(E, Intrinsic::cttz, /*InvertArg=*/false); case Builtin::BIstdc_trailing_ones_uc: @@ -3818,7 +3815,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_trailing_ones_ui: case Builtin::BIstdc_trailing_ones_ul: case Builtin::BIstdc_trailing_ones_ull: - case Builtin::BIstdc_trailing_ones: case Builtin::BI__builtin_stdc_trailing_ones: return emitStdcCountIntrinsic(E, Intrinsic::cttz, /*InvertArg=*/true); case Builtin::BIstdc_first_leading_zero_uc: @@ -3826,7 +3822,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_first_leading_zero_ui: case Builtin::BIstdc_first_leading_zero_ul: case Builtin::BIstdc_first_leading_zero_ull: - case Builtin::BIstdc_first_leading_zero: case Builtin::BI__builtin_stdc_first_leading_zero: return emitStdcFirstBit(E, Intrinsic::ctlz, /*InvertArg=*/true); case Builtin::BIstdc_first_leading_one_uc: @@ -3834,7 +3829,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_first_leading_one_ui: case Builtin::BIstdc_first_leading_one_ul: case Builtin::BIstdc_first_leading_one_ull: - case Builtin::BIstdc_first_leading_one: case Builtin::BI__builtin_stdc_first_leading_one: return emitStdcFirstBit(E, Intrinsic::ctlz, /*InvertArg=*/false); case Builtin::BIstdc_first_trailing_zero_uc: @@ -3842,7 +3836,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case 
Builtin::BIstdc_first_trailing_zero_ui: case Builtin::BIstdc_first_trailing_zero_ul: case Builtin::BIstdc_first_trailing_zero_ull: - case Builtin::BIstdc_first_trailing_zero: case Builtin::BI__builtin_stdc_first_trailing_zero: return emitStdcFirstBit(E, Intrinsic::cttz, /*InvertArg=*/true); case Builtin::BIstdc_first_trailing_one_uc: @@ -3850,7 +3843,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_first_trailing_one_ui: case Builtin::BIstdc_first_trailing_one_ul: case Builtin::BIstdc_first_trailing_one_ull: - case Builtin::BIstdc_first_trailing_one: case Builtin::BI__builtin_stdc_first_trailing_one: return emitStdcFirstBit(E, Intrinsic::cttz, /*InvertArg=*/false); case Builtin::BIstdc_count_zeros_uc: @@ -3858,7 +3850,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_count_zeros_ui: case Builtin::BIstdc_count_zeros_ul: case Builtin::BIstdc_count_zeros_ull: - case Builtin::BIstdc_count_zeros: case Builtin::BI__builtin_stdc_count_zeros: return emitStdcBitWidthMinus(E, Intrinsic::ctpop, /*IsPop=*/true); case Builtin::BIstdc_count_ones_uc: @@ -3866,7 +3857,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_count_ones_ui: case Builtin::BIstdc_count_ones_ul: case Builtin::BIstdc_count_ones_ull: - case Builtin::BIstdc_count_ones: case Builtin::BI__builtin_stdc_count_ones: return emitStdcCountIntrinsic(E, Intrinsic::ctpop, /*InvertArg=*/false, /*IsPop=*/true); @@ -3875,7 +3865,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_has_single_bit_ui: case Builtin::BIstdc_has_single_bit_ul: case Builtin::BIstdc_has_single_bit_ull: - case Builtin::BIstdc_has_single_bit: case Builtin::BI__builtin_stdc_has_single_bit: { Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); @@ -3889,7 +3878,6 @@ RValue 
CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_bit_width_ui: case Builtin::BIstdc_bit_width_ul: case Builtin::BIstdc_bit_width_ull: - case Builtin::BIstdc_bit_width: case Builtin::BI__builtin_stdc_bit_width: return emitStdcBitWidthMinus(E, Intrinsic::ctlz, /*IsPop=*/false); case Builtin::BIstdc_bit_floor_uc: @@ -3897,7 +3885,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_bit_floor_ui: case Builtin::BIstdc_bit_floor_ul: case Builtin::BIstdc_bit_floor_ull: - case Builtin::BIstdc_bit_floor: case Builtin::BI__builtin_stdc_bit_floor: { Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); @@ -3918,7 +3905,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIstdc_bit_ceil_ui: case Builtin::BIstdc_bit_ceil_ul: case Builtin::BIstdc_bit_ceil_ull: - case Builtin::BIstdc_bit_ceil: case Builtin::BI__builtin_stdc_bit_ceil: { Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 4706fa5d3cde0..530587208cce8 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3865,13 +3865,10 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_stdc_bit_floor: case Builtin::BI__builtin_stdc_bit_ceil: - case Builtin::BIstdc_bit_floor: - case Builtin::BIstdc_bit_ceil: if (BuiltinStdCBuiltin(*this, TheCall, QualType())) return ExprError(); break; case Builtin::BI__builtin_stdc_has_single_bit: - case Builtin::BIstdc_has_single_bit: if (BuiltinStdCBuiltin(*this, TheCall, Context.BoolTy)) return ExprError(); break; @@ -3886,17 +3883,6 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_stdc_count_zeros: case Builtin::BI__builtin_stdc_count_ones: case 
Builtin::BI__builtin_stdc_bit_width: - case Builtin::BIstdc_leading_zeros: - case Builtin::BIstdc_leading_ones: - case Builtin::BIstdc_trailing_zeros: - case Builtin::BIstdc_trailing_ones: - case Builtin::BIstdc_first_leading_zero: - case Builtin::BIstdc_first_leading_one: - case Builtin::BIstdc_first_trailing_zero: - case Builtin::BIstdc_first_trailing_one: - case Builtin::BIstdc_count_zeros: - case Builtin::BIstdc_count_ones: - case Builtin::BIstdc_bit_width: if (BuiltinStdCBuiltin(*this, TheCall, Context.UnsignedIntTy)) return ExprError(); break; From f5b99951729a212b20c908ba6e270fe18a426a01 Mon Sep 17 00:00:00 2001 From: Will Hawkins Date: Tue, 12 May 2026 00:32:53 -0400 Subject: [PATCH 393/538] [lldb] Fix Broken Test For Embedded Source in Binary (#197040) A previous change to fix a regression when displaying source of programs that embed their source code into their debugging information (using, e.g., DW_LNCT_LLVM_source) should display included a test that was not appropriately robust to all platforms. Disable the regression on Windows systems and make the CHECKs robust against output generated on different platforms. --- lldb/test/Shell/Commands/command-source-embedded.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lldb/test/Shell/Commands/command-source-embedded.test b/lldb/test/Shell/Commands/command-source-embedded.test index 183447d1aa0c5..e792ecafaf289 100644 --- a/lldb/test/Shell/Commands/command-source-embedded.test +++ b/lldb/test/Shell/Commands/command-source-embedded.test @@ -3,8 +3,10 @@ # When a program with embedded source being debugged reaches a breakpoint, its # source code should be listed. This test prevents a regression identified in #191801. 
+# UNSUPPORTED: system-windows + # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf -gembed-source %t/main.c -o %t.out +# RUN: %clang_host -g -gdwarf-5 -gembed-source %t/main.c -o %t.out # RUN: %lldb -x -b -s %t/commands %t.out -o exit 2>&1 \ # RUN: | FileCheck %s @@ -22,7 +24,7 @@ run # CHECK-LABEL: run # CHECK-NEXT: Process [[PID:.*]] launched # CHECK-NEXT: Process [[PID]] stopped -# CHECK-NEXT: name = {{.*}}, stop reason = breakpoint 1.1 +# CHECK-NEXT: stop reason = breakpoint 1.1 # CHECK-NEXT: frame #0: {{.*}}`main at main.c:2:3 # CHECK-NEXT: 1 int main() { # CHECK-NEXT: -> 2 return 0; From 6e9694a1c32d27b9fb23aa0a4a71171c08a0597a Mon Sep 17 00:00:00 2001 From: Chen Li Date: Mon, 11 May 2026 21:35:27 -0700 Subject: [PATCH 394/538] [llvm-gsymutil] Add --symtab-file option to specify separate symbol table file (#184059) Add a `--symtab-file` option that allows specifying a separate file from which to read the symbol table during GSYM conversion. This is useful when DWARF and function symbols are stored in separate files. Example: `llvm-gsymutil --convert debug_info.elf --symtab-file=symbols.elf -o output.gsym` ## Changes - Added `--symtab-file` to `Opts.td` and command-line help. - Parsed and used `--symtab-file` in `llvm-gsymutil.cpp` during conversion. - Kept architecture validation for thin object inputs. - Added support for universal Mach-O inputs by selecting the matching symbol-table slice for each selected architecture. - Reject the case where a multi-arch universal input is converted with a thin `--symtab-file`; `--arch` can be used to narrow the conversion to one architecture. - Added lit tests covering basic ELF usage, architecture mismatch, invalid symtab file, and thin/fat Mach-O cases. 
## Test Plan - `ninja -C /tmp/llvm-gsymutil-build llvm-gsymutil` - `/tmp/llvm-gsymutil-build/bin/llvm-lit -sv llvm/test/tools/llvm-gsymutil/X86/elf-symtab-file.yaml llvm/test/tools/ llvm-gsymutil/ARM_AArch64/fat-macho-symtab-file.yaml` --------- Co-authored-by: Chen Li --- .../ARM_AArch64/fat-macho-symtab-file.yaml | 234 ++++++++++++++++++ .../llvm-gsymutil/X86/elf-symtab-file.yaml | 150 +++++++++++ llvm/test/tools/llvm-gsymutil/cmdline.test | 1 + llvm/tools/llvm-gsymutil/Opts.td | 5 + llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp | 134 ++++++++-- 5 files changed, 508 insertions(+), 16 deletions(-) create mode 100644 llvm/test/tools/llvm-gsymutil/ARM_AArch64/fat-macho-symtab-file.yaml create mode 100644 llvm/test/tools/llvm-gsymutil/X86/elf-symtab-file.yaml diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/fat-macho-symtab-file.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/fat-macho-symtab-file.yaml new file mode 100644 index 0000000000000..4311b549fc04e --- /dev/null +++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/fat-macho-symtab-file.yaml @@ -0,0 +1,234 @@ +## Test --symtab-file with thin and fat Mach-O inputs. +## +## We reuse the existing fat Mach-O DWARF fixture as the debug-info input and +## create separate symbol-table-only Mach-O objects. The arm64 symtab slice +## intentionally has two function symbols so we can tell that the matching +## symtab slice was selected instead of the input file's own symbol table. 
+ +# RUN: split-file %s %t +# RUN: yaml2obj %S/fat-macho-dwarf.yaml -o %t/debug-fat.o +# RUN: yaml2obj %t/armv7-symtab.yaml -o %t/armv7-symtab.o +# RUN: yaml2obj %t/arm64-symtab.yaml -o %t/arm64-symtab.o +# RUN: llvm-lipo %t/armv7-symtab.o %t/arm64-symtab.o -create -output %t/symtab-fat.o +# RUN: llvm-lipo %t/debug-fat.o -thin arm64 -output %t/debug-arm64.o +# RUN: llvm-lipo %t/symtab-fat.o -thin arm64 -output %t/symtab-arm64.o +# RUN: llvm-lipo %t/armv7-symtab.o -create -output %t/symtab-armv7-only-fat.o + +# RUN: llvm-gsymutil --convert %t/debug-arm64.o --symtab-file=%t/symtab-fat.o -o %t/thin-from-fat.gsym 2>&1 | FileCheck %s --check-prefix=THIN-FAT +# RUN: llvm-gsymutil --convert %t/debug-fat.o --symtab-file=%t/symtab-fat.o -o %t/fat-from-fat.gsym 2>&1 | FileCheck %s --check-prefix=FAT-FAT +# RUN: llvm-gsymutil --convert %t/debug-fat.o --arch arm64 --symtab-file=%t/symtab-arm64.o -o %t/fat-from-thin.gsym 2>&1 | FileCheck %s --check-prefix=FAT-THIN +# RUN: not llvm-gsymutil --convert %t/debug-fat.o --symtab-file=%t/symtab-arm64.o -o %t/fat-from-thin-error.gsym 2>&1 | FileCheck %s --check-prefix=FAT-THIN-ERR +# RUN: not llvm-gsymutil --convert %t/debug-arm64.o --symtab-file=%t/symtab-armv7-only-fat.o -o %t/missing-arch.gsym 2>&1 | FileCheck %s --check-prefix=MISSING-ARCH + +# THIN-FAT: Input file: {{.*}}debug-arm64.o +# THIN-FAT: Output file (arm64): {{.*}}thin-from-fat.gsym +# THIN-FAT: Using symbol table file: {{.*}}symtab-fat.o +# THIN-FAT: Loaded 2 functions from symbol table. + +# FAT-FAT: Input file: {{.*}}debug-fat.o +# FAT-FAT: Output file (armv7): {{.*}}fat-from-fat.gsym.armv7 +# FAT-FAT: Using symbol table file: {{.*}}symtab-fat.o +# FAT-FAT: Loaded 1 functions from symbol table. +# FAT-FAT: Output file (arm64): {{.*}}fat-from-fat.gsym.arm64 +# FAT-FAT: Using symbol table file: {{.*}}symtab-fat.o +# FAT-FAT: Loaded 2 functions from symbol table. 
+ +# FAT-THIN: Input file: {{.*}}debug-fat.o +# FAT-THIN: Output file (arm64): {{.*}}fat-from-thin.gsym +# FAT-THIN: Using symbol table file: {{.*}}symtab-arm64.o +# FAT-THIN: Loaded 2 functions from symbol table. + +# FAT-THIN-ERR: use --arch to select a single architecture + +# MISSING-ARCH: symbol table file '{{.*}}symtab-armv7-only-fat.o' does not contain architecture 'arm64' + +#--- armv7-symtab.yaml +--- !mach-o +FileHeader: + magic: 0xFEEDFACE + cputype: 0x0000000C + cpusubtype: 0x00000009 + filetype: 0x00000001 + ncmds: 4 + sizeofcmds: 312 + flags: 0x00002000 +LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 192 + segname: '' + vmaddr: 49136 + vmsize: 72 + fileoff: 340 + filesize: 72 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x000000000000BFF0 + size: 16 + offset: 0x00000154 + align: 4 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 00000000000000000000000000000000 + - sectname: __eh_frame + segname: __TEXT + addr: 0x000000000000C000 + size: 52 + offset: 0x00000164 + align: 2 + reloff: 0x00000000 + nreloc: 0 + flags: 0x6800000B + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + - cmd: LC_VERSION_MIN_MACOSX + cmdsize: 16 + version: 656384 + sdk: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 412 + nsyms: 1 + stroff: 424 + strsize: 8 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 49136 + StringTable: + - '' + - _main + - '' +... 
+ +#--- arm64-symtab.yaml +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x0100000C + cpusubtype: 0x00000000 + filetype: 0x00000001 + ncmds: 4 + sizeofcmds: 352 + flags: 0x00002000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 4294999964 + vmsize: 84 + fileoff: 384 + filesize: 80 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000100007F9C + size: 16 + offset: 0x00000180 + align: 4 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 00000000000000000000000000000000 + - sectname: __eh_frame + segname: __TEXT + addr: 0x0000000100007FB0 + size: 64 + offset: 0x00000190 + align: 3 + reloff: 0x00000000 + nreloc: 0 + flags: 0x6800000B + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + - cmd: LC_VERSION_MIN_MACOSX + cmdsize: 16 + version: 656384 + sdk: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 464 + nsyms: 2 + stroff: 496 + strsize: 15 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 2 + iundefsym: 2 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 4294999964 + - n_strx: 7 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 4294999976 + StringTable: + - '' + - _main + - _helper + - '' +... 
diff --git a/llvm/test/tools/llvm-gsymutil/X86/elf-symtab-file.yaml b/llvm/test/tools/llvm-gsymutil/X86/elf-symtab-file.yaml new file mode 100644 index 0000000000000..31b70b13f15fd --- /dev/null +++ b/llvm/test/tools/llvm-gsymutil/X86/elf-symtab-file.yaml @@ -0,0 +1,150 @@ +## Test the --symtab-file option to specify a separate file for the symbol table. +## We create two ELF files: one with DWARF debug info (but no function symbols) +## and one with function symbols (but no DWARF). Then we use --symtab-file to +## combine them during GSYM conversion. + +# RUN: split-file %s %t + +## Create the debug-only and symtab-only ELF files. +# RUN: yaml2obj %t/debug.yaml -o %t/debug.elf +# RUN: yaml2obj %t/symtab.yaml -o %t/symtab.elf + +## Test 1: Basic --symtab-file usage. +# RUN: llvm-gsymutil --convert %t/debug.elf --symtab-file=%t/symtab.elf -o %t/out.gsym 2>&1 | FileCheck %s --check-prefix=BASIC +# BASIC: Input file: {{.*}}debug.elf +# BASIC: Output file (x86_64): {{.*}}out.gsym +# BASIC: Using symbol table file: {{.*}}symtab.elf +# BASIC: Loaded 2 functions from symbol table. + +## Test 2: Architecture mismatch error. +# RUN: yaml2obj %t/aarch64-symtab.yaml -o %t/aarch64-symtab.elf +# RUN: not llvm-gsymutil --convert %t/debug.elf --symtab-file=%t/aarch64-symtab.elf -o %t/mismatch.gsym 2>&1 | FileCheck %s --check-prefix=MISMATCH +# MISMATCH: architecture mismatch: input file is x86_64 but symbol table file '{{.*}}aarch64-symtab.elf' is aarch64 + +## Test 3: Invalid symtab file error. +# RUN: not llvm-gsymutil --convert %t/debug.elf --symtab-file=%t/debug.yaml -o %t/invalid.gsym 2>&1 | FileCheck %s --check-prefix=INVALID +# INVALID: The file was not recognized as a valid object file + +#--- debug.yaml +## An x86_64 ELF with DWARF debug info for one function (main) but no function +## symbols in the symbol table. 
+--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x0000000000401000 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x10 + ## Simple function: push rbp; mov rbp,rsp; xor eax,eax; pop rbp; ret + Content: 554889E531C05DC3 + - Name: .debug_abbrev + Type: SHT_PROGBITS + AddressAlign: 0x1 + ## Abbrev 1: DW_TAG_compile_unit (DW_CHILDREN_yes) + ## DW_AT_low_pc (DW_FORM_addr), DW_AT_high_pc (DW_FORM_data4), + ## DW_AT_name (DW_FORM_strp) + ## Abbrev 2: DW_TAG_subprogram (DW_CHILDREN_no) + ## DW_AT_low_pc (DW_FORM_addr), DW_AT_high_pc (DW_FORM_data4), + ## DW_AT_name (DW_FORM_strp) + Content: 01110100110112060000023F19030E3A0B3B0B491311011207401897421901130000022E003F19030E110112070000022E00110112060003080000 + - Name: .debug_info + Type: SHT_PROGBITS + AddressAlign: 0x1 + ## Compile unit header + DIEs referencing .debug_abbrev and .debug_str + Content: 2900000004000000000008010010400000000000080000000000000002001040000000000008000000050000 + - Name: .debug_str + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + ## "test.cpp\0main\0" + Content: 746573742E63707000006D61696E00 + - Name: .debug_line + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 3300000002001F0000000101FB0E0D000101010100000001000001007465737400746573742E637070000000000000090200104000000000000105030A0101 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + Align: 0x1000 + FirstSec: .text + LastSec: .text +Symbols: [] +... + +#--- symtab.yaml +## An x86_64 ELF with function symbols but no DWARF debug info. 
+--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x0000000000401000 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x10 + Content: 554889E531C05DC3 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + Align: 0x1000 + FirstSec: .text + LastSec: .text +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 + Size: 0x0000000000000008 + - Name: _start + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 + Size: 0x0000000000000008 +... + +#--- aarch64-symtab.yaml +## An AArch64 ELF to test architecture mismatch. +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 + Entry: 0x0000000000401000 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x10 + Content: C0035FD6 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + Align: 0x1000 + FirstSec: .text + LastSec: .text +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 + Size: 0x0000000000000004 +... 
diff --git a/llvm/test/tools/llvm-gsymutil/cmdline.test b/llvm/test/tools/llvm-gsymutil/cmdline.test index ea5f76fac1635..1b2283af05031 100644 --- a/llvm/test/tools/llvm-gsymutil/cmdline.test +++ b/llvm/test/tools/llvm-gsymutil/cmdline.test @@ -11,6 +11,7 @@ HELP: --help HELP: --num-threads= HELP: --out-file= HELP: --quiet +HELP: --symtab-file= HELP: --verbose HELP: --verify HELP: --version diff --git a/llvm/tools/llvm-gsymutil/Opts.td b/llvm/tools/llvm-gsymutil/Opts.td index d4a3cd09d46eb..f2bf5e0783671 100644 --- a/llvm/tools/llvm-gsymutil/Opts.td +++ b/llvm/tools/llvm-gsymutil/Opts.td @@ -16,6 +16,11 @@ def verbose : FF<"verbose", "Enable verbose logging and encoding details">; defm convert : Eq<"convert", "Convert the specified file to the GSYM format.\nSupported files include ELF and mach-o files that will have their debug info (DWARF) and symbol table converted">; +defm symtab_file : + Eq<"symtab-file", + "Specify a separate file to read the symbol table from during GSYM conversion.\n" + "Use when the symbol table and debug info are in separate files.\n" + "Matching architectures are selected automatically for universal binaries">; def merged_functions : FF<"merged-functions", "When used with --convert, encodes merged function information for functions in debug info that have matching address ranges.\n" "Without this option one function per unique address range will be emitted.\n" diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp index 7ec69c3afc023..33a5a2224e3f9 100644 --- a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp +++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp @@ -92,6 +92,7 @@ class GSYMUtilOptTable : public llvm::opt::GenericOptTable { static bool Verbose; static std::vector InputFilenames; static std::string ConvertFilename; +static std::string SymtabFilename; static std::vector ArchFilters; static std::string OutputFilename; static std::string JsonSummaryFile; @@ -147,6 +148,9 @@ static void 
parseArgs(int argc, char **argv) { if (const llvm::opt::Arg *A = Args.getLastArg(OPT_convert_EQ)) ConvertFilename = A->getValue(); + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_symtab_file_EQ)) + SymtabFilename = A->getValue(); + for (const llvm::opt::Arg *A : Args.filtered(OPT_arch_EQ)) ArchFilters.emplace_back(A->getValue()); @@ -266,6 +270,16 @@ static uint32_t getCPUType(MachOObjectFile &MachO) { return MachO.getHeader().cputype; } +static std::string getArchitectureName(const ObjectFile &Obj) { + if (const auto *MachO = dyn_cast(&Obj)) { + Triple ObjTriple(MachO->getArchTriple()); + return ObjTriple.getArchName().str(); + } + + Triple ObjTriple(Obj.makeTriple()); + return ObjTriple.getArchName().str(); +} + /// Return true if the object file has not been filtered by an --arch option. static bool filterArch(MachOObjectFile &Obj) { if (ArchFilters.empty()) @@ -363,7 +377,47 @@ static std::optional getImageBaseAddress(object::ObjectFile &Obj) { return std::nullopt; } -static llvm::Error handleObjectFile(ObjectFile &Obj, const std::string &OutFile, +static Expected +resolveSymtabObject(StringRef ArchName, Binary *SymtabBinary, + StringRef SymtabPath, + std::unique_ptr &OwnedSymtabObj) { + if (!SymtabBinary) + return nullptr; + + if (auto *SymtabObj = dyn_cast(SymtabBinary)) { + std::string SymtabArchName = getArchitectureName(*SymtabObj); + if (SymtabArchName != ArchName) + return createStringError(std::errc::invalid_argument, + "architecture mismatch: input file is %s but " + "symbol table file '%s' is %s", + ArchName.str().c_str(), SymtabPath.str().c_str(), + SymtabArchName.c_str()); + + return SymtabObj; + } + + if (auto *SymtabFat = dyn_cast(SymtabBinary)) { + auto SymtabObjOrErr = SymtabFat->getMachOObjectForArch(ArchName); + if (!SymtabObjOrErr) { + consumeError(SymtabObjOrErr.takeError()); + return createStringError( + std::errc::invalid_argument, + "symbol table file '%s' does not contain architecture '%s'", + SymtabPath.str().c_str(), 
ArchName.str().c_str()); + } + + OwnedSymtabObj = std::move(*SymtabObjOrErr); + return OwnedSymtabObj.get(); + } + + return createStringError(std::errc::invalid_argument, + "symbol table file '%s' is not a valid object file", + SymtabPath.str().c_str()); +} + +static llvm::Error handleObjectFile(ObjectFile &Obj, ObjectFile *SymtabObj, + StringRef SymtabPath, + const std::string &OutFile, OutputAggregator &Out) { auto ThreadCount = NumThreads > 0 ? NumThreads : std::thread::hardware_concurrency(); @@ -436,8 +490,13 @@ static llvm::Error handleObjectFile(ObjectFile &Obj, const std::string &OutFile, Gsym.prepareMergedFunctions(Out); // Get the UUID and convert symbol table to GSYM. - if (auto Err = ObjectFileTransformer::convert(Obj, Out, Gsym)) + if (SymtabObj) { + Out << "Using symbol table file: " << SymtabPath << "\n"; + if (auto Err = ObjectFileTransformer::convert(*SymtabObj, Out, Gsym)) + return Err; + } else if (auto Err = ObjectFileTransformer::convert(Obj, Out, Gsym)) { return Err; + } // If any call site YAML files were specified, load them now. 
if (!CallSiteYamlPath.empty()) @@ -472,16 +531,23 @@ static llvm::Error handleObjectFile(ObjectFile &Obj, const std::string &OutFile, } static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer, + Binary *SymtabBinary, StringRef SymtabPath, const std::string &OutFile, OutputAggregator &Out) { Expected> BinOrErr = object::createBinary(Buffer); error(Filename, errorToErrorCode(BinOrErr.takeError())); if (auto *Obj = dyn_cast(BinOrErr->get())) { - Triple ObjTriple(Obj->makeTriple()); - auto ArchName = ObjTriple.getArchName(); + std::string ArchName = getArchitectureName(*Obj); + std::unique_ptr OwnedSymtabObj; + auto SymtabObjOrErr = + resolveSymtabObject(ArchName, SymtabBinary, SymtabPath, OwnedSymtabObj); + if (!SymtabObjOrErr) + return SymtabObjOrErr.takeError(); + outs() << "Output file (" << ArchName << "): " << OutFile << "\n"; - if (auto Err = handleObjectFile(*Obj, OutFile, Out)) + if (auto Err = + handleObjectFile(*Obj, *SymtabObjOrErr, SymtabPath, OutFile, Out)) return Err; } else if (auto *Fat = dyn_cast(BinOrErr->get())) { // Iterate over all contained architectures and filter out any that were @@ -489,33 +555,51 @@ static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer, // not specified on the command line, we will process all architectures. std::vector> FilterObjs; for (auto &ObjForArch : Fat->objects()) { - if (auto MachOOrErr = ObjForArch.getAsObjectFile()) { - auto &Obj = **MachOOrErr; - if (filterArch(Obj)) - FilterObjs.emplace_back(MachOOrErr->release()); - } else { + auto MachOOrErr = ObjForArch.getAsObjectFile(); + if (!MachOOrErr) { error(Filename, MachOOrErr.takeError()); + continue; } + + std::unique_ptr Obj = std::move(*MachOOrErr); + if (filterArch(*Obj)) + FilterObjs.emplace_back(std::move(Obj)); } if (FilterObjs.empty()) error(Filename, createStringError(std::errc::invalid_argument, "no matching architectures found")); // Now handle each architecture we need to convert. 
+ bool MultipleArchitecturesSelected = FilterObjs.size() > 1; + if (MultipleArchitecturesSelected && SymtabBinary && + isa(SymtabBinary)) + return createStringError( + std::errc::invalid_argument, + "symbol table file '%s' is not a universal binary, but the input " + "contains multiple architectures; use --arch to select a single " + "architecture", + SymtabPath.str().c_str()); + for (auto &Obj : FilterObjs) { - Triple ObjTriple(Obj->getArchTriple()); - auto ArchName = ObjTriple.getArchName(); + std::string ArchName = getArchitectureName(*Obj); + std::unique_ptr OwnedSymtabObj; + auto SymtabObjOrErr = resolveSymtabObject(ArchName, SymtabBinary, + SymtabPath, OwnedSymtabObj); + if (!SymtabObjOrErr) + return SymtabObjOrErr.takeError(); + std::string ArchOutFile(OutFile); // If we are only handling a single architecture, then we will use the // normal output file. If we are handling multiple architectures append // the architecture name to the end of the out file path so that we // don't overwrite the previous architecture's gsym file. 
- if (FilterObjs.size() > 1) { + if (MultipleArchitecturesSelected) { ArchOutFile.append(1, '.'); - ArchOutFile.append(ArchName.str()); + ArchOutFile.append(ArchName); } outs() << "Output file (" << ArchName << "): " << ArchOutFile << "\n"; - if (auto Err = handleObjectFile(*Obj, ArchOutFile, Out)) + if (auto Err = handleObjectFile(*Obj, *SymtabObjOrErr, SymtabPath, + ArchOutFile, Out)) return Err; } } @@ -529,7 +613,25 @@ static llvm::Error handleFileConversionToGSYM(StringRef Filename, MemoryBuffer::getFileOrSTDIN(Filename); error(Filename, BuffOrErr.getError()); std::unique_ptr Buffer = std::move(BuffOrErr.get()); - return handleBuffer(Filename, *Buffer, OutFile, Out); + + std::unique_ptr SymtabBuffer; + std::unique_ptr SymtabBinary; + if (!SymtabFilename.empty()) { + auto SymtabBufOrErr = MemoryBuffer::getFile(SymtabFilename); + if (!SymtabBufOrErr) + return createStringError(SymtabBufOrErr.getError(), + "failed to open symbol table file '%s'", + SymtabFilename.c_str()); + + SymtabBuffer = std::move(*SymtabBufOrErr); + auto SymtabBinOrErr = object::createBinary(*SymtabBuffer); + if (!SymtabBinOrErr) + return SymtabBinOrErr.takeError(); + SymtabBinary = std::move(*SymtabBinOrErr); + } + + return handleBuffer(Filename, *Buffer, SymtabBinary.get(), SymtabFilename, + OutFile, Out); } static llvm::Error convertFileToGSYM(OutputAggregator &Out) { From a26e8fac301a6e22507469f93a7c2729e4004d11 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 12 May 2026 06:52:34 +0200 Subject: [PATCH 395/538] [clang][test][NFC] Remove a FIXME marker (#196953) There is nothing to fix here. This behavior is on purpose. Remove the "FIXME". 
--- clang/test/AST/ByteCode/cxx14.cpp | 2 +- clang/test/SemaCXX/constant-expression-cxx14.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp index 170bd09504993..97d2c0ac4f711 100644 --- a/clang/test/AST/ByteCode/cxx14.cpp +++ b/clang/test/AST/ByteCode/cxx14.cpp @@ -43,7 +43,7 @@ namespace InitListModify { }; constexpr Aggregate aggr1; static_assert(aggr1.x == 1 && aggr1.y == 1, ""); - // FIXME: This is not specified by the standard, but sanity requires it. + // This is not specified by the standard, but sanity requires it. constexpr Aggregate aggr2 = {}; static_assert(aggr2.x == 1 && aggr2.y == 1, ""); } diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp index fb7fd5b528b05..1bead18080271 100644 --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -1237,7 +1237,7 @@ namespace ObjectsUnderConstruction { }; constexpr Aggregate aggr1; static_assert(aggr1.x == 1 && aggr1.y == 1, ""); - // FIXME: This is not specified by the standard, but sanity requires it. + // This is not specified by the standard, but sanity requires it. constexpr Aggregate aggr2 = {}; static_assert(aggr2.x == 1 && aggr2.y == 1, ""); From a5d8825fe12d8c7606f6002e2a8d199c1b0c22d3 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Tue, 12 May 2026 10:53:55 +0530 Subject: [PATCH 396/538] [VPlan] Add SCEV support for abs intrinsic (#195678) Teach `getSCEVExprForVPValue` to model `llvm.abs` via `ScalarEvolution::getAbsExpr`, preserving the intrinsic's is_int_min_poison flag as the SCEV IsNSW argument. Add a unit test covering both poison and wrapping llvm.abs forms. 
--- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 7 ++++ .../Transforms/Vectorize/VPlanTest.cpp | 38 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 5b80fa15a5535..24adcea1040b5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -278,6 +278,13 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef Ops) { return SE.getSMinExpr(Ops[0], Ops[1]); }); + if (match(V, m_Intrinsic(m_VPValue(LHSVal), m_VPValue()))) + return CreateSCEV({LHSVal}, [&](ArrayRef Ops) { + // is_int_min_poison is local to this intrinsic: poison on INT_MIN is + // not proof that the input is never INT_MIN, nor that poison reaches + // UB. Do not translate it to SCEV's global IsNSW flag. + return SE.getAbsExpr(Ops[0], /*IsNSW=*/false); + }); ArrayRef Ops; Type *SourceElementType; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index a1ddda7eda969..61acb5846a9cb 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -10,6 +10,7 @@ #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanCFG.h" #include "../lib/Transforms/Vectorize/VPlanHelpers.h" +#include "../lib/Transforms/Vectorize/VPlanUtils.h" #include "VPlanTestBase.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" @@ -33,6 +34,43 @@ namespace { } while (0) using VPInstructionTest = VPlanTestBase; +using VPlanSCEVTest = VPlanTestIRBase; + +TEST_F(VPlanSCEVTest, GetSCEVExprForVPValueAbs) { + const char *ModuleString = R"( +define void @f(i32 %x) { +entry: + br label %loop +loop: + br label %loop +} +)"; + + Module &M = parseModule(ModuleString); + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = 
F->getEntryBlock().getSingleSuccessor(); + doAnalysis(*F); + + Loop *L = LI->getLoopFor(LoopHeader); + PredicatedScalarEvolution PSE(*SE, *L); + VPlan Plan(LoopHeader); + Argument *X = F->getArg(0); + VPValue *Op = Plan.getOrAddLiveIn(X); + + // is_int_min_poison is local to the call, not a global no-wrap fact. + // Exercise both getAbsExpr flag paths; SCEV drops IsNSW for this input. + const SCEV *XSCEV = SE->getSCEV(X); + + VPWidenIntrinsicRecipe Abs(Intrinsic::abs, {Op, Plan.getTrue()}, + X->getType()); + EXPECT_EQ(SE->getAbsExpr(XSCEV, /*IsNSW=*/true), + vputils::getSCEVExprForVPValue(&Abs, PSE, L)); + + VPWidenIntrinsicRecipe WrappingAbs(Intrinsic::abs, {Op, Plan.getFalse()}, + X->getType()); + EXPECT_EQ(SE->getAbsExpr(XSCEV, /*IsNSW=*/false), + vputils::getSCEVExprForVPValue(&WrappingAbs, PSE, L)); +} TEST_F(VPInstructionTest, insertBefore) { VPInstruction *I1 = new VPInstruction(VPInstruction::StepVector, {}); From 8ecec455183fd42fc191089d061f80bdf7b158fc Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 12 May 2026 07:55:36 +0200 Subject: [PATCH 397/538] [clang] Fix x86_64-windows-msvc over- and under-alignment (#196505) This fixes two issues where Clang was both over- and under-aligning variables: 1) We were applying the x86_64 Sys V psABI "large array" alignment increase (default when inheriting from X86_64TargetInfo), but MSVC doesn't follow that ABI. 2) MSVC implements a similar scheme though, where it increases the alignment of large objects. This is documented for ARM64 [1] and was implemented in Clang b7c6d95af5e295c560d1445e7090e31eb9289932, but it also applies to x86_64. ([2] says "MSVC does size (total size, not element size) based alignment for global symbols on ARM64 *which is copied from AMD64*"). This patch stops doing 1) and implements 2) for x86_64-windows-msvc. 
[1] https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#alignment [2] https://github.com/llvm/llvm-project/issues/40851 Fixes https://github.com/llvm/llvm-project/issues/196071 Fixes https://github.com/llvm/llvm-project/issues/171855 --- clang/include/clang/Basic/TargetInfo.h | 2 ++ clang/lib/Basic/TargetInfo.cpp | 17 +++++++++++++++++ clang/lib/Basic/Targets/AArch64.cpp | 13 +------------ clang/lib/Basic/Targets/X86.cpp | 9 +++++++++ clang/lib/Basic/Targets/X86.h | 5 +++++ clang/test/CodeGen/align-x68_64.c | 10 ++++++++++ clang/test/CodeGen/asan-strings.c | 4 ++-- ...ign.cpp => microsoft-64bit-struct-align.cpp} | 6 +++--- .../ms-constexpr-static-data-member.cpp | 4 ++-- clang/test/CodeGenObjC/encode-test-6.m | 2 +- .../CodeGenSYCL/kernel-caller-entry-point.cpp | 16 ++++++++-------- 11 files changed, 60 insertions(+), 28 deletions(-) rename clang/test/CodeGen/{arm64-microsoft-struct-align.cpp => microsoft-64bit-struct-align.cpp} (80%) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 9f7d2a17a0f8a..e21155b2e4fd4 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1968,6 +1968,8 @@ class TargetInfo : public TransferrableTargetInfo, void CheckFixedPointBits() const; }; +unsigned Microsoft64BitMinGlobalAlign(uint64_t TypeSize); + namespace targets { std::unique_ptr AllocateTarget(const llvm::Triple &Triple, const clang::TargetOptions &Opts); diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index e6ae89e0948c5..ad083242d9e3e 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -1114,3 +1114,20 @@ TargetInfo::simplifyConstraint(StringRef Constraint, } return Result; } + +unsigned clang::Microsoft64BitMinGlobalAlign(uint64_t TypeSize) { + // MSVC does size based alignment for arm64 based on alignment section in + // below document. 
Replicate that to keep alignment consistent with object + // files compiled by MSVC. + // https://docs.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions + // The same is done for x64, but not documented. + + if (TypeSize >= 512) // TypeSize >= 64 bytes + return 128; // align type at least 16 bytes + if (TypeSize >= 64) // TypeSize >= 8 bytes + return 64; // align type at least 8 bytes + if (TypeSize >= 16) // TypeSize >= 2 bytes + return 32; // align type at least 4 bytes + + return 0; +} diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 9b951e69cce33..9afe6cb10729d 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1815,18 +1815,7 @@ unsigned MicrosoftARM64TargetInfo::getMinGlobalAlign(uint64_t TypeSize, unsigned Align = WindowsARM64TargetInfo::getMinGlobalAlign(TypeSize, HasNonWeakDef); - // MSVC does size based alignment for arm64 based on alignment section in - // below document, replicate that to keep alignment consistent with object - // files compiled by MSVC. 
- // https://docs.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions - if (TypeSize >= 512) { // TypeSize >= 64 bytes - Align = std::max(Align, 128u); // align type at least 16 bytes - } else if (TypeSize >= 64) { // TypeSize >= 8 bytes - Align = std::max(Align, 64u); // align type at least 8 butes - } else if (TypeSize >= 16) { // TypeSize >= 2 bytes - Align = std::max(Align, 32u); // align type at least 4 bytes - } - return Align; + return std::max(Align, Microsoft64BitMinGlobalAlign(TypeSize)); } MinGWARM64TargetInfo::MinGWARM64TargetInfo(const llvm::Triple &Triple, diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index cb941c94c84a7..60c001a826078 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1852,3 +1852,12 @@ X86_64TargetInfo::getTargetBuiltins() const { "__builtin_ia32_"}, }; } + +unsigned +MicrosoftX86_64TargetInfo::getMinGlobalAlign(uint64_t TypeSize, + bool HasNonWeakDef) const { + unsigned Align = + WindowsX86_64TargetInfo::getMinGlobalAlign(TypeSize, HasNonWeakDef); + + return std::max(Align, Microsoft64BitMinGlobalAlign(TypeSize)); +} diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index c7afcc7c86053..c8c5d280754b4 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -968,6 +968,8 @@ class LLVM_LIBRARY_VISIBILITY MicrosoftX86_64TargetInfo : WindowsX86_64TargetInfo(Triple, Opts) { LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); + LargeArrayMinWidth = 0; + LargeArrayAlign = 0; } void getTargetDefines(const LangOptions &Opts, @@ -981,6 +983,9 @@ class LLVM_LIBRARY_VISIBILITY MicrosoftX86_64TargetInfo getCallingConvKind(bool ClangABICompat4) const override { return CCK_MicrosoftWin64; } + + unsigned getMinGlobalAlign(uint64_t TypeSize, + bool HasNonWeakDef) const override; }; // x86-64 MinGW target diff --git a/clang/test/CodeGen/align-x68_64.c b/clang/test/CodeGen/align-x68_64.c 
index cf128b43433ea..91f5fac199136 100644 --- a/clang/test/CodeGen/align-x68_64.c +++ b/clang/test/CodeGen/align-x68_64.c @@ -1,11 +1,21 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=MSVC %s // PR5599 +char arr[16]; + void test1_f(void *); void test1_g(void) { float x[4]; test1_f(x); } +// CHECK: @arr = {{.*}} align 16 // CHECK: @test1_g // CHECK: alloca [4 x float], align 16 + +// The "large array" alignment increase does not apply on windows-msvc. +// MSVC: @arr = {{.*}} align 8 +// MSVC: @test1_g +// MSVC: alloca [4 x float], align 4 diff --git a/clang/test/CodeGen/asan-strings.c b/clang/test/CodeGen/asan-strings.c index 0c7420034f89e..e71445eaf7fe4 100644 --- a/clang/test/CodeGen/asan-strings.c +++ b/clang/test/CodeGen/asan-strings.c @@ -12,6 +12,6 @@ const char *foo(void) { return "asdf"; } // LINUX: @.str = private unnamed_addr constant [5 x i8] c"asdf\00", align 1 -// WINDOWS: @"??_C@_04JIHMPGLA@asdf?$AA@" = linkonce_odr dso_local unnamed_addr constant [5 x i8] c"asdf\00", comdat, align 1 +// WINDOWS: @"??_C@_04JIHMPGLA@asdf?$AA@" = linkonce_odr dso_local unnamed_addr constant [5 x i8] c"asdf\00", comdat, align 4 -// WINWRITE: @.str = private unnamed_addr global [5 x i8] c"asdf\00", align 1 +// WINWRITE: @.str = private unnamed_addr global [5 x i8] c"asdf\00", align 4 diff --git a/clang/test/CodeGen/arm64-microsoft-struct-align.cpp b/clang/test/CodeGen/microsoft-64bit-struct-align.cpp similarity index 80% rename from clang/test/CodeGen/arm64-microsoft-struct-align.cpp rename to clang/test/CodeGen/microsoft-64bit-struct-align.cpp index 4076c3ca34ad7..c4e0f2f6dfc88 100644 --- a/clang/test/CodeGen/arm64-microsoft-struct-align.cpp +++ b/clang/test/CodeGen/microsoft-64bit-struct-align.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple aarch64-windows 
-ffreestanding -emit-llvm -O0 \ -// RUN: -x c++ -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-windows-msvc -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s struct size1 { char str[1]; }; struct size2 { char str[2]; }; -struct size7 { char str[4]; }; +struct size7 { char str[7]; }; struct size8 { char str[8]; }; struct size63 { char str[63]; }; struct size64 { char str[64]; }; diff --git a/clang/test/CodeGenCXX/ms-constexpr-static-data-member.cpp b/clang/test/CodeGenCXX/ms-constexpr-static-data-member.cpp index 604a49fefbacb..4b191fd472c20 100644 --- a/clang/test/CodeGenCXX/ms-constexpr-static-data-member.cpp +++ b/clang/test/CodeGenCXX/ms-constexpr-static-data-member.cpp @@ -19,8 +19,8 @@ void usethem() { useptr(&S::sdm_udt); } -// CHECK-DAG: @"?sdm_char_array@S@@2QBDB" = linkonce_odr dso_local constant [5 x i8] c"asdf\00", comdat, align 1 +// CHECK-DAG: @"?sdm_char_array@S@@2QBDB" = linkonce_odr dso_local constant [5 x i8] c"asdf\00", comdat, align 4 // CHECK-DAG: @"?sdm_char_ptr@S@@2QEBDEB" = linkonce_odr dso_local constant ptr @"??_C@_04JIHMPGLA@asdf?$AA@", comdat, align 8 -// CHECK-DAG: @"?sdm_udt@S@@2UFoo@@B" = linkonce_odr dso_local constant %struct.Foo { i32 1, i32 2 }, comdat, align 4 +// CHECK-DAG: @"?sdm_udt@S@@2UFoo@@B" = linkonce_odr dso_local constant %struct.Foo { i32 1, i32 2 }, comdat, align 8 diff --git a/clang/test/CodeGenObjC/encode-test-6.m b/clang/test/CodeGenObjC/encode-test-6.m index c32f8f24c0009..dd770f523e2f3 100644 --- a/clang/test/CodeGenObjC/encode-test-6.m +++ b/clang/test/CodeGenObjC/encode-test-6.m @@ -80,6 +80,6 @@ @implementation SCNCamera // CHECK-DWARF: define{{.*}} ptr @Test() // CHECK-DWARF: ret ptr @e -// CHECK-MSVC: @e = dso_local global [2 x i8] c"i\00", align 1 +// CHECK-MSVC: @e = dso_local global [2 x i8] c"i\00", align 4 // CHECK-MINGW: @e = dso_local global [2 x i8] c"i\00", align 1 // CHECK-ELF: @e = global [2 x i8] c"i\00", align 1 diff 
--git a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp index 410988e16acdc..528d27f85e54b 100644 --- a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp +++ b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp @@ -148,14 +148,14 @@ int main() { // CHECK-HOST-LINUX @.str.5 = private unnamed_addr constant [30 x i8] c"_ZTS23fwd_ref_arg_kernel_name\00", align 1 // CHECK-HOST-LINUX: @.str.6 = private unnamed_addr constant [35 x i8] c"_ZTS28fwd_ref_arg_kernel_name_move\00", align 1 // CHECK-HOST-LINUX: @.str.7 = private unnamed_addr constant [33 x i8] c"_ZTS26rvalue_ref_arg_kernel_name\00", align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0CB@KFIJOMLB@_ZTS26single_purpose_kernel_name@" = linkonce_odr dso_local unnamed_addr constant [33 x i8] c"_ZTS26single_purpose_kernel_name\00", comdat, align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0BC@NHCDOLAA@_ZTSZ4mainEUlT_E_?$AA@" = linkonce_odr dso_local unnamed_addr constant [18 x i8] c"_ZTSZ4mainEUlT_E_\00", comdat, align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0M@BCGAEMBE@_ZTS6?N?$LE?O?$IE?O?$IH?$AA@" = linkonce_odr dso_local unnamed_addr constant [12 x i8] c"_ZTS6\CE\B4\CF\84\CF\87\00", comdat, align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0P@DLGHPODL@_ZTSZ4mainE2KN?$AA@" = linkonce_odr dso_local unnamed_addr constant [15 x i8] c"_ZTSZ4mainE2KN\00", comdat, align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0BK@PPDJPOBM@_ZTS19ref_arg_kernel_name?$AA@" = linkonce_odr dso_local unnamed_addr constant [26 x i8] c"_ZTS19ref_arg_kernel_name\00", comdat, align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0BO@KEIBIHKH@_ZTS23fwd_ref_arg_kernel_name?$AA@" = linkonce_odr dso_local unnamed_addr constant [30 x i8] c"_ZTS23fwd_ref_arg_kernel_name\00", comdat, align 1 -// CHECK-HOST-WINDOWS: @"??_C@_0CD@FDALJLMM@_ZTS28fwd_ref_arg_kernel_name_mo@" = linkonce_odr dso_local unnamed_addr constant [35 x i8] c"_ZTS28fwd_ref_arg_kernel_name_move\00", comdat, align 1 -// CHECK-HOST-WINDOWS: 
@"??_C@_0CB@HCPMABHM@_ZTS26rvalue_ref_arg_kernel_name@" = linkonce_odr dso_local unnamed_addr constant [33 x i8] c"_ZTS26rvalue_ref_arg_kernel_name\00", comdat, align 1 +// CHECK-HOST-WINDOWS: @"??_C@_0CB@KFIJOMLB@_ZTS26single_purpose_kernel_name@" = linkonce_odr dso_local unnamed_addr constant [33 x i8] c"_ZTS26single_purpose_kernel_name\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0BC@NHCDOLAA@_ZTSZ4mainEUlT_E_?$AA@" = linkonce_odr dso_local unnamed_addr constant [18 x i8] c"_ZTSZ4mainEUlT_E_\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0M@BCGAEMBE@_ZTS6?N?$LE?O?$IE?O?$IH?$AA@" = linkonce_odr dso_local unnamed_addr constant [12 x i8] c"_ZTS6\CE\B4\CF\84\CF\87\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0P@DLGHPODL@_ZTSZ4mainE2KN?$AA@" = linkonce_odr dso_local unnamed_addr constant [15 x i8] c"_ZTSZ4mainE2KN\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0BK@PPDJPOBM@_ZTS19ref_arg_kernel_name?$AA@" = linkonce_odr dso_local unnamed_addr constant [26 x i8] c"_ZTS19ref_arg_kernel_name\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0BO@KEIBIHKH@_ZTS23fwd_ref_arg_kernel_name?$AA@" = linkonce_odr dso_local unnamed_addr constant [30 x i8] c"_ZTS23fwd_ref_arg_kernel_name\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0CD@FDALJLMM@_ZTS28fwd_ref_arg_kernel_name_mo@" = linkonce_odr dso_local unnamed_addr constant [35 x i8] c"_ZTS28fwd_ref_arg_kernel_name_move\00", comdat +// CHECK-HOST-WINDOWS: @"??_C@_0CB@HCPMABHM@_ZTS26rvalue_ref_arg_kernel_name@" = linkonce_odr dso_local unnamed_addr constant [33 x i8] c"_ZTS26rvalue_ref_arg_kernel_name\00", comdat // // CHECK-HOST-LINUX: define dso_local void @_Z26single_purpose_kernel_task21single_purpose_kernel() #{{[0-9]+}} { // CHECK-HOST-LINUX-NEXT: entry: From 03123695573f476775b43a4dff4ea712f5b78909 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Tue, 12 May 2026 07:02:33 +0100 Subject: [PATCH 398/538] [GlobalISel] Defer RegBankSelect operand mapper creation (#196985) RegBankSelect::applyMapping constructs an OperandsMapper before 
applying repairs. Default mappings that only need Reassign repairs only update the register bank and do not create replacement operands, so the generic applyDefaultMapping path has no rewriting work to do in that case. Defer OperandsMapper creation until an Insert repair actually needs new virtual registers. If no mapper was needed for a default mapping, return after applying the repairs. CTMark geomean -0.23% improvement on aarch64-O0-g. https://llvm-compile-time-tracker.com/compare.php?from=ed50ea52004259af958bb3e5636268342c49ee62&to=1a4730426e14969626cad43c6b06e93bde707bd1&stat=instructions%3Au Assisted-by: Codex --- llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 4dfd5179a4e56..bc6b5df99d2e7 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #define DEBUG_TYPE "regbankselect" @@ -595,7 +596,7 @@ bool RegBankSelect::applyMapping( MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping, SmallVectorImpl &RepairPts) { // OpdMapper will hold all the information needed for the rewriting. - RegisterBankInfo::OperandsMapper OpdMapper(MI, InstrMapping, *MRI); + std::optional OpdMapper; // First, place the repairing code. for (RepairingPlacement &RepairPt : RepairPts) { @@ -620,8 +621,10 @@ bool RegBankSelect::applyMapping( // Don't insert additional instruction for debug instruction. 
if (MI.isDebugInstr()) break; - OpdMapper.createVRegs(OpIdx); - if (!repairReg(MO, ValMapping, RepairPt, OpdMapper.getVRegs(OpIdx))) + if (!OpdMapper) + OpdMapper.emplace(MI, InstrMapping, *MRI); + OpdMapper->createVRegs(OpIdx); + if (!repairReg(MO, ValMapping, RepairPt, OpdMapper->getVRegs(OpIdx))) return false; break; default: @@ -629,9 +632,16 @@ bool RegBankSelect::applyMapping( } } + // Default mappings only need rewriting when repairs create new operands. + if (!OpdMapper && InstrMapping.getID() == RegisterBankInfo::DefaultMappingID) + return true; + + if (!OpdMapper) + OpdMapper.emplace(MI, InstrMapping, *MRI); // Second, rewrite the instruction. - LLVM_DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n'); - RBI->applyMapping(MIRBuilder, OpdMapper); + LLVM_DEBUG(dbgs() << "Actual mapping of the operands: " << *OpdMapper + << '\n'); + RBI->applyMapping(MIRBuilder, *OpdMapper); return true; } From 491781078a2d43120e41baece40474b56f999fe3 Mon Sep 17 00:00:00 2001 From: Konrad Kleine Date: Tue, 12 May 2026 08:21:19 +0200 Subject: [PATCH 399/538] libclc: Pass LLVM_NATIVE_TOOL_DIR to runtime builds (#196498) This patch sets `LLVM_NATIVE_TOOL_DIR` in the runtime build configuration to point to the directory containing the just-built LLVM tools, allowing libclc to find them without requiring them to be installed on the host system. Fixes build errors like: ``` Error evaluating generator expression: $ No target "opt" ``` A few lines above this change, `extra_deps` list of dependencies for libclc is created. But those tools don't get built in the runtime build. We build libclc in the monolithic build and there we have all the tools, which is why I've added the path to discover the tools. 
--- llvm/runtimes/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 812cd387c6596..e31e1c935554f 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -581,6 +581,12 @@ if(build_runtimes) list(APPEND extra_deps ${dep}) endif() endforeach() + # Pass the location of LLVM tools to the runtime build so libclc can find them + if (LLVM_NATIVE_TOOL_DIR) + list(APPEND extra_cmake_args "-DLLVM_NATIVE_TOOL_DIR=${LLVM_NATIVE_TOOL_DIR}") + else() + list(APPEND extra_cmake_args "-DLLVM_NATIVE_TOOL_DIR=${LLVM_RUNTIME_OUTPUT_INTDIR}") + endif() endif() # Tools needed by build_symbolizer.sh. if("compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES AND COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER) From 8789401c6f0b2c53ba348860c1b0c5df1fa40c8c Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 12 May 2026 08:28:00 +0200 Subject: [PATCH 400/538] [clang][bytecode] Fix a crash with invalid ArraySubscriptExprs (#196964) In the attached test case, `arr` becomes the _index_, not the base, which causes us later to run into issues because the index is a pointer and not an integer. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 2 +- clang/test/AST/ByteCode/arrays.cpp | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index ed464dbfadf71..77518660063fc 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2018,7 +2018,7 @@ bool Compiler::VisitImplicitValueInitExpr( template bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { - if (E->getType()->isVoidType()) + if (E->getType()->isVoidType() || E->containsErrors()) return false; const Expr *LHS = E->getLHS(); diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp index cb5b7ad99070e..0c3a69c861815 100644 --- a/clang/test/AST/ByteCode/arrays.cpp +++ b/clang/test/AST/ByteCode/arrays.cpp @@ -847,3 +847,11 @@ namespace MultiDimArrayInitLoop { constexpr S s = {1}; constexpr T t = {s}; } + +namespace ErroneousArraySubscriptExpr { + constexpr int &foo(int *arr, size_t idx) { return arr[idx]; } // both-error {{unknown type name 'size_t'}} + void bar() { + int val[] = {1, 2, 3, 4}; + foo(val, 2) = 42; + } +} From 05f1fd371e590605f2a7120d971a17adc8ca1fa5 Mon Sep 17 00:00:00 2001 From: Konrad Kleine Date: Tue, 12 May 2026 08:47:44 +0200 Subject: [PATCH 401/538] [github] Fix invalid local action invocation in release-doxygen workflow (#197107) Fix the `Validate Input` step in `.github/workflows/release-doxygen.yml` to use a valid local action invocation. Before: ```yaml - name: Validate Input ./.github/workflows/validate-release-version with: release-version: ${{ inputs.release-version }} ``` After: ```yaml - name: Validate Input uses: ./.github/workflows/validate-release-version with: release-version: ${{ inputs.release-version }} ``` GitHub Actions steps must use `uses:` or `run:`. The current form is invalid. This appears to have been introduced in #196769. - Verified the workflow syntax change is minimal and correct. 
- No functional behavior intended beyond fixing the invalid step declaration. --- .github/workflows/release-doxygen.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml index 7bf5e5e94f905..69efa72f323a4 100644 --- a/.github/workflows/release-doxygen.yml +++ b/.github/workflows/release-doxygen.yml @@ -60,7 +60,7 @@ jobs: LLVM_TOKEN_GENERATOR_PRIVATE_KEY: ${{ secrets.LLVM_TOKEN_GENERATOR_PRIVATE_KEY }} - name: Validate Input - ./.github/workflows/validate-release-version + uses: ./.github/workflows/validate-release-version with: release-version: ${{ inputs.release-version }} From 4ef1ef58e8f12d2d80e98850d845e262e06878e0 Mon Sep 17 00:00:00 2001 From: Tomer Shafir Date: Tue, 12 May 2026 09:48:19 +0300 Subject: [PATCH 402/538] [AArch64] Add a regression test for Apple tuning features (NFC) (#196792) This patch adds a TableGen regression test that directly checks complete feature lists per generation for Apple CPUs, to guard against changes that can break the association if we lack indirect coverage. A followup patch should introduce generational delta encoding for Apple tuning features that this test should help verify. --- .../TableGen/aarch64-apple-tuning-features.td | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 llvm/test/TableGen/aarch64-apple-tuning-features.td diff --git a/llvm/test/TableGen/aarch64-apple-tuning-features.td b/llvm/test/TableGen/aarch64-apple-tuning-features.td new file mode 100644 index 0000000000000..d0f3142dbe863 --- /dev/null +++ b/llvm/test/TableGen/aarch64-apple-tuning-features.td @@ -0,0 +1,28 @@ +// RUN: llvm-tblgen -print-records %p/../../lib/Target/AArch64/AArch64.td -I %p/../../lib/Target/AArch64 -I %p/../../include | FileCheck %s + +// Verify the resolved tuning feature lists for Apple CPUs. +// Each generation inherits from the previous and applies add/remove deltas.
+// When adding a tuning feature to an Apple core, update the expected list below. + +// CHECK-LABEL: def TuneAppleA10 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128]; +// CHECK-LABEL: def TuneAppleA11 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128]; +// CHECK-LABEL: def TuneAppleA12 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128]; +// CHECK-LABEL: def TuneAppleA13 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128]; +// CHECK-LABEL: def TuneAppleA14 { +// CHECK: list Implies = [FeatureAggressiveFMA, FeatureAlignCmpCSelPairs, FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFastLD1Single, FeatureFuseAddress, FeatureFuseAES, 
FeatureFuseArithmeticLogic, FeatureFuseCmpCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureMaxInterleaveFactor4]; +// CHECK-LABEL: def TuneAppleA15 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFastLD1Single, FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCmpCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureMaxInterleaveFactor4]; +// CHECK-LABEL: def TuneAppleA16 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFastLD1Single, FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCmpCSel, FeatureFuseFCmpFCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureMaxInterleaveFactor4]; +// CHECK-LABEL: def TuneAppleA17 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFastLD1Single, FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCmpCSel, FeatureFuseFCmpFCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, 
FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureMaxInterleaveFactor4]; +// CHECK-LABEL: def TuneAppleA7 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureZCZeroingFPWorkaround]; +// CHECK-LABEL: def TuneAppleM4 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFastLD1Single, FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCmpCSel, FeatureFuseFCmpFCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureMaxInterleaveFactor4]; +// CHECK-LABEL: def TuneAppleM5 { +// CHECK: list Implies = [FeatureAlternateSExtLoadCVTF32Pattern, FeatureAlignCmpCSelPairs, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFastLD1Single, FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCmpCSel, FeatureFuseFCmpFCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, FeatureZCZeroingGPR32, FeatureZCZeroingGPR64, FeatureNoZCZeroingFPR64, FeatureZCZeroingFPR128, FeatureMaxInterleaveFactor4]; From 7fddf992e1dd30c51219e298354a817f87b15a66 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 12 May 2026 08:48:52 +0200 Subject: [PATCH 403/538] [clang][bytecode] Pass correct QualType to getFixedPointSemantics() (#196952) The expression type might be different, so pass the QualType we have 
at hand. --- clang/lib/AST/ByteCode/Compiler.cpp | 2 +- clang/test/AST/ByteCode/fixed-point.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 77518660063fc..683676529d52f 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4756,7 +4756,7 @@ bool Compiler::visitZeroInitializer(PrimType T, QualType QT, return this->emitFloat(F, E); } case PT_FixedPoint: { - auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType()); + auto Sem = Ctx.getASTContext().getFixedPointSemantics(QT); return this->emitConstFixedPoint(FixedPoint::zero(Sem), E); } } diff --git a/clang/test/AST/ByteCode/fixed-point.cpp b/clang/test/AST/ByteCode/fixed-point.cpp index fb44558fc037b..cbf9478f80a3a 100644 --- a/clang/test/AST/ByteCode/fixed-point.cpp +++ b/clang/test/AST/ByteCode/fixed-point.cpp @@ -89,3 +89,8 @@ namespace Cmp { static_assert(A < B); static_assert(A <= B); } + +struct S { + _Accum s[2]; +}; +S s = S(); From cc7353ba52cb1e88b80e7770be4a7f10afedb844 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 12 May 2026 09:01:58 +0200 Subject: [PATCH 404/538] [AMDGPU] Add VOP3 encoding for gfx13 (#196258) Co-authored-by: Ivan Kosarev --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 10 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 54 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 483 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 218 +- llvm/test/MC/AMDGPU/gfx13_asm_vop3-fake16.s | 8182 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx13_asm_vop3.s | 8195 +++++++++++++++++ llvm/test/MC/AMDGPU/gfx13_asm_vop3_aliases.s | 54 + .../MC/AMDGPU/gfx13_asm_vop3_dpp16-fake16.s | 5574 +++++++++++ llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16.s | 5587 +++++++++++ .../MC/AMDGPU/gfx13_asm_vop3_dpp8-fake16.s | 3520 +++++++ llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8.s | 
3524 +++++++ .../AMDGPU/gfx13_asm_vop3_from_vop1-fake16.s | 4106 +++++++++ .../gfx13_asm_vop3_from_vop1_dpp16-fake16.s | 297 + .../gfx13_asm_vop3_from_vop1_dpp8-fake16.s | 87 + 16 files changed, 39598 insertions(+), 300 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_aliases.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp16-fake16.s create mode 100644 llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp8-fake16.s diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 59bdfc177d1d7..85e804e211e72 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9722,7 +9722,9 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx11 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx11 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) { + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx13 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx13) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods Inst.addOperand(Inst.getOperand(0)); } @@ -10295,9 +10297,13 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, int Fi = 0; int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); bool IsVOP3CvtSrDpp = Opc == 
AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx13 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx13 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12; + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx13 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx13; for (unsigned E = Operands.size(); I != E; ++I) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 7c3aff9c9f47f..90b5f7b0b794b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -3095,6 +3095,7 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; +def VOP_V6I32_V32F32_F32 : VOPProfile<[v6i32, v32f32, f32, untyped]>; def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>; def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>; def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b13aed2432602..6feaa4951b14a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -729,9 +729,13 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx11 || Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx11 || Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx13 || Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx13 || Opc == AMDGPU::V_PERMLANE16_VAR_B32_e64_gfx12 || - Opc 
== AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; + Opc == AMDGPU::V_PERMLANE16_VAR_B32_e64_gfx13 || + Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx13; } bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 96737e5e61020..ea2681846e0d6 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1977,23 +1977,29 @@ multiclass VOP2_Real_FULL_with_name_gfx1170 op, string opName, string asmName> : VOP2_Real_FULL_with_name; +multiclass VOP3Only_Realtriple_gfx11_gfx12_gfx13< + bits<10> preGFX13Op, bits<10> op = preGFX13Op> : + VOP3Only_Realtriple, + VOP3Only_Realtriple, + VOP3Only_Realtriple; + multiclass VOP2_Real_e32_gfx11_gfx12 op> : VOP2Only_Real_e32, VOP2Only_Real_e32; -multiclass VOP3Only_Realtriple_gfx11_gfx12 op> : - VOP3Only_Realtriple, VOP3Only_Realtriple; - -multiclass VOP3Only_Realtriple_t16_gfx11_gfx12 op, string asmName, string OpName = NAME> : - VOP3_Realtriple_t16_gfx11, - VOP3_Realtriple_t16_gfx12; +multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, string OpName = NAME> : + VOP3_Realtriple_t16_gfx11, + VOP3_Realtriple_t16_gfx12, + VOP3_Realtriple_t16_gfx13; -multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string OpName = NAME> { - defm _t16: VOP3Only_Realtriple_t16_gfx11_gfx12; - defm _fake16: VOP3Only_Realtriple_t16_gfx11_gfx12; +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, string OpName = NAME> { + defm _t16: VOP3Only_Realtriple_t16_gfx11_gfx12_gfx13; + defm _fake16: VOP3Only_Realtriple_t16_gfx11_gfx12_gfx13; } -multiclass VOP3beOnly_Realtriple_gfx11_gfx12 op> : - VOP3beOnly_Realtriple, VOP3beOnly_Realtriple; +multiclass VOP3beOnly_Realtriple_gfx11_gfx12_gfx13 preGFX13Op, bits<10> op> : + VOP3beOnly_Realtriple, + 
VOP3beOnly_Realtriple, + VOP3beOnly_Realtriple; multiclass VOP2Only_Real_MADK_t16_gfx11_gfx12_gfx13< bits<6> op, string asmName, string opName = NAME> : @@ -2056,19 +2062,19 @@ defm V_FMAMK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12_gfx13 defm V_FMAAK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12_gfx13<0x038, "v_fmaak_f16">; // VOP3 only. -defm V_CNDMASK_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x25d, "v_cndmask_b16">; -defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x31c>; -defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31d>; -defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31e>; -defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31f>; -defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x320>; -defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">; -defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">; -defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11_gfx12<0x323>; -defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11_gfx12<0x324>; -defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x300>; -defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x301>; -defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x302>; +defm V_CNDMASK_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x25d, 0x25d, "v_cndmask_b16">; +defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x31c, 0x362>; +defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x31d>; +defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x31e, 0x364>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x31f, 0x365>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x320, 0x366>; +defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12_gfx13<0x321, 0x368, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">; +defm V_CVT_PK_NORM_U16_F32 : 
VOP3Only_Realtriple_with_name_gfx11_gfx12_gfx13<0x322, 0x369, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">; +defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x323, 0x36a>; +defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x324, 0x36b>; +defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12_gfx13<0x300, 0x30f>; +defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12_gfx13<0x301, 0x310>; +defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12_gfx13<0x302, 0x319>; let SubtargetPredicate = isGFX11Only in { defm : VOP2eInstAliases; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index fc772ffeb1141..e78ecd1af9b45 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1855,12 +1855,14 @@ let SubtargetPredicate = isGFX1250Plus in { } let Constraints = "@earlyclobber $vdst" in { - defm V_CVT_SCALE_PK8_F16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp8>; - defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp8>; - defm V_CVT_SCALE_PK8_F16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_bf8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_bf8>; - defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; - defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; - defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALE_PK8_F16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp8>; + defm V_CVT_SCALE_PK8_BF16_FP8 : 
VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp8>; + defm V_CVT_SCALE_PK8_F16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_bf8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_bf8>; + defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; + defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; + defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + } defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>; defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>; defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_bf6>; @@ -1869,9 +1871,11 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>; } // End Constraints = "@earlyclobber $vdst" - defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; - defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp4", VOP_V8BF16_I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp4>; - defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; + defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp4", 
VOP_V8BF16_I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp4>; + defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; + } } // End ReadsModeReg = 0 let Constraints = "@earlyclobber $vdst" in { @@ -2028,6 +2032,11 @@ let SubtargetPredicate = HasAshrPkInsts in { def : AshrPkU8Pat; } +let SubtargetPredicate = isGFX13Plus in { + defm V_CVT_SCALEF32_PK32_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile>; + defm V_CVT_SCALEF32_PK32_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile>; +} + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// @@ -2090,56 +2099,56 @@ def : MinimumMaximumByMinimum3Maximum3; //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// GFX12. +// GFX12, GFX13. 
//===----------------------------------------------------------------------===// -defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">; -defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">; -defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">; -defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">; -defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">; -defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">; -defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">; -defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">; -defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">; -defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">; -defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>; -defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>; -defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>; -defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>; -defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>; -defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>; -defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>; -defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>; -defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>; -defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>; -defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">; -defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">; +defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12_gfx13<0x229, 0x229, "V_MIN3_F32", "v_min3_num_f32">; +defm 
V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12_gfx13<0x22a, 0x22a, "V_MAX3_F32", "v_max3_num_f32">; +defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12_gfx13<0x22b, 0x351, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">; +defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12_gfx13<0x22c, 0x354, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">; +defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12_gfx13<0x231, 0x231, "V_MED3_F32", "v_med3_num_f32">; +defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12_gfx13<0x232, 0x357, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">; +defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12_gfx13<0x268,0x268, "V_MINMAX_F32", "v_minmax_num_f32">; +defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12_gfx13<0x269, 0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">; +defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12_gfx13<0x26a, 0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">; +defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12_gfx13<0x26b, 0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">; +defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12_gfx13<0x280>; +defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12_gfx13<0x281>; +defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12_gfx13<0x282>; +defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12_gfx13<0x283>; +defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12_gfx13<0x284>; +defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12_gfx13<0x285>; +defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12_gfx13<0x286>; +defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12_gfx13<0x287>; +defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12_gfx13<0x288>; +defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12_gfx13<0x289>; +defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12_NO_DPP_gfx13<0x2fe, 0x2f8, "V_MAD_U64_U32", "v_mad_co_u64_u32">; +defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12_NO_DPP_gfx13<0x2ff, 0x2f9, "V_MAD_I64_I32", "v_mad_co_i64_i32">; let isConvergent = 1 in { - defm V_PERMLANE16_VAR_B32 : 
VOP3Only_Real_Base_gfx12<0x30f>; - defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; + defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x30f, 0x315>; + defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x310, 0x316>; } -defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">; -defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>; +defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250_gfx13<0x233, "v_bitop3_b16">; +defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250_gfx13<0x234>; -defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250_gfx13<0x235>; defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>; defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>; defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>; defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>; -defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; -defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; -defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; -defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; -defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12<0x270>; -defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12<0x271>; -defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12<0x272>; -defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12<0x273>; -defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>; +defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250_gfx13<0x25e>; +defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250_gfx13<0x25f>; +defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250_gfx13<0x260>; +defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250_gfx13<0x261>; +defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x270>; +defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x271>; +defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x272>; 
+defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x273>; +defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12_gfx13<0x314, 0x317>; //===----------------------------------------------------------------------===// // GFX1170 @@ -2147,10 +2156,10 @@ defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>; defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx1170<0x219, "V_MIN3_F32", "v_min3_num_f32">; defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx1170<0x21c, "V_MAX3_F32", "v_max3_num_f32">; -defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x22d>; -defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x22e>; -defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x22f, "v_minimum3_f16">; -defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x230, "v_maximum3_f16">; +defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x22d>; +defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x22e>; +defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x22f, 0x22f, "v_minimum3_f16">; +defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x230, 0x230, "v_maximum3_f16">; defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx1170<0x231, "V_MED3_F32", "v_med3_num_f32">; defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx1170<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">; defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx1170<0x249, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">; @@ -2159,190 +2168,207 @@ defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx1170<0x25e, "V_MAXMIN_F defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx1170<0x25f, "V_MINMAX_F32", "v_minmax_num_f32">; defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx1170<0x260, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">; defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx1170<0x261, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">; -defm V_MINIMUMMAXIMUM_F32 : 
VOP3Only_Realtriple_gfx11_gfx12<0x26c>; -defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x26d>; -defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x26e, "v_minimummaximum_f16">; -defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x26f, "v_maximumminimum_f16">; +defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x26c>; +defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x26d>; +defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x26e, 0x26e, "v_minimummaximum_f16">; +defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x26f, 0x26f, "v_maximumminimum_f16">; defm V_MIN_NUM_F64 : VOP3_Real_Base_gfx1170<0x329, "V_MIN_NUM_F64", "v_min_f64", 1>; defm V_MAX_NUM_F64 : VOP3_Real_Base_gfx1170<0x32a, "V_MAX_NUM_F64", "v_max_f64", 1>; -defm V_MINIMUM_F64 : VOP3Only_Realtriple_gfx11_gfx12<0x341>; -defm V_MAXIMUM_F64 : VOP3Only_Realtriple_gfx11_gfx12<0x342>; -defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x365>; -defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x366>; -defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x367, "v_minimum_f16">; -defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x368, "v_maximum_f16">; +defm V_MINIMUM_F64 : VOP3Only_Realtriple_gfx11_gfx12_NO_DPP_gfx13<0x341, 0x33e>; +defm V_MAXIMUM_F64 : VOP3Only_Realtriple_gfx11_gfx12_NO_DPP_gfx13<0x342, 0x33f>; +defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x365, 0x33c>; +defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx11_gfx12_gfx13<0x366, 0x33d>; +defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x367, 0x33a, "v_minimum_f16">; +defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x368, 0x33b, "v_maximum_f16">; //===----------------------------------------------------------------------===// -// GFX11, GFX12 +// GFX11, GFX12, GFX13 
//===----------------------------------------------------------------------===// -multiclass VOP3_Realtriple_gfx11_gfx12 op> : - VOP3_Realtriple, VOP3_Realtriple; +multiclass VOP3_Realtriple_gfx11_gfx12_gfx13< + bits<10> preGFX13Op, bits<10> op = preGFX13Op> : + VOP3_Realtriple, VOP3_Realtriple, + VOP3_Realtriple; -multiclass VOP3_Real_Base_gfx11_gfx12 op> : - VOP3_Real_Base, VOP3_Real_Base; +multiclass VOP3_Real_Base_gfx11_gfx12_gfx13< + bits<10> preGFX13Op, bits<10> op = preGFX13Op> : + VOP3_Real_Base, VOP3_Real_Base, + VOP3_Real_Base; + +multiclass VOP3_Realtriple_with_name_gfx11_gfx12_gfx13< + bits<10> preGFX13Op, bits<10> op, string opName, string asmName> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250 op> : VOP3_Real_Base, VOP3_Real_Base; -multiclass VOP3_Realtriple_with_name_gfx11_gfx12 op, string opName, - string asmName> : - VOP3_Realtriple_with_name, - VOP3_Realtriple_with_name; +multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250_gfx13 op> : + VOP3_Real_Base_gfx11_gfx12_not_gfx1250, VOP3_Real_Base; multiclass VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string opName = NAME> { defm _t16: VOP3Dot_Realtriple_gfx11_gfx12; defm _fake16: VOP3Dot_Realtriple_gfx11_gfx12; } -multiclass VOP3_Realtriple_t16_gfx11_gfx12 op, string asmName, string opName = NAME, +multiclass VOP3_Realtriple_t16_gfx11_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : - VOP3_Realtriple_with_name, - VOP3_Realtriple_with_name; + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; -multiclass VOP3_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string opName = NAME, +multiclass VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> { - defm opName#"_t16": 
VOP3_Realtriple_t16_gfx11_gfx12; - defm opName#"_fake16": VOP3_Realtriple_t16_gfx11_gfx12; + defm opName#"_t16": VOP3_Realtriple_t16_gfx11_gfx12_gfx13; + defm opName#"_fake16": VOP3_Realtriple_t16_gfx11_gfx12_gfx13; } -multiclass VOP3be_Real_gfx11_gfx12 op, string opName, string asmName> : +multiclass VOP3be_Real_gfx11_gfx12_gfx13< + bits<10> op, string opName, string asmName> : VOP3be_Real, - VOP3be_Real; + VOP3be_Real, + VOP3be_Real; -multiclass VOP3be_Real_gfx11_gfx12_not_gfx1250 op, string opName, string asmName> : +multiclass VOP3be_Real_gfx11_gfx12_not_gfx1250_gfx13 op, string opName, string asmName> : VOP3be_Real, - VOP3be_Real; + VOP3be_Real, + VOP3be_Real; multiclass VOP3be_Realtriple_gfx1250 op> : VOP3be_Realtriple; -multiclass VOP3_Real_No_Suffix_gfx11_gfx12 op> : - VOP3_Real_No_Suffix, VOP3_Real_No_Suffix; - -defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11_gfx12<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; -defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11_gfx12<0x20a>; -defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11_gfx12<0x20b>; -defm V_CUBEID_F32 : VOP3_Realtriple_gfx11_gfx12<0x20c>; -defm V_CUBESC_F32 : VOP3_Realtriple_gfx11_gfx12<0x20d>; -defm V_CUBETC_F32 : VOP3_Realtriple_gfx11_gfx12<0x20e>; -defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x20f>; -defm V_BFE_U32 : VOP3_Realtriple_gfx11_gfx12<0x210>; -defm V_BFE_I32 : VOP3_Realtriple_gfx11_gfx12<0x211>; -defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12<0x212>; -defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x213>; +multiclass VOP3_Real_No_Suffix_gfx11_gfx12_gfx13 op> : + VOP3_Real_No_Suffix, VOP3_Real_No_Suffix, + VOP3_Real_No_Suffix; + +defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; +defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x20a>; +defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x20b>; +defm V_CUBEID_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x20c>; +defm V_CUBESC_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x20d>; 
+defm V_CUBETC_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x20e>; +defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x20f>; +defm V_BFE_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x210>; +defm V_BFE_I32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x211>; +defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x212>; +defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x213>; defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x214>; -defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12<0x215>; -defm V_ALIGNBIT_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x216, "v_alignbit_b32">; -defm V_ALIGNBYTE_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x217, "v_alignbyte_b32">; -defm V_MULLIT_F32 : VOP3_Realtriple_gfx11_gfx12<0x218>; +defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x215>; +defm V_ALIGNBIT_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x216, 0x216, "v_alignbit_b32">; +defm V_ALIGNBYTE_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x217, 0x217, "v_alignbyte_b32">; +defm V_MULLIT_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x218>; defm V_MIN3_F32 : VOP3_Realtriple_gfx11_not_gfx1170<0x219>; -defm V_MIN3_I32 : VOP3_Realtriple_gfx11_gfx12<0x21a>; -defm V_MIN3_U32 : VOP3_Realtriple_gfx11_gfx12<0x21b>; +defm V_MIN3_I32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x21a>; +defm V_MIN3_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x21b>; defm V_MAX3_F32 : VOP3_Realtriple_gfx11_not_gfx1170<0x21c>; -defm V_MAX3_I32 : VOP3_Realtriple_gfx11_gfx12<0x21d>; -defm V_MAX3_U32 : VOP3_Realtriple_gfx11_gfx12<0x21e>; +defm V_MAX3_I32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x21d>; +defm V_MAX3_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x21e>; defm V_MED3_F32 : VOP3_Realtriple_gfx11_not_gfx1170<0x21f>; -defm V_MED3_I32 : VOP3_Realtriple_gfx11_gfx12<0x220>; -defm V_MED3_U32 : VOP3_Realtriple_gfx11_gfx12<0x221>; -defm V_SAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x222>; -defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11_gfx12<0x223>; -defm V_SAD_U16 : VOP3_Realtriple_gfx11_gfx12<0x224>; -defm V_SAD_U32 : 
VOP3_Realtriple_gfx11_gfx12<0x225>; -defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11_gfx12<0x226>; -defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11_gfx12<0x227>; -defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x228>; -defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11_gfx12<0x237>; -defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x238>; -defm V_MSAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x239>; -defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>; -defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>; -defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11_gfx12<0x23d>; -defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x240>; -defm V_MAD_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x241, "v_mad_u16", "V_MAD_U16_gfx9">; -defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>; -defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>; -defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>; -defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>; -defm V_FMA_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x248, "v_fma_f16", "V_FMA_F16_gfx9">; +defm V_MED3_I32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x220>; +defm V_MED3_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x221>; +defm V_SAD_U8 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x222>; +defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x223>; +defm V_SAD_U16 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x224>; +defm V_SAD_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x225>; +defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x226>; +defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x227>; +defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250_gfx13<0x228>; +defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x237>; +defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250_gfx13<0x238>; +defm V_MSAD_U8 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x239>; +defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x23a>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x23b>; +defm V_MQSAD_U32_U8 : 
VOP3_Real_Base_gfx11_gfx12_gfx13<0x23d>; +defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x240>; +defm V_MAD_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x241, 0x340, "v_mad_u16", "V_MAD_U16_gfx9">; +defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x244, 0x344>; +defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x245, 0x345>; +defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x246, 0x346>; +defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x247, 0x347>; +defm V_FMA_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x248, 0x34b, "v_fma_f16", "V_FMA_F16_gfx9">; defm V_MIN3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x249, "v_min3_f16">; -defm V_MIN3_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24a, "v_min3_i16">; -defm V_MIN3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24b, "v_min3_u16">; +defm V_MIN3_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x24a, 0x352, "v_min3_i16">; +defm V_MIN3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x24b, 0x353, "v_min3_u16">; defm V_MAX3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24c, "v_max3_f16">; -defm V_MAX3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24d, "v_max3_i16">; -defm V_MAX3_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24e, "v_max3_u16">; +defm V_MAX3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x24d, 0x355, "v_max3_i16">; +defm V_MAX3_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x24e, 0x356, "v_max3_u16">; defm V_MED3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24f, "v_med3_f16">; -defm V_MED3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x250, "v_med3_i16">; -defm V_MED3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x251, "v_med3_u16">; -defm V_MAD_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x253, "v_mad_i16", "V_MAD_I16_gfx9">; -defm V_DIV_FIXUP_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x254, "v_div_fixup_f16", "V_DIV_FIXUP_F16_gfx9">; -defm V_ADD3_U32 : 
VOP3_Realtriple_gfx11_gfx12<0x255>; -defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>; -defm V_AND_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x257>; -defm V_OR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x258>; -defm V_MAD_U32_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x259, "v_mad_u32_u16">; -defm V_MAD_I32_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x25a, "v_mad_i32_i16">; -defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>; -defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>; +defm V_MED3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x250, 0x358, "v_med3_i16">; +defm V_MED3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x251, 0x359, "v_med3_u16">; +defm V_MAD_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x253, 0x35e, "v_mad_i16", "V_MAD_I16_gfx9">; +defm V_DIV_FIXUP_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x254, 0x35f, "v_div_fixup_f16", "V_DIV_FIXUP_F16_gfx9">; +defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x255, 0x36d>; +defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x256, 0x36f>; +defm V_AND_OR_B32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x257, 0x371>; +defm V_OR3_B32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x258, 0x372>; +defm V_MAD_U32_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x259, 0x373, "v_mad_u32_u16">; +defm V_MAD_I32_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x25a, 0x375, "v_mad_i32_i16">; +defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x25b, 0x377>; +defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x25c, 0x378>; defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; defm V_MAXMIN_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">; defm V_MINMAX_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">; -defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>; -defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>; -defm V_MAXMIN_I32 : 
VOP3_Realtriple_gfx11_gfx12<0x264>; -defm V_MINMAX_I32 : VOP3_Realtriple_gfx11_gfx12<0x265>; +defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x262>; +defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x263>; +defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x264>; +defm V_MINMAX_I32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x265>; defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x266, "v_dot2_f16_f16">; defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x267, "v_dot2_bf16_bf16">; -defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; -defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12_not_gfx1250<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; +defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12_gfx13<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; +defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12_not_gfx1250_gfx13<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; -defm V_ADD_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x303, "v_add_nc_u16">; -defm V_SUB_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x304, "v_sub_nc_u16">; -defm V_MUL_LO_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x305, "v_mul_lo_u16">; -defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12<0x306>; -defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12<0x307>; -defm V_MAX_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x309, "v_max_u16">; -defm V_MAX_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30a, "v_max_i16">; -defm V_MIN_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30b, "v_min_u16">; -defm V_MIN_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30c, "v_min_i16">; -defm V_ADD_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30d, "v_add_nc_i16", "V_ADD_I16">; -defm V_SUB_NC_I16 : 
VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30e, "v_sub_nc_i16", "V_SUB_I16">; -defm V_PACK_B32_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x311, "v_pack_b32_f16">; -defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x312, "v_cvt_pk_norm_i16_f16", "V_CVT_PKNORM_I16_F16", "v_cvt_pknorm_i16_f16">; -defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x313, "v_cvt_pk_norm_u16_f16", "V_CVT_PKNORM_U16_F16", "v_cvt_pknorm_u16_f16">; -defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x325, "V_SUB_I32", "v_sub_nc_i32">; -defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x326, "V_ADD_I32", "v_add_nc_i32">; +defm V_ADD_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x303, 0x303, "v_add_nc_u16">; +defm V_SUB_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x304, 0x304, "v_sub_nc_u16">; +defm V_MUL_LO_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x305, 0x305, "v_mul_lo_u16">; +defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x306, 0x321>; +defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12_gfx13<0x307, 0x322>; +defm V_MAX_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x309, 0x309, "v_max_u16">; +defm V_MAX_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x30a, 0x30a, "v_max_i16">; +defm V_MIN_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x30b, 0x30b, "v_min_u16">; +defm V_MIN_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x30c, 0x30c, "v_min_i16">; +defm V_ADD_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x30d, 0x30d, "v_add_nc_i16", "V_ADD_I16">; +defm V_SUB_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x30e, 0x30e, "v_sub_nc_i16", "V_SUB_I16">; +defm V_PACK_B32_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x311, 0x311, "v_pack_b32_f16">; +defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x312, 0x312, "v_cvt_pk_norm_i16_f16", 
"V_CVT_PKNORM_I16_F16", "v_cvt_pknorm_i16_f16">; +defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x313, 0x313, "v_cvt_pk_norm_u16_f16", "V_CVT_PKNORM_U16_F16", "v_cvt_pknorm_u16_f16">; +defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12_gfx13<0x325, 0x376, "V_SUB_I32", "v_sub_nc_i32">; +defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12_gfx13<0x326, 0x37f, "V_ADD_I32", "v_add_nc_i32">; defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>; defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32b>; -defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>; -defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>; -defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>; -defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>; -defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">; -defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">; -defm V_ASHRREV_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x33a, "v_ashrrev_i16">; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250_gfx13<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250_gfx13<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250_gfx13<0x32e>; +defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12_gfx13<0x32f>; +defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x338, 0x314, "v_lshlrev_b16">; +defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x339, 0x339, "v_lshrrev_b16">; +defm V_ASHRREV_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x33a, 0x308, "v_ashrrev_i16">; defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>; defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x33d>; defm V_ASHRREV_I64 : 
VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x33e>; -defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x360>; // Pseudo in VOP2 +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12_gfx13<0x360>; // Pseudo in VOP2 let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { - defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x361>; // Pseudo in VOP2 + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12_gfx13<0x361>; // Pseudo in VOP2 } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) -defm V_AND_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x362, "v_and_b16">; -defm V_OR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x363, "v_or_b16">; -defm V_XOR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x364, "v_xor_b16">; +defm V_AND_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x362, 0x367, "v_and_b16">; +defm V_OR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x363, 0x325, "v_or_b16">; +defm V_XOR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x364, 0x370, "v_xor_b16">; + +def : AMDGPUMnemonicAlias<"v_perm_pk4_b8_u8", "v_perm_b32"> { + let AssemblerPredicate = isGFX13Plus; +} -defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<0x369, "v_cvt_pk_fp8_f32">; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250_gfx13<0x369, 0x37a, "v_cvt_pk_fp8_f32">; defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x369, "v_cvt_pk_fp8_f32">; -defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36a, "v_cvt_pk_bf8_f32">; -defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32">; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_gfx13<0x36a, 0x37b, "v_cvt_pk_bf8_f32">; +defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250_gfx13<0x36b, 
0x337, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32">; defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Only_Realtriple_with_name_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx1250", "v_cvt_sr_fp8_f32">; -defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">; +defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_gfx13<0x36c, 0x338, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">; let AssemblerPredicate = isGFX11Plus in { def : AMDGPUMnemonicAlias<"v_add3_nc_u32", "v_add3_u32">; @@ -2350,41 +2376,41 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: -defm V_FMA_F64 : VOP3Only_Realtriple_gfx1250<0x214>; +defm V_FMA_F64 : VOP3Only_Realtriple_gfx1250_gfx13<0x214>; defm V_DIV_FIXUP_F64 : VOP3Only_Realtriple_gfx1250<0x228>; defm V_DIV_FMAS_F64 : VOP3Only_Realtriple_gfx1250<0x238>; defm V_DIV_SCALE_F64 : VOP3be_Realtriple_gfx1250<0x2fd>; -defm V_LDEXP_F64 : VOP3Only_Realtriple_gfx1250<0x32b>; +defm V_LDEXP_F64 : VOP3Only_Realtriple_gfx1250_gfx13<0x32b>; defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>; defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>; defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>; -defm V_LSHRREV_B64 : VOP3Only_Realtriple_gfx1250<0x33d>; -defm V_ASHRREV_I64 : VOP3Only_Realtriple_gfx1250<0x33e>; - -defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; -defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; -defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; -defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; -defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; -defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; -defm V_CVT_SCALE_PK8_F16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x29f>; -defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a0>; -defm V_CVT_SCALE_PK8_F32_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a1>; -defm V_CVT_SCALE_PK8_F16_FP8 : 
VOP3Only_ScaleSel_Real_gfx1250<0x2a8>; -defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a9>; -defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>; -defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>; -defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>; -defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>; -defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>; -defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>; -defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>; -defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>; -defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>; -defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>; -defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>; -defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>; -defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>; +defm V_LSHRREV_B64 : VOP3Only_Realtriple_gfx1250_gfx13<0x33d, 0x300>; +defm V_ASHRREV_I64 : VOP3Only_Realtriple_gfx1250_gfx13<0x33e, 0x301>; + +defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250_gfx13<0x23f, 0x348>; +defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250_gfx13<0x242, 0x349>; +defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250_gfx13<0x243, 0x34a>; +defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250_DPP8_gfx13<0x252>; +defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250_gfx13<0x290, 0x290>; +defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250_gfx13<0x291, 0x291>; +defm V_CVT_SCALE_PK8_F16_FP4 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x29f, 0x2bd>; +defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2a0, 0x2ba>; +defm V_CVT_SCALE_PK8_F32_FP4 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2a1, 0x2c0>; +defm V_CVT_SCALE_PK8_F16_FP8 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2a8, 0x2be>; +defm V_CVT_SCALE_PK8_BF16_FP8 : 
VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2a9, 0x2bb>; +defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2aa, 0x2c1>; +defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2ab, 0x2bc>; +defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2ac, 0x2b9>; +defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250_gfx13<0x2ad, 0x2bf>; +defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250_gfx13<0x2b0, 0x2a0>; +defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2b3, 0x29f>; +defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2b4, 0x2a1>; +defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2b5, 0x29b>; +defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2b8, 0x29e>; +defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c3, 0x2a3>; +defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c4, 0x2a2>; +defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c5, 0x29d>; +defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c6, 0x29c>; defm V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>; defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>; defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>; @@ -2403,23 +2429,38 @@ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>; defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>; defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>; defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>; -defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; -defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; -defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; -defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>; -defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 
: VOP3Only_Real_Base_gfx1250<0x2bc>; -defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>; -defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>; -defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>; -defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>; -defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; -defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; -defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; -defm V_CVT_SR_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x370>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250_gfx13<0x297, 0x2af>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250_gfx13<0x298, 0x2b2>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250_gfx13<0x299, 0x2ac>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2b9, 0x2ae>; +defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2bc, 0x2ad>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2bf, 0x2b1>; +defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c0, 0x2b0>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c1, 0x2ab>; +defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250_gfx13<0x2c2, 0x2aa>; +defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250_gfx13<0x36d, 0x36e>; +defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250_gfx13<0x36e, 0x2c2>; +defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250_gfx13<0x36f, 0x374>; +defm V_CVT_SR_PK_F16_F32 : VOP3Only_Realtriple_gfx1250_gfx13<0x370, 0x2c3>; defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">; defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">; -defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>; -defm V_CVT_SR_BF8_F16 : 
VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>; +defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250_gfx13<0x374, 0x335>; +defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250_gfx13<0x375, 0x336>; + +let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Only_Real_Base_gfx13<0x295>; + defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Only_Real_Base_gfx13<0x296>; + defm V_CVT_SCALEF32_PK32_BF6_F32 : VOP3Only_Real_Base_gfx13<0x297>; + defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Only_Real_Base_gfx13<0x298>; + defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Only_Real_Base_gfx13<0x299>; + defm V_CVT_SCALEF32_PK32_FP6_F32 : VOP3Only_Real_Base_gfx13<0x29a>; + defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3Only_Real_Base_gfx13<0x2a4>; + defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3Only_Real_Base_gfx13<0x2a5>; + defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3Only_Real_Base_gfx13<0x2a6>; + defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3Only_Real_Base_gfx13<0x2a7>; + defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3Only_Real_Base_gfx13<0x2a8>; + defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3Only_Real_Base_gfx13<0x2a9>; +} // End WaveSizePredicate = isWave32 //===----------------------------------------------------------------------===// // GFX10. 
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 4f9b679e05ea3..dac9c2aeff95a 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -418,7 +418,7 @@ class VOP3a_BITOP3_gfx12 op, VOPProfile p> : VOP3e_gfx11_gfx12 { let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0); } -class VOP3a_ScaleSel_gfx1250 op, VOPProfile p> : VOP3e_gfx11_gfx12 { +class VOP3a_ScaleSel_gfx1250_gfx13 op, VOPProfile p> : VOP3e_gfx11_gfx12 { bits<4> scale_sel; let Inst{14-11} = scale_sel; @@ -1821,7 +1821,7 @@ class VOP3b_DPP8_Base_t16 op, VOP_Pseudo ps, string opName = ps.OpName> } //===----------------------------------------------------------------------===// -// VOP3 GFX11, GFX12 +// VOP3 GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass VOP3_Real_Base op, string opName = NAME, @@ -1901,6 +1901,11 @@ multiclass VOP3_Real_with_name op, string opName, } } +multiclass VOP3_Real_with_name_gfx12_gfx13< + bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Real_with_name, + VOP3_Real_with_name; + multiclass VOP3_Real_with_name_gfx11_gfx12_gfx13< bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : VOP3_Real_with_name, @@ -2012,10 +2017,19 @@ multiclass VOP3be_Real_dpp8 op, string opName, // VOP1 and VOP2 depend on these triple defs multiclass VOP3_Realtriple op, bit isSingle = 0, - string opName = NAME> : - VOP3_Real_Base, - VOP3_Real_dpp_Base, - VOP3_Real_dpp8_Base; + string opName = NAME> { + defm NAME : VOP3_Real_Base, + VOP3_Real_dpp_Base; + + defvar ps = !cast(opName#"_e64"); + if !not(ps.Pfl.HasExt64BitDPP) then + defm NAME : VOP3_Real_dpp8_Base; +} + +multiclass VOP3_Realtriple_DPP8 op> : + VOP3_Real_Base, + VOP3_Real_dpp_Base, + VOP3_Real_dpp8_Base; multiclass VOP3Dot_Realtriple op, string asmName, bit isSingle = 0, string opName = 
NAME> : @@ -2073,25 +2087,35 @@ multiclass VOP3Only_ScaleSel_Real_gfx1250 op> { defvar ps = !cast(NAME#"_e64"); def _e64_gfx1250 : VOP3_Real_Gen, - VOP3a_ScaleSel_gfx1250; + VOP3a_ScaleSel_gfx1250_gfx13; +} + +multiclass VOP3Only_ScaleSel_Real_gfx1250_gfx13 preGFX13, bits<10> op = preGFX13> + : VOP3Only_ScaleSel_Real_gfx1250 { + defvar ps = !cast(NAME#"_e64"); + def _e64_gfx13 : + VOP3_Real_Gen, + VOP3a_ScaleSel_gfx1250_gfx13; } -multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250 op, string asmName, string opName = NAME, +multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250_gfx13 preGFX13Op, bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : - VOP3_Realtriple_with_name, - VOP3_Realtriple_with_name; + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; -multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250 op, string asmName, +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250_gfx13 preGFX13Op, bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = ""> { - defm _t16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250; - defm _fake16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250; + defm _t16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250_gfx13; + defm _fake16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250_gfx13; } -multiclass VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250 op, string opName, +multiclass VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250_gfx13 preGFX13Op, bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : - VOP3_Realtriple_with_name, - VOP3_Realtriple_with_name; + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; //===----------------------------------------------------------------------===// // VOP3 GFX11 @@ -2164,21 +2188,29 @@ multiclass VOP3Only_Real_Base_gfx11_gfx12 op> : VOP3_Real_Base; 
//===----------------------------------------------------------------------===// -// VOP3 GFX12 +// VOP3 GFX12, GFX13 //===----------------------------------------------------------------------===// -multiclass VOP3Only_Realtriple_gfx12 op, bit isSingle = 0> : - VOP3_Realtriple; +multiclass VOP3Only_Realtriple_gfx12_gfx13 gfx12Op, bits<10> op = gfx12Op> : + VOP3_Realtriple, + VOP3_Realtriple; + +multiclass VOP3Only_Realtriple_gfx11_gfx12_NO_DPP_gfx13 gfx12Op, bits<10> op> : + VOP3_Realtriple, + VOP3_Realtriple, + VOP3_Real_Base; // IsSingle is captured from the vopprofile for these instructions, but the // following alternative is more explicit -multiclass VOP3Only_Real_Base_gfx12 op> : - VOP3_Real_Base; +multiclass VOP3Only_Real_Base_gfx12_gfx13 gfx12Op, bits<10> op = gfx12Op> : + VOP3_Real_Base, + VOP3_Real_Base; -multiclass VOP3Only_Realtriple_with_name_gfx12_not_gfx1250 op, string opName, - string asmName, string pseudo_mnemonic = "", - bit isSingle = 0> : - VOP3_Realtriple_with_name; +multiclass VOP3Only_Realtriple_with_name_gfx12_not_gfx1250_gfx13 gfx12Op, bits<10> op, string opName, + string asmName, string pseudo_mnemonic = "", + bit isSingle = 0> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; multiclass VOP3Only_Real_Base_gfx1250 op> : VOP3_Real_Base; @@ -2186,8 +2218,17 @@ multiclass VOP3Only_Real_Base_gfx1250 op> : multiclass VOP3Only_Realtriple_gfx1250 op, bit isSingle = 0> : VOP3_Realtriple; -multiclass VOP3Only_Realtriple_gfx12_not_gfx1250 op, bit isSingle = 0> : - VOP3_Realtriple; +multiclass VOP3Only_Real_Base_gfx13 op> : + VOP3_Real_Base; + +multiclass VOP3Only_Real_Base_gfx1250_gfx13 gfx1250Op, bits<10> op> : + VOP3Only_Real_Base_gfx1250, + VOP3Only_Real_Base_gfx13; + +multiclass VOP3Only_Realtriple_gfx11_gfx12_not_gfx1250_gfx13 gfx12Op, bits<10> op, bit isSingle = 0> : + VOP3_Realtriple, + VOP3_Realtriple, + VOP3_Realtriple; multiclass VOP3Only_Realtriple_with_name_gfx1250 op, string opName, string asmName, string pseudo_mnemonic = 
"", @@ -2198,24 +2239,86 @@ multiclass VOP3Only_Realtriple_t16_gfx1250 op, string asmName = !cast : VOP3Only_Realtriple_with_name_gfx1250; +multiclass VOP3Only_Realtriple_with_name_gfx1250_gfx13 preGFX13Op, bits<10> op, string opName, + string asmName, string pseudo_mnemonic = "", + bit isSingle = 0> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; + +multiclass VOP3Only_Realtriple_t16_gfx1250_gfx13 preGFX13Op, bits<10> op, string asmName = !cast(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3Only_Realtriple_with_name_gfx1250_gfx13; + +multiclass VOP3Only_Realtriple_gfx1250_gfx13< + bits<10> gfx1250Op, bits<10> op = gfx1250Op> : + VOP3_Realtriple, + VOP3_Realtriple; + +multiclass VOP3Only_Realtriple_gfx1250_DPP8_gfx13 op> : + VOP3_Realtriple, + VOP3_Realtriple_DPP8; + +multiclass VOP3Only_Realtriple_gfx1250_NO_DPP_gfx13 op> : + VOP3_Realtriple, + VOP3_Real_Base; + +multiclass VOP3be_Real_with_name_gfx12_NO_DPP_gfx13< + bits<10> gfx12Op, bits<10> op, string opName, string asmName, bit isSingle = 0> { + defvar ps = !cast(opName#"_e64"); + defm NAME : VOP3be_Realtriple; + defm NAME : VOP3be_Real; + def : AMDGPUMnemonicAlias { + let AssemblerPredicate = isGFX12Plus; + } +} + multiclass VOP3_Realtriple_t16_gfx12 op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : VOP3_Realtriple_with_name; -multiclass VOP3_Realtriple_t16_and_fake16_gfx12 op, string asmName, string opName = NAME, - string pseudo_mnemonic = "", bit isSingle = 0> { - defm _t16:VOP3_Realtriple_t16_gfx12; - defm _fake16:VOP3_Realtriple_t16_gfx12; +multiclass VOP3_Realtriple_t16_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; + +multiclass VOP3_Realtriple_t16_and_fake16_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, string opName = NAME, + string pseudo_mnemonic = 
"", bit isSingle = 0> { + defm _t16:VOP3_Realtriple_t16_gfx12_gfx13; + defm _fake16:VOP3_Realtriple_t16_gfx12_gfx13; } -multiclass VOP3Only_Realtriple_t16_gfx12 op, string asmName, - string opName = NAME, string pseudo_mnemonic = ""> - : VOP3_Realtriple_t16_gfx12; +multiclass VOP3_Realtriple_t16_gfx13 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name; + +multiclass VOP3_Realtriple_t16_and_fake16_gfx13 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> { + defm opName#"_t16":VOP3_Realtriple_t16_gfx13; + defm opName#"_fake16":VOP3_Realtriple_t16_gfx13; +} -multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12 op, string asmName, - string opName = NAME, string pseudo_mnemonic = ""> { - defm _t16 : VOP3Only_Realtriple_t16_gfx12; - defm _fake16 : VOP3Only_Realtriple_t16_gfx12; +multiclass VOP3_Realtriple_with_name_gfx12_gfx13< + bits<10> gfx12Op, bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; + +multiclass VOP3Only_Realtriple_t16_gfx12_gfx13 preGFX13Op, bits<10> op, string asmName, + string opName = NAME, string pseudo_mnemonic = ""> + : VOP3_Realtriple_t16_gfx12_gfx13; + +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12_gfx13 preGFX13Op, bits<10> op, + string asmName, string opName = NAME, string pseudo_mnemonic = ""> { + defm _t16 : VOP3Only_Realtriple_t16_gfx12_gfx13; + defm _fake16 : VOP3Only_Realtriple_t16_gfx12_gfx13; +} + +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250_gfx13 preGFX13Op, bits<10> op, + string asmName = !cast(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = ""> { + defm _t16 : VOP3Only_Realtriple_t16_gfx1250_gfx13; + defm _fake16 : VOP3Only_Realtriple_t16_gfx1250_gfx13; } multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250 op, @@ -2235,23 +2338,28 @@ multiclass VOP3be_Real_with_name_gfx12 op, string 
opName, } } -multiclass VOP3_Realtriple_with_name_gfx12 op, string opName, - string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : - VOP3_Realtriple_with_name; +multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12_gfx13< + bits<10> preGFX13Op, bits<10> op, string opName, string asmName> : + VOP3Only_Realtriple_with_name, + VOP3Only_Realtriple_with_name, + VOP3Only_Realtriple_with_name; + +multiclass VOP3Only_Realtriple_with_name_t16_gfx12 op, string asmName, + string opName = NAME> + : VOP3Only_Realtriple_with_name; -multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12 op, string opName, - string asmName> : - VOP3Only_Realtriple_with_name, - VOP3Only_Realtriple_with_name; +multiclass VOP3_Real_BITOP3 op, string asmName> : + VOP3_BITOP3_Real_Base, + VOP3_BITOP3_Real_dpp_Base, + VOP3_BITOP3_Real_dpp8_Base; -multiclass VOP3_Real_BITOP3_gfx1250 op, string asmName = !cast(NAME#"_e64").Mnemonic> : - VOP3_BITOP3_Real_Base, - VOP3_BITOP3_Real_dpp_Base, - VOP3_BITOP3_Real_dpp8_Base; +multiclass VOP3_Real_BITOP3_gfx1250_gfx13 op, string asmName = !cast(NAME#"_e64").Mnemonic> : + VOP3_Real_BITOP3, + VOP3_Real_BITOP3; -multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250 op, string asmName = !cast(NAME#"_e64").Mnemonic> { - defm _t16 : VOP3_Real_BITOP3_gfx1250; - defm _fake16: VOP3_Real_BITOP3_gfx1250; +multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250_gfx13 op, string asmName = !cast(NAME#"_e64").Mnemonic> { + defm _t16 : VOP3_Real_BITOP3_gfx1250_gfx13; + defm _fake16: VOP3_Real_BITOP3_gfx1250_gfx13; } multiclass VOP3Dot_Realtriple_gfx11_gfx12 op, string asmName, bit isSingle = 0, @@ -2264,6 +2372,12 @@ multiclass VOP3_Real_with_name_gfx11_gfx12 op, string opName, VOP3_Real_with_name, VOP3_Real_with_name; +multiclass VOP3Only_Real_with_name_t16_and_fake16_gfx13 op, string asmName=!tolower(NAME), + string opName=NAME, string pseudo_mnemonic = ""> { + defm _t16: VOP3_Real_with_name; + defm _fake16: VOP3_Real_with_name; +} + 
//===----------------------------------------------------------------------===// include "VOPCInstructions.td" diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3-fake16.s new file mode 100644 index 0000000000000..faf477732ebde --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3-fake16.s @@ -0,0 +1,8182 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W32,GFX13-ASM,W32-ASM %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W32,GFX13-DIS,W32-DIS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W64,GFX13-ASM,W64-ASM %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W64,GFX13-DIS,W64-DIS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_add3_u32 v5, v1, v2, s3 +// GFX13: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x0e,0x00] + +v_add3_u32 v5, v255, s2, s105 +// GFX13: v_add3_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0x05,0xa4,0x01] + +v_add3_u32 v5, s1, v255, 
exec_hi +// GFX13: v_add3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0xfe,0xff,0x01] + +v_add3_u32 v5, s105, s105, exec_lo +// GFX13: v_add3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0xf8,0x01] + +v_add3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_add3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x0c,0x04] + +v_add3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_add3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_add3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_add3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0xed,0x01] + +v_add3_u32 v5, m0, 0.5, m0 +// GFX13: v_add3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0xf5,0x01] + +v_add3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_add3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0xad,0x01] + +v_add3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_add3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0xa8,0x01] + +v_add3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_add3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_add3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_add3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0xf4,0x03] + +v_add3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_add3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0xc0,0x03] + +v_add3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_add3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x04,0x03] + +v_add3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_add3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x6d,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_add_co_u32 v5, s6, v1, v2 +// W32: v_add_co_u32 v5, s6, v1, v2 ; encoding: [0x05,0x06,0x0f,0xd7,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: 
invalid operand for instruction + +v_add_co_u32 v5, s6, v255, v255 +// W32: v_add_co_u32 v5, s6, v255, v255 ; encoding: [0x05,0x06,0x0f,0xd7,0xff,0xff,0x03,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, s1, s2 +// W32: v_add_co_u32 v5, s6, s1, s2 ; encoding: [0x05,0x06,0x0f,0xd7,0x01,0x04,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, s105, s105 +// W32: v_add_co_u32 v5, s6, s105, s105 ; encoding: [0x05,0x06,0x0f,0xd7,0x69,0xd2,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, vcc_lo, ttmp15 +// W32: v_add_co_u32 v5, s6, vcc_lo, ttmp15 ; encoding: [0x05,0x06,0x0f,0xd7,0x6a,0xf6,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, vcc_hi, 0xaf123456 +// W32: v_add_co_u32 v5, s6, vcc_hi, 0xaf123456 ; encoding: [0x05,0x06,0x0f,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, ttmp15, src_scc +// W32: v_add_co_u32 v5, s6, ttmp15, src_scc ; encoding: [0x05,0x06,0x0f,0xd7,0x7b,0xfa,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, m0, 0.5 +// W32: v_add_co_u32 v5, s6, m0, 0.5 ; encoding: [0x05,0x06,0x0f,0xd7,0x7d,0xe0,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, exec_lo, -1 +// W32: v_add_co_u32 v5, s6, exec_lo, -1 ; encoding: [0x05,0x06,0x0f,0xd7,0x7e,0x82,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, exec_hi, null +// W32: v_add_co_u32 v5, s6, exec_hi, null ; encoding: [0x05,0x06,0x0f,0xd7,0x7f,0xf8,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s105, null, exec_lo +// W32: v_add_co_u32 v5, s105, null, exec_lo ; encoding: 
[0x05,0x69,0x0f,0xd7,0x7c,0xfc,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, vcc_lo, -1, exec_hi +// W32: v_add_co_u32 v5, vcc_lo, -1, exec_hi ; encoding: [0x05,0x6a,0x0f,0xd7,0xc1,0xfe,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, vcc_hi, 0.5, m0 +// W32: v_add_co_u32 v5, vcc_hi, 0.5, m0 ; encoding: [0x05,0x6b,0x0f,0xd7,0xf0,0xfa,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, ttmp15, src_scc, vcc_lo +// W32: v_add_co_u32 v5, ttmp15, src_scc, vcc_lo ; encoding: [0x05,0x7b,0x0f,0xd7,0xfd,0xd4,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s[12:13], v1, v2 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], v1, v2 ; encoding: [0x05,0x0c,0x0f,0xd7,0x01,0x05,0x02,0x02] + +v_add_co_u32 v5, s[12:13], v255, v255 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], v255, v255 ; encoding: [0x05,0x0c,0x0f,0xd7,0xff,0xff,0x03,0x02] + +v_add_co_u32 v5, s[12:13], s1, s2 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], s1, s2 ; encoding: [0x05,0x0c,0x0f,0xd7,0x01,0x04,0x00,0x02] + +v_add_co_u32 v5, s[12:13], s105, s105 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], s105, s105 ; encoding: [0x05,0x0c,0x0f,0xd7,0x69,0xd2,0x00,0x02] + +v_add_co_u32 v5, s[12:13], vcc_lo, ttmp15 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], vcc_lo, ttmp15 ; encoding: [0x05,0x0c,0x0f,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 ; encoding: 
[0x05,0x0c,0x0f,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_add_co_u32 v5, s[12:13], ttmp15, src_scc +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], ttmp15, src_scc ; encoding: [0x05,0x0c,0x0f,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_co_u32 v5, s[12:13], m0, 0.5 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], m0, 0.5 ; encoding: [0x05,0x0c,0x0f,0xd7,0x7d,0xe0,0x01,0x02] + +v_add_co_u32 v5, s[12:13], exec_lo, -1 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], exec_lo, -1 ; encoding: [0x05,0x0c,0x0f,0xd7,0x7e,0x82,0x01,0x02] + +v_add_co_u32 v5, s[12:13], exec_hi, null +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], exec_hi, null ; encoding: [0x05,0x0c,0x0f,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_co_u32 v5, s[12:13], null, exec_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], null, exec_lo ; encoding: [0x05,0x0c,0x0f,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_co_u32 v5, s[104:105], -1, exec_hi +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[104:105], -1, exec_hi ; encoding: [0x05,0x68,0x0f,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_co_u32 v5, vcc, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// W64: v_add_co_u32 v5, vcc, 0.5, m0 ; encoding: [0x05,0x6a,0x0f,0xd7,0xf0,0xfa,0x00,0x02] + +v_add_co_u32 v5, ttmp[14:15], src_scc, vcc_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, ttmp[14:15], src_scc, vcc_lo ; encoding: [0x05,0x7a,0x0f,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_co_u32 v255, null, 0xaf123456, vcc_hi clamp +// GFX13: v_add_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x0f,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_add_lshl_u32 v5, v1, 
v2, s3 +// GFX13: v_add_lshl_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x47,0xd7,0x01,0x05,0x0e,0x00] + +v_add_lshl_u32 v5, v255, s2, s105 +// GFX13: v_add_lshl_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x47,0xd7,0xff,0x05,0xa4,0x01] + +v_add_lshl_u32 v5, s1, v255, exec_hi +// GFX13: v_add_lshl_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x47,0xd7,0x01,0xfe,0xff,0x01] + +v_add_lshl_u32 v5, s105, s105, exec_lo +// GFX13: v_add_lshl_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x47,0xd7,0x69,0xd2,0xf8,0x01] + +v_add_lshl_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_add_lshl_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x47,0xd7,0x6a,0xf6,0x0c,0x04] + +v_add_lshl_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_add_lshl_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x47,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_add_lshl_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_add_lshl_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x47,0xd7,0x7b,0xfa,0xed,0x01] + +v_add_lshl_u32 v5, m0, 0.5, m0 +// GFX13: v_add_lshl_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x47,0xd7,0x7d,0xe0,0xf5,0x01] + +v_add_lshl_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_add_lshl_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x47,0xd7,0x7e,0x82,0xad,0x01] + +v_add_lshl_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_add_lshl_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x47,0xd7,0x7f,0xf8,0xa8,0x01] + +v_add_lshl_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_add_lshl_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x47,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_add_lshl_u32 v5, -1, exec_hi, src_scc +// GFX13: v_add_lshl_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x47,0xd7,0xc1,0xfe,0xf4,0x03] + +v_add_lshl_u32 v5, 0.5, m0, 0.5 +// GFX13: v_add_lshl_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x47,0xd7,0xf0,0xfa,0xc0,0x03] + +v_add_lshl_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_add_lshl_u32 v5, src_scc, vcc_lo, -1 ; encoding: 
[0x05,0x00,0x47,0xd7,0xfd,0xd4,0x04,0x03] + +v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_add_nc_i16 v5, v1, v2 +// GFX13: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x02] + +v_add_nc_i16 v5, v255, v255 +// GFX13: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x02] + +v_add_nc_i16 v5, s1, s2 +// GFX13: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x02] + +v_add_nc_i16 v5, s105, s105 +// GFX13: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x02] + +v_add_nc_i16 v5, vcc_lo, ttmp15 +// GFX13: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_nc_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_add_nc_i16 v5, ttmp15, src_scc +// GFX13: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_nc_i16 v5, m0, 0.5 +// GFX13-ASM: v_add_nc_i16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_i16 v5, exec_lo, -1 +// GFX13: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x02] + +v_add_nc_i16 v5, exec_hi, null +// GFX13: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +// GFX13: v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_add_nc_i16 v5, 0.5, m0 
op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_add_nc_i32 v5, v1, v2 +// GFX13: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x7f,0xd7,0x01,0x05,0x02,0x02] + +v_add_nc_i32 v5, v255, v255 +// GFX13: v_add_nc_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x7f,0xd7,0xff,0xff,0x03,0x02] + +v_add_nc_i32 v5, s1, s2 +// GFX13: v_add_nc_i32 v5, s1, s2 ; encoding: [0x05,0x00,0x7f,0xd7,0x01,0x04,0x00,0x02] + +v_add_nc_i32 v5, s105, s105 +// GFX13: v_add_nc_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x7f,0xd7,0x69,0xd2,0x00,0x02] + +v_add_nc_i32 v5, vcc_lo, ttmp15 +// GFX13: v_add_nc_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x7f,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_nc_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_add_nc_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x7f,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_add_nc_i32 v5, ttmp15, src_scc +// GFX13: v_add_nc_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x7f,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_nc_i32 v5, m0, 0.5 +// GFX13: v_add_nc_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x7f,0xd7,0x7d,0xe0,0x01,0x02] + +v_add_nc_i32 v5, exec_lo, -1 +// GFX13: v_add_nc_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x7f,0xd7,0x7e,0x82,0x01,0x02] + +v_add_nc_i32 v5, exec_hi, null +// GFX13: v_add_nc_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x7f,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_nc_i32 v5, null, exec_lo +// GFX13: v_add_nc_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x7f,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_nc_i32 v5, -1, 
exec_hi +// GFX13: v_add_nc_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x7f,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_nc_i32 v5, 0.5, m0 +// GFX13: v_add_nc_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x7f,0xd7,0xf0,0xfa,0x00,0x02] + +v_add_nc_i32 v5, src_scc, vcc_lo +// GFX13: v_add_nc_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x7f,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp +// GFX13: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x7f,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_add_nc_u16 v5, v1, v2 +// GFX13: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x02] + +v_add_nc_u16 v5, v255, v255 +// GFX13: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x02] + +v_add_nc_u16 v5, s1, s2 +// GFX13: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x02] + +v_add_nc_u16 v5, s105, s105 +// GFX13: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x02] + +v_add_nc_u16 v5, vcc_lo, ttmp15 +// GFX13: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_nc_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_add_nc_u16 v5, ttmp15, src_scc +// GFX13: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_nc_u16 v5, m0, 0.5 +// GFX13-ASM: v_add_nc_u16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_u16 v5, exec_lo, -1 +// GFX13: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x02] + +v_add_nc_u16 v5, exec_hi, null +// GFX13: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +// GFX13: v_add_nc_u16 v5, 
null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_alignbit_b32 v5, v1, v2, s3 +// GFX13: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00] + +v_alignbit_b32 v5, v255, s2, s3 +// GFX13: v_alignbit_b32 v5, v255, s2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0xff,0x05,0x0c,0x00] + +v_alignbit_b32 v5, s1, v255, s3 +// GFX13: v_alignbit_b32 v5, s1, v255, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0xfe,0x0f,0x00] + +v_alignbit_b32 v5, s105, s105, s105 +// GFX13: v_alignbit_b32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x16,0xd6,0x69,0xd2,0xa4,0x01] + +v_alignbit_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_alignbit_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x16,0xd6,0x6a,0xf6,0x0c,0x04] + +v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x16,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_alignbit_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_alignbit_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x16,0xd6,0x7b,0xfa,0xed,0x01] + +v_alignbit_b32 v5, m0, 0.5, exec_lo +// GFX13: v_alignbit_b32 v5, m0, 0.5, exec_lo ; encoding: 
[0x05,0x00,0x16,0xd6,0x7d,0xe0,0xf9,0x01] + +v_alignbit_b32 v5, exec_lo, -1, m0 +// GFX13: v_alignbit_b32 v5, exec_lo, -1, m0 ; encoding: [0x05,0x00,0x16,0xd6,0x7e,0x82,0xf5,0x01] + +v_alignbit_b32 v5, exec_hi, null, vcc_hi +// GFX13: v_alignbit_b32 v5, exec_hi, null, vcc_hi ; encoding: [0x05,0x00,0x16,0xd6,0x7f,0xf8,0xac,0x01] + +v_alignbit_b32 v5, null, exec_lo, vcc_lo +// GFX13: v_alignbit_b32 v5, null, exec_lo, vcc_lo ; encoding: [0x05,0x00,0x16,0xd6,0x7c,0xfc,0xa8,0x01] + +v_alignbit_b32 v5, -1, exec_hi, src_scc +// GFX13: v_alignbit_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x16,0xd6,0xc1,0xfe,0xf4,0x03] + +v_alignbit_b32 v5, 0.5, m0, exec_hi +// GFX13: v_alignbit_b32 v5, 0.5, m0, exec_hi ; encoding: [0x05,0x00,0x16,0xd6,0xf0,0xfa,0xfc,0x01] + +v_alignbit_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_alignbit_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x16,0xd6,0xfd,0xd4,0x04,0x03] + +v_alignbit_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_alignbit_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x16,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_alignbyte_b32 v5, v1, v2, s3 +// GFX13: v_alignbyte_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x17,0xd6,0x01,0x05,0x0e,0x00] + +v_alignbyte_b32 v5, v255, s2, s3 +// GFX13: v_alignbyte_b32 v5, v255, s2, s3 ; encoding: [0x05,0x00,0x17,0xd6,0xff,0x05,0x0c,0x00] + +v_alignbyte_b32 v5, s1, v255, s3 +// GFX13: v_alignbyte_b32 v5, s1, v255, s3 ; encoding: [0x05,0x00,0x17,0xd6,0x01,0xfe,0x0f,0x00] + +v_alignbyte_b32 v5, s105, s105, s105 +// GFX13: v_alignbyte_b32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x17,0xd6,0x69,0xd2,0xa4,0x01] + +v_alignbyte_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_alignbyte_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x17,0xd6,0x6a,0xf6,0x0c,0x04] + +v_alignbyte_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_alignbyte_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x17,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_alignbyte_b32 v5, ttmp15, src_scc, 
ttmp15 +// GFX13: v_alignbyte_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x17,0xd6,0x7b,0xfa,0xed,0x01] + +v_alignbyte_b32 v5, m0, 0.5, exec_lo +// GFX13: v_alignbyte_b32 v5, m0, 0.5, exec_lo ; encoding: [0x05,0x00,0x17,0xd6,0x7d,0xe0,0xf9,0x01] + +v_alignbyte_b32 v5, exec_lo, -1, m0 +// GFX13: v_alignbyte_b32 v5, exec_lo, -1, m0 ; encoding: [0x05,0x00,0x17,0xd6,0x7e,0x82,0xf5,0x01] + +v_alignbyte_b32 v5, exec_hi, null, vcc_hi +// GFX13: v_alignbyte_b32 v5, exec_hi, null, vcc_hi ; encoding: [0x05,0x00,0x17,0xd6,0x7f,0xf8,0xac,0x01] + +v_alignbyte_b32 v5, null, exec_lo, vcc_lo +// GFX13: v_alignbyte_b32 v5, null, exec_lo, vcc_lo ; encoding: [0x05,0x00,0x17,0xd6,0x7c,0xfc,0xa8,0x01] + +v_alignbyte_b32 v5, -1, exec_hi, src_scc +// GFX13: v_alignbyte_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x17,0xd6,0xc1,0xfe,0xf4,0x03] + +v_alignbyte_b32 v5, 0.5, m0, exec_hi +// GFX13: v_alignbyte_b32 v5, 0.5, m0, exec_hi ; encoding: [0x05,0x00,0x17,0xd6,0xf0,0xfa,0xfc,0x01] + +v_alignbyte_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_alignbyte_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x17,0xd6,0xfd,0xd4,0x04,0x03] + +v_alignbyte_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_alignbyte_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x17,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_and_b16 v5, v1, v2 +// GFX13: v_and_b16 v5, v1, v2 ; encoding: [0x05,0x00,0x67,0xd7,0x01,0x05,0x02,0x02] + +v_and_b16 v5, v255, v255 +// GFX13: v_and_b16 v5, v255, v255 ; encoding: [0x05,0x00,0x67,0xd7,0xff,0xff,0x03,0x02] + +v_and_b16 v5, s1, s2 +// GFX13: v_and_b16 v5, s1, s2 ; encoding: [0x05,0x00,0x67,0xd7,0x01,0x04,0x00,0x02] + +v_and_b16 v5, s105, s105 +// GFX13: v_and_b16 v5, s105, s105 ; encoding: [0x05,0x00,0x67,0xd7,0x69,0xd2,0x00,0x02] + +v_and_b16 v5, vcc_lo, ttmp15 +// GFX13: v_and_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x67,0xd7,0x6a,0xf6,0x00,0x02] + +v_and_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_and_b16 v5, vcc_hi, 0xfe0b ; encoding: 
[0x05,0x00,0x67,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_and_b16 v5, ttmp15, src_scc +// GFX13: v_and_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x67,0xd7,0x7b,0xfa,0x01,0x02] + +v_and_b16 v5, m0, 0.5 +// GFX13-ASM: v_and_b16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x67,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_and_b16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x67,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_and_b16 v5, exec_lo, -1 +// GFX13: v_and_b16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x67,0xd7,0x7e,0x82,0x01,0x02] + +v_and_b16 v5, exec_hi, null +// GFX13: v_and_b16 v5, exec_hi, null ; encoding: [0x05,0x00,0x67,0xd7,0x7f,0xf8,0x00,0x02] + +v_and_b16 v5, null, exec_lo +// GFX13: v_and_b16 v5, null, exec_lo ; encoding: [0x05,0x00,0x67,0xd7,0x7c,0xfc,0x00,0x02] + +v_and_b16 v5, -1, exec_hi +// GFX13: v_and_b16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x67,0xd7,0xc1,0xfe,0x00,0x02] + +v_and_b16 v5, 0.5, m0 +// GFX13-ASM: v_and_b16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x67,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_and_b16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x67,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_and_b16 v5, src_scc, vcc_lo +// GFX13: v_and_b16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x67,0xd7,0xfd,0xd4,0x00,0x02] + +v_and_b16 v255, 0xfe0b, vcc_hi +// GFX13: v_and_b16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x67,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_and_or_b32 v5, v1, v2, s3 +// GFX13: v_and_or_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x71,0xd7,0x01,0x05,0x0e,0x00] + +v_and_or_b32 v5, v255, s2, s105 +// GFX13: v_and_or_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x71,0xd7,0xff,0x05,0xa4,0x01] + +v_and_or_b32 v5, s1, v255, exec_hi +// GFX13: v_and_or_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x71,0xd7,0x01,0xfe,0xff,0x01] + +v_and_or_b32 v5, s105, s105, exec_lo +// GFX13: v_and_or_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x71,0xd7,0x69,0xd2,0xf8,0x01] + +v_and_or_b32 v5, vcc_lo, ttmp15, v3 +// 
GFX13: v_and_or_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x71,0xd7,0x6a,0xf6,0x0c,0x04] + +v_and_or_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_and_or_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x71,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_and_or_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_and_or_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x71,0xd7,0x7b,0xfa,0xed,0x01] + +v_and_or_b32 v5, m0, 0.5, m0 +// GFX13: v_and_or_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x71,0xd7,0x7d,0xe0,0xf5,0x01] + +v_and_or_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_and_or_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x71,0xd7,0x7e,0x82,0xad,0x01] + +v_and_or_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_and_or_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x71,0xd7,0x7f,0xf8,0xa8,0x01] + +v_and_or_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_and_or_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x71,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_and_or_b32 v5, -1, exec_hi, src_scc +// GFX13: v_and_or_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x71,0xd7,0xc1,0xfe,0xf4,0x03] + +v_and_or_b32 v5, 0.5, m0, 0.5 +// GFX13: v_and_or_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x71,0xd7,0xf0,0xfa,0xc0,0x03] + +v_and_or_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_and_or_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x71,0xd7,0xfd,0xd4,0x04,0x03] + +v_and_or_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_and_or_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x71,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] +// GFX13: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x90,0xd6,0x02,0x07,0x12,0x04] + +v_ashr_pk_i8_i32 v2, s4, 4, v2 +// GFX13: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x08,0x09,0x04] + +v_ashr_pk_i8_i32 v2, s4, v7, v8 +// GFX13: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: 
[0x02,0x00,0x90,0xd6,0x04,0x0e,0x22,0x04] + +v_ashr_pk_i8_i32 v2, v4, 0, 1 +// GFX13: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x01,0x05,0x02] + +v_ashr_pk_i8_i32 v2, v4, 3, s2 +// GFX13: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x07,0x09,0x00] + +v_ashr_pk_i8_i32 v2, v4, v7, 12345 +// GFX13: v_ashr_pk_i8_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] +// GFX13: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x91,0xd6,0x02,0x07,0x12,0x04] + +v_ashr_pk_u8_i32 v2, s4, 4, v2 +// GFX13: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x08,0x09,0x04] + +v_ashr_pk_u8_i32 v2, s4, v7, v8 +// GFX13: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x0e,0x22,0x04] + +v_ashr_pk_u8_i32 v2, v4, 0, 1 +// GFX13: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x01,0x05,0x02] + +v_ashr_pk_u8_i32 v2, v4, 3, s2 +// GFX13: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x07,0x09,0x00] + +v_ashr_pk_u8_i32 v2, v4, v7, 12345 +// GFX13: v_ashr_pk_u8_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_ashrrev_i16 v5, v1, v2 +// GFX13: v_ashrrev_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x08,0xd7,0x01,0x05,0x02,0x02] + +v_ashrrev_i16 v5, v255, v255 +// GFX13: v_ashrrev_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x08,0xd7,0xff,0xff,0x03,0x02] + +v_ashrrev_i16 v5, s1, s2 +// GFX13: v_ashrrev_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x08,0xd7,0x01,0x04,0x00,0x02] + +v_ashrrev_i16 v5, s105, s105 +// GFX13: v_ashrrev_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x08,0xd7,0x69,0xd2,0x00,0x02] + +v_ashrrev_i16 v5, vcc_lo, ttmp15 +// GFX13: v_ashrrev_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x08,0xd7,0x6a,0xf6,0x00,0x02] + +v_ashrrev_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_ashrrev_i16 v5, vcc_hi, 
0xfe0b ; encoding: [0x05,0x00,0x08,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_ashrrev_i16 v5, ttmp15, src_scc +// GFX13: v_ashrrev_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x08,0xd7,0x7b,0xfa,0x01,0x02] + +v_ashrrev_i16 v5, m0, 0.5 +// GFX13-ASM: v_ashrrev_i16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x08,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_ashrrev_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x08,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_ashrrev_i16 v5, exec_lo, -1 +// GFX13: v_ashrrev_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x08,0xd7,0x7e,0x82,0x01,0x02] + +v_ashrrev_i16 v5, exec_hi, null +// GFX13: v_ashrrev_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x08,0xd7,0x7f,0xf8,0x00,0x02] + +v_ashrrev_i16 v5, null, exec_lo +// GFX13: v_ashrrev_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x08,0xd7,0x7c,0xfc,0x00,0x02] + +v_ashrrev_i16 v5, -1, exec_hi +// GFX13: v_ashrrev_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x08,0xd7,0xc1,0xfe,0x00,0x02] + +v_ashrrev_i16 v5, 0.5, m0 +// GFX13-ASM: v_ashrrev_i16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x08,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_ashrrev_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x08,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_ashrrev_i16 v5, src_scc, vcc_lo +// GFX13: v_ashrrev_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x08,0xd7,0xfd,0xd4,0x00,0x02] + +v_ashrrev_i16 v255, 0xfe0b, vcc_hi +// GFX13: v_ashrrev_i16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x08,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_ashrrev_i64 v[5:6], v1, vcc +// GFX13: v_ashrrev_i64 v[5:6], v1, vcc ; encoding: [0x05,0x00,0x01,0xd7,0x01,0xd5,0x00,0x02] + +v_ashrrev_i64 v[5:6], v255, exec +// GFX13: v_ashrrev_i64 v[5:6], v255, exec ; encoding: [0x05,0x00,0x01,0xd7,0xff,0xfd,0x00,0x02] + +v_ashrrev_i64 v[5:6], exec_lo, v[2:3] +// GFX13: v_ashrrev_i64 v[5:6], exec_lo, v[2:3] ; encoding: [0x05,0x00,0x01,0xd7,0x7e,0x04,0x02,0x02] + +v_ashrrev_i64 v[5:6], exec_hi, v[254:255] +// GFX13: v_ashrrev_i64 v[5:6], 
exec_hi, v[254:255] ; encoding: [0x05,0x00,0x01,0xd7,0x7f,0xfc,0x03,0x02] + +v_ashrrev_i64 v[5:6], null, null +// GFX13: v_ashrrev_i64 v[5:6], null, null ; encoding: [0x05,0x00,0x01,0xd7,0x7c,0xf8,0x00,0x02] + +v_ashrrev_i64 v[5:6], -1, -1 +// GFX13: v_ashrrev_i64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x01,0xd7,0xc1,0x82,0x01,0x02] + +v_ashrrev_i64 v[5:6], 0.5, 0xaf123456 +// GFX13: v_ashrrev_i64 v[5:6], 0.5, 0xaf123456 ; encoding: [0x05,0x00,0x01,0xd7,0xf0,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ashrrev_i64 v[5:6], src_scc, src_scc +// GFX13: v_ashrrev_i64 v[5:6], src_scc, src_scc ; encoding: [0x05,0x00,0x01,0xd7,0xfd,0xfa,0x01,0x02] + +v_ashrrev_i64 v[254:255], 0xaf123456, 0.5 +// GFX13: v_ashrrev_i64 v[254:255], 0xaf123456, 0.5 ; encoding: [0xfe,0x00,0x01,0xd7,0xff,0xe0,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_bcnt_u32_b32 v5, v1, v2 +// GFX13: v_bcnt_u32_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x64,0xd7,0x01,0x05,0x02,0x02] + +v_bcnt_u32_b32 v5, v255, v255 +// GFX13: v_bcnt_u32_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x64,0xd7,0xff,0xff,0x03,0x02] + +v_bcnt_u32_b32 v5, s1, s2 +// GFX13: v_bcnt_u32_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x64,0xd7,0x01,0x04,0x00,0x02] + +v_bcnt_u32_b32 v5, s105, s105 +// GFX13: v_bcnt_u32_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x64,0xd7,0x69,0xd2,0x00,0x02] + +v_bcnt_u32_b32 v5, vcc_lo, ttmp15 +// GFX13: v_bcnt_u32_b32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x64,0xd7,0x6a,0xf6,0x00,0x02] + +v_bcnt_u32_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_bcnt_u32_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x64,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_bcnt_u32_b32 v5, ttmp15, src_scc +// GFX13: v_bcnt_u32_b32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x64,0xd7,0x7b,0xfa,0x01,0x02] + +v_bcnt_u32_b32 v5, m0, 0.5 +// GFX13: v_bcnt_u32_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x64,0xd7,0x7d,0xe0,0x01,0x02] + +v_bcnt_u32_b32 v5, exec_lo, -1 +// GFX13: v_bcnt_u32_b32 v5, exec_lo, -1 ; encoding: 
[0x05,0x00,0x64,0xd7,0x7e,0x82,0x01,0x02] + +v_bcnt_u32_b32 v5, exec_hi, null +// GFX13: v_bcnt_u32_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x64,0xd7,0x7f,0xf8,0x00,0x02] + +v_bcnt_u32_b32 v5, null, exec_lo +// GFX13: v_bcnt_u32_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x64,0xd7,0x7c,0xfc,0x00,0x02] + +v_bcnt_u32_b32 v5, -1, exec_hi +// GFX13: v_bcnt_u32_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x64,0xd7,0xc1,0xfe,0x00,0x02] + +v_bcnt_u32_b32 v5, 0.5, m0 +// GFX13: v_bcnt_u32_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x64,0xd7,0xf0,0xfa,0x00,0x02] + +v_bcnt_u32_b32 v5, src_scc, vcc_lo +// GFX13: v_bcnt_u32_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x64,0xd7,0xfd,0xd4,0x00,0x02] + +v_bcnt_u32_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_bcnt_u32_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x64,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_bfe_i32 v5, v1, v2, s3 +// GFX13: v_bfe_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x11,0xd6,0x01,0x05,0x0e,0x00] + +v_bfe_i32 v5, v255, s2, s105 +// GFX13: v_bfe_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x11,0xd6,0xff,0x05,0xa4,0x01] + +v_bfe_i32 v5, s1, v255, exec_hi +// GFX13: v_bfe_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x11,0xd6,0x01,0xfe,0xff,0x01] + +v_bfe_i32 v5, s105, s105, exec_lo +// GFX13: v_bfe_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x11,0xd6,0x69,0xd2,0xf8,0x01] + +v_bfe_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_bfe_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x11,0xd6,0x6a,0xf6,0x0c,0x04] + +v_bfe_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_bfe_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x11,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_bfe_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_bfe_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x11,0xd6,0x7b,0xfa,0xed,0x01] + +v_bfe_i32 v5, m0, 0.5, m0 +// GFX13: v_bfe_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x11,0xd6,0x7d,0xe0,0xf5,0x01] + +v_bfe_i32 v5, exec_lo, -1, vcc_hi +// GFX13: 
v_bfe_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x11,0xd6,0x7e,0x82,0xad,0x01] + +v_bfe_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_bfe_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x11,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bfe_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_bfe_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x11,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_bfe_i32 v5, -1, exec_hi, src_scc +// GFX13: v_bfe_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x11,0xd6,0xc1,0xfe,0xf4,0x03] + +v_bfe_i32 v5, 0.5, m0, 0.5 +// GFX13: v_bfe_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x11,0xd6,0xf0,0xfa,0xc0,0x03] + +v_bfe_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_bfe_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x11,0xd6,0xfd,0xd4,0x04,0x03] + +v_bfe_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_bfe_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x11,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_bfe_u32 v5, v1, v2, s3 +// GFX13: v_bfe_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x10,0xd6,0x01,0x05,0x0e,0x00] + +v_bfe_u32 v5, v255, s2, s105 +// GFX13: v_bfe_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x10,0xd6,0xff,0x05,0xa4,0x01] + +v_bfe_u32 v5, s1, v255, exec_hi +// GFX13: v_bfe_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x10,0xd6,0x01,0xfe,0xff,0x01] + +v_bfe_u32 v5, s105, s105, exec_lo +// GFX13: v_bfe_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x10,0xd6,0x69,0xd2,0xf8,0x01] + +v_bfe_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_bfe_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x10,0xd6,0x6a,0xf6,0x0c,0x04] + +v_bfe_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_bfe_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x10,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_bfe_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_bfe_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x10,0xd6,0x7b,0xfa,0xed,0x01] + +v_bfe_u32 v5, m0, 0.5, m0 +// GFX13: v_bfe_u32 v5, m0, 0.5, m0 ; encoding: 
[0x05,0x00,0x10,0xd6,0x7d,0xe0,0xf5,0x01] + +v_bfe_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_bfe_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x10,0xd6,0x7e,0x82,0xad,0x01] + +v_bfe_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_bfe_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x10,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bfe_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_bfe_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x10,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_bfe_u32 v5, -1, exec_hi, src_scc +// GFX13: v_bfe_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x10,0xd6,0xc1,0xfe,0xf4,0x03] + +v_bfe_u32 v5, 0.5, m0, 0.5 +// GFX13: v_bfe_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x10,0xd6,0xf0,0xfa,0xc0,0x03] + +v_bfe_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_bfe_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x10,0xd6,0xfd,0xd4,0x04,0x03] + +v_bfe_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_bfe_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x10,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_bfi_b32 v5, v1, v2, s3 +// GFX13: v_bfi_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x12,0xd6,0x01,0x05,0x0e,0x00] + +v_bfi_b32 v5, v255, s2, s105 +// GFX13: v_bfi_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x12,0xd6,0xff,0x05,0xa4,0x01] + +v_bfi_b32 v5, s1, v255, exec_hi +// GFX13: v_bfi_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x12,0xd6,0x01,0xfe,0xff,0x01] + +v_bfi_b32 v5, s105, s105, exec_lo +// GFX13: v_bfi_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x12,0xd6,0x69,0xd2,0xf8,0x01] + +v_bfi_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_bfi_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x12,0xd6,0x6a,0xf6,0x0c,0x04] + +v_bfi_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_bfi_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x12,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_bfi_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_bfi_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: 
[0x05,0x00,0x12,0xd6,0x7b,0xfa,0xed,0x01] + +v_bfi_b32 v5, m0, 0.5, m0 +// GFX13: v_bfi_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x12,0xd6,0x7d,0xe0,0xf5,0x01] + +v_bfi_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_bfi_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x12,0xd6,0x7e,0x82,0xad,0x01] + +v_bfi_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_bfi_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x12,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bfi_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_bfi_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x12,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_bfi_b32 v5, -1, exec_hi, src_scc +// GFX13: v_bfi_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x12,0xd6,0xc1,0xfe,0xf4,0x03] + +v_bfi_b32 v5, 0.5, m0, 0.5 +// GFX13: v_bfi_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x12,0xd6,0xf0,0xfa,0xc0,0x03] + +v_bfi_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_bfi_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x12,0xd6,0xfd,0xd4,0x04,0x03] + +v_bfi_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_bfi_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x12,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_bfm_b32 v5, v1, v2 +// GFX13: v_bfm_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x1d,0xd7,0x01,0x05,0x02,0x02] + +v_bfm_b32 v5, v255, v255 +// GFX13: v_bfm_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x1d,0xd7,0xff,0xff,0x03,0x02] + +v_bfm_b32 v5, s1, s2 +// GFX13: v_bfm_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x1d,0xd7,0x01,0x04,0x00,0x02] + +v_bfm_b32 v5, s105, s105 +// GFX13: v_bfm_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x1d,0xd7,0x69,0xd2,0x00,0x02] + +v_bfm_b32 v5, vcc_lo, ttmp15 +// GFX13: v_bfm_b32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x1d,0xd7,0x6a,0xf6,0x00,0x02] + +v_bfm_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_bfm_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x1d,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_bfm_b32 v5, ttmp15, src_scc +// GFX13: v_bfm_b32 v5, ttmp15, src_scc ; encoding: 
[0x05,0x00,0x1d,0xd7,0x7b,0xfa,0x01,0x02] + +v_bfm_b32 v5, m0, 0.5 +// GFX13: v_bfm_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x1d,0xd7,0x7d,0xe0,0x01,0x02] + +v_bfm_b32 v5, exec_lo, -1 +// GFX13: v_bfm_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x1d,0xd7,0x7e,0x82,0x01,0x02] + +v_bfm_b32 v5, exec_hi, null +// GFX13: v_bfm_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x1d,0xd7,0x7f,0xf8,0x00,0x02] + +v_bfm_b32 v5, null, exec_lo +// GFX13: v_bfm_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x1d,0xd7,0x7c,0xfc,0x00,0x02] + +v_bfm_b32 v5, -1, exec_hi +// GFX13: v_bfm_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x1d,0xd7,0xc1,0xfe,0x00,0x02] + +v_bfm_b32 v5, 0.5, m0 +// GFX13: v_bfm_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x1d,0xd7,0xf0,0xfa,0x00,0x02] + +v_bfm_b32 v5, src_scc, vcc_lo +// GFX13: v_bfm_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x1d,0xd7,0xfd,0xd4,0x00,0x02] + +v_bfm_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_bitop3_b16 v255, 0xfe0b, vcc_hi, null bitop3:103 op_sel:[0,0,0,1] +// GFX13: v_bitop3_b16 v255, 0xfe0b, vcc_hi, null bitop3:0x67 op_sel:[0,0,0,1] ; encoding: [0xff,0x44,0x33,0xd6,0xff,0xd6,0xf0,0xe9,0x0b,0xfe,0x00,0x00] + +v_bitop3_b16 v5, -1, exec_hi, src_scc bitop3:99 op_sel:[1,0,0,0] +// GFX13: v_bitop3_b16 v5, -1, exec_hi, src_scc bitop3:0x63 op_sel:[1,0,0,0] ; encoding: [0x05,0x0c,0x33,0xd6,0xc1,0xfe,0xf4,0x6b] + +v_bitop3_b16 v5, 0.5, m0, 0.5 bitop3:101 op_sel:[0,1,0,0] +// GFX13-ASM: v_bitop3_b16 v5, 0.5, m0, 0.5 bitop3:0x65 op_sel:[0,1,0,0] ; encoding: [0x05,0x14,0x33,0xd6,0xf0,0xfa,0xc0,0xab] +// GFX13-DIS: v_bitop3_b16 v5, 0x3800, m0, 0x3800 bitop3:0x65 op_sel:[0,1,0,0] ; encoding: [0x05,0x14,0x33,0xd6,0xff,0xfa,0xfc,0xab,0x00,0x38,0x00,0x00] + +v_bitop3_b16 v5, exec_hi, null, vcc_lo bitop3:0x77 op_sel:[1,1,1,1] +// GFX13: v_bitop3_b16 v5, exec_hi, null, vcc_lo bitop3:0x77 op_sel:[1,1,1,1] ; encoding: 
[0x05,0x7e,0x33,0xd6,0x7f,0xf8,0xa8,0xe9] + +v_bitop3_b16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_bitop3_b16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x33,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bitop3_b16 v5, exec_lo, -1, vcc_hi bitop3:6 +// GFX13: v_bitop3_b16 v5, exec_lo, -1, vcc_hi bitop3:6 ; encoding: [0x05,0x00,0x33,0xd6,0x7e,0x82,0xad,0xc1] + +v_bitop3_b16 v5, m0, 0.5, m0 bitop3:5 +// GFX13-ASM: v_bitop3_b16 v5, m0, 0.5, m0 bitop3:5 ; encoding: [0x05,0x00,0x33,0xd6,0x7d,0xe0,0xf5,0xa1] +// GFX13-DIS: v_bitop3_b16 v5, m0, 0x3800, m0 bitop3:5 ; encoding: [0x05,0x00,0x33,0xd6,0x7d,0xfe,0xf5,0xa1,0x00,0x38,0x00,0x00] + +v_bitop3_b16 v5, null, exec_lo, 0xfe0b bitop3:0x88 op_sel:[0,0,0,0] +// GFX13: v_bitop3_b16 v5, null, exec_lo, 0xfe0b bitop3:0x88 ; encoding: [0x05,0x01,0x33,0xd6,0x7c,0xfc,0xfc,0x13,0x0b,0xfe,0x00,0x00] + +v_bitop3_b16 v5, s1, v255, exec_hi bitop3:100 +// GFX13: v_bitop3_b16 v5, s1, v255, exec_hi bitop3:0x64 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0xfe,0xff,0x89] + +v_bitop3_b16 v5, s105, s105, exec_lo bitop3:0 +// GFX13: v_bitop3_b16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x33,0xd6,0x69,0xd2,0xf8,0x01] + +v_bitop3_b16 v5, src_scc, vcc_lo, -1 bitop3:102 op_sel:[0,0,1,0] +// GFX13: v_bitop3_b16 v5, src_scc, vcc_lo, -1 bitop3:0x66 op_sel:[0,0,1,0] ; encoding: [0x05,0x24,0x33,0xd6,0xfd,0xd4,0x04,0xcb] + +v_bitop3_b16 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 +// GFX13: v_bitop3_b16 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 ; encoding: [0x05,0x04,0x33,0xd6,0x7b,0xfa,0xed,0x81] + +v_bitop3_b16 v5, v1, v2, s3 +// GFX13: v_bitop3_b16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x33,0xd6,0x01,0x05,0x0e,0x00] + +v_bitop3_b16 v5, v1, v2, s3 bitop3:161 +// GFX13: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0x05,0x0e,0x30] + +v_bitop3_b16 v5, v255, s2, s105 bitop3:0x27 +// GFX13: v_bitop3_b16 v5, v255, s2, s105 bitop3:0x27 ; encoding: [0x05,0x04,0x33,0xd6,0xff,0x05,0xa4,0xe1] + +v_bitop3_b16 v5, 
vcc_hi, 0xfe0b, v255 bitop3:63 +// GFX13: v_bitop3_b16 v5, vcc_hi, 0xfe0b, v255 bitop3:0x3f ; encoding: [0x05,0x07,0x33,0xd6,0x6b,0xfe,0xfd,0xe7,0x0b,0xfe,0x00,0x00] + +v_bitop3_b16 v5, vcc_lo, ttmp15, v3 bitop3:0x15 +// GFX13: v_bitop3_b16 v5, vcc_lo, ttmp15, v3 bitop3:0x15 ; encoding: [0x05,0x02,0x33,0xd6,0x6a,0xf6,0x0c,0xa4] + +v_bitop3_b32 v255, 0xaf123456, vcc_hi, null bitop3:103 +// GFX13: v_bitop3_b32 v255, 0xaf123456, vcc_hi, null bitop3:0x67 ; encoding: [0xff,0x04,0x34,0xd6,0xff,0xd6,0xf0,0xe9,0x56,0x34,0x12,0xaf] + +v_bitop3_b32 v5, -1, exec_hi, src_scc bitop3:99 +// GFX13: v_bitop3_b32 v5, -1, exec_hi, src_scc bitop3:0x63 ; encoding: [0x05,0x04,0x34,0xd6,0xc1,0xfe,0xf4,0x6b] + +v_bitop3_b32 v5, 0.5, m0, 0.5 bitop3:101 +// GFX13: v_bitop3_b32 v5, 0.5, m0, 0.5 bitop3:0x65 ; encoding: [0x05,0x04,0x34,0xd6,0xf0,0xfa,0xc0,0xab] + +v_bitop3_b32 v5, exec_hi, null, vcc_lo bitop3:77 +// GFX13: v_bitop3_b32 v5, exec_hi, null, vcc_lo bitop3:0x4d ; encoding: [0x05,0x01,0x34,0xd6,0x7f,0xf8,0xa8,0xa9] + +v_bitop3_b32 v5, exec_lo, -1, vcc_hi bitop3:6 +// GFX13: v_bitop3_b32 v5, exec_lo, -1, vcc_hi bitop3:6 ; encoding: [0x05,0x00,0x34,0xd6,0x7e,0x82,0xad,0xc1] + +v_bitop3_b32 v5, m0, 0.5, m0 bitop3:5 +// GFX13: v_bitop3_b32 v5, m0, 0.5, m0 bitop3:5 ; encoding: [0x05,0x00,0x34,0xd6,0x7d,0xe0,0xf5,0xa1] + +v_bitop3_b32 v5, null, exec_lo, 0xaf123456 bitop3:88 +// GFX13: v_bitop3_b32 v5, null, exec_lo, 0xaf123456 bitop3:0x58 ; encoding: [0x05,0x03,0x34,0xd6,0x7c,0xfc,0xfc,0x0b,0x56,0x34,0x12,0xaf] + +v_bitop3_b32 v5, s1, v255, exec_hi bitop3:100 +// GFX13: v_bitop3_b32 v5, s1, v255, exec_hi bitop3:0x64 ; encoding: [0x05,0x04,0x34,0xd6,0x01,0xfe,0xff,0x89] + +v_bitop3_b32 v5, s105, s105, exec_lo bitop3:0 +// GFX13: v_bitop3_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x34,0xd6,0x69,0xd2,0xf8,0x01] + +v_bitop3_b32 v5, src_scc, vcc_lo, -1 bitop3:102 +// GFX13: v_bitop3_b32 v5, src_scc, vcc_lo, -1 bitop3:0x66 ; encoding: [0x05,0x04,0x34,0xd6,0xfd,0xd4,0x04,0xcb] + 
+v_bitop3_b32 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 +// GFX13: v_bitop3_b32 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 ; encoding: [0x05,0x04,0x34,0xd6,0x7b,0xfa,0xed,0x81] + +v_bitop3_b32 v5, v1, v2, s3 +// GFX13: v_bitop3_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x34,0xd6,0x01,0x05,0x0e,0x00] + +v_bitop3_b32 v5, v1, v2, s3 bitop3:161 +// GFX13: v_bitop3_b32 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x34,0xd6,0x01,0x05,0x0e,0x30] + +v_bitop3_b32 v5, v255, s2, s105 bitop3:0x27 +// GFX13: v_bitop3_b32 v5, v255, s2, s105 bitop3:0x27 ; encoding: [0x05,0x04,0x34,0xd6,0xff,0x05,0xa4,0xe1] + +v_bitop3_b32 v5, vcc_hi, 0xaf123456, v255 bitop3:63 +// GFX13: v_bitop3_b32 v5, vcc_hi, 0xaf123456, v255 bitop3:0x3f ; encoding: [0x05,0x07,0x34,0xd6,0x6b,0xfe,0xfd,0xe7,0x56,0x34,0x12,0xaf] + +v_bitop3_b32 v5, vcc_lo, ttmp15, v3 bitop3:0x15 +// GFX13: v_bitop3_b32 v5, vcc_lo, ttmp15, v3 bitop3:0x15 ; encoding: [0x05,0x02,0x34,0xd6,0x6a,0xf6,0x0c,0xa4] + +v_cndmask_b16 v5, v1, src_scc, s3 +// W32: v_cndmask_b16 v5, v1, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction + +v_cndmask_b16 v5, v255, 0.5, s3 +// W32-ASM: v_cndmask_b16 v5, v255, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W32-DIS: v_cndmask_b16 v5, v255, 0x3800, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x0d,0x00,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-3]]:30: error: invalid operand for instruction + +v_cndmask_b16 v5, s105, s105, s3 +// W32: v_cndmask_b16 v5, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, vcc_hi, v2, s3 +// W32: v_cndmask_b16 v5, vcc_hi, v2, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, ttmp15, ttmp15, s3 +// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3 ; encoding: 
[0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, m0, v255, s3 +// W32: v_cndmask_b16 v5, m0, v255, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, exec_lo, exec_lo, s3 +// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, exec_hi, exec_hi, s3 +// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, null, m0, s105 +// W32: v_cndmask_b16 v5, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo +// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] +// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, 0.5, -1, vcc_hi +// W32-ASM: v_cndmask_b16 v5, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] +// W32-DIS: v_cndmask_b16 v5, 0x3800, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xad,0x01,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-3]]:19: error: invalid operand for instruction + +v_cndmask_b16 v5, -|src_scc|, null, ttmp15 +// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5, v1, src_scc, s[6:7] +// W32-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] + +v_cndmask_b16 v5, v255, 0.5, s[6:7] +// W32-ERR: :[[@LINE-1]]:30: error: invalid 
operand for instruction +// W64-ASM: v_cndmask_b16 v5, v255, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W64-DIS: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] + +v_cndmask_b16 v5, s105, s105, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] + +v_cndmask_b16 v5, vcc_hi, v2, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] + +v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] + +v_cndmask_b16 v5, m0, v255, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] + +v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] + +v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] + +v_cndmask_b16 v5, null, m0, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] + +v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] + +v_cndmask_b16 v5, 0.5, -1, vcc 
+// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] +// W64-DIS: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] + +v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] + +v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null +// GFX13: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_cubeid_f32 v5, v1, v2, s3 +// GFX13: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] + +v_cubeid_f32 v5, v255, s2, s105 +// GFX13: v_cubeid_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0c,0xd6,0xff,0x05,0xa4,0x01] + +v_cubeid_f32 v5, s1, v255, exec_hi +// GFX13: v_cubeid_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0xfe,0xff,0x01] + +v_cubeid_f32 v5, s105, s105, exec_lo +// GFX13: v_cubeid_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0c,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubeid_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubeid_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0c,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubeid_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubeid_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0c,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubeid_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubeid_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0c,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubeid_f32 v5, m0, 0.5, m0 +// GFX13: v_cubeid_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0c,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubeid_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubeid_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0c,0xd6,0x7e,0x82,0xad,0x01] + 
+v_cubeid_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubeid_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0c,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubeid_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubeid_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0c,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubeid_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubeid_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0c,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubeid_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubeid_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0c,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubeid_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubeid_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0c,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubeid_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubeid_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0c,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cubema_f32 v5, v1, v2, s3 +// GFX13: v_cubema_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0f,0xd6,0x01,0x05,0x0e,0x00] + +v_cubema_f32 v5, v255, s2, s105 +// GFX13: v_cubema_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0f,0xd6,0xff,0x05,0xa4,0x01] + +v_cubema_f32 v5, s1, v255, exec_hi +// GFX13: v_cubema_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0f,0xd6,0x01,0xfe,0xff,0x01] + +v_cubema_f32 v5, s105, s105, exec_lo +// GFX13: v_cubema_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0f,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubema_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubema_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0f,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubema_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubema_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0f,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubema_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubema_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: 
[0x05,0x07,0x0f,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubema_f32 v5, m0, 0.5, m0 +// GFX13: v_cubema_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0f,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubema_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubema_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0f,0xd6,0x7e,0x82,0xad,0x01] + +v_cubema_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubema_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0f,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubema_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubema_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0f,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubema_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubema_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0f,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubema_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubema_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0f,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubema_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubema_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0f,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubema_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubema_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0f,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cubesc_f32 v5, v1, v2, s3 +// GFX13: v_cubesc_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0d,0xd6,0x01,0x05,0x0e,0x00] + +v_cubesc_f32 v5, v255, s2, s105 +// GFX13: v_cubesc_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0d,0xd6,0xff,0x05,0xa4,0x01] + +v_cubesc_f32 v5, s1, v255, exec_hi +// GFX13: v_cubesc_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0d,0xd6,0x01,0xfe,0xff,0x01] + +v_cubesc_f32 v5, s105, s105, exec_lo +// GFX13: v_cubesc_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0d,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubesc_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubesc_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0d,0xd6,0x6a,0xf6,0x0c,0x04] + 
+v_cubesc_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubesc_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubesc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubesc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0d,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubesc_f32 v5, m0, 0.5, m0 +// GFX13: v_cubesc_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubesc_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubesc_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0d,0xd6,0x7e,0x82,0xad,0x01] + +v_cubesc_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubesc_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0d,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubesc_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubesc_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0d,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubesc_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubesc_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0d,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubesc_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubesc_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0d,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubesc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubesc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0d,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubesc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubesc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0d,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cubetc_f32 v5, v1, v2, s3 +// GFX13: v_cubetc_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0e,0xd6,0x01,0x05,0x0e,0x00] + +v_cubetc_f32 v5, v255, s2, s105 +// GFX13: v_cubetc_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0e,0xd6,0xff,0x05,0xa4,0x01] + +v_cubetc_f32 v5, s1, v255, exec_hi +// GFX13: v_cubetc_f32 v5, s1, v255, exec_hi ; encoding: 
[0x05,0x00,0x0e,0xd6,0x01,0xfe,0xff,0x01] + +v_cubetc_f32 v5, s105, s105, exec_lo +// GFX13: v_cubetc_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0e,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubetc_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubetc_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubetc_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubetc_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0e,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubetc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubetc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0e,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubetc_f32 v5, m0, 0.5, m0 +// GFX13: v_cubetc_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubetc_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubetc_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0e,0xd6,0x7e,0x82,0xad,0x01] + +v_cubetc_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubetc_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0e,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubetc_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubetc_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0e,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubetc_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubetc_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0e,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubetc_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubetc_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0e,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0e,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp 
div:2 +// GFX13: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6e,0xd7,0xff,0xd6,0x00,0x3a,0x56,0x34,0x12,0xaf] + +v_cvt_pk_bf16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6e,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 +// GFX13: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6e,0xd7,0xf0,0xfa,0x00,0x0a] + +v_cvt_pk_bf16_f32 v5, exec_hi, null +// GFX13: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6e,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6e,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_bf16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6e,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_bf16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6e,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6e,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 +// GFX13: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6e,0xd7,0xfd,0xd4,0x00,0x12] + +v_cvt_pk_bf16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6e,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_bf16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_bf16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6e,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6e,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 
+// GFX13: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6e,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX13: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x7b,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX13: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX13: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX13: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x7a,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX13: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x7a,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX13: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x7a,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX13: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x7a,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX13: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x7a,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX13: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX13: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x7b,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX13: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_sr_bf8_f16 v1, -v2, v3 +// GFX13: v_cvt_sr_bf8_f16 v1, -v2, v3 ; encoding: [0x01,0x00,0x36,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_sr_bf8_f16 v1, v2, 0x1234 +// GFX13: v_cvt_sr_bf8_f16 v1, v2, 0x1234 ; encoding: [0x01,0x00,0x36,0xd7,0x02,0xff,0x01,0x02,0x34,0x12,0x00,0x00] + +v_cvt_sr_bf8_f16 v1, v2, s3 +// GFX13: v_cvt_sr_bf8_f16 v1, v2, s3 ; encoding: [0x01,0x00,0x36,0xd7,0x02,0x07,0x00,0x02] + +v_cvt_sr_bf8_f16 v1, v2, v3 +// GFX13: v_cvt_sr_bf8_f16 v1, v2, v3 ; encoding: 
[0x01,0x00,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:1 +// GFX13: v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:2 +// GFX13: v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:3 +// GFX13: v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, |v2|, v3 +// GFX13: v_cvt_sr_bf8_f16 v1, |v2|, v3 ; encoding: [0x01,0x01,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX13: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x38,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX13: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x38,0xd7,0x02,0x0a,0x02,0x02] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX13: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x38,0xd7,0xff,0x09,0x02,0x22] + +v_cvt_sr_fp8_f16 v1, -v2, v3 +// GFX13: v_cvt_sr_fp8_f16 v1, -v2, v3 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_sr_fp8_f16 v1, v2, 0x1234 +// GFX13: v_cvt_sr_fp8_f16 v1, v2, 0x1234 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0xff,0x01,0x02,0x34,0x12,0x00,0x00] + +v_cvt_sr_fp8_f16 v1, v2, s3 +// GFX13: v_cvt_sr_fp8_f16 v1, v2, s3 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0x07,0x00,0x02] + +v_cvt_sr_fp8_f16 v1, v2, v3 +// GFX13: v_cvt_sr_fp8_f16 v1, v2, v3 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:1 +// GFX13: v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:2 +// GFX13: v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:3 +// GFX13: v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, |v2|, v3 +// GFX13: 
v_cvt_sr_fp8_f16 v1, |v2|, v3 ; encoding: [0x01,0x01,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX13: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x37,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX13: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x37,0xd7,0x02,0x0a,0x02,0x02] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX13: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x37,0xd7,0xff,0x09,0x02,0x22] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX13: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x38,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX13: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x38,0xd7,0x02,0x0a,0x02,0x02] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX13: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x38,0xd7,0xff,0x09,0x02,0x22] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xbe,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xbe,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 +// W32: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xbe,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xbb,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: 
[0x0a,0x00,0xbb,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 +// W32: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xbb,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 +// W32: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xbc,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 +// W32: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xb9,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 +// W32: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: 
[0x0a,0x00,0xba,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 +// W32: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xba,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 +// W32: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xba,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 +// W32: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xbd,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 +// W32: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xbd,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 +// W32: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0xbd,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 +// W32: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 +// W32: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xc1,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 +// W32: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 +// W32: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xbf,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 +// W32: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 +// W32: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 +// W32: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xc0,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_pk_i16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x21,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_i16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_i16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x21,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_i16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_i16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x21,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_i16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_i16_f32 v5, s105, s105 ; encoding: 
[0x05,0x00,0x21,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_i16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_i16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x21,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_i16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_i16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x21,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_i16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_i16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x21,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_i16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_i16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x21,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_i16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_i16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x21,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_i16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_i16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x21,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_i16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_i16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x21,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_i16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_i16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x21,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_i16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_i16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x21,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_i16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_i16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x21,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_i16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_i16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x21,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_cvt_pk_i16_i32 v5, v1, v2 +// GFX13: v_cvt_pk_i16_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x6b,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_i16_i32 v5, v255, v255 +// GFX13: v_cvt_pk_i16_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x6b,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_i16_i32 v5, s1, s2 +// GFX13: v_cvt_pk_i16_i32 v5, s1, s2 ; encoding: 
[0x05,0x00,0x6b,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_i16_i32 v5, s105, s105 +// GFX13: v_cvt_pk_i16_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x6b,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_i16_i32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_i16_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6b,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_i16_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_i16_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6b,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_i16_i32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_i16_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6b,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_i16_i32 v5, m0, 0.5 +// GFX13: v_cvt_pk_i16_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6b,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_i16_i32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_i16_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6b,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_i16_i32 v5, exec_hi, null +// GFX13: v_cvt_pk_i16_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6b,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_i16_i32 v5, null, exec_lo +// GFX13: v_cvt_pk_i16_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6b,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_i16_i32 v5, -1, exec_hi +// GFX13: v_cvt_pk_i16_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6b,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_i16_i32 v5, 0.5, m0 +// GFX13: v_cvt_pk_i16_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd7,0xf0,0xfa,0x00,0x02] + +v_cvt_pk_i16_i32 v5, src_scc, vcc_lo +// GFX13: v_cvt_pk_i16_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x6b,0xd7,0xfd,0xd4,0x00,0x02] + +v_cvt_pk_i16_i32 v255, 0xaf123456, vcc_hi +// GFX13: v_cvt_pk_i16_i32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x6b,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_i16_f16 v5, v1, v2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_i16_f16 v5, v255, v255 +// GFX13: v_cvt_pk_norm_i16_f16 v5, v255, v255 ; encoding: 
[0x05,0x00,0x12,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_i16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x12,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x12,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x12,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x12,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_i16_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x12,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x12,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x12,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_i16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x12,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_i16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x12,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x12,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x12,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; 
encoding: [0xff,0x13,0x12,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f16 v5, v1, v2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_u16_f16 v5, v255, v255 +// GFX13: v_cvt_pk_norm_u16_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x13,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_u16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x13,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x13,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x13,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x13,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_u16_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x13,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x13,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x13,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_u16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x13,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_u16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x13,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x13,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_u16_f16 
v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x13,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x13,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_u16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_u16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x22,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_u16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_u16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x22,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_u16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_u16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x22,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_u16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_u16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x22,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_u16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_u16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x22,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_u16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_u16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x22,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_u16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x22,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_u16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_u16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x22,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_u16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_u16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x22,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_u16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_u16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x22,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_u16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_u16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x22,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_u16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_u16_f32 v5, -1, exec_hi ; encoding: 
[0x05,0x00,0x22,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_u16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_u16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x22,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_u16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_u16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x22,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_u16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_u16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x22,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u16_u32 v5, v1, v2 +// GFX13: v_cvt_pk_u16_u32 v5, v1, v2 ; encoding: [0x05,0x00,0x6a,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_u16_u32 v5, v255, v255 +// GFX13: v_cvt_pk_u16_u32 v5, v255, v255 ; encoding: [0x05,0x00,0x6a,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_u16_u32 v5, s1, s2 +// GFX13: v_cvt_pk_u16_u32 v5, s1, s2 ; encoding: [0x05,0x00,0x6a,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_u16_u32 v5, s105, s105 +// GFX13: v_cvt_pk_u16_u32 v5, s105, s105 ; encoding: [0x05,0x00,0x6a,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_u16_u32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_u16_u32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6a,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_u16_u32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_u16_u32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6a,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u16_u32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_u16_u32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6a,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_u16_u32 v5, m0, 0.5 +// GFX13: v_cvt_pk_u16_u32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6a,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_u16_u32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_u16_u32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6a,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_u16_u32 v5, exec_hi, null +// GFX13: v_cvt_pk_u16_u32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6a,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_u16_u32 v5, null, exec_lo +// GFX13: v_cvt_pk_u16_u32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6a,0xd7,0x7c,0xfc,0x00,0x02] 
+ +v_cvt_pk_u16_u32 v5, -1, exec_hi +// GFX13: v_cvt_pk_u16_u32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6a,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_u16_u32 v5, 0.5, m0 +// GFX13: v_cvt_pk_u16_u32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd7,0xf0,0xfa,0x00,0x02] + +v_cvt_pk_u16_u32 v5, src_scc, vcc_lo +// GFX13: v_cvt_pk_u16_u32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x6a,0xd7,0xfd,0xd4,0x00,0x02] + +v_cvt_pk_u16_u32 v255, 0xaf123456, vcc_hi +// GFX13: v_cvt_pk_u16_u32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x6a,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u8_f32 v5, v1, v2, s3 +// GFX13: v_cvt_pk_u8_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x26,0xd6,0x01,0x05,0x0e,0x00] + +v_cvt_pk_u8_f32 v5, v255, s2, s105 +// GFX13: v_cvt_pk_u8_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x26,0xd6,0xff,0x05,0xa4,0x01] + +v_cvt_pk_u8_f32 v5, s1, v255, exec_hi +// GFX13: v_cvt_pk_u8_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x26,0xd6,0x01,0xfe,0xff,0x01] + +v_cvt_pk_u8_f32 v5, s105, s105, exec_lo +// GFX13: v_cvt_pk_u8_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x26,0xd6,0x69,0xd2,0xf8,0x01] + +v_cvt_pk_u8_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cvt_pk_u8_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x26,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cvt_pk_u8_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cvt_pk_u8_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x26,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u8_f32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_cvt_pk_u8_f32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x26,0xd6,0x7b,0xfa,0xed,0x01] + +v_cvt_pk_u8_f32 v5, m0, 0.5, m0 +// GFX13: v_cvt_pk_u8_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x26,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cvt_pk_u8_f32 v5, exec_lo, -1, vcc_hi +// GFX13: v_cvt_pk_u8_f32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x26,0xd6,0x7e,0x82,0xad,0x01] + +v_cvt_pk_u8_f32 v5, exec_hi, null, vcc_lo +// GFX13: v_cvt_pk_u8_f32 v5, exec_hi, null, vcc_lo ; 
encoding: [0x05,0x00,0x26,0xd6,0x7f,0xf8,0xa8,0x01] + +v_cvt_pk_u8_f32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_cvt_pk_u8_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x26,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u8_f32 v5, -1, exec_hi, src_scc +// GFX13: v_cvt_pk_u8_f32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x26,0xd6,0xc1,0xfe,0xf4,0x03] + +v_cvt_pk_u8_f32 v5, 0.5, m0, 0.5 +// GFX13: v_cvt_pk_u8_f32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x26,0xd6,0xf0,0xfa,0xc0,0x03] + +v_cvt_pk_u8_f32 v5, src_scc, vcc_lo, -1 +// GFX13: v_cvt_pk_u8_f32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x26,0xd6,0xfd,0xd4,0x04,0x03] + +v_cvt_pk_u8_f32 v255, -|0xaf123456|, vcc_hi, null +// GFX13: v_cvt_pk_u8_f32 v255, -|0xaf123456|, vcc_hi, null ; encoding: [0xff,0x01,0x26,0xd6,0xff,0xd6,0xf0,0x21,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_i16_f16 v5, v1, v2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_i16_f16 v5, v255, v255 +// GFX13: v_cvt_pk_norm_i16_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x12,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_i16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x12,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x12,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x12,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x12,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_i16_f16 v5, m0, 0.5 ; encoding: 
[0x05,0x00,0x12,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x12,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x12,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_i16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x12,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_i16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x12,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x12,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x12,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x12,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_i16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_norm_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x68,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_i16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_norm_i16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x68,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_i16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_norm_i16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x68,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_norm_i16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x68,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_i16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x68,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_norm_i16_f32 v5, vcc_hi, 0xaf123456 ; 
encoding: [0x05,0x00,0x68,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_i16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_i16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x68,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_i16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_i16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x68,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_i16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_i16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x68,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_i16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_i16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x68,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_i16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x68,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_i16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x68,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_norm_i16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x68,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_i16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_norm_i16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x68,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_i16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_norm_i16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x68,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_u16_f16 v5, v1, v2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_u16_f16 v5, v255, v255 +// GFX13: v_cvt_pk_norm_u16_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x13,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_u16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x13,0xd7,0x69,0xd2,0x00,0x02] + 
+v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x13,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x13,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x13,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_u16_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x13,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x13,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x13,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_u16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x13,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_u16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x13,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x13,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x13,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x13,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_norm_u16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x69,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_u16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_norm_u16_f32 v5, v255, v255 ; encoding: 
[0x05,0x00,0x69,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_u16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_norm_u16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x69,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_norm_u16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x69,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_u16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x69,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_norm_u16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x69,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_u16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_u16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x69,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_u16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_u16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x69,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_u16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_u16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x69,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_u16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_u16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x69,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_u16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x69,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_u16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x69,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_norm_u16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x69,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_u16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_norm_u16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x69,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_u16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_norm_u16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: 
[0xff,0x03,0x69,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_div_fixup_f16 v5, v1, v2, s3 +// GFX13: v_div_fixup_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x5f,0xd7,0x01,0x05,0x0e,0x00] + +v_div_fixup_f16 v5, v255, s2, s105 +// GFX13: v_div_fixup_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x5f,0xd7,0xff,0x05,0xa4,0x01] + +v_div_fixup_f16 v5, s1, v255, exec_hi +// GFX13: v_div_fixup_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x5f,0xd7,0x01,0xfe,0xff,0x01] + +v_div_fixup_f16 v5, s105, s105, exec_lo +// GFX13: v_div_fixup_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x5f,0xd7,0x69,0xd2,0xf8,0x01] + +v_div_fixup_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_div_fixup_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x5f,0xd7,0x6a,0xf6,0x0c,0x04] + +v_div_fixup_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_div_fixup_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x5f,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_div_fixup_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_div_fixup_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x5f,0xd7,0x7b,0xfa,0xed,0xe1] + +v_div_fixup_f16 v5, m0, 0.5, m0 +// GFX13: v_div_fixup_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x5f,0xd7,0x7d,0xe0,0xf5,0x01] + +v_div_fixup_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_div_fixup_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x5f,0xd7,0x7e,0x82,0xad,0x01] + +v_div_fixup_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_div_fixup_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x5f,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_div_fixup_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_div_fixup_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x5f,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_div_fixup_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_div_fixup_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x5f,0xd7,0xc1,0xfe,0xf4,0xc3] + 
+v_div_fixup_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_div_fixup_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x5f,0xd7,0xf0,0xfa,0xc0,0x43] + +v_div_fixup_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_div_fixup_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x5f,0xd7,0xfd,0xd4,0x04,0x23] + +v_div_fixup_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_div_fixup_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x5f,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_div_fixup_f32 v5, v1, v2, s3 +// GFX13: v_div_fixup_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x27,0xd6,0x01,0x05,0x0e,0x00] + +v_div_fixup_f32 v5, v255, s2, s105 +// GFX13: v_div_fixup_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x27,0xd6,0xff,0x05,0xa4,0x01] + +v_div_fixup_f32 v5, s1, v255, exec_hi +// GFX13: v_div_fixup_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x27,0xd6,0x01,0xfe,0xff,0x01] + +v_div_fixup_f32 v5, s105, s105, exec_lo +// GFX13: v_div_fixup_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x27,0xd6,0x69,0xd2,0xf8,0x01] + +v_div_fixup_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_div_fixup_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x27,0xd6,0x6a,0xf6,0x0c,0x04] + +v_div_fixup_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_div_fixup_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x27,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_div_fixup_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_div_fixup_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x27,0xd6,0x7b,0xfa,0xed,0xe1] + +v_div_fixup_f32 v5, m0, 0.5, m0 +// GFX13: v_div_fixup_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x27,0xd6,0x7d,0xe0,0xf5,0x01] + +v_div_fixup_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_div_fixup_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x27,0xd6,0x7e,0x82,0xad,0x01] + +v_div_fixup_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: 
v_div_fixup_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x27,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_div_fixup_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_div_fixup_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x27,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_div_fixup_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_div_fixup_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x27,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_div_fixup_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_div_fixup_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x27,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_div_fixup_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_div_fixup_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x27,0xd6,0xfd,0xd4,0x04,0x33] + +v_div_fixup_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_div_fixup_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x27,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_div_fixup_f64 v[5:6], v[1:2], v[2:3], v[3:4] +// GFX13: v_div_fixup_f64 v[5:6], v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x00,0x28,0xd6,0x01,0x05,0x0e,0x04] + +v_div_fixup_f64 v[5:6], v[254:255], v[254:255], s[6:7] +// GFX13: v_div_fixup_f64 v[5:6], v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x00,0x28,0xd6,0xfe,0xfd,0x1b,0x00] + +v_div_fixup_f64 v[5:6], s[2:3], s[4:5], v[254:255] +// GFX13: v_div_fixup_f64 v[5:6], s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x00,0x28,0xd6,0x02,0x08,0xf8,0x07] + +v_div_fixup_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| +// GFX13: v_div_fixup_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| ; encoding: [0x05,0x05,0x28,0xd6,0x68,0xd0,0xa0,0xa1] + +v_div_fixup_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| +// GFX13: v_div_fixup_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| ; encoding: [0x05,0x06,0x28,0xd6,0x6a,0xf4,0xe8,0xc1] + +v_div_fixup_f64 v[5:6], -|ttmp[14:15]|, 0xaf123456, null +// GFX13: v_div_fixup_f64 v[5:6], -|ttmp[14:15]|, 
0xaf123456, null ; encoding: [0x05,0x01,0x28,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] + +v_div_fixup_f64 v[5:6], -|exec|, -|src_scc|, -|exec| +// GFX13: v_div_fixup_f64 v[5:6], -|exec|, -|src_scc|, -|exec| ; encoding: [0x05,0x07,0x28,0xd6,0x7e,0xfa,0xf9,0xe1] + +v_div_fixup_f64 v[5:6], null, 0.5, vcc +// GFX13: v_div_fixup_f64 v[5:6], null, 0.5, vcc ; encoding: [0x05,0x00,0x28,0xd6,0x7c,0xe0,0xa9,0x01] + +v_div_fixup_f64 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_div_fixup_f64 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x28,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_fixup_f64 v[5:6], 0.5, null, -|src_scc| mul:2 +// GFX13: v_div_fixup_f64 v[5:6], 0.5, null, -|src_scc| mul:2 ; encoding: [0x05,0x04,0x28,0xd6,0xf0,0xf8,0xf4,0x8b] + +v_div_fixup_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 +// GFX13: v_div_fixup_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 ; encoding: [0x05,0x03,0x28,0xd6,0xfd,0xfc,0xc0,0x73] + +v_div_fixup_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 +// GFX13: v_div_fixup_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 ; encoding: [0xfe,0x82,0x28,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] + +v_div_fmas_f32 v5, vcc_lo, v2, vcc_lo +// GFX13: v_div_fmas_f32 v5, vcc_lo, v2, vcc_lo ; encoding: [0x05,0x00,0x37,0xd6,0x6a,0x04,0xaa,0x01] + +v_div_fmas_f32 v5, ttmp15, ttmp15, ttmp15 +// GFX13: v_div_fmas_f32 v5, ttmp15, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x37,0xd6,0x7b,0xf6,0xec,0x01] + +v_div_fmas_f32 v5, -|m0|, -|v255|, v3 +// GFX13: v_div_fmas_f32 v5, -|m0|, -|v255|, v3 ; encoding: [0x05,0x03,0x37,0xd6,0x7d,0xfe,0x0f,0x64] + +v_div_fmas_f32 v5, -|exec_lo|, -|exec_lo|, -|exec_lo| +// GFX13: v_div_fmas_f32 v5, -|exec_lo|, -|exec_lo|, -|exec_lo| ; encoding: [0x05,0x07,0x37,0xd6,0x7e,0xfc,0xf8,0xe1] + +v_div_fmas_f32 v5, -|exec_hi|, 0.5, -|v255| +// GFX13: v_div_fmas_f32 v5, -|exec_hi|, 0.5, -|v255| ; encoding: [0x05,0x05,0x37,0xd6,0x7f,0xe0,0xfd,0xa7] + +v_div_fmas_f32 v5, null, exec_hi, -|exec_hi| +// GFX13: v_div_fmas_f32 v5, 
null, exec_hi, -|exec_hi| ; encoding: [0x05,0x04,0x37,0xd6,0x7c,0xfe,0xfc,0x81] + +v_div_fmas_f32 v5, -1, -|m0|, -|m0| +// GFX13: v_div_fmas_f32 v5, -1, -|m0|, -|m0| ; encoding: [0x05,0x06,0x37,0xd6,0xc1,0xfa,0xf4,0xc1] + +v_div_fmas_f32 v5, 0.5, -|vcc_lo|, 0.5 mul:2 +// GFX13: v_div_fmas_f32 v5, 0.5, -|vcc_lo|, 0.5 mul:2 ; encoding: [0x05,0x02,0x37,0xd6,0xf0,0xd4,0xc0,0x4b] + +v_div_fmas_f32 v5, vcc_lo, v2, v3 +// GFX13: v_div_fmas_f32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x37,0xd6,0x6a,0x04,0x0e,0x04] + +v_div_fmas_f32 v5, vcc_hi, v255, vcc_hi +// GFX13: v_div_fmas_f32 v5, vcc_hi, v255, vcc_hi ; encoding: [0x05,0x00,0x37,0xd6,0x6b,0xfe,0xaf,0x01] + +v_div_fmas_f32 v5, -|ttmp15|, -|ttmp15|, ttmp15 +// GFX13: v_div_fmas_f32 v5, -|ttmp15|, -|ttmp15|, ttmp15 ; encoding: [0x05,0x03,0x37,0xd6,0x7b,0xf6,0xec,0x61] + +v_div_fmas_f32 v5, m0, 0.5, v255 +// GFX13: v_div_fmas_f32 v5, m0, 0.5, v255 ; encoding: [0x05,0x00,0x37,0xd6,0x7d,0xe0,0xfd,0x07] + +v_div_fmas_f32 v5, -|exec_lo|, exec_lo, -|exec_lo| +// GFX13: v_div_fmas_f32 v5, -|exec_lo|, exec_lo, -|exec_lo| ; encoding: [0x05,0x05,0x37,0xd6,0x7e,0xfc,0xf8,0xa1] + +v_div_fmas_f32 v5, -|exec_hi|, -|exec_hi|, -|exec_hi| +// GFX13: v_div_fmas_f32 v5, -|exec_hi|, -|exec_hi|, -|exec_hi| ; encoding: [0x05,0x07,0x37,0xd6,0x7f,0xfe,0xfc,0xe1] + +v_div_fmas_f32 v5, null, m0, -|m0| +// GFX13: v_div_fmas_f32 v5, null, m0, -|m0| ; encoding: [0x05,0x04,0x37,0xd6,0x7c,0xfa,0xf4,0x81] + +v_div_fmas_f32 v5, -1, -|vcc_lo|, -|vcc_lo| +// GFX13: v_div_fmas_f32 v5, -1, -|vcc_lo|, -|vcc_lo| ; encoding: [0x05,0x06,0x37,0xd6,0xc1,0xd4,0xa8,0xc1] + +v_div_fmas_f32 v5, 0.5, -|vcc_hi|, 0.5 mul:2 +// GFX13: v_div_fmas_f32 v5, 0.5, -|vcc_hi|, 0.5 mul:2 ; encoding: [0x05,0x02,0x37,0xd6,0xf0,0xd6,0xc0,0x4b] + +v_div_fmas_f32 v5, v1, 0xaf123456, 0xaf123456 +// GFX13: v_div_fmas_f32 v5, v1, 0xaf123456, 0xaf123456 ; encoding: [0x05,0x00,0x37,0xd6,0x01,0xff,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_fmas_f32 v5, v255, src_scc, src_scc +// GFX13: 
v_div_fmas_f32 v5, v255, src_scc, src_scc ; encoding: [0x05,0x00,0x37,0xd6,0xff,0xfb,0xf5,0x03] + +v_div_fmas_f32 v5, s105, s105, s105 +// GFX13: v_div_fmas_f32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x37,0xd6,0x69,0xd2,0xa4,0x01] + +v_div_fmas_f32 v5, src_scc, -1, -1 mul:4 +// GFX13: v_div_fmas_f32 v5, src_scc, -1, -1 mul:4 ; encoding: [0x05,0x00,0x37,0xd6,0xfd,0x82,0x05,0x13] + +v_div_fmas_f32 v255, -|0xaf123456|, null, null clamp div:2 +// GFX13: v_div_fmas_f32 v255, -|0xaf123456|, null, null clamp div:2 ; encoding: [0xff,0x81,0x37,0xd6,0xff,0xf8,0xf0,0x39,0x56,0x34,0x12,0xaf] + +v_div_fmas_f64 v[5:6], v[1:2], 0xaf123456, 0xaf123456 +// GFX13: v_div_fmas_f64 v[5:6], v[1:2], 0xaf123456, 0xaf123456 ; encoding: [0x05,0x00,0x38,0xd6,0x01,0xff,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_fmas_f64 v[5:6], v[254:255], src_scc, v[3:4] +// GFX13: v_div_fmas_f64 v[5:6], v[254:255], src_scc, v[3:4] ; encoding: [0x05,0x00,0x38,0xd6,0xfe,0xfb,0x0d,0x04] + +v_div_fmas_f64 v[5:6], s[104:105], |s[104:105]|, s[104:105] +// GFX13: v_div_fmas_f64 v[5:6], s[104:105], |s[104:105]|, s[104:105] ; encoding: [0x05,0x02,0x38,0xd6,0x68,0xd0,0xa0,0x01] + +v_div_fmas_f64 v[5:6], -|vcc|, v[2:3], -|v[254:255]| +// GFX13: v_div_fmas_f64 v[5:6], -|vcc|, v[2:3], -|v[254:255]| ; encoding: [0x05,0x05,0x38,0xd6,0x6a,0x04,0xfa,0xa7] + +v_div_fmas_f64 v[5:6], -|ttmp[14:15]|, -|ttmp[14:15]|, -|ttmp[14:15]| +// GFX13: v_div_fmas_f64 v[5:6], -|ttmp[14:15]|, -|ttmp[14:15]|, -|ttmp[14:15]| ; encoding: [0x05,0x07,0x38,0xd6,0x7a,0xf4,0xe8,0xe1] + +v_div_fmas_f64 v[5:6], -|exec|, -|v[254:255]|, null +// GFX13: v_div_fmas_f64 v[5:6], -|exec|, -|v[254:255]|, null ; encoding: [0x05,0x03,0x38,0xd6,0x7e,0xfc,0xf3,0x61] + +v_div_fmas_f64 v[5:6], null, 0.5, -src_scc +// GFX13: v_div_fmas_f64 v[5:6], null, 0.5, -src_scc ; encoding: [0x05,0x00,0x38,0xd6,0x7c,0xe0,0xf5,0x83] + +v_div_fmas_f64 v[5:6], -1, -exec, |exec| +// GFX13: v_div_fmas_f64 v[5:6], -1, -exec, |exec| ; encoding: 
[0x05,0x04,0x38,0xd6,0xc1,0xfc,0xf8,0x41] + +v_div_fmas_f64 v[5:6], 0.5, -|vcc|, -|vcc| mul:2 +// GFX13: v_div_fmas_f64 v[5:6], 0.5, -|vcc|, -|vcc| mul:2 ; encoding: [0x05,0x06,0x38,0xd6,0xf0,0xd4,0xa8,0xc9] + +v_div_fmas_f64 v[5:6], -|src_scc|, -1, 0.5 mul:4 +// GFX13: v_div_fmas_f64 v[5:6], -|src_scc|, -1, 0.5 mul:4 ; encoding: [0x05,0x01,0x38,0xd6,0xfd,0x82,0xc1,0x33] + +v_div_fmas_f64 v[254:255], 0xaf123456, null, -1 clamp div:2 +// GFX13: v_div_fmas_f64 v[254:255], 0xaf123456, null, -1 clamp div:2 ; encoding: [0xfe,0x80,0x38,0xd6,0xff,0xf8,0x04,0x1b,0x56,0x34,0x12,0xaf] + +v_div_scale_f32 v5, vcc_lo, v1, v2, s3 +// W32: v_div_scale_f32 v5, vcc_lo, v1, v2, s3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0x05,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, v255, s2, s105 +// W32: v_div_scale_f32 v5, vcc_lo, v255, s2, s105 ; encoding: [0x05,0x6a,0xfc,0xd6,0xff,0x05,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, s1, v255, exec_hi +// W32: v_div_scale_f32 v5, vcc_lo, s1, v255, exec_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0xfe,0xff,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, s105, s105, exec_lo +// W32: v_div_scale_f32 v5, vcc_lo, s105, s105, exec_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x69,0xd2,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, vcc_lo, ttmp15, v3 +// W32: v_div_scale_f32 v5, vcc_lo, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6a,0xf6,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, vcc_hi, 0xaf123456, v255 +// W32: v_div_scale_f32 v5, vcc_lo, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, 
-ttmp15, -src_scc, -ttmp15 +// W32: v_div_scale_f32 v5, vcc_lo, -ttmp15, -src_scc, -ttmp15 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7b,0xfa,0xed,0xe1] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, m0, 0.5, m0 +// W32: v_div_scale_f32 v5, vcc_lo, m0, 0.5, m0 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7d,0xe0,0xf5,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, exec_lo, -1, vcc_hi +// W32: v_div_scale_f32 v5, vcc_lo, exec_lo, -1, vcc_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x7e,0x82,0xad,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -exec_hi, null, -vcc_lo +// W32: v_div_scale_f32 v5, vcc_lo, -exec_hi, null, -vcc_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x7f,0xf8,0xa8,0xa1] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, null, exec_lo, neg(0xaf123456) +// W32: v_div_scale_f32 v5, vcc_lo, null, exec_lo, neg(0xaf123456) ; encoding: [0x05,0x6a,0xfc,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -1, -exec_hi, -src_scc +// W32: v_div_scale_f32 v5, vcc_lo, -1, -exec_hi, -src_scc ; encoding: [0x05,0x6a,0xfc,0xd6,0xc1,0xfe,0xf4,0xc3] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, 0.5, -m0, 0.5 mul:2 +// W32: v_div_scale_f32 v5, vcc_lo, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x6a,0xfc,0xd6,0xf0,0xfa,0xc0,0x4b] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -src_scc, vcc_lo, -1 mul:4 +// W32: v_div_scale_f32 v5, vcc_lo, -src_scc, vcc_lo, -1 mul:4 ; encoding: [0x05,0x6a,0xfc,0xd6,0xfd,0xd4,0x04,0x33] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v255, vcc_lo, neg(0xaf123456), -vcc_hi, null clamp div:2 +// W32: v_div_scale_f32 v255, vcc_lo, 
neg(0xaf123456), -vcc_hi, null clamp div:2 ; encoding: [0xff,0xea,0xfc,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc, v1, v2, s3 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, v1, v2, s3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0x05,0x0e,0x00] + +v_div_scale_f32 v5, vcc, v255, s2, s105 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, v255, s2, s105 ; encoding: [0x05,0x6a,0xfc,0xd6,0xff,0x05,0xa4,0x01] + +v_div_scale_f32 v5, vcc, s1, v255, exec_hi +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, s1, v255, exec_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0xfe,0xff,0x01] + +v_div_scale_f32 v5, vcc, s105, s105, exec_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, s105, s105, exec_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x69,0xd2,0xf8,0x01] + +v_div_scale_f32 v5, vcc, vcc_lo, ttmp15, v3 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6a,0xf6,0x0c,0x04] + +v_div_scale_f32 v5, vcc, vcc_hi, 0xaf123456, v255 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_div_scale_f32 v5, vcc, -ttmp15, -src_scc, -ttmp15 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -ttmp15, -src_scc, -ttmp15 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7b,0xfa,0xed,0xe1] + +v_div_scale_f32 v5, vcc, m0, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, m0, 0.5, m0 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7d,0xe0,0xf5,0x01] + +v_div_scale_f32 
v5, vcc, exec_lo, -1, vcc_hi +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, exec_lo, -1, vcc_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x7e,0x82,0xad,0x01] + +v_div_scale_f32 v5, vcc, -exec_hi, null, -vcc_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -exec_hi, null, -vcc_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_div_scale_f32 v5, vcc, null, exec_lo, neg(0xaf123456) +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, null, exec_lo, neg(0xaf123456) ; encoding: [0x05,0x6a,0xfc,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_div_scale_f32 v5, vcc, -1, -exec_hi, -src_scc +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -1, -exec_hi, -src_scc ; encoding: [0x05,0x6a,0xfc,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_div_scale_f32 v5, vcc, 0.5, -m0, 0.5 mul:2 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x6a,0xfc,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_div_scale_f32 v5, vcc, -src_scc, vcc_lo, -1 mul:4 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -src_scc, vcc_lo, -1 mul:4 ; encoding: [0x05,0x6a,0xfc,0xd6,0xfd,0xd4,0x04,0x33] + +v_div_scale_f32 v255, vcc, neg(0xaf123456), -vcc_hi, null clamp div:2 +// W32-ERR: :[[@LINE-1]]:23: error: invalid operand for instruction +// W64: v_div_scale_f32 v255, vcc, neg(0xaf123456), -vcc_hi, null clamp div:2 ; encoding: [0xff,0xea,0xfc,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_div_scale_f64 v[5:6], vcc_lo, v[1:2], v[2:3], v[3:4] +// W32: v_div_scale_f64 v[5:6], vcc_lo, v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x6a,0xfd,0xd6,0x01,0x05,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, v[254:255], v[254:255], 
s[6:7] +// W32: v_div_scale_f64 v[5:6], vcc_lo, v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x6a,0xfd,0xd6,0xfe,0xfd,0x1b,0x00] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, s[2:3], s[4:5], v[254:255] +// W32: v_div_scale_f64 v[5:6], vcc_lo, s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x6a,0xfd,0xd6,0x02,0x08,0xf8,0x07] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -s[104:105], s[104:105], -s[104:105] +// W32: v_div_scale_f64 v[5:6], vcc_lo, -s[104:105], s[104:105], -s[104:105] ; encoding: [0x05,0x6a,0xfd,0xd6,0x68,0xd0,0xa0,0xa1] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, vcc, -ttmp[14:15], -ttmp[14:15] +// W32: v_div_scale_f64 v[5:6], vcc_lo, vcc, -ttmp[14:15], -ttmp[14:15] ; encoding: [0x05,0x6a,0xfd,0xd6,0x6a,0xf4,0xe8,0xc1] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -ttmp[14:15], 0xaf123456, null +// W32: v_div_scale_f64 v[5:6], vcc_lo, -ttmp[14:15], 0xaf123456, null ; encoding: [0x05,0x6a,0xfd,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -exec, -src_scc, -exec +// W32: v_div_scale_f64 v[5:6], vcc_lo, -exec, -src_scc, -exec ; encoding: [0x05,0x6a,0xfd,0xd6,0x7e,0xfa,0xf9,0xe1] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, null, 0.5, vcc +// W32: v_div_scale_f64 v[5:6], vcc_lo, null, 0.5, vcc ; encoding: [0x05,0x6a,0xfd,0xd6,0x7c,0xe0,0xa9,0x01] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -1, -1, 0xaf123456 +// W32: v_div_scale_f64 v[5:6], vcc_lo, -1, -1, 0xaf123456 ; encoding: [0x05,0x6a,0xfd,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for 
instruction + +v_div_scale_f64 v[5:6], vcc_lo, 0.5, null, -src_scc mul:2 +// W32: v_div_scale_f64 v[5:6], vcc_lo, 0.5, null, -src_scc mul:2 ; encoding: [0x05,0x6a,0xfd,0xd6,0xf0,0xf8,0xf4,0x8b] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -src_scc, -exec, 0.5 mul:4 +// W32: v_div_scale_f64 v[5:6], vcc_lo, -src_scc, -exec, 0.5 mul:4 ; encoding: [0x05,0x6a,0xfd,0xd6,0xfd,0xfc,0xc0,0x73] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[254:255], vcc_lo, 0xaf123456, -vcc, -1 clamp div:2 +// W32: v_div_scale_f64 v[254:255], vcc_lo, 0xaf123456, -vcc, -1 clamp div:2 ; encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc, v[1:2], v[2:3], v[3:4] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x6a,0xfd,0xd6,0x01,0x05,0x0e,0x04] + +v_div_scale_f64 v[5:6], vcc, v[254:255], v[254:255], s[6:7] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x6a,0xfd,0xd6,0xfe,0xfd,0x1b,0x00] + +v_div_scale_f64 v[5:6], vcc, s[2:3], s[4:5], v[254:255] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x6a,0xfd,0xd6,0x02,0x08,0xf8,0x07] + +v_div_scale_f64 v[5:6], vcc, -s[104:105], s[104:105], -s[104:105] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -s[104:105], s[104:105], -s[104:105] ; encoding: [0x05,0x6a,0xfd,0xd6,0x68,0xd0,0xa0,0xa1] + +v_div_scale_f64 v[5:6], vcc, vcc, -ttmp[14:15], -ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, vcc, 
-ttmp[14:15], -ttmp[14:15] ; encoding: [0x05,0x6a,0xfd,0xd6,0x6a,0xf4,0xe8,0xc1] + +v_div_scale_f64 v[5:6], vcc, -ttmp[14:15], 0xaf123456, null +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -ttmp[14:15], 0xaf123456, null ; encoding: [0x05,0x6a,0xfd,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] + +v_div_scale_f64 v[5:6], vcc, -exec, -src_scc, -exec +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -exec, -src_scc, -exec ; encoding: [0x05,0x6a,0xfd,0xd6,0x7e,0xfa,0xf9,0xe1] + +v_div_scale_f64 v[5:6], vcc, null, 0.5, vcc +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, null, 0.5, vcc ; encoding: [0x05,0x6a,0xfd,0xd6,0x7c,0xe0,0xa9,0x01] + +v_div_scale_f64 v[5:6], vcc, -1, -1, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -1, -1, 0xaf123456 ; encoding: [0x05,0x6a,0xfd,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_scale_f64 v[5:6], vcc, 0.5, null, -src_scc mul:2 +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, 0.5, null, -src_scc mul:2 ; encoding: [0x05,0x6a,0xfd,0xd6,0xf0,0xf8,0xf4,0x8b] + +v_div_scale_f64 v[5:6], vcc, -src_scc, -exec, 0.5 mul:4 +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -src_scc, -exec, 0.5 mul:4 ; encoding: [0x05,0x6a,0xfd,0xd6,0xfd,0xfc,0xc0,0x73] + +v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2 ; encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, v1, v2, s3 +// GFX13: v_fma_dx9_zero_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00] + 
+v_fma_dx9_zero_f32 v5, v255, s2, s105 +// GFX13: v_fma_dx9_zero_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x09,0xd6,0xff,0x05,0xa4,0x01] + +v_fma_dx9_zero_f32 v5, s1, v255, exec_hi +// GFX13: v_fma_dx9_zero_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x09,0xd6,0x01,0xfe,0xff,0x01] + +v_fma_dx9_zero_f32 v5, s105, s105, exec_lo +// GFX13: v_fma_dx9_zero_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x09,0xd6,0x69,0xd2,0xf8,0x01] + +v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x09,0xd6,0x6a,0xf6,0x0c,0x04] + +v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x09,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x09,0xd6,0x7b,0xfa,0xed,0xe1] + +v_fma_dx9_zero_f32 v5, m0, 0.5, m0 +// GFX13: v_fma_dx9_zero_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x09,0xd6,0x7d,0xe0,0xf5,0x01] + +v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x09,0xd6,0x7e,0x82,0xad,0x01] + +v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x09,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x09,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x09,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x09,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 
+// GFX13: v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x09,0xd6,0xfd,0xd4,0x04,0x33] + +v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_fma_f16 v5, v1, v2, s3 +// GFX13: v_fma_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x4b,0xd7,0x01,0x05,0x0e,0x00] + +v_fma_f16 v5, v255, s2, s105 +// GFX13: v_fma_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x4b,0xd7,0xff,0x05,0xa4,0x01] + +v_fma_f16 v5, s1, v255, exec_hi +// GFX13: v_fma_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x4b,0xd7,0x01,0xfe,0xff,0x01] + +v_fma_f16 v5, s105, s105, exec_lo +// GFX13: v_fma_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x4b,0xd7,0x69,0xd2,0xf8,0x01] + +v_fma_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x4b,0xd7,0x6a,0xf6,0x0c,0x04] + +v_fma_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x4b,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x4b,0xd7,0x7b,0xfa,0xed,0xe1] + +v_fma_f16 v5, m0, 0.5, m0 +// GFX13: v_fma_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x4b,0xd7,0x7d,0xe0,0xf5,0x01] + +v_fma_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x4b,0xd7,0x7e,0x82,0xad,0x01] + +v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x4b,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_fma_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x4b,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] 
+// GFX13: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x4b,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x4b,0xd7,0xf0,0xfa,0xc0,0x43] + +v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x4b,0xd7,0xfd,0xd4,0x04,0x23] + +v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x4b,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_fma_f32 v5, v1, v2, s3 +// GFX13: v_fma_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00] + +v_fma_f32 v5, v255, s2, s105 +// GFX13: v_fma_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x13,0xd6,0xff,0x05,0xa4,0x01] + +v_fma_f32 v5, s1, v255, exec_hi +// GFX13: v_fma_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x13,0xd6,0x01,0xfe,0xff,0x01] + +v_fma_f32 v5, s105, s105, exec_lo +// GFX13: v_fma_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x13,0xd6,0x69,0xd2,0xf8,0x01] + +v_fma_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x13,0xd6,0x6a,0xf6,0x0c,0x04] + +v_fma_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_fma_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x13,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_fma_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x13,0xd6,0x7b,0xfa,0xed,0xe1] + +v_fma_f32 v5, m0, 0.5, m0 +// GFX13: v_fma_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x13,0xd6,0x7d,0xe0,0xf5,0x01] + +v_fma_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x13,0xd6,0x7e,0x82,0xad,0x01] + +v_fma_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_fma_f32 v5, -|exec_hi|, null, 
-|vcc_lo| ; encoding: [0x05,0x05,0x13,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_fma_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_fma_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x13,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_fma_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_fma_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x13,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_fma_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_fma_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x13,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_fma_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_fma_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x13,0xd6,0xfd,0xd4,0x04,0x33] + +v_fma_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_fma_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x13,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_fma_f64 v[5:6], v[1:2], v[2:3], v[3:4] +// GFX13: v_fma_f64 v[5:6], v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x00,0x14,0xd6,0x01,0x05,0x0e,0x04] + +v_fma_f64 v[5:6], v[254:255], v[254:255], s[6:7] +// GFX13: v_fma_f64 v[5:6], v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x00,0x14,0xd6,0xfe,0xfd,0x1b,0x00] + +v_fma_f64 v[5:6], s[2:3], s[4:5], v[254:255] +// GFX13: v_fma_f64 v[5:6], s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x00,0x14,0xd6,0x02,0x08,0xf8,0x07] + +v_fma_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| +// GFX13: v_fma_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| ; encoding: [0x05,0x05,0x14,0xd6,0x68,0xd0,0xa0,0xa1] + +v_fma_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| +// GFX13: v_fma_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| ; encoding: [0x05,0x06,0x14,0xd6,0x6a,0xf4,0xe8,0xc1] + +v_fma_f64 v[5:6], -|ttmp[14:15]|, 0xaf123456, null +// GFX13: v_fma_f64 v[5:6], -|ttmp[14:15]|, 0xaf123456, null ; encoding: [0x05,0x01,0x14,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] + +v_fma_f64 v[5:6], -|exec|, -|src_scc|, -|exec| +// GFX13: v_fma_f64 v[5:6], -|exec|, 
-|src_scc|, -|exec| ; encoding: [0x05,0x07,0x14,0xd6,0x7e,0xfa,0xf9,0xe1] + +v_fma_f64 v[5:6], null, 0.5, vcc +// GFX13: v_fma_f64 v[5:6], null, 0.5, vcc ; encoding: [0x05,0x00,0x14,0xd6,0x7c,0xe0,0xa9,0x01] + +v_fma_f64 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_fma_f64 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x14,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_fma_f64 v[5:6], 0.5, null, -|src_scc| mul:2 +// GFX13: v_fma_f64 v[5:6], 0.5, null, -|src_scc| mul:2 ; encoding: [0x05,0x04,0x14,0xd6,0xf0,0xf8,0xf4,0x8b] + +v_fma_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 +// GFX13: v_fma_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 ; encoding: [0x05,0x03,0x14,0xd6,0xfd,0xfc,0xc0,0x73] + +v_fma_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 +// GFX13: v_fma_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 ; encoding: [0xfe,0x82,0x14,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, v1, v2, s3 +// GFX13: v_fma_dx9_zero_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00] + +v_fma_dx9_zero_f32 v5, v255, s2, s105 +// GFX13: v_fma_dx9_zero_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x09,0xd6,0xff,0x05,0xa4,0x01] + +v_fma_dx9_zero_f32 v5, s1, v255, exec_hi +// GFX13: v_fma_dx9_zero_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x09,0xd6,0x01,0xfe,0xff,0x01] + +v_fma_dx9_zero_f32 v5, s105, s105, exec_lo +// GFX13: v_fma_dx9_zero_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x09,0xd6,0x69,0xd2,0xf8,0x01] + +v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x09,0xd6,0x6a,0xf6,0x0c,0x04] + +v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x09,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x09,0xd6,0x7b,0xfa,0xed,0xe1] + 
+v_fma_dx9_zero_f32 v5, m0, 0.5, m0 +// GFX13: v_fma_dx9_zero_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x09,0xd6,0x7d,0xe0,0xf5,0x01] + +v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x09,0xd6,0x7e,0x82,0xad,0x01] + +v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x09,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x09,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x09,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x09,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x09,0xd6,0xfd,0xd4,0x04,0x33] + +v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_ldexp_f32 v5, v1, v2 +// GFX13: v_ldexp_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x62,0xd7,0x01,0x05,0x02,0x02] + +v_ldexp_f32 v5, v255, v255 +// GFX13: v_ldexp_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x62,0xd7,0xff,0xff,0x03,0x02] + +v_ldexp_f32 v5, s1, s2 +// GFX13: v_ldexp_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x62,0xd7,0x01,0x04,0x00,0x02] + +v_ldexp_f32 v5, s105, s105 +// GFX13: v_ldexp_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x62,0xd7,0x69,0xd2,0x00,0x02] + +v_ldexp_f32 v5, vcc_lo, ttmp15 +// GFX13: v_ldexp_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x62,0xd7,0x6a,0xf6,0x00,0x02] + +v_ldexp_f32 v5, 
vcc_hi, 0xaf123456 +// GFX13: v_ldexp_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x62,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ldexp_f32 v5, ttmp15, src_scc +// GFX13: v_ldexp_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x62,0xd7,0x7b,0xfa,0x01,0x02] + +v_ldexp_f32 v5, m0, 0.5 +// GFX13: v_ldexp_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x62,0xd7,0x7d,0xe0,0x01,0x02] + +v_ldexp_f32 v5, exec_lo, -1 +// GFX13: v_ldexp_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x62,0xd7,0x7e,0x82,0x01,0x02] + +v_ldexp_f32 v5, exec_hi, null +// GFX13: v_ldexp_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x62,0xd7,0x7f,0xf8,0x00,0x02] + +v_ldexp_f32 v5, null, exec_lo +// GFX13: v_ldexp_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x62,0xd7,0x7c,0xfc,0x00,0x02] + +v_ldexp_f32 v5, -1, exec_hi +// GFX13: v_ldexp_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x62,0xd7,0xc1,0xfe,0x00,0x02] + +v_ldexp_f32 v5, 0.5, m0 mul:2 +// GFX13: v_ldexp_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x62,0xd7,0xf0,0xfa,0x00,0x0a] + +v_ldexp_f32 v5, src_scc, vcc_lo mul:4 +// GFX13: v_ldexp_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x62,0xd7,0xfd,0xd4,0x00,0x12] + +v_ldexp_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 +// GFX13: v_ldexp_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x62,0xd7,0xff,0xd6,0x00,0x3a,0x56,0x34,0x12,0xaf] + +v_ldexp_f64 v[5:6], v[1:2], v2 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], v2 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0x05,0x02,0x02] + +v_ldexp_f64 v[5:6], v[1:2], v255 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], v255 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0xff,0x03,0x02] + +v_ldexp_f64 v[5:6], v[1:2], s2 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], s2 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0x05,0x00,0x02] + +v_ldexp_f64 v[5:6], v[1:2], s105 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], s105 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0xd3,0x00,0x02] + +v_ldexp_f64 v[5:6], v[254:255], ttmp15 +// GFX13: v_ldexp_f64 v[5:6], v[254:255], ttmp15 ; encoding: 
[0x05,0x00,0x2b,0xd7,0xfe,0xf7,0x00,0x02] + +v_ldexp_f64 v[5:6], s[2:3], vcc_hi +// GFX13: v_ldexp_f64 v[5:6], s[2:3], vcc_hi ; encoding: [0x05,0x00,0x2b,0xd7,0x02,0xd6,0x00,0x02] + +v_ldexp_f64 v[5:6], s[104:105], vcc_lo +// GFX13: v_ldexp_f64 v[5:6], s[104:105], vcc_lo ; encoding: [0x05,0x00,0x2b,0xd7,0x68,0xd4,0x00,0x02] + +v_ldexp_f64 v[5:6], vcc, m0 +// GFX13: v_ldexp_f64 v[5:6], vcc, m0 ; encoding: [0x05,0x00,0x2b,0xd7,0x6a,0xfa,0x00,0x02] + +v_ldexp_f64 v[5:6], ttmp[14:15], exec_hi +// GFX13: v_ldexp_f64 v[5:6], ttmp[14:15], exec_hi ; encoding: [0x05,0x00,0x2b,0xd7,0x7a,0xfe,0x00,0x02] + +v_ldexp_f64 v[5:6], exec, exec_lo +// GFX13: v_ldexp_f64 v[5:6], exec, exec_lo ; encoding: [0x05,0x00,0x2b,0xd7,0x7e,0xfc,0x00,0x02] + +v_ldexp_f64 v[5:6], null, null +// GFX13: v_ldexp_f64 v[5:6], null, null ; encoding: [0x05,0x00,0x2b,0xd7,0x7c,0xf8,0x00,0x02] + +v_ldexp_f64 v[5:6], -1, -1 +// GFX13: v_ldexp_f64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x2b,0xd7,0xc1,0x82,0x01,0x02] + +v_ldexp_f64 v[5:6], 0.5, 0.5 mul:2 +// GFX13: v_ldexp_f64 v[5:6], 0.5, 0.5 mul:2 ; encoding: [0x05,0x00,0x2b,0xd7,0xf0,0xe0,0x01,0x0a] + +v_ldexp_f64 v[5:6], -|src_scc|, src_scc mul:4 +// GFX13: v_ldexp_f64 v[5:6], -|src_scc|, src_scc mul:4 ; encoding: [0x05,0x01,0x2b,0xd7,0xfd,0xfa,0x01,0x32] + +v_ldexp_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 +// GFX13: v_ldexp_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x2b,0xd7,0xff,0xfe,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_lerp_u8 v5, v1, v2, s3 +// GFX13: v_lerp_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x15,0xd6,0x01,0x05,0x0e,0x00] + +v_lerp_u8 v5, v255, s2, s105 +// GFX13: v_lerp_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x15,0xd6,0xff,0x05,0xa4,0x01] + +v_lerp_u8 v5, s1, v255, exec_hi +// GFX13: v_lerp_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x15,0xd6,0x01,0xfe,0xff,0x01] + +v_lerp_u8 v5, s105, s105, exec_lo +// GFX13: v_lerp_u8 v5, s105, s105, exec_lo ; encoding: 
[0x05,0x00,0x15,0xd6,0x69,0xd2,0xf8,0x01] + +v_lerp_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_lerp_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x15,0xd6,0x6a,0xf6,0x0c,0x04] + +v_lerp_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_lerp_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x15,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_lerp_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_lerp_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x15,0xd6,0x7b,0xfa,0xed,0x01] + +v_lerp_u8 v5, m0, 0.5, m0 +// GFX13: v_lerp_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x15,0xd6,0x7d,0xe0,0xf5,0x01] + +v_lerp_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_lerp_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x15,0xd6,0x7e,0x82,0xad,0x01] + +v_lerp_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_lerp_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x15,0xd6,0x7f,0xf8,0xa8,0x01] + +v_lerp_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_lerp_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x15,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_lerp_u8 v5, -1, exec_hi, src_scc +// GFX13: v_lerp_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x15,0xd6,0xc1,0xfe,0xf4,0x03] + +v_lerp_u8 v5, 0.5, m0, 0.5 +// GFX13: v_lerp_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x15,0xd6,0xf0,0xfa,0xc0,0x03] + +v_lerp_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_lerp_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x15,0xd6,0xfd,0xd4,0x04,0x03] + +v_lerp_u8 v255, 0xaf123456, vcc_hi, null +// GFX13: v_lerp_u8 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x15,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_lshl_add_u32 v5, v1, v2, s3 +// GFX13: v_lshl_add_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x46,0xd7,0x01,0x05,0x0e,0x00] + +v_lshl_add_u32 v5, v255, s2, s105 +// GFX13: v_lshl_add_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x46,0xd7,0xff,0x05,0xa4,0x01] + +v_lshl_add_u32 v5, s1, v255, exec_hi +// GFX13: v_lshl_add_u32 v5, s1, v255, exec_hi ; encoding: 
[0x05,0x00,0x46,0xd7,0x01,0xfe,0xff,0x01] + +v_lshl_add_u32 v5, s105, s105, exec_lo +// GFX13: v_lshl_add_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x46,0xd7,0x69,0xd2,0xf8,0x01] + +v_lshl_add_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_lshl_add_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x46,0xd7,0x6a,0xf6,0x0c,0x04] + +v_lshl_add_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_lshl_add_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x46,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_lshl_add_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_lshl_add_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x46,0xd7,0x7b,0xfa,0xed,0x01] + +v_lshl_add_u32 v5, m0, 0.5, m0 +// GFX13: v_lshl_add_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x46,0xd7,0x7d,0xe0,0xf5,0x01] + +v_lshl_add_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_lshl_add_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x46,0xd7,0x7e,0x82,0xad,0x01] + +v_lshl_add_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_lshl_add_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x46,0xd7,0x7f,0xf8,0xa8,0x01] + +v_lshl_add_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_lshl_add_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x46,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_lshl_add_u32 v5, -1, exec_hi, src_scc +// GFX13: v_lshl_add_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x46,0xd7,0xc1,0xfe,0xf4,0x03] + +v_lshl_add_u32 v5, 0.5, m0, 0.5 +// GFX13: v_lshl_add_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x46,0xd7,0xf0,0xfa,0xc0,0x03] + +v_lshl_add_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_lshl_add_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x46,0xd7,0xfd,0xd4,0x04,0x03] + +v_lshl_add_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_lshl_add_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x46,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_lshl_or_b32 v5, v1, v2, s3 +// GFX13: v_lshl_or_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6f,0xd7,0x01,0x05,0x0e,0x00] + 
+v_lshl_or_b32 v5, v255, s2, s105 +// GFX13: v_lshl_or_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6f,0xd7,0xff,0x05,0xa4,0x01] + +v_lshl_or_b32 v5, s1, v255, exec_hi +// GFX13: v_lshl_or_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6f,0xd7,0x01,0xfe,0xff,0x01] + +v_lshl_or_b32 v5, s105, s105, exec_lo +// GFX13: v_lshl_or_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6f,0xd7,0x69,0xd2,0xf8,0x01] + +v_lshl_or_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_lshl_or_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6f,0xd7,0x6a,0xf6,0x0c,0x04] + +v_lshl_or_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_lshl_or_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6f,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_lshl_or_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_lshl_or_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x6f,0xd7,0x7b,0xfa,0xed,0x01] + +v_lshl_or_b32 v5, m0, 0.5, m0 +// GFX13: v_lshl_or_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6f,0xd7,0x7d,0xe0,0xf5,0x01] + +v_lshl_or_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_lshl_or_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x6f,0xd7,0x7e,0x82,0xad,0x01] + +v_lshl_or_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_lshl_or_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x6f,0xd7,0x7f,0xf8,0xa8,0x01] + +v_lshl_or_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_lshl_or_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x6f,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_lshl_or_b32 v5, -1, exec_hi, src_scc +// GFX13: v_lshl_or_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x6f,0xd7,0xc1,0xfe,0xf4,0x03] + +v_lshl_or_b32 v5, 0.5, m0, 0.5 +// GFX13: v_lshl_or_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x6f,0xd7,0xf0,0xfa,0xc0,0x03] + +v_lshl_or_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_lshl_or_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x6f,0xd7,0xfd,0xd4,0x04,0x03] + +v_lshl_or_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_lshl_or_b32 v255, 0xaf123456, vcc_hi, 
null ; encoding: [0xff,0x00,0x6f,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_lshlrev_b16 v5, v1, v2 +// GFX13: v_lshlrev_b16 v5, v1, v2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x02,0x02] + +v_lshlrev_b16 v5, v255, v255 +// GFX13: v_lshlrev_b16 v5, v255, v255 ; encoding: [0x05,0x00,0x14,0xd7,0xff,0xff,0x03,0x02] + +v_lshlrev_b16 v5, s1, s2 +// GFX13: v_lshlrev_b16 v5, s1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x04,0x00,0x02] + +v_lshlrev_b16 v5, s105, s105 +// GFX13: v_lshlrev_b16 v5, s105, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x69,0xd2,0x00,0x02] + +v_lshlrev_b16 v5, vcc_lo, ttmp15 +// GFX13: v_lshlrev_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x6a,0xf6,0x00,0x02] + +v_lshlrev_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_lshlrev_b16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x14,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_lshlrev_b16 v5, ttmp15, src_scc +// GFX13: v_lshlrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x14,0xd7,0x7b,0xfa,0x01,0x02] + +v_lshlrev_b16 v5, m0, 0.5 +// GFX13-ASM: v_lshlrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x14,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_lshlrev_b16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x14,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_lshlrev_b16 v5, exec_lo, -1 +// GFX13: v_lshlrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x14,0xd7,0x7e,0x82,0x01,0x02] + +v_lshlrev_b16 v5, exec_hi, null +// GFX13: v_lshlrev_b16 v5, exec_hi, null ; encoding: [0x05,0x00,0x14,0xd7,0x7f,0xf8,0x00,0x02] + +v_lshlrev_b16 v5, null, exec_lo +// GFX13: v_lshlrev_b16 v5, null, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x7c,0xfc,0x00,0x02] + +v_lshlrev_b16 v5, -1, exec_hi +// GFX13: v_lshlrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0xc1,0xfe,0x00,0x02] + +v_lshlrev_b16 v5, 0.5, m0 +// GFX13-ASM: v_lshlrev_b16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x14,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_lshlrev_b16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x14,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + 
+v_lshlrev_b16 v5, src_scc, vcc_lo +// GFX13: v_lshlrev_b16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0xfd,0xd4,0x00,0x02] + +v_lshlrev_b16 v255, 0xfe0b, vcc_hi +// GFX13: v_lshlrev_b16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x14,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_lshrrev_b16 v5, v1, v2 +// GFX13: v_lshrrev_b16 v5, v1, v2 ; encoding: [0x05,0x00,0x39,0xd7,0x01,0x05,0x02,0x02] + +v_lshrrev_b16 v5, v255, v255 +// GFX13: v_lshrrev_b16 v5, v255, v255 ; encoding: [0x05,0x00,0x39,0xd7,0xff,0xff,0x03,0x02] + +v_lshrrev_b16 v5, s1, s2 +// GFX13: v_lshrrev_b16 v5, s1, s2 ; encoding: [0x05,0x00,0x39,0xd7,0x01,0x04,0x00,0x02] + +v_lshrrev_b16 v5, s105, s105 +// GFX13: v_lshrrev_b16 v5, s105, s105 ; encoding: [0x05,0x00,0x39,0xd7,0x69,0xd2,0x00,0x02] + +v_lshrrev_b16 v5, vcc_lo, ttmp15 +// GFX13: v_lshrrev_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x39,0xd7,0x6a,0xf6,0x00,0x02] + +v_lshrrev_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_lshrrev_b16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x39,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_lshrrev_b16 v5, ttmp15, src_scc +// GFX13: v_lshrrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x39,0xd7,0x7b,0xfa,0x01,0x02] + +v_lshrrev_b16 v5, m0, 0.5 +// GFX13-ASM: v_lshrrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x39,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_lshrrev_b16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x39,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_lshrrev_b16 v5, exec_lo, -1 +// GFX13: v_lshrrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x39,0xd7,0x7e,0x82,0x01,0x02] + +v_lshrrev_b16 v5, exec_hi, null +// GFX13: v_lshrrev_b16 v5, exec_hi, null ; encoding: [0x05,0x00,0x39,0xd7,0x7f,0xf8,0x00,0x02] + +v_lshrrev_b16 v5, null, exec_lo +// GFX13: v_lshrrev_b16 v5, null, exec_lo ; encoding: [0x05,0x00,0x39,0xd7,0x7c,0xfc,0x00,0x02] + +v_lshrrev_b16 v5, -1, exec_hi +// GFX13: v_lshrrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x39,0xd7,0xc1,0xfe,0x00,0x02] + +v_lshrrev_b16 v5, 0.5, m0 +// 
GFX13-ASM: v_lshrrev_b16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x39,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_lshrrev_b16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x39,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_lshrrev_b16 v5, src_scc, vcc_lo +// GFX13: v_lshrrev_b16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x39,0xd7,0xfd,0xd4,0x00,0x02] + +v_lshrrev_b16 v255, 0xfe0b, vcc_hi +// GFX13: v_lshrrev_b16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x39,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_lshrrev_b64 v[5:6], v1, vcc +// GFX13: v_lshrrev_b64 v[5:6], v1, vcc ; encoding: [0x05,0x00,0x00,0xd7,0x01,0xd5,0x00,0x02] + +v_lshrrev_b64 v[5:6], v255, exec +// GFX13: v_lshrrev_b64 v[5:6], v255, exec ; encoding: [0x05,0x00,0x00,0xd7,0xff,0xfd,0x00,0x02] + +v_lshrrev_b64 v[5:6], exec_lo, v[2:3] +// GFX13: v_lshrrev_b64 v[5:6], exec_lo, v[2:3] ; encoding: [0x05,0x00,0x00,0xd7,0x7e,0x04,0x02,0x02] + +v_lshrrev_b64 v[5:6], exec_hi, v[254:255] +// GFX13: v_lshrrev_b64 v[5:6], exec_hi, v[254:255] ; encoding: [0x05,0x00,0x00,0xd7,0x7f,0xfc,0x03,0x02] + +v_lshrrev_b64 v[5:6], null, null +// GFX13: v_lshrrev_b64 v[5:6], null, null ; encoding: [0x05,0x00,0x00,0xd7,0x7c,0xf8,0x00,0x02] + +v_lshrrev_b64 v[5:6], -1, -1 +// GFX13: v_lshrrev_b64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x00,0xd7,0xc1,0x82,0x01,0x02] + +v_lshrrev_b64 v[5:6], 0.5, 0xaf123456 +// GFX13: v_lshrrev_b64 v[5:6], 0.5, 0xaf123456 ; encoding: [0x05,0x00,0x00,0xd7,0xf0,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_lshrrev_b64 v[5:6], src_scc, src_scc +// GFX13: v_lshrrev_b64 v[5:6], src_scc, src_scc ; encoding: [0x05,0x00,0x00,0xd7,0xfd,0xfa,0x01,0x02] + +v_lshrrev_b64 v[254:255], 0xaf123456, 0.5 +// GFX13: v_lshrrev_b64 v[254:255], 0xaf123456, 0.5 ; encoding: [0xfe,0x00,0x00,0xd7,0xff,0xe0,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mad_i16 v5, v1, v2, s3 +// GFX13: v_mad_i16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x5e,0xd7,0x01,0x05,0x0e,0x00] + +v_mad_i16 v5, v255, s2, s105 +// GFX13: v_mad_i16 v5, v255, s2, s105 ; 
encoding: [0x05,0x00,0x5e,0xd7,0xff,0x05,0xa4,0x01] + +v_mad_i16 v5, s1, v255, exec_hi +// GFX13: v_mad_i16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x5e,0xd7,0x01,0xfe,0xff,0x01] + +v_mad_i16 v5, s105, s105, exec_lo +// GFX13: v_mad_i16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x5e,0xd7,0x69,0xd2,0xf8,0x01] + +v_mad_i16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mad_i16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x5e,0xd7,0x6a,0xf6,0x0c,0x04] + +v_mad_i16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_mad_i16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x5e,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_mad_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x5e,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_i16 v5, m0, 0.5, m0 +// GFX13-ASM: v_mad_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x5e,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_i16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x5e,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_i16 v5, exec_lo, -1, vcc_hi +// GFX13: v_mad_i16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x5e,0xd7,0x7e,0x82,0xad,0x01] + +v_mad_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_mad_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x5e,0xd7,0x7f,0xf8,0xa8,0x01] + +v_mad_i16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_mad_i16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x5e,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x5e,0xd7,0xc1,0xfe,0xf4,0x03] + +v_mad_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_mad_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x5e,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_mad_i16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x5e,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_mad_i16 v5, src_scc, vcc_lo, -1 
op_sel:[0,0,1,0] +// GFX13: v_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x5e,0xd7,0xfd,0xd4,0x04,0x03] + +v_mad_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp +// GFX13: v_mad_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc0,0x5e,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_i32_i16 v5, v1, v2, v3 +// GFX13: v_mad_i32_i16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x75,0xd7,0x01,0x05,0x0e,0x04] + +v_mad_i32_i16 v5, v255, v255, s3 +// GFX13: v_mad_i32_i16 v5, v255, v255, s3 ; encoding: [0x05,0x00,0x75,0xd7,0xff,0xff,0x0f,0x00] + +v_mad_i32_i16 v5, s1, s2, v255 +// GFX13: v_mad_i32_i16 v5, s1, s2, v255 ; encoding: [0x05,0x00,0x75,0xd7,0x01,0x04,0xfc,0x07] + +v_mad_i32_i16 v5, s105, s105, s105 +// GFX13: v_mad_i32_i16 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x75,0xd7,0x69,0xd2,0xa4,0x01] + +v_mad_i32_i16 v5, vcc_lo, ttmp15, vcc_lo +// GFX13: v_mad_i32_i16 v5, vcc_lo, ttmp15, vcc_lo ; encoding: [0x05,0x00,0x75,0xd7,0x6a,0xf6,0xa8,0x01] + +v_mad_i32_i16 v5, vcc_hi, 0xfe0b, vcc_hi +// GFX13: v_mad_i32_i16 v5, vcc_hi, 0xfe0b, vcc_hi ; encoding: [0x05,0x00,0x75,0xd7,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_i32_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_i32_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x75,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_i32_i16 v5, m0, 0.5, m0 +// GFX13-ASM: v_mad_i32_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x75,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_i32_i16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x75,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_i32_i16 v5, exec_lo, -1, exec_hi +// GFX13: v_mad_i32_i16 v5, exec_lo, -1, exec_hi ; encoding: [0x05,0x00,0x75,0xd7,0x7e,0x82,0xfd,0x01] + +v_mad_i32_i16 v5, exec_hi, null, exec_lo +// GFX13: v_mad_i32_i16 v5, exec_hi, null, exec_lo ; encoding: [0x05,0x00,0x75,0xd7,0x7f,0xf8,0xf8,0x01] + +v_mad_i32_i16 v5, null, exec_lo, null +// GFX13: v_mad_i32_i16 v5, null, exec_lo, null ; encoding: 
[0x05,0x00,0x75,0xd7,0x7c,0xfc,0xf0,0x01] + +v_mad_i32_i16 v5, -1, exec_hi, 0xaf123456 +// GFX13: v_mad_i32_i16 v5, -1, exec_hi, 0xaf123456 ; encoding: [0x05,0x00,0x75,0xd7,0xc1,0xfe,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_i32_i16 v5, 0.5, m0, -1 op_sel:[0,0,0,0] +// GFX13-ASM: v_mad_i32_i16 v5, 0.5, m0, -1 ; encoding: [0x05,0x00,0x75,0xd7,0xf0,0xfa,0x04,0x03] +// GFX13-DIS: v_mad_i32_i16 v5, 0x3800, m0, -1 ; encoding: [0x05,0x00,0x75,0xd7,0xff,0xfa,0x04,0x03,0x00,0x38,0x00,0x00] + +v_mad_i32_i16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_i32_i16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x75,0xd7,0xfd,0xd4,0xf4,0x03] + +v_mad_i32_i16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp +// GFX13: v_mad_i32_i16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp ; encoding: [0xff,0x90,0x75,0xd7,0xff,0xd6,0xc0,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_i32_i24 v5, v1, v2, s3 +// GFX13: v_mad_i32_i24 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0a,0xd6,0x01,0x05,0x0e,0x00] + +v_mad_i32_i24 v5, v255, s2, s105 +// GFX13: v_mad_i32_i24 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0a,0xd6,0xff,0x05,0xa4,0x01] + +v_mad_i32_i24 v5, s1, v255, exec_hi +// GFX13: v_mad_i32_i24 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0a,0xd6,0x01,0xfe,0xff,0x01] + +v_mad_i32_i24 v5, s105, s105, exec_lo +// GFX13: v_mad_i32_i24 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0a,0xd6,0x69,0xd2,0xf8,0x01] + +v_mad_i32_i24 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mad_i32_i24 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_mad_i32_i24 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_mad_i32_i24 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0a,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_mad_i32_i24 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_i32_i24 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x0a,0xd6,0x7b,0xfa,0xed,0x01] + +v_mad_i32_i24 v5, m0, 0.5, m0 +// GFX13: v_mad_i32_i24 v5, m0, 0.5, m0 ; encoding: 
[0x05,0x00,0x0a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_mad_i32_i24 v5, exec_lo, -1, vcc_hi +// GFX13: v_mad_i32_i24 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x0a,0xd6,0x7e,0x82,0xad,0x01] + +v_mad_i32_i24 v5, exec_hi, null, vcc_lo +// GFX13: v_mad_i32_i24 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x0a,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_i32_i24 v5, null, exec_lo, 0xaf123456 +// GFX13: v_mad_i32_i24 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x0a,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_i32_i24 v5, -1, exec_hi, src_scc +// GFX13: v_mad_i32_i24 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x0a,0xd6,0xc1,0xfe,0xf4,0x03] + +v_mad_i32_i24 v5, 0.5, m0, 0.5 +// GFX13: v_mad_i32_i24 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x0a,0xd6,0xf0,0xfa,0xc0,0x03] + +v_mad_i32_i24 v5, src_scc, vcc_lo, -1 +// GFX13: v_mad_i32_i24 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x0a,0xd6,0xfd,0xd4,0x04,0x03] + +v_mad_i32_i24 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_mad_i32_i24 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x0a,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mad_co_i64_i32 v[5:6], s6, s105, s105, s[6:7] +// W32: v_mad_co_i64_i32 v[5:6], s6, s105, s105, s[6:7] ; encoding: [0x05,0x06,0xf9,0xd6,0x69,0xd2,0x18,0x00] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, ttmp15, ttmp15, s[104:105] +// W32: v_mad_co_i64_i32 v[5:6], s6, ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x06,0xf9,0xd6,0x7b,0xf6,0xa0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, m0, 0.5, ttmp[14:15] +// W32: v_mad_co_i64_i32 v[5:6], s6, m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x06,0xf9,0xd6,0x7d,0xe0,0xe9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, exec_lo, -1, exec +// W32: v_mad_co_i64_i32 v[5:6], s6, exec_lo, -1, exec ; encoding: [0x05,0x06,0xf9,0xd6,0x7e,0x82,0xf9,0x01] +// 
W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, exec_hi, null, vcc +// W32: v_mad_co_i64_i32 v[5:6], s6, exec_hi, null, vcc ; encoding: [0x05,0x06,0xf9,0xd6,0x7f,0xf8,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s105, null, exec_lo, null +// W32: v_mad_co_i64_i32 v[5:6], s105, null, exec_lo, null ; encoding: [0x05,0x69,0xf9,0xd6,0x7c,0xfc,0xf0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], vcc_lo, -1, exec_hi, -1 +// W32: v_mad_co_i64_i32 v[5:6], vcc_lo, -1, exec_hi, -1 ; encoding: [0x05,0x6a,0xf9,0xd6,0xc1,0xfe,0x04,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 +// W32: v_mad_co_i64_i32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6b,0xf9,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc +// W32: v_mad_co_i64_i32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7b,0xf9,0xd6,0xfd,0xd4,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s[12:13], s105, s105, s[6:7] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], s105, s105, s[6:7] ; encoding: [0x05,0x0c,0xf9,0xd6,0x69,0xd2,0x18,0x00] + +v_mad_co_i64_i32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x0c,0xf9,0xd6,0x7b,0xf6,0xa0,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] ; encoding: 
[0x05,0x0c,0xf9,0xd6,0x7d,0xe0,0xe9,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], exec_lo, -1, exec +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], exec_lo, -1, exec ; encoding: [0x05,0x0c,0xf9,0xd6,0x7e,0x82,0xf9,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], exec_hi, null, vcc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], exec_hi, null, vcc ; encoding: [0x05,0x0c,0xf9,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], null, exec_lo, null +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], null, exec_lo, null ; encoding: [0x05,0x0c,0xf9,0xd6,0x7c,0xfc,0xf0,0x01] + +v_mad_co_i64_i32 v[5:6], s[104:105], -1, exec_hi, -1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[104:105], -1, exec_hi, -1 ; encoding: [0x05,0x68,0xf9,0xd6,0xc1,0xfe,0x04,0x03] + +v_mad_co_i64_i32 v[5:6], vcc, 0.5, m0, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], vcc, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6a,0xf9,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_co_i64_i32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7a,0xf9,0xd6,0xfd,0xd4,0xf4,0x03] + +v_mad_co_i64_i32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp +// GFX13: v_mad_co_i64_i32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xf9,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] + +v_mad_u16 v5, v1, v2, s3 +// GFX13: v_mad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x40,0xd7,0x01,0x05,0x0e,0x00] + +v_mad_u16 v5, v255, s2, s105 +// GFX13: v_mad_u16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x40,0xd7,0xff,0x05,0xa4,0x01] + 
+v_mad_u16 v5, s1, v255, exec_hi +// GFX13: v_mad_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x40,0xd7,0x01,0xfe,0xff,0x01] + +v_mad_u16 v5, s105, s105, exec_lo +// GFX13: v_mad_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x40,0xd7,0x69,0xd2,0xf8,0x01] + +v_mad_u16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mad_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x40,0xd7,0x6a,0xf6,0x0c,0x04] + +v_mad_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_mad_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x40,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_mad_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x40,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_u16 v5, m0, 0.5, m0 +// GFX13-ASM: v_mad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x40,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_u16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x40,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_u16 v5, exec_lo, -1, vcc_hi +// GFX13: v_mad_u16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x40,0xd7,0x7e,0x82,0xad,0x01] + +v_mad_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_mad_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x40,0xd7,0x7f,0xf8,0xa8,0x01] + +v_mad_u16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_mad_u16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x40,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x40,0xd7,0xc1,0xfe,0xf4,0x03] + +v_mad_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_mad_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x40,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_mad_u16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x40,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_mad_u16 v5, src_scc, vcc_lo, 
-1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x40,0xd7,0xfd,0xd4,0x04,0x03] + +v_mad_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp +// GFX13: v_mad_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc0,0x40,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_u32_u16 v5, v1, v2, v3 +// GFX13: v_mad_u32_u16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x73,0xd7,0x01,0x05,0x0e,0x04] + +v_mad_u32_u16 v5, v255, v255, s3 +// GFX13: v_mad_u32_u16 v5, v255, v255, s3 ; encoding: [0x05,0x00,0x73,0xd7,0xff,0xff,0x0f,0x00] + +v_mad_u32_u16 v5, s1, s2, v255 +// GFX13: v_mad_u32_u16 v5, s1, s2, v255 ; encoding: [0x05,0x00,0x73,0xd7,0x01,0x04,0xfc,0x07] + +v_mad_u32_u16 v5, s105, s105, s105 +// GFX13: v_mad_u32_u16 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x73,0xd7,0x69,0xd2,0xa4,0x01] + +v_mad_u32_u16 v5, vcc_lo, ttmp15, vcc_lo +// GFX13: v_mad_u32_u16 v5, vcc_lo, ttmp15, vcc_lo ; encoding: [0x05,0x00,0x73,0xd7,0x6a,0xf6,0xa8,0x01] + +v_mad_u32_u16 v5, vcc_hi, 0xfe0b, vcc_hi +// GFX13: v_mad_u32_u16 v5, vcc_hi, 0xfe0b, vcc_hi ; encoding: [0x05,0x00,0x73,0xd7,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_u32_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_u32_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x73,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_u32_u16 v5, m0, 0.5, m0 +// GFX13-ASM: v_mad_u32_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x73,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_u32_u16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x73,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_u32_u16 v5, exec_lo, -1, exec_hi +// GFX13: v_mad_u32_u16 v5, exec_lo, -1, exec_hi ; encoding: [0x05,0x00,0x73,0xd7,0x7e,0x82,0xfd,0x01] + +v_mad_u32_u16 v5, exec_hi, null, exec_lo +// GFX13: v_mad_u32_u16 v5, exec_hi, null, exec_lo ; encoding: [0x05,0x00,0x73,0xd7,0x7f,0xf8,0xf8,0x01] + +v_mad_u32_u16 v5, null, exec_lo, null +// GFX13: v_mad_u32_u16 v5, null, exec_lo, null ; encoding: [0x05,0x00,0x73,0xd7,0x7c,0xfc,0xf0,0x01] + +v_mad_u32_u16 v5, 
-1, exec_hi, 0xaf123456 +// GFX13: v_mad_u32_u16 v5, -1, exec_hi, 0xaf123456 ; encoding: [0x05,0x00,0x73,0xd7,0xc1,0xfe,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_u32_u16 v5, 0.5, m0, -1 op_sel:[0,0,0,0] +// GFX13-ASM: v_mad_u32_u16 v5, 0.5, m0, -1 ; encoding: [0x05,0x00,0x73,0xd7,0xf0,0xfa,0x04,0x03] +// GFX13-DIS: v_mad_u32_u16 v5, 0x3800, m0, -1 ; encoding: [0x05,0x00,0x73,0xd7,0xff,0xfa,0x04,0x03,0x00,0x38,0x00,0x00] + +v_mad_u32_u16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_u32_u16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x73,0xd7,0xfd,0xd4,0xf4,0x03] + +v_mad_u32_u16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp +// GFX13: v_mad_u32_u16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp ; encoding: [0xff,0x90,0x73,0xd7,0xff,0xd6,0xc0,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_u32_u24 v5, v1, v2, s3 +// GFX13: v_mad_u32_u24 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0b,0xd6,0x01,0x05,0x0e,0x00] + +v_mad_u32_u24 v5, v255, s2, s105 +// GFX13: v_mad_u32_u24 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0b,0xd6,0xff,0x05,0xa4,0x01] + +v_mad_u32_u24 v5, s1, v255, exec_hi +// GFX13: v_mad_u32_u24 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0b,0xd6,0x01,0xfe,0xff,0x01] + +v_mad_u32_u24 v5, s105, s105, exec_lo +// GFX13: v_mad_u32_u24 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0b,0xd6,0x69,0xd2,0xf8,0x01] + +v_mad_u32_u24 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mad_u32_u24 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0b,0xd6,0x6a,0xf6,0x0c,0x04] + +v_mad_u32_u24 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_mad_u32_u24 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0b,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_mad_u32_u24 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_u32_u24 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x0b,0xd6,0x7b,0xfa,0xed,0x01] + +v_mad_u32_u24 v5, m0, 0.5, m0 +// GFX13: v_mad_u32_u24 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0b,0xd6,0x7d,0xe0,0xf5,0x01] + +v_mad_u32_u24 v5, exec_lo, -1, 
vcc_hi +// GFX13: v_mad_u32_u24 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x0b,0xd6,0x7e,0x82,0xad,0x01] + +v_mad_u32_u24 v5, exec_hi, null, vcc_lo +// GFX13: v_mad_u32_u24 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x0b,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_u32_u24 v5, null, exec_lo, 0xaf123456 +// GFX13: v_mad_u32_u24 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x0b,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_u32_u24 v5, -1, exec_hi, src_scc +// GFX13: v_mad_u32_u24 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x0b,0xd6,0xc1,0xfe,0xf4,0x03] + +v_mad_u32_u24 v5, 0.5, m0, 0.5 +// GFX13: v_mad_u32_u24 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x0b,0xd6,0xf0,0xfa,0xc0,0x03] + +v_mad_u32_u24 v5, src_scc, vcc_lo, -1 +// GFX13: v_mad_u32_u24 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x0b,0xd6,0xfd,0xd4,0x04,0x03] + +v_mad_u32_u24 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_mad_u32_u24 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x0b,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mad_co_u64_u32 v[5:6], s6, s105, s105, s[6:7] +// W32: v_mad_co_u64_u32 v[5:6], s6, s105, s105, s[6:7] ; encoding: [0x05,0x06,0xf8,0xd6,0x69,0xd2,0x18,0x00] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, ttmp15, ttmp15, s[104:105] +// W32: v_mad_co_u64_u32 v[5:6], s6, ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x06,0xf8,0xd6,0x7b,0xf6,0xa0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, m0, 0.5, ttmp[14:15] +// W32: v_mad_co_u64_u32 v[5:6], s6, m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x06,0xf8,0xd6,0x7d,0xe0,0xe9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, exec_lo, -1, exec +// W32: v_mad_co_u64_u32 v[5:6], s6, exec_lo, -1, exec ; encoding: [0x05,0x06,0xf8,0xd6,0x7e,0x82,0xf9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + 
+v_mad_co_u64_u32 v[5:6], s6, exec_hi, null, vcc +// W32: v_mad_co_u64_u32 v[5:6], s6, exec_hi, null, vcc ; encoding: [0x05,0x06,0xf8,0xd6,0x7f,0xf8,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s105, null, exec_lo, null +// W32: v_mad_co_u64_u32 v[5:6], s105, null, exec_lo, null ; encoding: [0x05,0x69,0xf8,0xd6,0x7c,0xfc,0xf0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], vcc_lo, -1, exec_hi, -1 +// W32: v_mad_co_u64_u32 v[5:6], vcc_lo, -1, exec_hi, -1 ; encoding: [0x05,0x6a,0xf8,0xd6,0xc1,0xfe,0x04,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 +// W32: v_mad_co_u64_u32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6b,0xf8,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc +// W32: v_mad_co_u64_u32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7b,0xf8,0xd6,0xfd,0xd4,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s[12:13], s105, s105, s[6:7] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], s105, s105, s[6:7] ; encoding: [0x05,0x0c,0xf8,0xd6,0x69,0xd2,0x18,0x00] + +v_mad_co_u64_u32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x0c,0xf8,0xd6,0x7b,0xf6,0xa0,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x0c,0xf8,0xd6,0x7d,0xe0,0xe9,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], 
exec_lo, -1, exec +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], exec_lo, -1, exec ; encoding: [0x05,0x0c,0xf8,0xd6,0x7e,0x82,0xf9,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], exec_hi, null, vcc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], exec_hi, null, vcc ; encoding: [0x05,0x0c,0xf8,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], null, exec_lo, null +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], null, exec_lo, null ; encoding: [0x05,0x0c,0xf8,0xd6,0x7c,0xfc,0xf0,0x01] + +v_mad_co_u64_u32 v[5:6], s[104:105], -1, exec_hi, -1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[104:105], -1, exec_hi, -1 ; encoding: [0x05,0x68,0xf8,0xd6,0xc1,0xfe,0x04,0x03] + +v_mad_co_u64_u32 v[5:6], vcc, 0.5, m0, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], vcc, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6a,0xf8,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7a,0xf8,0xd6,0xfd,0xd4,0xf4,0x03] + +v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp +// GFX13: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xf8,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] + +v_max3_num_f16 v5, v1, v2, s3 +// GFX13: v_max3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x54,0xd7,0x01,0x05,0x0e,0x00] + +v_max3_num_f16 v5, v255, s2, s105 +// GFX13: v_max3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x54,0xd7,0xff,0x05,0xa4,0x01] + +v_max3_num_f16 v5, s1, v255, exec_hi +// GFX13: v_max3_num_f16 v5, 
s1, v255, exec_hi ; encoding: [0x05,0x00,0x54,0xd7,0x01,0xfe,0xff,0x01] + +v_max3_num_f16 v5, s105, s105, exec_lo +// GFX13: v_max3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x54,0xd7,0x69,0xd2,0xf8,0x01] + +v_max3_num_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x54,0xd7,0x6a,0xf6,0x0c,0x04] + +v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x54,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x54,0xd7,0x7b,0xfa,0xed,0xe1] + +v_max3_num_f16 v5, m0, 0.5, m0 +// GFX13: v_max3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x54,0xd7,0x7d,0xe0,0xf5,0x01] + +v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x54,0xd7,0x7e,0x82,0xad,0x01] + +v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x54,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x54,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x54,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x54,0xd7,0xf0,0xfa,0xc0,0x43] + +v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x54,0xd7,0xfd,0xd4,0x04,0x23] + +v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// 
GFX13: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x54,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_max3_num_f32 v5, v1, v2, s3 +// GFX13: v_max3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00] + +v_max3_num_f32 v5, v255, s2, s105 +// GFX13: v_max3_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2a,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_num_f32 v5, s1, v255, exec_hi +// GFX13: v_max3_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_num_f32 v5, s105, s105, exec_lo +// GFX13: v_max3_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2a,0xd6,0x69,0xd2,0xf8,0x01] + +v_max3_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_max3_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_max3_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x2a,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_max3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_max3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2a,0xd6,0x7b,0xfa,0xed,0xe1] + +v_max3_num_f32 v5, m0, 0.5, m0 +// GFX13: v_max3_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_max3_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_max3_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2a,0xd6,0x7e,0x82,0xad,0x01] + +v_max3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_max3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2a,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_max3_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_max3_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x2a,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_max3_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_max3_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2a,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_max3_num_f32 
v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_max3_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x2a,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_max3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_max3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x2a,0xd6,0xfd,0xd4,0x04,0x33] + +v_max3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_max3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x2a,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_max3_i16 v5, v1, v2, s3 +// GFX13: v_max3_i16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd7,0x01,0x05,0x0e,0x00] + +v_max3_i16 v5, v255, s2, s105 +// GFX13: v_max3_i16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x55,0xd7,0xff,0x05,0xa4,0x01] + +v_max3_i16 v5, s1, v255, exec_hi +// GFX13: v_max3_i16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x55,0xd7,0x01,0xfe,0xff,0x01] + +v_max3_i16 v5, s105, s105, exec_lo +// GFX13: v_max3_i16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x55,0xd7,0x69,0xd2,0xf8,0x01] + +v_max3_i16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_i16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x55,0xd7,0x6a,0xf6,0x0c,0x04] + +v_max3_i16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_max3_i16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x55,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x55,0xd7,0x7b,0xfa,0xed,0x01] + +v_max3_i16 v5, m0, 0.5, m0 +// GFX13-ASM: v_max3_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x55,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_max3_i16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x55,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_max3_i16 v5, exec_lo, -1, vcc_hi +// GFX13: v_max3_i16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x55,0xd7,0x7e,0x82,0xad,0x01] + +v_max3_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_max3_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: 
[0x05,0x78,0x55,0xd7,0x7f,0xf8,0xa8,0x01] + +v_max3_i16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_max3_i16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x55,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_max3_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_max3_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x55,0xd7,0xc1,0xfe,0xf4,0x03] + +v_max3_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_max3_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x55,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_max3_i16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x55,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_max3_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_max3_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x55,0xd7,0xfd,0xd4,0x04,0x03] + +v_max3_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_max3_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x55,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_max3_i32 v5, v1, v2, s3 +// GFX13: v_max3_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1d,0xd6,0x01,0x05,0x0e,0x00] + +v_max3_i32 v5, v255, s2, s105 +// GFX13: v_max3_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1d,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_i32 v5, s1, v255, exec_hi +// GFX13: v_max3_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1d,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_i32 v5, s105, s105, exec_lo +// GFX13: v_max3_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1d,0xd6,0x69,0xd2,0xf8,0x01] + +v_max3_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_max3_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_max3_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_max3_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: 
[0x05,0x00,0x1d,0xd6,0x7b,0xfa,0xed,0x01] + +v_max3_i32 v5, m0, 0.5, m0 +// GFX13: v_max3_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_max3_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_max3_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1d,0xd6,0x7e,0x82,0xad,0x01] + +v_max3_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_max3_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1d,0xd6,0x7f,0xf8,0xa8,0x01] + +v_max3_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_max3_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1d,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_max3_i32 v5, -1, exec_hi, src_scc +// GFX13: v_max3_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1d,0xd6,0xc1,0xfe,0xf4,0x03] + +v_max3_i32 v5, 0.5, m0, 0.5 +// GFX13: v_max3_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1d,0xd6,0xf0,0xfa,0xc0,0x03] + +v_max3_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_max3_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1d,0xd6,0xfd,0xd4,0x04,0x03] + +v_max3_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_max3_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1d,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_max3_u16 v5, v1, v2, s3 +// GFX13: v_max3_u16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x56,0xd7,0x01,0x05,0x0e,0x00] + +v_max3_u16 v5, v255, s2, s105 +// GFX13: v_max3_u16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x56,0xd7,0xff,0x05,0xa4,0x01] + +v_max3_u16 v5, s1, v255, exec_hi +// GFX13: v_max3_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x56,0xd7,0x01,0xfe,0xff,0x01] + +v_max3_u16 v5, s105, s105, exec_lo +// GFX13: v_max3_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x56,0xd7,0x69,0xd2,0xf8,0x01] + +v_max3_u16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x56,0xd7,0x6a,0xf6,0x0c,0x04] + +v_max3_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_max3_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: 
[0x05,0x00,0x56,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x56,0xd7,0x7b,0xfa,0xed,0x01] + +v_max3_u16 v5, m0, 0.5, m0 +// GFX13-ASM: v_max3_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x56,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_max3_u16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x56,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_max3_u16 v5, exec_lo, -1, vcc_hi +// GFX13: v_max3_u16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x56,0xd7,0x7e,0x82,0xad,0x01] + +v_max3_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_max3_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x56,0xd7,0x7f,0xf8,0xa8,0x01] + +v_max3_u16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_max3_u16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x56,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_max3_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_max3_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x56,0xd7,0xc1,0xfe,0xf4,0x03] + +v_max3_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_max3_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x56,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_max3_u16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x56,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_max3_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_max3_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x56,0xd7,0xfd,0xd4,0x04,0x03] + +v_max3_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_max3_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x56,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_max3_u32 v5, v1, v2, s3 +// GFX13: v_max3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1e,0xd6,0x01,0x05,0x0e,0x00] + +v_max3_u32 v5, v255, s2, s105 +// GFX13: v_max3_u32 v5, v255, s2, s105 ; encoding: 
[0x05,0x00,0x1e,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_u32 v5, s1, v255, exec_hi +// GFX13: v_max3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1e,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_u32 v5, s105, s105, exec_lo +// GFX13: v_max3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1e,0xd6,0x69,0xd2,0xf8,0x01] + +v_max3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_max3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_max3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1e,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_max3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1e,0xd6,0x7b,0xfa,0xed,0x01] + +v_max3_u32 v5, m0, 0.5, m0 +// GFX13: v_max3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_max3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_max3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1e,0xd6,0x7e,0x82,0xad,0x01] + +v_max3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_max3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1e,0xd6,0x7f,0xf8,0xa8,0x01] + +v_max3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_max3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1e,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_max3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_max3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1e,0xd6,0xc1,0xfe,0xf4,0x03] + +v_max3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_max3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1e,0xd6,0xf0,0xfa,0xc0,0x03] + +v_max3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_max3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1e,0xd6,0xfd,0xd4,0x04,0x03] + +v_max3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_max3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1e,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_max_i16 v5, v1, v2 +// GFX13: v_max_i16 v5, v1, v2 ; encoding: 
[0x05,0x00,0x0a,0xd7,0x01,0x05,0x02,0x02] + +v_max_i16 v5, v255, v255 +// GFX13: v_max_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xff,0x03,0x02] + +v_max_i16 v5, s1, s2 +// GFX13: v_max_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0a,0xd7,0x01,0x04,0x00,0x02] + +v_max_i16 v5, s105, s105 +// GFX13: v_max_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0a,0xd7,0x69,0xd2,0x00,0x02] + +v_max_i16 v5, vcc_lo, ttmp15 +// GFX13: v_max_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0a,0xd7,0x6a,0xf6,0x00,0x02] + +v_max_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_max_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0a,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_max_i16 v5, ttmp15, src_scc +// GFX13: v_max_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0a,0xd7,0x7b,0xfa,0x01,0x02] + +v_max_i16 v5, m0, 0.5 +// GFX13-ASM: v_max_i16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x0a,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_max_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0a,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_max_i16 v5, exec_lo, -1 +// GFX13: v_max_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0a,0xd7,0x7e,0x82,0x01,0x02] + +v_max_i16 v5, exec_hi, null +// GFX13: v_max_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0a,0xd7,0x7f,0xf8,0x00,0x02] + +v_max_i16 v5, null, exec_lo +// GFX13: v_max_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0a,0xd7,0x7c,0xfc,0x00,0x02] + +v_max_i16 v5, -1, exec_hi +// GFX13: v_max_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0a,0xd7,0xc1,0xfe,0x00,0x02] + +v_max_i16 v5, 0.5, m0 +// GFX13-ASM: v_max_i16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x0a,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_max_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_max_i16 v5, src_scc, vcc_lo +// GFX13: v_max_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0a,0xd7,0xfd,0xd4,0x00,0x02] + +v_max_i16 v255, 0xfe0b, vcc_hi +// GFX13: v_max_i16 v255, 0xfe0b, vcc_hi ; encoding: 
[0xff,0x00,0x0a,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_max_u16 v5, v1, v2 +// GFX13: v_max_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x09,0xd7,0x01,0x05,0x02,0x02] + +v_max_u16 v5, v255, v255 +// GFX13: v_max_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x09,0xd7,0xff,0xff,0x03,0x02] + +v_max_u16 v5, s1, s2 +// GFX13: v_max_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x09,0xd7,0x01,0x04,0x00,0x02] + +v_max_u16 v5, s105, s105 +// GFX13: v_max_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x09,0xd7,0x69,0xd2,0x00,0x02] + +v_max_u16 v5, vcc_lo, ttmp15 +// GFX13: v_max_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x09,0xd7,0x6a,0xf6,0x00,0x02] + +v_max_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_max_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x09,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_max_u16 v5, ttmp15, src_scc +// GFX13: v_max_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x09,0xd7,0x7b,0xfa,0x01,0x02] + +v_max_u16 v5, m0, 0.5 +// GFX13-ASM: v_max_u16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x09,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_max_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x09,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_max_u16 v5, exec_lo, -1 +// GFX13: v_max_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x09,0xd7,0x7e,0x82,0x01,0x02] + +v_max_u16 v5, exec_hi, null +// GFX13: v_max_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x09,0xd7,0x7f,0xf8,0x00,0x02] + +v_max_u16 v5, null, exec_lo +// GFX13: v_max_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x09,0xd7,0x7c,0xfc,0x00,0x02] + +v_max_u16 v5, -1, exec_hi +// GFX13: v_max_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x09,0xd7,0xc1,0xfe,0x00,0x02] + +v_max_u16 v5, 0.5, m0 +// GFX13-ASM: v_max_u16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x09,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_max_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x09,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_max_u16 v5, src_scc, vcc_lo +// GFX13: v_max_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x09,0xd7,0xfd,0xd4,0x00,0x02] + 
+v_max_u16 v255, 0xfe0b, vcc_hi +// GFX13: v_max_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5, v1, v2, s3 +// GFX13: v_maxmin_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_num_f16 v5, v255, s2, s105 +// GFX13: v_maxmin_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_num_f16 v5, s1, v255, exec_hi +// GFX13: v_maxmin_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_num_f16 v5, s105, s105, exec_lo +// GFX13: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maxmin_num_f16 v5, m0, 0.5, m0 +// GFX13: v_maxmin_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| +// GFX13: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: 
[0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] + +v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f32 v5, v1, v2, s3 +// GFX13: v_maxmin_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_num_f32 v5, v255, s2, s105 +// GFX13: v_maxmin_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x69,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_num_f32 v5, s1, v255, exec_hi +// GFX13: v_maxmin_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x69,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_num_f32 v5, s105, s105, exec_lo +// GFX13: v_maxmin_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x69,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x69,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maxmin_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x69,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maxmin_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maxmin_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x69,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maxmin_num_f32 v5, m0, 0.5, m0 +// GFX13: v_maxmin_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x69,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maxmin_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x69,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: 
v_maxmin_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x69,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maxmin_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_maxmin_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x69,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_maxmin_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maxmin_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x69,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maxmin_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maxmin_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x69,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maxmin_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maxmin_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x69,0xd6,0xfd,0xd4,0x04,0x33] + +v_maxmin_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maxmin_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x69,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_maxmin_i32 v5, v1, v2, s3 +// GFX13: v_maxmin_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x64,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_i32 v5, v255, s2, s105 +// GFX13: v_maxmin_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x64,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_i32 v5, s1, v255, exec_hi +// GFX13: v_maxmin_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x64,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_i32 v5, s105, s105, exec_lo +// GFX13: v_maxmin_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x64,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x64,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maxmin_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x64,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maxmin_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_maxmin_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x64,0xd6,0x7b,0xfa,0xed,0x01] + 
+v_maxmin_i32 v5, m0, 0.5, m0 +// GFX13: v_maxmin_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x64,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_maxmin_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x64,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_maxmin_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x64,0xd6,0x7f,0xf8,0xa8,0x01] + +v_maxmin_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_maxmin_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x64,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_maxmin_i32 v5, -1, exec_hi, src_scc +// GFX13: v_maxmin_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x64,0xd6,0xc1,0xfe,0xf4,0x03] + +v_maxmin_i32 v5, 0.5, m0, 0.5 +// GFX13: v_maxmin_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x64,0xd6,0xf0,0xfa,0xc0,0x03] + +v_maxmin_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_maxmin_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x64,0xd6,0xfd,0xd4,0x04,0x03] + +v_maxmin_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_maxmin_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x64,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_maxmin_u32 v5, v1, v2, s3 +// GFX13: v_maxmin_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x62,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_u32 v5, v255, s2, s105 +// GFX13: v_maxmin_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x62,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_u32 v5, s1, v255, exec_hi +// GFX13: v_maxmin_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x62,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_u32 v5, s105, s105, exec_lo +// GFX13: v_maxmin_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x62,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x62,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maxmin_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: 
[0x05,0x00,0x62,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maxmin_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_maxmin_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x62,0xd6,0x7b,0xfa,0xed,0x01] + +v_maxmin_u32 v5, m0, 0.5, m0 +// GFX13: v_maxmin_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x62,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_maxmin_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x62,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_maxmin_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x62,0xd6,0x7f,0xf8,0xa8,0x01] + +v_maxmin_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_maxmin_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x62,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_maxmin_u32 v5, -1, exec_hi, src_scc +// GFX13: v_maxmin_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x62,0xd6,0xc1,0xfe,0xf4,0x03] + +v_maxmin_u32 v5, 0.5, m0, 0.5 +// GFX13: v_maxmin_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x62,0xd6,0xf0,0xfa,0xc0,0x03] + +v_maxmin_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_maxmin_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x62,0xd6,0xfd,0xd4,0x04,0x03] + +v_maxmin_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_maxmin_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x62,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mbcnt_hi_u32_b32 v5, v1, v2 +// GFX13: v_mbcnt_hi_u32_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x66,0xd7,0x01,0x05,0x02,0x02] + +v_mbcnt_hi_u32_b32 v5, v255, v255 +// GFX13: v_mbcnt_hi_u32_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x66,0xd7,0xff,0xff,0x03,0x02] + +v_mbcnt_hi_u32_b32 v5, s1, s2 +// GFX13: v_mbcnt_hi_u32_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x66,0xd7,0x01,0x04,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, s105, s105 +// GFX13: v_mbcnt_hi_u32_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x66,0xd7,0x69,0xd2,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, vcc_lo, ttmp15 +// GFX13: v_mbcnt_hi_u32_b32 v5, 
vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x66,0xd7,0x6a,0xf6,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mbcnt_hi_u32_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x66,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mbcnt_hi_u32_b32 v5, ttmp15, src_scc +// GFX13: v_mbcnt_hi_u32_b32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x66,0xd7,0x7b,0xfa,0x01,0x02] + +v_mbcnt_hi_u32_b32 v5, m0, 0.5 +// GFX13: v_mbcnt_hi_u32_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x66,0xd7,0x7d,0xe0,0x01,0x02] + +v_mbcnt_hi_u32_b32 v5, exec_lo, -1 +// GFX13: v_mbcnt_hi_u32_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x66,0xd7,0x7e,0x82,0x01,0x02] + +v_mbcnt_hi_u32_b32 v5, exec_hi, null +// GFX13: v_mbcnt_hi_u32_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x66,0xd7,0x7f,0xf8,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, null, exec_lo +// GFX13: v_mbcnt_hi_u32_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x66,0xd7,0x7c,0xfc,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, -1, exec_hi +// GFX13: v_mbcnt_hi_u32_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x66,0xd7,0xc1,0xfe,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, 0.5, m0 +// GFX13: v_mbcnt_hi_u32_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x66,0xd7,0xf0,0xfa,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, src_scc, vcc_lo +// GFX13: v_mbcnt_hi_u32_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x66,0xd7,0xfd,0xd4,0x00,0x02] + +v_mbcnt_hi_u32_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_mbcnt_hi_u32_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x66,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mbcnt_lo_u32_b32 v5, v1, v2 +// GFX13: v_mbcnt_lo_u32_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x65,0xd7,0x01,0x05,0x02,0x02] + +v_mbcnt_lo_u32_b32 v5, v255, v255 +// GFX13: v_mbcnt_lo_u32_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x65,0xd7,0xff,0xff,0x03,0x02] + +v_mbcnt_lo_u32_b32 v5, s1, s2 +// GFX13: v_mbcnt_lo_u32_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x65,0xd7,0x01,0x04,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, s105, s105 +// GFX13: v_mbcnt_lo_u32_b32 v5, s105, 
s105 ; encoding: [0x05,0x00,0x65,0xd7,0x69,0xd2,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, vcc_lo, ttmp15 +// GFX13: v_mbcnt_lo_u32_b32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x65,0xd7,0x6a,0xf6,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mbcnt_lo_u32_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x65,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mbcnt_lo_u32_b32 v5, ttmp15, src_scc +// GFX13: v_mbcnt_lo_u32_b32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x65,0xd7,0x7b,0xfa,0x01,0x02] + +v_mbcnt_lo_u32_b32 v5, m0, 0.5 +// GFX13: v_mbcnt_lo_u32_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x65,0xd7,0x7d,0xe0,0x01,0x02] + +v_mbcnt_lo_u32_b32 v5, exec_lo, -1 +// GFX13: v_mbcnt_lo_u32_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x65,0xd7,0x7e,0x82,0x01,0x02] + +v_mbcnt_lo_u32_b32 v5, exec_hi, null +// GFX13: v_mbcnt_lo_u32_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x65,0xd7,0x7f,0xf8,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, null, exec_lo +// GFX13: v_mbcnt_lo_u32_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x65,0xd7,0x7c,0xfc,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, -1, exec_hi +// GFX13: v_mbcnt_lo_u32_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x65,0xd7,0xc1,0xfe,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, 0.5, m0 +// GFX13: v_mbcnt_lo_u32_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x65,0xd7,0xf0,0xfa,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, src_scc, vcc_lo +// GFX13: v_mbcnt_lo_u32_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x65,0xd7,0xfd,0xd4,0x00,0x02] + +v_mbcnt_lo_u32_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_mbcnt_lo_u32_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x65,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_med3_num_f16 v5, v1, v2, s3 +// GFX13: v_med3_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x57,0xd7,0x01,0x05,0x0e,0x00] + +v_med3_num_f16 v5, v255, s2, s105 +// GFX13: v_med3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x57,0xd7,0xff,0x05,0xa4,0x01] + +v_med3_num_f16 v5, s1, v255, exec_hi +// GFX13: v_med3_num_f16 v5, 
s1, v255, exec_hi ; encoding: [0x05,0x00,0x57,0xd7,0x01,0xfe,0xff,0x01] + +v_med3_num_f16 v5, s105, s105, exec_lo +// GFX13: v_med3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x57,0xd7,0x69,0xd2,0xf8,0x01] + +v_med3_num_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x57,0xd7,0x6a,0xf6,0x0c,0x04] + +v_med3_num_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_med3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x57,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_med3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_med3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x57,0xd7,0x7b,0xfa,0xed,0xe1] + +v_med3_num_f16 v5, m0, 0.5, m0 +// GFX13: v_med3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x57,0xd7,0x7d,0xe0,0xf5,0x01] + +v_med3_num_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_med3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x57,0xd7,0x7e,0x82,0xad,0x01] + +v_med3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_med3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x57,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_med3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_med3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x57,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_med3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_med3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x57,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_med3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_med3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x57,0xd7,0xf0,0xfa,0xc0,0x43] + +v_med3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_med3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x57,0xd7,0xfd,0xd4,0x04,0x23] + +v_med3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// 
GFX13: v_med3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x57,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_med3_num_f32 v5, v1, v2, s3 +// GFX13: v_med3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x00] + +v_med3_num_f32 v5, v255, s2, s105 +// GFX13: v_med3_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x31,0xd6,0xff,0x05,0xa4,0x01] + +v_med3_num_f32 v5, s1, v255, exec_hi +// GFX13: v_med3_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x31,0xd6,0x01,0xfe,0xff,0x01] + +v_med3_num_f32 v5, s105, s105, exec_lo +// GFX13: v_med3_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x31,0xd6,0x69,0xd2,0xf8,0x01] + +v_med3_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x31,0xd6,0x6a,0xf6,0x0c,0x04] + +v_med3_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_med3_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x31,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_med3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_med3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x31,0xd6,0x7b,0xfa,0xed,0xe1] + +v_med3_num_f32 v5, m0, 0.5, m0 +// GFX13: v_med3_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x31,0xd6,0x7d,0xe0,0xf5,0x01] + +v_med3_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_med3_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x31,0xd6,0x7e,0x82,0xad,0x01] + +v_med3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_med3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x31,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_med3_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_med3_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x31,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_med3_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_med3_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x31,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_med3_num_f32 
v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_med3_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x31,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_med3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_med3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x31,0xd6,0xfd,0xd4,0x04,0x33] + +v_med3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_med3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x31,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_med3_i16 v5, v1, v2, s3 +// GFX13: v_med3_i16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x58,0xd7,0x01,0x05,0x0e,0x00] + +v_med3_i16 v5, v255, s2, s105 +// GFX13: v_med3_i16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x58,0xd7,0xff,0x05,0xa4,0x01] + +v_med3_i16 v5, s1, v255, exec_hi +// GFX13: v_med3_i16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x58,0xd7,0x01,0xfe,0xff,0x01] + +v_med3_i16 v5, s105, s105, exec_lo +// GFX13: v_med3_i16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x58,0xd7,0x69,0xd2,0xf8,0x01] + +v_med3_i16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_i16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x58,0xd7,0x6a,0xf6,0x0c,0x04] + +v_med3_i16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_med3_i16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x58,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_med3_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x58,0xd7,0x7b,0xfa,0xed,0x01] + +v_med3_i16 v5, m0, 0.5, m0 +// GFX13-ASM: v_med3_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x58,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_med3_i16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x58,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_med3_i16 v5, exec_lo, -1, vcc_hi +// GFX13: v_med3_i16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x58,0xd7,0x7e,0x82,0xad,0x01] + +v_med3_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_med3_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: 
[0x05,0x78,0x58,0xd7,0x7f,0xf8,0xa8,0x01] + +v_med3_i16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_med3_i16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x58,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_med3_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_med3_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x58,0xd7,0xc1,0xfe,0xf4,0x03] + +v_med3_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_med3_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x58,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_med3_i16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x58,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_med3_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_med3_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x58,0xd7,0xfd,0xd4,0x04,0x03] + +v_med3_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_med3_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x58,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_med3_i32 v5, v1, v2, s3 +// GFX13: v_med3_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x20,0xd6,0x01,0x05,0x0e,0x00] + +v_med3_i32 v5, v255, s2, s105 +// GFX13: v_med3_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x20,0xd6,0xff,0x05,0xa4,0x01] + +v_med3_i32 v5, s1, v255, exec_hi +// GFX13: v_med3_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x20,0xd6,0x01,0xfe,0xff,0x01] + +v_med3_i32 v5, s105, s105, exec_lo +// GFX13: v_med3_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x20,0xd6,0x69,0xd2,0xf8,0x01] + +v_med3_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x20,0xd6,0x6a,0xf6,0x0c,0x04] + +v_med3_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_med3_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x20,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_med3_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: 
[0x05,0x00,0x20,0xd6,0x7b,0xfa,0xed,0x01] + +v_med3_i32 v5, m0, 0.5, m0 +// GFX13: v_med3_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x20,0xd6,0x7d,0xe0,0xf5,0x01] + +v_med3_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_med3_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x20,0xd6,0x7e,0x82,0xad,0x01] + +v_med3_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_med3_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x20,0xd6,0x7f,0xf8,0xa8,0x01] + +v_med3_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_med3_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x20,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_med3_i32 v5, -1, exec_hi, src_scc +// GFX13: v_med3_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x20,0xd6,0xc1,0xfe,0xf4,0x03] + +v_med3_i32 v5, 0.5, m0, 0.5 +// GFX13: v_med3_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x20,0xd6,0xf0,0xfa,0xc0,0x03] + +v_med3_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_med3_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x20,0xd6,0xfd,0xd4,0x04,0x03] + +v_med3_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_med3_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x20,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_med3_u16 v5, v1, v2, s3 +// GFX13: v_med3_u16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x59,0xd7,0x01,0x05,0x0e,0x00] + +v_med3_u16 v5, v255, s2, s105 +// GFX13: v_med3_u16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x59,0xd7,0xff,0x05,0xa4,0x01] + +v_med3_u16 v5, s1, v255, exec_hi +// GFX13: v_med3_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x59,0xd7,0x01,0xfe,0xff,0x01] + +v_med3_u16 v5, s105, s105, exec_lo +// GFX13: v_med3_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x59,0xd7,0x69,0xd2,0xf8,0x01] + +v_med3_u16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x59,0xd7,0x6a,0xf6,0x0c,0x04] + +v_med3_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_med3_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: 
[0x05,0x00,0x59,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_med3_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x59,0xd7,0x7b,0xfa,0xed,0x01] + +v_med3_u16 v5, m0, 0.5, m0 +// GFX13-ASM: v_med3_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x59,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_med3_u16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x59,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_med3_u16 v5, exec_lo, -1, vcc_hi +// GFX13: v_med3_u16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x59,0xd7,0x7e,0x82,0xad,0x01] + +v_med3_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_med3_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x59,0xd7,0x7f,0xf8,0xa8,0x01] + +v_med3_u16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_med3_u16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x59,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_med3_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_med3_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x59,0xd7,0xc1,0xfe,0xf4,0x03] + +v_med3_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_med3_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x59,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_med3_u16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x59,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_med3_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_med3_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x59,0xd7,0xfd,0xd4,0x04,0x03] + +v_med3_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_med3_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x59,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_med3_u32 v5, v1, v2, s3 +// GFX13: v_med3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x21,0xd6,0x01,0x05,0x0e,0x00] + +v_med3_u32 v5, v255, s2, s105 +// GFX13: v_med3_u32 v5, v255, s2, s105 ; encoding: 
[0x05,0x00,0x21,0xd6,0xff,0x05,0xa4,0x01] + +v_med3_u32 v5, s1, v255, exec_hi +// GFX13: v_med3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x21,0xd6,0x01,0xfe,0xff,0x01] + +v_med3_u32 v5, s105, s105, exec_lo +// GFX13: v_med3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x21,0xd6,0x69,0xd2,0xf8,0x01] + +v_med3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x21,0xd6,0x6a,0xf6,0x0c,0x04] + +v_med3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_med3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x21,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_med3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x21,0xd6,0x7b,0xfa,0xed,0x01] + +v_med3_u32 v5, m0, 0.5, m0 +// GFX13: v_med3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x21,0xd6,0x7d,0xe0,0xf5,0x01] + +v_med3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_med3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x21,0xd6,0x7e,0x82,0xad,0x01] + +v_med3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_med3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x21,0xd6,0x7f,0xf8,0xa8,0x01] + +v_med3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_med3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x21,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_med3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_med3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x21,0xd6,0xc1,0xfe,0xf4,0x03] + +v_med3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_med3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x21,0xd6,0xf0,0xfa,0xc0,0x03] + +v_med3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_med3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x21,0xd6,0xfd,0xd4,0x04,0x03] + +v_med3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_min3_num_f16 v5, v1, v2, s3 +// GFX13: v_min3_num_f16 v5, v1, v2, s3 ; encoding: 
[0x05,0x00,0x51,0xd7,0x01,0x05,0x0e,0x00] + +v_min3_num_f16 v5, v255, s2, s105 +// GFX13: v_min3_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x51,0xd7,0xff,0x05,0xa4,0x01] + +v_min3_num_f16 v5, s1, v255, exec_hi +// GFX13: v_min3_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x51,0xd7,0x01,0xfe,0xff,0x01] + +v_min3_num_f16 v5, s105, s105, exec_lo +// GFX13: v_min3_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x51,0xd7,0x69,0xd2,0xf8,0x01] + +v_min3_num_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x51,0xd7,0x6a,0xf6,0x0c,0x04] + +v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x51,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x51,0xd7,0x7b,0xfa,0xed,0xe1] + +v_min3_num_f16 v5, m0, 0.5, m0 +// GFX13: v_min3_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x51,0xd7,0x7d,0xe0,0xf5,0x01] + +v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x51,0xd7,0x7e,0x82,0xad,0x01] + +v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x51,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x51,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x51,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x51,0xd7,0xf0,0xfa,0xc0,0x43] + +v_min3_num_f16 
v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x51,0xd7,0xfd,0xd4,0x04,0x23] + +v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x51,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_min3_num_f32 v5, v1, v2, s3 +// GFX13: v_min3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_num_f32 v5, v255, s2, s105 +// GFX13: v_min3_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x29,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_num_f32 v5, s1, v255, exec_hi +// GFX13: v_min3_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x29,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_num_f32 v5, s105, s105, exec_lo +// GFX13: v_min3_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x29,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x29,0xd6,0x6a,0xf6,0x0c,0x04] + +v_min3_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_min3_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x29,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_min3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_min3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x29,0xd6,0x7b,0xfa,0xed,0xe1] + +v_min3_num_f32 v5, m0, 0.5, m0 +// GFX13: v_min3_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x29,0xd6,0x7d,0xe0,0xf5,0x01] + +v_min3_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_min3_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x29,0xd6,0x7e,0x82,0xad,0x01] + +v_min3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_min3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x29,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_min3_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_min3_num_f32 v5, null, exec_lo, -|0xaf123456| ; 
encoding: [0x05,0x04,0x29,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_min3_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_min3_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x29,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_min3_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_min3_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x29,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_min3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_min3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x29,0xd6,0xfd,0xd4,0x04,0x33] + +v_min3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_min3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x29,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_min3_i16 v5, v1, v2, s3 +// GFX13: v_min3_i16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x52,0xd7,0x01,0x05,0x0e,0x00] + +v_min3_i16 v5, v255, s2, s105 +// GFX13: v_min3_i16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x52,0xd7,0xff,0x05,0xa4,0x01] + +v_min3_i16 v5, s1, v255, exec_hi +// GFX13: v_min3_i16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x52,0xd7,0x01,0xfe,0xff,0x01] + +v_min3_i16 v5, s105, s105, exec_lo +// GFX13: v_min3_i16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x52,0xd7,0x69,0xd2,0xf8,0x01] + +v_min3_i16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_i16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x52,0xd7,0x6a,0xf6,0x0c,0x04] + +v_min3_i16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_min3_i16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x52,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_min3_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x52,0xd7,0x7b,0xfa,0xed,0x01] + +v_min3_i16 v5, m0, 0.5, m0 +// GFX13-ASM: v_min3_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x52,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_min3_i16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x52,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_min3_i16 v5, exec_lo, -1, 
vcc_hi +// GFX13: v_min3_i16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x52,0xd7,0x7e,0x82,0xad,0x01] + +v_min3_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_min3_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x52,0xd7,0x7f,0xf8,0xa8,0x01] + +v_min3_i16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_min3_i16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x52,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_min3_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_min3_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x52,0xd7,0xc1,0xfe,0xf4,0x03] + +v_min3_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_min3_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x52,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_min3_i16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x52,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_min3_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_min3_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x52,0xd7,0xfd,0xd4,0x04,0x03] + +v_min3_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_min3_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x52,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_min3_i32 v5, v1, v2, s3 +// GFX13: v_min3_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1a,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_i32 v5, v255, s2, s105 +// GFX13: v_min3_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1a,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_i32 v5, s1, v255, exec_hi +// GFX13: v_min3_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1a,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_i32 v5, s105, s105, exec_lo +// GFX13: v_min3_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1a,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_min3_i32 v5, vcc_hi, 
0xaf123456, v255 +// GFX13: v_min3_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1a,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_min3_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1a,0xd6,0x7b,0xfa,0xed,0x01] + +v_min3_i32 v5, m0, 0.5, m0 +// GFX13: v_min3_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_min3_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_min3_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1a,0xd6,0x7e,0x82,0xad,0x01] + +v_min3_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_min3_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1a,0xd6,0x7f,0xf8,0xa8,0x01] + +v_min3_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_min3_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1a,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_min3_i32 v5, -1, exec_hi, src_scc +// GFX13: v_min3_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1a,0xd6,0xc1,0xfe,0xf4,0x03] + +v_min3_i32 v5, 0.5, m0, 0.5 +// GFX13: v_min3_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1a,0xd6,0xf0,0xfa,0xc0,0x03] + +v_min3_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_min3_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1a,0xd6,0xfd,0xd4,0x04,0x03] + +v_min3_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_min3_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1a,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_min3_u16 v5, v1, v2, s3 +// GFX13: v_min3_u16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x53,0xd7,0x01,0x05,0x0e,0x00] + +v_min3_u16 v5, v255, s2, s105 +// GFX13: v_min3_u16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x53,0xd7,0xff,0x05,0xa4,0x01] + +v_min3_u16 v5, s1, v255, exec_hi +// GFX13: v_min3_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x53,0xd7,0x01,0xfe,0xff,0x01] + +v_min3_u16 v5, s105, s105, exec_lo +// GFX13: v_min3_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x53,0xd7,0x69,0xd2,0xf8,0x01] + +v_min3_u16 v5, vcc_lo, ttmp15, v3 +// 
GFX13: v_min3_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x53,0xd7,0x6a,0xf6,0x0c,0x04] + +v_min3_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_min3_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x53,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_min3_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x53,0xd7,0x7b,0xfa,0xed,0x01] + +v_min3_u16 v5, m0, 0.5, m0 +// GFX13-ASM: v_min3_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x53,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_min3_u16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x53,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_min3_u16 v5, exec_lo, -1, vcc_hi +// GFX13: v_min3_u16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x53,0xd7,0x7e,0x82,0xad,0x01] + +v_min3_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_min3_u16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x53,0xd7,0x7f,0xf8,0xa8,0x01] + +v_min3_u16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_min3_u16 v5, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x53,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_min3_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_min3_u16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x53,0xd7,0xc1,0xfe,0xf4,0x03] + +v_min3_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_min3_u16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x53,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_min3_u16 v5, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x53,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_min3_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_min3_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x53,0xd7,0xfd,0xd4,0x04,0x03] + +v_min3_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_min3_u16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x53,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_min3_u32 
v5, v1, v2, s3 +// GFX13: v_min3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1b,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_u32 v5, v255, s2, s105 +// GFX13: v_min3_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1b,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_u32 v5, s1, v255, exec_hi +// GFX13: v_min3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1b,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_u32 v5, s105, s105, exec_lo +// GFX13: v_min3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1b,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1b,0xd6,0x6a,0xf6,0x0c,0x04] + +v_min3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_min3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1b,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_min3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1b,0xd6,0x7b,0xfa,0xed,0x01] + +v_min3_u32 v5, m0, 0.5, m0 +// GFX13: v_min3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1b,0xd6,0x7d,0xe0,0xf5,0x01] + +v_min3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_min3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1b,0xd6,0x7e,0x82,0xad,0x01] + +v_min3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_min3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1b,0xd6,0x7f,0xf8,0xa8,0x01] + +v_min3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_min3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1b,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_min3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_min3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1b,0xd6,0xc1,0xfe,0xf4,0x03] + +v_min3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_min3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1b,0xd6,0xf0,0xfa,0xc0,0x03] + +v_min3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_min3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1b,0xd6,0xfd,0xd4,0x04,0x03] + +v_min3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_min3_u32 
v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1b,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_min_i16 v5, v1, v2 +// GFX13: v_min_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0c,0xd7,0x01,0x05,0x02,0x02] + +v_min_i16 v5, v255, v255 +// GFX13: v_min_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0c,0xd7,0xff,0xff,0x03,0x02] + +v_min_i16 v5, s1, s2 +// GFX13: v_min_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0c,0xd7,0x01,0x04,0x00,0x02] + +v_min_i16 v5, s105, s105 +// GFX13: v_min_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0c,0xd7,0x69,0xd2,0x00,0x02] + +v_min_i16 v5, vcc_lo, ttmp15 +// GFX13: v_min_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0c,0xd7,0x6a,0xf6,0x00,0x02] + +v_min_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_min_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0c,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_min_i16 v5, ttmp15, src_scc +// GFX13: v_min_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0c,0xd7,0x7b,0xfa,0x01,0x02] + +v_min_i16 v5, m0, 0.5 +// GFX13-ASM: v_min_i16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x0c,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_min_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0c,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_min_i16 v5, exec_lo, -1 +// GFX13: v_min_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0c,0xd7,0x7e,0x82,0x01,0x02] + +v_min_i16 v5, exec_hi, null +// GFX13: v_min_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0c,0xd7,0x7f,0xf8,0x00,0x02] + +v_min_i16 v5, null, exec_lo +// GFX13: v_min_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0c,0xd7,0x7c,0xfc,0x00,0x02] + +v_min_i16 v5, -1, exec_hi +// GFX13: v_min_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0c,0xd7,0xc1,0xfe,0x00,0x02] + +v_min_i16 v5, 0.5, m0 +// GFX13-ASM: v_min_i16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x0c,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_min_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0c,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_min_i16 v5, src_scc, vcc_lo +// GFX13: v_min_i16 v5, src_scc, vcc_lo ; encoding: 
[0x05,0x00,0x0c,0xd7,0xfd,0xd4,0x00,0x02] + +v_min_i16 v255, 0xfe0b, vcc_hi +// GFX13: v_min_i16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x0c,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_min_u16 v5, v1, v2 +// GFX13: v_min_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0x05,0x02,0x02] + +v_min_u16 v5, v255, v255 +// GFX13: v_min_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x0b,0xd7,0xff,0xff,0x03,0x02] + +v_min_u16 v5, s1, s2 +// GFX13: v_min_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0x04,0x00,0x02] + +v_min_u16 v5, s105, s105 +// GFX13: v_min_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x0b,0xd7,0x69,0xd2,0x00,0x02] + +v_min_u16 v5, vcc_lo, ttmp15 +// GFX13: v_min_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0b,0xd7,0x6a,0xf6,0x00,0x02] + +v_min_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_min_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0b,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_min_u16 v5, ttmp15, src_scc +// GFX13: v_min_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0b,0xd7,0x7b,0xfa,0x01,0x02] + +v_min_u16 v5, m0, 0.5 +// GFX13-ASM: v_min_u16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x0b,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_min_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0b,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_min_u16 v5, exec_lo, -1 +// GFX13: v_min_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0b,0xd7,0x7e,0x82,0x01,0x02] + +v_min_u16 v5, exec_hi, null +// GFX13: v_min_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0b,0xd7,0x7f,0xf8,0x00,0x02] + +v_min_u16 v5, null, exec_lo +// GFX13: v_min_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0b,0xd7,0x7c,0xfc,0x00,0x02] + +v_min_u16 v5, -1, exec_hi +// GFX13: v_min_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0b,0xd7,0xc1,0xfe,0x00,0x02] + +v_min_u16 v5, 0.5, m0 +// GFX13-ASM: v_min_u16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x0b,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_min_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0b,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + 
+v_min_u16 v5, src_scc, vcc_lo +// GFX13: v_min_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0b,0xd7,0xfd,0xd4,0x00,0x02] + +v_min_u16 v255, 0xfe0b, vcc_hi +// GFX13: v_min_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f16 v5, v1, v2, s3 +// GFX13: v_minmax_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_num_f16 v5, v255, s2, s105 +// GFX13: v_minmax_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_num_f16 v5, s1, v255, exec_hi +// GFX13: v_minmax_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_num_f16 v5, s105, s105, exec_lo +// GFX13: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minmax_num_f16 v5, m0, 0.5, m0 +// GFX13: v_minmax_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| +// GFX13: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f16 v5, 
-1, -|exec_hi|, -|src_scc| +// GFX13: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] + +v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f32 v5, v1, v2, s3 +// GFX13: v_minmax_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_num_f32 v5, v255, s2, s105 +// GFX13: v_minmax_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x68,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_num_f32 v5, s1, v255, exec_hi +// GFX13: v_minmax_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x68,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_num_f32 v5, s105, s105, exec_lo +// GFX13: v_minmax_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x68,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x68,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minmax_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x68,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minmax_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minmax_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x68,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minmax_num_f32 v5, m0, 0.5, m0 +// GFX13: v_minmax_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x68,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minmax_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: 
[0x05,0x01,0x68,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minmax_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x68,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minmax_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_minmax_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x68,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_minmax_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minmax_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x68,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minmax_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minmax_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x68,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minmax_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minmax_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x68,0xd6,0xfd,0xd4,0x04,0x33] + +v_minmax_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minmax_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x68,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_minmax_i32 v5, v1, v2, s3 +// GFX13: v_minmax_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x65,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_i32 v5, v255, s2, s105 +// GFX13: v_minmax_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x65,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_i32 v5, s1, v255, exec_hi +// GFX13: v_minmax_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x65,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_i32 v5, s105, s105, exec_lo +// GFX13: v_minmax_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x65,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x65,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minmax_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x65,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minmax_i32 v5, ttmp15, src_scc, ttmp15 +// 
GFX13: v_minmax_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x65,0xd6,0x7b,0xfa,0xed,0x01] + +v_minmax_i32 v5, m0, 0.5, m0 +// GFX13: v_minmax_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x65,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_minmax_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x65,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_minmax_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x65,0xd6,0x7f,0xf8,0xa8,0x01] + +v_minmax_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_minmax_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x65,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_minmax_i32 v5, -1, exec_hi, src_scc +// GFX13: v_minmax_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x65,0xd6,0xc1,0xfe,0xf4,0x03] + +v_minmax_i32 v5, 0.5, m0, 0.5 +// GFX13: v_minmax_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x65,0xd6,0xf0,0xfa,0xc0,0x03] + +v_minmax_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_minmax_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x65,0xd6,0xfd,0xd4,0x04,0x03] + +v_minmax_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_minmax_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x65,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_minmax_u32 v5, v1, v2, s3 +// GFX13: v_minmax_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x63,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_u32 v5, v255, s2, s105 +// GFX13: v_minmax_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x63,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_u32 v5, s1, v255, exec_hi +// GFX13: v_minmax_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x63,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_u32 v5, s105, s105, exec_lo +// GFX13: v_minmax_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x63,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x63,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_u32 v5, vcc_hi, 0xaf123456, 
v255 +// GFX13: v_minmax_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x63,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minmax_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_minmax_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x63,0xd6,0x7b,0xfa,0xed,0x01] + +v_minmax_u32 v5, m0, 0.5, m0 +// GFX13: v_minmax_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x63,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_minmax_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x63,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_minmax_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x63,0xd6,0x7f,0xf8,0xa8,0x01] + +v_minmax_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_minmax_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x63,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_minmax_u32 v5, -1, exec_hi, src_scc +// GFX13: v_minmax_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x63,0xd6,0xc1,0xfe,0xf4,0x03] + +v_minmax_u32 v5, 0.5, m0, 0.5 +// GFX13: v_minmax_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x63,0xd6,0xf0,0xfa,0xc0,0x03] + +v_minmax_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_minmax_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x63,0xd6,0xfd,0xd4,0x04,0x03] + +v_minmax_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_minmax_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x63,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0x05,0xea,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0xff,0xeb,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0x05,0xe8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], 
v[1:2], s105, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0xd3,0xe8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] ; encoding: [0x05,0x00,0x3b,0xd6,0xfe,0xf7,0x18,0x00] + +v_mqsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] ; encoding: [0x05,0x00,0x3b,0xd6,0x02,0xd6,0x0c,0x04] + +v_mqsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] ; encoding: [0x05,0x00,0x3b,0xd6,0x68,0xd4,0xa0,0x01] + +v_mqsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] ; encoding: [0x05,0x00,0x3b,0xd6,0x6a,0xfa,0xf8,0x07] + +v_mqsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null +// GFX13: v_mqsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null ; encoding: [0x05,0x00,0x3b,0xd6,0x7a,0xfe,0xf0,0x01] + +v_mqsad_pk_u16_u8 v[5:6], exec, exec_lo, exec +// GFX13: v_mqsad_pk_u16_u8 v[5:6], exec, exec_lo, exec ; encoding: [0x05,0x00,0x3b,0xd6,0x7e,0xfc,0xf8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], null, null, vcc +// GFX13: v_mqsad_pk_u16_u8 v[5:6], null, null, vcc ; encoding: [0x05,0x00,0x3b,0xd6,0x7c,0xf8,0xa8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_mqsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x3b,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_mqsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc +// GFX13: v_mqsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc ; encoding: [0x05,0x00,0x3b,0xd6,0xf0,0xe0,0xf5,0x03] + +v_mqsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 +// GFX13: v_mqsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 ; encoding: [0x05,0x00,0x3b,0xd6,0xfd,0xfa,0xc1,0x03] + +v_mqsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp +// GFX13: v_mqsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp ; encoding: 
[0xfe,0x80,0x3b,0xd6,0xff,0xfe,0x05,0x03,0x56,0x34,0x12,0xaf] + +v_mqsad_u32_u8 v[5:8], v[1:2], v2, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], v2, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0x05,0xf2,0x07] + +v_mqsad_u32_u8 v[5:8], v[1:2], v255, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], v255, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0xff,0xf3,0x07] + +v_mqsad_u32_u8 v[5:8], v[1:2], s2, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], s2, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0x05,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], v[1:2], s105, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], s105, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0xd3,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], v[254:255], ttmp15, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[254:255], ttmp15, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xfe,0xf7,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], s[2:3], vcc_hi, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], s[2:3], vcc_hi, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x02,0xd6,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], s[104:105], vcc_lo, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], s[104:105], vcc_lo, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x68,0xd4,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], vcc, m0, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], vcc, m0, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x6a,0xfa,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], ttmp[14:15], exec_hi, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], ttmp[14:15], exec_hi, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x7a,0xfe,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], exec, exec_lo, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], exec, exec_lo, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x7e,0xfc,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], null, null, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], null, null, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x7c,0xf8,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], -1, -1, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], -1, -1, v[252:255] ; encoding: 
[0x05,0x00,0x3d,0xd6,0xc1,0x82,0xf1,0x07] + +v_mqsad_u32_u8 v[5:8], 0.5, 0.5, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], 0.5, 0.5, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xf0,0xe0,0xf1,0x07] + +v_mqsad_u32_u8 v[5:8], src_scc, src_scc, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], src_scc, src_scc, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xfd,0xfa,0xf1,0x07] + +v_mqsad_u32_u8 v[252:255], 0xaf123456, 0xaf123456, v[3:6] clamp +// GFX13: v_mqsad_u32_u8 v[252:255], 0xaf123456, 0xaf123456, v[3:6] clamp ; encoding: [0xfc,0x80,0x3d,0xd6,0xff,0xfe,0x0d,0x04,0x56,0x34,0x12,0xaf] + +v_msad_u8 v5, v1, v2, s3 +// GFX13: v_msad_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x39,0xd6,0x01,0x05,0x0e,0x00] + +v_msad_u8 v5, v255, s2, s105 +// GFX13: v_msad_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x39,0xd6,0xff,0x05,0xa4,0x01] + +v_msad_u8 v5, s1, v255, exec_hi +// GFX13: v_msad_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x39,0xd6,0x01,0xfe,0xff,0x01] + +v_msad_u8 v5, s105, s105, exec_lo +// GFX13: v_msad_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x39,0xd6,0x69,0xd2,0xf8,0x01] + +v_msad_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_msad_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x39,0xd6,0x6a,0xf6,0x0c,0x04] + +v_msad_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_msad_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x39,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_msad_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_msad_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x39,0xd6,0x7b,0xfa,0xed,0x01] + +v_msad_u8 v5, m0, 0.5, m0 +// GFX13: v_msad_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x39,0xd6,0x7d,0xe0,0xf5,0x01] + +v_msad_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_msad_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x39,0xd6,0x7e,0x82,0xad,0x01] + +v_msad_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_msad_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x39,0xd6,0x7f,0xf8,0xa8,0x01] + +v_msad_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: 
v_msad_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x39,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_msad_u8 v5, -1, exec_hi, src_scc +// GFX13: v_msad_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x39,0xd6,0xc1,0xfe,0xf4,0x03] + +v_msad_u8 v5, 0.5, m0, 0.5 +// GFX13: v_msad_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x39,0xd6,0xf0,0xfa,0xc0,0x03] + +v_msad_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_msad_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x39,0xd6,0xfd,0xd4,0x04,0x03] + +v_msad_u8 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_msad_u8 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x39,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mul_hi_i32 v5, v1, v2 +// GFX13: v_mul_hi_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x2e,0xd7,0x01,0x05,0x02,0x02] + +v_mul_hi_i32 v5, v255, v255 +// GFX13: v_mul_hi_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x2e,0xd7,0xff,0xff,0x03,0x02] + +v_mul_hi_i32 v5, s1, s2 +// GFX13: v_mul_hi_i32 v5, s1, s2 ; encoding: [0x05,0x00,0x2e,0xd7,0x01,0x04,0x00,0x02] + +v_mul_hi_i32 v5, s105, s105 +// GFX13: v_mul_hi_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x2e,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_hi_i32 v5, vcc_lo, ttmp15 +// GFX13: v_mul_hi_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x2e,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_hi_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mul_hi_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x2e,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mul_hi_i32 v5, ttmp15, src_scc +// GFX13: v_mul_hi_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x2e,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_hi_i32 v5, m0, 0.5 +// GFX13: v_mul_hi_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x2e,0xd7,0x7d,0xe0,0x01,0x02] + +v_mul_hi_i32 v5, exec_lo, -1 +// GFX13: v_mul_hi_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x2e,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_hi_i32 v5, exec_hi, null +// GFX13: v_mul_hi_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x2e,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_hi_i32 v5, null, 
exec_lo +// GFX13: v_mul_hi_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x2e,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_hi_i32 v5, -1, exec_hi +// GFX13: v_mul_hi_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x2e,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_hi_i32 v5, 0.5, m0 +// GFX13: v_mul_hi_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x2e,0xd7,0xf0,0xfa,0x00,0x02] + +v_mul_hi_i32 v5, src_scc, vcc_lo +// GFX13: v_mul_hi_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x2e,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_hi_i32 v255, 0xaf123456, vcc_hi +// GFX13: v_mul_hi_i32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x2e,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mul_hi_u32 v5, v1, v2 +// GFX13: v_mul_hi_u32 v5, v1, v2 ; encoding: [0x05,0x00,0x2d,0xd7,0x01,0x05,0x02,0x02] + +v_mul_hi_u32 v5, v255, v255 +// GFX13: v_mul_hi_u32 v5, v255, v255 ; encoding: [0x05,0x00,0x2d,0xd7,0xff,0xff,0x03,0x02] + +v_mul_hi_u32 v5, s1, s2 +// GFX13: v_mul_hi_u32 v5, s1, s2 ; encoding: [0x05,0x00,0x2d,0xd7,0x01,0x04,0x00,0x02] + +v_mul_hi_u32 v5, s105, s105 +// GFX13: v_mul_hi_u32 v5, s105, s105 ; encoding: [0x05,0x00,0x2d,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_hi_u32 v5, vcc_lo, ttmp15 +// GFX13: v_mul_hi_u32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x2d,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_hi_u32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mul_hi_u32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x2d,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mul_hi_u32 v5, ttmp15, src_scc +// GFX13: v_mul_hi_u32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x2d,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_hi_u32 v5, m0, 0.5 +// GFX13: v_mul_hi_u32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x2d,0xd7,0x7d,0xe0,0x01,0x02] + +v_mul_hi_u32 v5, exec_lo, -1 +// GFX13: v_mul_hi_u32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x2d,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_hi_u32 v5, exec_hi, null +// GFX13: v_mul_hi_u32 v5, exec_hi, null ; encoding: [0x05,0x00,0x2d,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_hi_u32 v5, null, exec_lo +// GFX13: v_mul_hi_u32 v5, null, 
exec_lo ; encoding: [0x05,0x00,0x2d,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_hi_u32 v5, -1, exec_hi +// GFX13: v_mul_hi_u32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x2d,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_hi_u32 v5, 0.5, m0 +// GFX13: v_mul_hi_u32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x2d,0xd7,0xf0,0xfa,0x00,0x02] + +v_mul_hi_u32 v5, src_scc, vcc_lo +// GFX13: v_mul_hi_u32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x2d,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_hi_u32 v255, 0xaf123456, vcc_hi +// GFX13: v_mul_hi_u32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x2d,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mul_lo_u16 v5, v1, v2 +// GFX13: v_mul_lo_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x02] + +v_mul_lo_u16 v5, v255, v255 +// GFX13: v_mul_lo_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x05,0xd7,0xff,0xff,0x03,0x02] + +v_mul_lo_u16 v5, s1, s2 +// GFX13: v_mul_lo_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x05,0xd7,0x01,0x04,0x00,0x02] + +v_mul_lo_u16 v5, s105, s105 +// GFX13: v_mul_lo_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x05,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_lo_u16 v5, vcc_lo, ttmp15 +// GFX13: v_mul_lo_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x05,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_lo_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_mul_lo_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x05,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_mul_lo_u16 v5, ttmp15, src_scc +// GFX13: v_mul_lo_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x05,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_lo_u16 v5, m0, 0.5 +// GFX13-ASM: v_mul_lo_u16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x05,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_mul_lo_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x05,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_mul_lo_u16 v5, exec_lo, -1 +// GFX13: v_mul_lo_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x05,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_lo_u16 v5, exec_hi, null +// GFX13: v_mul_lo_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x05,0xd7,0x7f,0xf8,0x00,0x02] + 
+v_mul_lo_u16 v5, null, exec_lo +// GFX13: v_mul_lo_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x05,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_lo_u16 v5, -1, exec_hi +// GFX13: v_mul_lo_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x05,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_lo_u16 v5, 0.5, m0 +// GFX13-ASM: v_mul_lo_u16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x05,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_mul_lo_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x05,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_mul_lo_u16 v5, src_scc, vcc_lo +// GFX13: v_mul_lo_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x05,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_lo_u16 v255, 0xfe0b, vcc_hi +// GFX13: v_mul_lo_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x05,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_mul_lo_u32 v5, v1, v2 +// GFX13: v_mul_lo_u32 v5, v1, v2 ; encoding: [0x05,0x00,0x2c,0xd7,0x01,0x05,0x02,0x02] + +v_mul_lo_u32 v5, v255, v255 +// GFX13: v_mul_lo_u32 v5, v255, v255 ; encoding: [0x05,0x00,0x2c,0xd7,0xff,0xff,0x03,0x02] + +v_mul_lo_u32 v5, s1, s2 +// GFX13: v_mul_lo_u32 v5, s1, s2 ; encoding: [0x05,0x00,0x2c,0xd7,0x01,0x04,0x00,0x02] + +v_mul_lo_u32 v5, s105, s105 +// GFX13: v_mul_lo_u32 v5, s105, s105 ; encoding: [0x05,0x00,0x2c,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_lo_u32 v5, vcc_lo, ttmp15 +// GFX13: v_mul_lo_u32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x2c,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_lo_u32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mul_lo_u32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x2c,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mul_lo_u32 v5, ttmp15, src_scc +// GFX13: v_mul_lo_u32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x2c,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_lo_u32 v5, m0, 0.5 +// GFX13: v_mul_lo_u32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x2c,0xd7,0x7d,0xe0,0x01,0x02] + +v_mul_lo_u32 v5, exec_lo, -1 +// GFX13: v_mul_lo_u32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x2c,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_lo_u32 v5, exec_hi, null +// GFX13: v_mul_lo_u32 v5, 
exec_hi, null ; encoding: [0x05,0x00,0x2c,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_lo_u32 v5, null, exec_lo +// GFX13: v_mul_lo_u32 v5, null, exec_lo ; encoding: [0x05,0x00,0x2c,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_lo_u32 v5, -1, exec_hi +// GFX13: v_mul_lo_u32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x2c,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_lo_u32 v5, 0.5, m0 +// GFX13: v_mul_lo_u32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd7,0xf0,0xfa,0x00,0x02] + +v_mul_lo_u32 v5, src_scc, vcc_lo +// GFX13: v_mul_lo_u32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x2c,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_lo_u32 v255, 0xaf123456, vcc_hi +// GFX13: v_mul_lo_u32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x2c,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mullit_f32 v5, v1, v2, s3 +// GFX13: v_mullit_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x18,0xd6,0x01,0x05,0x0e,0x00] + +v_mullit_f32 v5, v255, s2, s105 +// GFX13: v_mullit_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x18,0xd6,0xff,0x05,0xa4,0x01] + +v_mullit_f32 v5, s1, v255, exec_hi +// GFX13: v_mullit_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x18,0xd6,0x01,0xfe,0xff,0x01] + +v_mullit_f32 v5, s105, s105, exec_lo +// GFX13: v_mullit_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x18,0xd6,0x69,0xd2,0xf8,0x01] + +v_mullit_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mullit_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x18,0xd6,0x6a,0xf6,0x0c,0x04] + +v_mullit_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_mullit_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x18,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_mullit_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_mullit_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x18,0xd6,0x7b,0xfa,0xed,0xe1] + +v_mullit_f32 v5, m0, 0.5, m0 +// GFX13: v_mullit_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x18,0xd6,0x7d,0xe0,0xf5,0x01] + +v_mullit_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_mullit_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: 
[0x05,0x01,0x18,0xd6,0x7e,0x82,0xad,0x01] + +v_mullit_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_mullit_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x18,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_mullit_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_mullit_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x18,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_mullit_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_mullit_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x18,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_mullit_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_mullit_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x18,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_mullit_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_mullit_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x18,0xd6,0xfd,0xd4,0x04,0x33] + +v_mullit_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_mullit_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x18,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_or3_b32 v5, v1, v2, s3 +// GFX13: v_or3_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x72,0xd7,0x01,0x05,0x0e,0x00] + +v_or3_b32 v5, v255, s2, s105 +// GFX13: v_or3_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x72,0xd7,0xff,0x05,0xa4,0x01] + +v_or3_b32 v5, s1, v255, exec_hi +// GFX13: v_or3_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x72,0xd7,0x01,0xfe,0xff,0x01] + +v_or3_b32 v5, s105, s105, exec_lo +// GFX13: v_or3_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x72,0xd7,0x69,0xd2,0xf8,0x01] + +v_or3_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_or3_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x72,0xd7,0x6a,0xf6,0x0c,0x04] + +v_or3_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_or3_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x72,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_or3_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_or3_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: 
[0x05,0x00,0x72,0xd7,0x7b,0xfa,0xed,0x01] + +v_or3_b32 v5, m0, 0.5, m0 +// GFX13: v_or3_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x72,0xd7,0x7d,0xe0,0xf5,0x01] + +v_or3_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_or3_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x72,0xd7,0x7e,0x82,0xad,0x01] + +v_or3_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_or3_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x72,0xd7,0x7f,0xf8,0xa8,0x01] + +v_or3_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_or3_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x72,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_or3_b32 v5, -1, exec_hi, src_scc +// GFX13: v_or3_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x72,0xd7,0xc1,0xfe,0xf4,0x03] + +v_or3_b32 v5, 0.5, m0, 0.5 +// GFX13: v_or3_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x72,0xd7,0xf0,0xfa,0xc0,0x03] + +v_or3_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_or3_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x72,0xd7,0xfd,0xd4,0x04,0x03] + +v_or3_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_or3_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x72,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_or_b16 v5, v1, v2 +// GFX13: v_or_b16 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x02] + +v_or_b16 v5, v255, v255 +// GFX13: v_or_b16 v5, v255, v255 ; encoding: [0x05,0x00,0x25,0xd7,0xff,0xff,0x03,0x02] + +v_or_b16 v5, s1, s2 +// GFX13: v_or_b16 v5, s1, s2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x04,0x00,0x02] + +v_or_b16 v5, s105, s105 +// GFX13: v_or_b16 v5, s105, s105 ; encoding: [0x05,0x00,0x25,0xd7,0x69,0xd2,0x00,0x02] + +v_or_b16 v5, vcc_lo, ttmp15 +// GFX13: v_or_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x25,0xd7,0x6a,0xf6,0x00,0x02] + +v_or_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_or_b16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x25,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_or_b16 v5, ttmp15, src_scc +// GFX13: v_or_b16 v5, ttmp15, src_scc ; encoding: 
[0x05,0x00,0x25,0xd7,0x7b,0xfa,0x01,0x02] + +v_or_b16 v5, m0, 0.5 +// GFX13-ASM: v_or_b16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x25,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_or_b16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x25,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_or_b16 v5, exec_lo, -1 +// GFX13: v_or_b16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x25,0xd7,0x7e,0x82,0x01,0x02] + +v_or_b16 v5, exec_hi, null +// GFX13: v_or_b16 v5, exec_hi, null ; encoding: [0x05,0x00,0x25,0xd7,0x7f,0xf8,0x00,0x02] + +v_or_b16 v5, null, exec_lo +// GFX13: v_or_b16 v5, null, exec_lo ; encoding: [0x05,0x00,0x25,0xd7,0x7c,0xfc,0x00,0x02] + +v_or_b16 v5, -1, exec_hi +// GFX13: v_or_b16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x25,0xd7,0xc1,0xfe,0x00,0x02] + +v_or_b16 v5, 0.5, m0 +// GFX13-ASM: v_or_b16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x25,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_or_b16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x25,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_or_b16 v5, src_scc, vcc_lo +// GFX13: v_or_b16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x25,0xd7,0xfd,0xd4,0x00,0x02] + +v_or_b16 v255, 0xfe0b, vcc_hi +// GFX13: v_or_b16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x25,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_pack_b32_f16 v5, v1, v2 +// GFX13: v_pack_b32_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x05,0x02,0x02] + +v_pack_b32_f16 v5, v255, v255 +// GFX13: v_pack_b32_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x11,0xd7,0xff,0xff,0x03,0x02] + +v_pack_b32_f16 v5, s1, s2 +// GFX13: v_pack_b32_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x04,0x00,0x02] + +v_pack_b32_f16 v5, s105, s105 +// GFX13: v_pack_b32_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x11,0xd7,0x69,0xd2,0x00,0x02] + +v_pack_b32_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pack_b32_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x11,0xd7,0x6a,0xf6,0x00,0x02] + +v_pack_b32_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_pack_b32_f16 v5, vcc_hi, 0xfe0b ; encoding: 
[0x05,0x00,0x11,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_pack_b32_f16 v5, ttmp15, src_scc +// GFX13: v_pack_b32_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x11,0xd7,0x7b,0xfa,0x01,0x02] + +v_pack_b32_f16 v5, m0, 0.5 +// GFX13: v_pack_b32_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x11,0xd7,0x7d,0xe0,0x01,0x02] + +v_pack_b32_f16 v5, exec_lo, -1 +// GFX13: v_pack_b32_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x11,0xd7,0x7e,0x82,0x01,0x02] + +v_pack_b32_f16 v5, |exec_hi|, null +// GFX13: v_pack_b32_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x11,0xd7,0x7f,0xf8,0x00,0x02] + +v_pack_b32_f16 v5, null, exec_lo +// GFX13: v_pack_b32_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x11,0xd7,0x7c,0xfc,0x00,0x02] + +v_pack_b32_f16 v5, -1, exec_hi +// GFX13: v_pack_b32_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x11,0xd7,0xc1,0xfe,0x00,0x02] + +v_pack_b32_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_pack_b32_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x11,0xd7,0xf0,0xfa,0x00,0x42] + +v_pack_b32_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_pack_b32_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x11,0xd7,0xfd,0xd4,0x00,0x22] + +v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x11,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_perm_b32 v5, v1, v2, s3 +// GFX13: v_perm_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x44,0xd7,0x01,0x05,0x0e,0x00] + +v_perm_b32 v5, v255, s2, s105 +// GFX13: v_perm_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x44,0xd7,0xff,0x05,0xa4,0x01] + +v_perm_b32 v5, s1, v255, exec_hi +// GFX13: v_perm_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x44,0xd7,0x01,0xfe,0xff,0x01] + +v_perm_b32 v5, s105, s105, exec_lo +// GFX13: v_perm_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x44,0xd7,0x69,0xd2,0xf8,0x01] + +v_perm_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_perm_b32 v5, vcc_lo, ttmp15, v3 ; encoding: 
[0x05,0x00,0x44,0xd7,0x6a,0xf6,0x0c,0x04] + +v_perm_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_perm_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x44,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_perm_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_perm_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x44,0xd7,0x7b,0xfa,0xed,0x01] + +v_perm_b32 v5, m0, 0.5, m0 +// GFX13: v_perm_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x44,0xd7,0x7d,0xe0,0xf5,0x01] + +v_perm_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_perm_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x44,0xd7,0x7e,0x82,0xad,0x01] + +v_perm_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_perm_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x44,0xd7,0x7f,0xf8,0xa8,0x01] + +v_perm_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_perm_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x44,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_perm_b32 v5, -1, exec_hi, src_scc +// GFX13: v_perm_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x44,0xd7,0xc1,0xfe,0xf4,0x03] + +v_perm_b32 v5, 0.5, m0, 0.5 +// GFX13: v_perm_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x44,0xd7,0xf0,0xfa,0xc0,0x03] + +v_perm_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_perm_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x44,0xd7,0xfd,0xd4,0x04,0x03] + +v_perm_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_perm_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x44,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] +// GFX13: v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0a,0x1a,0x04] + +v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] ; encoding: [0x02,0x00,0x48,0xd7,0x04,0xe3,0x18,0x00] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 100 +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, v5, 0x64 ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0b,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b4_u4 
v[2:3], v4, v5, 4 +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0b,0x12,0x02] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0b,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] +// GFX13: v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x49,0xd7,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 100 +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 0x64 ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x09,0x12,0x02] + +v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x11,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] +// GFX13: v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 100 +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 0x64 ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x09,0x12,0x02] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x11,0x1a,0x04] + +v_permlane16_b32 v5, v1, s2, s3 +// GFX13: v_permlane16_b32 v5, v1, s2, s3 ; 
encoding: [0x05,0x00,0x77,0xd7,0x01,0x05,0x0c,0x00] + +v_permlane16_b32 v5, v1, s105, s105 +// GFX13: v_permlane16_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xd3,0xa4,0x01] + +v_permlane16_b32 v5, v1, ttmp15, ttmp15 +// GFX13: v_permlane16_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xf7,0xec,0x01] + +v_permlane16_b32 v5, v1, vcc_hi, exec_lo +// GFX13: v_permlane16_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xd7,0xf8,0x01] + +v_permlane16_b32 v5, v1, vcc_lo, m0 +// GFX13: v_permlane16_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xd5,0xf4,0x01] + +v_permlane16_b32 v5, v1, m0, vcc_hi +// GFX13: v_permlane16_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xfb,0xac,0x01] + +v_permlane16_b32 v5, v1, exec_hi, vcc_lo +// GFX13: v_permlane16_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xff,0xa8,0x01] + +v_permlane16_b32 v5, v1, exec_lo, src_scc +// GFX13: v_permlane16_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xfd,0xf4,0x03] + +v_permlane16_b32 v5, v1, null, 0.5 op_sel:[1,1] +// GFX13: v_permlane16_b32 v5, v1, null, 0.5 op_sel:[1,1] ; encoding: [0x05,0x18,0x77,0xd7,0x01,0xf9,0xc0,0x03] + +v_permlane16_b32 v5, v1, -1, -1 op_sel:[0,0] +// GFX13: v_permlane16_b32 v5, v1, -1, -1 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0x83,0x05,0x03] + +v_permlane16_b32 v5, v1, 0.5, null op_sel:[1,0] +// GFX13: v_permlane16_b32 v5, v1, 0.5, null op_sel:[1,0] ; encoding: [0x05,0x08,0x77,0xd7,0x01,0xe1,0xf1,0x01] + +v_permlane16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] +// GFX13: v_permlane16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] ; encoding: [0xff,0x10,0x77,0xd7,0xff,0xfb,0xfd,0x01] + +v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, exec_lo, 
src_scc +// W32: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, s105, s105 +// W32: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, s2, s3 +// W32: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, m0, vcc_hi +// W32: 
v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, s105, s105 +// W32: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, s2, s3 +// W32: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, exec_hi +// W32: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xff,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, exec_lo +// W32: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xfd,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, m0 +// W32: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xfb,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, s105 +// W32: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: 
[0x05,0x00,0x17,0xd7,0x01,0xd3,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, s2 +// W32: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0x05,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, ttmp15 +// W32: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xf7,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, vcc_hi +// W32: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xd7,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, vcc_lo +// W32: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xd5,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, s105, s105 +// W32: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, s2, s3 +// W32: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_permlane_up_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, s105, s105 +// W32: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, s2, s3 +// W32: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo +// W32: 
v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlanex16_b32 v5, v1, s2, s3 +// GFX13: v_permlanex16_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0x05,0x0c,0x00] + +v_permlanex16_b32 v5, v1, s105, s105 +// GFX13: v_permlanex16_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xd3,0xa4,0x01] + +v_permlanex16_b32 v5, v1, ttmp15, ttmp15 +// GFX13: v_permlanex16_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xf7,0xec,0x01] + +v_permlanex16_b32 v5, v1, vcc_hi, exec_lo +// GFX13: v_permlanex16_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xd7,0xf8,0x01] + +v_permlanex16_b32 v5, v1, vcc_lo, m0 +// GFX13: v_permlanex16_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xd5,0xf4,0x01] + +v_permlanex16_b32 v5, v1, m0, vcc_hi +// GFX13: v_permlanex16_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xfb,0xac,0x01] + +v_permlanex16_b32 v5, v1, exec_hi, vcc_lo +// GFX13: v_permlanex16_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xff,0xa8,0x01] + +v_permlanex16_b32 v5, v1, exec_lo, src_scc +// GFX13: v_permlanex16_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xfd,0xf4,0x03] + +v_permlanex16_b32 v5, v1, null, 0.5 op_sel:[1,1] +// GFX13: v_permlanex16_b32 v5, v1, null, 0.5 op_sel:[1,1] ; encoding: [0x05,0x18,0x78,0xd7,0x01,0xf9,0xc0,0x03] + +v_permlanex16_b32 v5, v1, -1, -1 op_sel:[0,0] +// GFX13: v_permlanex16_b32 v5, v1, -1, -1 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0x83,0x05,0x03] + +v_permlanex16_b32 v5, v1, 0.5, null op_sel:[1,0] +// GFX13: v_permlanex16_b32 v5, v1, 0.5, null op_sel:[1,0] ; encoding: 
[0x05,0x08,0x78,0xd7,0x01,0xe1,0xf1,0x01] + +v_permlanex16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] +// GFX13: v_permlanex16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] ; encoding: [0xff,0x10,0x78,0xd7,0xff,0xfb,0xfd,0x01] + +v_permlane16_var_b32 v5, v1, v2 +// GFX13: v_permlane16_var_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x15,0xd7,0x01,0x05,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v255 +// GFX13: v_permlane16_var_b32 v5, v1, v255 ; encoding: [0x05,0x00,0x15,0xd7,0x01,0xff,0x03,0x02] + +v_permlane16_var_b32 v5, v255, v0 +// GFX13: v_permlane16_var_b32 v5, v255, v0 ; encoding: [0x05,0x00,0x15,0xd7,0xff,0x01,0x02,0x02] + +v_permlane16_var_b32 v255, v1, v2 +// GFX13: v_permlane16_var_b32 v255, v1, v2 ; encoding: [0xff,0x00,0x15,0xd7,0x01,0x05,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[1,1] +// GFX13: v_permlane16_var_b32 v5, v1, v50 op_sel:[1,1] ; encoding: [0x05,0x18,0x15,0xd7,0x01,0x65,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[0,0] +// GFX13: v_permlane16_var_b32 v5, v1, v50 ; encoding: [0x05,0x00,0x15,0xd7,0x01,0x65,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[1,0] +// GFX13: v_permlane16_var_b32 v5, v1, v50 op_sel:[1,0] ; encoding: [0x05,0x08,0x15,0xd7,0x01,0x65,0x02,0x02] + +v_permlane16_var_b32 v255, v255, v0, op_sel:[0,1] +// GFX13: v_permlane16_var_b32 v255, v255, v0 op_sel:[0,1] ; encoding: [0xff,0x10,0x15,0xd7,0xff,0x01,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v2 +// GFX13: v_permlanex16_var_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0x05,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v105 +// GFX13: v_permlanex16_var_b32 v5, v1, v105 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0xd3,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v255 +// GFX13: v_permlanex16_var_b32 v5, v1, v255 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0xff,0x03,0x02] + +v_permlanex16_var_b32 v255, v1, v2 +// GFX13: v_permlanex16_var_b32 v255, v1, v2 ; encoding: [0xff,0x00,0x16,0xd7,0x01,0x05,0x02,0x02] + +v_permlanex16_var_b32 v1, v255, v2 +// 
GFX13: v_permlanex16_var_b32 v1, v255, v2 ; encoding: [0x01,0x00,0x16,0xd7,0xff,0x05,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[1,1] +// GFX13: v_permlanex16_var_b32 v5, v1, v100 op_sel:[1,1] ; encoding: [0x05,0x18,0x16,0xd7,0x01,0xc9,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[0,0] +// GFX13: v_permlanex16_var_b32 v5, v1, v100 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0xc9,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[1,0] +// GFX13: v_permlanex16_var_b32 v5, v1, v100 op_sel:[1,0] ; encoding: [0x05,0x08,0x16,0xd7,0x01,0xc9,0x02,0x02] + +v_permlanex16_var_b32 v255, v255, v100, op_sel:[0,1] +// GFX13: v_permlanex16_var_b32 v255, v255, v100 op_sel:[0,1] ; encoding: [0xff,0x10,0x16,0xd7,0xff,0xc9,0x02,0x02] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0x05,0xea,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0xff,0xeb,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0x05,0xe8,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0xd3,0xe8,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] ; encoding: [0x05,0x00,0x3a,0xd6,0xfe,0xf7,0x18,0x00] + +v_qsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] +// GFX13: v_qsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] ; encoding: [0x05,0x00,0x3a,0xd6,0x02,0xd6,0x0c,0x04] + +v_qsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] +// GFX13: v_qsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] ; encoding: [0x05,0x00,0x3a,0xd6,0x68,0xd4,0xa0,0x01] + +v_qsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] +// GFX13: 
v_qsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] ; encoding: [0x05,0x00,0x3a,0xd6,0x6a,0xfa,0xf8,0x07] + +v_qsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null +// GFX13: v_qsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null ; encoding: [0x05,0x00,0x3a,0xd6,0x7a,0xfe,0xf0,0x01] + +v_qsad_pk_u16_u8 v[5:6], exec, exec_lo, exec +// GFX13: v_qsad_pk_u16_u8 v[5:6], exec, exec_lo, exec ; encoding: [0x05,0x00,0x3a,0xd6,0x7e,0xfc,0xf8,0x01] + +v_qsad_pk_u16_u8 v[5:6], null, null, vcc +// GFX13: v_qsad_pk_u16_u8 v[5:6], null, null, vcc ; encoding: [0x05,0x00,0x3a,0xd6,0x7c,0xf8,0xa8,0x01] + +v_qsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_qsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x3a,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_qsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc +// GFX13: v_qsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc ; encoding: [0x05,0x00,0x3a,0xd6,0xf0,0xe0,0xf5,0x03] + +v_qsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 +// GFX13: v_qsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 ; encoding: [0x05,0x00,0x3a,0xd6,0xfd,0xfa,0xc1,0x03] + +v_qsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp +// GFX13: v_qsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp ; encoding: [0xfe,0x80,0x3a,0xd6,0xff,0xfe,0x05,0x03,0x56,0x34,0x12,0xaf] + +v_readlane_b32 s5, v1, s2 +// GFX13: v_readlane_b32 s5, v1, s2 ; encoding: [0x05,0x00,0x60,0xd7,0x01,0x05,0x00,0x02] + +v_readlane_b32 s5, v1, s105 +// GFX13: v_readlane_b32 s5, v1, s105 ; encoding: [0x05,0x00,0x60,0xd7,0x01,0xd3,0x00,0x02] + +v_readlane_b32 s105, v1, ttmp15 +// GFX13: v_readlane_b32 s105, v1, ttmp15 ; encoding: [0x69,0x00,0x60,0xd7,0x01,0xf7,0x00,0x02] + +v_readlane_b32 vcc_lo, v1, vcc_hi +// GFX13: v_readlane_b32 vcc_lo, v1, vcc_hi ; encoding: [0x6a,0x00,0x60,0xd7,0x01,0xd7,0x00,0x02] + +v_readlane_b32 vcc_hi, v1, vcc_lo +// GFX13: v_readlane_b32 vcc_hi, v1, vcc_lo ; encoding: [0x6b,0x00,0x60,0xd7,0x01,0xd5,0x00,0x02] + +v_readlane_b32 ttmp15, v1, m0 +// GFX13: v_readlane_b32 ttmp15, 
v1, m0 ; encoding: [0x7b,0x00,0x60,0xd7,0x01,0xfb,0x00,0x02] + +v_readlane_b32 null, v255, null +// GFX13: v_readlane_b32 null, v255, null ; encoding: [0x7c,0x00,0x60,0xd7,0xff,0xf9,0x00,0x02] + +v_sad_hi_u8 v5, v1, v2, s3 +// GFX13: v_sad_hi_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x23,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_hi_u8 v5, v255, s2, s105 +// GFX13: v_sad_hi_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x23,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_hi_u8 v5, s1, v255, exec_hi +// GFX13: v_sad_hi_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x23,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_hi_u8 v5, s105, s105, exec_lo +// GFX13: v_sad_hi_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x23,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_hi_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_hi_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x23,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_hi_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_sad_hi_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x23,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_sad_hi_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_hi_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x23,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_hi_u8 v5, m0, 0.5, m0 +// GFX13: v_sad_hi_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x23,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_hi_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_hi_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x23,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_hi_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_hi_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x23,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_hi_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_hi_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x23,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_sad_hi_u8 v5, -1, exec_hi, src_scc +// GFX13: v_sad_hi_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x23,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_hi_u8 v5, 0.5, m0, 0.5 +// GFX13: v_sad_hi_u8 v5, 0.5, m0, 0.5 ; encoding: 
[0x05,0x00,0x23,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_hi_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_hi_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x23,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_hi_u8 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_sad_hi_u8 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x23,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_sad_u16 v5, v1, v2, s3 +// GFX13: v_sad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x24,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_u16 v5, v255, s2, s105 +// GFX13: v_sad_u16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x24,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_u16 v5, s1, v255, exec_hi +// GFX13: v_sad_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x24,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_u16 v5, s105, s105, exec_lo +// GFX13: v_sad_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x24,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_u16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x24,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_sad_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x24,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_sad_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x24,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_u16 v5, m0, 0.5, m0 +// GFX13: v_sad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x24,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_u16 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_u16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x24,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_u16 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_u16 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x24,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_u16 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_u16 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x24,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_sad_u16 v5, -1, exec_hi, src_scc +// GFX13: v_sad_u16 v5, -1, exec_hi, src_scc ; encoding: 
[0x05,0x00,0x24,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_u16 v5, 0.5, m0, 0.5 +// GFX13: v_sad_u16 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x24,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_u16 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_u16 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x24,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_u16 v255, 0xfe0b, vcc_hi, null clamp +// GFX13: v_sad_u16 v255, 0xfe0b, vcc_hi, null clamp ; encoding: [0xff,0x80,0x24,0xd6,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_sad_u32 v5, v1, v2, s3 +// GFX13: v_sad_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x25,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_u32 v5, v255, s2, s105 +// GFX13: v_sad_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x25,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_u32 v5, s1, v255, exec_hi +// GFX13: v_sad_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x25,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_u32 v5, s105, s105, exec_lo +// GFX13: v_sad_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x25,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x25,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_sad_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x25,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_sad_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x25,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_u32 v5, m0, 0.5, m0 +// GFX13: v_sad_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x25,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x25,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x25,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x25,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + 
+v_sad_u32 v5, -1, exec_hi, src_scc +// GFX13: v_sad_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x25,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_u32 v5, 0.5, m0, 0.5 +// GFX13: v_sad_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x25,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x25,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_u32 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_sad_u32 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x25,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_sad_u8 v5, v1, v2, s3 +// GFX13: v_sad_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x22,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_u8 v5, v255, s2, s105 +// GFX13: v_sad_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x22,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_u8 v5, s1, v255, exec_hi +// GFX13: v_sad_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x22,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_u8 v5, s105, s105, exec_lo +// GFX13: v_sad_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x22,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x22,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_sad_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x22,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_sad_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x22,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_u8 v5, m0, 0.5, m0 +// GFX13: v_sad_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x22,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x22,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x22,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_u8 v5, null, exec_lo, 0xaf123456 
; encoding: [0x05,0x00,0x22,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_sad_u8 v5, -1, exec_hi, src_scc +// GFX13: v_sad_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x22,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_u8 v5, 0.5, m0, 0.5 +// GFX13: v_sad_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x22,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x22,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_u8 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_sad_u8 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x22,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_sub_co_u32 v5, s6, v1, v2 +// W32: v_sub_co_u32 v5, s6, v1, v2 ; encoding: [0x05,0x06,0x10,0xd7,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, v255, v255 +// W32: v_sub_co_u32 v5, s6, v255, v255 ; encoding: [0x05,0x06,0x10,0xd7,0xff,0xff,0x03,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, s1, s2 +// W32: v_sub_co_u32 v5, s6, s1, s2 ; encoding: [0x05,0x06,0x10,0xd7,0x01,0x04,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, s105, s105 +// W32: v_sub_co_u32 v5, s6, s105, s105 ; encoding: [0x05,0x06,0x10,0xd7,0x69,0xd2,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, vcc_lo, ttmp15 +// W32: v_sub_co_u32 v5, s6, vcc_lo, ttmp15 ; encoding: [0x05,0x06,0x10,0xd7,0x6a,0xf6,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, vcc_hi, 0xaf123456 +// W32: v_sub_co_u32 v5, s6, vcc_hi, 0xaf123456 ; encoding: [0x05,0x06,0x10,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, ttmp15, src_scc +// W32: v_sub_co_u32 v5, s6, ttmp15, src_scc ; encoding: 
[0x05,0x06,0x10,0xd7,0x7b,0xfa,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, m0, 0.5 +// W32: v_sub_co_u32 v5, s6, m0, 0.5 ; encoding: [0x05,0x06,0x10,0xd7,0x7d,0xe0,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, exec_lo, -1 +// W32: v_sub_co_u32 v5, s6, exec_lo, -1 ; encoding: [0x05,0x06,0x10,0xd7,0x7e,0x82,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, exec_hi, null +// W32: v_sub_co_u32 v5, s6, exec_hi, null ; encoding: [0x05,0x06,0x10,0xd7,0x7f,0xf8,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s105, null, exec_lo +// W32: v_sub_co_u32 v5, s105, null, exec_lo ; encoding: [0x05,0x69,0x10,0xd7,0x7c,0xfc,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, vcc_lo, -1, exec_hi +// W32: v_sub_co_u32 v5, vcc_lo, -1, exec_hi ; encoding: [0x05,0x6a,0x10,0xd7,0xc1,0xfe,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, vcc_hi, 0.5, m0 +// W32: v_sub_co_u32 v5, vcc_hi, 0.5, m0 ; encoding: [0x05,0x6b,0x10,0xd7,0xf0,0xfa,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, ttmp15, src_scc, vcc_lo +// W32: v_sub_co_u32 v5, ttmp15, src_scc, vcc_lo ; encoding: [0x05,0x7b,0x10,0xd7,0xfd,0xd4,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s[12:13], v1, v2 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], v1, v2 ; encoding: [0x05,0x0c,0x10,0xd7,0x01,0x05,0x02,0x02] + +v_sub_co_u32 v5, s[12:13], v255, v255 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], v255, v255 ; encoding: [0x05,0x0c,0x10,0xd7,0xff,0xff,0x03,0x02] + +v_sub_co_u32 v5, s[12:13], s1, s2 
+// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], s1, s2 ; encoding: [0x05,0x0c,0x10,0xd7,0x01,0x04,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], s105, s105 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], s105, s105 ; encoding: [0x05,0x0c,0x10,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], vcc_lo, ttmp15 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], vcc_lo, ttmp15 ; encoding: [0x05,0x0c,0x10,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 ; encoding: [0x05,0x0c,0x10,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_sub_co_u32 v5, s[12:13], ttmp15, src_scc +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], ttmp15, src_scc ; encoding: [0x05,0x0c,0x10,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_co_u32 v5, s[12:13], m0, 0.5 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], m0, 0.5 ; encoding: [0x05,0x0c,0x10,0xd7,0x7d,0xe0,0x01,0x02] + +v_sub_co_u32 v5, s[12:13], exec_lo, -1 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], exec_lo, -1 ; encoding: [0x05,0x0c,0x10,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_co_u32 v5, s[12:13], exec_hi, null +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], exec_hi, null ; encoding: [0x05,0x0c,0x10,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], null, exec_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], null, exec_lo ; encoding: [0x05,0x0c,0x10,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_co_u32 v5, s[104:105], -1, exec_hi +// W32-ERR: :[[@LINE-1]]:18: error: 
invalid operand for instruction +// W64: v_sub_co_u32 v5, s[104:105], -1, exec_hi ; encoding: [0x05,0x68,0x10,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_co_u32 v5, vcc, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// W64: v_sub_co_u32 v5, vcc, 0.5, m0 ; encoding: [0x05,0x6a,0x10,0xd7,0xf0,0xfa,0x00,0x02] + +v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo ; encoding: [0x05,0x7a,0x10,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp +// GFX13: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x10,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_sub_nc_i16 v5, v1, v2 +// GFX13: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x02] + +v_sub_nc_i16 v5, v255, v255 +// GFX13: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x02] + +v_sub_nc_i16 v5, s1, s2 +// GFX13: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x02] + +v_sub_nc_i16 v5, s105, s105 +// GFX13: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_nc_i16 v5, vcc_lo, ttmp15 +// GFX13: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_nc_i16 v5, vcc_hi, 0xfe0b +// GFX13: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i16 v5, ttmp15, src_scc +// GFX13: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_nc_i16 v5, m0, 0.5 +// GFX13-ASM: v_sub_nc_i16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_i16 v5, exec_lo, -1 +// GFX13: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: 
[0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_nc_i16 v5, exec_hi, null +// GFX13: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +// GFX13: v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i32 v5, v1, v2 +// GFX13: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x76,0xd7,0x01,0x05,0x02,0x02] + +v_sub_nc_i32 v5, v255, v255 +// GFX13: v_sub_nc_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x76,0xd7,0xff,0xff,0x03,0x02] + +v_sub_nc_i32 v5, s1, s2 +// GFX13: v_sub_nc_i32 v5, s1, s2 ; encoding: [0x05,0x00,0x76,0xd7,0x01,0x04,0x00,0x02] + +v_sub_nc_i32 v5, s105, s105 +// GFX13: v_sub_nc_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x76,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_nc_i32 v5, vcc_lo, ttmp15 +// GFX13: v_sub_nc_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x76,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_nc_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_sub_nc_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x76,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_sub_nc_i32 v5, ttmp15, src_scc +// GFX13: v_sub_nc_i32 v5, ttmp15, src_scc ; encoding: 
[0x05,0x00,0x76,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_nc_i32 v5, m0, 0.5 +// GFX13: v_sub_nc_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x76,0xd7,0x7d,0xe0,0x01,0x02] + +v_sub_nc_i32 v5, exec_lo, -1 +// GFX13: v_sub_nc_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x76,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_nc_i32 v5, exec_hi, null +// GFX13: v_sub_nc_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x76,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_nc_i32 v5, null, exec_lo +// GFX13: v_sub_nc_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x76,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_nc_i32 v5, -1, exec_hi +// GFX13: v_sub_nc_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x76,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_nc_i32 v5, 0.5, m0 +// GFX13: v_sub_nc_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x76,0xd7,0xf0,0xfa,0x00,0x02] + +v_sub_nc_i32 v5, src_scc, vcc_lo +// GFX13: v_sub_nc_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x76,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp +// GFX13: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x76,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_sub_nc_u16 v5, v1, v2 +// GFX13: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x02] + +v_sub_nc_u16 v5, v255, v255 +// GFX13: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x02] + +v_sub_nc_u16 v5, s1, s2 +// GFX13: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x02] + +v_sub_nc_u16 v5, s105, s105 +// GFX13: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_nc_u16 v5, vcc_lo, ttmp15 +// GFX13: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_nc_u16 v5, vcc_hi, 0xfe0b +// GFX13: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_sub_nc_u16 v5, ttmp15, src_scc +// GFX13: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x02] + 
+v_sub_nc_u16 v5, m0, 0.5 +// GFX13-ASM: v_sub_nc_u16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_u16 v5, exec_lo, -1 +// GFX13: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_nc_u16 v5, exec_hi, null +// GFX13: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +// GFX13: v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_subrev_co_u32 v5, s6, v1, v2 +// W32: v_subrev_co_u32 v5, s6, v1, v2 ; encoding: [0x05,0x06,0x19,0xd7,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, v255, v255 +// W32: v_subrev_co_u32 v5, s6, v255, v255 ; encoding: [0x05,0x06,0x19,0xd7,0xff,0xff,0x03,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, s1, s2 +// W32: v_subrev_co_u32 v5, s6, s1, s2 ; encoding: [0x05,0x06,0x19,0xd7,0x01,0x04,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: 
error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, s105, s105 +// W32: v_subrev_co_u32 v5, s6, s105, s105 ; encoding: [0x05,0x06,0x19,0xd7,0x69,0xd2,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, vcc_lo, ttmp15 +// W32: v_subrev_co_u32 v5, s6, vcc_lo, ttmp15 ; encoding: [0x05,0x06,0x19,0xd7,0x6a,0xf6,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, vcc_hi, 0xaf123456 +// W32: v_subrev_co_u32 v5, s6, vcc_hi, 0xaf123456 ; encoding: [0x05,0x06,0x19,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, ttmp15, src_scc +// W32: v_subrev_co_u32 v5, s6, ttmp15, src_scc ; encoding: [0x05,0x06,0x19,0xd7,0x7b,0xfa,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, m0, 0.5 +// W32: v_subrev_co_u32 v5, s6, m0, 0.5 ; encoding: [0x05,0x06,0x19,0xd7,0x7d,0xe0,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, exec_lo, -1 +// W32: v_subrev_co_u32 v5, s6, exec_lo, -1 ; encoding: [0x05,0x06,0x19,0xd7,0x7e,0x82,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, exec_hi, null +// W32: v_subrev_co_u32 v5, s6, exec_hi, null ; encoding: [0x05,0x06,0x19,0xd7,0x7f,0xf8,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s105, null, exec_lo +// W32: v_subrev_co_u32 v5, s105, null, exec_lo ; encoding: [0x05,0x69,0x19,0xd7,0x7c,0xfc,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, vcc_lo, -1, exec_hi +// W32: v_subrev_co_u32 v5, vcc_lo, -1, exec_hi ; encoding: [0x05,0x6a,0x19,0xd7,0xc1,0xfe,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, vcc_hi, 0.5, m0 
+// W32: v_subrev_co_u32 v5, vcc_hi, 0.5, m0 ; encoding: [0x05,0x6b,0x19,0xd7,0xf0,0xfa,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, ttmp15, src_scc, vcc_lo +// W32: v_subrev_co_u32 v5, ttmp15, src_scc, vcc_lo ; encoding: [0x05,0x7b,0x19,0xd7,0xfd,0xd4,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s[12:13], v1, v2 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], v1, v2 ; encoding: [0x05,0x0c,0x19,0xd7,0x01,0x05,0x02,0x02] + +v_subrev_co_u32 v5, s[12:13], v255, v255 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], v255, v255 ; encoding: [0x05,0x0c,0x19,0xd7,0xff,0xff,0x03,0x02] + +v_subrev_co_u32 v5, s[12:13], s1, s2 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], s1, s2 ; encoding: [0x05,0x0c,0x19,0xd7,0x01,0x04,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], s105, s105 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], s105, s105 ; encoding: [0x05,0x0c,0x19,0xd7,0x69,0xd2,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], vcc_lo, ttmp15 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], vcc_lo, ttmp15 ; encoding: [0x05,0x0c,0x19,0xd7,0x6a,0xf6,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 ; encoding: [0x05,0x0c,0x19,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_subrev_co_u32 v5, s[12:13], ttmp15, src_scc +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], ttmp15, src_scc ; encoding: [0x05,0x0c,0x19,0xd7,0x7b,0xfa,0x01,0x02] + +v_subrev_co_u32 v5, s[12:13], m0, 0.5 +// 
W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], m0, 0.5 ; encoding: [0x05,0x0c,0x19,0xd7,0x7d,0xe0,0x01,0x02] + +v_subrev_co_u32 v5, s[12:13], exec_lo, -1 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], exec_lo, -1 ; encoding: [0x05,0x0c,0x19,0xd7,0x7e,0x82,0x01,0x02] + +v_subrev_co_u32 v5, s[12:13], exec_hi, null +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], exec_hi, null ; encoding: [0x05,0x0c,0x19,0xd7,0x7f,0xf8,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], null, exec_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], null, exec_lo ; encoding: [0x05,0x0c,0x19,0xd7,0x7c,0xfc,0x00,0x02] + +v_subrev_co_u32 v5, s[104:105], -1, exec_hi +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[104:105], -1, exec_hi ; encoding: [0x05,0x68,0x19,0xd7,0xc1,0xfe,0x00,0x02] + +v_subrev_co_u32 v5, vcc, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// W64: v_subrev_co_u32 v5, vcc, 0.5, m0 ; encoding: [0x05,0x6a,0x19,0xd7,0xf0,0xfa,0x00,0x02] + +v_subrev_co_u32 v5, ttmp[14:15], src_scc, vcc_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, ttmp[14:15], src_scc, vcc_lo ; encoding: [0x05,0x7a,0x19,0xd7,0xfd,0xd4,0x00,0x02] + +v_subrev_co_u32 v255, null, 0xaf123456, vcc_hi clamp +// GFX13: v_subrev_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x19,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_trig_preop_f64 v[5:6], v[1:2], v2 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], v2 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0x05,0x02,0x02] + +v_trig_preop_f64 v[5:6], v[1:2], v255 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], v255 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0xff,0x03,0x02] + +v_trig_preop_f64 
v[5:6], v[1:2], s2 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], s2 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0x05,0x00,0x02] + +v_trig_preop_f64 v[5:6], v[1:2], s105 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], s105 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0xd3,0x00,0x02] + +v_trig_preop_f64 v[5:6], v[254:255], ttmp15 +// GFX13: v_trig_preop_f64 v[5:6], v[254:255], ttmp15 ; encoding: [0x05,0x00,0x2f,0xd7,0xfe,0xf7,0x00,0x02] + +v_trig_preop_f64 v[5:6], s[2:3], vcc_hi +// GFX13: v_trig_preop_f64 v[5:6], s[2:3], vcc_hi ; encoding: [0x05,0x00,0x2f,0xd7,0x02,0xd6,0x00,0x02] + +v_trig_preop_f64 v[5:6], s[104:105], vcc_lo +// GFX13: v_trig_preop_f64 v[5:6], s[104:105], vcc_lo ; encoding: [0x05,0x00,0x2f,0xd7,0x68,0xd4,0x00,0x02] + +v_trig_preop_f64 v[5:6], vcc, m0 +// GFX13: v_trig_preop_f64 v[5:6], vcc, m0 ; encoding: [0x05,0x00,0x2f,0xd7,0x6a,0xfa,0x00,0x02] + +v_trig_preop_f64 v[5:6], ttmp[14:15], exec_hi +// GFX13: v_trig_preop_f64 v[5:6], ttmp[14:15], exec_hi ; encoding: [0x05,0x00,0x2f,0xd7,0x7a,0xfe,0x00,0x02] + +v_trig_preop_f64 v[5:6], exec, exec_lo +// GFX13: v_trig_preop_f64 v[5:6], exec, exec_lo ; encoding: [0x05,0x00,0x2f,0xd7,0x7e,0xfc,0x00,0x02] + +v_trig_preop_f64 v[5:6], null, null +// GFX13: v_trig_preop_f64 v[5:6], null, null ; encoding: [0x05,0x00,0x2f,0xd7,0x7c,0xf8,0x00,0x02] + +v_trig_preop_f64 v[5:6], -1, -1 +// GFX13: v_trig_preop_f64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x2f,0xd7,0xc1,0x82,0x01,0x02] + +v_trig_preop_f64 v[5:6], 0.5, 0.5 mul:2 +// GFX13: v_trig_preop_f64 v[5:6], 0.5, 0.5 mul:2 ; encoding: [0x05,0x00,0x2f,0xd7,0xf0,0xe0,0x01,0x0a] + +v_trig_preop_f64 v[5:6], -|src_scc|, src_scc mul:4 +// GFX13: v_trig_preop_f64 v[5:6], -|src_scc|, src_scc mul:4 ; encoding: [0x05,0x01,0x2f,0xd7,0xfd,0xfa,0x01,0x32] + +v_trig_preop_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 +// GFX13: v_trig_preop_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x2f,0xd7,0xff,0xfe,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_writelane_b32 v5, s1, s2 
+// GFX13: v_writelane_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x01,0x04,0x00,0x02] + +v_writelane_b32 v5, s105, s2 +// GFX13: v_writelane_b32 v5, s105, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x69,0x04,0x00,0x02] + +v_writelane_b32 v5, vcc_lo, s2 +// GFX13: v_writelane_b32 v5, vcc_lo, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x6a,0x04,0x00,0x02] + +v_writelane_b32 v5, vcc_hi, s2 +// GFX13: v_writelane_b32 v5, vcc_hi, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x6b,0x04,0x00,0x02] + +v_writelane_b32 v5, ttmp15, s2 +// GFX13: v_writelane_b32 v5, ttmp15, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x7b,0x04,0x00,0x02] + +v_writelane_b32 v5, m0, s2 +// GFX13: v_writelane_b32 v5, m0, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x7d,0x04,0x00,0x02] + +v_writelane_b32 v5, exec_lo, s2 +// GFX13: v_writelane_b32 v5, exec_lo, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x7e,0x04,0x00,0x02] + +v_writelane_b32 v5, exec_hi, s105 +// GFX13: v_writelane_b32 v5, exec_hi, s105 ; encoding: [0x05,0x00,0x61,0xd7,0x7f,0xd2,0x00,0x02] + +v_writelane_b32 v5, null, ttmp15 +// GFX13: v_writelane_b32 v5, null, ttmp15 ; encoding: [0x05,0x00,0x61,0xd7,0x7c,0xf6,0x00,0x02] + +v_writelane_b32 v5, -1, null +// GFX13: v_writelane_b32 v5, -1, null ; encoding: [0x05,0x00,0x61,0xd7,0xc1,0xf8,0x00,0x02] + +v_writelane_b32 v5, 0.5, m0 +// GFX13: v_writelane_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd7,0xf0,0xfa,0x00,0x02] + +v_writelane_b32 v5, src_scc, vcc_lo +// GFX13: v_writelane_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x61,0xd7,0xfd,0xd4,0x00,0x02] + +v_writelane_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_writelane_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x61,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_xad_u32 v5, v1, v2, s3 +// GFX13: v_xad_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x45,0xd7,0x01,0x05,0x0e,0x00] + +v_xad_u32 v5, v255, s2, s105 +// GFX13: v_xad_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x45,0xd7,0xff,0x05,0xa4,0x01] + +v_xad_u32 v5, s1, v255, exec_hi +// GFX13: v_xad_u32 v5, s1, 
v255, exec_hi ; encoding: [0x05,0x00,0x45,0xd7,0x01,0xfe,0xff,0x01] + +v_xad_u32 v5, s105, s105, exec_lo +// GFX13: v_xad_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x45,0xd7,0x69,0xd2,0xf8,0x01] + +v_xad_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_xad_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x45,0xd7,0x6a,0xf6,0x0c,0x04] + +v_xad_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_xad_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x45,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_xad_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_xad_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x45,0xd7,0x7b,0xfa,0xed,0x01] + +v_xad_u32 v5, m0, 0.5, m0 +// GFX13: v_xad_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x45,0xd7,0x7d,0xe0,0xf5,0x01] + +v_xad_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_xad_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x45,0xd7,0x7e,0x82,0xad,0x01] + +v_xad_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_xad_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x45,0xd7,0x7f,0xf8,0xa8,0x01] + +v_xad_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_xad_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x45,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_xad_u32 v5, -1, exec_hi, src_scc +// GFX13: v_xad_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x45,0xd7,0xc1,0xfe,0xf4,0x03] + +v_xad_u32 v5, 0.5, m0, 0.5 +// GFX13: v_xad_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x45,0xd7,0xf0,0xfa,0xc0,0x03] + +v_xad_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_xad_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x45,0xd7,0xfd,0xd4,0x04,0x03] + +v_xad_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_xad_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x45,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_xor3_b32 v5, v1, v2, s3 +// GFX13: v_xor3_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x40,0xd6,0x01,0x05,0x0e,0x00] + +v_xor3_b32 v5, v255, s2, s105 +// GFX13: v_xor3_b32 v5, v255, s2, s105 ; encoding: 
[0x05,0x00,0x40,0xd6,0xff,0x05,0xa4,0x01] + +v_xor3_b32 v5, s1, v255, exec_hi +// GFX13: v_xor3_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x40,0xd6,0x01,0xfe,0xff,0x01] + +v_xor3_b32 v5, s105, s105, exec_lo +// GFX13: v_xor3_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x40,0xd6,0x69,0xd2,0xf8,0x01] + +v_xor3_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_xor3_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x40,0xd6,0x6a,0xf6,0x0c,0x04] + +v_xor3_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_xor3_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x40,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_xor3_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_xor3_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x40,0xd6,0x7b,0xfa,0xed,0x01] + +v_xor3_b32 v5, m0, 0.5, m0 +// GFX13: v_xor3_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x40,0xd6,0x7d,0xe0,0xf5,0x01] + +v_xor3_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_xor3_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x40,0xd6,0x7e,0x82,0xad,0x01] + +v_xor3_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_xor3_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x40,0xd6,0x7f,0xf8,0xa8,0x01] + +v_xor3_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_xor3_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x40,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_xor3_b32 v5, -1, exec_hi, src_scc +// GFX13: v_xor3_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x40,0xd6,0xc1,0xfe,0xf4,0x03] + +v_xor3_b32 v5, 0.5, m0, 0.5 +// GFX13: v_xor3_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x40,0xd6,0xf0,0xfa,0xc0,0x03] + +v_xor3_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_xor3_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x40,0xd6,0xfd,0xd4,0x04,0x03] + +v_xor3_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_xor3_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x40,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_xor_b16 v5, v1, v2 +// GFX13: v_xor_b16 v5, v1, v2 ; encoding: 
[0x05,0x00,0x70,0xd7,0x01,0x05,0x02,0x02] + +v_xor_b16 v5, v255, v255 +// GFX13: v_xor_b16 v5, v255, v255 ; encoding: [0x05,0x00,0x70,0xd7,0xff,0xff,0x03,0x02] + +v_xor_b16 v5, s1, s2 +// GFX13: v_xor_b16 v5, s1, s2 ; encoding: [0x05,0x00,0x70,0xd7,0x01,0x04,0x00,0x02] + +v_xor_b16 v5, s105, s105 +// GFX13: v_xor_b16 v5, s105, s105 ; encoding: [0x05,0x00,0x70,0xd7,0x69,0xd2,0x00,0x02] + +v_xor_b16 v5, vcc_lo, ttmp15 +// GFX13: v_xor_b16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x70,0xd7,0x6a,0xf6,0x00,0x02] + +v_xor_b16 v5, vcc_hi, 0xfe0b +// GFX13: v_xor_b16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x70,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_xor_b16 v5, ttmp15, src_scc +// GFX13: v_xor_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x70,0xd7,0x7b,0xfa,0x01,0x02] + +v_xor_b16 v5, m0, 0.5 +// GFX13-ASM: v_xor_b16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x70,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_xor_b16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x70,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_xor_b16 v5, exec_lo, -1 +// GFX13: v_xor_b16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x70,0xd7,0x7e,0x82,0x01,0x02] + +v_xor_b16 v5, exec_hi, null +// GFX13: v_xor_b16 v5, exec_hi, null ; encoding: [0x05,0x00,0x70,0xd7,0x7f,0xf8,0x00,0x02] + +v_xor_b16 v5, null, exec_lo +// GFX13: v_xor_b16 v5, null, exec_lo ; encoding: [0x05,0x00,0x70,0xd7,0x7c,0xfc,0x00,0x02] + +v_xor_b16 v5, -1, exec_hi +// GFX13: v_xor_b16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x70,0xd7,0xc1,0xfe,0x00,0x02] + +v_xor_b16 v5, 0.5, m0 +// GFX13-ASM: v_xor_b16 v5, 0.5, m0 ; encoding: [0x05,0x00,0x70,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_xor_b16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x70,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_xor_b16 v5, src_scc, vcc_lo +// GFX13: v_xor_b16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x70,0xd7,0xfd,0xd4,0x00,0x02] + +v_xor_b16 v255, 0xfe0b, vcc_hi +// GFX13: v_xor_b16 v255, 0xfe0b, vcc_hi ; encoding: 
[0xff,0x00,0x70,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_minimum_f32 v5, v1, v2 +// GFX13: v_minimum_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x3c,0xd7,0x01,0x05,0x02,0x02] + +v_minimum_f32 v5, v255, v255 +// GFX13: v_minimum_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x3c,0xd7,0xff,0xff,0x03,0x02] + +v_minimum_f32 v5, s1, s2 +// GFX13: v_minimum_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x3c,0xd7,0x01,0x04,0x00,0x02] + +v_minimum_f32 v5, s105, s105 +// GFX13: v_minimum_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x3c,0xd7,0x69,0xd2,0x00,0x02] + +v_minimum_f32 v5, vcc_lo, ttmp15 +// GFX13: v_minimum_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3c,0xd7,0x6a,0xf6,0x00,0x02] + +v_minimum_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_minimum_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x3c,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_minimum_f32 v5, ttmp15, src_scc +// GFX13: v_minimum_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3c,0xd7,0x7b,0xfa,0x01,0x02] + +v_minimum_f32 v5, m0, 0.5 +// GFX13: v_minimum_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3c,0xd7,0x7d,0xe0,0x01,0x02] + +v_minimum_f32 v5, exec_lo, -1 +// GFX13: v_minimum_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3c,0xd7,0x7e,0x82,0x01,0x02] + +v_minimum_f32 v5, |exec_hi|, null +// GFX13: v_minimum_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x3c,0xd7,0x7f,0xf8,0x00,0x02] + +v_minimum_f32 v5, null, exec_lo +// GFX13: v_minimum_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x3c,0xd7,0x7c,0xfc,0x00,0x02] + +v_minimum_f32 v5, -1, exec_hi +// GFX13: v_minimum_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x3c,0xd7,0xc1,0xfe,0x00,0x02] + +v_minimum_f32 v5, 0.5, -m0 +// GFX13: v_minimum_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x3c,0xd7,0xf0,0xfa,0x00,0x42] + +v_minimum_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_minimum_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x3c,0xd7,0xfd,0xd4,0x00,0x22] + +v_minimum_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_minimum_f32 v255, -|0xaf123456|, -|vcc_hi| ; 
encoding: [0xff,0x03,0x3c,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_maximum_f32 v5, v1, v2 +// GFX13: v_maximum_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x3d,0xd7,0x01,0x05,0x02,0x02] + +v_maximum_f32 v5, v255, v255 +// GFX13: v_maximum_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x3d,0xd7,0xff,0xff,0x03,0x02] + +v_maximum_f32 v5, s1, s2 +// GFX13: v_maximum_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x3d,0xd7,0x01,0x04,0x00,0x02] + +v_maximum_f32 v5, s105, s105 +// GFX13: v_maximum_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x3d,0xd7,0x69,0xd2,0x00,0x02] + +v_maximum_f32 v5, vcc_lo, ttmp15 +// GFX13: v_maximum_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3d,0xd7,0x6a,0xf6,0x00,0x02] + +v_maximum_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_maximum_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x3d,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_maximum_f32 v5, ttmp15, src_scc +// GFX13: v_maximum_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3d,0xd7,0x7b,0xfa,0x01,0x02] + +v_maximum_f32 v5, m0, 0.5 +// GFX13: v_maximum_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3d,0xd7,0x7d,0xe0,0x01,0x02] + +v_maximum_f32 v5, exec_lo, -1 +// GFX13: v_maximum_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3d,0xd7,0x7e,0x82,0x01,0x02] + +v_maximum_f32 v5, |exec_hi|, null +// GFX13: v_maximum_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x3d,0xd7,0x7f,0xf8,0x00,0x02] + +v_maximum_f32 v5, null, exec_lo +// GFX13: v_maximum_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x3d,0xd7,0x7c,0xfc,0x00,0x02] + +v_maximum_f32 v5, -1, exec_hi +// GFX13: v_maximum_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x3d,0xd7,0xc1,0xfe,0x00,0x02] + +v_maximum_f32 v5, 0.5, -m0 +// GFX13: v_maximum_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x3d,0xd7,0xf0,0xfa,0x00,0x42] + +v_maximum_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_maximum_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x3d,0xd7,0xfd,0xd4,0x00,0x22] + +v_maximum_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_maximum_f32 v255, -|0xaf123456|, 
-|vcc_hi| ; encoding: [0xff,0x03,0x3d,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_minimum_f16 v5, v1, v2 +// GFX13: v_minimum_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x3a,0xd7,0x01,0x05,0x02,0x02] + +v_minimum_f16 v5, v255, v255 +// GFX13: v_minimum_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x3a,0xd7,0xff,0xff,0x03,0x02] + +v_minimum_f16 v5, s1, s2 +// GFX13: v_minimum_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x3a,0xd7,0x01,0x04,0x00,0x02] + +v_minimum_f16 v5, s105, s105 +// GFX13: v_minimum_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x3a,0xd7,0x69,0xd2,0x00,0x02] + +v_minimum_f16 v5, vcc_lo, ttmp15 +// GFX13: v_minimum_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3a,0xd7,0x6a,0xf6,0x00,0x02] + +v_minimum_f16 v5, vcc_hi, 0xaf12 +// GFX13: v_minimum_f16 v5, vcc_hi, 0xaf12 ; encoding: [0x05,0x00,0x3a,0xd7,0x6b,0xfe,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_minimum_f16 v5, ttmp15, src_scc +// GFX13: v_minimum_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3a,0xd7,0x7b,0xfa,0x01,0x02] + +v_minimum_f16 v5, m0, 0.5 +// GFX13: v_minimum_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3a,0xd7,0x7d,0xe0,0x01,0x02] + +v_minimum_f16 v5, exec_lo, -1 +// GFX13: v_minimum_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3a,0xd7,0x7e,0x82,0x01,0x02] + +v_minimum_f16 v5, |exec_hi|, null +// GFX13: v_minimum_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x3a,0xd7,0x7f,0xf8,0x00,0x02] + +v_minimum_f16 v5, null, exec_lo +// GFX13: v_minimum_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x3a,0xd7,0x7c,0xfc,0x00,0x02] + +v_minimum_f16 v5, -1, exec_hi +// GFX13: v_minimum_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x3a,0xd7,0xc1,0xfe,0x00,0x02] + +v_minimum_f16 v5, 0.5, -m0 +// GFX13: v_minimum_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x3a,0xd7,0xf0,0xfa,0x00,0x42] + +v_minimum_f16 v5, -src_scc, |vcc_lo| +// GFX13: v_minimum_f16 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x3a,0xd7,0xfd,0xd4,0x00,0x22] + +v_minimum_f16 v255, -|0xaf12|, -|vcc_hi| +// GFX13: v_minimum_f16 v255, -|0xaf12|, -|vcc_hi| 
; encoding: [0xff,0x03,0x3a,0xd7,0xff,0xd6,0x00,0x62,0x12,0xaf,0x00,0x00] + +v_minimum_f16 v205, v201, v200 +// GFX13: v_minimum_f16 v205, v201, v200 ; encoding: [0xcd,0x00,0x3a,0xd7,0xc9,0x91,0x03,0x02] + +v_maximum_f16 v5, v1, v2 +// GFX13: v_maximum_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x3b,0xd7,0x01,0x05,0x02,0x02] + +v_maximum_f16 v5, v255, v255 +// GFX13: v_maximum_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x3b,0xd7,0xff,0xff,0x03,0x02] + +v_maximum_f16 v5, s1, s2 +// GFX13: v_maximum_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x3b,0xd7,0x01,0x04,0x00,0x02] + +v_maximum_f16 v5, s105, s105 +// GFX13: v_maximum_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x3b,0xd7,0x69,0xd2,0x00,0x02] + +v_maximum_f16 v5, vcc_lo, ttmp15 +// GFX13: v_maximum_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3b,0xd7,0x6a,0xf6,0x00,0x02] + +v_maximum_f16 v5, vcc_hi, 0xaf12 +// GFX13: v_maximum_f16 v5, vcc_hi, 0xaf12 ; encoding: [0x05,0x00,0x3b,0xd7,0x6b,0xfe,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_maximum_f16 v5, ttmp15, src_scc +// GFX13: v_maximum_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3b,0xd7,0x7b,0xfa,0x01,0x02] + +v_maximum_f16 v5, m0, 0.5 +// GFX13: v_maximum_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3b,0xd7,0x7d,0xe0,0x01,0x02] + +v_maximum_f16 v5, exec_lo, -1 +// GFX13: v_maximum_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3b,0xd7,0x7e,0x82,0x01,0x02] + +v_maximum_f16 v5, |exec_hi|, null +// GFX13: v_maximum_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x3b,0xd7,0x7f,0xf8,0x00,0x02] + +v_maximum_f16 v5, null, exec_lo +// GFX13: v_maximum_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x3b,0xd7,0x7c,0xfc,0x00,0x02] + +v_maximum_f16 v5, -1, exec_hi +// GFX13: v_maximum_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x3b,0xd7,0xc1,0xfe,0x00,0x02] + +v_maximum_f16 v5, 0.5, -m0 +// GFX13: v_maximum_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x3b,0xd7,0xf0,0xfa,0x00,0x42] + +v_maximum_f16 v5, -src_scc, |vcc_lo| +// GFX13: v_maximum_f16 v5, -src_scc, |vcc_lo| ; encoding: 
[0x05,0x02,0x3b,0xd7,0xfd,0xd4,0x00,0x22] + +v_maximum_f16 v255, -|0xaf12|, -|vcc_hi| +// GFX13: v_maximum_f16 v255, -|0xaf12|, -|vcc_hi| ; encoding: [0xff,0x03,0x3b,0xd7,0xff,0xd6,0x00,0x62,0x12,0xaf,0x00,0x00] + +v_maximum_f16 v205, v201, v200 +// GFX13: v_maximum_f16 v205, v201, v200 ; encoding: [0xcd,0x00,0x3b,0xd7,0xc9,0x91,0x03,0x02] + +v_minimum_f64 v[5:6], v[1:2], v[3:4] +// GFX13: v_minimum_f64 v[5:6], v[1:2], v[3:4] ; encoding: [0x05,0x00,0x3e,0xd7,0x01,0x07,0x02,0x02] + +v_minimum_f64 v[5:6], v[254:255], v[254:255] +// GFX13: v_minimum_f64 v[5:6], v[254:255], v[254:255] ; encoding: [0x05,0x00,0x3e,0xd7,0xfe,0xfd,0x03,0x02] + +v_minimum_f64 v[5:6], s[6:7], s[4:5] +// GFX13: v_minimum_f64 v[5:6], s[6:7], s[4:5] ; encoding: [0x05,0x00,0x3e,0xd7,0x06,0x08,0x00,0x02] + +v_minimum_f64 v[5:6], s[104:105], s[104:105] +// GFX13: v_minimum_f64 v[5:6], s[104:105], s[104:105] ; encoding: [0x05,0x00,0x3e,0xd7,0x68,0xd0,0x00,0x02] + +v_minimum_f64 v[5:6], vcc, ttmp[14:15] +// GFX13: v_minimum_f64 v[5:6], vcc, ttmp[14:15] ; encoding: [0x05,0x00,0x3e,0xd7,0x6a,0xf4,0x00,0x02] + +v_minimum_f64 v[5:6], vcc, 0xaf121234 +// GFX13: v_minimum_f64 v[5:6], vcc, 0xaf121234 ; encoding: [0x05,0x00,0x3e,0xd7,0x6a,0xfe,0x01,0x02,0x34,0x12,0x12,0xaf] + +v_minimum_f64 v[5:6], ttmp[14:15], src_scc +// GFX13: v_minimum_f64 v[5:6], ttmp[14:15], src_scc ; encoding: [0x05,0x00,0x3e,0xd7,0x7a,0xfa,0x01,0x02] + +v_minimum_f64 v[5:6], vcc, 0.5 +// GFX13: v_minimum_f64 v[5:6], vcc, 0.5 ; encoding: [0x05,0x00,0x3e,0xd7,0x6a,0xe0,0x01,0x02] + +v_minimum_f64 v[5:6], exec, -1 +// GFX13: v_minimum_f64 v[5:6], exec, -1 ; encoding: [0x05,0x00,0x3e,0xd7,0x7e,0x82,0x01,0x02] + +v_minimum_f64 v[5:6], |exec|, null +// GFX13: v_minimum_f64 v[5:6], |exec|, null ; encoding: [0x05,0x01,0x3e,0xd7,0x7e,0xf8,0x00,0x02] + +v_minimum_f64 v[5:6], null, exec +// GFX13: v_minimum_f64 v[5:6], null, exec ; encoding: [0x05,0x00,0x3e,0xd7,0x7c,0xfc,0x00,0x02] + +v_minimum_f64 v[5:6], -1, exec +// GFX13: v_minimum_f64 
v[5:6], -1, exec ; encoding: [0x05,0x00,0x3e,0xd7,0xc1,0xfc,0x00,0x02] + +v_minimum_f64 v[5:6], 0.5, -vcc +// GFX13: v_minimum_f64 v[5:6], 0.5, -vcc ; encoding: [0x05,0x00,0x3e,0xd7,0xf0,0xd4,0x00,0x42] + +v_minimum_f64 v[5:6], -src_scc, |vcc| +// GFX13: v_minimum_f64 v[5:6], -src_scc, |vcc| ; encoding: [0x05,0x02,0x3e,0xd7,0xfd,0xd4,0x00,0x22] + +v_minimum_f64 v[254:255], -|2|, -|vcc| +// GFX13: v_minimum_f64 v[254:255], -|2|, -|vcc| ; encoding: [0xfe,0x03,0x3e,0xd7,0x82,0xd4,0x00,0x62] + +v_maximum_f64 v[5:6], v[1:2], v[3:4] +// GFX13: v_maximum_f64 v[5:6], v[1:2], v[3:4] ; encoding: [0x05,0x00,0x3f,0xd7,0x01,0x07,0x02,0x02] + +v_maximum_f64 v[5:6], v[254:255], v[254:255] +// GFX13: v_maximum_f64 v[5:6], v[254:255], v[254:255] ; encoding: [0x05,0x00,0x3f,0xd7,0xfe,0xfd,0x03,0x02] + +v_maximum_f64 v[5:6], s[6:7], s[4:5] +// GFX13: v_maximum_f64 v[5:6], s[6:7], s[4:5] ; encoding: [0x05,0x00,0x3f,0xd7,0x06,0x08,0x00,0x02] + +v_maximum_f64 v[5:6], s[104:105], s[104:105] +// GFX13: v_maximum_f64 v[5:6], s[104:105], s[104:105] ; encoding: [0x05,0x00,0x3f,0xd7,0x68,0xd0,0x00,0x02] + +v_maximum_f64 v[5:6], vcc, ttmp[14:15] +// GFX13: v_maximum_f64 v[5:6], vcc, ttmp[14:15] ; encoding: [0x05,0x00,0x3f,0xd7,0x6a,0xf4,0x00,0x02] + +v_maximum_f64 v[5:6], vcc, 0xaf121234 +// GFX13: v_maximum_f64 v[5:6], vcc, 0xaf121234 ; encoding: [0x05,0x00,0x3f,0xd7,0x6a,0xfe,0x01,0x02,0x34,0x12,0x12,0xaf] + +v_maximum_f64 v[5:6], ttmp[14:15], src_scc +// GFX13: v_maximum_f64 v[5:6], ttmp[14:15], src_scc ; encoding: [0x05,0x00,0x3f,0xd7,0x7a,0xfa,0x01,0x02] + +v_maximum_f64 v[5:6], vcc, 0.5 +// GFX13: v_maximum_f64 v[5:6], vcc, 0.5 ; encoding: [0x05,0x00,0x3f,0xd7,0x6a,0xe0,0x01,0x02] + +v_maximum_f64 v[5:6], exec, -1 +// GFX13: v_maximum_f64 v[5:6], exec, -1 ; encoding: [0x05,0x00,0x3f,0xd7,0x7e,0x82,0x01,0x02] + +v_maximum_f64 v[5:6], |exec|, null +// GFX13: v_maximum_f64 v[5:6], |exec|, null ; encoding: [0x05,0x01,0x3f,0xd7,0x7e,0xf8,0x00,0x02] + +v_maximum_f64 v[5:6], null, exec +// 
GFX13: v_maximum_f64 v[5:6], null, exec ; encoding: [0x05,0x00,0x3f,0xd7,0x7c,0xfc,0x00,0x02] + +v_maximum_f64 v[5:6], -1, exec +// GFX13: v_maximum_f64 v[5:6], -1, exec ; encoding: [0x05,0x00,0x3f,0xd7,0xc1,0xfc,0x00,0x02] + +v_maximum_f64 v[5:6], 0.5, -vcc +// GFX13: v_maximum_f64 v[5:6], 0.5, -vcc ; encoding: [0x05,0x00,0x3f,0xd7,0xf0,0xd4,0x00,0x42] + +v_maximum_f64 v[5:6], -src_scc, |vcc| +// GFX13: v_maximum_f64 v[5:6], -src_scc, |vcc| ; encoding: [0x05,0x02,0x3f,0xd7,0xfd,0xd4,0x00,0x22] + +v_maximum_f64 v[254:255], -|2|, -|vcc| +// GFX13: v_maximum_f64 v[254:255], -|2|, -|vcc| ; encoding: [0xfe,0x03,0x3f,0xd7,0x82,0xd4,0x00,0x62] + +v_minimum3_f32 v5, v1, v2, s3 +// GFX13: v_minimum3_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2d,0xd6,0x01,0x05,0x0e,0x00] + +v_minimum3_f32 v5, v255, s2, s105 +// GFX13: v_minimum3_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2d,0xd6,0xff,0x05,0xa4,0x01] + +v_minimum3_f32 v5, s1, v255, exec_hi +// GFX13: v_minimum3_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2d,0xd6,0x01,0xfe,0xff,0x01] + +v_minimum3_f32 v5, s105, s105, exec_lo +// GFX13: v_minimum3_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2d,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimum3_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minimum3_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimum3_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minimum3_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x2d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minimum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2d,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimum3_f32 v5, m0, 0.5, m0 +// GFX13: v_minimum3_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimum3_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minimum3_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2d,0xd6,0x7e,0x82,0xad,0x01] + +v_minimum3_f32 v5, 
-|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimum3_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2d,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimum3_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_minimum3_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x2d,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_minimum3_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimum3_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2d,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimum3_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minimum3_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x2d,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minimum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minimum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x2d,0xd6,0xfd,0xd4,0x04,0x33] + +v_minimum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minimum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x2d,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_maximum3_f32 v5, v1, v2, s3 +// GFX13: v_maximum3_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2e,0xd6,0x01,0x05,0x0e,0x00] + +v_maximum3_f32 v5, v255, s2, s105 +// GFX13: v_maximum3_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2e,0xd6,0xff,0x05,0xa4,0x01] + +v_maximum3_f32 v5, s1, v255, exec_hi +// GFX13: v_maximum3_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2e,0xd6,0x01,0xfe,0xff,0x01] + +v_maximum3_f32 v5, s105, s105, exec_lo +// GFX13: v_maximum3_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2e,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximum3_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maximum3_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximum3_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maximum3_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x2e,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maximum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| 
; encoding: [0x05,0x07,0x2e,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximum3_f32 v5, m0, 0.5, m0 +// GFX13: v_maximum3_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximum3_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maximum3_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2e,0xd6,0x7e,0x82,0xad,0x01] + +v_maximum3_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximum3_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2e,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximum3_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_maximum3_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x2e,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_maximum3_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximum3_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2e,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximum3_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maximum3_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x2e,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maximum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maximum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x2e,0xd6,0xfd,0xd4,0x04,0x33] + +v_maximum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maximum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x2e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_minimum3_f16 v5, v1, v2, s3 +// GFX13: v_minimum3_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2f,0xd6,0x01,0x05,0x0e,0x00] + +v_minimum3_f16 v5, v255, s2, s105 +// GFX13: v_minimum3_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2f,0xd6,0xff,0x05,0xa4,0x01] + +v_minimum3_f16 v5, s1, v255, exec_hi +// GFX13: v_minimum3_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2f,0xd6,0x01,0xfe,0xff,0x01] + +v_minimum3_f16 v5, s105, s105, exec_lo +// GFX13: v_minimum3_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2f,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimum3_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minimum3_f16 v5, vcc_lo, 
ttmp15, v3 ; encoding: [0x05,0x00,0x2f,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimum3_f16 v5, vcc_hi, 0xaf12, v255 +// GFX13: v_minimum3_f16 v5, vcc_hi, 0xaf12, v255 ; encoding: [0x05,0x00,0x2f,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_minimum3_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimum3_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2f,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimum3_f16 v5, m0, 0.5, m0 +// GFX13: v_minimum3_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2f,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimum3_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minimum3_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2f,0xd6,0x7e,0x82,0xad,0x01] + +v_minimum3_f16 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimum3_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2f,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimum3_f16 v5, null, exec_lo, -|0xaf12| +// GFX13: v_minimum3_f16 v5, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x2f,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_minimum3_f16 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimum3_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2f,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimum3_f16 v5, 0.5, -m0, 0.5 +// GFX13: v_minimum3_f16 v5, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x2f,0xd6,0xf0,0xfa,0xc0,0x43] + +v_minimum3_f16 v5, -src_scc, |vcc_lo|, -1 +// GFX13: v_minimum3_f16 v5, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x2f,0xd6,0xfd,0xd4,0x04,0x23] + +v_minimum3_f16 v255, -|0xaf12|, -|vcc_hi|, null clamp +// GFX13: v_minimum3_f16 v255, -|0xaf12|, -|vcc_hi|, null clamp ; encoding: [0xff,0x83,0x2f,0xd6,0xff,0xd6,0xf0,0x61,0x12,0xaf,0x00,0x00] + +v_maximum3_f16 v5, v1, v2, s3 +// GFX13: v_maximum3_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x30,0xd6,0x01,0x05,0x0e,0x00] + +v_maximum3_f16 v5, v255, s2, s105 +// GFX13: v_maximum3_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x30,0xd6,0xff,0x05,0xa4,0x01] + +v_maximum3_f16 v5, s1, v255, exec_hi +// GFX13: v_maximum3_f16 v5, 
s1, v255, exec_hi ; encoding: [0x05,0x00,0x30,0xd6,0x01,0xfe,0xff,0x01] + +v_maximum3_f16 v5, s105, s105, exec_lo +// GFX13: v_maximum3_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x30,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximum3_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maximum3_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x30,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximum3_f16 v5, vcc_hi, 0xaf12, v255 +// GFX13: v_maximum3_f16 v5, vcc_hi, 0xaf12, v255 ; encoding: [0x05,0x00,0x30,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_maximum3_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximum3_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x30,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximum3_f16 v5, m0, 0.5, m0 +// GFX13: v_maximum3_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x30,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximum3_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maximum3_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x30,0xd6,0x7e,0x82,0xad,0x01] + +v_maximum3_f16 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximum3_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x30,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximum3_f16 v5, null, exec_lo, -|0xaf12| +// GFX13: v_maximum3_f16 v5, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x30,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_maximum3_f16 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximum3_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x30,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximum3_f16 v5, 0.5, -m0, 0.5 +// GFX13: v_maximum3_f16 v5, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x30,0xd6,0xf0,0xfa,0xc0,0x43] + +v_maximum3_f16 v5, -src_scc, |vcc_lo|, -1 +// GFX13: v_maximum3_f16 v5, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x30,0xd6,0xfd,0xd4,0x04,0x23] + +v_maximumminimum_f32 v5, v1, v2, s3 +// GFX13: v_maximumminimum_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6d,0xd6,0x01,0x05,0x0e,0x00] + +v_maximumminimum_f32 v5, v255, s2, s105 +// GFX13: v_maximumminimum_f32 v5, v255, s2, s105 
; encoding: [0x05,0x00,0x6d,0xd6,0xff,0x05,0xa4,0x01] + +v_maximumminimum_f32 v5, s1, v255, exec_hi +// GFX13: v_maximumminimum_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6d,0xd6,0x01,0xfe,0xff,0x01] + +v_maximumminimum_f32 v5, s105, s105, exec_lo +// GFX13: v_maximumminimum_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6d,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximumminimum_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maximumminimum_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximumminimum_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maximumminimum_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maximumminimum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximumminimum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6d,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximumminimum_f32 v5, m0, 0.5, m0 +// GFX13: v_maximumminimum_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximumminimum_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maximumminimum_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6d,0xd6,0x7e,0x82,0xad,0x01] + +v_maximumminimum_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximumminimum_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6d,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximumminimum_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_maximumminimum_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x6d,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_maximumminimum_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximumminimum_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6d,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximumminimum_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maximumminimum_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6d,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maximumminimum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maximumminimum_f32 v5, 
-src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6d,0xd6,0xfd,0xd4,0x04,0x33] + +v_maximumminimum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maximumminimum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6d,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_minimummaximum_f32 v5, v1, v2, s3 +// GFX13: v_minimummaximum_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6c,0xd6,0x01,0x05,0x0e,0x00] + +v_minimummaximum_f32 v5, v255, s2, s105 +// GFX13: v_minimummaximum_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6c,0xd6,0xff,0x05,0xa4,0x01] + +v_minimummaximum_f32 v5, s1, v255, exec_hi +// GFX13: v_minimummaximum_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6c,0xd6,0x01,0xfe,0xff,0x01] + +v_minimummaximum_f32 v5, s105, s105, exec_lo +// GFX13: v_minimummaximum_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6c,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimummaximum_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minimummaximum_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6c,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimummaximum_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minimummaximum_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6c,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minimummaximum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimummaximum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6c,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimummaximum_f32 v5, m0, 0.5, m0 +// GFX13: v_minimummaximum_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6c,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimummaximum_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minimummaximum_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6c,0xd6,0x7e,0x82,0xad,0x01] + +v_minimummaximum_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimummaximum_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6c,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimummaximum_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: 
v_minimummaximum_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x6c,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_minimummaximum_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimummaximum_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6c,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimummaximum_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minimummaximum_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6c,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minimummaximum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minimummaximum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6c,0xd6,0xfd,0xd4,0x04,0x33] + +v_minimummaximum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minimummaximum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6c,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_maximumminimum_f16 v5, v1, v2, s3 +// GFX13: v_maximumminimum_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6f,0xd6,0x01,0x05,0x0e,0x00] + +v_maximumminimum_f16 v5, v255, s2, s105 +// GFX13: v_maximumminimum_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6f,0xd6,0xff,0x05,0xa4,0x01] + +v_maximumminimum_f16 v5, s1, v255, exec_hi +// GFX13: v_maximumminimum_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6f,0xd6,0x01,0xfe,0xff,0x01] + +v_maximumminimum_f16 v5, s105, s105, exec_lo +// GFX13: v_maximumminimum_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6f,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximumminimum_f16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maximumminimum_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6f,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximumminimum_f16 v5, vcc_hi, 0xaf12, v255 +// GFX13: v_maximumminimum_f16 v5, vcc_hi, 0xaf12, v255 ; encoding: [0x05,0x00,0x6f,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_maximumminimum_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximumminimum_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6f,0xd6,0x7b,0xfa,0xed,0xe1] + 
+v_maximumminimum_f16 v5, m0, 0.5, m0 +// GFX13: v_maximumminimum_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6f,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximumminimum_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maximumminimum_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6f,0xd6,0x7e,0x82,0xad,0x01] + +v_maximumminimum_f16 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximumminimum_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6f,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximumminimum_f16 v5, null, exec_lo, -|0xaf12| +// GFX13: v_maximumminimum_f16 v5, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x6f,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_maximumminimum_f16 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximumminimum_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6f,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximumminimum_f16 v5, 0.5, -m0, 0.5 +// GFX13: v_maximumminimum_f16 v5, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x6f,0xd6,0xf0,0xfa,0xc0,0x43] + +v_maximumminimum_f16 v5, -src_scc, |vcc_lo|, -1 +// GFX13: v_maximumminimum_f16 v5, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x6f,0xd6,0xfd,0xd4,0x04,0x23] + +v_maximumminimum_f16 v255, -|0xaf12|, -|vcc_hi|, null clamp +// GFX13: v_maximumminimum_f16 v255, -|0xaf12|, -|vcc_hi|, null clamp ; encoding: [0xff,0x83,0x6f,0xd6,0xff,0xd6,0xf0,0x61,0x12,0xaf,0x00,0x00] + +v_minimummaximum_f16 v5, v1, v2, s3 +// GFX13: v_minimummaximum_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6e,0xd6,0x01,0x05,0x0e,0x00] + +v_minimummaximum_f16 v5, v255, s2, s105 +// GFX13: v_minimummaximum_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6e,0xd6,0xff,0x05,0xa4,0x01] + +v_minimummaximum_f16 v5, s1, v255, exec_hi +// GFX13: v_minimummaximum_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6e,0xd6,0x01,0xfe,0xff,0x01] + +v_minimummaximum_f16 v5, s105, s105, exec_lo +// GFX13: v_minimummaximum_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6e,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimummaximum_f16 v5, vcc_lo, ttmp15, 
v3 +// GFX13: v_minimummaximum_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimummaximum_f16 v5, vcc_hi, 0xaf12, v255 +// GFX13: v_minimummaximum_f16 v5, vcc_hi, 0xaf12, v255 ; encoding: [0x05,0x00,0x6e,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_minimummaximum_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimummaximum_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6e,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimummaximum_f16 v5, m0, 0.5, m0 +// GFX13: v_minimummaximum_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimummaximum_f16 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minimummaximum_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6e,0xd6,0x7e,0x82,0xad,0x01] + +v_minimummaximum_f16 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimummaximum_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6e,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimummaximum_f16 v5, null, exec_lo, -|0xaf12| +// GFX13: v_minimummaximum_f16 v5, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x6e,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_minimummaximum_f16 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimummaximum_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6e,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimummaximum_f16 v5, 0.5, -m0, 0.5 +// GFX13: v_minimummaximum_f16 v5, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x6e,0xd6,0xf0,0xfa,0xc0,0x43] + +v_minimummaximum_f16 v5, -src_scc, |vcc_lo|, -1 +// GFX13: v_minimummaximum_f16 v5, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x6e,0xd6,0xfd,0xd4,0x04,0x23] + +v_s_exp_f32 s5, s1 +// GFX13: v_s_exp_f32 s5, s1 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f32 s5, s105 +// GFX13: v_s_exp_f32 s5, s105 ; encoding: [0x05,0x00,0x80,0xd6,0x69,0x00,0x01,0x02] + +v_s_exp_f32 s5, vcc_lo +// GFX13: v_s_exp_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x80,0xd6,0x6a,0x00,0x01,0x02] + +v_s_exp_f32 s5, vcc_hi +// GFX13: 
v_s_exp_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x80,0xd6,0x6b,0x00,0x01,0x02] + +v_s_exp_f32 s5, ttmp15 +// GFX13: v_s_exp_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x80,0xd6,0x7b,0x00,0x01,0x02] + +v_s_exp_f32 s5, m0 +// GFX13: v_s_exp_f32 s5, m0 ; encoding: [0x05,0x00,0x80,0xd6,0x7d,0x00,0x01,0x02] + +v_s_exp_f32 s5, exec_lo +// GFX13: v_s_exp_f32 s5, exec_lo ; encoding: [0x05,0x00,0x80,0xd6,0x7e,0x00,0x01,0x02] + +v_s_exp_f32 s5, exec_hi +// GFX13: v_s_exp_f32 s5, exec_hi ; encoding: [0x05,0x00,0x80,0xd6,0x7f,0x00,0x01,0x02] + +v_s_exp_f32 s5, null +// GFX13: v_s_exp_f32 s5, null ; encoding: [0x05,0x00,0x80,0xd6,0x7c,0x00,0x01,0x02] + +v_s_exp_f32 s5, -1 +// GFX13: v_s_exp_f32 s5, -1 ; encoding: [0x05,0x00,0x80,0xd6,0xc1,0x00,0x01,0x02] + +v_s_exp_f32 s5, 0.5 +// GFX13: v_s_exp_f32 s5, 0.5 ; encoding: [0x05,0x00,0x80,0xd6,0xf0,0x00,0x01,0x02] + +v_s_exp_f32 s5, src_scc +// GFX13: v_s_exp_f32 s5, src_scc ; encoding: [0x05,0x00,0x80,0xd6,0xfd,0x00,0x01,0x02] + +v_s_exp_f32 s105, 0xaf123456 +// GFX13: v_s_exp_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x80,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_exp_f32 s5, -s1 +// GFX13: v_s_exp_f32 s5, -s1 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x22] + +v_s_exp_f32 s5, |s1| +// GFX13: v_s_exp_f32 s5, |s1| ; encoding: [0x05,0x01,0x80,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f32 s5, s1 clamp +// GFX13: v_s_exp_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x80,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f32 s5, s1 mul:2 +// GFX13: v_s_exp_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x0a] + +v_s_exp_f32 s5, s1 mul:4 +// GFX13: v_s_exp_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x12] + +v_s_exp_f32 s5, s1 div:2 +// GFX13: v_s_exp_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x1a] + +v_s_exp_f16 s5, s1 +// GFX13: v_s_exp_f16 s5, s1 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f16 s5, s105 +// GFX13: v_s_exp_f16 s5, s105 ; encoding: 
[0x05,0x00,0x81,0xd6,0x69,0x00,0x01,0x02] + +v_s_exp_f16 s5, vcc_lo +// GFX13: v_s_exp_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd6,0x6a,0x00,0x01,0x02] + +v_s_exp_f16 s5, vcc_hi +// GFX13: v_s_exp_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd6,0x6b,0x00,0x01,0x02] + +v_s_exp_f16 s5, ttmp15 +// GFX13: v_s_exp_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd6,0x7b,0x00,0x01,0x02] + +v_s_exp_f16 s5, m0 +// GFX13: v_s_exp_f16 s5, m0 ; encoding: [0x05,0x00,0x81,0xd6,0x7d,0x00,0x01,0x02] + +v_s_exp_f16 s5, exec_lo +// GFX13: v_s_exp_f16 s5, exec_lo ; encoding: [0x05,0x00,0x81,0xd6,0x7e,0x00,0x01,0x02] + +v_s_exp_f16 s5, exec_hi +// GFX13: v_s_exp_f16 s5, exec_hi ; encoding: [0x05,0x00,0x81,0xd6,0x7f,0x00,0x01,0x02] + +v_s_exp_f16 s5, null +// GFX13: v_s_exp_f16 s5, null ; encoding: [0x05,0x00,0x81,0xd6,0x7c,0x00,0x01,0x02] + +v_s_exp_f16 s5, -1 +// GFX13: v_s_exp_f16 s5, -1 ; encoding: [0x05,0x00,0x81,0xd6,0xc1,0x00,0x01,0x02] + +v_s_exp_f16 s5, 0.5 +// GFX13: v_s_exp_f16 s5, 0.5 ; encoding: [0x05,0x00,0x81,0xd6,0xf0,0x00,0x01,0x02] + +v_s_exp_f16 s5, src_scc +// GFX13: v_s_exp_f16 s5, src_scc ; encoding: [0x05,0x00,0x81,0xd6,0xfd,0x00,0x01,0x02] + +v_s_exp_f16 s105, 0xaf12 +// GFX13: v_s_exp_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x81,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_exp_f16 s5, -s1 +// GFX13: v_s_exp_f16 s5, -s1 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x22] + +v_s_exp_f16 s5, |s1| +// GFX13: v_s_exp_f16 s5, |s1| ; encoding: [0x05,0x01,0x81,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f16 s5, s1 clamp +// GFX13: v_s_exp_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x81,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f16 s5, s1 mul:2 +// GFX13: v_s_exp_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x0a] + +v_s_exp_f16 s5, s1 mul:4 +// GFX13: v_s_exp_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x12] + +v_s_exp_f16 s5, s1 div:2 +// GFX13: v_s_exp_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x1a] + 
+v_s_log_f32 s5, s1 +// GFX13: v_s_log_f32 s5, s1 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f32 s5, s105 +// GFX13: v_s_log_f32 s5, s105 ; encoding: [0x05,0x00,0x82,0xd6,0x69,0x00,0x01,0x02] + +v_s_log_f32 s5, vcc_lo +// GFX13: v_s_log_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x82,0xd6,0x6a,0x00,0x01,0x02] + +v_s_log_f32 s5, vcc_hi +// GFX13: v_s_log_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x82,0xd6,0x6b,0x00,0x01,0x02] + +v_s_log_f32 s5, ttmp15 +// GFX13: v_s_log_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x82,0xd6,0x7b,0x00,0x01,0x02] + +v_s_log_f32 s5, m0 +// GFX13: v_s_log_f32 s5, m0 ; encoding: [0x05,0x00,0x82,0xd6,0x7d,0x00,0x01,0x02] + +v_s_log_f32 s5, exec_lo +// GFX13: v_s_log_f32 s5, exec_lo ; encoding: [0x05,0x00,0x82,0xd6,0x7e,0x00,0x01,0x02] + +v_s_log_f32 s5, exec_hi +// GFX13: v_s_log_f32 s5, exec_hi ; encoding: [0x05,0x00,0x82,0xd6,0x7f,0x00,0x01,0x02] + +v_s_log_f32 s5, null +// GFX13: v_s_log_f32 s5, null ; encoding: [0x05,0x00,0x82,0xd6,0x7c,0x00,0x01,0x02] + +v_s_log_f32 s5, -1 +// GFX13: v_s_log_f32 s5, -1 ; encoding: [0x05,0x00,0x82,0xd6,0xc1,0x00,0x01,0x02] + +v_s_log_f32 s5, 0.5 +// GFX13: v_s_log_f32 s5, 0.5 ; encoding: [0x05,0x00,0x82,0xd6,0xf0,0x00,0x01,0x02] + +v_s_log_f32 s5, src_scc +// GFX13: v_s_log_f32 s5, src_scc ; encoding: [0x05,0x00,0x82,0xd6,0xfd,0x00,0x01,0x02] + +v_s_log_f32 s105, 0xaf123456 +// GFX13: v_s_log_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x82,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_log_f32 s5, -s1 +// GFX13: v_s_log_f32 s5, -s1 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x22] + +v_s_log_f32 s5, |s1| +// GFX13: v_s_log_f32 s5, |s1| ; encoding: [0x05,0x01,0x82,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f32 s5, s1 clamp +// GFX13: v_s_log_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x82,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f32 s5, s1 mul:2 +// GFX13: v_s_log_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x0a] + +v_s_log_f32 s5, s1 mul:4 +// GFX13: v_s_log_f32 s5, s1 mul:4 ; 
encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x12] + +v_s_log_f32 s5, s1 div:2 +// GFX13: v_s_log_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x1a] + +v_s_log_f16 s5, s1 +// GFX13: v_s_log_f16 s5, s1 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f16 s5, s105 +// GFX13: v_s_log_f16 s5, s105 ; encoding: [0x05,0x00,0x83,0xd6,0x69,0x00,0x01,0x02] + +v_s_log_f16 s5, vcc_lo +// GFX13: v_s_log_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x83,0xd6,0x6a,0x00,0x01,0x02] + +v_s_log_f16 s5, vcc_hi +// GFX13: v_s_log_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x83,0xd6,0x6b,0x00,0x01,0x02] + +v_s_log_f16 s5, ttmp15 +// GFX13: v_s_log_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x83,0xd6,0x7b,0x00,0x01,0x02] + +v_s_log_f16 s5, m0 +// GFX13: v_s_log_f16 s5, m0 ; encoding: [0x05,0x00,0x83,0xd6,0x7d,0x00,0x01,0x02] + +v_s_log_f16 s5, exec_lo +// GFX13: v_s_log_f16 s5, exec_lo ; encoding: [0x05,0x00,0x83,0xd6,0x7e,0x00,0x01,0x02] + +v_s_log_f16 s5, exec_hi +// GFX13: v_s_log_f16 s5, exec_hi ; encoding: [0x05,0x00,0x83,0xd6,0x7f,0x00,0x01,0x02] + +v_s_log_f16 s5, null +// GFX13: v_s_log_f16 s5, null ; encoding: [0x05,0x00,0x83,0xd6,0x7c,0x00,0x01,0x02] + +v_s_log_f16 s5, -1 +// GFX13: v_s_log_f16 s5, -1 ; encoding: [0x05,0x00,0x83,0xd6,0xc1,0x00,0x01,0x02] + +v_s_log_f16 s5, 0.5 +// GFX13: v_s_log_f16 s5, 0.5 ; encoding: [0x05,0x00,0x83,0xd6,0xf0,0x00,0x01,0x02] + +v_s_log_f16 s5, src_scc +// GFX13: v_s_log_f16 s5, src_scc ; encoding: [0x05,0x00,0x83,0xd6,0xfd,0x00,0x01,0x02] + +v_s_log_f16 s105, 0xaf12 +// GFX13: v_s_log_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x83,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_log_f16 s5, -s1 +// GFX13: v_s_log_f16 s5, -s1 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x22] + +v_s_log_f16 s5, |s1| +// GFX13: v_s_log_f16 s5, |s1| ; encoding: [0x05,0x01,0x83,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f16 s5, s1 clamp +// GFX13: v_s_log_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x83,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f16 s5, 
s1 mul:2 +// GFX13: v_s_log_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x0a] + +v_s_log_f16 s5, s1 mul:4 +// GFX13: v_s_log_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x12] + +v_s_log_f16 s5, s1 div:2 +// GFX13: v_s_log_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rcp_f32 s5, s1 +// GFX13: v_s_rcp_f32 s5, s1 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f32 s5, s105 +// GFX13: v_s_rcp_f32 s5, s105 ; encoding: [0x05,0x00,0x84,0xd6,0x69,0x00,0x01,0x02] + +v_s_rcp_f32 s5, vcc_lo +// GFX13: v_s_rcp_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x84,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rcp_f32 s5, vcc_hi +// GFX13: v_s_rcp_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x84,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rcp_f32 s5, ttmp15 +// GFX13: v_s_rcp_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x84,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rcp_f32 s5, m0 +// GFX13: v_s_rcp_f32 s5, m0 ; encoding: [0x05,0x00,0x84,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rcp_f32 s5, exec_lo +// GFX13: v_s_rcp_f32 s5, exec_lo ; encoding: [0x05,0x00,0x84,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rcp_f32 s5, exec_hi +// GFX13: v_s_rcp_f32 s5, exec_hi ; encoding: [0x05,0x00,0x84,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rcp_f32 s5, null +// GFX13: v_s_rcp_f32 s5, null ; encoding: [0x05,0x00,0x84,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rcp_f32 s5, -1 +// GFX13: v_s_rcp_f32 s5, -1 ; encoding: [0x05,0x00,0x84,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rcp_f32 s5, 0.5 +// GFX13: v_s_rcp_f32 s5, 0.5 ; encoding: [0x05,0x00,0x84,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rcp_f32 s5, src_scc +// GFX13: v_s_rcp_f32 s5, src_scc ; encoding: [0x05,0x00,0x84,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rcp_f32 s105, 0xaf123456 +// GFX13: v_s_rcp_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x84,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_rcp_f32 s5, -s1 +// GFX13: v_s_rcp_f32 s5, -s1 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x22] + +v_s_rcp_f32 s5, |s1| +// GFX13: v_s_rcp_f32 s5, |s1| ; encoding: 
[0x05,0x01,0x84,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f32 s5, s1 clamp +// GFX13: v_s_rcp_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x84,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f32 s5, s1 mul:2 +// GFX13: v_s_rcp_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rcp_f32 s5, s1 mul:4 +// GFX13: v_s_rcp_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x12] + +v_s_rcp_f32 s5, s1 div:2 +// GFX13: v_s_rcp_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rcp_f16 s5, s1 +// GFX13: v_s_rcp_f16 s5, s1 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f16 s5, s105 +// GFX13: v_s_rcp_f16 s5, s105 ; encoding: [0x05,0x00,0x85,0xd6,0x69,0x00,0x01,0x02] + +v_s_rcp_f16 s5, vcc_lo +// GFX13: v_s_rcp_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rcp_f16 s5, vcc_hi +// GFX13: v_s_rcp_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rcp_f16 s5, ttmp15 +// GFX13: v_s_rcp_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rcp_f16 s5, m0 +// GFX13: v_s_rcp_f16 s5, m0 ; encoding: [0x05,0x00,0x85,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rcp_f16 s5, exec_lo +// GFX13: v_s_rcp_f16 s5, exec_lo ; encoding: [0x05,0x00,0x85,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rcp_f16 s5, exec_hi +// GFX13: v_s_rcp_f16 s5, exec_hi ; encoding: [0x05,0x00,0x85,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rcp_f16 s5, null +// GFX13: v_s_rcp_f16 s5, null ; encoding: [0x05,0x00,0x85,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rcp_f16 s5, -1 +// GFX13: v_s_rcp_f16 s5, -1 ; encoding: [0x05,0x00,0x85,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rcp_f16 s5, 0.5 +// GFX13: v_s_rcp_f16 s5, 0.5 ; encoding: [0x05,0x00,0x85,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rcp_f16 s5, src_scc +// GFX13: v_s_rcp_f16 s5, src_scc ; encoding: [0x05,0x00,0x85,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rcp_f16 s105, 0xaf12 +// GFX13: v_s_rcp_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x85,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + 
+v_s_rcp_f16 s5, -s1 +// GFX13: v_s_rcp_f16 s5, -s1 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x22] + +v_s_rcp_f16 s5, |s1| +// GFX13: v_s_rcp_f16 s5, |s1| ; encoding: [0x05,0x01,0x85,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f16 s5, s1 clamp +// GFX13: v_s_rcp_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x85,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f16 s5, s1 mul:2 +// GFX13: v_s_rcp_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rcp_f16 s5, s1 mul:4 +// GFX13: v_s_rcp_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x12] + +v_s_rcp_f16 s5, s1 div:2 +// GFX13: v_s_rcp_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rsq_f32 s5, s1 +// GFX13: v_s_rsq_f32 s5, s1 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f32 s5, s105 +// GFX13: v_s_rsq_f32 s5, s105 ; encoding: [0x05,0x00,0x86,0xd6,0x69,0x00,0x01,0x02] + +v_s_rsq_f32 s5, vcc_lo +// GFX13: v_s_rsq_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rsq_f32 s5, vcc_hi +// GFX13: v_s_rsq_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rsq_f32 s5, ttmp15 +// GFX13: v_s_rsq_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x86,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rsq_f32 s5, m0 +// GFX13: v_s_rsq_f32 s5, m0 ; encoding: [0x05,0x00,0x86,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rsq_f32 s5, exec_lo +// GFX13: v_s_rsq_f32 s5, exec_lo ; encoding: [0x05,0x00,0x86,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rsq_f32 s5, exec_hi +// GFX13: v_s_rsq_f32 s5, exec_hi ; encoding: [0x05,0x00,0x86,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rsq_f32 s5, null +// GFX13: v_s_rsq_f32 s5, null ; encoding: [0x05,0x00,0x86,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rsq_f32 s5, -1 +// GFX13: v_s_rsq_f32 s5, -1 ; encoding: [0x05,0x00,0x86,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rsq_f32 s5, 0.5 +// GFX13: v_s_rsq_f32 s5, 0.5 ; encoding: [0x05,0x00,0x86,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rsq_f32 s5, src_scc +// GFX13: v_s_rsq_f32 s5, src_scc ; encoding: 
[0x05,0x00,0x86,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rsq_f32 s105, 0xaf123456 +// GFX13: v_s_rsq_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x86,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_rsq_f32 s5, -s1 +// GFX13: v_s_rsq_f32 s5, -s1 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x22] + +v_s_rsq_f32 s5, |s1| +// GFX13: v_s_rsq_f32 s5, |s1| ; encoding: [0x05,0x01,0x86,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f32 s5, s1 clamp +// GFX13: v_s_rsq_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x86,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f32 s5, s1 mul:2 +// GFX13: v_s_rsq_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rsq_f32 s5, s1 mul:4 +// GFX13: v_s_rsq_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x12] + +v_s_rsq_f32 s5, s1 div:2 +// GFX13: v_s_rsq_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rsq_f16 s5, s1 +// GFX13: v_s_rsq_f16 s5, s1 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f16 s5, s105 +// GFX13: v_s_rsq_f16 s5, s105 ; encoding: [0x05,0x00,0x87,0xd6,0x69,0x00,0x01,0x02] + +v_s_rsq_f16 s5, vcc_lo +// GFX13: v_s_rsq_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rsq_f16 s5, vcc_hi +// GFX13: v_s_rsq_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rsq_f16 s5, ttmp15 +// GFX13: v_s_rsq_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rsq_f16 s5, m0 +// GFX13: v_s_rsq_f16 s5, m0 ; encoding: [0x05,0x00,0x87,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rsq_f16 s5, exec_lo +// GFX13: v_s_rsq_f16 s5, exec_lo ; encoding: [0x05,0x00,0x87,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rsq_f16 s5, exec_hi +// GFX13: v_s_rsq_f16 s5, exec_hi ; encoding: [0x05,0x00,0x87,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rsq_f16 s5, null +// GFX13: v_s_rsq_f16 s5, null ; encoding: [0x05,0x00,0x87,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rsq_f16 s5, -1 +// GFX13: v_s_rsq_f16 s5, -1 ; encoding: [0x05,0x00,0x87,0xd6,0xc1,0x00,0x01,0x02] + 
+v_s_rsq_f16 s5, 0.5 +// GFX13: v_s_rsq_f16 s5, 0.5 ; encoding: [0x05,0x00,0x87,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rsq_f16 s5, src_scc +// GFX13: v_s_rsq_f16 s5, src_scc ; encoding: [0x05,0x00,0x87,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rsq_f16 s105, 0xaf12 +// GFX13: v_s_rsq_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x87,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_rsq_f16 s5, -s1 +// GFX13: v_s_rsq_f16 s5, -s1 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x22] + +v_s_rsq_f16 s5, |s1| +// GFX13: v_s_rsq_f16 s5, |s1| ; encoding: [0x05,0x01,0x87,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f16 s5, s1 clamp +// GFX13: v_s_rsq_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x87,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f16 s5, s1 mul:2 +// GFX13: v_s_rsq_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rsq_f16 s5, s1 mul:4 +// GFX13: v_s_rsq_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x12] + +v_s_rsq_f16 s5, s1 div:2 +// GFX13: v_s_rsq_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x1a] + +v_s_sqrt_f32 s5, s1 +// GFX13: v_s_sqrt_f32 s5, s1 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, s105 +// GFX13: v_s_sqrt_f32 s5, s105 ; encoding: [0x05,0x00,0x88,0xd6,0x69,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, vcc_lo +// GFX13: v_s_sqrt_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x88,0xd6,0x6a,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, vcc_hi +// GFX13: v_s_sqrt_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd6,0x6b,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, ttmp15 +// GFX13: v_s_sqrt_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd6,0x7b,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, m0 +// GFX13: v_s_sqrt_f32 s5, m0 ; encoding: [0x05,0x00,0x88,0xd6,0x7d,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, exec_lo +// GFX13: v_s_sqrt_f32 s5, exec_lo ; encoding: [0x05,0x00,0x88,0xd6,0x7e,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, exec_hi +// GFX13: v_s_sqrt_f32 s5, exec_hi ; encoding: [0x05,0x00,0x88,0xd6,0x7f,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, null +// GFX13: 
v_s_sqrt_f32 s5, null ; encoding: [0x05,0x00,0x88,0xd6,0x7c,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, -1 +// GFX13: v_s_sqrt_f32 s5, -1 ; encoding: [0x05,0x00,0x88,0xd6,0xc1,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, 0.5 +// GFX13: v_s_sqrt_f32 s5, 0.5 ; encoding: [0x05,0x00,0x88,0xd6,0xf0,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, src_scc +// GFX13: v_s_sqrt_f32 s5, src_scc ; encoding: [0x05,0x00,0x88,0xd6,0xfd,0x00,0x01,0x02] + +v_s_sqrt_f32 s105, 0xaf123456 +// GFX13: v_s_sqrt_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x88,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_sqrt_f32 s5, -s1 +// GFX13: v_s_sqrt_f32 s5, -s1 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x22] + +v_s_sqrt_f32 s5, |s1| +// GFX13: v_s_sqrt_f32 s5, |s1| ; encoding: [0x05,0x01,0x88,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, s1 clamp +// GFX13: v_s_sqrt_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x88,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, s1 mul:2 +// GFX13: v_s_sqrt_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x0a] + +v_s_sqrt_f32 s5, s1 mul:4 +// GFX13: v_s_sqrt_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x12] + +v_s_sqrt_f32 s5, s1 div:2 +// GFX13: v_s_sqrt_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x1a] + +v_s_sqrt_f16 s5, s1 +// GFX13: v_s_sqrt_f16 s5, s1 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, s105 +// GFX13: v_s_sqrt_f16 s5, s105 ; encoding: [0x05,0x00,0x89,0xd6,0x69,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, vcc_lo +// GFX13: v_s_sqrt_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x89,0xd6,0x6a,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, vcc_hi +// GFX13: v_s_sqrt_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x89,0xd6,0x6b,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, ttmp15 +// GFX13: v_s_sqrt_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x89,0xd6,0x7b,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, m0 +// GFX13: v_s_sqrt_f16 s5, m0 ; encoding: [0x05,0x00,0x89,0xd6,0x7d,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, exec_lo +// GFX13: v_s_sqrt_f16 s5, 
exec_lo ; encoding: [0x05,0x00,0x89,0xd6,0x7e,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, exec_hi +// GFX13: v_s_sqrt_f16 s5, exec_hi ; encoding: [0x05,0x00,0x89,0xd6,0x7f,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, null +// GFX13: v_s_sqrt_f16 s5, null ; encoding: [0x05,0x00,0x89,0xd6,0x7c,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, -1 +// GFX13: v_s_sqrt_f16 s5, -1 ; encoding: [0x05,0x00,0x89,0xd6,0xc1,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, 0.5 +// GFX13: v_s_sqrt_f16 s5, 0.5 ; encoding: [0x05,0x00,0x89,0xd6,0xf0,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, src_scc +// GFX13: v_s_sqrt_f16 s5, src_scc ; encoding: [0x05,0x00,0x89,0xd6,0xfd,0x00,0x01,0x02] + +v_s_sqrt_f16 s105, 0xaf12 +// GFX13: v_s_sqrt_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x89,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_sqrt_f16 s5, -s1 +// GFX13: v_s_sqrt_f16 s5, -s1 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x22] + +v_s_sqrt_f16 s5, |s1| +// GFX13: v_s_sqrt_f16 s5, |s1| ; encoding: [0x05,0x01,0x89,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, s1 clamp +// GFX13: v_s_sqrt_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x89,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, s1 mul:2 +// GFX13: v_s_sqrt_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x0a] + +v_s_sqrt_f16 s5, s1 mul:4 +// GFX13: v_s_sqrt_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x12] + +v_s_sqrt_f16 s5, s1 div:2 +// GFX13: v_s_sqrt_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x1a] + +v_cvt_sr_pk_f16_f32 v5, v1, v2, s3 +// GFX13: v_cvt_sr_pk_f16_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0xc3,0xd6,0x01,0x05,0x0e,0x00] + +v_cvt_sr_pk_f16_f32 v5, v255, s2, s105 +// GFX13: v_cvt_sr_pk_f16_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0xc3,0xd6,0xff,0x05,0xa4,0x01] + +v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi +// GFX13: v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0xc3,0xd6,0x01,0xfe,0xff,0x01] + +v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo +// GFX13: v_cvt_sr_pk_f16_f32 v5, s105, s105, 
exec_lo ; encoding: [0x05,0x00,0xc3,0xd6,0x69,0xd2,0xf8,0x01] + +v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0xc3,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0xc3,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 +// GFX13: v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0xc3,0xd6,0x7b,0xfa,0xed,0x61] + +v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0 +// GFX13: v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0xc3,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0xc3,0xd6,0x7e,0x82,0xad,0x01] + +v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo +// GFX13: v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0xc3,0xd6,0x7f,0xf8,0xa8,0x21] + +v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0xc3,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc +// GFX13: v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0xc3,0xd6,0xc1,0xfe,0xf4,0x43] + +v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0xc3,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0xc3,0xd6,0xfd,0xd4,0x04,0x33] + +v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0xc3,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cvt_pk_f16_f32 v5, v1, v2 +// GFX13: 
v_cvt_pk_f16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x74,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_f16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_f16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x74,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_f16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_f16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x74,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_f16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_f16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x74,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_f16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_f16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x74,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_f16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_f16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x74,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_f16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_f16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x74,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_f16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_f16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x74,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_f16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_f16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x74,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_f16_f32 v5, exec_hi, null +// GFX13: v_cvt_pk_f16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x74,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_f16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_f16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x74,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_f16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_f16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x74,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 +// GFX13: v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x74,0xd7,0xf0,0xfa,0x00,0x0a] + +v_cvt_pk_f16_f32 v5, src_scc, vcc_lo mul:4 +// GFX13: v_cvt_pk_f16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x74,0xd7,0xfd,0xd4,0x00,0x12] + +v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 +// GFX13: v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi 
clamp div:2 ; encoding: [0xff,0x81,0x74,0xd7,0xff,0xd6,0x00,0x3a,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0xc2,0xd6,0x01,0x05,0x0e,0x00] + +v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0xc2,0xd6,0xff,0x05,0xa4,0x01] + +v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi +// GFX13: v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0xc2,0xd6,0x01,0xfe,0xff,0x01] + +v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo +// GFX13: v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0xc2,0xd6,0x69,0xd2,0xf8,0x01] + +v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0xc2,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0xc2,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0xc2,0xd6,0x7b,0xfa,0xed,0x61] + +v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0xc2,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0xc2,0xd6,0x7e,0x82,0xad,0x01] + +v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0xc2,0xd6,0x7f,0xf8,0xa8,0x21] + +v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0xc2,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: 
[0x05,0x02,0xc2,0xd6,0xc1,0xfe,0xf4,0x43] + +v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0xc2,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0xc2,0xd6,0xfd,0xd4,0x04,0x33] + +v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0xc2,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 +// W32: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xa3,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 100.0 +// W32: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xa3,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 +// W32: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 100.0 +// W32: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 ; encoding: [0x0a,0x00,0xaf,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 0x42c80000 ; encoding: 
[0x0a,0x00,0xaf,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0xb2,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb2,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0x9b,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 100.0 +// W32: 
v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9b,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0x9e,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9e,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xae,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xae,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 
100.0 +// W32: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb1,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb1,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x09,0x22,0x04] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xa2,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xa2,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 +// W32: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0x9d,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 100.0 +// W32: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0x9d,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0x9c,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9c,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 ; encoding: 
[0x14,0x00,0x95,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x95,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x96,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x96,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], v8 +// W32: v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], v8 ; encoding: [0x14,0x00,0x97,0xd6,0x06,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], 100.0 +// W32: v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], 0x42c80000 ; encoding: [0x14,0x00,0x97,0xd6,0x06,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x98,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x98,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_fp6_f16 v[20:25], 
v[10:25], v8 ; encoding: [0x14,0x00,0x99,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x99,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], v8 +// W32: v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], v8 ; encoding: [0x14,0x00,0x9a,0xd6,0x06,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], 100.0 +// W32: v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], 0x42c80000 ; encoding: [0x14,0x00,0x9a,0xd6,0x06,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa4,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa4,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa5,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa5,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f32 
v[0:5], v[6:37], v38, v39 +// W32: v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 ; encoding: [0x00,0x00,0xa6,0xd6,0x06,0x4d,0x9e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa6,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa7,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa7,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa8,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa8,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 +// W32: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 ; encoding: [0x00,0x00,0xa9,0xd6,0x06,0x4d,0x9e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], s3, 0x42c80000 ; encoding: 
[0x00,0x00,0xa9,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3.s new file mode 100644 index 0000000000000..601fe2e3db9d2 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3.s @@ -0,0 +1,8195 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W32,GFX13-ASM,W32-ASM %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W32,GFX13-DIS,W32-DIS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W64,GFX13-ASM,W64-ASM %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W64,GFX13-DIS,W64-DIS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add3_u32 v5, v1, v2, s3 +// GFX13: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x0e,0x00] + +v_add3_u32 v5, v255, s2, s105 +// GFX13: v_add3_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0x05,0xa4,0x01] + +v_add3_u32 v5, s1, v255, exec_hi 
+// GFX13: v_add3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0xfe,0xff,0x01] + +v_add3_u32 v5, s105, s105, exec_lo +// GFX13: v_add3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0xf8,0x01] + +v_add3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_add3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x0c,0x04] + +v_add3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_add3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_add3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_add3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0xed,0x01] + +v_add3_u32 v5, m0, 0.5, m0 +// GFX13: v_add3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0xf5,0x01] + +v_add3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_add3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0xad,0x01] + +v_add3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_add3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0xa8,0x01] + +v_add3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_add3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_add3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_add3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0xf4,0x03] + +v_add3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_add3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0xc0,0x03] + +v_add3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_add3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x04,0x03] + +v_add3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_add3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x6d,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_add_co_u32 v5, s6, v1, v2 +// W32: v_add_co_u32 v5, s6, v1, v2 ; encoding: [0x05,0x06,0x0f,0xd7,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid 
operand for instruction + +v_add_co_u32 v5, s6, v255, v255 +// W32: v_add_co_u32 v5, s6, v255, v255 ; encoding: [0x05,0x06,0x0f,0xd7,0xff,0xff,0x03,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, s1, s2 +// W32: v_add_co_u32 v5, s6, s1, s2 ; encoding: [0x05,0x06,0x0f,0xd7,0x01,0x04,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, s105, s105 +// W32: v_add_co_u32 v5, s6, s105, s105 ; encoding: [0x05,0x06,0x0f,0xd7,0x69,0xd2,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, vcc_lo, ttmp15 +// W32: v_add_co_u32 v5, s6, vcc_lo, ttmp15 ; encoding: [0x05,0x06,0x0f,0xd7,0x6a,0xf6,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, vcc_hi, 0xaf123456 +// W32: v_add_co_u32 v5, s6, vcc_hi, 0xaf123456 ; encoding: [0x05,0x06,0x0f,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, ttmp15, src_scc +// W32: v_add_co_u32 v5, s6, ttmp15, src_scc ; encoding: [0x05,0x06,0x0f,0xd7,0x7b,0xfa,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, m0, 0.5 +// W32: v_add_co_u32 v5, s6, m0, 0.5 ; encoding: [0x05,0x06,0x0f,0xd7,0x7d,0xe0,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, exec_lo, -1 +// W32: v_add_co_u32 v5, s6, exec_lo, -1 ; encoding: [0x05,0x06,0x0f,0xd7,0x7e,0x82,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s6, exec_hi, null +// W32: v_add_co_u32 v5, s6, exec_hi, null ; encoding: [0x05,0x06,0x0f,0xd7,0x7f,0xf8,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s105, null, exec_lo +// W32: v_add_co_u32 v5, s105, null, exec_lo ; encoding: 
[0x05,0x69,0x0f,0xd7,0x7c,0xfc,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, vcc_lo, -1, exec_hi +// W32: v_add_co_u32 v5, vcc_lo, -1, exec_hi ; encoding: [0x05,0x6a,0x0f,0xd7,0xc1,0xfe,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, vcc_hi, 0.5, m0 +// W32: v_add_co_u32 v5, vcc_hi, 0.5, m0 ; encoding: [0x05,0x6b,0x0f,0xd7,0xf0,0xfa,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, ttmp15, src_scc, vcc_lo +// W32: v_add_co_u32 v5, ttmp15, src_scc, vcc_lo ; encoding: [0x05,0x7b,0x0f,0xd7,0xfd,0xd4,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_add_co_u32 v5, s[12:13], v1, v2 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], v1, v2 ; encoding: [0x05,0x0c,0x0f,0xd7,0x01,0x05,0x02,0x02] + +v_add_co_u32 v5, s[12:13], v255, v255 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], v255, v255 ; encoding: [0x05,0x0c,0x0f,0xd7,0xff,0xff,0x03,0x02] + +v_add_co_u32 v5, s[12:13], s1, s2 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], s1, s2 ; encoding: [0x05,0x0c,0x0f,0xd7,0x01,0x04,0x00,0x02] + +v_add_co_u32 v5, s[12:13], s105, s105 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], s105, s105 ; encoding: [0x05,0x0c,0x0f,0xd7,0x69,0xd2,0x00,0x02] + +v_add_co_u32 v5, s[12:13], vcc_lo, ttmp15 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], vcc_lo, ttmp15 ; encoding: [0x05,0x0c,0x0f,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 ; encoding: 
[0x05,0x0c,0x0f,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_add_co_u32 v5, s[12:13], ttmp15, src_scc +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], ttmp15, src_scc ; encoding: [0x05,0x0c,0x0f,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_co_u32 v5, s[12:13], m0, 0.5 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], m0, 0.5 ; encoding: [0x05,0x0c,0x0f,0xd7,0x7d,0xe0,0x01,0x02] + +v_add_co_u32 v5, s[12:13], exec_lo, -1 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], exec_lo, -1 ; encoding: [0x05,0x0c,0x0f,0xd7,0x7e,0x82,0x01,0x02] + +v_add_co_u32 v5, s[12:13], exec_hi, null +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], exec_hi, null ; encoding: [0x05,0x0c,0x0f,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_co_u32 v5, s[12:13], null, exec_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[12:13], null, exec_lo ; encoding: [0x05,0x0c,0x0f,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_co_u32 v5, s[104:105], -1, exec_hi +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, s[104:105], -1, exec_hi ; encoding: [0x05,0x68,0x0f,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_co_u32 v5, vcc, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// W64: v_add_co_u32 v5, vcc, 0.5, m0 ; encoding: [0x05,0x6a,0x0f,0xd7,0xf0,0xfa,0x00,0x02] + +v_add_co_u32 v5, ttmp[14:15], src_scc, vcc_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_add_co_u32 v5, ttmp[14:15], src_scc, vcc_lo ; encoding: [0x05,0x7a,0x0f,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_co_u32 v255, null, 0xaf123456, vcc_hi clamp +// GFX13: v_add_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x0f,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_add_lshl_u32 v5, v1, 
v2, s3 +// GFX13: v_add_lshl_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x47,0xd7,0x01,0x05,0x0e,0x00] + +v_add_lshl_u32 v5, v255, s2, s105 +// GFX13: v_add_lshl_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x47,0xd7,0xff,0x05,0xa4,0x01] + +v_add_lshl_u32 v5, s1, v255, exec_hi +// GFX13: v_add_lshl_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x47,0xd7,0x01,0xfe,0xff,0x01] + +v_add_lshl_u32 v5, s105, s105, exec_lo +// GFX13: v_add_lshl_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x47,0xd7,0x69,0xd2,0xf8,0x01] + +v_add_lshl_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_add_lshl_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x47,0xd7,0x6a,0xf6,0x0c,0x04] + +v_add_lshl_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_add_lshl_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x47,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_add_lshl_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_add_lshl_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x47,0xd7,0x7b,0xfa,0xed,0x01] + +v_add_lshl_u32 v5, m0, 0.5, m0 +// GFX13: v_add_lshl_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x47,0xd7,0x7d,0xe0,0xf5,0x01] + +v_add_lshl_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_add_lshl_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x47,0xd7,0x7e,0x82,0xad,0x01] + +v_add_lshl_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_add_lshl_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x47,0xd7,0x7f,0xf8,0xa8,0x01] + +v_add_lshl_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_add_lshl_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x47,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_add_lshl_u32 v5, -1, exec_hi, src_scc +// GFX13: v_add_lshl_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x47,0xd7,0xc1,0xfe,0xf4,0x03] + +v_add_lshl_u32 v5, 0.5, m0, 0.5 +// GFX13: v_add_lshl_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x47,0xd7,0xf0,0xfa,0xc0,0x03] + +v_add_lshl_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_add_lshl_u32 v5, src_scc, vcc_lo, -1 ; encoding: 
[0x05,0x00,0x47,0xd7,0xfd,0xd4,0x04,0x03] + +v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_add_nc_i16 v5.l, v1.l, v2.l +// GFX13: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x02] + +v_add_nc_i16 v5.l, v255.l, v255.l +// GFX13: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x02] + +v_add_nc_i16 v5.l, s1, s2 +// GFX13: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x02] + +v_add_nc_i16 v5.l, s105, s105 +// GFX13: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x02] + +v_add_nc_i16 v5.l, vcc_lo, ttmp15 +// GFX13: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_nc_i16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_add_nc_i16 v5.l, ttmp15, src_scc +// GFX13: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_nc_i16 v5.l, m0, 0.5 +// GFX13-ASM: v_add_nc_i16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_i16 v5.l, exec_lo, -1 +// GFX13: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x02] + +v_add_nc_i16 v5.l, exec_hi, null +// GFX13: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] +// GFX13: v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_nc_i16 
v5.l, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_add_nc_i32 v5, v1, v2 +// GFX13: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x7f,0xd7,0x01,0x05,0x02,0x02] + +v_add_nc_i32 v5, v255, v255 +// GFX13: v_add_nc_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x7f,0xd7,0xff,0xff,0x03,0x02] + +v_add_nc_i32 v5, s1, s2 +// GFX13: v_add_nc_i32 v5, s1, s2 ; encoding: [0x05,0x00,0x7f,0xd7,0x01,0x04,0x00,0x02] + +v_add_nc_i32 v5, s105, s105 +// GFX13: v_add_nc_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x7f,0xd7,0x69,0xd2,0x00,0x02] + +v_add_nc_i32 v5, vcc_lo, ttmp15 +// GFX13: v_add_nc_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x7f,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_nc_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_add_nc_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x7f,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_add_nc_i32 v5, ttmp15, src_scc +// GFX13: v_add_nc_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x7f,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_nc_i32 v5, m0, 0.5 +// GFX13: v_add_nc_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x7f,0xd7,0x7d,0xe0,0x01,0x02] + +v_add_nc_i32 v5, exec_lo, -1 +// GFX13: v_add_nc_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x7f,0xd7,0x7e,0x82,0x01,0x02] + +v_add_nc_i32 v5, exec_hi, null +// GFX13: v_add_nc_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x7f,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_nc_i32 v5, null, exec_lo +// GFX13: v_add_nc_i32 v5, null, exec_lo 
; encoding: [0x05,0x00,0x7f,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_nc_i32 v5, -1, exec_hi +// GFX13: v_add_nc_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x7f,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_nc_i32 v5, 0.5, m0 +// GFX13: v_add_nc_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x7f,0xd7,0xf0,0xfa,0x00,0x02] + +v_add_nc_i32 v5, src_scc, vcc_lo +// GFX13: v_add_nc_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x7f,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp +// GFX13: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x7f,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_add_nc_u16 v5.l, v1.l, v2.l +// GFX13: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x02] + +v_add_nc_u16 v5.l, v255.l, v255.l +// GFX13: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x02] + +v_add_nc_u16 v5.l, s1, s2 +// GFX13: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x02] + +v_add_nc_u16 v5.l, s105, s105 +// GFX13: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x02] + +v_add_nc_u16 v5.l, vcc_lo, ttmp15 +// GFX13: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x02] + +v_add_nc_u16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_add_nc_u16 v5.l, ttmp15, src_scc +// GFX13: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x02] + +v_add_nc_u16 v5.l, m0, 0.5 +// GFX13-ASM: v_add_nc_u16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_u16 v5.l, exec_lo, -1 +// GFX13: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x02] + +v_add_nc_u16 v5.l, exec_hi, null +// GFX13: v_add_nc_u16 v5.l, exec_hi, 
null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x02] + +v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] +// GFX13: v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x02] + +v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x02] + +v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x02] + +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_alignbit_b32 v5, v1, v2, s3 +// GFX13: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00] + +v_alignbit_b32 v5, v255, s2, s3 +// GFX13: v_alignbit_b32 v5, v255, s2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0xff,0x05,0x0c,0x00] + +v_alignbit_b32 v5, s1, v255, s3 +// GFX13: v_alignbit_b32 v5, s1, v255, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0xfe,0x0f,0x00] + +v_alignbit_b32 v5, s105, s105, s105 +// GFX13: v_alignbit_b32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x16,0xd6,0x69,0xd2,0xa4,0x01] + +v_alignbit_b32 v5, vcc_lo, ttmp15, v3.l +// GFX13: v_alignbit_b32 v5, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x16,0xd6,0x6a,0xf6,0x0c,0x04] + +v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255.l +// GFX13: v_alignbit_b32 v5, vcc_hi, 0xaf123456, v255.l ; encoding: [0x05,0x00,0x16,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_alignbit_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_alignbit_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: 
[0x05,0x00,0x16,0xd6,0x7b,0xfa,0xed,0x01] + +v_alignbit_b32 v5, m0, 0.5, exec_lo +// GFX13: v_alignbit_b32 v5, m0, 0.5, exec_lo ; encoding: [0x05,0x00,0x16,0xd6,0x7d,0xe0,0xf9,0x01] + +v_alignbit_b32 v5, exec_lo, -1, m0 +// GFX13: v_alignbit_b32 v5, exec_lo, -1, m0 ; encoding: [0x05,0x00,0x16,0xd6,0x7e,0x82,0xf5,0x01] + +v_alignbit_b32 v5, exec_hi, null, vcc_hi +// GFX13: v_alignbit_b32 v5, exec_hi, null, vcc_hi ; encoding: [0x05,0x00,0x16,0xd6,0x7f,0xf8,0xac,0x01] + +v_alignbit_b32 v5, null, exec_lo, vcc_lo +// GFX13: v_alignbit_b32 v5, null, exec_lo, vcc_lo ; encoding: [0x05,0x00,0x16,0xd6,0x7c,0xfc,0xa8,0x01] + +v_alignbit_b32 v5, -1, exec_hi, src_scc +// GFX13: v_alignbit_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x16,0xd6,0xc1,0xfe,0xf4,0x03] + +v_alignbit_b32 v5, 0.5, m0, exec_hi +// GFX13: v_alignbit_b32 v5, 0.5, m0, exec_hi ; encoding: [0x05,0x00,0x16,0xd6,0xf0,0xfa,0xfc,0x01] + +v_alignbit_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_alignbit_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x16,0xd6,0xfd,0xd4,0x04,0x03] + +v_alignbit_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_alignbit_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x16,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_alignbyte_b32 v5, v1, v2, s3 +// GFX13: v_alignbyte_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x17,0xd6,0x01,0x05,0x0e,0x00] + +v_alignbyte_b32 v5, v255, s2, s3 +// GFX13: v_alignbyte_b32 v5, v255, s2, s3 ; encoding: [0x05,0x00,0x17,0xd6,0xff,0x05,0x0c,0x00] + +v_alignbyte_b32 v5, s1, v255, s3 +// GFX13: v_alignbyte_b32 v5, s1, v255, s3 ; encoding: [0x05,0x00,0x17,0xd6,0x01,0xfe,0x0f,0x00] + +v_alignbyte_b32 v5, s105, s105, s105 +// GFX13: v_alignbyte_b32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x17,0xd6,0x69,0xd2,0xa4,0x01] + +v_alignbyte_b32 v5, vcc_lo, ttmp15, v3.l +// GFX13: v_alignbyte_b32 v5, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x17,0xd6,0x6a,0xf6,0x0c,0x04] + +v_alignbyte_b32 v5, vcc_hi, 0xaf123456, v255.l +// GFX13: 
v_alignbyte_b32 v5, vcc_hi, 0xaf123456, v255.l ; encoding: [0x05,0x00,0x17,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_alignbyte_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_alignbyte_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x17,0xd6,0x7b,0xfa,0xed,0x01] + +v_alignbyte_b32 v5, m0, 0.5, exec_lo +// GFX13: v_alignbyte_b32 v5, m0, 0.5, exec_lo ; encoding: [0x05,0x00,0x17,0xd6,0x7d,0xe0,0xf9,0x01] + +v_alignbyte_b32 v5, exec_lo, -1, m0 +// GFX13: v_alignbyte_b32 v5, exec_lo, -1, m0 ; encoding: [0x05,0x00,0x17,0xd6,0x7e,0x82,0xf5,0x01] + +v_alignbyte_b32 v5, exec_hi, null, vcc_hi +// GFX13: v_alignbyte_b32 v5, exec_hi, null, vcc_hi ; encoding: [0x05,0x00,0x17,0xd6,0x7f,0xf8,0xac,0x01] + +v_alignbyte_b32 v5, null, exec_lo, vcc_lo +// GFX13: v_alignbyte_b32 v5, null, exec_lo, vcc_lo ; encoding: [0x05,0x00,0x17,0xd6,0x7c,0xfc,0xa8,0x01] + +v_alignbyte_b32 v5, -1, exec_hi, src_scc +// GFX13: v_alignbyte_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x17,0xd6,0xc1,0xfe,0xf4,0x03] + +v_alignbyte_b32 v5, 0.5, m0, exec_hi +// GFX13: v_alignbyte_b32 v5, 0.5, m0, exec_hi ; encoding: [0x05,0x00,0x17,0xd6,0xf0,0xfa,0xfc,0x01] + +v_alignbyte_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_alignbyte_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x17,0xd6,0xfd,0xd4,0x04,0x03] + +v_alignbyte_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_alignbyte_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x17,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_and_b16 v5.l, v1.l, v2.l +// GFX13: v_and_b16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x67,0xd7,0x01,0x05,0x02,0x02] + +v_and_b16 v5.l, v255.l, v255.l +// GFX13: v_and_b16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x67,0xd7,0xff,0xff,0x03,0x02] + +v_and_b16 v5.l, s1, s2 +// GFX13: v_and_b16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x67,0xd7,0x01,0x04,0x00,0x02] + +v_and_b16 v5.l, s105, s105 +// GFX13: v_and_b16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x67,0xd7,0x69,0xd2,0x00,0x02] + +v_and_b16 v5.l, vcc_lo, 
ttmp15 +// GFX13: v_and_b16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x67,0xd7,0x6a,0xf6,0x00,0x02] + +v_and_b16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_and_b16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x67,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_and_b16 v5.l, ttmp15, src_scc +// GFX13: v_and_b16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x67,0xd7,0x7b,0xfa,0x01,0x02] + +v_and_b16 v5.l, m0, 0.5 +// GFX13-ASM: v_and_b16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x67,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_and_b16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x67,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_and_b16 v5.l, exec_lo, -1 +// GFX13: v_and_b16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x67,0xd7,0x7e,0x82,0x01,0x02] + +v_and_b16 v5.l, exec_hi, null +// GFX13: v_and_b16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x67,0xd7,0x7f,0xf8,0x00,0x02] + +v_and_b16 v5.l, null, exec_lo +// GFX13: v_and_b16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x67,0xd7,0x7c,0xfc,0x00,0x02] + +v_and_b16 v5.l, -1, exec_hi +// GFX13: v_and_b16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x67,0xd7,0xc1,0xfe,0x00,0x02] + +v_and_b16 v5.l, 0.5, m0 +// GFX13-ASM: v_and_b16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x67,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_and_b16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x67,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_and_b16 v5.l, src_scc, vcc_lo +// GFX13: v_and_b16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x67,0xd7,0xfd,0xd4,0x00,0x02] + +v_and_b16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_and_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x67,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_and_or_b32 v5, v1, v2, s3 +// GFX13: v_and_or_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x71,0xd7,0x01,0x05,0x0e,0x00] + +v_and_or_b32 v5, v255, s2, s105 +// GFX13: v_and_or_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x71,0xd7,0xff,0x05,0xa4,0x01] + +v_and_or_b32 v5, s1, v255, exec_hi +// GFX13: v_and_or_b32 v5, s1, v255, exec_hi ; encoding: 
[0x05,0x00,0x71,0xd7,0x01,0xfe,0xff,0x01] + +v_and_or_b32 v5, s105, s105, exec_lo +// GFX13: v_and_or_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x71,0xd7,0x69,0xd2,0xf8,0x01] + +v_and_or_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_and_or_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x71,0xd7,0x6a,0xf6,0x0c,0x04] + +v_and_or_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_and_or_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x71,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_and_or_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_and_or_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x71,0xd7,0x7b,0xfa,0xed,0x01] + +v_and_or_b32 v5, m0, 0.5, m0 +// GFX13: v_and_or_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x71,0xd7,0x7d,0xe0,0xf5,0x01] + +v_and_or_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_and_or_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x71,0xd7,0x7e,0x82,0xad,0x01] + +v_and_or_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_and_or_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x71,0xd7,0x7f,0xf8,0xa8,0x01] + +v_and_or_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_and_or_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x71,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_and_or_b32 v5, -1, exec_hi, src_scc +// GFX13: v_and_or_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x71,0xd7,0xc1,0xfe,0xf4,0x03] + +v_and_or_b32 v5, 0.5, m0, 0.5 +// GFX13: v_and_or_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x71,0xd7,0xf0,0xfa,0xc0,0x03] + +v_and_or_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_and_or_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x71,0xd7,0xfd,0xd4,0x04,0x03] + +v_and_or_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_and_or_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x71,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] +// GFX13: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x90,0xd6,0x02,0x07,0x12,0x04] + 
+v_ashr_pk_i8_i32 v2, s4, 4, v2 +// GFX13: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x08,0x09,0x04] + +v_ashr_pk_i8_i32 v2, s4, v7, v8 +// GFX13: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x0e,0x22,0x04] + +v_ashr_pk_i8_i32 v2, v4, 0, 1 +// GFX13: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x01,0x05,0x02] + +v_ashr_pk_i8_i32 v2, v4, 3, s2 +// GFX13: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x07,0x09,0x00] + +v_ashr_pk_i8_i32 v2, v4, v7, 12345 +// GFX13: v_ashr_pk_i8_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x90,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] +// GFX13: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x91,0xd6,0x02,0x07,0x12,0x04] + +v_ashr_pk_u8_i32 v2, s4, 4, v2 +// GFX13: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x08,0x09,0x04] + +v_ashr_pk_u8_i32 v2, s4, v7, v8 +// GFX13: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x0e,0x22,0x04] + +v_ashr_pk_u8_i32 v2, v4, 0, 1 +// GFX13: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x01,0x05,0x02] + +v_ashr_pk_u8_i32 v2, v4, 3, s2 +// GFX13: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x07,0x09,0x00] + +v_ashr_pk_u8_i32 v2, v4, v7, 12345 +// GFX13: v_ashr_pk_u8_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x91,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_ashrrev_i16 v5.l, v1.l, v2.l +// GFX13: v_ashrrev_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x08,0xd7,0x01,0x05,0x02,0x02] + +v_ashrrev_i16 v5.l, v255.l, v255.l +// GFX13: v_ashrrev_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x08,0xd7,0xff,0xff,0x03,0x02] + +v_ashrrev_i16 v5.l, s1, s2 +// GFX13: v_ashrrev_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x08,0xd7,0x01,0x04,0x00,0x02] + +v_ashrrev_i16 v5.l, s105, s105 +// GFX13: v_ashrrev_i16 v5.l, s105, s105 ; encoding: 
[0x05,0x00,0x08,0xd7,0x69,0xd2,0x00,0x02] + +v_ashrrev_i16 v5.l, vcc_lo, ttmp15 +// GFX13: v_ashrrev_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x08,0xd7,0x6a,0xf6,0x00,0x02] + +v_ashrrev_i16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_ashrrev_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x08,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_ashrrev_i16 v5.l, ttmp15, src_scc +// GFX13: v_ashrrev_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x08,0xd7,0x7b,0xfa,0x01,0x02] + +v_ashrrev_i16 v5.l, m0, 0.5 +// GFX13-ASM: v_ashrrev_i16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x08,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_ashrrev_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x08,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_ashrrev_i16 v5.l, exec_lo, -1 +// GFX13: v_ashrrev_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x08,0xd7,0x7e,0x82,0x01,0x02] + +v_ashrrev_i16 v5.l, exec_hi, null +// GFX13: v_ashrrev_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x08,0xd7,0x7f,0xf8,0x00,0x02] + +v_ashrrev_i16 v5.l, null, exec_lo +// GFX13: v_ashrrev_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x08,0xd7,0x7c,0xfc,0x00,0x02] + +v_ashrrev_i16 v5.l, -1, exec_hi +// GFX13: v_ashrrev_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x08,0xd7,0xc1,0xfe,0x00,0x02] + +v_ashrrev_i16 v5.l, 0.5, m0 +// GFX13-ASM: v_ashrrev_i16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x08,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_ashrrev_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x08,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_ashrrev_i16 v5.l, src_scc, vcc_lo +// GFX13: v_ashrrev_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x08,0xd7,0xfd,0xd4,0x00,0x02] + +v_ashrrev_i16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_ashrrev_i16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x08,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_ashrrev_i64 v[5:6], v1, vcc +// GFX13: v_ashrrev_i64 v[5:6], v1, vcc ; encoding: [0x05,0x00,0x01,0xd7,0x01,0xd5,0x00,0x02] + +v_ashrrev_i64 v[5:6], v255, exec +// GFX13: 
v_ashrrev_i64 v[5:6], v255, exec ; encoding: [0x05,0x00,0x01,0xd7,0xff,0xfd,0x00,0x02] + +v_ashrrev_i64 v[5:6], exec_lo, v[2:3] +// GFX13: v_ashrrev_i64 v[5:6], exec_lo, v[2:3] ; encoding: [0x05,0x00,0x01,0xd7,0x7e,0x04,0x02,0x02] + +v_ashrrev_i64 v[5:6], exec_hi, v[254:255] +// GFX13: v_ashrrev_i64 v[5:6], exec_hi, v[254:255] ; encoding: [0x05,0x00,0x01,0xd7,0x7f,0xfc,0x03,0x02] + +v_ashrrev_i64 v[5:6], null, null +// GFX13: v_ashrrev_i64 v[5:6], null, null ; encoding: [0x05,0x00,0x01,0xd7,0x7c,0xf8,0x00,0x02] + +v_ashrrev_i64 v[5:6], -1, -1 +// GFX13: v_ashrrev_i64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x01,0xd7,0xc1,0x82,0x01,0x02] + +v_ashrrev_i64 v[5:6], 0.5, 0xaf123456 +// GFX13: v_ashrrev_i64 v[5:6], 0.5, 0xaf123456 ; encoding: [0x05,0x00,0x01,0xd7,0xf0,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ashrrev_i64 v[5:6], src_scc, src_scc +// GFX13: v_ashrrev_i64 v[5:6], src_scc, src_scc ; encoding: [0x05,0x00,0x01,0xd7,0xfd,0xfa,0x01,0x02] + +v_ashrrev_i64 v[254:255], 0xaf123456, 0.5 +// GFX13: v_ashrrev_i64 v[254:255], 0xaf123456, 0.5 ; encoding: [0xfe,0x00,0x01,0xd7,0xff,0xe0,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_bcnt_u32_b32 v5, v1, v2 +// GFX13: v_bcnt_u32_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x64,0xd7,0x01,0x05,0x02,0x02] + +v_bcnt_u32_b32 v5, v255, v255 +// GFX13: v_bcnt_u32_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x64,0xd7,0xff,0xff,0x03,0x02] + +v_bcnt_u32_b32 v5, s1, s2 +// GFX13: v_bcnt_u32_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x64,0xd7,0x01,0x04,0x00,0x02] + +v_bcnt_u32_b32 v5, s105, s105 +// GFX13: v_bcnt_u32_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x64,0xd7,0x69,0xd2,0x00,0x02] + +v_bcnt_u32_b32 v5, vcc_lo, ttmp15 +// GFX13: v_bcnt_u32_b32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x64,0xd7,0x6a,0xf6,0x00,0x02] + +v_bcnt_u32_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_bcnt_u32_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x64,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_bcnt_u32_b32 v5, ttmp15, src_scc +// GFX13: v_bcnt_u32_b32 v5, ttmp15, 
src_scc ; encoding: [0x05,0x00,0x64,0xd7,0x7b,0xfa,0x01,0x02] + +v_bcnt_u32_b32 v5, m0, 0.5 +// GFX13: v_bcnt_u32_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x64,0xd7,0x7d,0xe0,0x01,0x02] + +v_bcnt_u32_b32 v5, exec_lo, -1 +// GFX13: v_bcnt_u32_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x64,0xd7,0x7e,0x82,0x01,0x02] + +v_bcnt_u32_b32 v5, exec_hi, null +// GFX13: v_bcnt_u32_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x64,0xd7,0x7f,0xf8,0x00,0x02] + +v_bcnt_u32_b32 v5, null, exec_lo +// GFX13: v_bcnt_u32_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x64,0xd7,0x7c,0xfc,0x00,0x02] + +v_bcnt_u32_b32 v5, -1, exec_hi +// GFX13: v_bcnt_u32_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x64,0xd7,0xc1,0xfe,0x00,0x02] + +v_bcnt_u32_b32 v5, 0.5, m0 +// GFX13: v_bcnt_u32_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x64,0xd7,0xf0,0xfa,0x00,0x02] + +v_bcnt_u32_b32 v5, src_scc, vcc_lo +// GFX13: v_bcnt_u32_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x64,0xd7,0xfd,0xd4,0x00,0x02] + +v_bcnt_u32_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_bcnt_u32_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x64,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_bfe_i32 v5, v1, v2, s3 +// GFX13: v_bfe_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x11,0xd6,0x01,0x05,0x0e,0x00] + +v_bfe_i32 v5, v255, s2, s105 +// GFX13: v_bfe_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x11,0xd6,0xff,0x05,0xa4,0x01] + +v_bfe_i32 v5, s1, v255, exec_hi +// GFX13: v_bfe_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x11,0xd6,0x01,0xfe,0xff,0x01] + +v_bfe_i32 v5, s105, s105, exec_lo +// GFX13: v_bfe_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x11,0xd6,0x69,0xd2,0xf8,0x01] + +v_bfe_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_bfe_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x11,0xd6,0x6a,0xf6,0x0c,0x04] + +v_bfe_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_bfe_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x11,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_bfe_i32 v5, ttmp15, src_scc, ttmp15 +// 
GFX13: v_bfe_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x11,0xd6,0x7b,0xfa,0xed,0x01] + +v_bfe_i32 v5, m0, 0.5, m0 +// GFX13: v_bfe_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x11,0xd6,0x7d,0xe0,0xf5,0x01] + +v_bfe_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_bfe_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x11,0xd6,0x7e,0x82,0xad,0x01] + +v_bfe_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_bfe_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x11,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bfe_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_bfe_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x11,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_bfe_i32 v5, -1, exec_hi, src_scc +// GFX13: v_bfe_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x11,0xd6,0xc1,0xfe,0xf4,0x03] + +v_bfe_i32 v5, 0.5, m0, 0.5 +// GFX13: v_bfe_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x11,0xd6,0xf0,0xfa,0xc0,0x03] + +v_bfe_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_bfe_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x11,0xd6,0xfd,0xd4,0x04,0x03] + +v_bfe_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_bfe_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x11,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_bfe_u32 v5, v1, v2, s3 +// GFX13: v_bfe_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x10,0xd6,0x01,0x05,0x0e,0x00] + +v_bfe_u32 v5, v255, s2, s105 +// GFX13: v_bfe_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x10,0xd6,0xff,0x05,0xa4,0x01] + +v_bfe_u32 v5, s1, v255, exec_hi +// GFX13: v_bfe_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x10,0xd6,0x01,0xfe,0xff,0x01] + +v_bfe_u32 v5, s105, s105, exec_lo +// GFX13: v_bfe_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x10,0xd6,0x69,0xd2,0xf8,0x01] + +v_bfe_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_bfe_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x10,0xd6,0x6a,0xf6,0x0c,0x04] + +v_bfe_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_bfe_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: 
[0x05,0x00,0x10,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_bfe_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_bfe_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x10,0xd6,0x7b,0xfa,0xed,0x01] + +v_bfe_u32 v5, m0, 0.5, m0 +// GFX13: v_bfe_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x10,0xd6,0x7d,0xe0,0xf5,0x01] + +v_bfe_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_bfe_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x10,0xd6,0x7e,0x82,0xad,0x01] + +v_bfe_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_bfe_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x10,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bfe_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_bfe_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x10,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_bfe_u32 v5, -1, exec_hi, src_scc +// GFX13: v_bfe_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x10,0xd6,0xc1,0xfe,0xf4,0x03] + +v_bfe_u32 v5, 0.5, m0, 0.5 +// GFX13: v_bfe_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x10,0xd6,0xf0,0xfa,0xc0,0x03] + +v_bfe_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_bfe_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x10,0xd6,0xfd,0xd4,0x04,0x03] + +v_bfe_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_bfe_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x10,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_bfi_b32 v5, v1, v2, s3 +// GFX13: v_bfi_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x12,0xd6,0x01,0x05,0x0e,0x00] + +v_bfi_b32 v5, v255, s2, s105 +// GFX13: v_bfi_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x12,0xd6,0xff,0x05,0xa4,0x01] + +v_bfi_b32 v5, s1, v255, exec_hi +// GFX13: v_bfi_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x12,0xd6,0x01,0xfe,0xff,0x01] + +v_bfi_b32 v5, s105, s105, exec_lo +// GFX13: v_bfi_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x12,0xd6,0x69,0xd2,0xf8,0x01] + +v_bfi_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_bfi_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x12,0xd6,0x6a,0xf6,0x0c,0x04] + +v_bfi_b32 v5, 
vcc_hi, 0xaf123456, v255 +// GFX13: v_bfi_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x12,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_bfi_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_bfi_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x12,0xd6,0x7b,0xfa,0xed,0x01] + +v_bfi_b32 v5, m0, 0.5, m0 +// GFX13: v_bfi_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x12,0xd6,0x7d,0xe0,0xf5,0x01] + +v_bfi_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_bfi_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x12,0xd6,0x7e,0x82,0xad,0x01] + +v_bfi_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_bfi_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x12,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bfi_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_bfi_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x12,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_bfi_b32 v5, -1, exec_hi, src_scc +// GFX13: v_bfi_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x12,0xd6,0xc1,0xfe,0xf4,0x03] + +v_bfi_b32 v5, 0.5, m0, 0.5 +// GFX13: v_bfi_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x12,0xd6,0xf0,0xfa,0xc0,0x03] + +v_bfi_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_bfi_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x12,0xd6,0xfd,0xd4,0x04,0x03] + +v_bfi_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_bfi_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x12,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_bfm_b32 v5, v1, v2 +// GFX13: v_bfm_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x1d,0xd7,0x01,0x05,0x02,0x02] + +v_bfm_b32 v5, v255, v255 +// GFX13: v_bfm_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x1d,0xd7,0xff,0xff,0x03,0x02] + +v_bfm_b32 v5, s1, s2 +// GFX13: v_bfm_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x1d,0xd7,0x01,0x04,0x00,0x02] + +v_bfm_b32 v5, s105, s105 +// GFX13: v_bfm_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x1d,0xd7,0x69,0xd2,0x00,0x02] + +v_bfm_b32 v5, vcc_lo, ttmp15 +// GFX13: v_bfm_b32 v5, vcc_lo, ttmp15 ; encoding: 
[0x05,0x00,0x1d,0xd7,0x6a,0xf6,0x00,0x02] + +v_bfm_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_bfm_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x1d,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_bfm_b32 v5, ttmp15, src_scc +// GFX13: v_bfm_b32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x1d,0xd7,0x7b,0xfa,0x01,0x02] + +v_bfm_b32 v5, m0, 0.5 +// GFX13: v_bfm_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x1d,0xd7,0x7d,0xe0,0x01,0x02] + +v_bfm_b32 v5, exec_lo, -1 +// GFX13: v_bfm_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x1d,0xd7,0x7e,0x82,0x01,0x02] + +v_bfm_b32 v5, exec_hi, null +// GFX13: v_bfm_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x1d,0xd7,0x7f,0xf8,0x00,0x02] + +v_bfm_b32 v5, null, exec_lo +// GFX13: v_bfm_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x1d,0xd7,0x7c,0xfc,0x00,0x02] + +v_bfm_b32 v5, -1, exec_hi +// GFX13: v_bfm_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x1d,0xd7,0xc1,0xfe,0x00,0x02] + +v_bfm_b32 v5, 0.5, m0 +// GFX13: v_bfm_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x1d,0xd7,0xf0,0xfa,0x00,0x02] + +v_bfm_b32 v5, src_scc, vcc_lo +// GFX13: v_bfm_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x1d,0xd7,0xfd,0xd4,0x00,0x02] + +v_bfm_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_bitop3_b16 v255.h, 0xfe0b, vcc_hi, null bitop3:103 op_sel:[0,0,0,1] +// GFX13: v_bitop3_b16 v255.h, 0xfe0b, vcc_hi, null bitop3:0x67 op_sel:[0,0,0,1] ; encoding: [0xff,0x44,0x33,0xd6,0xff,0xd6,0xf0,0xe9,0x0b,0xfe,0x00,0x00] + +v_bitop3_b16 v5.l, -1, exec_hi, src_scc bitop3:99 op_sel:[1,0,0,0] +// GFX13: v_bitop3_b16 v5.l, -1, exec_hi, src_scc bitop3:0x63 op_sel:[1,0,0,0] ; encoding: [0x05,0x0c,0x33,0xd6,0xc1,0xfe,0xf4,0x6b] + +v_bitop3_b16 v5.l, 0.5, m0, 0.5 bitop3:101 op_sel:[0,1,0,0] +// GFX13-ASM: v_bitop3_b16 v5.l, 0.5, m0, 0.5 bitop3:0x65 op_sel:[0,1,0,0] ; encoding: [0x05,0x14,0x33,0xd6,0xf0,0xfa,0xc0,0xab] +// GFX13-DIS: v_bitop3_b16 v5.l, 0x3800, m0, 
0x3800 bitop3:0x65 op_sel:[0,1,0,0] ; encoding: [0x05,0x14,0x33,0xd6,0xff,0xfa,0xfc,0xab,0x00,0x38,0x00,0x00] + +v_bitop3_b16 v5.h, exec_hi, null, vcc_lo bitop3:0x77 op_sel:[1,1,1,1] +// GFX13: v_bitop3_b16 v5.h, exec_hi, null, vcc_lo bitop3:0x77 op_sel:[1,1,1,1] ; encoding: [0x05,0x7e,0x33,0xd6,0x7f,0xf8,0xa8,0xe9] + +v_bitop3_b16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_bitop3_b16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x33,0xd6,0x7f,0xf8,0xa8,0x01] + +v_bitop3_b16 v5.l, exec_lo, -1, vcc_hi bitop3:6 +// GFX13: v_bitop3_b16 v5.l, exec_lo, -1, vcc_hi bitop3:6 ; encoding: [0x05,0x00,0x33,0xd6,0x7e,0x82,0xad,0xc1] + +v_bitop3_b16 v5.l, m0, 0.5, m0 bitop3:5 +// GFX13-ASM: v_bitop3_b16 v5.l, m0, 0.5, m0 bitop3:5 ; encoding: [0x05,0x00,0x33,0xd6,0x7d,0xe0,0xf5,0xa1] +// GFX13-DIS: v_bitop3_b16 v5.l, m0, 0x3800, m0 bitop3:5 ; encoding: [0x05,0x00,0x33,0xd6,0x7d,0xfe,0xf5,0xa1,0x00,0x38,0x00,0x00] + +v_bitop3_b16 v5.l, null, exec_lo, 0xfe0b bitop3:0x88 op_sel:[0,0,0,0] +// GFX13: v_bitop3_b16 v5.l, null, exec_lo, 0xfe0b bitop3:0x88 ; encoding: [0x05,0x01,0x33,0xd6,0x7c,0xfc,0xfc,0x13,0x0b,0xfe,0x00,0x00] + +v_bitop3_b16 v5.l, s1, v255.l, exec_hi bitop3:100 +// GFX13: v_bitop3_b16 v5.l, s1, v255.l, exec_hi bitop3:0x64 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0xfe,0xff,0x89] + +v_bitop3_b16 v5.l, s105, s105, exec_lo bitop3:0 +// GFX13: v_bitop3_b16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x33,0xd6,0x69,0xd2,0xf8,0x01] + +v_bitop3_b16 v5.l, src_scc, vcc_lo, -1 bitop3:102 op_sel:[0,0,1,0] +// GFX13: v_bitop3_b16 v5.l, src_scc, vcc_lo, -1 bitop3:0x66 op_sel:[0,0,1,0] ; encoding: [0x05,0x24,0x33,0xd6,0xfd,0xd4,0x04,0xcb] + +v_bitop3_b16 v5.l, ttmp15, src_scc, ttmp15 bitop3:0x24 +// GFX13: v_bitop3_b16 v5.l, ttmp15, src_scc, ttmp15 bitop3:0x24 ; encoding: [0x05,0x04,0x33,0xd6,0x7b,0xfa,0xed,0x81] + +v_bitop3_b16 v5.l, v1.l, v2.l, s3 +// GFX13: v_bitop3_b16 v5.l, v1.l, v2.l, s3 ; encoding: 
[0x05,0x00,0x33,0xd6,0x01,0x05,0x0e,0x00] + +v_bitop3_b16 v5.l, v1.l, v2.l, s3 bitop3:161 +// GFX13: v_bitop3_b16 v5.l, v1.l, v2.l, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0x05,0x0e,0x30] + +v_bitop3_b16 v5.l, v255.l, s2, s105 bitop3:0x27 +// GFX13: v_bitop3_b16 v5.l, v255.l, s2, s105 bitop3:0x27 ; encoding: [0x05,0x04,0x33,0xd6,0xff,0x05,0xa4,0xe1] + +v_bitop3_b16 v5.l, vcc_hi, 0xfe0b, v255.l bitop3:63 +// GFX13: v_bitop3_b16 v5.l, vcc_hi, 0xfe0b, v255.l bitop3:0x3f ; encoding: [0x05,0x07,0x33,0xd6,0x6b,0xfe,0xfd,0xe7,0x0b,0xfe,0x00,0x00] + +v_bitop3_b16 v5.l, vcc_lo, ttmp15, v3.l bitop3:0x15 +// GFX13: v_bitop3_b16 v5.l, vcc_lo, ttmp15, v3.l bitop3:0x15 ; encoding: [0x05,0x02,0x33,0xd6,0x6a,0xf6,0x0c,0xa4] + +v_bitop3_b16 v5.h, v1.h, v2.h, v255.h +// GFX13: v_bitop3_b16 v5.h, v1.h, v2.h, v255.h op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x33,0xd6,0x01,0x05,0xfe,0x07] + +v_bitop3_b32 v255, 0xaf123456, vcc_hi, null bitop3:103 +// GFX13: v_bitop3_b32 v255, 0xaf123456, vcc_hi, null bitop3:0x67 ; encoding: [0xff,0x04,0x34,0xd6,0xff,0xd6,0xf0,0xe9,0x56,0x34,0x12,0xaf] + +v_bitop3_b32 v5, -1, exec_hi, src_scc bitop3:99 +// GFX13: v_bitop3_b32 v5, -1, exec_hi, src_scc bitop3:0x63 ; encoding: [0x05,0x04,0x34,0xd6,0xc1,0xfe,0xf4,0x6b] + +v_bitop3_b32 v5, 0.5, m0, 0.5 bitop3:101 +// GFX13: v_bitop3_b32 v5, 0.5, m0, 0.5 bitop3:0x65 ; encoding: [0x05,0x04,0x34,0xd6,0xf0,0xfa,0xc0,0xab] + +v_bitop3_b32 v5, exec_hi, null, vcc_lo bitop3:77 +// GFX13: v_bitop3_b32 v5, exec_hi, null, vcc_lo bitop3:0x4d ; encoding: [0x05,0x01,0x34,0xd6,0x7f,0xf8,0xa8,0xa9] + +v_bitop3_b32 v5, exec_lo, -1, vcc_hi bitop3:6 +// GFX13: v_bitop3_b32 v5, exec_lo, -1, vcc_hi bitop3:6 ; encoding: [0x05,0x00,0x34,0xd6,0x7e,0x82,0xad,0xc1] + +v_bitop3_b32 v5, m0, 0.5, m0 bitop3:5 +// GFX13: v_bitop3_b32 v5, m0, 0.5, m0 bitop3:5 ; encoding: [0x05,0x00,0x34,0xd6,0x7d,0xe0,0xf5,0xa1] + +v_bitop3_b32 v5, null, exec_lo, 0xaf123456 bitop3:88 +// GFX13: v_bitop3_b32 v5, null, exec_lo, 0xaf123456 
bitop3:0x58 ; encoding: [0x05,0x03,0x34,0xd6,0x7c,0xfc,0xfc,0x0b,0x56,0x34,0x12,0xaf] + +v_bitop3_b32 v5, s1, v255, exec_hi bitop3:100 +// GFX13: v_bitop3_b32 v5, s1, v255, exec_hi bitop3:0x64 ; encoding: [0x05,0x04,0x34,0xd6,0x01,0xfe,0xff,0x89] + +v_bitop3_b32 v5, s105, s105, exec_lo bitop3:0 +// GFX13: v_bitop3_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x34,0xd6,0x69,0xd2,0xf8,0x01] + +v_bitop3_b32 v5, src_scc, vcc_lo, -1 bitop3:102 +// GFX13: v_bitop3_b32 v5, src_scc, vcc_lo, -1 bitop3:0x66 ; encoding: [0x05,0x04,0x34,0xd6,0xfd,0xd4,0x04,0xcb] + +v_bitop3_b32 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 +// GFX13: v_bitop3_b32 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 ; encoding: [0x05,0x04,0x34,0xd6,0x7b,0xfa,0xed,0x81] + +v_bitop3_b32 v5, v1, v2, s3 +// GFX13: v_bitop3_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x34,0xd6,0x01,0x05,0x0e,0x00] + +v_bitop3_b32 v5, v1, v2, s3 bitop3:161 +// GFX13: v_bitop3_b32 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x34,0xd6,0x01,0x05,0x0e,0x30] + +v_bitop3_b32 v5, v255, s2, s105 bitop3:0x27 +// GFX13: v_bitop3_b32 v5, v255, s2, s105 bitop3:0x27 ; encoding: [0x05,0x04,0x34,0xd6,0xff,0x05,0xa4,0xe1] + +v_bitop3_b32 v5, vcc_hi, 0xaf123456, v255 bitop3:63 +// GFX13: v_bitop3_b32 v5, vcc_hi, 0xaf123456, v255 bitop3:0x3f ; encoding: [0x05,0x07,0x34,0xd6,0x6b,0xfe,0xfd,0xe7,0x56,0x34,0x12,0xaf] + +v_bitop3_b32 v5, vcc_lo, ttmp15, v3 bitop3:0x15 +// GFX13: v_bitop3_b32 v5, vcc_lo, ttmp15, v3 bitop3:0x15 ; encoding: [0x05,0x02,0x34,0xd6,0x6a,0xf6,0x0c,0xa4] + +v_cndmask_b16 v5.l, v1.l, src_scc, s3 +// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction + +v_cndmask_b16 v5.l, v255.l, 0.5, s3 +// W32-ASM: v_cndmask_b16 v5.l, v255.l, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W32-DIS: v_cndmask_b16 v5.l, v255.l, 0x3800, s3 ; encoding: 
[0x05,0x00,0x5d,0xd6,0xff,0xff,0x0d,0x00,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-3]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, s105, s105, s3 +// W32: v_cndmask_b16 v5.l, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 +// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 +// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.l, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 +// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 +// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, null, m0, s105 +// W32: v_cndmask_b16 v5.l, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo +// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0.5, -1, vcc_hi +// W32-ASM: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] +// W32-DIS: v_cndmask_b16 v5.l, 
0x3800, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xad,0x01,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-3]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 +// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_cndmask_b16 v5, v1, src_scc, s[6:7] +// W32-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +// W64-DIS: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] + +v_cndmask_b16 v5, v255, 0.5, s[6:7] +// W32-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, v255, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W64-DIS: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] + +v_cndmask_b16 v5, s105, s105, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +// W64-DIS: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] + +v_cndmask_b16 v5, vcc_hi, v2, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +// W64-DIS: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] + +v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +// W64-DIS: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: 
[0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] + +v_cndmask_b16 v5, m0, v255, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W64-DIS: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] + +v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +// W64-DIS: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] + +v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +// W64-DIS: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] + +v_cndmask_b16 v5, null, m0, s[6:7] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +// W64-DIS: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] + +v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +// W64-DIS: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] + +v_cndmask_b16 v5, 0.5, -1, vcc +// W32-ERR: :[[@LINE-1]]:19: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] +// W64-DIS: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: 
[0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] + +v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +// W64-DIS: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] + +v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null +// GFX13: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_cubeid_f32 v5, v1, v2, s3 +// GFX13: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] + +v_cubeid_f32 v5, v255, s2, s105 +// GFX13: v_cubeid_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0c,0xd6,0xff,0x05,0xa4,0x01] + +v_cubeid_f32 v5, s1, v255, exec_hi +// GFX13: v_cubeid_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0xfe,0xff,0x01] + +v_cubeid_f32 v5, s105, s105, exec_lo +// GFX13: v_cubeid_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0c,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubeid_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubeid_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0c,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubeid_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubeid_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0c,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubeid_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubeid_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0c,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubeid_f32 v5, m0, 0.5, m0 +// GFX13: v_cubeid_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0c,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubeid_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubeid_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0c,0xd6,0x7e,0x82,0xad,0x01] + +v_cubeid_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubeid_f32 v5, -|exec_hi|, null, -|vcc_lo| ; 
encoding: [0x05,0x05,0x0c,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubeid_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubeid_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0c,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubeid_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubeid_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0c,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubeid_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubeid_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0c,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubeid_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubeid_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0c,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubeid_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubeid_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0c,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cubema_f32 v5, v1, v2, s3 +// GFX13: v_cubema_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0f,0xd6,0x01,0x05,0x0e,0x00] + +v_cubema_f32 v5, v255, s2, s105 +// GFX13: v_cubema_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0f,0xd6,0xff,0x05,0xa4,0x01] + +v_cubema_f32 v5, s1, v255, exec_hi +// GFX13: v_cubema_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0f,0xd6,0x01,0xfe,0xff,0x01] + +v_cubema_f32 v5, s105, s105, exec_lo +// GFX13: v_cubema_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0f,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubema_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubema_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0f,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubema_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubema_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0f,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubema_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubema_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0f,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubema_f32 v5, m0, 0.5, m0 +// GFX13: v_cubema_f32 v5, m0, 0.5, m0 ; 
encoding: [0x05,0x00,0x0f,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubema_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubema_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0f,0xd6,0x7e,0x82,0xad,0x01] + +v_cubema_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubema_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0f,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubema_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubema_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0f,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubema_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubema_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0f,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubema_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubema_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0f,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubema_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubema_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0f,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubema_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubema_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0f,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cubesc_f32 v5, v1, v2, s3 +// GFX13: v_cubesc_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0d,0xd6,0x01,0x05,0x0e,0x00] + +v_cubesc_f32 v5, v255, s2, s105 +// GFX13: v_cubesc_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0d,0xd6,0xff,0x05,0xa4,0x01] + +v_cubesc_f32 v5, s1, v255, exec_hi +// GFX13: v_cubesc_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0d,0xd6,0x01,0xfe,0xff,0x01] + +v_cubesc_f32 v5, s105, s105, exec_lo +// GFX13: v_cubesc_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0d,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubesc_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubesc_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubesc_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubesc_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: 
[0x05,0x00,0x0d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubesc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubesc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0d,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubesc_f32 v5, m0, 0.5, m0 +// GFX13: v_cubesc_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubesc_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubesc_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0d,0xd6,0x7e,0x82,0xad,0x01] + +v_cubesc_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubesc_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0d,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubesc_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubesc_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0d,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubesc_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubesc_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0d,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubesc_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubesc_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0d,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubesc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubesc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0d,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubesc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubesc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0d,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cubetc_f32 v5, v1, v2, s3 +// GFX13: v_cubetc_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0e,0xd6,0x01,0x05,0x0e,0x00] + +v_cubetc_f32 v5, v255, s2, s105 +// GFX13: v_cubetc_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0e,0xd6,0xff,0x05,0xa4,0x01] + +v_cubetc_f32 v5, s1, v255, exec_hi +// GFX13: v_cubetc_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0e,0xd6,0x01,0xfe,0xff,0x01] + +v_cubetc_f32 v5, s105, s105, exec_lo +// GFX13: v_cubetc_f32 v5, s105, s105, exec_lo ; 
encoding: [0x05,0x00,0x0e,0xd6,0x69,0xd2,0xf8,0x01] + +v_cubetc_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cubetc_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cubetc_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cubetc_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0e,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cubetc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_cubetc_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x0e,0xd6,0x7b,0xfa,0xed,0xe1] + +v_cubetc_f32 v5, m0, 0.5, m0 +// GFX13: v_cubetc_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cubetc_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cubetc_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x0e,0xd6,0x7e,0x82,0xad,0x01] + +v_cubetc_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_cubetc_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x0e,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_cubetc_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_cubetc_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x0e,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_cubetc_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_cubetc_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x0e,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_cubetc_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cubetc_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x0e,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x0e,0xd6,0xfd,0xd4,0x04,0x33] + +v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 +// GFX13: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: 
[0xff,0x81,0x6e,0xd7,0xff,0xd6,0x00,0x3a,0x56,0x34,0x12,0xaf] + +v_cvt_pk_bf16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6e,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 +// GFX13: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6e,0xd7,0xf0,0xfa,0x00,0x0a] + +v_cvt_pk_bf16_f32 v5, exec_hi, null +// GFX13: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6e,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6e,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_bf16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6e,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_bf16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6e,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6e,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 +// GFX13: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6e,0xd7,0xfd,0xd4,0x00,0x12] + +v_cvt_pk_bf16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6e,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_bf16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_bf16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6e,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6e,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: 
[0x05,0x00,0x6e,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_bf8_f32 v1.l, -v2, |v3| +// GFX13: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x7b,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_bf8_f32 v1.l, s2, 3 +// GFX13: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_pk_bf8_f32 v1.l, v2, v3 +// GFX13: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_pk_fp8_f32 v1.l, -v2, |v3| +// GFX13: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x7a,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_fp8_f32 v1.l, s2, 3 +// GFX13: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x7a,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_pk_fp8_f32 v1.l, v2, v3 +// GFX13: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x7a,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_pk_fp8_f32 v1.l, -v2, |v3| +// GFX13: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x7a,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_fp8_f32 v1.l, s2, 3 +// GFX13: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x7a,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_pk_bf8_f32 v1.l, v2, v3 +// GFX13: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_pk_bf8_f32 v1.l, -v2, |v3| +// GFX13: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x7b,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_pk_bf8_f32 v1.l, s2, 3 +// GFX13: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x7b,0xd7,0x02,0x06,0x01,0x02] + +v_cvt_sr_bf8_f16 v1, -v2.l, v3 +// GFX13: v_cvt_sr_bf8_f16 v1, -v2.l, v3 ; encoding: [0x01,0x00,0x36,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_sr_bf8_f16 v1, v2.l, 0x1234 +// GFX13: v_cvt_sr_bf8_f16 v1, v2.l, 0x1234 ; encoding: [0x01,0x00,0x36,0xd7,0x02,0xff,0x01,0x02,0x34,0x12,0x00,0x00] + +v_cvt_sr_bf8_f16 v1, v2.l, s3 +// GFX13: v_cvt_sr_bf8_f16 v1, v2.l, s3 ; encoding: [0x01,0x00,0x36,0xd7,0x02,0x07,0x00,0x02] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 +// GFX13: v_cvt_sr_bf8_f16 v1, v2.l, v3 ; encoding: 
[0x01,0x00,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:1 +// GFX13: v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:1 ; encoding: [0x01,0x20,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:2 +// GFX13: v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:2 ; encoding: [0x01,0x40,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:3 +// GFX13: v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:3 ; encoding: [0x01,0x60,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f16 v1, |v2.l|, v3 +// GFX13: v_cvt_sr_bf8_f16 v1, |v2.l|, v3 ; encoding: [0x01,0x01,0x36,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX13: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x38,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX13: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x38,0xd7,0x02,0x0a,0x02,0x02] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX13: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x38,0xd7,0xff,0x09,0x02,0x22] + +v_cvt_sr_fp8_f16 v1, -v2.l, v3 +// GFX13: v_cvt_sr_fp8_f16 v1, -v2.l, v3 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0x07,0x02,0x22] + +v_cvt_sr_fp8_f16 v1, v2.l, 0x1234 +// GFX13: v_cvt_sr_fp8_f16 v1, v2.l, 0x1234 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0xff,0x01,0x02,0x34,0x12,0x00,0x00] + +v_cvt_sr_fp8_f16 v1, v2.l, s3 +// GFX13: v_cvt_sr_fp8_f16 v1, v2.l, s3 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0x07,0x00,0x02] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 +// GFX13: v_cvt_sr_fp8_f16 v1, v2.l, v3 ; encoding: [0x01,0x00,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:1 +// GFX13: v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:1 ; encoding: [0x01,0x20,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:2 +// GFX13: v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:2 ; encoding: [0x01,0x40,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:3 +// GFX13: v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:3 ; encoding: [0x01,0x60,0x35,0xd7,0x02,0x07,0x02,0x02] 
+ +v_cvt_sr_fp8_f16 v1, |v2.l|, v3 +// GFX13: v_cvt_sr_fp8_f16 v1, |v2.l|, v3 ; encoding: [0x01,0x01,0x35,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX13: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x37,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX13: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x37,0xd7,0x02,0x0a,0x02,0x02] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX13: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x37,0xd7,0xff,0x09,0x02,0x22] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX13: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x38,0xd7,0x02,0x07,0x02,0x02] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX13: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x38,0xd7,0x02,0x0a,0x02,0x02] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX13: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x38,0xd7,0xff,0x09,0x02,0x22] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xbe,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xbe,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 +// W32: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xbe,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xbb,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: 
[0x0a,0x00,0xbb,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 +// W32: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xbb,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 +// W32: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xbc,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 +// W32: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 +// W32: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xb9,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 +// W32: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: 
[0x0a,0x00,0xba,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 +// W32: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xba,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 +// W32: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xba,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 +// W32: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xbd,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 +// W32: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xbd,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 +// W32: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0xbd,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 +// W32: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 +// W32: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xc1,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 +// W32: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 +// W32: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 +// W32: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xbf,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 +// W32: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 +// W32: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0xff,0x01,0x02,0x00,0xcf,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 +// W32: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xc0,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_pk_i16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x21,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_i16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_i16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x21,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_i16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_i16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x21,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_i16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_i16_f32 v5, s105, s105 ; encoding: 
[0x05,0x00,0x21,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_i16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_i16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x21,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_i16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_i16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x21,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_i16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_i16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x21,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_i16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_i16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x21,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_i16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_i16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x21,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_i16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_i16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x21,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_i16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_i16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x21,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_i16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_i16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x21,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_i16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_i16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x21,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_i16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_i16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x21,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_i16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_i16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x21,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_cvt_pk_i16_i32 v5, v1, v2 +// GFX13: v_cvt_pk_i16_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x6b,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_i16_i32 v5, v255, v255 +// GFX13: v_cvt_pk_i16_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x6b,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_i16_i32 v5, s1, s2 +// GFX13: v_cvt_pk_i16_i32 v5, s1, s2 ; encoding: 
[0x05,0x00,0x6b,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_i16_i32 v5, s105, s105 +// GFX13: v_cvt_pk_i16_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x6b,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_i16_i32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_i16_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6b,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_i16_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_i16_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6b,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_i16_i32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_i16_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6b,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_i16_i32 v5, m0, 0.5 +// GFX13: v_cvt_pk_i16_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6b,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_i16_i32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_i16_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6b,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_i16_i32 v5, exec_hi, null +// GFX13: v_cvt_pk_i16_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6b,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_i16_i32 v5, null, exec_lo +// GFX13: v_cvt_pk_i16_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6b,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_i16_i32 v5, -1, exec_hi +// GFX13: v_cvt_pk_i16_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6b,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_i16_i32 v5, 0.5, m0 +// GFX13: v_cvt_pk_i16_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd7,0xf0,0xfa,0x00,0x02] + +v_cvt_pk_i16_i32 v5, src_scc, vcc_lo +// GFX13: v_cvt_pk_i16_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x6b,0xd7,0xfd,0xd4,0x00,0x02] + +v_cvt_pk_i16_i32 v255, 0xaf123456, vcc_hi +// GFX13: v_cvt_pk_i16_i32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x6b,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_i16_f16 v5, v1.l, v2.l +// GFX13: v_cvt_pk_norm_i16_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_i16_f16 v5, v255.l, v255.l +// GFX13: v_cvt_pk_norm_i16_f16 v5, v255.l, v255.l ; encoding: 
[0x05,0x00,0x12,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_i16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x12,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x12,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x12,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x12,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_i16_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x12,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x12,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x12,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_i16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x12,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_i16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x12,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x12,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x12,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; 
encoding: [0xff,0x13,0x12,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f16 v5, v1.l, v2.l +// GFX13: v_cvt_pk_norm_u16_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_u16_f16 v5, v255.l, v255.l +// GFX13: v_cvt_pk_norm_u16_f16 v5, v255.l, v255.l ; encoding: [0x05,0x00,0x13,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_u16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x13,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x13,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x13,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x13,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_u16_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x13,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x13,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x13,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_u16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x13,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_u16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x13,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x13,0xd7,0xf0,0xfa,0x00,0x42] + 
+v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x13,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x13,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_u16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_u16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x22,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_u16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_u16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x22,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_u16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_u16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x22,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_u16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_u16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x22,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_u16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_u16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x22,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_u16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_u16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x22,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_u16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x22,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_u16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_u16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x22,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_u16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_u16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x22,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_u16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_u16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x22,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_u16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_u16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x22,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_u16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_u16_f32 v5, -1, exec_hi ; encoding: 
[0x05,0x00,0x22,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_u16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_u16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x22,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_u16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_u16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x22,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_u16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_u16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x22,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u16_u32 v5, v1, v2 +// GFX13: v_cvt_pk_u16_u32 v5, v1, v2 ; encoding: [0x05,0x00,0x6a,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_u16_u32 v5, v255, v255 +// GFX13: v_cvt_pk_u16_u32 v5, v255, v255 ; encoding: [0x05,0x00,0x6a,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_u16_u32 v5, s1, s2 +// GFX13: v_cvt_pk_u16_u32 v5, s1, s2 ; encoding: [0x05,0x00,0x6a,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_u16_u32 v5, s105, s105 +// GFX13: v_cvt_pk_u16_u32 v5, s105, s105 ; encoding: [0x05,0x00,0x6a,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_u16_u32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_u16_u32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6a,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_u16_u32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_u16_u32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6a,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u16_u32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_u16_u32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6a,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_u16_u32 v5, m0, 0.5 +// GFX13: v_cvt_pk_u16_u32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6a,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_u16_u32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_u16_u32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6a,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_u16_u32 v5, exec_hi, null +// GFX13: v_cvt_pk_u16_u32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6a,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_u16_u32 v5, null, exec_lo +// GFX13: v_cvt_pk_u16_u32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6a,0xd7,0x7c,0xfc,0x00,0x02] 
+ +v_cvt_pk_u16_u32 v5, -1, exec_hi +// GFX13: v_cvt_pk_u16_u32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6a,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_u16_u32 v5, 0.5, m0 +// GFX13: v_cvt_pk_u16_u32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd7,0xf0,0xfa,0x00,0x02] + +v_cvt_pk_u16_u32 v5, src_scc, vcc_lo +// GFX13: v_cvt_pk_u16_u32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x6a,0xd7,0xfd,0xd4,0x00,0x02] + +v_cvt_pk_u16_u32 v255, 0xaf123456, vcc_hi +// GFX13: v_cvt_pk_u16_u32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x6a,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u8_f32 v5, v1, v2, s3 +// GFX13: v_cvt_pk_u8_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x26,0xd6,0x01,0x05,0x0e,0x00] + +v_cvt_pk_u8_f32 v5, v255, s2, s105 +// GFX13: v_cvt_pk_u8_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x26,0xd6,0xff,0x05,0xa4,0x01] + +v_cvt_pk_u8_f32 v5, s1, v255, exec_hi +// GFX13: v_cvt_pk_u8_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x26,0xd6,0x01,0xfe,0xff,0x01] + +v_cvt_pk_u8_f32 v5, s105, s105, exec_lo +// GFX13: v_cvt_pk_u8_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x26,0xd6,0x69,0xd2,0xf8,0x01] + +v_cvt_pk_u8_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cvt_pk_u8_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x26,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cvt_pk_u8_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cvt_pk_u8_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x26,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u8_f32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_cvt_pk_u8_f32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x26,0xd6,0x7b,0xfa,0xed,0x01] + +v_cvt_pk_u8_f32 v5, m0, 0.5, m0 +// GFX13: v_cvt_pk_u8_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x26,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cvt_pk_u8_f32 v5, exec_lo, -1, vcc_hi +// GFX13: v_cvt_pk_u8_f32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x26,0xd6,0x7e,0x82,0xad,0x01] + +v_cvt_pk_u8_f32 v5, exec_hi, null, vcc_lo +// GFX13: v_cvt_pk_u8_f32 v5, exec_hi, null, vcc_lo ; 
encoding: [0x05,0x00,0x26,0xd6,0x7f,0xf8,0xa8,0x01] + +v_cvt_pk_u8_f32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_cvt_pk_u8_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x26,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_cvt_pk_u8_f32 v5, -1, exec_hi, src_scc +// GFX13: v_cvt_pk_u8_f32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x26,0xd6,0xc1,0xfe,0xf4,0x03] + +v_cvt_pk_u8_f32 v5, 0.5, m0, 0.5 +// GFX13: v_cvt_pk_u8_f32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x26,0xd6,0xf0,0xfa,0xc0,0x03] + +v_cvt_pk_u8_f32 v5, src_scc, vcc_lo, -1 +// GFX13: v_cvt_pk_u8_f32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x26,0xd6,0xfd,0xd4,0x04,0x03] + +v_cvt_pk_u8_f32 v255, -|0xaf123456|, vcc_hi, null +// GFX13: v_cvt_pk_u8_f32 v255, -|0xaf123456|, vcc_hi, null ; encoding: [0xff,0x01,0x26,0xd6,0xff,0xd6,0xf0,0x21,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_i16_f16 v5, v1.l, v2.l +// GFX13: v_cvt_pk_norm_i16_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_i16_f16 v5, v255.l, v255.l +// GFX13: v_cvt_pk_norm_i16_f16 v5, v255.l, v255.l ; encoding: [0x05,0x00,0x12,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_i16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_i16_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x12,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x12,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_i16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x12,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_i16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x12,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_i16_f16 v5, m0, 0.5 ; encoding: 
[0x05,0x00,0x12,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_i16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x12,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_i16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x12,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_i16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x12,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_i16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x12,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x12,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_i16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x12,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_i16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x12,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_i16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_norm_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x68,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_i16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_norm_i16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x68,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_i16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_norm_i16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x68,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_norm_i16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x68,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_i16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x68,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_norm_i16_f32 v5, vcc_hi, 0xaf123456 ; 
encoding: [0x05,0x00,0x68,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_i16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_i16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x68,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_i16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_i16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x68,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_i16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_i16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x68,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_i16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_i16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x68,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_i16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x68,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_i16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x68,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_i16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_norm_i16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x68,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_i16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_norm_i16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x68,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_i16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_norm_i16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x68,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_u16_f16 v5, v1.l, v2.l +// GFX13: v_cvt_pk_norm_u16_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_u16_f16 v5, v255.l, v255.l +// GFX13: v_cvt_pk_norm_u16_f16 v5, v255.l, v255.l ; encoding: [0x05,0x00,0x13,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_u16_f16 v5, s1, s2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, s105, s105 +// GFX13: v_cvt_pk_norm_u16_f16 v5, s105, s105 ; encoding: 
[0x05,0x00,0x13,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x13,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b +// GFX13: v_cvt_pk_norm_u16_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x13,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_u16_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x13,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_u16_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x13,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_u16_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x13,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_u16_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x13,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_u16_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x13,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_u16_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x13,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x13,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_cvt_pk_norm_u16_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x13,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_cvt_pk_norm_u16_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x13,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_cvt_pk_norm_u16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_norm_u16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x69,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_norm_u16_f32 v5, v255, v255 +// GFX13: 
v_cvt_pk_norm_u16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x69,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_norm_u16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_norm_u16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x69,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_norm_u16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x69,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_norm_u16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x69,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_norm_u16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x69,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_norm_u16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_norm_u16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x69,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_norm_u16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_norm_u16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x69,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_norm_u16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_norm_u16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x69,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_norm_u16_f32 v5, |exec_hi|, null +// GFX13: v_cvt_pk_norm_u16_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x69,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_norm_u16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x69,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_norm_u16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x69,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_norm_u16_f32 v5, 0.5, -m0 +// GFX13: v_cvt_pk_norm_u16_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x69,0xd7,0xf0,0xfa,0x00,0x42] + +v_cvt_pk_norm_u16_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_cvt_pk_norm_u16_f32 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x69,0xd7,0xfd,0xd4,0x00,0x22] + +v_cvt_pk_norm_u16_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_cvt_pk_norm_u16_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: 
[0xff,0x03,0x69,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_div_fixup_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_div_fixup_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x5f,0xd7,0x01,0x05,0x0e,0x00] + +v_div_fixup_f16 v5.l, v255.l, s2, s105 +// GFX13: v_div_fixup_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x5f,0xd7,0xff,0x05,0xa4,0x01] + +v_div_fixup_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_div_fixup_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x5f,0xd7,0x01,0xfe,0xff,0x01] + +v_div_fixup_f16 v5.l, s105, s105, exec_lo +// GFX13: v_div_fixup_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x5f,0xd7,0x69,0xd2,0xf8,0x01] + +v_div_fixup_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_div_fixup_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x5f,0xd7,0x6a,0xf6,0x0c,0x04] + +v_div_fixup_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_div_fixup_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x5f,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_div_fixup_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_div_fixup_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x5f,0xd7,0x7b,0xfa,0xed,0xe1] + +v_div_fixup_f16 v5.l, m0, 0.5, m0 +// GFX13: v_div_fixup_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x5f,0xd7,0x7d,0xe0,0xf5,0x01] + +v_div_fixup_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_div_fixup_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x5f,0xd7,0x7e,0x82,0xad,0x01] + +v_div_fixup_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_div_fixup_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x5f,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_div_fixup_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_div_fixup_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x5f,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_div_fixup_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_div_fixup_f16 v5.l, -1, -|exec_hi|, -|src_scc| 
op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x5f,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_div_fixup_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_div_fixup_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x5f,0xd7,0xf0,0xfa,0xc0,0x43] + +v_div_fixup_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_div_fixup_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x5f,0xd7,0xfd,0xd4,0x04,0x23] + +v_div_fixup_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_div_fixup_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x5f,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_div_fixup_f32 v5, v1, v2, s3 +// GFX13: v_div_fixup_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x27,0xd6,0x01,0x05,0x0e,0x00] + +v_div_fixup_f32 v5, v255, s2, s105 +// GFX13: v_div_fixup_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x27,0xd6,0xff,0x05,0xa4,0x01] + +v_div_fixup_f32 v5, s1, v255, exec_hi +// GFX13: v_div_fixup_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x27,0xd6,0x01,0xfe,0xff,0x01] + +v_div_fixup_f32 v5, s105, s105, exec_lo +// GFX13: v_div_fixup_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x27,0xd6,0x69,0xd2,0xf8,0x01] + +v_div_fixup_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_div_fixup_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x27,0xd6,0x6a,0xf6,0x0c,0x04] + +v_div_fixup_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_div_fixup_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x27,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_div_fixup_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_div_fixup_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x27,0xd6,0x7b,0xfa,0xed,0xe1] + +v_div_fixup_f32 v5, m0, 0.5, m0 +// GFX13: v_div_fixup_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x27,0xd6,0x7d,0xe0,0xf5,0x01] + +v_div_fixup_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_div_fixup_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: 
[0x05,0x01,0x27,0xd6,0x7e,0x82,0xad,0x01] + +v_div_fixup_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_div_fixup_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x27,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_div_fixup_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_div_fixup_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x27,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_div_fixup_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_div_fixup_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x27,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_div_fixup_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_div_fixup_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x27,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_div_fixup_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_div_fixup_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x27,0xd6,0xfd,0xd4,0x04,0x33] + +v_div_fixup_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_div_fixup_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x27,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_div_fixup_f64 v[5:6], v[1:2], v[2:3], v[3:4] +// GFX13: v_div_fixup_f64 v[5:6], v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x00,0x28,0xd6,0x01,0x05,0x0e,0x04] + +v_div_fixup_f64 v[5:6], v[254:255], v[254:255], s[6:7] +// GFX13: v_div_fixup_f64 v[5:6], v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x00,0x28,0xd6,0xfe,0xfd,0x1b,0x00] + +v_div_fixup_f64 v[5:6], s[2:3], s[4:5], v[254:255] +// GFX13: v_div_fixup_f64 v[5:6], s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x00,0x28,0xd6,0x02,0x08,0xf8,0x07] + +v_div_fixup_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| +// GFX13: v_div_fixup_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| ; encoding: [0x05,0x05,0x28,0xd6,0x68,0xd0,0xa0,0xa1] + +v_div_fixup_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| +// GFX13: v_div_fixup_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| ; encoding: [0x05,0x06,0x28,0xd6,0x6a,0xf4,0xe8,0xc1] + 
+v_div_fixup_f64 v[5:6], -|ttmp[14:15]|, 0xaf123456, null +// GFX13: v_div_fixup_f64 v[5:6], -|ttmp[14:15]|, 0xaf123456, null ; encoding: [0x05,0x01,0x28,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] + +v_div_fixup_f64 v[5:6], -|exec|, -|src_scc|, -|exec| +// GFX13: v_div_fixup_f64 v[5:6], -|exec|, -|src_scc|, -|exec| ; encoding: [0x05,0x07,0x28,0xd6,0x7e,0xfa,0xf9,0xe1] + +v_div_fixup_f64 v[5:6], null, 0.5, vcc +// GFX13: v_div_fixup_f64 v[5:6], null, 0.5, vcc ; encoding: [0x05,0x00,0x28,0xd6,0x7c,0xe0,0xa9,0x01] + +v_div_fixup_f64 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_div_fixup_f64 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x28,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_fixup_f64 v[5:6], 0.5, null, -|src_scc| mul:2 +// GFX13: v_div_fixup_f64 v[5:6], 0.5, null, -|src_scc| mul:2 ; encoding: [0x05,0x04,0x28,0xd6,0xf0,0xf8,0xf4,0x8b] + +v_div_fixup_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 +// GFX13: v_div_fixup_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 ; encoding: [0x05,0x03,0x28,0xd6,0xfd,0xfc,0xc0,0x73] + +v_div_fixup_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 +// GFX13: v_div_fixup_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 ; encoding: [0xfe,0x82,0x28,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] + +v_div_fmas_f32 v5, vcc_lo, v2, vcc_lo +// GFX13: v_div_fmas_f32 v5, vcc_lo, v2, vcc_lo ; encoding: [0x05,0x00,0x37,0xd6,0x6a,0x04,0xaa,0x01] + +v_div_fmas_f32 v5, ttmp15, ttmp15, ttmp15 +// GFX13: v_div_fmas_f32 v5, ttmp15, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x37,0xd6,0x7b,0xf6,0xec,0x01] + +v_div_fmas_f32 v5, -|m0|, -|v255|, v3 +// GFX13: v_div_fmas_f32 v5, -|m0|, -|v255|, v3 ; encoding: [0x05,0x03,0x37,0xd6,0x7d,0xfe,0x0f,0x64] + +v_div_fmas_f32 v5, -|exec_lo|, -|exec_lo|, -|exec_lo| +// GFX13: v_div_fmas_f32 v5, -|exec_lo|, -|exec_lo|, -|exec_lo| ; encoding: [0x05,0x07,0x37,0xd6,0x7e,0xfc,0xf8,0xe1] + +v_div_fmas_f32 v5, -|exec_hi|, 0.5, -|v255| +// GFX13: v_div_fmas_f32 v5, -|exec_hi|, 0.5, -|v255| ; encoding: 
[0x05,0x05,0x37,0xd6,0x7f,0xe0,0xfd,0xa7] + +v_div_fmas_f32 v5, null, exec_hi, -|exec_hi| +// GFX13: v_div_fmas_f32 v5, null, exec_hi, -|exec_hi| ; encoding: [0x05,0x04,0x37,0xd6,0x7c,0xfe,0xfc,0x81] + +v_div_fmas_f32 v5, -1, -|m0|, -|m0| +// GFX13: v_div_fmas_f32 v5, -1, -|m0|, -|m0| ; encoding: [0x05,0x06,0x37,0xd6,0xc1,0xfa,0xf4,0xc1] + +v_div_fmas_f32 v5, 0.5, -|vcc_lo|, 0.5 mul:2 +// GFX13: v_div_fmas_f32 v5, 0.5, -|vcc_lo|, 0.5 mul:2 ; encoding: [0x05,0x02,0x37,0xd6,0xf0,0xd4,0xc0,0x4b] + +v_div_fmas_f32 v5, vcc_lo, v2, v3 +// GFX13: v_div_fmas_f32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x37,0xd6,0x6a,0x04,0x0e,0x04] + +v_div_fmas_f32 v5, vcc_hi, v255, vcc_hi +// GFX13: v_div_fmas_f32 v5, vcc_hi, v255, vcc_hi ; encoding: [0x05,0x00,0x37,0xd6,0x6b,0xfe,0xaf,0x01] + +v_div_fmas_f32 v5, -|ttmp15|, -|ttmp15|, ttmp15 +// GFX13: v_div_fmas_f32 v5, -|ttmp15|, -|ttmp15|, ttmp15 ; encoding: [0x05,0x03,0x37,0xd6,0x7b,0xf6,0xec,0x61] + +v_div_fmas_f32 v5, m0, 0.5, v255 +// GFX13: v_div_fmas_f32 v5, m0, 0.5, v255 ; encoding: [0x05,0x00,0x37,0xd6,0x7d,0xe0,0xfd,0x07] + +v_div_fmas_f32 v5, -|exec_lo|, exec_lo, -|exec_lo| +// GFX13: v_div_fmas_f32 v5, -|exec_lo|, exec_lo, -|exec_lo| ; encoding: [0x05,0x05,0x37,0xd6,0x7e,0xfc,0xf8,0xa1] + +v_div_fmas_f32 v5, -|exec_hi|, -|exec_hi|, -|exec_hi| +// GFX13: v_div_fmas_f32 v5, -|exec_hi|, -|exec_hi|, -|exec_hi| ; encoding: [0x05,0x07,0x37,0xd6,0x7f,0xfe,0xfc,0xe1] + +v_div_fmas_f32 v5, null, m0, -|m0| +// GFX13: v_div_fmas_f32 v5, null, m0, -|m0| ; encoding: [0x05,0x04,0x37,0xd6,0x7c,0xfa,0xf4,0x81] + +v_div_fmas_f32 v5, -1, -|vcc_lo|, -|vcc_lo| +// GFX13: v_div_fmas_f32 v5, -1, -|vcc_lo|, -|vcc_lo| ; encoding: [0x05,0x06,0x37,0xd6,0xc1,0xd4,0xa8,0xc1] + +v_div_fmas_f32 v5, 0.5, -|vcc_hi|, 0.5 mul:2 +// GFX13: v_div_fmas_f32 v5, 0.5, -|vcc_hi|, 0.5 mul:2 ; encoding: [0x05,0x02,0x37,0xd6,0xf0,0xd6,0xc0,0x4b] + +v_div_fmas_f32 v5, v1, 0xaf123456, 0xaf123456 +// GFX13: v_div_fmas_f32 v5, v1, 0xaf123456, 0xaf123456 ; encoding: 
[0x05,0x00,0x37,0xd6,0x01,0xff,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_fmas_f32 v5, v255, src_scc, src_scc +// GFX13: v_div_fmas_f32 v5, v255, src_scc, src_scc ; encoding: [0x05,0x00,0x37,0xd6,0xff,0xfb,0xf5,0x03] + +v_div_fmas_f32 v5, s105, s105, s105 +// GFX13: v_div_fmas_f32 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x37,0xd6,0x69,0xd2,0xa4,0x01] + +v_div_fmas_f32 v5, src_scc, -1, -1 mul:4 +// GFX13: v_div_fmas_f32 v5, src_scc, -1, -1 mul:4 ; encoding: [0x05,0x00,0x37,0xd6,0xfd,0x82,0x05,0x13] + +v_div_fmas_f32 v255, -|0xaf123456|, null, null clamp div:2 +// GFX13: v_div_fmas_f32 v255, -|0xaf123456|, null, null clamp div:2 ; encoding: [0xff,0x81,0x37,0xd6,0xff,0xf8,0xf0,0x39,0x56,0x34,0x12,0xaf] + +v_div_fmas_f64 v[5:6], v[1:2], 0xaf123456, 0xaf123456 +// GFX13: v_div_fmas_f64 v[5:6], v[1:2], 0xaf123456, 0xaf123456 ; encoding: [0x05,0x00,0x38,0xd6,0x01,0xff,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_fmas_f64 v[5:6], v[254:255], src_scc, v[3:4] +// GFX13: v_div_fmas_f64 v[5:6], v[254:255], src_scc, v[3:4] ; encoding: [0x05,0x00,0x38,0xd6,0xfe,0xfb,0x0d,0x04] + +v_div_fmas_f64 v[5:6], s[104:105], |s[104:105]|, s[104:105] +// GFX13: v_div_fmas_f64 v[5:6], s[104:105], |s[104:105]|, s[104:105] ; encoding: [0x05,0x02,0x38,0xd6,0x68,0xd0,0xa0,0x01] + +v_div_fmas_f64 v[5:6], -|vcc|, v[2:3], -|v[254:255]| +// GFX13: v_div_fmas_f64 v[5:6], -|vcc|, v[2:3], -|v[254:255]| ; encoding: [0x05,0x05,0x38,0xd6,0x6a,0x04,0xfa,0xa7] + +v_div_fmas_f64 v[5:6], -|ttmp[14:15]|, -|ttmp[14:15]|, -|ttmp[14:15]| +// GFX13: v_div_fmas_f64 v[5:6], -|ttmp[14:15]|, -|ttmp[14:15]|, -|ttmp[14:15]| ; encoding: [0x05,0x07,0x38,0xd6,0x7a,0xf4,0xe8,0xe1] + +v_div_fmas_f64 v[5:6], -|exec|, -|v[254:255]|, null +// GFX13: v_div_fmas_f64 v[5:6], -|exec|, -|v[254:255]|, null ; encoding: [0x05,0x03,0x38,0xd6,0x7e,0xfc,0xf3,0x61] + +v_div_fmas_f64 v[5:6], null, 0.5, -src_scc +// GFX13: v_div_fmas_f64 v[5:6], null, 0.5, -src_scc ; encoding: [0x05,0x00,0x38,0xd6,0x7c,0xe0,0xf5,0x83] + +v_div_fmas_f64 v[5:6], 
-1, -exec, |exec| +// GFX13: v_div_fmas_f64 v[5:6], -1, -exec, |exec| ; encoding: [0x05,0x04,0x38,0xd6,0xc1,0xfc,0xf8,0x41] + +v_div_fmas_f64 v[5:6], 0.5, -|vcc|, -|vcc| mul:2 +// GFX13: v_div_fmas_f64 v[5:6], 0.5, -|vcc|, -|vcc| mul:2 ; encoding: [0x05,0x06,0x38,0xd6,0xf0,0xd4,0xa8,0xc9] + +v_div_fmas_f64 v[5:6], -|src_scc|, -1, 0.5 mul:4 +// GFX13: v_div_fmas_f64 v[5:6], -|src_scc|, -1, 0.5 mul:4 ; encoding: [0x05,0x01,0x38,0xd6,0xfd,0x82,0xc1,0x33] + +v_div_fmas_f64 v[254:255], 0xaf123456, null, -1 clamp div:2 +// GFX13: v_div_fmas_f64 v[254:255], 0xaf123456, null, -1 clamp div:2 ; encoding: [0xfe,0x80,0x38,0xd6,0xff,0xf8,0x04,0x1b,0x56,0x34,0x12,0xaf] + +v_div_scale_f32 v5, vcc_lo, v1, v2, s3 +// W32: v_div_scale_f32 v5, vcc_lo, v1, v2, s3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0x05,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, v255, s2, s105 +// W32: v_div_scale_f32 v5, vcc_lo, v255, s2, s105 ; encoding: [0x05,0x6a,0xfc,0xd6,0xff,0x05,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, s1, v255, exec_hi +// W32: v_div_scale_f32 v5, vcc_lo, s1, v255, exec_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0xfe,0xff,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, s105, s105, exec_lo +// W32: v_div_scale_f32 v5, vcc_lo, s105, s105, exec_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x69,0xd2,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, vcc_lo, ttmp15, v3 +// W32: v_div_scale_f32 v5, vcc_lo, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6a,0xf6,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, vcc_hi, 0xaf123456, v255 +// W32: v_div_scale_f32 v5, vcc_lo, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -ttmp15, -src_scc, -ttmp15 +// W32: v_div_scale_f32 v5, vcc_lo, -ttmp15, -src_scc, -ttmp15 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7b,0xfa,0xed,0xe1] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, m0, 0.5, m0 +// W32: v_div_scale_f32 v5, vcc_lo, m0, 0.5, m0 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7d,0xe0,0xf5,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, exec_lo, -1, vcc_hi +// W32: v_div_scale_f32 v5, vcc_lo, exec_lo, -1, vcc_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x7e,0x82,0xad,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -exec_hi, null, -vcc_lo +// W32: v_div_scale_f32 v5, vcc_lo, -exec_hi, null, -vcc_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x7f,0xf8,0xa8,0xa1] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, null, exec_lo, neg(0xaf123456) +// W32: v_div_scale_f32 v5, vcc_lo, null, exec_lo, neg(0xaf123456) ; encoding: [0x05,0x6a,0xfc,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -1, -exec_hi, -src_scc +// W32: v_div_scale_f32 v5, vcc_lo, -1, -exec_hi, -src_scc ; encoding: [0x05,0x6a,0xfc,0xd6,0xc1,0xfe,0xf4,0xc3] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, 0.5, -m0, 0.5 mul:2 +// W32: v_div_scale_f32 v5, vcc_lo, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x6a,0xfc,0xd6,0xf0,0xfa,0xc0,0x4b] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc_lo, -src_scc, vcc_lo, -1 mul:4 +// W32: v_div_scale_f32 v5, vcc_lo, -src_scc, vcc_lo, -1 mul:4 ; encoding: [0x05,0x6a,0xfc,0xd6,0xfd,0xd4,0x04,0x33] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_div_scale_f32 v255, 
vcc_lo, neg(0xaf123456), -vcc_hi, null clamp div:2 +// W32: v_div_scale_f32 v255, vcc_lo, neg(0xaf123456), -vcc_hi, null clamp div:2 ; encoding: [0xff,0xea,0xfc,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_div_scale_f32 v5, vcc, v1, v2, s3 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, v1, v2, s3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0x05,0x0e,0x00] + +v_div_scale_f32 v5, vcc, v255, s2, s105 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, v255, s2, s105 ; encoding: [0x05,0x6a,0xfc,0xd6,0xff,0x05,0xa4,0x01] + +v_div_scale_f32 v5, vcc, s1, v255, exec_hi +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, s1, v255, exec_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x01,0xfe,0xff,0x01] + +v_div_scale_f32 v5, vcc, s105, s105, exec_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, s105, s105, exec_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x69,0xd2,0xf8,0x01] + +v_div_scale_f32 v5, vcc, vcc_lo, ttmp15, v3 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6a,0xf6,0x0c,0x04] + +v_div_scale_f32 v5, vcc, vcc_hi, 0xaf123456, v255 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x6a,0xfc,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_div_scale_f32 v5, vcc, -ttmp15, -src_scc, -ttmp15 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -ttmp15, -src_scc, -ttmp15 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7b,0xfa,0xed,0xe1] + +v_div_scale_f32 v5, vcc, m0, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, 
vcc, m0, 0.5, m0 ; encoding: [0x05,0x6a,0xfc,0xd6,0x7d,0xe0,0xf5,0x01] + +v_div_scale_f32 v5, vcc, exec_lo, -1, vcc_hi +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, exec_lo, -1, vcc_hi ; encoding: [0x05,0x6a,0xfc,0xd6,0x7e,0x82,0xad,0x01] + +v_div_scale_f32 v5, vcc, -exec_hi, null, -vcc_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -exec_hi, null, -vcc_lo ; encoding: [0x05,0x6a,0xfc,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_div_scale_f32 v5, vcc, null, exec_lo, neg(0xaf123456) +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, null, exec_lo, neg(0xaf123456) ; encoding: [0x05,0x6a,0xfc,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_div_scale_f32 v5, vcc, -1, -exec_hi, -src_scc +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -1, -exec_hi, -src_scc ; encoding: [0x05,0x6a,0xfc,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_div_scale_f32 v5, vcc, 0.5, -m0, 0.5 mul:2 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x6a,0xfc,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_div_scale_f32 v5, vcc, -src_scc, vcc_lo, -1 mul:4 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_div_scale_f32 v5, vcc, -src_scc, vcc_lo, -1 mul:4 ; encoding: [0x05,0x6a,0xfc,0xd6,0xfd,0xd4,0x04,0x33] + +v_div_scale_f32 v255, vcc, neg(0xaf123456), -vcc_hi, null clamp div:2 +// W32-ERR: :[[@LINE-1]]:23: error: invalid operand for instruction +// W64: v_div_scale_f32 v255, vcc, neg(0xaf123456), -vcc_hi, null clamp div:2 ; encoding: [0xff,0xea,0xfc,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_div_scale_f64 v[5:6], vcc_lo, v[1:2], v[2:3], v[3:4] +// W32: v_div_scale_f64 v[5:6], vcc_lo, v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x6a,0xfd,0xd6,0x01,0x05,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:25: error: 
invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, v[254:255], v[254:255], s[6:7] +// W32: v_div_scale_f64 v[5:6], vcc_lo, v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x6a,0xfd,0xd6,0xfe,0xfd,0x1b,0x00] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, s[2:3], s[4:5], v[254:255] +// W32: v_div_scale_f64 v[5:6], vcc_lo, s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x6a,0xfd,0xd6,0x02,0x08,0xf8,0x07] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -s[104:105], s[104:105], -s[104:105] +// W32: v_div_scale_f64 v[5:6], vcc_lo, -s[104:105], s[104:105], -s[104:105] ; encoding: [0x05,0x6a,0xfd,0xd6,0x68,0xd0,0xa0,0xa1] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, vcc, -ttmp[14:15], -ttmp[14:15] +// W32: v_div_scale_f64 v[5:6], vcc_lo, vcc, -ttmp[14:15], -ttmp[14:15] ; encoding: [0x05,0x6a,0xfd,0xd6,0x6a,0xf4,0xe8,0xc1] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -ttmp[14:15], 0xaf123456, null +// W32: v_div_scale_f64 v[5:6], vcc_lo, -ttmp[14:15], 0xaf123456, null ; encoding: [0x05,0x6a,0xfd,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -exec, -src_scc, -exec +// W32: v_div_scale_f64 v[5:6], vcc_lo, -exec, -src_scc, -exec ; encoding: [0x05,0x6a,0xfd,0xd6,0x7e,0xfa,0xf9,0xe1] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, null, 0.5, vcc +// W32: v_div_scale_f64 v[5:6], vcc_lo, null, 0.5, vcc ; encoding: [0x05,0x6a,0xfd,0xd6,0x7c,0xe0,0xa9,0x01] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -1, -1, 0xaf123456 +// W32: v_div_scale_f64 v[5:6], vcc_lo, -1, -1, 0xaf123456 ; encoding: 
[0x05,0x6a,0xfd,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, 0.5, null, -src_scc mul:2 +// W32: v_div_scale_f64 v[5:6], vcc_lo, 0.5, null, -src_scc mul:2 ; encoding: [0x05,0x6a,0xfd,0xd6,0xf0,0xf8,0xf4,0x8b] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc_lo, -src_scc, -exec, 0.5 mul:4 +// W32: v_div_scale_f64 v[5:6], vcc_lo, -src_scc, -exec, 0.5 mul:4 ; encoding: [0x05,0x6a,0xfd,0xd6,0xfd,0xfc,0xc0,0x73] +// W64-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction + +v_div_scale_f64 v[254:255], vcc_lo, 0xaf123456, -vcc, -1 clamp div:2 +// W32: v_div_scale_f64 v[254:255], vcc_lo, 0xaf123456, -vcc, -1 clamp div:2 ; encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_div_scale_f64 v[5:6], vcc, v[1:2], v[2:3], v[3:4] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x6a,0xfd,0xd6,0x01,0x05,0x0e,0x04] + +v_div_scale_f64 v[5:6], vcc, v[254:255], v[254:255], s[6:7] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x6a,0xfd,0xd6,0xfe,0xfd,0x1b,0x00] + +v_div_scale_f64 v[5:6], vcc, s[2:3], s[4:5], v[254:255] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x6a,0xfd,0xd6,0x02,0x08,0xf8,0x07] + +v_div_scale_f64 v[5:6], vcc, -s[104:105], s[104:105], -s[104:105] +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -s[104:105], s[104:105], -s[104:105] ; encoding: [0x05,0x6a,0xfd,0xd6,0x68,0xd0,0xa0,0xa1] + +v_div_scale_f64 v[5:6], vcc, vcc, -ttmp[14:15], -ttmp[14:15] 
+// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, vcc, -ttmp[14:15], -ttmp[14:15] ; encoding: [0x05,0x6a,0xfd,0xd6,0x6a,0xf4,0xe8,0xc1] + +v_div_scale_f64 v[5:6], vcc, -ttmp[14:15], 0xaf123456, null +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -ttmp[14:15], 0xaf123456, null ; encoding: [0x05,0x6a,0xfd,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] + +v_div_scale_f64 v[5:6], vcc, -exec, -src_scc, -exec +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -exec, -src_scc, -exec ; encoding: [0x05,0x6a,0xfd,0xd6,0x7e,0xfa,0xf9,0xe1] + +v_div_scale_f64 v[5:6], vcc, null, 0.5, vcc +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, null, 0.5, vcc ; encoding: [0x05,0x6a,0xfd,0xd6,0x7c,0xe0,0xa9,0x01] + +v_div_scale_f64 v[5:6], vcc, -1, -1, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -1, -1, 0xaf123456 ; encoding: [0x05,0x6a,0xfd,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_div_scale_f64 v[5:6], vcc, 0.5, null, -src_scc mul:2 +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, 0.5, null, -src_scc mul:2 ; encoding: [0x05,0x6a,0xfd,0xd6,0xf0,0xf8,0xf4,0x8b] + +v_div_scale_f64 v[5:6], vcc, -src_scc, -exec, 0.5 mul:4 +// W32-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction +// W64: v_div_scale_f64 v[5:6], vcc, -src_scc, -exec, 0.5 mul:4 ; encoding: [0x05,0x6a,0xfd,0xd6,0xfd,0xfc,0xc0,0x73] + +v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2 ; encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, v1, v2, s3 
+// GFX13: v_fma_dx9_zero_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00] + +v_fma_dx9_zero_f32 v5, v255, s2, s105 +// GFX13: v_fma_dx9_zero_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x09,0xd6,0xff,0x05,0xa4,0x01] + +v_fma_dx9_zero_f32 v5, s1, v255, exec_hi +// GFX13: v_fma_dx9_zero_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x09,0xd6,0x01,0xfe,0xff,0x01] + +v_fma_dx9_zero_f32 v5, s105, s105, exec_lo +// GFX13: v_fma_dx9_zero_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x09,0xd6,0x69,0xd2,0xf8,0x01] + +v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x09,0xd6,0x6a,0xf6,0x0c,0x04] + +v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x09,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x09,0xd6,0x7b,0xfa,0xed,0xe1] + +v_fma_dx9_zero_f32 v5, m0, 0.5, m0 +// GFX13: v_fma_dx9_zero_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x09,0xd6,0x7d,0xe0,0xf5,0x01] + +v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x09,0xd6,0x7e,0x82,0xad,0x01] + +v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x09,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x09,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x09,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 ; 
encoding: [0x05,0x00,0x09,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x09,0xd6,0xfd,0xd4,0x04,0x33] + +v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_fma_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_fma_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x4b,0xd7,0x01,0x05,0x0e,0x00] + +v_fma_f16 v5.l, v255.l, s2, s105 +// GFX13: v_fma_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x4b,0xd7,0xff,0x05,0xa4,0x01] + +v_fma_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_fma_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x4b,0xd7,0x01,0xfe,0xff,0x01] + +v_fma_f16 v5.l, s105, s105, exec_lo +// GFX13: v_fma_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x4b,0xd7,0x69,0xd2,0xf8,0x01] + +v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x4b,0xd7,0x6a,0xf6,0x0c,0x04] + +v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x4b,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x4b,0xd7,0x7b,0xfa,0xed,0xe1] + +v_fma_f16 v5.l, m0, 0.5, m0 +// GFX13: v_fma_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x4b,0xd7,0x7d,0xe0,0xf5,0x01] + +v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x4b,0xd7,0x7e,0x82,0xad,0x01] + +v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x4b,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: 
v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x4b,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x4b,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x4b,0xd7,0xf0,0xfa,0xc0,0x43] + +v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x4b,0xd7,0xfd,0xd4,0x04,0x23] + +v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x4b,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_fma_f32 v5, v1, v2, s3 +// GFX13: v_fma_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00] + +v_fma_f32 v5, v255, s2, s105 +// GFX13: v_fma_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x13,0xd6,0xff,0x05,0xa4,0x01] + +v_fma_f32 v5, s1, v255, exec_hi +// GFX13: v_fma_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x13,0xd6,0x01,0xfe,0xff,0x01] + +v_fma_f32 v5, s105, s105, exec_lo +// GFX13: v_fma_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x13,0xd6,0x69,0xd2,0xf8,0x01] + +v_fma_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x13,0xd6,0x6a,0xf6,0x0c,0x04] + +v_fma_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_fma_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x13,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_fma_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x13,0xd6,0x7b,0xfa,0xed,0xe1] + +v_fma_f32 v5, m0, 0.5, m0 +// GFX13: v_fma_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x13,0xd6,0x7d,0xe0,0xf5,0x01] + +v_fma_f32 v5, |exec_lo|, -1, 
vcc_hi +// GFX13: v_fma_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x13,0xd6,0x7e,0x82,0xad,0x01] + +v_fma_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_fma_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x13,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_fma_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_fma_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x13,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_fma_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_fma_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x13,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_fma_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_fma_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x13,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_fma_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_fma_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x13,0xd6,0xfd,0xd4,0x04,0x33] + +v_fma_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_fma_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x13,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_fma_f64 v[5:6], v[1:2], v[2:3], v[3:4] +// GFX13: v_fma_f64 v[5:6], v[1:2], v[2:3], v[3:4] ; encoding: [0x05,0x00,0x14,0xd6,0x01,0x05,0x0e,0x04] + +v_fma_f64 v[5:6], v[254:255], v[254:255], s[6:7] +// GFX13: v_fma_f64 v[5:6], v[254:255], v[254:255], s[6:7] ; encoding: [0x05,0x00,0x14,0xd6,0xfe,0xfd,0x1b,0x00] + +v_fma_f64 v[5:6], s[2:3], s[4:5], v[254:255] +// GFX13: v_fma_f64 v[5:6], s[2:3], s[4:5], v[254:255] ; encoding: [0x05,0x00,0x14,0xd6,0x02,0x08,0xf8,0x07] + +v_fma_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| +// GFX13: v_fma_f64 v[5:6], -|s[104:105]|, s[104:105], -|s[104:105]| ; encoding: [0x05,0x05,0x14,0xd6,0x68,0xd0,0xa0,0xa1] + +v_fma_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| +// GFX13: v_fma_f64 v[5:6], vcc, -|ttmp[14:15]|, -|ttmp[14:15]| ; encoding: [0x05,0x06,0x14,0xd6,0x6a,0xf4,0xe8,0xc1] + +v_fma_f64 v[5:6], -|ttmp[14:15]|, 0xaf123456, null +// GFX13: v_fma_f64 v[5:6], 
-|ttmp[14:15]|, 0xaf123456, null ; encoding: [0x05,0x01,0x14,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] + +v_fma_f64 v[5:6], -|exec|, -|src_scc|, -|exec| +// GFX13: v_fma_f64 v[5:6], -|exec|, -|src_scc|, -|exec| ; encoding: [0x05,0x07,0x14,0xd6,0x7e,0xfa,0xf9,0xe1] + +v_fma_f64 v[5:6], null, 0.5, vcc +// GFX13: v_fma_f64 v[5:6], null, 0.5, vcc ; encoding: [0x05,0x00,0x14,0xd6,0x7c,0xe0,0xa9,0x01] + +v_fma_f64 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_fma_f64 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x14,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_fma_f64 v[5:6], 0.5, null, -|src_scc| mul:2 +// GFX13: v_fma_f64 v[5:6], 0.5, null, -|src_scc| mul:2 ; encoding: [0x05,0x04,0x14,0xd6,0xf0,0xf8,0xf4,0x8b] + +v_fma_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 +// GFX13: v_fma_f64 v[5:6], -|src_scc|, -|exec|, 0.5 mul:4 ; encoding: [0x05,0x03,0x14,0xd6,0xfd,0xfc,0xc0,0x73] + +v_fma_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 +// GFX13: v_fma_f64 v[254:255], 0xaf123456, -|vcc|, -1 clamp div:2 ; encoding: [0xfe,0x82,0x14,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, v1, v2, s3 +// GFX13: v_fma_dx9_zero_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00] + +v_fma_dx9_zero_f32 v5, v255, s2, s105 +// GFX13: v_fma_dx9_zero_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x09,0xd6,0xff,0x05,0xa4,0x01] + +v_fma_dx9_zero_f32 v5, s1, v255, exec_hi +// GFX13: v_fma_dx9_zero_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x09,0xd6,0x01,0xfe,0xff,0x01] + +v_fma_dx9_zero_f32 v5, s105, s105, exec_lo +// GFX13: v_fma_dx9_zero_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x09,0xd6,0x69,0xd2,0xf8,0x01] + +v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x09,0xd6,0x6a,0xf6,0x0c,0x04] + +v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_fma_dx9_zero_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: 
[0x05,0x00,0x09,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_fma_dx9_zero_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x09,0xd6,0x7b,0xfa,0xed,0xe1] + +v_fma_dx9_zero_f32 v5, m0, 0.5, m0 +// GFX13: v_fma_dx9_zero_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x09,0xd6,0x7d,0xe0,0xf5,0x01] + +v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_fma_dx9_zero_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x09,0xd6,0x7e,0x82,0xad,0x01] + +v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_fma_dx9_zero_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x09,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_fma_dx9_zero_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x09,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_fma_dx9_zero_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x09,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_fma_dx9_zero_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x09,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x09,0xd6,0xfd,0xd4,0x04,0x33] + +v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_ldexp_f32 v5, v1, v2 +// GFX13: v_ldexp_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x62,0xd7,0x01,0x05,0x02,0x02] + +v_ldexp_f32 v5, v255, v255 +// GFX13: v_ldexp_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x62,0xd7,0xff,0xff,0x03,0x02] + +v_ldexp_f32 v5, s1, s2 +// GFX13: v_ldexp_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x62,0xd7,0x01,0x04,0x00,0x02] + +v_ldexp_f32 v5, s105, s105 
+// GFX13: v_ldexp_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x62,0xd7,0x69,0xd2,0x00,0x02] + +v_ldexp_f32 v5, vcc_lo, ttmp15 +// GFX13: v_ldexp_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x62,0xd7,0x6a,0xf6,0x00,0x02] + +v_ldexp_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_ldexp_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x62,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ldexp_f32 v5, ttmp15, src_scc +// GFX13: v_ldexp_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x62,0xd7,0x7b,0xfa,0x01,0x02] + +v_ldexp_f32 v5, m0, 0.5 +// GFX13: v_ldexp_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x62,0xd7,0x7d,0xe0,0x01,0x02] + +v_ldexp_f32 v5, exec_lo, -1 +// GFX13: v_ldexp_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x62,0xd7,0x7e,0x82,0x01,0x02] + +v_ldexp_f32 v5, exec_hi, null +// GFX13: v_ldexp_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x62,0xd7,0x7f,0xf8,0x00,0x02] + +v_ldexp_f32 v5, null, exec_lo +// GFX13: v_ldexp_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x62,0xd7,0x7c,0xfc,0x00,0x02] + +v_ldexp_f32 v5, -1, exec_hi +// GFX13: v_ldexp_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x62,0xd7,0xc1,0xfe,0x00,0x02] + +v_ldexp_f32 v5, 0.5, m0 mul:2 +// GFX13: v_ldexp_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x62,0xd7,0xf0,0xfa,0x00,0x0a] + +v_ldexp_f32 v5, src_scc, vcc_lo mul:4 +// GFX13: v_ldexp_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x62,0xd7,0xfd,0xd4,0x00,0x12] + +v_ldexp_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 +// GFX13: v_ldexp_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x62,0xd7,0xff,0xd6,0x00,0x3a,0x56,0x34,0x12,0xaf] + +v_ldexp_f64 v[5:6], v[1:2], v2 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], v2 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0x05,0x02,0x02] + +v_ldexp_f64 v[5:6], v[1:2], v255 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], v255 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0xff,0x03,0x02] + +v_ldexp_f64 v[5:6], v[1:2], s2 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], s2 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0x05,0x00,0x02] + 
+v_ldexp_f64 v[5:6], v[1:2], s105 +// GFX13: v_ldexp_f64 v[5:6], v[1:2], s105 ; encoding: [0x05,0x00,0x2b,0xd7,0x01,0xd3,0x00,0x02] + +v_ldexp_f64 v[5:6], v[254:255], ttmp15 +// GFX13: v_ldexp_f64 v[5:6], v[254:255], ttmp15 ; encoding: [0x05,0x00,0x2b,0xd7,0xfe,0xf7,0x00,0x02] + +v_ldexp_f64 v[5:6], s[2:3], vcc_hi +// GFX13: v_ldexp_f64 v[5:6], s[2:3], vcc_hi ; encoding: [0x05,0x00,0x2b,0xd7,0x02,0xd6,0x00,0x02] + +v_ldexp_f64 v[5:6], s[104:105], vcc_lo +// GFX13: v_ldexp_f64 v[5:6], s[104:105], vcc_lo ; encoding: [0x05,0x00,0x2b,0xd7,0x68,0xd4,0x00,0x02] + +v_ldexp_f64 v[5:6], vcc, m0 +// GFX13: v_ldexp_f64 v[5:6], vcc, m0 ; encoding: [0x05,0x00,0x2b,0xd7,0x6a,0xfa,0x00,0x02] + +v_ldexp_f64 v[5:6], ttmp[14:15], exec_hi +// GFX13: v_ldexp_f64 v[5:6], ttmp[14:15], exec_hi ; encoding: [0x05,0x00,0x2b,0xd7,0x7a,0xfe,0x00,0x02] + +v_ldexp_f64 v[5:6], exec, exec_lo +// GFX13: v_ldexp_f64 v[5:6], exec, exec_lo ; encoding: [0x05,0x00,0x2b,0xd7,0x7e,0xfc,0x00,0x02] + +v_ldexp_f64 v[5:6], null, null +// GFX13: v_ldexp_f64 v[5:6], null, null ; encoding: [0x05,0x00,0x2b,0xd7,0x7c,0xf8,0x00,0x02] + +v_ldexp_f64 v[5:6], -1, -1 +// GFX13: v_ldexp_f64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x2b,0xd7,0xc1,0x82,0x01,0x02] + +v_ldexp_f64 v[5:6], 0.5, 0.5 mul:2 +// GFX13: v_ldexp_f64 v[5:6], 0.5, 0.5 mul:2 ; encoding: [0x05,0x00,0x2b,0xd7,0xf0,0xe0,0x01,0x0a] + +v_ldexp_f64 v[5:6], -|src_scc|, src_scc mul:4 +// GFX13: v_ldexp_f64 v[5:6], -|src_scc|, src_scc mul:4 ; encoding: [0x05,0x01,0x2b,0xd7,0xfd,0xfa,0x01,0x32] + +v_ldexp_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 +// GFX13: v_ldexp_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x2b,0xd7,0xff,0xfe,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_lerp_u8 v5, v1, v2, s3 +// GFX13: v_lerp_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x15,0xd6,0x01,0x05,0x0e,0x00] + +v_lerp_u8 v5, v255, s2, s105 +// GFX13: v_lerp_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x15,0xd6,0xff,0x05,0xa4,0x01] + +v_lerp_u8 v5, s1, v255, 
exec_hi +// GFX13: v_lerp_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x15,0xd6,0x01,0xfe,0xff,0x01] + +v_lerp_u8 v5, s105, s105, exec_lo +// GFX13: v_lerp_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x15,0xd6,0x69,0xd2,0xf8,0x01] + +v_lerp_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_lerp_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x15,0xd6,0x6a,0xf6,0x0c,0x04] + +v_lerp_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_lerp_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x15,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_lerp_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_lerp_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x15,0xd6,0x7b,0xfa,0xed,0x01] + +v_lerp_u8 v5, m0, 0.5, m0 +// GFX13: v_lerp_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x15,0xd6,0x7d,0xe0,0xf5,0x01] + +v_lerp_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_lerp_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x15,0xd6,0x7e,0x82,0xad,0x01] + +v_lerp_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_lerp_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x15,0xd6,0x7f,0xf8,0xa8,0x01] + +v_lerp_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_lerp_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x15,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_lerp_u8 v5, -1, exec_hi, src_scc +// GFX13: v_lerp_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x15,0xd6,0xc1,0xfe,0xf4,0x03] + +v_lerp_u8 v5, 0.5, m0, 0.5 +// GFX13: v_lerp_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x15,0xd6,0xf0,0xfa,0xc0,0x03] + +v_lerp_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_lerp_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x15,0xd6,0xfd,0xd4,0x04,0x03] + +v_lerp_u8 v255, 0xaf123456, vcc_hi, null +// GFX13: v_lerp_u8 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x15,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_lshl_add_u32 v5, v1, v2, s3 +// GFX13: v_lshl_add_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x46,0xd7,0x01,0x05,0x0e,0x00] + +v_lshl_add_u32 v5, v255, s2, s105 +// GFX13: 
v_lshl_add_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x46,0xd7,0xff,0x05,0xa4,0x01] + +v_lshl_add_u32 v5, s1, v255, exec_hi +// GFX13: v_lshl_add_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x46,0xd7,0x01,0xfe,0xff,0x01] + +v_lshl_add_u32 v5, s105, s105, exec_lo +// GFX13: v_lshl_add_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x46,0xd7,0x69,0xd2,0xf8,0x01] + +v_lshl_add_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_lshl_add_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x46,0xd7,0x6a,0xf6,0x0c,0x04] + +v_lshl_add_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_lshl_add_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x46,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_lshl_add_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_lshl_add_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x46,0xd7,0x7b,0xfa,0xed,0x01] + +v_lshl_add_u32 v5, m0, 0.5, m0 +// GFX13: v_lshl_add_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x46,0xd7,0x7d,0xe0,0xf5,0x01] + +v_lshl_add_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_lshl_add_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x46,0xd7,0x7e,0x82,0xad,0x01] + +v_lshl_add_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_lshl_add_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x46,0xd7,0x7f,0xf8,0xa8,0x01] + +v_lshl_add_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_lshl_add_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x46,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_lshl_add_u32 v5, -1, exec_hi, src_scc +// GFX13: v_lshl_add_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x46,0xd7,0xc1,0xfe,0xf4,0x03] + +v_lshl_add_u32 v5, 0.5, m0, 0.5 +// GFX13: v_lshl_add_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x46,0xd7,0xf0,0xfa,0xc0,0x03] + +v_lshl_add_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_lshl_add_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x46,0xd7,0xfd,0xd4,0x04,0x03] + +v_lshl_add_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_lshl_add_u32 v255, 0xaf123456, vcc_hi, null ; encoding: 
[0xff,0x00,0x46,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_lshl_or_b32 v5, v1, v2, s3 +// GFX13: v_lshl_or_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6f,0xd7,0x01,0x05,0x0e,0x00] + +v_lshl_or_b32 v5, v255, s2, s105 +// GFX13: v_lshl_or_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6f,0xd7,0xff,0x05,0xa4,0x01] + +v_lshl_or_b32 v5, s1, v255, exec_hi +// GFX13: v_lshl_or_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6f,0xd7,0x01,0xfe,0xff,0x01] + +v_lshl_or_b32 v5, s105, s105, exec_lo +// GFX13: v_lshl_or_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6f,0xd7,0x69,0xd2,0xf8,0x01] + +v_lshl_or_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_lshl_or_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6f,0xd7,0x6a,0xf6,0x0c,0x04] + +v_lshl_or_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_lshl_or_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6f,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_lshl_or_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_lshl_or_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x6f,0xd7,0x7b,0xfa,0xed,0x01] + +v_lshl_or_b32 v5, m0, 0.5, m0 +// GFX13: v_lshl_or_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6f,0xd7,0x7d,0xe0,0xf5,0x01] + +v_lshl_or_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_lshl_or_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x6f,0xd7,0x7e,0x82,0xad,0x01] + +v_lshl_or_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_lshl_or_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x6f,0xd7,0x7f,0xf8,0xa8,0x01] + +v_lshl_or_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_lshl_or_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x6f,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_lshl_or_b32 v5, -1, exec_hi, src_scc +// GFX13: v_lshl_or_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x6f,0xd7,0xc1,0xfe,0xf4,0x03] + +v_lshl_or_b32 v5, 0.5, m0, 0.5 +// GFX13: v_lshl_or_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x6f,0xd7,0xf0,0xfa,0xc0,0x03] + +v_lshl_or_b32 v5, src_scc, vcc_lo, -1 +// GFX13: 
v_lshl_or_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x6f,0xd7,0xfd,0xd4,0x04,0x03] + +v_lshl_or_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_lshl_or_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x6f,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_lshlrev_b16 v5.l, v1.l, v2.l +// GFX13: v_lshlrev_b16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x02,0x02] + +v_lshlrev_b16 v5.l, v255.l, v255.l +// GFX13: v_lshlrev_b16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x14,0xd7,0xff,0xff,0x03,0x02] + +v_lshlrev_b16 v5.l, s1, s2 +// GFX13: v_lshlrev_b16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x04,0x00,0x02] + +v_lshlrev_b16 v5.l, s105, s105 +// GFX13: v_lshlrev_b16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x69,0xd2,0x00,0x02] + +v_lshlrev_b16 v5.l, vcc_lo, ttmp15 +// GFX13: v_lshlrev_b16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x6a,0xf6,0x00,0x02] + +v_lshlrev_b16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_lshlrev_b16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x14,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_lshlrev_b16 v5.l, ttmp15, src_scc +// GFX13: v_lshlrev_b16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x14,0xd7,0x7b,0xfa,0x01,0x02] + +v_lshlrev_b16 v5.l, m0, 0.5 +// GFX13-ASM: v_lshlrev_b16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x14,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_lshlrev_b16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x14,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_lshlrev_b16 v5.l, exec_lo, -1 +// GFX13: v_lshlrev_b16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x14,0xd7,0x7e,0x82,0x01,0x02] + +v_lshlrev_b16 v5.l, exec_hi, null +// GFX13: v_lshlrev_b16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x14,0xd7,0x7f,0xf8,0x00,0x02] + +v_lshlrev_b16 v5.l, null, exec_lo +// GFX13: v_lshlrev_b16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x7c,0xfc,0x00,0x02] + +v_lshlrev_b16 v5.l, -1, exec_hi +// GFX13: v_lshlrev_b16 v5.l, -1, exec_hi ; encoding: 
[0x05,0x00,0x14,0xd7,0xc1,0xfe,0x00,0x02] + +v_lshlrev_b16 v5.l, 0.5, m0 +// GFX13-ASM: v_lshlrev_b16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x14,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_lshlrev_b16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x14,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_lshlrev_b16 v5.l, src_scc, vcc_lo +// GFX13: v_lshlrev_b16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0xfd,0xd4,0x00,0x02] + +v_lshlrev_b16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_lshlrev_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x14,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_lshrrev_b16 v5.l, v1.l, v2.l +// GFX13: v_lshrrev_b16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x39,0xd7,0x01,0x05,0x02,0x02] + +v_lshrrev_b16 v5.l, v255.l, v255.l +// GFX13: v_lshrrev_b16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x39,0xd7,0xff,0xff,0x03,0x02] + +v_lshrrev_b16 v5.l, s1, s2 +// GFX13: v_lshrrev_b16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x39,0xd7,0x01,0x04,0x00,0x02] + +v_lshrrev_b16 v5.l, s105, s105 +// GFX13: v_lshrrev_b16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x39,0xd7,0x69,0xd2,0x00,0x02] + +v_lshrrev_b16 v5.l, vcc_lo, ttmp15 +// GFX13: v_lshrrev_b16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x39,0xd7,0x6a,0xf6,0x00,0x02] + +v_lshrrev_b16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_lshrrev_b16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x39,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_lshrrev_b16 v5.l, ttmp15, src_scc +// GFX13: v_lshrrev_b16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x39,0xd7,0x7b,0xfa,0x01,0x02] + +v_lshrrev_b16 v5.l, m0, 0.5 +// GFX13-ASM: v_lshrrev_b16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x39,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_lshrrev_b16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x39,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_lshrrev_b16 v5.l, exec_lo, -1 +// GFX13: v_lshrrev_b16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x39,0xd7,0x7e,0x82,0x01,0x02] + +v_lshrrev_b16 v5.l, exec_hi, null +// GFX13: v_lshrrev_b16 v5.l, 
exec_hi, null ; encoding: [0x05,0x00,0x39,0xd7,0x7f,0xf8,0x00,0x02] + +v_lshrrev_b16 v5.l, null, exec_lo +// GFX13: v_lshrrev_b16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x39,0xd7,0x7c,0xfc,0x00,0x02] + +v_lshrrev_b16 v5.l, -1, exec_hi +// GFX13: v_lshrrev_b16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x39,0xd7,0xc1,0xfe,0x00,0x02] + +v_lshrrev_b16 v5.l, 0.5, m0 +// GFX13-ASM: v_lshrrev_b16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x39,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_lshrrev_b16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x39,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_lshrrev_b16 v5.l, src_scc, vcc_lo +// GFX13: v_lshrrev_b16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x39,0xd7,0xfd,0xd4,0x00,0x02] + +v_lshrrev_b16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_lshrrev_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x39,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_lshrrev_b64 v[5:6], v1, vcc +// GFX13: v_lshrrev_b64 v[5:6], v1, vcc ; encoding: [0x05,0x00,0x00,0xd7,0x01,0xd5,0x00,0x02] + +v_lshrrev_b64 v[5:6], v255, exec +// GFX13: v_lshrrev_b64 v[5:6], v255, exec ; encoding: [0x05,0x00,0x00,0xd7,0xff,0xfd,0x00,0x02] + +v_lshrrev_b64 v[5:6], exec_lo, v[2:3] +// GFX13: v_lshrrev_b64 v[5:6], exec_lo, v[2:3] ; encoding: [0x05,0x00,0x00,0xd7,0x7e,0x04,0x02,0x02] + +v_lshrrev_b64 v[5:6], exec_hi, v[254:255] +// GFX13: v_lshrrev_b64 v[5:6], exec_hi, v[254:255] ; encoding: [0x05,0x00,0x00,0xd7,0x7f,0xfc,0x03,0x02] + +v_lshrrev_b64 v[5:6], null, null +// GFX13: v_lshrrev_b64 v[5:6], null, null ; encoding: [0x05,0x00,0x00,0xd7,0x7c,0xf8,0x00,0x02] + +v_lshrrev_b64 v[5:6], -1, -1 +// GFX13: v_lshrrev_b64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x00,0xd7,0xc1,0x82,0x01,0x02] + +v_lshrrev_b64 v[5:6], 0.5, 0xaf123456 +// GFX13: v_lshrrev_b64 v[5:6], 0.5, 0xaf123456 ; encoding: [0x05,0x00,0x00,0xd7,0xf0,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_lshrrev_b64 v[5:6], src_scc, src_scc +// GFX13: v_lshrrev_b64 v[5:6], src_scc, src_scc ; encoding: 
[0x05,0x00,0x00,0xd7,0xfd,0xfa,0x01,0x02] + +v_lshrrev_b64 v[254:255], 0xaf123456, 0.5 +// GFX13: v_lshrrev_b64 v[254:255], 0xaf123456, 0.5 ; encoding: [0xfe,0x00,0x00,0xd7,0xff,0xe0,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mad_i16 v5.l, v1.l, v2.l, s3 +// GFX13: v_mad_i16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x5e,0xd7,0x01,0x05,0x0e,0x00] + +v_mad_i16 v5.l, v255.l, s2, s105 +// GFX13: v_mad_i16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x5e,0xd7,0xff,0x05,0xa4,0x01] + +v_mad_i16 v5.l, s1, v255.l, exec_hi +// GFX13: v_mad_i16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x5e,0xd7,0x01,0xfe,0xff,0x01] + +v_mad_i16 v5.l, s105, s105, exec_lo +// GFX13: v_mad_i16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x5e,0xd7,0x69,0xd2,0xf8,0x01] + +v_mad_i16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_mad_i16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x5e,0xd7,0x6a,0xf6,0x0c,0x04] + +v_mad_i16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_mad_i16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x5e,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_mad_i16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_i16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x5e,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_i16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_mad_i16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x5e,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_i16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x5e,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_i16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_mad_i16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x5e,0xd7,0x7e,0x82,0xad,0x01] + +v_mad_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_mad_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x5e,0xd7,0x7f,0xf8,0xa8,0x01] + +v_mad_i16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_mad_i16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x5e,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_i16 v5.l, -1, exec_hi, 
src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_i16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x5e,0xd7,0xc1,0xfe,0xf4,0x03] + +v_mad_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_mad_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x5e,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_mad_i16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x5e,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_mad_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_mad_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x5e,0xd7,0xfd,0xd4,0x04,0x03] + +v_mad_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp +// GFX13: v_mad_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc0,0x5e,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_i32_i16 v5, v1.l, v2.l, v3 +// GFX13: v_mad_i32_i16 v5, v1.l, v2.l, v3 ; encoding: [0x05,0x00,0x75,0xd7,0x01,0x05,0x0e,0x04] + +v_mad_i32_i16 v5, v255.l, v255.l, s3 +// GFX13: v_mad_i32_i16 v5, v255.l, v255.l, s3 ; encoding: [0x05,0x00,0x75,0xd7,0xff,0xff,0x0f,0x00] + +v_mad_i32_i16 v5, s1, s2, v255 +// GFX13: v_mad_i32_i16 v5, s1, s2, v255 ; encoding: [0x05,0x00,0x75,0xd7,0x01,0x04,0xfc,0x07] + +v_mad_i32_i16 v5, s105, s105, s105 +// GFX13: v_mad_i32_i16 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x75,0xd7,0x69,0xd2,0xa4,0x01] + +v_mad_i32_i16 v5, vcc_lo, ttmp15, vcc_lo +// GFX13: v_mad_i32_i16 v5, vcc_lo, ttmp15, vcc_lo ; encoding: [0x05,0x00,0x75,0xd7,0x6a,0xf6,0xa8,0x01] + +v_mad_i32_i16 v5, vcc_hi, 0xfe0b, vcc_hi +// GFX13: v_mad_i32_i16 v5, vcc_hi, 0xfe0b, vcc_hi ; encoding: [0x05,0x00,0x75,0xd7,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_i32_i16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_i32_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x75,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_i32_i16 v5, m0, 0.5, m0 +// GFX13-ASM: v_mad_i32_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x75,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: 
v_mad_i32_i16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x75,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_i32_i16 v5, exec_lo, -1, exec_hi +// GFX13: v_mad_i32_i16 v5, exec_lo, -1, exec_hi ; encoding: [0x05,0x00,0x75,0xd7,0x7e,0x82,0xfd,0x01] + +v_mad_i32_i16 v5, exec_hi, null, exec_lo +// GFX13: v_mad_i32_i16 v5, exec_hi, null, exec_lo ; encoding: [0x05,0x00,0x75,0xd7,0x7f,0xf8,0xf8,0x01] + +v_mad_i32_i16 v5, null, exec_lo, null +// GFX13: v_mad_i32_i16 v5, null, exec_lo, null ; encoding: [0x05,0x00,0x75,0xd7,0x7c,0xfc,0xf0,0x01] + +v_mad_i32_i16 v5, -1, exec_hi, 0xaf123456 +// GFX13: v_mad_i32_i16 v5, -1, exec_hi, 0xaf123456 ; encoding: [0x05,0x00,0x75,0xd7,0xc1,0xfe,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_i32_i16 v5, 0.5, m0, -1 op_sel:[0,0,0,0] +// GFX13-ASM: v_mad_i32_i16 v5, 0.5, m0, -1 ; encoding: [0x05,0x00,0x75,0xd7,0xf0,0xfa,0x04,0x03] +// GFX13-DIS: v_mad_i32_i16 v5, 0x3800, m0, -1 ; encoding: [0x05,0x00,0x75,0xd7,0xff,0xfa,0x04,0x03,0x00,0x38,0x00,0x00] + +v_mad_i32_i16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_i32_i16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x75,0xd7,0xfd,0xd4,0xf4,0x03] + +v_mad_i32_i16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp +// GFX13: v_mad_i32_i16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp ; encoding: [0xff,0x90,0x75,0xd7,0xff,0xd6,0xc0,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_i32_i24 v5, v1, v2, s3 +// GFX13: v_mad_i32_i24 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0a,0xd6,0x01,0x05,0x0e,0x00] + +v_mad_i32_i24 v5, v255, s2, s105 +// GFX13: v_mad_i32_i24 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0a,0xd6,0xff,0x05,0xa4,0x01] + +v_mad_i32_i24 v5, s1, v255, exec_hi +// GFX13: v_mad_i32_i24 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0a,0xd6,0x01,0xfe,0xff,0x01] + +v_mad_i32_i24 v5, s105, s105, exec_lo +// GFX13: v_mad_i32_i24 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x0a,0xd6,0x69,0xd2,0xf8,0x01] + +v_mad_i32_i24 v5, vcc_lo, ttmp15, v3 +// GFX13: 
v_mad_i32_i24 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_mad_i32_i24 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_mad_i32_i24 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0a,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_mad_i32_i24 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_i32_i24 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x0a,0xd6,0x7b,0xfa,0xed,0x01] + +v_mad_i32_i24 v5, m0, 0.5, m0 +// GFX13: v_mad_i32_i24 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_mad_i32_i24 v5, exec_lo, -1, vcc_hi +// GFX13: v_mad_i32_i24 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x0a,0xd6,0x7e,0x82,0xad,0x01] + +v_mad_i32_i24 v5, exec_hi, null, vcc_lo +// GFX13: v_mad_i32_i24 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x0a,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_i32_i24 v5, null, exec_lo, 0xaf123456 +// GFX13: v_mad_i32_i24 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x0a,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_i32_i24 v5, -1, exec_hi, src_scc +// GFX13: v_mad_i32_i24 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x0a,0xd6,0xc1,0xfe,0xf4,0x03] + +v_mad_i32_i24 v5, 0.5, m0, 0.5 +// GFX13: v_mad_i32_i24 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x0a,0xd6,0xf0,0xfa,0xc0,0x03] + +v_mad_i32_i24 v5, src_scc, vcc_lo, -1 +// GFX13: v_mad_i32_i24 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x0a,0xd6,0xfd,0xd4,0x04,0x03] + +v_mad_i32_i24 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_mad_i32_i24 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x0a,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mad_co_i64_i32 v[5:6], s6, s105, s105, s[6:7] +// W32: v_mad_co_i64_i32 v[5:6], s6, s105, s105, s[6:7] ; encoding: [0x05,0x06,0xf9,0xd6,0x69,0xd2,0x18,0x00] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, ttmp15, ttmp15, s[104:105] +// W32: v_mad_co_i64_i32 v[5:6], s6, ttmp15, ttmp15, s[104:105] ; encoding: 
[0x05,0x06,0xf9,0xd6,0x7b,0xf6,0xa0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, m0, 0.5, ttmp[14:15] +// W32: v_mad_co_i64_i32 v[5:6], s6, m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x06,0xf9,0xd6,0x7d,0xe0,0xe9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, exec_lo, -1, exec +// W32: v_mad_co_i64_i32 v[5:6], s6, exec_lo, -1, exec ; encoding: [0x05,0x06,0xf9,0xd6,0x7e,0x82,0xf9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s6, exec_hi, null, vcc +// W32: v_mad_co_i64_i32 v[5:6], s6, exec_hi, null, vcc ; encoding: [0x05,0x06,0xf9,0xd6,0x7f,0xf8,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s105, null, exec_lo, null +// W32: v_mad_co_i64_i32 v[5:6], s105, null, exec_lo, null ; encoding: [0x05,0x69,0xf9,0xd6,0x7c,0xfc,0xf0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], vcc_lo, -1, exec_hi, -1 +// W32: v_mad_co_i64_i32 v[5:6], vcc_lo, -1, exec_hi, -1 ; encoding: [0x05,0x6a,0xf9,0xd6,0xc1,0xfe,0x04,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 +// W32: v_mad_co_i64_i32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6b,0xf9,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc +// W32: v_mad_co_i64_i32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7b,0xf9,0xd6,0xfd,0xd4,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_i64_i32 v[5:6], s[12:13], s105, s105, s[6:7] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], s105, s105, s[6:7] ; encoding: 
[0x05,0x0c,0xf9,0xd6,0x69,0xd2,0x18,0x00] + +v_mad_co_i64_i32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x0c,0xf9,0xd6,0x7b,0xf6,0xa0,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x0c,0xf9,0xd6,0x7d,0xe0,0xe9,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], exec_lo, -1, exec +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], exec_lo, -1, exec ; encoding: [0x05,0x0c,0xf9,0xd6,0x7e,0x82,0xf9,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], exec_hi, null, vcc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], exec_hi, null, vcc ; encoding: [0x05,0x0c,0xf9,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_co_i64_i32 v[5:6], s[12:13], null, exec_lo, null +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[12:13], null, exec_lo, null ; encoding: [0x05,0x0c,0xf9,0xd6,0x7c,0xfc,0xf0,0x01] + +v_mad_co_i64_i32 v[5:6], s[104:105], -1, exec_hi, -1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], s[104:105], -1, exec_hi, -1 ; encoding: [0x05,0x68,0xf9,0xd6,0xc1,0xfe,0x04,0x03] + +v_mad_co_i64_i32 v[5:6], vcc, 0.5, m0, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], vcc, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6a,0xf9,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_co_i64_i32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_i64_i32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc ; encoding: 
[0x05,0x7a,0xf9,0xd6,0xfd,0xd4,0xf4,0x03] + +v_mad_co_i64_i32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp +// GFX13: v_mad_co_i64_i32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xf9,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] + +v_mad_u16 v5.l, v1.l, v2.l, s3 +// GFX13: v_mad_u16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x40,0xd7,0x01,0x05,0x0e,0x00] + +v_mad_u16 v5.l, v255.l, s2, s105 +// GFX13: v_mad_u16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x40,0xd7,0xff,0x05,0xa4,0x01] + +v_mad_u16 v5.l, s1, v255.l, exec_hi +// GFX13: v_mad_u16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x40,0xd7,0x01,0xfe,0xff,0x01] + +v_mad_u16 v5.l, s105, s105, exec_lo +// GFX13: v_mad_u16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x40,0xd7,0x69,0xd2,0xf8,0x01] + +v_mad_u16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_mad_u16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x40,0xd7,0x6a,0xf6,0x0c,0x04] + +v_mad_u16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_mad_u16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x40,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_mad_u16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_u16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x40,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_u16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_mad_u16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x40,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_u16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x40,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_u16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_mad_u16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x40,0xd7,0x7e,0x82,0xad,0x01] + +v_mad_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_mad_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x40,0xd7,0x7f,0xf8,0xa8,0x01] + +v_mad_u16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_mad_u16 v5.l, null, exec_lo, 0xfe0b ; encoding: 
[0x05,0x00,0x40,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x40,0xd7,0xc1,0xfe,0xf4,0x03] + +v_mad_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_mad_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x40,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_mad_u16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x40,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_mad_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_mad_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x40,0xd7,0xfd,0xd4,0x04,0x03] + +v_mad_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp +// GFX13: v_mad_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc0,0x40,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_u32_u16 v5, v1.l, v2.l, v3 +// GFX13: v_mad_u32_u16 v5, v1.l, v2.l, v3 ; encoding: [0x05,0x00,0x73,0xd7,0x01,0x05,0x0e,0x04] + +v_mad_u32_u16 v5, v255.l, v255.l, s3 +// GFX13: v_mad_u32_u16 v5, v255.l, v255.l, s3 ; encoding: [0x05,0x00,0x73,0xd7,0xff,0xff,0x0f,0x00] + +v_mad_u32_u16 v5, s1, s2, v255 +// GFX13: v_mad_u32_u16 v5, s1, s2, v255 ; encoding: [0x05,0x00,0x73,0xd7,0x01,0x04,0xfc,0x07] + +v_mad_u32_u16 v5, s105, s105, s105 +// GFX13: v_mad_u32_u16 v5, s105, s105, s105 ; encoding: [0x05,0x00,0x73,0xd7,0x69,0xd2,0xa4,0x01] + +v_mad_u32_u16 v5, vcc_lo, ttmp15, vcc_lo +// GFX13: v_mad_u32_u16 v5, vcc_lo, ttmp15, vcc_lo ; encoding: [0x05,0x00,0x73,0xd7,0x6a,0xf6,0xa8,0x01] + +v_mad_u32_u16 v5, vcc_hi, 0xfe0b, vcc_hi +// GFX13: v_mad_u32_u16 v5, vcc_hi, 0xfe0b, vcc_hi ; encoding: [0x05,0x00,0x73,0xd7,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00] + +v_mad_u32_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_u32_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x73,0xd7,0x7b,0xfa,0xed,0x01] + +v_mad_u32_u16 v5, m0, 0.5, m0 +// GFX13-ASM: 
v_mad_u32_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x73,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_mad_u32_u16 v5, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x73,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_mad_u32_u16 v5, exec_lo, -1, exec_hi +// GFX13: v_mad_u32_u16 v5, exec_lo, -1, exec_hi ; encoding: [0x05,0x00,0x73,0xd7,0x7e,0x82,0xfd,0x01] + +v_mad_u32_u16 v5, exec_hi, null, exec_lo +// GFX13: v_mad_u32_u16 v5, exec_hi, null, exec_lo ; encoding: [0x05,0x00,0x73,0xd7,0x7f,0xf8,0xf8,0x01] + +v_mad_u32_u16 v5, null, exec_lo, null +// GFX13: v_mad_u32_u16 v5, null, exec_lo, null ; encoding: [0x05,0x00,0x73,0xd7,0x7c,0xfc,0xf0,0x01] + +v_mad_u32_u16 v5, -1, exec_hi, 0xaf123456 +// GFX13: v_mad_u32_u16 v5, -1, exec_hi, 0xaf123456 ; encoding: [0x05,0x00,0x73,0xd7,0xc1,0xfe,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_u32_u16 v5, 0.5, m0, -1 op_sel:[0,0,0,0] +// GFX13-ASM: v_mad_u32_u16 v5, 0.5, m0, -1 ; encoding: [0x05,0x00,0x73,0xd7,0xf0,0xfa,0x04,0x03] +// GFX13-DIS: v_mad_u32_u16 v5, 0x3800, m0, -1 ; encoding: [0x05,0x00,0x73,0xd7,0xff,0xfa,0x04,0x03,0x00,0x38,0x00,0x00] + +v_mad_u32_u16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] +// GFX13: v_mad_u32_u16 v5, src_scc, vcc_lo, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x73,0xd7,0xfd,0xd4,0xf4,0x03] + +v_mad_u32_u16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp +// GFX13: v_mad_u32_u16 v255, 0xfe0b, vcc_hi, 0.5 op_sel:[0,1,0,0] clamp ; encoding: [0xff,0x90,0x73,0xd7,0xff,0xd6,0xc0,0x03,0x0b,0xfe,0x00,0x00] + +v_mad_u32_u24 v5, v1, v2, s3 +// GFX13: v_mad_u32_u24 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0b,0xd6,0x01,0x05,0x0e,0x00] + +v_mad_u32_u24 v5, v255, s2, s105 +// GFX13: v_mad_u32_u24 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x0b,0xd6,0xff,0x05,0xa4,0x01] + +v_mad_u32_u24 v5, s1, v255, exec_hi +// GFX13: v_mad_u32_u24 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x0b,0xd6,0x01,0xfe,0xff,0x01] + +v_mad_u32_u24 v5, s105, s105, exec_lo +// GFX13: v_mad_u32_u24 v5, s105, s105, exec_lo ; encoding: 
[0x05,0x00,0x0b,0xd6,0x69,0xd2,0xf8,0x01] + +v_mad_u32_u24 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mad_u32_u24 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x0b,0xd6,0x6a,0xf6,0x0c,0x04] + +v_mad_u32_u24 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_mad_u32_u24 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x0b,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_mad_u32_u24 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_mad_u32_u24 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x0b,0xd6,0x7b,0xfa,0xed,0x01] + +v_mad_u32_u24 v5, m0, 0.5, m0 +// GFX13: v_mad_u32_u24 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x0b,0xd6,0x7d,0xe0,0xf5,0x01] + +v_mad_u32_u24 v5, exec_lo, -1, vcc_hi +// GFX13: v_mad_u32_u24 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x0b,0xd6,0x7e,0x82,0xad,0x01] + +v_mad_u32_u24 v5, exec_hi, null, vcc_lo +// GFX13: v_mad_u32_u24 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x0b,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_u32_u24 v5, null, exec_lo, 0xaf123456 +// GFX13: v_mad_u32_u24 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x0b,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_u32_u24 v5, -1, exec_hi, src_scc +// GFX13: v_mad_u32_u24 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x0b,0xd6,0xc1,0xfe,0xf4,0x03] + +v_mad_u32_u24 v5, 0.5, m0, 0.5 +// GFX13: v_mad_u32_u24 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x0b,0xd6,0xf0,0xfa,0xc0,0x03] + +v_mad_u32_u24 v5, src_scc, vcc_lo, -1 +// GFX13: v_mad_u32_u24 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x0b,0xd6,0xfd,0xd4,0x04,0x03] + +v_mad_u32_u24 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_mad_u32_u24 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x0b,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mad_co_u64_u32 v[5:6], s6, s105, s105, s[6:7] +// W32: v_mad_co_u64_u32 v[5:6], s6, s105, s105, s[6:7] ; encoding: [0x05,0x06,0xf8,0xd6,0x69,0xd2,0x18,0x00] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, ttmp15, ttmp15, 
s[104:105] +// W32: v_mad_co_u64_u32 v[5:6], s6, ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x06,0xf8,0xd6,0x7b,0xf6,0xa0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, m0, 0.5, ttmp[14:15] +// W32: v_mad_co_u64_u32 v[5:6], s6, m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x06,0xf8,0xd6,0x7d,0xe0,0xe9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, exec_lo, -1, exec +// W32: v_mad_co_u64_u32 v[5:6], s6, exec_lo, -1, exec ; encoding: [0x05,0x06,0xf8,0xd6,0x7e,0x82,0xf9,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s6, exec_hi, null, vcc +// W32: v_mad_co_u64_u32 v[5:6], s6, exec_hi, null, vcc ; encoding: [0x05,0x06,0xf8,0xd6,0x7f,0xf8,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s105, null, exec_lo, null +// W32: v_mad_co_u64_u32 v[5:6], s105, null, exec_lo, null ; encoding: [0x05,0x69,0xf8,0xd6,0x7c,0xfc,0xf0,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], vcc_lo, -1, exec_hi, -1 +// W32: v_mad_co_u64_u32 v[5:6], vcc_lo, -1, exec_hi, -1 ; encoding: [0x05,0x6a,0xf8,0xd6,0xc1,0xfe,0x04,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 +// W32: v_mad_co_u64_u32 v[5:6], vcc_hi, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6b,0xf8,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc +// W32: v_mad_co_u64_u32 v[5:6], ttmp15, src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7b,0xf8,0xd6,0xfd,0xd4,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_mad_co_u64_u32 v[5:6], s[12:13], s105, s105, s[6:7] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for 
instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], s105, s105, s[6:7] ; encoding: [0x05,0x0c,0xf8,0xd6,0x69,0xd2,0x18,0x00] + +v_mad_co_u64_u32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], ttmp15, ttmp15, s[104:105] ; encoding: [0x05,0x0c,0xf8,0xd6,0x7b,0xf6,0xa0,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], m0, 0.5, ttmp[14:15] ; encoding: [0x05,0x0c,0xf8,0xd6,0x7d,0xe0,0xe9,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], exec_lo, -1, exec +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], exec_lo, -1, exec ; encoding: [0x05,0x0c,0xf8,0xd6,0x7e,0x82,0xf9,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], exec_hi, null, vcc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], exec_hi, null, vcc ; encoding: [0x05,0x0c,0xf8,0xd6,0x7f,0xf8,0xa8,0x01] + +v_mad_co_u64_u32 v[5:6], s[12:13], null, exec_lo, null +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[12:13], null, exec_lo, null ; encoding: [0x05,0x0c,0xf8,0xd6,0x7c,0xfc,0xf0,0x01] + +v_mad_co_u64_u32 v[5:6], s[104:105], -1, exec_hi, -1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], s[104:105], -1, exec_hi, -1 ; encoding: [0x05,0x68,0xf8,0xd6,0xc1,0xfe,0x04,0x03] + +v_mad_co_u64_u32 v[5:6], vcc, 0.5, m0, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_mad_co_u64_u32 v[5:6], vcc, 0.5, m0, 0xaf123456 ; encoding: [0x05,0x6a,0xf8,0xd6,0xf0,0xfa,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// 
W64: v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc ; encoding: [0x05,0x7a,0xf8,0xd6,0xfd,0xd4,0xf4,0x03] + +v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp +// GFX13: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xf8,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf] + +v_max3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_max3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x54,0xd7,0x01,0x05,0x0e,0x00] + +v_max3_num_f16 v5.l, v255.l, s2, s105 +// GFX13: v_max3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x54,0xd7,0xff,0x05,0xa4,0x01] + +v_max3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x54,0xd7,0x01,0xfe,0xff,0x01] + +v_max3_num_f16 v5.l, s105, s105, exec_lo +// GFX13: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x54,0xd7,0x69,0xd2,0xf8,0x01] + +v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x54,0xd7,0x6a,0xf6,0x0c,0x04] + +v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x54,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x54,0xd7,0x7b,0xfa,0xed,0xe1] + +v_max3_num_f16 v5.l, m0, 0.5, m0 +// GFX13: v_max3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x54,0xd7,0x7d,0xe0,0xf5,0x01] + +v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x54,0xd7,0x7e,0x82,0xad,0x01] + +v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x54,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: 
v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x54,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x54,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x54,0xd7,0xf0,0xfa,0xc0,0x43] + +v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x54,0xd7,0xfd,0xd4,0x04,0x23] + +v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x54,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_max3_num_f32 v5, v1, v2, s3 +// GFX13: v_max3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00] + +v_max3_num_f32 v5, v255, s2, s105 +// GFX13: v_max3_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2a,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_num_f32 v5, s1, v255, exec_hi +// GFX13: v_max3_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_num_f32 v5, s105, s105, exec_lo +// GFX13: v_max3_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2a,0xd6,0x69,0xd2,0xf8,0x01] + +v_max3_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_max3_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_max3_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x2a,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_max3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_max3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2a,0xd6,0x7b,0xfa,0xed,0xe1] + +v_max3_num_f32 v5, m0, 0.5, m0 +// 
GFX13: v_max3_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_max3_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_max3_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2a,0xd6,0x7e,0x82,0xad,0x01] + +v_max3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_max3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2a,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_max3_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_max3_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x2a,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_max3_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_max3_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2a,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_max3_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_max3_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x2a,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_max3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_max3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x2a,0xd6,0xfd,0xd4,0x04,0x33] + +v_max3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_max3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x2a,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_max3_i16 v5.l, v1.l, v2.l, s3 +// GFX13: v_max3_i16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x55,0xd7,0x01,0x05,0x0e,0x00] + +v_max3_i16 v5.l, v255.l, s2, s105 +// GFX13: v_max3_i16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x55,0xd7,0xff,0x05,0xa4,0x01] + +v_max3_i16 v5.l, s1, v255.l, exec_hi +// GFX13: v_max3_i16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x55,0xd7,0x01,0xfe,0xff,0x01] + +v_max3_i16 v5.l, s105, s105, exec_lo +// GFX13: v_max3_i16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x55,0xd7,0x69,0xd2,0xf8,0x01] + +v_max3_i16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_max3_i16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x55,0xd7,0x6a,0xf6,0x0c,0x04] + +v_max3_i16 v5.l, vcc_hi, 
0xfe0b, v255.l +// GFX13: v_max3_i16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x55,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_i16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_i16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x55,0xd7,0x7b,0xfa,0xed,0x01] + +v_max3_i16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_max3_i16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x55,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_max3_i16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x55,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_max3_i16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_max3_i16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x55,0xd7,0x7e,0x82,0xad,0x01] + +v_max3_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_max3_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x55,0xd7,0x7f,0xf8,0xa8,0x01] + +v_max3_i16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_max3_i16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x55,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_max3_i16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_max3_i16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x55,0xd7,0xc1,0xfe,0xf4,0x03] + +v_max3_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_max3_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x55,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_max3_i16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x55,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_max3_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_max3_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x55,0xd7,0xfd,0xd4,0x04,0x03] + +v_max3_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_max3_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x55,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_max3_i32 v5, v1, v2, s3 +// GFX13: v_max3_i32 v5, v1, v2, s3 ; encoding: 
[0x05,0x00,0x1d,0xd6,0x01,0x05,0x0e,0x00] + +v_max3_i32 v5, v255, s2, s105 +// GFX13: v_max3_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1d,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_i32 v5, s1, v255, exec_hi +// GFX13: v_max3_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1d,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_i32 v5, s105, s105, exec_lo +// GFX13: v_max3_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1d,0xd6,0x69,0xd2,0xf8,0x01] + +v_max3_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_max3_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_max3_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_max3_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1d,0xd6,0x7b,0xfa,0xed,0x01] + +v_max3_i32 v5, m0, 0.5, m0 +// GFX13: v_max3_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_max3_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_max3_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1d,0xd6,0x7e,0x82,0xad,0x01] + +v_max3_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_max3_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1d,0xd6,0x7f,0xf8,0xa8,0x01] + +v_max3_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_max3_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1d,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_max3_i32 v5, -1, exec_hi, src_scc +// GFX13: v_max3_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1d,0xd6,0xc1,0xfe,0xf4,0x03] + +v_max3_i32 v5, 0.5, m0, 0.5 +// GFX13: v_max3_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1d,0xd6,0xf0,0xfa,0xc0,0x03] + +v_max3_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_max3_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1d,0xd6,0xfd,0xd4,0x04,0x03] + +v_max3_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_max3_i32 v255, 0xaf123456, vcc_hi, null ; encoding: 
[0xff,0x00,0x1d,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_max3_u16 v5.l, v1.l, v2.l, s3 +// GFX13: v_max3_u16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x56,0xd7,0x01,0x05,0x0e,0x00] + +v_max3_u16 v5.l, v255.l, s2, s105 +// GFX13: v_max3_u16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x56,0xd7,0xff,0x05,0xa4,0x01] + +v_max3_u16 v5.l, s1, v255.l, exec_hi +// GFX13: v_max3_u16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x56,0xd7,0x01,0xfe,0xff,0x01] + +v_max3_u16 v5.l, s105, s105, exec_lo +// GFX13: v_max3_u16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x56,0xd7,0x69,0xd2,0xf8,0x01] + +v_max3_u16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_max3_u16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x56,0xd7,0x6a,0xf6,0x0c,0x04] + +v_max3_u16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_max3_u16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x56,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_max3_u16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_u16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x56,0xd7,0x7b,0xfa,0xed,0x01] + +v_max3_u16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_max3_u16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x56,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_max3_u16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x56,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_max3_u16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_max3_u16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x56,0xd7,0x7e,0x82,0xad,0x01] + +v_max3_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_max3_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x56,0xd7,0x7f,0xf8,0xa8,0x01] + +v_max3_u16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_max3_u16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x56,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_max3_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_max3_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: 
[0x05,0x08,0x56,0xd7,0xc1,0xfe,0xf4,0x03] + +v_max3_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_max3_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x56,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_max3_u16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x56,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_max3_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_max3_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x56,0xd7,0xfd,0xd4,0x04,0x03] + +v_max3_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_max3_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x56,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_max3_u32 v5, v1, v2, s3 +// GFX13: v_max3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1e,0xd6,0x01,0x05,0x0e,0x00] + +v_max3_u32 v5, v255, s2, s105 +// GFX13: v_max3_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1e,0xd6,0xff,0x05,0xa4,0x01] + +v_max3_u32 v5, s1, v255, exec_hi +// GFX13: v_max3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1e,0xd6,0x01,0xfe,0xff,0x01] + +v_max3_u32 v5, s105, s105, exec_lo +// GFX13: v_max3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1e,0xd6,0x69,0xd2,0xf8,0x01] + +v_max3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_max3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_max3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_max3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1e,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_max3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_max3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1e,0xd6,0x7b,0xfa,0xed,0x01] + +v_max3_u32 v5, m0, 0.5, m0 +// GFX13: v_max3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_max3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_max3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1e,0xd6,0x7e,0x82,0xad,0x01] + +v_max3_u32 v5, exec_hi, null, vcc_lo 
+// GFX13: v_max3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1e,0xd6,0x7f,0xf8,0xa8,0x01] + +v_max3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_max3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1e,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_max3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_max3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1e,0xd6,0xc1,0xfe,0xf4,0x03] + +v_max3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_max3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1e,0xd6,0xf0,0xfa,0xc0,0x03] + +v_max3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_max3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1e,0xd6,0xfd,0xd4,0x04,0x03] + +v_max3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_max3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1e,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_max_i16 v5.l, v1.l, v2.l +// GFX13: v_max_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0a,0xd7,0x01,0x05,0x02,0x02] + +v_max_i16 v5.l, v255.l, v255.l +// GFX13: v_max_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xff,0x03,0x02] + +v_max_i16 v5.l, s1, s2 +// GFX13: v_max_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0a,0xd7,0x01,0x04,0x00,0x02] + +v_max_i16 v5.l, s105, s105 +// GFX13: v_max_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0a,0xd7,0x69,0xd2,0x00,0x02] + +v_max_i16 v5.l, vcc_lo, ttmp15 +// GFX13: v_max_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0a,0xd7,0x6a,0xf6,0x00,0x02] + +v_max_i16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_max_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0a,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_max_i16 v5.l, ttmp15, src_scc +// GFX13: v_max_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0a,0xd7,0x7b,0xfa,0x01,0x02] + +v_max_i16 v5.l, m0, 0.5 +// GFX13-ASM: v_max_i16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x0a,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_max_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0a,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_max_i16 
v5.l, exec_lo, -1 +// GFX13: v_max_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0a,0xd7,0x7e,0x82,0x01,0x02] + +v_max_i16 v5.l, exec_hi, null +// GFX13: v_max_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0a,0xd7,0x7f,0xf8,0x00,0x02] + +v_max_i16 v5.l, null, exec_lo +// GFX13: v_max_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0a,0xd7,0x7c,0xfc,0x00,0x02] + +v_max_i16 v5.l, -1, exec_hi +// GFX13: v_max_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0a,0xd7,0xc1,0xfe,0x00,0x02] + +v_max_i16 v5.l, 0.5, m0 +// GFX13-ASM: v_max_i16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x0a,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_max_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_max_i16 v5.l, src_scc, vcc_lo +// GFX13: v_max_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0a,0xd7,0xfd,0xd4,0x00,0x02] + +v_max_i16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_max_i16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x0a,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_max_u16 v5.l, v1.l, v2.l +// GFX13: v_max_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x09,0xd7,0x01,0x05,0x02,0x02] + +v_max_u16 v5.l, v255.l, v255.l +// GFX13: v_max_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x09,0xd7,0xff,0xff,0x03,0x02] + +v_max_u16 v5.l, s1, s2 +// GFX13: v_max_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x09,0xd7,0x01,0x04,0x00,0x02] + +v_max_u16 v5.l, s105, s105 +// GFX13: v_max_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x09,0xd7,0x69,0xd2,0x00,0x02] + +v_max_u16 v5.l, vcc_lo, ttmp15 +// GFX13: v_max_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x09,0xd7,0x6a,0xf6,0x00,0x02] + +v_max_u16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_max_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x09,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_max_u16 v5.l, ttmp15, src_scc +// GFX13: v_max_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x09,0xd7,0x7b,0xfa,0x01,0x02] + +v_max_u16 v5.l, m0, 0.5 +// GFX13-ASM: v_max_u16 v5.l, m0, 0.5 ; encoding: 
[0x05,0x00,0x09,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_max_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x09,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_max_u16 v5.l, exec_lo, -1 +// GFX13: v_max_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x09,0xd7,0x7e,0x82,0x01,0x02] + +v_max_u16 v5.l, exec_hi, null +// GFX13: v_max_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x09,0xd7,0x7f,0xf8,0x00,0x02] + +v_max_u16 v5.l, null, exec_lo +// GFX13: v_max_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x09,0xd7,0x7c,0xfc,0x00,0x02] + +v_max_u16 v5.l, -1, exec_hi +// GFX13: v_max_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x09,0xd7,0xc1,0xfe,0x00,0x02] + +v_max_u16 v5.l, 0.5, m0 +// GFX13-ASM: v_max_u16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x09,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_max_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x09,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_max_u16 v5.l, src_scc, vcc_lo +// GFX13: v_max_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x09,0xd7,0xfd,0xd4,0x00,0x02] + +v_max_u16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_max_u16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_num_f16 v5.l, v255.l, s2, s105 +// GFX13: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_num_f16 v5.l, s105, s105, exec_lo +// GFX13: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: 
v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maxmin_num_f16 v5.l, m0, 0.5, m0 +// GFX13: v_maxmin_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| +// GFX13: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] + +v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f32 v5, v1, v2, s3 +// GFX13: v_maxmin_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_num_f32 v5, v255, s2, s105 +// GFX13: v_maxmin_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x69,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_num_f32 v5, s1, v255, exec_hi +// GFX13: 
v_maxmin_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x69,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_num_f32 v5, s105, s105, exec_lo +// GFX13: v_maxmin_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x69,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x69,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maxmin_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x69,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maxmin_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maxmin_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x69,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maxmin_num_f32 v5, m0, 0.5, m0 +// GFX13: v_maxmin_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x69,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maxmin_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x69,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maxmin_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x69,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maxmin_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_maxmin_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x69,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_maxmin_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maxmin_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x69,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maxmin_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maxmin_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x69,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maxmin_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maxmin_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x69,0xd6,0xfd,0xd4,0x04,0x33] + +v_maxmin_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maxmin_num_f32 v255, -|0xaf123456|, -|vcc_hi|, 
null clamp div:2 ; encoding: [0xff,0x83,0x69,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_maxmin_i32 v5, v1, v2, s3 +// GFX13: v_maxmin_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x64,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_i32 v5, v255, s2, s105 +// GFX13: v_maxmin_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x64,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_i32 v5, s1, v255, exec_hi +// GFX13: v_maxmin_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x64,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_i32 v5, s105, s105, exec_lo +// GFX13: v_maxmin_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x64,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x64,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maxmin_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x64,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maxmin_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_maxmin_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x64,0xd6,0x7b,0xfa,0xed,0x01] + +v_maxmin_i32 v5, m0, 0.5, m0 +// GFX13: v_maxmin_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x64,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_maxmin_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x64,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_maxmin_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x64,0xd6,0x7f,0xf8,0xa8,0x01] + +v_maxmin_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_maxmin_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x64,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_maxmin_i32 v5, -1, exec_hi, src_scc +// GFX13: v_maxmin_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x64,0xd6,0xc1,0xfe,0xf4,0x03] + +v_maxmin_i32 v5, 0.5, m0, 0.5 +// GFX13: v_maxmin_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x64,0xd6,0xf0,0xfa,0xc0,0x03] + +v_maxmin_i32 v5, src_scc, vcc_lo, -1 +// GFX13: 
v_maxmin_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x64,0xd6,0xfd,0xd4,0x04,0x03] + +v_maxmin_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_maxmin_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x64,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_maxmin_u32 v5, v1, v2, s3 +// GFX13: v_maxmin_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x62,0xd6,0x01,0x05,0x0e,0x00] + +v_maxmin_u32 v5, v255, s2, s105 +// GFX13: v_maxmin_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x62,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_u32 v5, s1, v255, exec_hi +// GFX13: v_maxmin_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x62,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_u32 v5, s105, s105, exec_lo +// GFX13: v_maxmin_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x62,0xd6,0x69,0xd2,0xf8,0x01] + +v_maxmin_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maxmin_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x62,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maxmin_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maxmin_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x62,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maxmin_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_maxmin_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x62,0xd6,0x7b,0xfa,0xed,0x01] + +v_maxmin_u32 v5, m0, 0.5, m0 +// GFX13: v_maxmin_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x62,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maxmin_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_maxmin_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x62,0xd6,0x7e,0x82,0xad,0x01] + +v_maxmin_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_maxmin_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x62,0xd6,0x7f,0xf8,0xa8,0x01] + +v_maxmin_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_maxmin_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x62,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_maxmin_u32 v5, -1, exec_hi, src_scc +// GFX13: v_maxmin_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x62,0xd6,0xc1,0xfe,0xf4,0x03] + 
+v_maxmin_u32 v5, 0.5, m0, 0.5 +// GFX13: v_maxmin_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x62,0xd6,0xf0,0xfa,0xc0,0x03] + +v_maxmin_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_maxmin_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x62,0xd6,0xfd,0xd4,0x04,0x03] + +v_maxmin_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_maxmin_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x62,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mbcnt_hi_u32_b32 v5, v1, v2 +// GFX13: v_mbcnt_hi_u32_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x66,0xd7,0x01,0x05,0x02,0x02] + +v_mbcnt_hi_u32_b32 v5, v255, v255 +// GFX13: v_mbcnt_hi_u32_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x66,0xd7,0xff,0xff,0x03,0x02] + +v_mbcnt_hi_u32_b32 v5, s1, s2 +// GFX13: v_mbcnt_hi_u32_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x66,0xd7,0x01,0x04,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, s105, s105 +// GFX13: v_mbcnt_hi_u32_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x66,0xd7,0x69,0xd2,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, vcc_lo, ttmp15 +// GFX13: v_mbcnt_hi_u32_b32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x66,0xd7,0x6a,0xf6,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mbcnt_hi_u32_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x66,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mbcnt_hi_u32_b32 v5, ttmp15, src_scc +// GFX13: v_mbcnt_hi_u32_b32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x66,0xd7,0x7b,0xfa,0x01,0x02] + +v_mbcnt_hi_u32_b32 v5, m0, 0.5 +// GFX13: v_mbcnt_hi_u32_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x66,0xd7,0x7d,0xe0,0x01,0x02] + +v_mbcnt_hi_u32_b32 v5, exec_lo, -1 +// GFX13: v_mbcnt_hi_u32_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x66,0xd7,0x7e,0x82,0x01,0x02] + +v_mbcnt_hi_u32_b32 v5, exec_hi, null +// GFX13: v_mbcnt_hi_u32_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x66,0xd7,0x7f,0xf8,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, null, exec_lo +// GFX13: v_mbcnt_hi_u32_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x66,0xd7,0x7c,0xfc,0x00,0x02] + 
+v_mbcnt_hi_u32_b32 v5, -1, exec_hi +// GFX13: v_mbcnt_hi_u32_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x66,0xd7,0xc1,0xfe,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, 0.5, m0 +// GFX13: v_mbcnt_hi_u32_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x66,0xd7,0xf0,0xfa,0x00,0x02] + +v_mbcnt_hi_u32_b32 v5, src_scc, vcc_lo +// GFX13: v_mbcnt_hi_u32_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x66,0xd7,0xfd,0xd4,0x00,0x02] + +v_mbcnt_hi_u32_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_mbcnt_hi_u32_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x66,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mbcnt_lo_u32_b32 v5, v1, v2 +// GFX13: v_mbcnt_lo_u32_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x65,0xd7,0x01,0x05,0x02,0x02] + +v_mbcnt_lo_u32_b32 v5, v255, v255 +// GFX13: v_mbcnt_lo_u32_b32 v5, v255, v255 ; encoding: [0x05,0x00,0x65,0xd7,0xff,0xff,0x03,0x02] + +v_mbcnt_lo_u32_b32 v5, s1, s2 +// GFX13: v_mbcnt_lo_u32_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x65,0xd7,0x01,0x04,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, s105, s105 +// GFX13: v_mbcnt_lo_u32_b32 v5, s105, s105 ; encoding: [0x05,0x00,0x65,0xd7,0x69,0xd2,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, vcc_lo, ttmp15 +// GFX13: v_mbcnt_lo_u32_b32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x65,0xd7,0x6a,0xf6,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mbcnt_lo_u32_b32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x65,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mbcnt_lo_u32_b32 v5, ttmp15, src_scc +// GFX13: v_mbcnt_lo_u32_b32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x65,0xd7,0x7b,0xfa,0x01,0x02] + +v_mbcnt_lo_u32_b32 v5, m0, 0.5 +// GFX13: v_mbcnt_lo_u32_b32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x65,0xd7,0x7d,0xe0,0x01,0x02] + +v_mbcnt_lo_u32_b32 v5, exec_lo, -1 +// GFX13: v_mbcnt_lo_u32_b32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x65,0xd7,0x7e,0x82,0x01,0x02] + +v_mbcnt_lo_u32_b32 v5, exec_hi, null +// GFX13: v_mbcnt_lo_u32_b32 v5, exec_hi, null ; encoding: [0x05,0x00,0x65,0xd7,0x7f,0xf8,0x00,0x02] + 
+v_mbcnt_lo_u32_b32 v5, null, exec_lo +// GFX13: v_mbcnt_lo_u32_b32 v5, null, exec_lo ; encoding: [0x05,0x00,0x65,0xd7,0x7c,0xfc,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, -1, exec_hi +// GFX13: v_mbcnt_lo_u32_b32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x65,0xd7,0xc1,0xfe,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, 0.5, m0 +// GFX13: v_mbcnt_lo_u32_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x65,0xd7,0xf0,0xfa,0x00,0x02] + +v_mbcnt_lo_u32_b32 v5, src_scc, vcc_lo +// GFX13: v_mbcnt_lo_u32_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x65,0xd7,0xfd,0xd4,0x00,0x02] + +v_mbcnt_lo_u32_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_mbcnt_lo_u32_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x65,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_med3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_med3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x57,0xd7,0x01,0x05,0x0e,0x00] + +v_med3_num_f16 v5.l, v255.l, s2, s105 +// GFX13: v_med3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x57,0xd7,0xff,0x05,0xa4,0x01] + +v_med3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_med3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x57,0xd7,0x01,0xfe,0xff,0x01] + +v_med3_num_f16 v5.l, s105, s105, exec_lo +// GFX13: v_med3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x57,0xd7,0x69,0xd2,0xf8,0x01] + +v_med3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_med3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x57,0xd7,0x6a,0xf6,0x0c,0x04] + +v_med3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_med3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x57,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_med3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_med3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x57,0xd7,0x7b,0xfa,0xed,0xe1] + +v_med3_num_f16 v5.l, m0, 0.5, m0 +// GFX13: v_med3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x57,0xd7,0x7d,0xe0,0xf5,0x01] + +v_med3_num_f16 v5.l, |exec_lo|, -1, vcc_hi 
+// GFX13: v_med3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x57,0xd7,0x7e,0x82,0xad,0x01] + +v_med3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_med3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x57,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_med3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_med3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x57,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_med3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_med3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x57,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_med3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_med3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x57,0xd7,0xf0,0xfa,0xc0,0x43] + +v_med3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_med3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x57,0xd7,0xfd,0xd4,0x04,0x23] + +v_med3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_med3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x57,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_med3_num_f32 v5, v1, v2, s3 +// GFX13: v_med3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x00] + +v_med3_num_f32 v5, v255, s2, s105 +// GFX13: v_med3_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x31,0xd6,0xff,0x05,0xa4,0x01] + +v_med3_num_f32 v5, s1, v255, exec_hi +// GFX13: v_med3_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x31,0xd6,0x01,0xfe,0xff,0x01] + +v_med3_num_f32 v5, s105, s105, exec_lo +// GFX13: v_med3_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x31,0xd6,0x69,0xd2,0xf8,0x01] + +v_med3_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x31,0xd6,0x6a,0xf6,0x0c,0x04] + 
+v_med3_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_med3_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x31,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_med3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_med3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x31,0xd6,0x7b,0xfa,0xed,0xe1] + +v_med3_num_f32 v5, m0, 0.5, m0 +// GFX13: v_med3_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x31,0xd6,0x7d,0xe0,0xf5,0x01] + +v_med3_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_med3_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x31,0xd6,0x7e,0x82,0xad,0x01] + +v_med3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_med3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x31,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_med3_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_med3_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x31,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_med3_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_med3_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x31,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_med3_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_med3_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x31,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_med3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_med3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x31,0xd6,0xfd,0xd4,0x04,0x33] + +v_med3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_med3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x31,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_med3_i16 v5.l, v1.l, v2.l, s3 +// GFX13: v_med3_i16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x58,0xd7,0x01,0x05,0x0e,0x00] + +v_med3_i16 v5.l, v255.l, s2, s105 +// GFX13: v_med3_i16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x58,0xd7,0xff,0x05,0xa4,0x01] + +v_med3_i16 v5.l, s1, v255.l, exec_hi +// GFX13: v_med3_i16 v5.l, s1, 
v255.l, exec_hi ; encoding: [0x05,0x00,0x58,0xd7,0x01,0xfe,0xff,0x01] + +v_med3_i16 v5.l, s105, s105, exec_lo +// GFX13: v_med3_i16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x58,0xd7,0x69,0xd2,0xf8,0x01] + +v_med3_i16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_med3_i16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x58,0xd7,0x6a,0xf6,0x0c,0x04] + +v_med3_i16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_med3_i16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x58,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_med3_i16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_i16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x58,0xd7,0x7b,0xfa,0xed,0x01] + +v_med3_i16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_med3_i16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x58,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_med3_i16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x58,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_med3_i16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_med3_i16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x58,0xd7,0x7e,0x82,0xad,0x01] + +v_med3_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_med3_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x58,0xd7,0x7f,0xf8,0xa8,0x01] + +v_med3_i16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_med3_i16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x58,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_med3_i16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_med3_i16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x58,0xd7,0xc1,0xfe,0xf4,0x03] + +v_med3_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_med3_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x58,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_med3_i16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x58,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_med3_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_med3_i16 
v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x58,0xd7,0xfd,0xd4,0x04,0x03] + +v_med3_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_med3_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x58,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_med3_i32 v5, v1, v2, s3 +// GFX13: v_med3_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x20,0xd6,0x01,0x05,0x0e,0x00] + +v_med3_i32 v5, v255, s2, s105 +// GFX13: v_med3_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x20,0xd6,0xff,0x05,0xa4,0x01] + +v_med3_i32 v5, s1, v255, exec_hi +// GFX13: v_med3_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x20,0xd6,0x01,0xfe,0xff,0x01] + +v_med3_i32 v5, s105, s105, exec_lo +// GFX13: v_med3_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x20,0xd6,0x69,0xd2,0xf8,0x01] + +v_med3_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x20,0xd6,0x6a,0xf6,0x0c,0x04] + +v_med3_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_med3_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x20,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_med3_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x20,0xd6,0x7b,0xfa,0xed,0x01] + +v_med3_i32 v5, m0, 0.5, m0 +// GFX13: v_med3_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x20,0xd6,0x7d,0xe0,0xf5,0x01] + +v_med3_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_med3_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x20,0xd6,0x7e,0x82,0xad,0x01] + +v_med3_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_med3_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x20,0xd6,0x7f,0xf8,0xa8,0x01] + +v_med3_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_med3_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x20,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_med3_i32 v5, -1, exec_hi, src_scc +// GFX13: v_med3_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x20,0xd6,0xc1,0xfe,0xf4,0x03] + +v_med3_i32 v5, 
0.5, m0, 0.5 +// GFX13: v_med3_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x20,0xd6,0xf0,0xfa,0xc0,0x03] + +v_med3_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_med3_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x20,0xd6,0xfd,0xd4,0x04,0x03] + +v_med3_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_med3_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x20,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_med3_u16 v5.l, v1.l, v2.l, s3 +// GFX13: v_med3_u16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x59,0xd7,0x01,0x05,0x0e,0x00] + +v_med3_u16 v5.l, v255.l, s2, s105 +// GFX13: v_med3_u16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x59,0xd7,0xff,0x05,0xa4,0x01] + +v_med3_u16 v5.l, s1, v255.l, exec_hi +// GFX13: v_med3_u16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x59,0xd7,0x01,0xfe,0xff,0x01] + +v_med3_u16 v5.l, s105, s105, exec_lo +// GFX13: v_med3_u16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x59,0xd7,0x69,0xd2,0xf8,0x01] + +v_med3_u16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_med3_u16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x59,0xd7,0x6a,0xf6,0x0c,0x04] + +v_med3_u16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_med3_u16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x59,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_med3_u16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_u16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x59,0xd7,0x7b,0xfa,0xed,0x01] + +v_med3_u16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_med3_u16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x59,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_med3_u16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x59,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_med3_u16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_med3_u16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x59,0xd7,0x7e,0x82,0xad,0x01] + +v_med3_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_med3_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: 
[0x05,0x78,0x59,0xd7,0x7f,0xf8,0xa8,0x01] + +v_med3_u16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_med3_u16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x59,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_med3_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_med3_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x59,0xd7,0xc1,0xfe,0xf4,0x03] + +v_med3_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_med3_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x59,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_med3_u16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x59,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_med3_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_med3_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x59,0xd7,0xfd,0xd4,0x04,0x03] + +v_med3_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_med3_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x59,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_med3_u32 v5, v1, v2, s3 +// GFX13: v_med3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x21,0xd6,0x01,0x05,0x0e,0x00] + +v_med3_u32 v5, v255, s2, s105 +// GFX13: v_med3_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x21,0xd6,0xff,0x05,0xa4,0x01] + +v_med3_u32 v5, s1, v255, exec_hi +// GFX13: v_med3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x21,0xd6,0x01,0xfe,0xff,0x01] + +v_med3_u32 v5, s105, s105, exec_lo +// GFX13: v_med3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x21,0xd6,0x69,0xd2,0xf8,0x01] + +v_med3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_med3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x21,0xd6,0x6a,0xf6,0x0c,0x04] + +v_med3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_med3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x21,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_med3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_med3_u32 v5, ttmp15, src_scc, 
ttmp15 ; encoding: [0x05,0x00,0x21,0xd6,0x7b,0xfa,0xed,0x01] + +v_med3_u32 v5, m0, 0.5, m0 +// GFX13: v_med3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x21,0xd6,0x7d,0xe0,0xf5,0x01] + +v_med3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_med3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x21,0xd6,0x7e,0x82,0xad,0x01] + +v_med3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_med3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x21,0xd6,0x7f,0xf8,0xa8,0x01] + +v_med3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_med3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x21,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_med3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_med3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x21,0xd6,0xc1,0xfe,0xf4,0x03] + +v_med3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_med3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x21,0xd6,0xf0,0xfa,0xc0,0x03] + +v_med3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_med3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x21,0xd6,0xfd,0xd4,0x04,0x03] + +v_med3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_min3_num_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_min3_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x51,0xd7,0x01,0x05,0x0e,0x00] + +v_min3_num_f16 v5.l, v255.l, s2, s105 +// GFX13: v_min3_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x51,0xd7,0xff,0x05,0xa4,0x01] + +v_min3_num_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x51,0xd7,0x01,0xfe,0xff,0x01] + +v_min3_num_f16 v5.l, s105, s105, exec_lo +// GFX13: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x51,0xd7,0x69,0xd2,0xf8,0x01] + +v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x51,0xd7,0x6a,0xf6,0x0c,0x04] + +v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// 
GFX13: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x51,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x51,0xd7,0x7b,0xfa,0xed,0xe1] + +v_min3_num_f16 v5.l, m0, 0.5, m0 +// GFX13: v_min3_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x51,0xd7,0x7d,0xe0,0xf5,0x01] + +v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x51,0xd7,0x7e,0x82,0xad,0x01] + +v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] +// GFX13: v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x51,0xd7,0x7f,0xf8,0xa8,0xa1] + +v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0] +// GFX13: v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x51,0xd7,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] +// GFX13: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x51,0xd7,0xc1,0xfe,0xf4,0xc3] + +v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] +// GFX13: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x51,0xd7,0xf0,0xfa,0xc0,0x43] + +v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] +// GFX13: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x51,0xd7,0xfd,0xd4,0x04,0x23] + +v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp +// GFX13: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x51,0xd7,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_min3_num_f32 v5, v1, v2, s3 +// GFX13: v_min3_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_num_f32 v5, v255, s2, s105 +// GFX13: v_min3_num_f32 v5, v255, s2, s105 ; 
encoding: [0x05,0x00,0x29,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_num_f32 v5, s1, v255, exec_hi +// GFX13: v_min3_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x29,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_num_f32 v5, s105, s105, exec_lo +// GFX13: v_min3_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x29,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x29,0xd6,0x6a,0xf6,0x0c,0x04] + +v_min3_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_min3_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x29,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_min3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_min3_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x29,0xd6,0x7b,0xfa,0xed,0xe1] + +v_min3_num_f32 v5, m0, 0.5, m0 +// GFX13: v_min3_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x29,0xd6,0x7d,0xe0,0xf5,0x01] + +v_min3_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_min3_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x29,0xd6,0x7e,0x82,0xad,0x01] + +v_min3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_min3_num_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x29,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_min3_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_min3_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x29,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_min3_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_min3_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x29,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_min3_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_min3_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x29,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_min3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_min3_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x29,0xd6,0xfd,0xd4,0x04,0x33] + +v_min3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// 
GFX13: v_min3_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x29,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_min3_i16 v5.l, v1.l, v2.l, s3 +// GFX13: v_min3_i16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x52,0xd7,0x01,0x05,0x0e,0x00] + +v_min3_i16 v5.l, v255.l, s2, s105 +// GFX13: v_min3_i16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x52,0xd7,0xff,0x05,0xa4,0x01] + +v_min3_i16 v5.l, s1, v255.l, exec_hi +// GFX13: v_min3_i16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x52,0xd7,0x01,0xfe,0xff,0x01] + +v_min3_i16 v5.l, s105, s105, exec_lo +// GFX13: v_min3_i16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x52,0xd7,0x69,0xd2,0xf8,0x01] + +v_min3_i16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_min3_i16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x52,0xd7,0x6a,0xf6,0x0c,0x04] + +v_min3_i16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_min3_i16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x52,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_min3_i16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_i16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x52,0xd7,0x7b,0xfa,0xed,0x01] + +v_min3_i16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_min3_i16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x52,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_min3_i16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x52,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_min3_i16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_min3_i16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x52,0xd7,0x7e,0x82,0xad,0x01] + +v_min3_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_min3_i16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x52,0xd7,0x7f,0xf8,0xa8,0x01] + +v_min3_i16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_min3_i16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x52,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_min3_i16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_min3_i16 
v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x52,0xd7,0xc1,0xfe,0xf4,0x03] + +v_min3_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_min3_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x52,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_min3_i16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x52,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_min3_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_min3_i16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x52,0xd7,0xfd,0xd4,0x04,0x03] + +v_min3_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_min3_i16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x52,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_min3_i32 v5, v1, v2, s3 +// GFX13: v_min3_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1a,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_i32 v5, v255, s2, s105 +// GFX13: v_min3_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1a,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_i32 v5, s1, v255, exec_hi +// GFX13: v_min3_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1a,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_i32 v5, s105, s105, exec_lo +// GFX13: v_min3_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1a,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_min3_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_min3_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1a,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_min3_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1a,0xd6,0x7b,0xfa,0xed,0x01] + +v_min3_i32 v5, m0, 0.5, m0 +// GFX13: v_min3_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_min3_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_min3_i32 v5, exec_lo, -1, vcc_hi ; encoding: 
[0x05,0x00,0x1a,0xd6,0x7e,0x82,0xad,0x01] + +v_min3_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_min3_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1a,0xd6,0x7f,0xf8,0xa8,0x01] + +v_min3_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_min3_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1a,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_min3_i32 v5, -1, exec_hi, src_scc +// GFX13: v_min3_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1a,0xd6,0xc1,0xfe,0xf4,0x03] + +v_min3_i32 v5, 0.5, m0, 0.5 +// GFX13: v_min3_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1a,0xd6,0xf0,0xfa,0xc0,0x03] + +v_min3_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_min3_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1a,0xd6,0xfd,0xd4,0x04,0x03] + +v_min3_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_min3_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1a,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_min3_u16 v5.l, v1.l, v2.l, s3 +// GFX13: v_min3_u16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x53,0xd7,0x01,0x05,0x0e,0x00] + +v_min3_u16 v5.l, v255.l, s2, s105 +// GFX13: v_min3_u16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x53,0xd7,0xff,0x05,0xa4,0x01] + +v_min3_u16 v5.l, s1, v255.l, exec_hi +// GFX13: v_min3_u16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x53,0xd7,0x01,0xfe,0xff,0x01] + +v_min3_u16 v5.l, s105, s105, exec_lo +// GFX13: v_min3_u16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x53,0xd7,0x69,0xd2,0xf8,0x01] + +v_min3_u16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_min3_u16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x53,0xd7,0x6a,0xf6,0x0c,0x04] + +v_min3_u16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_min3_u16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x53,0xd7,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_min3_u16 v5.l, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_u16 v5.l, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x53,0xd7,0x7b,0xfa,0xed,0x01] + +v_min3_u16 v5.l, m0, 0.5, m0 +// GFX13-ASM: v_min3_u16 
v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x53,0xd7,0x7d,0xe0,0xf5,0x01] +// GFX13-DIS: v_min3_u16 v5.l, m0, 0x3800, m0 ; encoding: [0x05,0x00,0x53,0xd7,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] + +v_min3_u16 v5.l, exec_lo, -1, vcc_hi +// GFX13: v_min3_u16 v5.l, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x53,0xd7,0x7e,0x82,0xad,0x01] + +v_min3_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] +// GFX13: v_min3_u16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x53,0xd7,0x7f,0xf8,0xa8,0x01] + +v_min3_u16 v5.l, null, exec_lo, 0xfe0b op_sel:[0,0,0,0] +// GFX13: v_min3_u16 v5.l, null, exec_lo, 0xfe0b ; encoding: [0x05,0x00,0x53,0xd7,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00] + +v_min3_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] +// GFX13: v_min3_u16 v5.l, -1, exec_hi, src_scc op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x53,0xd7,0xc1,0xfe,0xf4,0x03] + +v_min3_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] +// GFX13-ASM: v_min3_u16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x53,0xd7,0xf0,0xfa,0xc0,0x03] +// GFX13-DIS: v_min3_u16 v5.l, 0x3800, m0, 0x3800 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x53,0xd7,0xff,0xfa,0xfc,0x03,0x00,0x38,0x00,0x00] + +v_min3_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] +// GFX13: v_min3_u16 v5.l, src_scc, vcc_lo, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x53,0xd7,0xfd,0xd4,0x04,0x03] + +v_min3_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] +// GFX13: v_min3_u16 v255.h, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] ; encoding: [0xff,0x40,0x53,0xd7,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_min3_u32 v5, v1, v2, s3 +// GFX13: v_min3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x1b,0xd6,0x01,0x05,0x0e,0x00] + +v_min3_u32 v5, v255, s2, s105 +// GFX13: v_min3_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x1b,0xd6,0xff,0x05,0xa4,0x01] + +v_min3_u32 v5, s1, v255, exec_hi +// GFX13: v_min3_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x1b,0xd6,0x01,0xfe,0xff,0x01] + +v_min3_u32 v5, s105, s105, exec_lo +// GFX13: 
v_min3_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x1b,0xd6,0x69,0xd2,0xf8,0x01] + +v_min3_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_min3_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x1b,0xd6,0x6a,0xf6,0x0c,0x04] + +v_min3_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_min3_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x1b,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_min3_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_min3_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x1b,0xd6,0x7b,0xfa,0xed,0x01] + +v_min3_u32 v5, m0, 0.5, m0 +// GFX13: v_min3_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x1b,0xd6,0x7d,0xe0,0xf5,0x01] + +v_min3_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_min3_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x1b,0xd6,0x7e,0x82,0xad,0x01] + +v_min3_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_min3_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x1b,0xd6,0x7f,0xf8,0xa8,0x01] + +v_min3_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_min3_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x1b,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_min3_u32 v5, -1, exec_hi, src_scc +// GFX13: v_min3_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x1b,0xd6,0xc1,0xfe,0xf4,0x03] + +v_min3_u32 v5, 0.5, m0, 0.5 +// GFX13: v_min3_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x1b,0xd6,0xf0,0xfa,0xc0,0x03] + +v_min3_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_min3_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x1b,0xd6,0xfd,0xd4,0x04,0x03] + +v_min3_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_min3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x1b,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_min_i16 v5.l, v1.l, v2.l +// GFX13: v_min_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0c,0xd7,0x01,0x05,0x02,0x02] + +v_min_i16 v5.l, v255.l, v255.l +// GFX13: v_min_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0c,0xd7,0xff,0xff,0x03,0x02] + +v_min_i16 v5.l, s1, s2 +// GFX13: v_min_i16 v5.l, s1, s2 ; 
encoding: [0x05,0x00,0x0c,0xd7,0x01,0x04,0x00,0x02] + +v_min_i16 v5.l, s105, s105 +// GFX13: v_min_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0c,0xd7,0x69,0xd2,0x00,0x02] + +v_min_i16 v5.l, vcc_lo, ttmp15 +// GFX13: v_min_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0c,0xd7,0x6a,0xf6,0x00,0x02] + +v_min_i16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_min_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0c,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_min_i16 v5.l, ttmp15, src_scc +// GFX13: v_min_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0c,0xd7,0x7b,0xfa,0x01,0x02] + +v_min_i16 v5.l, m0, 0.5 +// GFX13-ASM: v_min_i16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x0c,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_min_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0c,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_min_i16 v5.l, exec_lo, -1 +// GFX13: v_min_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0c,0xd7,0x7e,0x82,0x01,0x02] + +v_min_i16 v5.l, exec_hi, null +// GFX13: v_min_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0c,0xd7,0x7f,0xf8,0x00,0x02] + +v_min_i16 v5.l, null, exec_lo +// GFX13: v_min_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0c,0xd7,0x7c,0xfc,0x00,0x02] + +v_min_i16 v5.l, -1, exec_hi +// GFX13: v_min_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0c,0xd7,0xc1,0xfe,0x00,0x02] + +v_min_i16 v5.l, 0.5, m0 +// GFX13-ASM: v_min_i16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x0c,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_min_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0c,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_min_i16 v5.l, src_scc, vcc_lo +// GFX13: v_min_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0c,0xd7,0xfd,0xd4,0x00,0x02] + +v_min_i16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_min_i16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x0c,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_min_u16 v5.l, v1.l, v2.l +// GFX13: v_min_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0x05,0x02,0x02] + +v_min_u16 v5.l, v255.l, 
v255.l +// GFX13: v_min_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0b,0xd7,0xff,0xff,0x03,0x02] + +v_min_u16 v5.l, s1, s2 +// GFX13: v_min_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0x04,0x00,0x02] + +v_min_u16 v5.l, s105, s105 +// GFX13: v_min_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0b,0xd7,0x69,0xd2,0x00,0x02] + +v_min_u16 v5.l, vcc_lo, ttmp15 +// GFX13: v_min_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0b,0xd7,0x6a,0xf6,0x00,0x02] + +v_min_u16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_min_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0b,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_min_u16 v5.l, ttmp15, src_scc +// GFX13: v_min_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0b,0xd7,0x7b,0xfa,0x01,0x02] + +v_min_u16 v5.l, m0, 0.5 +// GFX13-ASM: v_min_u16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x0b,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_min_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0b,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_min_u16 v5.l, exec_lo, -1 +// GFX13: v_min_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0b,0xd7,0x7e,0x82,0x01,0x02] + +v_min_u16 v5.l, exec_hi, null +// GFX13: v_min_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0b,0xd7,0x7f,0xf8,0x00,0x02] + +v_min_u16 v5.l, null, exec_lo +// GFX13: v_min_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0b,0xd7,0x7c,0xfc,0x00,0x02] + +v_min_u16 v5.l, -1, exec_hi +// GFX13: v_min_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0b,0xd7,0xc1,0xfe,0x00,0x02] + +v_min_u16 v5.l, 0.5, m0 +// GFX13-ASM: v_min_u16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x0b,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_min_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0b,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_min_u16 v5.l, src_scc, vcc_lo +// GFX13: v_min_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0b,0xd7,0xfd,0xd4,0x00,0x02] + +v_min_u16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_min_u16 v255.l, 0xfe0b, vcc_hi ; encoding: 
[0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_minmax_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_num_f16 v5.l, v255.l, s2, s105 +// GFX13: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_num_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_num_f16 v5.l, s105, s105, exec_lo +// GFX13: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX13: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minmax_num_f16 v5.l, m0, 0.5, m0 +// GFX13: v_minmax_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| +// GFX13: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] + 
+v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] + +v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +v_minmax_num_f32 v5, v1, v2, s3 +// GFX13: v_minmax_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_num_f32 v5, v255, s2, s105 +// GFX13: v_minmax_num_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x68,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_num_f32 v5, s1, v255, exec_hi +// GFX13: v_minmax_num_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x68,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_num_f32 v5, s105, s105, exec_lo +// GFX13: v_minmax_num_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x68,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_num_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_num_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x68,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_num_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minmax_num_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x68,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minmax_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minmax_num_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x68,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minmax_num_f32 v5, m0, 0.5, m0 +// GFX13: v_minmax_num_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x68,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_num_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minmax_num_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x68,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_num_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minmax_num_f32 v5, -|exec_hi|, 
null, -|vcc_lo| ; encoding: [0x05,0x05,0x68,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minmax_num_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_minmax_num_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x68,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_minmax_num_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minmax_num_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x68,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minmax_num_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minmax_num_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x68,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minmax_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minmax_num_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x68,0xd6,0xfd,0xd4,0x04,0x33] + +v_minmax_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minmax_num_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x68,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_minmax_i32 v5, v1, v2, s3 +// GFX13: v_minmax_i32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x65,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_i32 v5, v255, s2, s105 +// GFX13: v_minmax_i32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x65,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_i32 v5, s1, v255, exec_hi +// GFX13: v_minmax_i32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x65,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_i32 v5, s105, s105, exec_lo +// GFX13: v_minmax_i32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x65,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_i32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_i32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x65,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_i32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minmax_i32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x65,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minmax_i32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_minmax_i32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x65,0xd6,0x7b,0xfa,0xed,0x01] + +v_minmax_i32 v5, m0, 0.5, m0 +// 
GFX13: v_minmax_i32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x65,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_i32 v5, exec_lo, -1, vcc_hi +// GFX13: v_minmax_i32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x65,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_i32 v5, exec_hi, null, vcc_lo +// GFX13: v_minmax_i32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x65,0xd6,0x7f,0xf8,0xa8,0x01] + +v_minmax_i32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_minmax_i32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x65,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_minmax_i32 v5, -1, exec_hi, src_scc +// GFX13: v_minmax_i32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x65,0xd6,0xc1,0xfe,0xf4,0x03] + +v_minmax_i32 v5, 0.5, m0, 0.5 +// GFX13: v_minmax_i32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x65,0xd6,0xf0,0xfa,0xc0,0x03] + +v_minmax_i32 v5, src_scc, vcc_lo, -1 +// GFX13: v_minmax_i32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x65,0xd6,0xfd,0xd4,0x04,0x03] + +v_minmax_i32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_minmax_i32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x65,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_minmax_u32 v5, v1, v2, s3 +// GFX13: v_minmax_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x63,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_u32 v5, v255, s2, s105 +// GFX13: v_minmax_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x63,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_u32 v5, s1, v255, exec_hi +// GFX13: v_minmax_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x63,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_u32 v5, s105, s105, exec_lo +// GFX13: v_minmax_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x63,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minmax_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x63,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minmax_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minmax_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x63,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + 
+v_minmax_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_minmax_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x63,0xd6,0x7b,0xfa,0xed,0x01] + +v_minmax_u32 v5, m0, 0.5, m0 +// GFX13: v_minmax_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x63,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minmax_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_minmax_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x63,0xd6,0x7e,0x82,0xad,0x01] + +v_minmax_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_minmax_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x63,0xd6,0x7f,0xf8,0xa8,0x01] + +v_minmax_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_minmax_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x63,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_minmax_u32 v5, -1, exec_hi, src_scc +// GFX13: v_minmax_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x63,0xd6,0xc1,0xfe,0xf4,0x03] + +v_minmax_u32 v5, 0.5, m0, 0.5 +// GFX13: v_minmax_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x63,0xd6,0xf0,0xfa,0xc0,0x03] + +v_minmax_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_minmax_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x63,0xd6,0xfd,0xd4,0x04,0x03] + +v_minmax_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_minmax_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x63,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0x05,0xea,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0xff,0xeb,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] ; encoding: [0x05,0x00,0x3b,0xd6,0x01,0x05,0xe8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] ; encoding: 
[0x05,0x00,0x3b,0xd6,0x01,0xd3,0xe8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] ; encoding: [0x05,0x00,0x3b,0xd6,0xfe,0xf7,0x18,0x00] + +v_mqsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] ; encoding: [0x05,0x00,0x3b,0xd6,0x02,0xd6,0x0c,0x04] + +v_mqsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] ; encoding: [0x05,0x00,0x3b,0xd6,0x68,0xd4,0xa0,0x01] + +v_mqsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] +// GFX13: v_mqsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] ; encoding: [0x05,0x00,0x3b,0xd6,0x6a,0xfa,0xf8,0x07] + +v_mqsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null +// GFX13: v_mqsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null ; encoding: [0x05,0x00,0x3b,0xd6,0x7a,0xfe,0xf0,0x01] + +v_mqsad_pk_u16_u8 v[5:6], exec, exec_lo, exec +// GFX13: v_mqsad_pk_u16_u8 v[5:6], exec, exec_lo, exec ; encoding: [0x05,0x00,0x3b,0xd6,0x7e,0xfc,0xf8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], null, null, vcc +// GFX13: v_mqsad_pk_u16_u8 v[5:6], null, null, vcc ; encoding: [0x05,0x00,0x3b,0xd6,0x7c,0xf8,0xa8,0x01] + +v_mqsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_mqsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x3b,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_mqsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc +// GFX13: v_mqsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc ; encoding: [0x05,0x00,0x3b,0xd6,0xf0,0xe0,0xf5,0x03] + +v_mqsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 +// GFX13: v_mqsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 ; encoding: [0x05,0x00,0x3b,0xd6,0xfd,0xfa,0xc1,0x03] + +v_mqsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp +// GFX13: v_mqsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp ; encoding: [0xfe,0x80,0x3b,0xd6,0xff,0xfe,0x05,0x03,0x56,0x34,0x12,0xaf] + +v_mqsad_u32_u8 v[5:8], v[1:2], v2, v[252:255] +// GFX13: v_mqsad_u32_u8 
v[5:8], v[1:2], v2, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0x05,0xf2,0x07] + +v_mqsad_u32_u8 v[5:8], v[1:2], v255, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], v255, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0xff,0xf3,0x07] + +v_mqsad_u32_u8 v[5:8], v[1:2], s2, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], s2, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0x05,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], v[1:2], s105, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[1:2], s105, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x01,0xd3,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], v[254:255], ttmp15, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], v[254:255], ttmp15, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xfe,0xf7,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], s[2:3], vcc_hi, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], s[2:3], vcc_hi, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x02,0xd6,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], s[104:105], vcc_lo, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], s[104:105], vcc_lo, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x68,0xd4,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], vcc, m0, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], vcc, m0, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x6a,0xfa,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], ttmp[14:15], exec_hi, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], ttmp[14:15], exec_hi, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x7a,0xfe,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], exec, exec_lo, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], exec, exec_lo, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x7e,0xfc,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], null, null, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], null, null, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0x7c,0xf8,0xf0,0x07] + +v_mqsad_u32_u8 v[5:8], -1, -1, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], -1, -1, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xc1,0x82,0xf1,0x07] + +v_mqsad_u32_u8 v[5:8], 0.5, 0.5, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], 0.5, 0.5, 
v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xf0,0xe0,0xf1,0x07] + +v_mqsad_u32_u8 v[5:8], src_scc, src_scc, v[252:255] +// GFX13: v_mqsad_u32_u8 v[5:8], src_scc, src_scc, v[252:255] ; encoding: [0x05,0x00,0x3d,0xd6,0xfd,0xfa,0xf1,0x07] + +v_mqsad_u32_u8 v[252:255], 0xaf123456, 0xaf123456, v[3:6] clamp +// GFX13: v_mqsad_u32_u8 v[252:255], 0xaf123456, 0xaf123456, v[3:6] clamp ; encoding: [0xfc,0x80,0x3d,0xd6,0xff,0xfe,0x0d,0x04,0x56,0x34,0x12,0xaf] + +v_msad_u8 v5, v1, v2, s3 +// GFX13: v_msad_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x39,0xd6,0x01,0x05,0x0e,0x00] + +v_msad_u8 v5, v255, s2, s105 +// GFX13: v_msad_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x39,0xd6,0xff,0x05,0xa4,0x01] + +v_msad_u8 v5, s1, v255, exec_hi +// GFX13: v_msad_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x39,0xd6,0x01,0xfe,0xff,0x01] + +v_msad_u8 v5, s105, s105, exec_lo +// GFX13: v_msad_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x39,0xd6,0x69,0xd2,0xf8,0x01] + +v_msad_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_msad_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x39,0xd6,0x6a,0xf6,0x0c,0x04] + +v_msad_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_msad_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x39,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_msad_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_msad_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x39,0xd6,0x7b,0xfa,0xed,0x01] + +v_msad_u8 v5, m0, 0.5, m0 +// GFX13: v_msad_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x39,0xd6,0x7d,0xe0,0xf5,0x01] + +v_msad_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_msad_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x39,0xd6,0x7e,0x82,0xad,0x01] + +v_msad_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_msad_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x39,0xd6,0x7f,0xf8,0xa8,0x01] + +v_msad_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_msad_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x39,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_msad_u8 v5, -1, 
exec_hi, src_scc +// GFX13: v_msad_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x39,0xd6,0xc1,0xfe,0xf4,0x03] + +v_msad_u8 v5, 0.5, m0, 0.5 +// GFX13: v_msad_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x39,0xd6,0xf0,0xfa,0xc0,0x03] + +v_msad_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_msad_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x39,0xd6,0xfd,0xd4,0x04,0x03] + +v_msad_u8 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_msad_u8 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x39,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_mul_hi_i32 v5, v1, v2 +// GFX13: v_mul_hi_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x2e,0xd7,0x01,0x05,0x02,0x02] + +v_mul_hi_i32 v5, v255, v255 +// GFX13: v_mul_hi_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x2e,0xd7,0xff,0xff,0x03,0x02] + +v_mul_hi_i32 v5, s1, s2 +// GFX13: v_mul_hi_i32 v5, s1, s2 ; encoding: [0x05,0x00,0x2e,0xd7,0x01,0x04,0x00,0x02] + +v_mul_hi_i32 v5, s105, s105 +// GFX13: v_mul_hi_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x2e,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_hi_i32 v5, vcc_lo, ttmp15 +// GFX13: v_mul_hi_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x2e,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_hi_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mul_hi_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x2e,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mul_hi_i32 v5, ttmp15, src_scc +// GFX13: v_mul_hi_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x2e,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_hi_i32 v5, m0, 0.5 +// GFX13: v_mul_hi_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x2e,0xd7,0x7d,0xe0,0x01,0x02] + +v_mul_hi_i32 v5, exec_lo, -1 +// GFX13: v_mul_hi_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x2e,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_hi_i32 v5, exec_hi, null +// GFX13: v_mul_hi_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x2e,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_hi_i32 v5, null, exec_lo +// GFX13: v_mul_hi_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x2e,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_hi_i32 v5, -1, exec_hi 
+// GFX13: v_mul_hi_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x2e,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_hi_i32 v5, 0.5, m0 +// GFX13: v_mul_hi_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x2e,0xd7,0xf0,0xfa,0x00,0x02] + +v_mul_hi_i32 v5, src_scc, vcc_lo +// GFX13: v_mul_hi_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x2e,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_hi_i32 v255, 0xaf123456, vcc_hi +// GFX13: v_mul_hi_i32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x2e,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mul_hi_u32 v5, v1, v2 +// GFX13: v_mul_hi_u32 v5, v1, v2 ; encoding: [0x05,0x00,0x2d,0xd7,0x01,0x05,0x02,0x02] + +v_mul_hi_u32 v5, v255, v255 +// GFX13: v_mul_hi_u32 v5, v255, v255 ; encoding: [0x05,0x00,0x2d,0xd7,0xff,0xff,0x03,0x02] + +v_mul_hi_u32 v5, s1, s2 +// GFX13: v_mul_hi_u32 v5, s1, s2 ; encoding: [0x05,0x00,0x2d,0xd7,0x01,0x04,0x00,0x02] + +v_mul_hi_u32 v5, s105, s105 +// GFX13: v_mul_hi_u32 v5, s105, s105 ; encoding: [0x05,0x00,0x2d,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_hi_u32 v5, vcc_lo, ttmp15 +// GFX13: v_mul_hi_u32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x2d,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_hi_u32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mul_hi_u32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x2d,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mul_hi_u32 v5, ttmp15, src_scc +// GFX13: v_mul_hi_u32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x2d,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_hi_u32 v5, m0, 0.5 +// GFX13: v_mul_hi_u32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x2d,0xd7,0x7d,0xe0,0x01,0x02] + +v_mul_hi_u32 v5, exec_lo, -1 +// GFX13: v_mul_hi_u32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x2d,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_hi_u32 v5, exec_hi, null +// GFX13: v_mul_hi_u32 v5, exec_hi, null ; encoding: [0x05,0x00,0x2d,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_hi_u32 v5, null, exec_lo +// GFX13: v_mul_hi_u32 v5, null, exec_lo ; encoding: [0x05,0x00,0x2d,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_hi_u32 v5, -1, exec_hi +// GFX13: v_mul_hi_u32 v5, -1, exec_hi ; 
encoding: [0x05,0x00,0x2d,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_hi_u32 v5, 0.5, m0 +// GFX13: v_mul_hi_u32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x2d,0xd7,0xf0,0xfa,0x00,0x02] + +v_mul_hi_u32 v5, src_scc, vcc_lo +// GFX13: v_mul_hi_u32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x2d,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_hi_u32 v255, 0xaf123456, vcc_hi +// GFX13: v_mul_hi_u32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x2d,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mul_lo_u16 v5.l, v1.l, v2.l +// GFX13: v_mul_lo_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x02] + +v_mul_lo_u16 v5.l, v255.l, v255.l +// GFX13: v_mul_lo_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x05,0xd7,0xff,0xff,0x03,0x02] + +v_mul_lo_u16 v5.l, s1, s2 +// GFX13: v_mul_lo_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x05,0xd7,0x01,0x04,0x00,0x02] + +v_mul_lo_u16 v5.l, s105, s105 +// GFX13: v_mul_lo_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x05,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_lo_u16 v5.l, vcc_lo, ttmp15 +// GFX13: v_mul_lo_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x05,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_lo_u16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_mul_lo_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x05,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_mul_lo_u16 v5.l, ttmp15, src_scc +// GFX13: v_mul_lo_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x05,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_lo_u16 v5.l, m0, 0.5 +// GFX13-ASM: v_mul_lo_u16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x05,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_mul_lo_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x05,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_mul_lo_u16 v5.l, exec_lo, -1 +// GFX13: v_mul_lo_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x05,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_lo_u16 v5.l, exec_hi, null +// GFX13: v_mul_lo_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x05,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_lo_u16 v5.l, null, exec_lo +// GFX13: v_mul_lo_u16 v5.l, null, exec_lo 
; encoding: [0x05,0x00,0x05,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_lo_u16 v5.l, -1, exec_hi +// GFX13: v_mul_lo_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x05,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_lo_u16 v5.l, 0.5, m0 +// GFX13-ASM: v_mul_lo_u16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x05,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_mul_lo_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x05,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_mul_lo_u16 v5.l, src_scc, vcc_lo +// GFX13: v_mul_lo_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x05,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_lo_u16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_mul_lo_u16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x05,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_mul_lo_u32 v5, v1, v2 +// GFX13: v_mul_lo_u32 v5, v1, v2 ; encoding: [0x05,0x00,0x2c,0xd7,0x01,0x05,0x02,0x02] + +v_mul_lo_u32 v5, v255, v255 +// GFX13: v_mul_lo_u32 v5, v255, v255 ; encoding: [0x05,0x00,0x2c,0xd7,0xff,0xff,0x03,0x02] + +v_mul_lo_u32 v5, s1, s2 +// GFX13: v_mul_lo_u32 v5, s1, s2 ; encoding: [0x05,0x00,0x2c,0xd7,0x01,0x04,0x00,0x02] + +v_mul_lo_u32 v5, s105, s105 +// GFX13: v_mul_lo_u32 v5, s105, s105 ; encoding: [0x05,0x00,0x2c,0xd7,0x69,0xd2,0x00,0x02] + +v_mul_lo_u32 v5, vcc_lo, ttmp15 +// GFX13: v_mul_lo_u32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x2c,0xd7,0x6a,0xf6,0x00,0x02] + +v_mul_lo_u32 v5, vcc_hi, 0xaf123456 +// GFX13: v_mul_lo_u32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x2c,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_mul_lo_u32 v5, ttmp15, src_scc +// GFX13: v_mul_lo_u32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x2c,0xd7,0x7b,0xfa,0x01,0x02] + +v_mul_lo_u32 v5, m0, 0.5 +// GFX13: v_mul_lo_u32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x2c,0xd7,0x7d,0xe0,0x01,0x02] + +v_mul_lo_u32 v5, exec_lo, -1 +// GFX13: v_mul_lo_u32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x2c,0xd7,0x7e,0x82,0x01,0x02] + +v_mul_lo_u32 v5, exec_hi, null +// GFX13: v_mul_lo_u32 v5, exec_hi, null ; encoding: 
[0x05,0x00,0x2c,0xd7,0x7f,0xf8,0x00,0x02] + +v_mul_lo_u32 v5, null, exec_lo +// GFX13: v_mul_lo_u32 v5, null, exec_lo ; encoding: [0x05,0x00,0x2c,0xd7,0x7c,0xfc,0x00,0x02] + +v_mul_lo_u32 v5, -1, exec_hi +// GFX13: v_mul_lo_u32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x2c,0xd7,0xc1,0xfe,0x00,0x02] + +v_mul_lo_u32 v5, 0.5, m0 +// GFX13: v_mul_lo_u32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x2c,0xd7,0xf0,0xfa,0x00,0x02] + +v_mul_lo_u32 v5, src_scc, vcc_lo +// GFX13: v_mul_lo_u32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x2c,0xd7,0xfd,0xd4,0x00,0x02] + +v_mul_lo_u32 v255, 0xaf123456, vcc_hi +// GFX13: v_mul_lo_u32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x2c,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_mullit_f32 v5, v1, v2, s3 +// GFX13: v_mullit_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x18,0xd6,0x01,0x05,0x0e,0x00] + +v_mullit_f32 v5, v255, s2, s105 +// GFX13: v_mullit_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x18,0xd6,0xff,0x05,0xa4,0x01] + +v_mullit_f32 v5, s1, v255, exec_hi +// GFX13: v_mullit_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x18,0xd6,0x01,0xfe,0xff,0x01] + +v_mullit_f32 v5, s105, s105, exec_lo +// GFX13: v_mullit_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x18,0xd6,0x69,0xd2,0xf8,0x01] + +v_mullit_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_mullit_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x18,0xd6,0x6a,0xf6,0x0c,0x04] + +v_mullit_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_mullit_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x18,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_mullit_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_mullit_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x18,0xd6,0x7b,0xfa,0xed,0xe1] + +v_mullit_f32 v5, m0, 0.5, m0 +// GFX13: v_mullit_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x18,0xd6,0x7d,0xe0,0xf5,0x01] + +v_mullit_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_mullit_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: 
[0x05,0x01,0x18,0xd6,0x7e,0x82,0xad,0x01] + +v_mullit_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_mullit_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x18,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_mullit_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_mullit_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x18,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_mullit_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_mullit_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x18,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_mullit_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_mullit_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x18,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_mullit_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_mullit_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x18,0xd6,0xfd,0xd4,0x04,0x33] + +v_mullit_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_mullit_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x18,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_or3_b32 v5, v1, v2, s3 +// GFX13: v_or3_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x72,0xd7,0x01,0x05,0x0e,0x00] + +v_or3_b32 v5, v255, s2, s105 +// GFX13: v_or3_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x72,0xd7,0xff,0x05,0xa4,0x01] + +v_or3_b32 v5, s1, v255, exec_hi +// GFX13: v_or3_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x72,0xd7,0x01,0xfe,0xff,0x01] + +v_or3_b32 v5, s105, s105, exec_lo +// GFX13: v_or3_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x72,0xd7,0x69,0xd2,0xf8,0x01] + +v_or3_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_or3_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x72,0xd7,0x6a,0xf6,0x0c,0x04] + +v_or3_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_or3_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x72,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_or3_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_or3_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: 
[0x05,0x00,0x72,0xd7,0x7b,0xfa,0xed,0x01] + +v_or3_b32 v5, m0, 0.5, m0 +// GFX13: v_or3_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x72,0xd7,0x7d,0xe0,0xf5,0x01] + +v_or3_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_or3_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x72,0xd7,0x7e,0x82,0xad,0x01] + +v_or3_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_or3_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x72,0xd7,0x7f,0xf8,0xa8,0x01] + +v_or3_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_or3_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x72,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_or3_b32 v5, -1, exec_hi, src_scc +// GFX13: v_or3_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x72,0xd7,0xc1,0xfe,0xf4,0x03] + +v_or3_b32 v5, 0.5, m0, 0.5 +// GFX13: v_or3_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x72,0xd7,0xf0,0xfa,0xc0,0x03] + +v_or3_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_or3_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x72,0xd7,0xfd,0xd4,0x04,0x03] + +v_or3_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_or3_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x72,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_or_b16 v5.l, v1.l, v2.l +// GFX13: v_or_b16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x02] + +v_or_b16 v5.l, v255.l, v255.l +// GFX13: v_or_b16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x25,0xd7,0xff,0xff,0x03,0x02] + +v_or_b16 v5.l, s1, s2 +// GFX13: v_or_b16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x04,0x00,0x02] + +v_or_b16 v5.l, s105, s105 +// GFX13: v_or_b16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x25,0xd7,0x69,0xd2,0x00,0x02] + +v_or_b16 v5.l, vcc_lo, ttmp15 +// GFX13: v_or_b16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x25,0xd7,0x6a,0xf6,0x00,0x02] + +v_or_b16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_or_b16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x25,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_or_b16 v5.l, ttmp15, src_scc +// GFX13: v_or_b16 v5.l, ttmp15, 
src_scc ; encoding: [0x05,0x00,0x25,0xd7,0x7b,0xfa,0x01,0x02] + +v_or_b16 v5.l, m0, 0.5 +// GFX13-ASM: v_or_b16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x25,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_or_b16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x25,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_or_b16 v5.l, exec_lo, -1 +// GFX13: v_or_b16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x25,0xd7,0x7e,0x82,0x01,0x02] + +v_or_b16 v5.l, exec_hi, null +// GFX13: v_or_b16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x25,0xd7,0x7f,0xf8,0x00,0x02] + +v_or_b16 v5.l, null, exec_lo +// GFX13: v_or_b16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x25,0xd7,0x7c,0xfc,0x00,0x02] + +v_or_b16 v5.l, -1, exec_hi +// GFX13: v_or_b16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x25,0xd7,0xc1,0xfe,0x00,0x02] + +v_or_b16 v5.l, 0.5, m0 +// GFX13-ASM: v_or_b16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x25,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_or_b16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x25,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_or_b16 v5.l, src_scc, vcc_lo +// GFX13: v_or_b16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x25,0xd7,0xfd,0xd4,0x00,0x02] + +v_or_b16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_or_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x25,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_pack_b32_f16 v5, v1.l, v2.l +// GFX13: v_pack_b32_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x05,0x02,0x02] + +v_pack_b32_f16 v5, v255.l, v255.l +// GFX13: v_pack_b32_f16 v5, v255.l, v255.l ; encoding: [0x05,0x00,0x11,0xd7,0xff,0xff,0x03,0x02] + +v_pack_b32_f16 v5, s1, s2 +// GFX13: v_pack_b32_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x04,0x00,0x02] + +v_pack_b32_f16 v5, s105, s105 +// GFX13: v_pack_b32_f16 v5, s105, s105 ; encoding: [0x05,0x00,0x11,0xd7,0x69,0xd2,0x00,0x02] + +v_pack_b32_f16 v5, vcc_lo, ttmp15 +// GFX13: v_pack_b32_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x11,0xd7,0x6a,0xf6,0x00,0x02] + +v_pack_b32_f16 v5, vcc_hi, 0xfe0b +// 
GFX13: v_pack_b32_f16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x11,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_pack_b32_f16 v5, ttmp15, src_scc +// GFX13: v_pack_b32_f16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x11,0xd7,0x7b,0xfa,0x01,0x02] + +v_pack_b32_f16 v5, m0, 0.5 +// GFX13: v_pack_b32_f16 v5, m0, 0.5 ; encoding: [0x05,0x00,0x11,0xd7,0x7d,0xe0,0x01,0x02] + +v_pack_b32_f16 v5, exec_lo, -1 +// GFX13: v_pack_b32_f16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x11,0xd7,0x7e,0x82,0x01,0x02] + +v_pack_b32_f16 v5, |exec_hi|, null +// GFX13: v_pack_b32_f16 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x11,0xd7,0x7f,0xf8,0x00,0x02] + +v_pack_b32_f16 v5, null, exec_lo +// GFX13: v_pack_b32_f16 v5, null, exec_lo ; encoding: [0x05,0x00,0x11,0xd7,0x7c,0xfc,0x00,0x02] + +v_pack_b32_f16 v5, -1, exec_hi +// GFX13: v_pack_b32_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x11,0xd7,0xc1,0xfe,0x00,0x02] + +v_pack_b32_f16 v5, 0.5, -m0 op_sel:[0,0,0] +// GFX13: v_pack_b32_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x11,0xd7,0xf0,0xfa,0x00,0x42] + +v_pack_b32_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] +// GFX13: v_pack_b32_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0] ; encoding: [0x05,0x0a,0x11,0xd7,0xfd,0xd4,0x00,0x22] + +v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] +// GFX13: v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x11,0xd7,0xff,0xd6,0x00,0x62,0x0b,0xfe,0x00,0x00] + +v_perm_b32 v5, v1, v2, s3 +// GFX13: v_perm_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x44,0xd7,0x01,0x05,0x0e,0x00] + +v_perm_b32 v5, v255, s2, s105 +// GFX13: v_perm_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x44,0xd7,0xff,0x05,0xa4,0x01] + +v_perm_b32 v5, s1, v255, exec_hi +// GFX13: v_perm_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x44,0xd7,0x01,0xfe,0xff,0x01] + +v_perm_b32 v5, s105, s105, exec_lo +// GFX13: v_perm_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x44,0xd7,0x69,0xd2,0xf8,0x01] + +v_perm_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: 
v_perm_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x44,0xd7,0x6a,0xf6,0x0c,0x04] + +v_perm_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_perm_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x44,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_perm_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_perm_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x44,0xd7,0x7b,0xfa,0xed,0x01] + +v_perm_b32 v5, m0, 0.5, m0 +// GFX13: v_perm_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x44,0xd7,0x7d,0xe0,0xf5,0x01] + +v_perm_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_perm_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x44,0xd7,0x7e,0x82,0xad,0x01] + +v_perm_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_perm_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x44,0xd7,0x7f,0xf8,0xa8,0x01] + +v_perm_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_perm_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x44,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_perm_b32 v5, -1, exec_hi, src_scc +// GFX13: v_perm_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x44,0xd7,0xc1,0xfe,0xf4,0x03] + +v_perm_b32 v5, 0.5, m0, 0.5 +// GFX13: v_perm_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x44,0xd7,0xf0,0xfa,0xc0,0x03] + +v_perm_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_perm_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x44,0xd7,0xfd,0xd4,0x04,0x03] + +v_perm_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_perm_b32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x44,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] +// GFX13: v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0a,0x1a,0x04] + +v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] ; encoding: [0x02,0x00,0x48,0xd7,0x04,0xe3,0x18,0x00] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 100 +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, v5, 0x64 ; encoding: 
[0x02,0x00,0x48,0xd7,0x04,0x0b,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0b,0x12,0x02] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] +// GFX13: v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] ; encoding: [0x02,0x00,0x48,0xd7,0x04,0x0b,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] +// GFX13: v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x49,0xd7,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 100 +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 0x64 ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x09,0x12,0x02] + +v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] +// GFX13: v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] ; encoding: [0x02,0x00,0x49,0xd7,0x04,0x11,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] +// GFX13: v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 100 +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 0x64 ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x09,0x12,0x02] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] +// GFX13: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] ; encoding: [0x02,0x00,0x4a,0xd7,0x04,0x11,0x1a,0x04] + 
+v_permlane16_b32 v5, v1, s2, s3 +// GFX13: v_permlane16_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0x05,0x0c,0x00] + +v_permlane16_b32 v5, v1, s105, s105 +// GFX13: v_permlane16_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xd3,0xa4,0x01] + +v_permlane16_b32 v5, v1, ttmp15, ttmp15 +// GFX13: v_permlane16_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xf7,0xec,0x01] + +v_permlane16_b32 v5, v1, vcc_hi, exec_lo +// GFX13: v_permlane16_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xd7,0xf8,0x01] + +v_permlane16_b32 v5, v1, vcc_lo, m0 +// GFX13: v_permlane16_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xd5,0xf4,0x01] + +v_permlane16_b32 v5, v1, m0, vcc_hi +// GFX13: v_permlane16_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xfb,0xac,0x01] + +v_permlane16_b32 v5, v1, exec_hi, vcc_lo +// GFX13: v_permlane16_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xff,0xa8,0x01] + +v_permlane16_b32 v5, v1, exec_lo, src_scc +// GFX13: v_permlane16_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x77,0xd7,0x01,0xfd,0xf4,0x03] + +v_permlane16_b32 v5, v1, null, 0.5 op_sel:[1,1] +// GFX13: v_permlane16_b32 v5, v1, null, 0.5 op_sel:[1,1] ; encoding: [0x05,0x18,0x77,0xd7,0x01,0xf9,0xc0,0x03] + +v_permlane16_b32 v5, v1, -1, -1 op_sel:[0,0] +// GFX13: v_permlane16_b32 v5, v1, -1, -1 ; encoding: [0x05,0x00,0x77,0xd7,0x01,0x83,0x05,0x03] + +v_permlane16_b32 v5, v1, 0.5, null op_sel:[1,0] +// GFX13: v_permlane16_b32 v5, v1, 0.5, null op_sel:[1,0] ; encoding: [0x05,0x08,0x77,0xd7,0x01,0xe1,0xf1,0x01] + +v_permlane16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] +// GFX13: v_permlane16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] ; encoding: [0xff,0x10,0x77,0xd7,0xff,0xfb,0xfd,0x01] + +v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, s105, s105 +// W32: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, s2, s3 +// W32: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_bcast_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_permlane_down_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, s105, s105 +// W32: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, s2, s3 +// W32: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_down_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, exec_hi +// W32: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xff,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, exec_lo +// W32: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xfd,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, m0 +// W32: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xfb,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, s105 +// W32: 
v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xd3,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, s2 +// W32: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0x05,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, ttmp15 +// W32: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xf7,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, vcc_hi +// W32: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xd7,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_idx_gen_b32 v5, v1, vcc_lo +// W32: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x17,0xd7,0x01,0xd5,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, s105, s105 +// W32: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, s2, s3 +// W32: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_up_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo +// W32: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, exec_lo, src_scc +// W32: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, m0, vcc_hi +// W32: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, s105, s105 +// W32: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, s2, s3 +// W32: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 +// W32: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo +// W32: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlane_xor_b32 v5, v1, vcc_lo, m0 +// W32: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_permlanex16_b32 v5, v1, s2, s3 +// GFX13: v_permlanex16_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0x05,0x0c,0x00] + +v_permlanex16_b32 v5, v1, s105, s105 +// GFX13: v_permlanex16_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xd3,0xa4,0x01] + +v_permlanex16_b32 v5, v1, ttmp15, ttmp15 +// GFX13: v_permlanex16_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xf7,0xec,0x01] + +v_permlanex16_b32 v5, v1, vcc_hi, exec_lo +// GFX13: v_permlanex16_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xd7,0xf8,0x01] + +v_permlanex16_b32 v5, v1, vcc_lo, m0 +// GFX13: v_permlanex16_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xd5,0xf4,0x01] + +v_permlanex16_b32 v5, v1, m0, vcc_hi +// GFX13: v_permlanex16_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xfb,0xac,0x01] + +v_permlanex16_b32 v5, v1, exec_hi, vcc_lo +// GFX13: v_permlanex16_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xff,0xa8,0x01] + +v_permlanex16_b32 v5, v1, exec_lo, src_scc +// GFX13: v_permlanex16_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x78,0xd7,0x01,0xfd,0xf4,0x03] + +v_permlanex16_b32 v5, v1, null, 0.5 op_sel:[1,1] +// GFX13: v_permlanex16_b32 v5, v1, null, 0.5 op_sel:[1,1] ; encoding: [0x05,0x18,0x78,0xd7,0x01,0xf9,0xc0,0x03] + +v_permlanex16_b32 v5, v1, -1, -1 op_sel:[0,0] +// GFX13: v_permlanex16_b32 v5, v1, -1, -1 ; encoding: [0x05,0x00,0x78,0xd7,0x01,0x83,0x05,0x03] + +v_permlanex16_b32 v5, v1, 0.5, null op_sel:[1,0] +// GFX13: v_permlanex16_b32 v5, v1, 
0.5, null op_sel:[1,0] ; encoding: [0x05,0x08,0x78,0xd7,0x01,0xe1,0xf1,0x01] + +v_permlanex16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] +// GFX13: v_permlanex16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] ; encoding: [0xff,0x10,0x78,0xd7,0xff,0xfb,0xfd,0x01] + +v_permlane16_var_b32 v5, v1, v2 +// GFX13: v_permlane16_var_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x15,0xd7,0x01,0x05,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v255 +// GFX13: v_permlane16_var_b32 v5, v1, v255 ; encoding: [0x05,0x00,0x15,0xd7,0x01,0xff,0x03,0x02] + +v_permlane16_var_b32 v5, v255, v0 +// GFX13: v_permlane16_var_b32 v5, v255, v0 ; encoding: [0x05,0x00,0x15,0xd7,0xff,0x01,0x02,0x02] + +v_permlane16_var_b32 v255, v1, v2 +// GFX13: v_permlane16_var_b32 v255, v1, v2 ; encoding: [0xff,0x00,0x15,0xd7,0x01,0x05,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[1,1] +// GFX13: v_permlane16_var_b32 v5, v1, v50 op_sel:[1,1] ; encoding: [0x05,0x18,0x15,0xd7,0x01,0x65,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[0,0] +// GFX13: v_permlane16_var_b32 v5, v1, v50 ; encoding: [0x05,0x00,0x15,0xd7,0x01,0x65,0x02,0x02] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[1,0] +// GFX13: v_permlane16_var_b32 v5, v1, v50 op_sel:[1,0] ; encoding: [0x05,0x08,0x15,0xd7,0x01,0x65,0x02,0x02] + +v_permlane16_var_b32 v255, v255, v0, op_sel:[0,1] +// GFX13: v_permlane16_var_b32 v255, v255, v0 op_sel:[0,1] ; encoding: [0xff,0x10,0x15,0xd7,0xff,0x01,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v2 +// GFX13: v_permlanex16_var_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0x05,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v105 +// GFX13: v_permlanex16_var_b32 v5, v1, v105 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0xd3,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v255 +// GFX13: v_permlanex16_var_b32 v5, v1, v255 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0xff,0x03,0x02] + +v_permlanex16_var_b32 v255, v1, v2 +// GFX13: v_permlanex16_var_b32 v255, v1, v2 ; encoding: [0xff,0x00,0x16,0xd7,0x01,0x05,0x02,0x02] + 
+v_permlanex16_var_b32 v1, v255, v2 +// GFX13: v_permlanex16_var_b32 v1, v255, v2 ; encoding: [0x01,0x00,0x16,0xd7,0xff,0x05,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[1,1] +// GFX13: v_permlanex16_var_b32 v5, v1, v100 op_sel:[1,1] ; encoding: [0x05,0x18,0x16,0xd7,0x01,0xc9,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[0,0] +// GFX13: v_permlanex16_var_b32 v5, v1, v100 ; encoding: [0x05,0x00,0x16,0xd7,0x01,0xc9,0x02,0x02] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[1,0] +// GFX13: v_permlanex16_var_b32 v5, v1, v100 op_sel:[1,0] ; encoding: [0x05,0x08,0x16,0xd7,0x01,0xc9,0x02,0x02] + +v_permlanex16_var_b32 v255, v255, v100, op_sel:[0,1] +// GFX13: v_permlanex16_var_b32 v255, v255, v100 op_sel:[0,1] ; encoding: [0xff,0x10,0x16,0xd7,0xff,0xc9,0x02,0x02] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0x05,0xea,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], v255, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0xff,0xeb,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], s2, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0x05,0xe8,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[1:2], s105, ttmp[14:15] ; encoding: [0x05,0x00,0x3a,0xd6,0x01,0xd3,0xe8,0x01] + +v_qsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] +// GFX13: v_qsad_pk_u16_u8 v[5:6], v[254:255], ttmp15, s[6:7] ; encoding: [0x05,0x00,0x3a,0xd6,0xfe,0xf7,0x18,0x00] + +v_qsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] +// GFX13: v_qsad_pk_u16_u8 v[5:6], s[2:3], vcc_hi, v[3:4] ; encoding: [0x05,0x00,0x3a,0xd6,0x02,0xd6,0x0c,0x04] + +v_qsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] +// GFX13: v_qsad_pk_u16_u8 v[5:6], s[104:105], vcc_lo, s[104:105] ; encoding: [0x05,0x00,0x3a,0xd6,0x68,0xd4,0xa0,0x01] + +v_qsad_pk_u16_u8 
v[5:6], vcc, m0, v[254:255] +// GFX13: v_qsad_pk_u16_u8 v[5:6], vcc, m0, v[254:255] ; encoding: [0x05,0x00,0x3a,0xd6,0x6a,0xfa,0xf8,0x07] + +v_qsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null +// GFX13: v_qsad_pk_u16_u8 v[5:6], ttmp[14:15], exec_hi, null ; encoding: [0x05,0x00,0x3a,0xd6,0x7a,0xfe,0xf0,0x01] + +v_qsad_pk_u16_u8 v[5:6], exec, exec_lo, exec +// GFX13: v_qsad_pk_u16_u8 v[5:6], exec, exec_lo, exec ; encoding: [0x05,0x00,0x3a,0xd6,0x7e,0xfc,0xf8,0x01] + +v_qsad_pk_u16_u8 v[5:6], null, null, vcc +// GFX13: v_qsad_pk_u16_u8 v[5:6], null, null, vcc ; encoding: [0x05,0x00,0x3a,0xd6,0x7c,0xf8,0xa8,0x01] + +v_qsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 +// GFX13: v_qsad_pk_u16_u8 v[5:6], -1, -1, 0xaf123456 ; encoding: [0x05,0x00,0x3a,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] + +v_qsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc +// GFX13: v_qsad_pk_u16_u8 v[5:6], 0.5, 0.5, src_scc ; encoding: [0x05,0x00,0x3a,0xd6,0xf0,0xe0,0xf5,0x03] + +v_qsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 +// GFX13: v_qsad_pk_u16_u8 v[5:6], src_scc, src_scc, 0.5 ; encoding: [0x05,0x00,0x3a,0xd6,0xfd,0xfa,0xc1,0x03] + +v_qsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp +// GFX13: v_qsad_pk_u16_u8 v[254:255], 0xaf123456, 0xaf123456, -1 clamp ; encoding: [0xfe,0x80,0x3a,0xd6,0xff,0xfe,0x05,0x03,0x56,0x34,0x12,0xaf] + +v_readlane_b32 s5, v1, s2 +// GFX13: v_readlane_b32 s5, v1, s2 ; encoding: [0x05,0x00,0x60,0xd7,0x01,0x05,0x00,0x02] + +v_readlane_b32 s5, v1, s105 +// GFX13: v_readlane_b32 s5, v1, s105 ; encoding: [0x05,0x00,0x60,0xd7,0x01,0xd3,0x00,0x02] + +v_readlane_b32 s105, v1, ttmp15 +// GFX13: v_readlane_b32 s105, v1, ttmp15 ; encoding: [0x69,0x00,0x60,0xd7,0x01,0xf7,0x00,0x02] + +v_readlane_b32 vcc_lo, v1, vcc_hi +// GFX13: v_readlane_b32 vcc_lo, v1, vcc_hi ; encoding: [0x6a,0x00,0x60,0xd7,0x01,0xd7,0x00,0x02] + +v_readlane_b32 vcc_hi, v1, vcc_lo +// GFX13: v_readlane_b32 vcc_hi, v1, vcc_lo ; encoding: [0x6b,0x00,0x60,0xd7,0x01,0xd5,0x00,0x02] + +v_readlane_b32 ttmp15, v1, 
m0 +// GFX13: v_readlane_b32 ttmp15, v1, m0 ; encoding: [0x7b,0x00,0x60,0xd7,0x01,0xfb,0x00,0x02] + +v_readlane_b32 null, v255, null +// GFX13: v_readlane_b32 null, v255, null ; encoding: [0x7c,0x00,0x60,0xd7,0xff,0xf9,0x00,0x02] + +v_sad_hi_u8 v5, v1, v2, s3 +// GFX13: v_sad_hi_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x23,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_hi_u8 v5, v255, s2, s105 +// GFX13: v_sad_hi_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x23,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_hi_u8 v5, s1, v255, exec_hi +// GFX13: v_sad_hi_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x23,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_hi_u8 v5, s105, s105, exec_lo +// GFX13: v_sad_hi_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x23,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_hi_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_hi_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x23,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_hi_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_sad_hi_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x23,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_sad_hi_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_hi_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x23,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_hi_u8 v5, m0, 0.5, m0 +// GFX13: v_sad_hi_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x23,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_hi_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_hi_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x23,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_hi_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_hi_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x23,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_hi_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_hi_u8 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x23,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_sad_hi_u8 v5, -1, exec_hi, src_scc +// GFX13: v_sad_hi_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x23,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_hi_u8 v5, 0.5, m0, 0.5 +// GFX13: v_sad_hi_u8 v5, 0.5, 
m0, 0.5 ; encoding: [0x05,0x00,0x23,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_hi_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_hi_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x23,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_hi_u8 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_sad_hi_u8 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x23,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_sad_u16 v5, v1, v2, s3 +// GFX13: v_sad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x24,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_u16 v5, v255, s2, s105 +// GFX13: v_sad_u16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x24,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_u16 v5, s1, v255, exec_hi +// GFX13: v_sad_u16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x24,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_u16 v5, s105, s105, exec_lo +// GFX13: v_sad_u16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x24,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_u16 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_u16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x24,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_u16 v5, vcc_hi, 0xfe0b, v255 +// GFX13: v_sad_u16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x24,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_sad_u16 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x24,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_u16 v5, m0, 0.5, m0 +// GFX13: v_sad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x24,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_u16 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_u16 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x24,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_u16 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_u16 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x24,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_u16 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_u16 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x24,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_sad_u16 v5, -1, exec_hi, src_scc +// GFX13: v_sad_u16 v5, -1, exec_hi, src_scc ; encoding: 
[0x05,0x00,0x24,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_u16 v5, 0.5, m0, 0.5 +// GFX13: v_sad_u16 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x24,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_u16 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_u16 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x24,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_u16 v255, 0xfe0b, vcc_hi, null clamp +// GFX13: v_sad_u16 v255, 0xfe0b, vcc_hi, null clamp ; encoding: [0xff,0x80,0x24,0xd6,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00] + +v_sad_u32 v5, v1, v2, s3 +// GFX13: v_sad_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x25,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_u32 v5, v255, s2, s105 +// GFX13: v_sad_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x25,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_u32 v5, s1, v255, exec_hi +// GFX13: v_sad_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x25,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_u32 v5, s105, s105, exec_lo +// GFX13: v_sad_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x25,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x25,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_sad_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x25,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_sad_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x25,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_u32 v5, m0, 0.5, m0 +// GFX13: v_sad_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x25,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x25,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x25,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x25,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + 
+v_sad_u32 v5, -1, exec_hi, src_scc +// GFX13: v_sad_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x25,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_u32 v5, 0.5, m0, 0.5 +// GFX13: v_sad_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x25,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x25,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_u32 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_sad_u32 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x25,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_sad_u8 v5, v1, v2, s3 +// GFX13: v_sad_u8 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x22,0xd6,0x01,0x05,0x0e,0x00] + +v_sad_u8 v5, v255, s2, s105 +// GFX13: v_sad_u8 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x22,0xd6,0xff,0x05,0xa4,0x01] + +v_sad_u8 v5, s1, v255, exec_hi +// GFX13: v_sad_u8 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x22,0xd6,0x01,0xfe,0xff,0x01] + +v_sad_u8 v5, s105, s105, exec_lo +// GFX13: v_sad_u8 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x22,0xd6,0x69,0xd2,0xf8,0x01] + +v_sad_u8 v5, vcc_lo, ttmp15, v3 +// GFX13: v_sad_u8 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x22,0xd6,0x6a,0xf6,0x0c,0x04] + +v_sad_u8 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_sad_u8 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x22,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_sad_u8 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_sad_u8 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x22,0xd6,0x7b,0xfa,0xed,0x01] + +v_sad_u8 v5, m0, 0.5, m0 +// GFX13: v_sad_u8 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x22,0xd6,0x7d,0xe0,0xf5,0x01] + +v_sad_u8 v5, exec_lo, -1, vcc_hi +// GFX13: v_sad_u8 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x22,0xd6,0x7e,0x82,0xad,0x01] + +v_sad_u8 v5, exec_hi, null, vcc_lo +// GFX13: v_sad_u8 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x22,0xd6,0x7f,0xf8,0xa8,0x01] + +v_sad_u8 v5, null, exec_lo, 0xaf123456 +// GFX13: v_sad_u8 v5, null, exec_lo, 0xaf123456 
; encoding: [0x05,0x00,0x22,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_sad_u8 v5, -1, exec_hi, src_scc +// GFX13: v_sad_u8 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x22,0xd6,0xc1,0xfe,0xf4,0x03] + +v_sad_u8 v5, 0.5, m0, 0.5 +// GFX13: v_sad_u8 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x22,0xd6,0xf0,0xfa,0xc0,0x03] + +v_sad_u8 v5, src_scc, vcc_lo, -1 +// GFX13: v_sad_u8 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x22,0xd6,0xfd,0xd4,0x04,0x03] + +v_sad_u8 v255, 0xaf123456, vcc_hi, null clamp +// GFX13: v_sad_u8 v255, 0xaf123456, vcc_hi, null clamp ; encoding: [0xff,0x80,0x22,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_sub_co_u32 v5, s6, v1, v2 +// W32: v_sub_co_u32 v5, s6, v1, v2 ; encoding: [0x05,0x06,0x10,0xd7,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, v255, v255 +// W32: v_sub_co_u32 v5, s6, v255, v255 ; encoding: [0x05,0x06,0x10,0xd7,0xff,0xff,0x03,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, s1, s2 +// W32: v_sub_co_u32 v5, s6, s1, s2 ; encoding: [0x05,0x06,0x10,0xd7,0x01,0x04,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, s105, s105 +// W32: v_sub_co_u32 v5, s6, s105, s105 ; encoding: [0x05,0x06,0x10,0xd7,0x69,0xd2,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, vcc_lo, ttmp15 +// W32: v_sub_co_u32 v5, s6, vcc_lo, ttmp15 ; encoding: [0x05,0x06,0x10,0xd7,0x6a,0xf6,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, vcc_hi, 0xaf123456 +// W32: v_sub_co_u32 v5, s6, vcc_hi, 0xaf123456 ; encoding: [0x05,0x06,0x10,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, ttmp15, src_scc +// W32: v_sub_co_u32 v5, s6, ttmp15, src_scc ; encoding: 
[0x05,0x06,0x10,0xd7,0x7b,0xfa,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, m0, 0.5 +// W32: v_sub_co_u32 v5, s6, m0, 0.5 ; encoding: [0x05,0x06,0x10,0xd7,0x7d,0xe0,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, exec_lo, -1 +// W32: v_sub_co_u32 v5, s6, exec_lo, -1 ; encoding: [0x05,0x06,0x10,0xd7,0x7e,0x82,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s6, exec_hi, null +// W32: v_sub_co_u32 v5, s6, exec_hi, null ; encoding: [0x05,0x06,0x10,0xd7,0x7f,0xf8,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s105, null, exec_lo +// W32: v_sub_co_u32 v5, s105, null, exec_lo ; encoding: [0x05,0x69,0x10,0xd7,0x7c,0xfc,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, vcc_lo, -1, exec_hi +// W32: v_sub_co_u32 v5, vcc_lo, -1, exec_hi ; encoding: [0x05,0x6a,0x10,0xd7,0xc1,0xfe,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, vcc_hi, 0.5, m0 +// W32: v_sub_co_u32 v5, vcc_hi, 0.5, m0 ; encoding: [0x05,0x6b,0x10,0xd7,0xf0,0xfa,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, ttmp15, src_scc, vcc_lo +// W32: v_sub_co_u32 v5, ttmp15, src_scc, vcc_lo ; encoding: [0x05,0x7b,0x10,0xd7,0xfd,0xd4,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:18: error: invalid operand for instruction + +v_sub_co_u32 v5, s[12:13], v1, v2 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], v1, v2 ; encoding: [0x05,0x0c,0x10,0xd7,0x01,0x05,0x02,0x02] + +v_sub_co_u32 v5, s[12:13], v255, v255 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], v255, v255 ; encoding: [0x05,0x0c,0x10,0xd7,0xff,0xff,0x03,0x02] + +v_sub_co_u32 v5, s[12:13], s1, s2 
+// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], s1, s2 ; encoding: [0x05,0x0c,0x10,0xd7,0x01,0x04,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], s105, s105 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], s105, s105 ; encoding: [0x05,0x0c,0x10,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], vcc_lo, ttmp15 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], vcc_lo, ttmp15 ; encoding: [0x05,0x0c,0x10,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 ; encoding: [0x05,0x0c,0x10,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_sub_co_u32 v5, s[12:13], ttmp15, src_scc +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], ttmp15, src_scc ; encoding: [0x05,0x0c,0x10,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_co_u32 v5, s[12:13], m0, 0.5 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], m0, 0.5 ; encoding: [0x05,0x0c,0x10,0xd7,0x7d,0xe0,0x01,0x02] + +v_sub_co_u32 v5, s[12:13], exec_lo, -1 +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], exec_lo, -1 ; encoding: [0x05,0x0c,0x10,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_co_u32 v5, s[12:13], exec_hi, null +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], exec_hi, null ; encoding: [0x05,0x0c,0x10,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_co_u32 v5, s[12:13], null, exec_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, s[12:13], null, exec_lo ; encoding: [0x05,0x0c,0x10,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_co_u32 v5, s[104:105], -1, exec_hi +// W32-ERR: :[[@LINE-1]]:18: error: 
invalid operand for instruction +// W64: v_sub_co_u32 v5, s[104:105], -1, exec_hi ; encoding: [0x05,0x68,0x10,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_co_u32 v5, vcc, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// W64: v_sub_co_u32 v5, vcc, 0.5, m0 ; encoding: [0x05,0x6a,0x10,0xd7,0xf0,0xfa,0x00,0x02] + +v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo +// W32-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction +// W64: v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo ; encoding: [0x05,0x7a,0x10,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp +// GFX13: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x10,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_sub_nc_i16 v5.l, v1.l, v2.l +// GFX13: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x02] + +v_sub_nc_i16 v5.l, v255.l, v255.l +// GFX13: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x02] + +v_sub_nc_i16 v5.l, s1, s2 +// GFX13: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x02] + +v_sub_nc_i16 v5.l, s105, s105 +// GFX13: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_nc_i16 v5.l, vcc_lo, ttmp15 +// GFX13: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i16 v5.l, ttmp15, src_scc +// GFX13: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_nc_i16 v5.l, m0, 0.5 +// GFX13-ASM: v_sub_nc_i16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_i16 v5.l, exec_lo, -1 +// GFX13: 
v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_nc_i16 v5.l, exec_hi, null +// GFX13: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] +// GFX13: v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i32 v5, v1, v2 +// GFX13: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x76,0xd7,0x01,0x05,0x02,0x02] + +v_sub_nc_i32 v5, v255, v255 +// GFX13: v_sub_nc_i32 v5, v255, v255 ; encoding: [0x05,0x00,0x76,0xd7,0xff,0xff,0x03,0x02] + +v_sub_nc_i32 v5, s1, s2 +// GFX13: v_sub_nc_i32 v5, s1, s2 ; encoding: [0x05,0x00,0x76,0xd7,0x01,0x04,0x00,0x02] + +v_sub_nc_i32 v5, s105, s105 +// GFX13: v_sub_nc_i32 v5, s105, s105 ; encoding: [0x05,0x00,0x76,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_nc_i32 v5, vcc_lo, ttmp15 +// GFX13: v_sub_nc_i32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x76,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_nc_i32 v5, vcc_hi, 0xaf123456 +// GFX13: v_sub_nc_i32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x76,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_sub_nc_i32 v5, ttmp15, src_scc +// 
GFX13: v_sub_nc_i32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x76,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_nc_i32 v5, m0, 0.5 +// GFX13: v_sub_nc_i32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x76,0xd7,0x7d,0xe0,0x01,0x02] + +v_sub_nc_i32 v5, exec_lo, -1 +// GFX13: v_sub_nc_i32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x76,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_nc_i32 v5, exec_hi, null +// GFX13: v_sub_nc_i32 v5, exec_hi, null ; encoding: [0x05,0x00,0x76,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_nc_i32 v5, null, exec_lo +// GFX13: v_sub_nc_i32 v5, null, exec_lo ; encoding: [0x05,0x00,0x76,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_nc_i32 v5, -1, exec_hi +// GFX13: v_sub_nc_i32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x76,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_nc_i32 v5, 0.5, m0 +// GFX13: v_sub_nc_i32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x76,0xd7,0xf0,0xfa,0x00,0x02] + +v_sub_nc_i32 v5, src_scc, vcc_lo +// GFX13: v_sub_nc_i32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x76,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp +// GFX13: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x76,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_sub_nc_u16 v5.l, v1.l, v2.l +// GFX13: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x02] + +v_sub_nc_u16 v5.l, v255.l, v255.l +// GFX13: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x02] + +v_sub_nc_u16 v5.l, s1, s2 +// GFX13: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x02] + +v_sub_nc_u16 v5.l, s105, s105 +// GFX13: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x02] + +v_sub_nc_u16 v5.l, vcc_lo, ttmp15 +// GFX13: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x02] + +v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_sub_nc_u16 v5.l, ttmp15, src_scc +// 
GFX13: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x02] + +v_sub_nc_u16 v5.l, m0, 0.5 +// GFX13-ASM: v_sub_nc_u16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_u16 v5.l, exec_lo, -1 +// GFX13: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x02] + +v_sub_nc_u16 v5.l, exec_hi, null +// GFX13: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x02] + +v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] +// GFX13: v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x02] + +v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] +// GFX13: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x02] + +v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] +// GFX13-ASM: v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] +// GFX13: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x02] + +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX13: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_subrev_co_u32 v5, s6, v1, v2 +// W32: v_subrev_co_u32 v5, s6, v1, v2 ; encoding: [0x05,0x06,0x19,0xd7,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, v255, v255 +// W32: v_subrev_co_u32 v5, s6, v255, v255 ; encoding: [0x05,0x06,0x19,0xd7,0xff,0xff,0x03,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, 
s6, s1, s2 +// W32: v_subrev_co_u32 v5, s6, s1, s2 ; encoding: [0x05,0x06,0x19,0xd7,0x01,0x04,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, s105, s105 +// W32: v_subrev_co_u32 v5, s6, s105, s105 ; encoding: [0x05,0x06,0x19,0xd7,0x69,0xd2,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, vcc_lo, ttmp15 +// W32: v_subrev_co_u32 v5, s6, vcc_lo, ttmp15 ; encoding: [0x05,0x06,0x19,0xd7,0x6a,0xf6,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, vcc_hi, 0xaf123456 +// W32: v_subrev_co_u32 v5, s6, vcc_hi, 0xaf123456 ; encoding: [0x05,0x06,0x19,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, ttmp15, src_scc +// W32: v_subrev_co_u32 v5, s6, ttmp15, src_scc ; encoding: [0x05,0x06,0x19,0xd7,0x7b,0xfa,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, m0, 0.5 +// W32: v_subrev_co_u32 v5, s6, m0, 0.5 ; encoding: [0x05,0x06,0x19,0xd7,0x7d,0xe0,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, exec_lo, -1 +// W32: v_subrev_co_u32 v5, s6, exec_lo, -1 ; encoding: [0x05,0x06,0x19,0xd7,0x7e,0x82,0x01,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s6, exec_hi, null +// W32: v_subrev_co_u32 v5, s6, exec_hi, null ; encoding: [0x05,0x06,0x19,0xd7,0x7f,0xf8,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s105, null, exec_lo +// W32: v_subrev_co_u32 v5, s105, null, exec_lo ; encoding: [0x05,0x69,0x19,0xd7,0x7c,0xfc,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, vcc_lo, -1, exec_hi +// W32: v_subrev_co_u32 v5, vcc_lo, -1, exec_hi ; encoding: 
[0x05,0x6a,0x19,0xd7,0xc1,0xfe,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, vcc_hi, 0.5, m0 +// W32: v_subrev_co_u32 v5, vcc_hi, 0.5, m0 ; encoding: [0x05,0x6b,0x19,0xd7,0xf0,0xfa,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, ttmp15, src_scc, vcc_lo +// W32: v_subrev_co_u32 v5, ttmp15, src_scc, vcc_lo ; encoding: [0x05,0x7b,0x19,0xd7,0xfd,0xd4,0x00,0x02] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_subrev_co_u32 v5, s[12:13], v1, v2 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], v1, v2 ; encoding: [0x05,0x0c,0x19,0xd7,0x01,0x05,0x02,0x02] + +v_subrev_co_u32 v5, s[12:13], v255, v255 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], v255, v255 ; encoding: [0x05,0x0c,0x19,0xd7,0xff,0xff,0x03,0x02] + +v_subrev_co_u32 v5, s[12:13], s1, s2 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], s1, s2 ; encoding: [0x05,0x0c,0x19,0xd7,0x01,0x04,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], s105, s105 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], s105, s105 ; encoding: [0x05,0x0c,0x19,0xd7,0x69,0xd2,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], vcc_lo, ttmp15 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], vcc_lo, ttmp15 ; encoding: [0x05,0x0c,0x19,0xd7,0x6a,0xf6,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], vcc_hi, 0xaf123456 ; encoding: [0x05,0x0c,0x19,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_subrev_co_u32 v5, s[12:13], ttmp15, src_scc +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: 
v_subrev_co_u32 v5, s[12:13], ttmp15, src_scc ; encoding: [0x05,0x0c,0x19,0xd7,0x7b,0xfa,0x01,0x02] + +v_subrev_co_u32 v5, s[12:13], m0, 0.5 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], m0, 0.5 ; encoding: [0x05,0x0c,0x19,0xd7,0x7d,0xe0,0x01,0x02] + +v_subrev_co_u32 v5, s[12:13], exec_lo, -1 +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], exec_lo, -1 ; encoding: [0x05,0x0c,0x19,0xd7,0x7e,0x82,0x01,0x02] + +v_subrev_co_u32 v5, s[12:13], exec_hi, null +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], exec_hi, null ; encoding: [0x05,0x0c,0x19,0xd7,0x7f,0xf8,0x00,0x02] + +v_subrev_co_u32 v5, s[12:13], null, exec_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[12:13], null, exec_lo ; encoding: [0x05,0x0c,0x19,0xd7,0x7c,0xfc,0x00,0x02] + +v_subrev_co_u32 v5, s[104:105], -1, exec_hi +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, s[104:105], -1, exec_hi ; encoding: [0x05,0x68,0x19,0xd7,0xc1,0xfe,0x00,0x02] + +v_subrev_co_u32 v5, vcc, 0.5, m0 +// W32-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// W64: v_subrev_co_u32 v5, vcc, 0.5, m0 ; encoding: [0x05,0x6a,0x19,0xd7,0xf0,0xfa,0x00,0x02] + +v_subrev_co_u32 v5, ttmp[14:15], src_scc, vcc_lo +// W32-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction +// W64: v_subrev_co_u32 v5, ttmp[14:15], src_scc, vcc_lo ; encoding: [0x05,0x7a,0x19,0xd7,0xfd,0xd4,0x00,0x02] + +v_subrev_co_u32 v255, null, 0xaf123456, vcc_hi clamp +// GFX13: v_subrev_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x19,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_trig_preop_f64 v[5:6], v[1:2], v2 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], v2 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0x05,0x02,0x02] + +v_trig_preop_f64 
v[5:6], v[1:2], v255 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], v255 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0xff,0x03,0x02] + +v_trig_preop_f64 v[5:6], v[1:2], s2 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], s2 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0x05,0x00,0x02] + +v_trig_preop_f64 v[5:6], v[1:2], s105 +// GFX13: v_trig_preop_f64 v[5:6], v[1:2], s105 ; encoding: [0x05,0x00,0x2f,0xd7,0x01,0xd3,0x00,0x02] + +v_trig_preop_f64 v[5:6], v[254:255], ttmp15 +// GFX13: v_trig_preop_f64 v[5:6], v[254:255], ttmp15 ; encoding: [0x05,0x00,0x2f,0xd7,0xfe,0xf7,0x00,0x02] + +v_trig_preop_f64 v[5:6], s[2:3], vcc_hi +// GFX13: v_trig_preop_f64 v[5:6], s[2:3], vcc_hi ; encoding: [0x05,0x00,0x2f,0xd7,0x02,0xd6,0x00,0x02] + +v_trig_preop_f64 v[5:6], s[104:105], vcc_lo +// GFX13: v_trig_preop_f64 v[5:6], s[104:105], vcc_lo ; encoding: [0x05,0x00,0x2f,0xd7,0x68,0xd4,0x00,0x02] + +v_trig_preop_f64 v[5:6], vcc, m0 +// GFX13: v_trig_preop_f64 v[5:6], vcc, m0 ; encoding: [0x05,0x00,0x2f,0xd7,0x6a,0xfa,0x00,0x02] + +v_trig_preop_f64 v[5:6], ttmp[14:15], exec_hi +// GFX13: v_trig_preop_f64 v[5:6], ttmp[14:15], exec_hi ; encoding: [0x05,0x00,0x2f,0xd7,0x7a,0xfe,0x00,0x02] + +v_trig_preop_f64 v[5:6], exec, exec_lo +// GFX13: v_trig_preop_f64 v[5:6], exec, exec_lo ; encoding: [0x05,0x00,0x2f,0xd7,0x7e,0xfc,0x00,0x02] + +v_trig_preop_f64 v[5:6], null, null +// GFX13: v_trig_preop_f64 v[5:6], null, null ; encoding: [0x05,0x00,0x2f,0xd7,0x7c,0xf8,0x00,0x02] + +v_trig_preop_f64 v[5:6], -1, -1 +// GFX13: v_trig_preop_f64 v[5:6], -1, -1 ; encoding: [0x05,0x00,0x2f,0xd7,0xc1,0x82,0x01,0x02] + +v_trig_preop_f64 v[5:6], 0.5, 0.5 mul:2 +// GFX13: v_trig_preop_f64 v[5:6], 0.5, 0.5 mul:2 ; encoding: [0x05,0x00,0x2f,0xd7,0xf0,0xe0,0x01,0x0a] + +v_trig_preop_f64 v[5:6], -|src_scc|, src_scc mul:4 +// GFX13: v_trig_preop_f64 v[5:6], -|src_scc|, src_scc mul:4 ; encoding: [0x05,0x01,0x2f,0xd7,0xfd,0xfa,0x01,0x32] + +v_trig_preop_f64 v[254:255], 0xaf123456, 0xaf123456 clamp div:2 +// GFX13: v_trig_preop_f64 
v[254:255], 0xaf123456, 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x2f,0xd7,0xff,0xfe,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_writelane_b32 v5, s1, s2 +// GFX13: v_writelane_b32 v5, s1, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x01,0x04,0x00,0x02] + +v_writelane_b32 v5, s105, s2 +// GFX13: v_writelane_b32 v5, s105, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x69,0x04,0x00,0x02] + +v_writelane_b32 v5, vcc_lo, s2 +// GFX13: v_writelane_b32 v5, vcc_lo, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x6a,0x04,0x00,0x02] + +v_writelane_b32 v5, vcc_hi, s2 +// GFX13: v_writelane_b32 v5, vcc_hi, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x6b,0x04,0x00,0x02] + +v_writelane_b32 v5, ttmp15, s2 +// GFX13: v_writelane_b32 v5, ttmp15, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x7b,0x04,0x00,0x02] + +v_writelane_b32 v5, m0, s2 +// GFX13: v_writelane_b32 v5, m0, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x7d,0x04,0x00,0x02] + +v_writelane_b32 v5, exec_lo, s2 +// GFX13: v_writelane_b32 v5, exec_lo, s2 ; encoding: [0x05,0x00,0x61,0xd7,0x7e,0x04,0x00,0x02] + +v_writelane_b32 v5, exec_hi, s105 +// GFX13: v_writelane_b32 v5, exec_hi, s105 ; encoding: [0x05,0x00,0x61,0xd7,0x7f,0xd2,0x00,0x02] + +v_writelane_b32 v5, null, ttmp15 +// GFX13: v_writelane_b32 v5, null, ttmp15 ; encoding: [0x05,0x00,0x61,0xd7,0x7c,0xf6,0x00,0x02] + +v_writelane_b32 v5, -1, null +// GFX13: v_writelane_b32 v5, -1, null ; encoding: [0x05,0x00,0x61,0xd7,0xc1,0xf8,0x00,0x02] + +v_writelane_b32 v5, 0.5, m0 +// GFX13: v_writelane_b32 v5, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd7,0xf0,0xfa,0x00,0x02] + +v_writelane_b32 v5, src_scc, vcc_lo +// GFX13: v_writelane_b32 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x61,0xd7,0xfd,0xd4,0x00,0x02] + +v_writelane_b32 v255, 0xaf123456, vcc_hi +// GFX13: v_writelane_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x61,0xd7,0xff,0xd6,0x00,0x02,0x56,0x34,0x12,0xaf] + +v_xad_u32 v5, v1, v2, s3 +// GFX13: v_xad_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x45,0xd7,0x01,0x05,0x0e,0x00] + +v_xad_u32 v5, v255, s2, s105 +// 
GFX13: v_xad_u32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x45,0xd7,0xff,0x05,0xa4,0x01] + +v_xad_u32 v5, s1, v255, exec_hi +// GFX13: v_xad_u32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x45,0xd7,0x01,0xfe,0xff,0x01] + +v_xad_u32 v5, s105, s105, exec_lo +// GFX13: v_xad_u32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x45,0xd7,0x69,0xd2,0xf8,0x01] + +v_xad_u32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_xad_u32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x45,0xd7,0x6a,0xf6,0x0c,0x04] + +v_xad_u32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_xad_u32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x45,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_xad_u32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_xad_u32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x45,0xd7,0x7b,0xfa,0xed,0x01] + +v_xad_u32 v5, m0, 0.5, m0 +// GFX13: v_xad_u32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x45,0xd7,0x7d,0xe0,0xf5,0x01] + +v_xad_u32 v5, exec_lo, -1, vcc_hi +// GFX13: v_xad_u32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x45,0xd7,0x7e,0x82,0xad,0x01] + +v_xad_u32 v5, exec_hi, null, vcc_lo +// GFX13: v_xad_u32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x45,0xd7,0x7f,0xf8,0xa8,0x01] + +v_xad_u32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_xad_u32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x45,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_xad_u32 v5, -1, exec_hi, src_scc +// GFX13: v_xad_u32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x45,0xd7,0xc1,0xfe,0xf4,0x03] + +v_xad_u32 v5, 0.5, m0, 0.5 +// GFX13: v_xad_u32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x45,0xd7,0xf0,0xfa,0xc0,0x03] + +v_xad_u32 v5, src_scc, vcc_lo, -1 +// GFX13: v_xad_u32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x45,0xd7,0xfd,0xd4,0x04,0x03] + +v_xad_u32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_xad_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x45,0xd7,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_xor3_b32 v5, v1, v2, s3 +// GFX13: v_xor3_b32 v5, v1, v2, s3 ; 
encoding: [0x05,0x00,0x40,0xd6,0x01,0x05,0x0e,0x00] + +v_xor3_b32 v5, v255, s2, s105 +// GFX13: v_xor3_b32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x40,0xd6,0xff,0x05,0xa4,0x01] + +v_xor3_b32 v5, s1, v255, exec_hi +// GFX13: v_xor3_b32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x40,0xd6,0x01,0xfe,0xff,0x01] + +v_xor3_b32 v5, s105, s105, exec_lo +// GFX13: v_xor3_b32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x40,0xd6,0x69,0xd2,0xf8,0x01] + +v_xor3_b32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_xor3_b32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x40,0xd6,0x6a,0xf6,0x0c,0x04] + +v_xor3_b32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_xor3_b32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x40,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_xor3_b32 v5, ttmp15, src_scc, ttmp15 +// GFX13: v_xor3_b32 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x00,0x40,0xd6,0x7b,0xfa,0xed,0x01] + +v_xor3_b32 v5, m0, 0.5, m0 +// GFX13: v_xor3_b32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x40,0xd6,0x7d,0xe0,0xf5,0x01] + +v_xor3_b32 v5, exec_lo, -1, vcc_hi +// GFX13: v_xor3_b32 v5, exec_lo, -1, vcc_hi ; encoding: [0x05,0x00,0x40,0xd6,0x7e,0x82,0xad,0x01] + +v_xor3_b32 v5, exec_hi, null, vcc_lo +// GFX13: v_xor3_b32 v5, exec_hi, null, vcc_lo ; encoding: [0x05,0x00,0x40,0xd6,0x7f,0xf8,0xa8,0x01] + +v_xor3_b32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_xor3_b32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x40,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_xor3_b32 v5, -1, exec_hi, src_scc +// GFX13: v_xor3_b32 v5, -1, exec_hi, src_scc ; encoding: [0x05,0x00,0x40,0xd6,0xc1,0xfe,0xf4,0x03] + +v_xor3_b32 v5, 0.5, m0, 0.5 +// GFX13: v_xor3_b32 v5, 0.5, m0, 0.5 ; encoding: [0x05,0x00,0x40,0xd6,0xf0,0xfa,0xc0,0x03] + +v_xor3_b32 v5, src_scc, vcc_lo, -1 +// GFX13: v_xor3_b32 v5, src_scc, vcc_lo, -1 ; encoding: [0x05,0x00,0x40,0xd6,0xfd,0xd4,0x04,0x03] + +v_xor3_b32 v255, 0xaf123456, vcc_hi, null +// GFX13: v_xor3_b32 v255, 0xaf123456, vcc_hi, null ; encoding: 
[0xff,0x00,0x40,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] + +v_xor_b16 v5.l, v1.l, v2.l +// GFX13: v_xor_b16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x70,0xd7,0x01,0x05,0x02,0x02] + +v_xor_b16 v5.l, v255.l, v255.l +// GFX13: v_xor_b16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x70,0xd7,0xff,0xff,0x03,0x02] + +v_xor_b16 v5.l, s1, s2 +// GFX13: v_xor_b16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x70,0xd7,0x01,0x04,0x00,0x02] + +v_xor_b16 v5.l, s105, s105 +// GFX13: v_xor_b16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x70,0xd7,0x69,0xd2,0x00,0x02] + +v_xor_b16 v5.l, vcc_lo, ttmp15 +// GFX13: v_xor_b16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x70,0xd7,0x6a,0xf6,0x00,0x02] + +v_xor_b16 v5.l, vcc_hi, 0xfe0b +// GFX13: v_xor_b16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x70,0xd7,0x6b,0xfe,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_xor_b16 v5.l, ttmp15, src_scc +// GFX13: v_xor_b16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x70,0xd7,0x7b,0xfa,0x01,0x02] + +v_xor_b16 v5.l, m0, 0.5 +// GFX13-ASM: v_xor_b16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x70,0xd7,0x7d,0xe0,0x01,0x02] +// GFX13-DIS: v_xor_b16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x70,0xd7,0x7d,0xfe,0x01,0x02,0x00,0x38,0x00,0x00] + +v_xor_b16 v5.l, exec_lo, -1 +// GFX13: v_xor_b16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x70,0xd7,0x7e,0x82,0x01,0x02] + +v_xor_b16 v5.l, exec_hi, null +// GFX13: v_xor_b16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x70,0xd7,0x7f,0xf8,0x00,0x02] + +v_xor_b16 v5.l, null, exec_lo +// GFX13: v_xor_b16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x70,0xd7,0x7c,0xfc,0x00,0x02] + +v_xor_b16 v5.l, -1, exec_hi +// GFX13: v_xor_b16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x70,0xd7,0xc1,0xfe,0x00,0x02] + +v_xor_b16 v5.l, 0.5, m0 +// GFX13-ASM: v_xor_b16 v5.l, 0.5, m0 ; encoding: [0x05,0x00,0x70,0xd7,0xf0,0xfa,0x00,0x02] +// GFX13-DIS: v_xor_b16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x70,0xd7,0xff,0xfa,0x00,0x02,0x00,0x38,0x00,0x00] + +v_xor_b16 v5.l, src_scc, vcc_lo +// GFX13: v_xor_b16 v5.l, 
src_scc, vcc_lo ; encoding: [0x05,0x00,0x70,0xd7,0xfd,0xd4,0x00,0x02] + +v_xor_b16 v255.l, 0xfe0b, vcc_hi +// GFX13: v_xor_b16 v255.l, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x70,0xd7,0xff,0xd6,0x00,0x02,0x0b,0xfe,0x00,0x00] + +v_minimum_f32 v5, v1, v2 +// GFX13: v_minimum_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x3c,0xd7,0x01,0x05,0x02,0x02] + +v_minimum_f32 v5, v255, v255 +// GFX13: v_minimum_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x3c,0xd7,0xff,0xff,0x03,0x02] + +v_minimum_f32 v5, s1, s2 +// GFX13: v_minimum_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x3c,0xd7,0x01,0x04,0x00,0x02] + +v_minimum_f32 v5, s105, s105 +// GFX13: v_minimum_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x3c,0xd7,0x69,0xd2,0x00,0x02] + +v_minimum_f32 v5, vcc_lo, ttmp15 +// GFX13: v_minimum_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3c,0xd7,0x6a,0xf6,0x00,0x02] + +v_minimum_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_minimum_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x3c,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_minimum_f32 v5, ttmp15, src_scc +// GFX13: v_minimum_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3c,0xd7,0x7b,0xfa,0x01,0x02] + +v_minimum_f32 v5, m0, 0.5 +// GFX13: v_minimum_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3c,0xd7,0x7d,0xe0,0x01,0x02] + +v_minimum_f32 v5, exec_lo, -1 +// GFX13: v_minimum_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3c,0xd7,0x7e,0x82,0x01,0x02] + +v_minimum_f32 v5, |exec_hi|, null +// GFX13: v_minimum_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x3c,0xd7,0x7f,0xf8,0x00,0x02] + +v_minimum_f32 v5, null, exec_lo +// GFX13: v_minimum_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x3c,0xd7,0x7c,0xfc,0x00,0x02] + +v_minimum_f32 v5, -1, exec_hi +// GFX13: v_minimum_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x3c,0xd7,0xc1,0xfe,0x00,0x02] + +v_minimum_f32 v5, 0.5, -m0 +// GFX13: v_minimum_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x3c,0xd7,0xf0,0xfa,0x00,0x42] + +v_minimum_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_minimum_f32 v5, -src_scc, |vcc_lo| ; 
encoding: [0x05,0x02,0x3c,0xd7,0xfd,0xd4,0x00,0x22] + +v_minimum_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_minimum_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x3c,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_maximum_f32 v5, v1, v2 +// GFX13: v_maximum_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x3d,0xd7,0x01,0x05,0x02,0x02] + +v_maximum_f32 v5, v255, v255 +// GFX13: v_maximum_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x3d,0xd7,0xff,0xff,0x03,0x02] + +v_maximum_f32 v5, s1, s2 +// GFX13: v_maximum_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x3d,0xd7,0x01,0x04,0x00,0x02] + +v_maximum_f32 v5, s105, s105 +// GFX13: v_maximum_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x3d,0xd7,0x69,0xd2,0x00,0x02] + +v_maximum_f32 v5, vcc_lo, ttmp15 +// GFX13: v_maximum_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3d,0xd7,0x6a,0xf6,0x00,0x02] + +v_maximum_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_maximum_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x3d,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_maximum_f32 v5, ttmp15, src_scc +// GFX13: v_maximum_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3d,0xd7,0x7b,0xfa,0x01,0x02] + +v_maximum_f32 v5, m0, 0.5 +// GFX13: v_maximum_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3d,0xd7,0x7d,0xe0,0x01,0x02] + +v_maximum_f32 v5, exec_lo, -1 +// GFX13: v_maximum_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3d,0xd7,0x7e,0x82,0x01,0x02] + +v_maximum_f32 v5, |exec_hi|, null +// GFX13: v_maximum_f32 v5, |exec_hi|, null ; encoding: [0x05,0x01,0x3d,0xd7,0x7f,0xf8,0x00,0x02] + +v_maximum_f32 v5, null, exec_lo +// GFX13: v_maximum_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x3d,0xd7,0x7c,0xfc,0x00,0x02] + +v_maximum_f32 v5, -1, exec_hi +// GFX13: v_maximum_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x3d,0xd7,0xc1,0xfe,0x00,0x02] + +v_maximum_f32 v5, 0.5, -m0 +// GFX13: v_maximum_f32 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x3d,0xd7,0xf0,0xfa,0x00,0x42] + +v_maximum_f32 v5, -src_scc, |vcc_lo| +// GFX13: v_maximum_f32 v5, -src_scc, 
|vcc_lo| ; encoding: [0x05,0x02,0x3d,0xd7,0xfd,0xd4,0x00,0x22] + +v_maximum_f32 v255, -|0xaf123456|, -|vcc_hi| +// GFX13: v_maximum_f32 v255, -|0xaf123456|, -|vcc_hi| ; encoding: [0xff,0x03,0x3d,0xd7,0xff,0xd6,0x00,0x62,0x56,0x34,0x12,0xaf] + +v_minimum_f16 v5.l, v1.l, v2.l +// GFX13: v_minimum_f16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x3a,0xd7,0x01,0x05,0x02,0x02] + +v_minimum_f16 v5.l, v255.l, v255.l +// GFX13: v_minimum_f16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x3a,0xd7,0xff,0xff,0x03,0x02] + +v_minimum_f16 v5.l, s1, s2 +// GFX13: v_minimum_f16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x3a,0xd7,0x01,0x04,0x00,0x02] + +v_minimum_f16 v5.l, s105, s105 +// GFX13: v_minimum_f16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x3a,0xd7,0x69,0xd2,0x00,0x02] + +v_minimum_f16 v5.l, vcc_lo, ttmp15 +// GFX13: v_minimum_f16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3a,0xd7,0x6a,0xf6,0x00,0x02] + +v_minimum_f16 v5.l, vcc_hi, 0xaf12 +// GFX13: v_minimum_f16 v5.l, vcc_hi, 0xaf12 ; encoding: [0x05,0x00,0x3a,0xd7,0x6b,0xfe,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_minimum_f16 v5.l, ttmp15, src_scc +// GFX13: v_minimum_f16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x3a,0xd7,0x7b,0xfa,0x01,0x02] + +v_minimum_f16 v5.l, m0, 0.5 +// GFX13: v_minimum_f16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x3a,0xd7,0x7d,0xe0,0x01,0x02] + +v_minimum_f16 v5.l, exec_lo, -1 +// GFX13: v_minimum_f16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x3a,0xd7,0x7e,0x82,0x01,0x02] + +v_minimum_f16 v5.l, |exec_hi|, null +// GFX13: v_minimum_f16 v5.l, |exec_hi|, null ; encoding: [0x05,0x01,0x3a,0xd7,0x7f,0xf8,0x00,0x02] + +v_minimum_f16 v5.l, null, exec_lo +// GFX13: v_minimum_f16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x3a,0xd7,0x7c,0xfc,0x00,0x02] + +v_minimum_f16 v5.l, -1, exec_hi +// GFX13: v_minimum_f16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x3a,0xd7,0xc1,0xfe,0x00,0x02] + +v_minimum_f16 v5.l, 0.5, -m0 +// GFX13: v_minimum_f16 v5.l, 0.5, -m0 ; encoding: [0x05,0x00,0x3a,0xd7,0xf0,0xfa,0x00,0x42] + 
+v_minimum_f16 v5.l, -src_scc, |vcc_lo| +// GFX13: v_minimum_f16 v5.l, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x3a,0xd7,0xfd,0xd4,0x00,0x22] + +v_minimum_f16 v255.l, -|0xaf12|, -|vcc_hi| +// GFX13: v_minimum_f16 v255.l, -|0xaf12|, -|vcc_hi| ; encoding: [0xff,0x03,0x3a,0xd7,0xff,0xd6,0x00,0x62,0x12,0xaf,0x00,0x00] + +v_minimum_f16 v205.l, v201.l, v200.l +// GFX13: v_minimum_f16 v205.l, v201.l, v200.l ; encoding: [0xcd,0x00,0x3a,0xd7,0xc9,0x91,0x03,0x02] + +v_maximum_f16 v5.l, v1.l, v2.l +// GFX13: v_maximum_f16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x3b,0xd7,0x01,0x05,0x02,0x02] + +v_maximum_f16 v5.l, v255.l, v255.l +// GFX13: v_maximum_f16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x3b,0xd7,0xff,0xff,0x03,0x02] + +v_maximum_f16 v5.l, s1, s2 +// GFX13: v_maximum_f16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x3b,0xd7,0x01,0x04,0x00,0x02] + +v_maximum_f16 v5.l, s105, s105 +// GFX13: v_maximum_f16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x3b,0xd7,0x69,0xd2,0x00,0x02] + +v_maximum_f16 v5.l, vcc_lo, ttmp15 +// GFX13: v_maximum_f16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x3b,0xd7,0x6a,0xf6,0x00,0x02] + +v_maximum_f16 v5.l, vcc_hi, 0xaf12 +// GFX13: v_maximum_f16 v5.l, vcc_hi, 0xaf12 ; encoding: [0x05,0x00,0x3b,0xd7,0x6b,0xfe,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_maximum_f16 v5.l, ttmp15, src_scc +// GFX13: v_maximum_f16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x3b,0xd7,0x7b,0xfa,0x01,0x02] + +v_maximum_f16 v5.l, m0, 0.5 +// GFX13: v_maximum_f16 v5.l, m0, 0.5 ; encoding: [0x05,0x00,0x3b,0xd7,0x7d,0xe0,0x01,0x02] + +v_maximum_f16 v5.l, exec_lo, -1 +// GFX13: v_maximum_f16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x3b,0xd7,0x7e,0x82,0x01,0x02] + +v_maximum_f16 v5.l, |exec_hi|, null +// GFX13: v_maximum_f16 v5.l, |exec_hi|, null ; encoding: [0x05,0x01,0x3b,0xd7,0x7f,0xf8,0x00,0x02] + +v_maximum_f16 v5.l, null, exec_lo +// GFX13: v_maximum_f16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x3b,0xd7,0x7c,0xfc,0x00,0x02] + +v_maximum_f16 v5.l, -1, exec_hi +// GFX13: 
v_maximum_f16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x3b,0xd7,0xc1,0xfe,0x00,0x02] + +v_maximum_f16 v5.l, 0.5, -m0 +// GFX13: v_maximum_f16 v5.l, 0.5, -m0 ; encoding: [0x05,0x00,0x3b,0xd7,0xf0,0xfa,0x00,0x42] + +v_maximum_f16 v5.l, -src_scc, |vcc_lo| +// GFX13: v_maximum_f16 v5.l, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x3b,0xd7,0xfd,0xd4,0x00,0x22] + +v_maximum_f16 v255.l, -|0xaf12|, -|vcc_hi| +// GFX13: v_maximum_f16 v255.l, -|0xaf12|, -|vcc_hi| ; encoding: [0xff,0x03,0x3b,0xd7,0xff,0xd6,0x00,0x62,0x12,0xaf,0x00,0x00] + +v_maximum_f16 v205.l, v201.l, v200.l +// GFX13: v_maximum_f16 v205.l, v201.l, v200.l ; encoding: [0xcd,0x00,0x3b,0xd7,0xc9,0x91,0x03,0x02] + +v_minimum_f64 v[5:6], v[1:2], v[3:4] +// GFX13: v_minimum_f64 v[5:6], v[1:2], v[3:4] ; encoding: [0x05,0x00,0x3e,0xd7,0x01,0x07,0x02,0x02] + +v_minimum_f64 v[5:6], v[254:255], v[254:255] +// GFX13: v_minimum_f64 v[5:6], v[254:255], v[254:255] ; encoding: [0x05,0x00,0x3e,0xd7,0xfe,0xfd,0x03,0x02] + +v_minimum_f64 v[5:6], s[6:7], s[4:5] +// GFX13: v_minimum_f64 v[5:6], s[6:7], s[4:5] ; encoding: [0x05,0x00,0x3e,0xd7,0x06,0x08,0x00,0x02] + +v_minimum_f64 v[5:6], s[104:105], s[104:105] +// GFX13: v_minimum_f64 v[5:6], s[104:105], s[104:105] ; encoding: [0x05,0x00,0x3e,0xd7,0x68,0xd0,0x00,0x02] + +v_minimum_f64 v[5:6], vcc, ttmp[14:15] +// GFX13: v_minimum_f64 v[5:6], vcc, ttmp[14:15] ; encoding: [0x05,0x00,0x3e,0xd7,0x6a,0xf4,0x00,0x02] + +v_minimum_f64 v[5:6], vcc, 0xaf121234 +// GFX13: v_minimum_f64 v[5:6], vcc, 0xaf121234 ; encoding: [0x05,0x00,0x3e,0xd7,0x6a,0xfe,0x01,0x02,0x34,0x12,0x12,0xaf] + +v_minimum_f64 v[5:6], ttmp[14:15], src_scc +// GFX13: v_minimum_f64 v[5:6], ttmp[14:15], src_scc ; encoding: [0x05,0x00,0x3e,0xd7,0x7a,0xfa,0x01,0x02] + +v_minimum_f64 v[5:6], vcc, 0.5 +// GFX13: v_minimum_f64 v[5:6], vcc, 0.5 ; encoding: [0x05,0x00,0x3e,0xd7,0x6a,0xe0,0x01,0x02] + +v_minimum_f64 v[5:6], exec, -1 +// GFX13: v_minimum_f64 v[5:6], exec, -1 ; encoding: [0x05,0x00,0x3e,0xd7,0x7e,0x82,0x01,0x02] + 
+v_minimum_f64 v[5:6], |exec|, null +// GFX13: v_minimum_f64 v[5:6], |exec|, null ; encoding: [0x05,0x01,0x3e,0xd7,0x7e,0xf8,0x00,0x02] + +v_minimum_f64 v[5:6], null, exec +// GFX13: v_minimum_f64 v[5:6], null, exec ; encoding: [0x05,0x00,0x3e,0xd7,0x7c,0xfc,0x00,0x02] + +v_minimum_f64 v[5:6], -1, exec +// GFX13: v_minimum_f64 v[5:6], -1, exec ; encoding: [0x05,0x00,0x3e,0xd7,0xc1,0xfc,0x00,0x02] + +v_minimum_f64 v[5:6], 0.5, -vcc +// GFX13: v_minimum_f64 v[5:6], 0.5, -vcc ; encoding: [0x05,0x00,0x3e,0xd7,0xf0,0xd4,0x00,0x42] + +v_minimum_f64 v[5:6], -src_scc, |vcc| +// GFX13: v_minimum_f64 v[5:6], -src_scc, |vcc| ; encoding: [0x05,0x02,0x3e,0xd7,0xfd,0xd4,0x00,0x22] + +v_minimum_f64 v[254:255], -|2|, -|vcc| +// GFX13: v_minimum_f64 v[254:255], -|2|, -|vcc| ; encoding: [0xfe,0x03,0x3e,0xd7,0x82,0xd4,0x00,0x62] + +v_maximum_f64 v[5:6], v[1:2], v[3:4] +// GFX13: v_maximum_f64 v[5:6], v[1:2], v[3:4] ; encoding: [0x05,0x00,0x3f,0xd7,0x01,0x07,0x02,0x02] + +v_maximum_f64 v[5:6], v[254:255], v[254:255] +// GFX13: v_maximum_f64 v[5:6], v[254:255], v[254:255] ; encoding: [0x05,0x00,0x3f,0xd7,0xfe,0xfd,0x03,0x02] + +v_maximum_f64 v[5:6], s[6:7], s[4:5] +// GFX13: v_maximum_f64 v[5:6], s[6:7], s[4:5] ; encoding: [0x05,0x00,0x3f,0xd7,0x06,0x08,0x00,0x02] + +v_maximum_f64 v[5:6], s[104:105], s[104:105] +// GFX13: v_maximum_f64 v[5:6], s[104:105], s[104:105] ; encoding: [0x05,0x00,0x3f,0xd7,0x68,0xd0,0x00,0x02] + +v_maximum_f64 v[5:6], vcc, ttmp[14:15] +// GFX13: v_maximum_f64 v[5:6], vcc, ttmp[14:15] ; encoding: [0x05,0x00,0x3f,0xd7,0x6a,0xf4,0x00,0x02] + +v_maximum_f64 v[5:6], vcc, 0xaf121234 +// GFX13: v_maximum_f64 v[5:6], vcc, 0xaf121234 ; encoding: [0x05,0x00,0x3f,0xd7,0x6a,0xfe,0x01,0x02,0x34,0x12,0x12,0xaf] + +v_maximum_f64 v[5:6], ttmp[14:15], src_scc +// GFX13: v_maximum_f64 v[5:6], ttmp[14:15], src_scc ; encoding: [0x05,0x00,0x3f,0xd7,0x7a,0xfa,0x01,0x02] + +v_maximum_f64 v[5:6], vcc, 0.5 +// GFX13: v_maximum_f64 v[5:6], vcc, 0.5 ; encoding: 
[0x05,0x00,0x3f,0xd7,0x6a,0xe0,0x01,0x02] + +v_maximum_f64 v[5:6], exec, -1 +// GFX13: v_maximum_f64 v[5:6], exec, -1 ; encoding: [0x05,0x00,0x3f,0xd7,0x7e,0x82,0x01,0x02] + +v_maximum_f64 v[5:6], |exec|, null +// GFX13: v_maximum_f64 v[5:6], |exec|, null ; encoding: [0x05,0x01,0x3f,0xd7,0x7e,0xf8,0x00,0x02] + +v_maximum_f64 v[5:6], null, exec +// GFX13: v_maximum_f64 v[5:6], null, exec ; encoding: [0x05,0x00,0x3f,0xd7,0x7c,0xfc,0x00,0x02] + +v_maximum_f64 v[5:6], -1, exec +// GFX13: v_maximum_f64 v[5:6], -1, exec ; encoding: [0x05,0x00,0x3f,0xd7,0xc1,0xfc,0x00,0x02] + +v_maximum_f64 v[5:6], 0.5, -vcc +// GFX13: v_maximum_f64 v[5:6], 0.5, -vcc ; encoding: [0x05,0x00,0x3f,0xd7,0xf0,0xd4,0x00,0x42] + +v_maximum_f64 v[5:6], -src_scc, |vcc| +// GFX13: v_maximum_f64 v[5:6], -src_scc, |vcc| ; encoding: [0x05,0x02,0x3f,0xd7,0xfd,0xd4,0x00,0x22] + +v_maximum_f64 v[254:255], -|2|, -|vcc| +// GFX13: v_maximum_f64 v[254:255], -|2|, -|vcc| ; encoding: [0xfe,0x03,0x3f,0xd7,0x82,0xd4,0x00,0x62] + +v_minimum3_f32 v5, v1, v2, s3 +// GFX13: v_minimum3_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2d,0xd6,0x01,0x05,0x0e,0x00] + +v_minimum3_f32 v5, v255, s2, s105 +// GFX13: v_minimum3_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2d,0xd6,0xff,0x05,0xa4,0x01] + +v_minimum3_f32 v5, s1, v255, exec_hi +// GFX13: v_minimum3_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2d,0xd6,0x01,0xfe,0xff,0x01] + +v_minimum3_f32 v5, s105, s105, exec_lo +// GFX13: v_minimum3_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2d,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimum3_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minimum3_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x2d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimum3_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minimum3_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x2d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minimum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: 
[0x05,0x07,0x2d,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimum3_f32 v5, m0, 0.5, m0 +// GFX13: v_minimum3_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimum3_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minimum3_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2d,0xd6,0x7e,0x82,0xad,0x01] + +v_minimum3_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimum3_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2d,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimum3_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_minimum3_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x2d,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_minimum3_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimum3_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2d,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimum3_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minimum3_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x2d,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minimum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minimum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x2d,0xd6,0xfd,0xd4,0x04,0x33] + +v_minimum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minimum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x2d,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_maximum3_f32 v5, v1, v2, s3 +// GFX13: v_maximum3_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x2e,0xd6,0x01,0x05,0x0e,0x00] + +v_maximum3_f32 v5, v255, s2, s105 +// GFX13: v_maximum3_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x2e,0xd6,0xff,0x05,0xa4,0x01] + +v_maximum3_f32 v5, s1, v255, exec_hi +// GFX13: v_maximum3_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x2e,0xd6,0x01,0xfe,0xff,0x01] + +v_maximum3_f32 v5, s105, s105, exec_lo +// GFX13: v_maximum3_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2e,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximum3_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maximum3_f32 v5, vcc_lo, ttmp15, v3 ; 
encoding: [0x05,0x00,0x2e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximum3_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maximum3_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x2e,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maximum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximum3_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2e,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximum3_f32 v5, m0, 0.5, m0 +// GFX13: v_maximum3_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximum3_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maximum3_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2e,0xd6,0x7e,0x82,0xad,0x01] + +v_maximum3_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximum3_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2e,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximum3_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_maximum3_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x2e,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_maximum3_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximum3_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2e,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximum3_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maximum3_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x2e,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maximum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maximum3_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x2e,0xd6,0xfd,0xd4,0x04,0x33] + +v_maximum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maximum3_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x2e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_minimum3_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_minimum3_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x2f,0xd6,0x01,0x05,0x0e,0x00] + +v_minimum3_f16 v5.l, v255.l, s2, s105 +// GFX13: v_minimum3_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x2f,0xd6,0xff,0x05,0xa4,0x01] + 
+v_minimum3_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_minimum3_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2f,0xd6,0x01,0xfe,0xff,0x01] + +v_minimum3_f16 v5.l, s105, s105, exec_lo +// GFX13: v_minimum3_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2f,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimum3_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_minimum3_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2f,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimum3_f16 v5.l, vcc_hi, 0xaf12, v255.l +// GFX13: v_minimum3_f16 v5.l, vcc_hi, 0xaf12, v255.l ; encoding: [0x05,0x00,0x2f,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_minimum3_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimum3_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2f,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimum3_f16 v5.l, m0, 0.5, m0 +// GFX13: v_minimum3_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x2f,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimum3_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_minimum3_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2f,0xd6,0x7e,0x82,0xad,0x01] + +v_minimum3_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimum3_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2f,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimum3_f16 v5.l, null, exec_lo, -|0xaf12| +// GFX13: v_minimum3_f16 v5.l, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x2f,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_minimum3_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimum3_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x2f,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimum3_f16 v5.l, 0.5, -m0, 0.5 +// GFX13: v_minimum3_f16 v5.l, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x2f,0xd6,0xf0,0xfa,0xc0,0x43] + +v_minimum3_f16 v5.l, -src_scc, |vcc_lo|, -1 +// GFX13: v_minimum3_f16 v5.l, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x2f,0xd6,0xfd,0xd4,0x04,0x23] + +v_minimum3_f16 v255.l, -|0xaf12|, -|vcc_hi|, null clamp +// GFX13: v_minimum3_f16 v255.l, -|0xaf12|, 
-|vcc_hi|, null clamp ; encoding: [0xff,0x83,0x2f,0xd6,0xff,0xd6,0xf0,0x61,0x12,0xaf,0x00,0x00] + +v_maximum3_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_maximum3_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x30,0xd6,0x01,0x05,0x0e,0x00] + +v_maximum3_f16 v5.l, v255.l, s2, s105 +// GFX13: v_maximum3_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x30,0xd6,0xff,0x05,0xa4,0x01] + +v_maximum3_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_maximum3_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x30,0xd6,0x01,0xfe,0xff,0x01] + +v_maximum3_f16 v5.l, s105, s105, exec_lo +// GFX13: v_maximum3_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x30,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximum3_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_maximum3_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x30,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximum3_f16 v5.l, vcc_hi, 0xaf12, v255.l +// GFX13: v_maximum3_f16 v5.l, vcc_hi, 0xaf12, v255.l ; encoding: [0x05,0x00,0x30,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_maximum3_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximum3_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x30,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximum3_f16 v5.l, m0, 0.5, m0 +// GFX13: v_maximum3_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x30,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximum3_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_maximum3_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x30,0xd6,0x7e,0x82,0xad,0x01] + +v_maximum3_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximum3_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x30,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximum3_f16 v5.l, null, exec_lo, -|0xaf12| +// GFX13: v_maximum3_f16 v5.l, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x30,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_maximum3_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximum3_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x30,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximum3_f16 
v5.l, 0.5, -m0, 0.5 +// GFX13: v_maximum3_f16 v5.l, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x30,0xd6,0xf0,0xfa,0xc0,0x43] + +v_maximum3_f16 v5.l, -src_scc, |vcc_lo|, -1 +// GFX13: v_maximum3_f16 v5.l, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x30,0xd6,0xfd,0xd4,0x04,0x23] + +v_maximumminimum_f32 v5, v1, v2, s3 +// GFX13: v_maximumminimum_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6d,0xd6,0x01,0x05,0x0e,0x00] + +v_maximumminimum_f32 v5, v255, s2, s105 +// GFX13: v_maximumminimum_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6d,0xd6,0xff,0x05,0xa4,0x01] + +v_maximumminimum_f32 v5, s1, v255, exec_hi +// GFX13: v_maximumminimum_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6d,0xd6,0x01,0xfe,0xff,0x01] + +v_maximumminimum_f32 v5, s105, s105, exec_lo +// GFX13: v_maximumminimum_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6d,0xd6,0x69,0xd2,0xf8,0x01] + +v_maximumminimum_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_maximumminimum_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6d,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximumminimum_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_maximumminimum_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6d,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_maximumminimum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximumminimum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6d,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximumminimum_f32 v5, m0, 0.5, m0 +// GFX13: v_maximumminimum_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6d,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximumminimum_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_maximumminimum_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6d,0xd6,0x7e,0x82,0xad,0x01] + +v_maximumminimum_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximumminimum_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6d,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximumminimum_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_maximumminimum_f32 v5, null, exec_lo, 
-|0xaf123456| ; encoding: [0x05,0x04,0x6d,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_maximumminimum_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximumminimum_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6d,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximumminimum_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_maximumminimum_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6d,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_maximumminimum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_maximumminimum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6d,0xd6,0xfd,0xd4,0x04,0x33] + +v_maximumminimum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_maximumminimum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6d,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_minimummaximum_f32 v5, v1, v2, s3 +// GFX13: v_minimummaximum_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6c,0xd6,0x01,0x05,0x0e,0x00] + +v_minimummaximum_f32 v5, v255, s2, s105 +// GFX13: v_minimummaximum_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6c,0xd6,0xff,0x05,0xa4,0x01] + +v_minimummaximum_f32 v5, s1, v255, exec_hi +// GFX13: v_minimummaximum_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6c,0xd6,0x01,0xfe,0xff,0x01] + +v_minimummaximum_f32 v5, s105, s105, exec_lo +// GFX13: v_minimummaximum_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6c,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimummaximum_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_minimummaximum_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6c,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimummaximum_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_minimummaximum_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6c,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_minimummaximum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimummaximum_f32 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6c,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimummaximum_f32 v5, m0, 0.5, m0 +// 
GFX13: v_minimummaximum_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6c,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimummaximum_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_minimummaximum_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6c,0xd6,0x7e,0x82,0xad,0x01] + +v_minimummaximum_f32 v5, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimummaximum_f32 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6c,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimummaximum_f32 v5, null, exec_lo, -|0xaf123456| +// GFX13: v_minimummaximum_f32 v5, null, exec_lo, -|0xaf123456| ; encoding: [0x05,0x04,0x6c,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] + +v_minimummaximum_f32 v5, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimummaximum_f32 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6c,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimummaximum_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_minimummaximum_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6c,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_minimummaximum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_minimummaximum_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6c,0xd6,0xfd,0xd4,0x04,0x33] + +v_minimummaximum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_minimummaximum_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6c,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, s3 +// GFX13: v_maximumminimum_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6f,0xd6,0x01,0x05,0x0e,0x00] + +v_maximumminimum_f16 v5.l, v255.l, s2, s105 +// GFX13: v_maximumminimum_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6f,0xd6,0xff,0x05,0xa4,0x01] + +v_maximumminimum_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_maximumminimum_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6f,0xd6,0x01,0xfe,0xff,0x01] + +v_maximumminimum_f16 v5.l, s105, s105, exec_lo +// GFX13: v_maximumminimum_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6f,0xd6,0x69,0xd2,0xf8,0x01] + 
+v_maximumminimum_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_maximumminimum_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6f,0xd6,0x6a,0xf6,0x0c,0x04] + +v_maximumminimum_f16 v5.l, vcc_hi, 0xaf12, v255.l +// GFX13: v_maximumminimum_f16 v5.l, vcc_hi, 0xaf12, v255.l ; encoding: [0x05,0x00,0x6f,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_maximumminimum_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_maximumminimum_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6f,0xd6,0x7b,0xfa,0xed,0xe1] + +v_maximumminimum_f16 v5.l, m0, 0.5, m0 +// GFX13: v_maximumminimum_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6f,0xd6,0x7d,0xe0,0xf5,0x01] + +v_maximumminimum_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_maximumminimum_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6f,0xd6,0x7e,0x82,0xad,0x01] + +v_maximumminimum_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_maximumminimum_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6f,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_maximumminimum_f16 v5.l, null, exec_lo, -|0xaf12| +// GFX13: v_maximumminimum_f16 v5.l, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x6f,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_maximumminimum_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX13: v_maximumminimum_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6f,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_maximumminimum_f16 v5.l, 0.5, -m0, 0.5 +// GFX13: v_maximumminimum_f16 v5.l, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x6f,0xd6,0xf0,0xfa,0xc0,0x43] + +v_maximumminimum_f16 v5.l, -src_scc, |vcc_lo|, -1 +// GFX13: v_maximumminimum_f16 v5.l, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x6f,0xd6,0xfd,0xd4,0x04,0x23] + +v_maximumminimum_f16 v255.l, -|0xaf12|, -|vcc_hi|, null clamp +// GFX13: v_maximumminimum_f16 v255.l, -|0xaf12|, -|vcc_hi|, null clamp ; encoding: [0xff,0x83,0x6f,0xd6,0xff,0xd6,0xf0,0x61,0x12,0xaf,0x00,0x00] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, s3 +// GFX13: 
v_minimummaximum_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6e,0xd6,0x01,0x05,0x0e,0x00] + +v_minimummaximum_f16 v5.l, v255.l, s2, s105 +// GFX13: v_minimummaximum_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6e,0xd6,0xff,0x05,0xa4,0x01] + +v_minimummaximum_f16 v5.l, s1, v255.l, exec_hi +// GFX13: v_minimummaximum_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6e,0xd6,0x01,0xfe,0xff,0x01] + +v_minimummaximum_f16 v5.l, s105, s105, exec_lo +// GFX13: v_minimummaximum_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6e,0xd6,0x69,0xd2,0xf8,0x01] + +v_minimummaximum_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX13: v_minimummaximum_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6e,0xd6,0x6a,0xf6,0x0c,0x04] + +v_minimummaximum_f16 v5.l, vcc_hi, 0xaf12, v255.l +// GFX13: v_minimummaximum_f16 v5.l, vcc_hi, 0xaf12, v255.l ; encoding: [0x05,0x00,0x6e,0xd6,0x6b,0xfe,0xfd,0x07,0x12,0xaf,0x00,0x00] + +v_minimummaximum_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX13: v_minimummaximum_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6e,0xd6,0x7b,0xfa,0xed,0xe1] + +v_minimummaximum_f16 v5.l, m0, 0.5, m0 +// GFX13: v_minimummaximum_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6e,0xd6,0x7d,0xe0,0xf5,0x01] + +v_minimummaximum_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX13: v_minimummaximum_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6e,0xd6,0x7e,0x82,0xad,0x01] + +v_minimummaximum_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX13: v_minimummaximum_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6e,0xd6,0x7f,0xf8,0xa8,0xa1] + +v_minimummaximum_f16 v5.l, null, exec_lo, -|0xaf12| +// GFX13: v_minimummaximum_f16 v5.l, null, exec_lo, -|0xaf12| ; encoding: [0x05,0x04,0x6e,0xd6,0x7c,0xfc,0xfc,0x83,0x12,0xaf,0x00,0x00] + +v_minimummaximum_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX13: v_minimummaximum_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6e,0xd6,0xc1,0xfe,0xf4,0xc3] + +v_minimummaximum_f16 
v5.l, 0.5, -m0, 0.5 +// GFX13: v_minimummaximum_f16 v5.l, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x6e,0xd6,0xf0,0xfa,0xc0,0x43] + +v_minimummaximum_f16 v5.l, -src_scc, |vcc_lo|, -1 +// GFX13: v_minimummaximum_f16 v5.l, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x6e,0xd6,0xfd,0xd4,0x04,0x23] + +v_s_exp_f32 s5, s1 +// GFX13: v_s_exp_f32 s5, s1 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f32 s5, s105 +// GFX13: v_s_exp_f32 s5, s105 ; encoding: [0x05,0x00,0x80,0xd6,0x69,0x00,0x01,0x02] + +v_s_exp_f32 s5, vcc_lo +// GFX13: v_s_exp_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x80,0xd6,0x6a,0x00,0x01,0x02] + +v_s_exp_f32 s5, vcc_hi +// GFX13: v_s_exp_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x80,0xd6,0x6b,0x00,0x01,0x02] + +v_s_exp_f32 s5, ttmp15 +// GFX13: v_s_exp_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x80,0xd6,0x7b,0x00,0x01,0x02] + +v_s_exp_f32 s5, m0 +// GFX13: v_s_exp_f32 s5, m0 ; encoding: [0x05,0x00,0x80,0xd6,0x7d,0x00,0x01,0x02] + +v_s_exp_f32 s5, exec_lo +// GFX13: v_s_exp_f32 s5, exec_lo ; encoding: [0x05,0x00,0x80,0xd6,0x7e,0x00,0x01,0x02] + +v_s_exp_f32 s5, exec_hi +// GFX13: v_s_exp_f32 s5, exec_hi ; encoding: [0x05,0x00,0x80,0xd6,0x7f,0x00,0x01,0x02] + +v_s_exp_f32 s5, null +// GFX13: v_s_exp_f32 s5, null ; encoding: [0x05,0x00,0x80,0xd6,0x7c,0x00,0x01,0x02] + +v_s_exp_f32 s5, -1 +// GFX13: v_s_exp_f32 s5, -1 ; encoding: [0x05,0x00,0x80,0xd6,0xc1,0x00,0x01,0x02] + +v_s_exp_f32 s5, 0.5 +// GFX13: v_s_exp_f32 s5, 0.5 ; encoding: [0x05,0x00,0x80,0xd6,0xf0,0x00,0x01,0x02] + +v_s_exp_f32 s5, src_scc +// GFX13: v_s_exp_f32 s5, src_scc ; encoding: [0x05,0x00,0x80,0xd6,0xfd,0x00,0x01,0x02] + +v_s_exp_f32 s105, 0xaf123456 +// GFX13: v_s_exp_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x80,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_exp_f32 s5, -s1 +// GFX13: v_s_exp_f32 s5, -s1 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x22] + +v_s_exp_f32 s5, |s1| +// GFX13: v_s_exp_f32 s5, |s1| ; encoding: [0x05,0x01,0x80,0xd6,0x01,0x00,0x01,0x02] + 
+v_s_exp_f32 s5, s1 clamp +// GFX13: v_s_exp_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x80,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f32 s5, s1 mul:2 +// GFX13: v_s_exp_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x0a] + +v_s_exp_f32 s5, s1 mul:4 +// GFX13: v_s_exp_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x12] + +v_s_exp_f32 s5, s1 div:2 +// GFX13: v_s_exp_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x01,0x1a] + +v_s_exp_f16 s5, s1 +// GFX13: v_s_exp_f16 s5, s1 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f16 s5, s105 +// GFX13: v_s_exp_f16 s5, s105 ; encoding: [0x05,0x00,0x81,0xd6,0x69,0x00,0x01,0x02] + +v_s_exp_f16 s5, vcc_lo +// GFX13: v_s_exp_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd6,0x6a,0x00,0x01,0x02] + +v_s_exp_f16 s5, vcc_hi +// GFX13: v_s_exp_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd6,0x6b,0x00,0x01,0x02] + +v_s_exp_f16 s5, ttmp15 +// GFX13: v_s_exp_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd6,0x7b,0x00,0x01,0x02] + +v_s_exp_f16 s5, m0 +// GFX13: v_s_exp_f16 s5, m0 ; encoding: [0x05,0x00,0x81,0xd6,0x7d,0x00,0x01,0x02] + +v_s_exp_f16 s5, exec_lo +// GFX13: v_s_exp_f16 s5, exec_lo ; encoding: [0x05,0x00,0x81,0xd6,0x7e,0x00,0x01,0x02] + +v_s_exp_f16 s5, exec_hi +// GFX13: v_s_exp_f16 s5, exec_hi ; encoding: [0x05,0x00,0x81,0xd6,0x7f,0x00,0x01,0x02] + +v_s_exp_f16 s5, null +// GFX13: v_s_exp_f16 s5, null ; encoding: [0x05,0x00,0x81,0xd6,0x7c,0x00,0x01,0x02] + +v_s_exp_f16 s5, -1 +// GFX13: v_s_exp_f16 s5, -1 ; encoding: [0x05,0x00,0x81,0xd6,0xc1,0x00,0x01,0x02] + +v_s_exp_f16 s5, 0.5 +// GFX13: v_s_exp_f16 s5, 0.5 ; encoding: [0x05,0x00,0x81,0xd6,0xf0,0x00,0x01,0x02] + +v_s_exp_f16 s5, src_scc +// GFX13: v_s_exp_f16 s5, src_scc ; encoding: [0x05,0x00,0x81,0xd6,0xfd,0x00,0x01,0x02] + +v_s_exp_f16 s105, 0xaf12 +// GFX13: v_s_exp_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x81,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_exp_f16 s5, -s1 +// GFX13: v_s_exp_f16 s5, -s1 ; 
encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x22] + +v_s_exp_f16 s5, |s1| +// GFX13: v_s_exp_f16 s5, |s1| ; encoding: [0x05,0x01,0x81,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f16 s5, s1 clamp +// GFX13: v_s_exp_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x81,0xd6,0x01,0x00,0x01,0x02] + +v_s_exp_f16 s5, s1 mul:2 +// GFX13: v_s_exp_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x0a] + +v_s_exp_f16 s5, s1 mul:4 +// GFX13: v_s_exp_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x12] + +v_s_exp_f16 s5, s1 div:2 +// GFX13: v_s_exp_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x01,0x1a] + +v_s_log_f32 s5, s1 +// GFX13: v_s_log_f32 s5, s1 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f32 s5, s105 +// GFX13: v_s_log_f32 s5, s105 ; encoding: [0x05,0x00,0x82,0xd6,0x69,0x00,0x01,0x02] + +v_s_log_f32 s5, vcc_lo +// GFX13: v_s_log_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x82,0xd6,0x6a,0x00,0x01,0x02] + +v_s_log_f32 s5, vcc_hi +// GFX13: v_s_log_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x82,0xd6,0x6b,0x00,0x01,0x02] + +v_s_log_f32 s5, ttmp15 +// GFX13: v_s_log_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x82,0xd6,0x7b,0x00,0x01,0x02] + +v_s_log_f32 s5, m0 +// GFX13: v_s_log_f32 s5, m0 ; encoding: [0x05,0x00,0x82,0xd6,0x7d,0x00,0x01,0x02] + +v_s_log_f32 s5, exec_lo +// GFX13: v_s_log_f32 s5, exec_lo ; encoding: [0x05,0x00,0x82,0xd6,0x7e,0x00,0x01,0x02] + +v_s_log_f32 s5, exec_hi +// GFX13: v_s_log_f32 s5, exec_hi ; encoding: [0x05,0x00,0x82,0xd6,0x7f,0x00,0x01,0x02] + +v_s_log_f32 s5, null +// GFX13: v_s_log_f32 s5, null ; encoding: [0x05,0x00,0x82,0xd6,0x7c,0x00,0x01,0x02] + +v_s_log_f32 s5, -1 +// GFX13: v_s_log_f32 s5, -1 ; encoding: [0x05,0x00,0x82,0xd6,0xc1,0x00,0x01,0x02] + +v_s_log_f32 s5, 0.5 +// GFX13: v_s_log_f32 s5, 0.5 ; encoding: [0x05,0x00,0x82,0xd6,0xf0,0x00,0x01,0x02] + +v_s_log_f32 s5, src_scc +// GFX13: v_s_log_f32 s5, src_scc ; encoding: [0x05,0x00,0x82,0xd6,0xfd,0x00,0x01,0x02] + +v_s_log_f32 s105, 
0xaf123456 +// GFX13: v_s_log_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x82,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_log_f32 s5, -s1 +// GFX13: v_s_log_f32 s5, -s1 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x22] + +v_s_log_f32 s5, |s1| +// GFX13: v_s_log_f32 s5, |s1| ; encoding: [0x05,0x01,0x82,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f32 s5, s1 clamp +// GFX13: v_s_log_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x82,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f32 s5, s1 mul:2 +// GFX13: v_s_log_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x0a] + +v_s_log_f32 s5, s1 mul:4 +// GFX13: v_s_log_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x12] + +v_s_log_f32 s5, s1 div:2 +// GFX13: v_s_log_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x01,0x1a] + +v_s_log_f16 s5, s1 +// GFX13: v_s_log_f16 s5, s1 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f16 s5, s105 +// GFX13: v_s_log_f16 s5, s105 ; encoding: [0x05,0x00,0x83,0xd6,0x69,0x00,0x01,0x02] + +v_s_log_f16 s5, vcc_lo +// GFX13: v_s_log_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x83,0xd6,0x6a,0x00,0x01,0x02] + +v_s_log_f16 s5, vcc_hi +// GFX13: v_s_log_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x83,0xd6,0x6b,0x00,0x01,0x02] + +v_s_log_f16 s5, ttmp15 +// GFX13: v_s_log_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x83,0xd6,0x7b,0x00,0x01,0x02] + +v_s_log_f16 s5, m0 +// GFX13: v_s_log_f16 s5, m0 ; encoding: [0x05,0x00,0x83,0xd6,0x7d,0x00,0x01,0x02] + +v_s_log_f16 s5, exec_lo +// GFX13: v_s_log_f16 s5, exec_lo ; encoding: [0x05,0x00,0x83,0xd6,0x7e,0x00,0x01,0x02] + +v_s_log_f16 s5, exec_hi +// GFX13: v_s_log_f16 s5, exec_hi ; encoding: [0x05,0x00,0x83,0xd6,0x7f,0x00,0x01,0x02] + +v_s_log_f16 s5, null +// GFX13: v_s_log_f16 s5, null ; encoding: [0x05,0x00,0x83,0xd6,0x7c,0x00,0x01,0x02] + +v_s_log_f16 s5, -1 +// GFX13: v_s_log_f16 s5, -1 ; encoding: [0x05,0x00,0x83,0xd6,0xc1,0x00,0x01,0x02] + +v_s_log_f16 s5, 0.5 +// GFX13: v_s_log_f16 s5, 0.5 ; encoding: 
[0x05,0x00,0x83,0xd6,0xf0,0x00,0x01,0x02] + +v_s_log_f16 s5, src_scc +// GFX13: v_s_log_f16 s5, src_scc ; encoding: [0x05,0x00,0x83,0xd6,0xfd,0x00,0x01,0x02] + +v_s_log_f16 s105, 0xaf12 +// GFX13: v_s_log_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x83,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_log_f16 s5, -s1 +// GFX13: v_s_log_f16 s5, -s1 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x22] + +v_s_log_f16 s5, |s1| +// GFX13: v_s_log_f16 s5, |s1| ; encoding: [0x05,0x01,0x83,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f16 s5, s1 clamp +// GFX13: v_s_log_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x83,0xd6,0x01,0x00,0x01,0x02] + +v_s_log_f16 s5, s1 mul:2 +// GFX13: v_s_log_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x0a] + +v_s_log_f16 s5, s1 mul:4 +// GFX13: v_s_log_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x12] + +v_s_log_f16 s5, s1 div:2 +// GFX13: v_s_log_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rcp_f32 s5, s1 +// GFX13: v_s_rcp_f32 s5, s1 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f32 s5, s105 +// GFX13: v_s_rcp_f32 s5, s105 ; encoding: [0x05,0x00,0x84,0xd6,0x69,0x00,0x01,0x02] + +v_s_rcp_f32 s5, vcc_lo +// GFX13: v_s_rcp_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x84,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rcp_f32 s5, vcc_hi +// GFX13: v_s_rcp_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x84,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rcp_f32 s5, ttmp15 +// GFX13: v_s_rcp_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x84,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rcp_f32 s5, m0 +// GFX13: v_s_rcp_f32 s5, m0 ; encoding: [0x05,0x00,0x84,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rcp_f32 s5, exec_lo +// GFX13: v_s_rcp_f32 s5, exec_lo ; encoding: [0x05,0x00,0x84,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rcp_f32 s5, exec_hi +// GFX13: v_s_rcp_f32 s5, exec_hi ; encoding: [0x05,0x00,0x84,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rcp_f32 s5, null +// GFX13: v_s_rcp_f32 s5, null ; encoding: [0x05,0x00,0x84,0xd6,0x7c,0x00,0x01,0x02] + 
+v_s_rcp_f32 s5, -1 +// GFX13: v_s_rcp_f32 s5, -1 ; encoding: [0x05,0x00,0x84,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rcp_f32 s5, 0.5 +// GFX13: v_s_rcp_f32 s5, 0.5 ; encoding: [0x05,0x00,0x84,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rcp_f32 s5, src_scc +// GFX13: v_s_rcp_f32 s5, src_scc ; encoding: [0x05,0x00,0x84,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rcp_f32 s105, 0xaf123456 +// GFX13: v_s_rcp_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x84,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_rcp_f32 s5, -s1 +// GFX13: v_s_rcp_f32 s5, -s1 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x22] + +v_s_rcp_f32 s5, |s1| +// GFX13: v_s_rcp_f32 s5, |s1| ; encoding: [0x05,0x01,0x84,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f32 s5, s1 clamp +// GFX13: v_s_rcp_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x84,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f32 s5, s1 mul:2 +// GFX13: v_s_rcp_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rcp_f32 s5, s1 mul:4 +// GFX13: v_s_rcp_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x12] + +v_s_rcp_f32 s5, s1 div:2 +// GFX13: v_s_rcp_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rcp_f16 s5, s1 +// GFX13: v_s_rcp_f16 s5, s1 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f16 s5, s105 +// GFX13: v_s_rcp_f16 s5, s105 ; encoding: [0x05,0x00,0x85,0xd6,0x69,0x00,0x01,0x02] + +v_s_rcp_f16 s5, vcc_lo +// GFX13: v_s_rcp_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rcp_f16 s5, vcc_hi +// GFX13: v_s_rcp_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rcp_f16 s5, ttmp15 +// GFX13: v_s_rcp_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rcp_f16 s5, m0 +// GFX13: v_s_rcp_f16 s5, m0 ; encoding: [0x05,0x00,0x85,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rcp_f16 s5, exec_lo +// GFX13: v_s_rcp_f16 s5, exec_lo ; encoding: [0x05,0x00,0x85,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rcp_f16 s5, exec_hi +// GFX13: v_s_rcp_f16 s5, 
exec_hi ; encoding: [0x05,0x00,0x85,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rcp_f16 s5, null +// GFX13: v_s_rcp_f16 s5, null ; encoding: [0x05,0x00,0x85,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rcp_f16 s5, -1 +// GFX13: v_s_rcp_f16 s5, -1 ; encoding: [0x05,0x00,0x85,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rcp_f16 s5, 0.5 +// GFX13: v_s_rcp_f16 s5, 0.5 ; encoding: [0x05,0x00,0x85,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rcp_f16 s5, src_scc +// GFX13: v_s_rcp_f16 s5, src_scc ; encoding: [0x05,0x00,0x85,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rcp_f16 s105, 0xaf12 +// GFX13: v_s_rcp_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x85,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_rcp_f16 s5, -s1 +// GFX13: v_s_rcp_f16 s5, -s1 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x22] + +v_s_rcp_f16 s5, |s1| +// GFX13: v_s_rcp_f16 s5, |s1| ; encoding: [0x05,0x01,0x85,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f16 s5, s1 clamp +// GFX13: v_s_rcp_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x85,0xd6,0x01,0x00,0x01,0x02] + +v_s_rcp_f16 s5, s1 mul:2 +// GFX13: v_s_rcp_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rcp_f16 s5, s1 mul:4 +// GFX13: v_s_rcp_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x12] + +v_s_rcp_f16 s5, s1 div:2 +// GFX13: v_s_rcp_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rsq_f32 s5, s1 +// GFX13: v_s_rsq_f32 s5, s1 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f32 s5, s105 +// GFX13: v_s_rsq_f32 s5, s105 ; encoding: [0x05,0x00,0x86,0xd6,0x69,0x00,0x01,0x02] + +v_s_rsq_f32 s5, vcc_lo +// GFX13: v_s_rsq_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rsq_f32 s5, vcc_hi +// GFX13: v_s_rsq_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rsq_f32 s5, ttmp15 +// GFX13: v_s_rsq_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x86,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rsq_f32 s5, m0 +// GFX13: v_s_rsq_f32 s5, m0 ; encoding: [0x05,0x00,0x86,0xd6,0x7d,0x00,0x01,0x02] + 
+v_s_rsq_f32 s5, exec_lo +// GFX13: v_s_rsq_f32 s5, exec_lo ; encoding: [0x05,0x00,0x86,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rsq_f32 s5, exec_hi +// GFX13: v_s_rsq_f32 s5, exec_hi ; encoding: [0x05,0x00,0x86,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rsq_f32 s5, null +// GFX13: v_s_rsq_f32 s5, null ; encoding: [0x05,0x00,0x86,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rsq_f32 s5, -1 +// GFX13: v_s_rsq_f32 s5, -1 ; encoding: [0x05,0x00,0x86,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rsq_f32 s5, 0.5 +// GFX13: v_s_rsq_f32 s5, 0.5 ; encoding: [0x05,0x00,0x86,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rsq_f32 s5, src_scc +// GFX13: v_s_rsq_f32 s5, src_scc ; encoding: [0x05,0x00,0x86,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rsq_f32 s105, 0xaf123456 +// GFX13: v_s_rsq_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x86,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_rsq_f32 s5, -s1 +// GFX13: v_s_rsq_f32 s5, -s1 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x22] + +v_s_rsq_f32 s5, |s1| +// GFX13: v_s_rsq_f32 s5, |s1| ; encoding: [0x05,0x01,0x86,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f32 s5, s1 clamp +// GFX13: v_s_rsq_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x86,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f32 s5, s1 mul:2 +// GFX13: v_s_rsq_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rsq_f32 s5, s1 mul:4 +// GFX13: v_s_rsq_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x12] + +v_s_rsq_f32 s5, s1 div:2 +// GFX13: v_s_rsq_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x01,0x1a] + +v_s_rsq_f16 s5, s1 +// GFX13: v_s_rsq_f16 s5, s1 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f16 s5, s105 +// GFX13: v_s_rsq_f16 s5, s105 ; encoding: [0x05,0x00,0x87,0xd6,0x69,0x00,0x01,0x02] + +v_s_rsq_f16 s5, vcc_lo +// GFX13: v_s_rsq_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd6,0x6a,0x00,0x01,0x02] + +v_s_rsq_f16 s5, vcc_hi +// GFX13: v_s_rsq_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd6,0x6b,0x00,0x01,0x02] + +v_s_rsq_f16 s5, ttmp15 +// GFX13: v_s_rsq_f16 
s5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd6,0x7b,0x00,0x01,0x02] + +v_s_rsq_f16 s5, m0 +// GFX13: v_s_rsq_f16 s5, m0 ; encoding: [0x05,0x00,0x87,0xd6,0x7d,0x00,0x01,0x02] + +v_s_rsq_f16 s5, exec_lo +// GFX13: v_s_rsq_f16 s5, exec_lo ; encoding: [0x05,0x00,0x87,0xd6,0x7e,0x00,0x01,0x02] + +v_s_rsq_f16 s5, exec_hi +// GFX13: v_s_rsq_f16 s5, exec_hi ; encoding: [0x05,0x00,0x87,0xd6,0x7f,0x00,0x01,0x02] + +v_s_rsq_f16 s5, null +// GFX13: v_s_rsq_f16 s5, null ; encoding: [0x05,0x00,0x87,0xd6,0x7c,0x00,0x01,0x02] + +v_s_rsq_f16 s5, -1 +// GFX13: v_s_rsq_f16 s5, -1 ; encoding: [0x05,0x00,0x87,0xd6,0xc1,0x00,0x01,0x02] + +v_s_rsq_f16 s5, 0.5 +// GFX13: v_s_rsq_f16 s5, 0.5 ; encoding: [0x05,0x00,0x87,0xd6,0xf0,0x00,0x01,0x02] + +v_s_rsq_f16 s5, src_scc +// GFX13: v_s_rsq_f16 s5, src_scc ; encoding: [0x05,0x00,0x87,0xd6,0xfd,0x00,0x01,0x02] + +v_s_rsq_f16 s105, 0xaf12 +// GFX13: v_s_rsq_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x87,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_rsq_f16 s5, -s1 +// GFX13: v_s_rsq_f16 s5, -s1 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x22] + +v_s_rsq_f16 s5, |s1| +// GFX13: v_s_rsq_f16 s5, |s1| ; encoding: [0x05,0x01,0x87,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f16 s5, s1 clamp +// GFX13: v_s_rsq_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x87,0xd6,0x01,0x00,0x01,0x02] + +v_s_rsq_f16 s5, s1 mul:2 +// GFX13: v_s_rsq_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x0a] + +v_s_rsq_f16 s5, s1 mul:4 +// GFX13: v_s_rsq_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x12] + +v_s_rsq_f16 s5, s1 div:2 +// GFX13: v_s_rsq_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x01,0x1a] + +v_s_sqrt_f32 s5, s1 +// GFX13: v_s_sqrt_f32 s5, s1 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, s105 +// GFX13: v_s_sqrt_f32 s5, s105 ; encoding: [0x05,0x00,0x88,0xd6,0x69,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, vcc_lo +// GFX13: v_s_sqrt_f32 s5, vcc_lo ; encoding: 
[0x05,0x00,0x88,0xd6,0x6a,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, vcc_hi +// GFX13: v_s_sqrt_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd6,0x6b,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, ttmp15 +// GFX13: v_s_sqrt_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd6,0x7b,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, m0 +// GFX13: v_s_sqrt_f32 s5, m0 ; encoding: [0x05,0x00,0x88,0xd6,0x7d,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, exec_lo +// GFX13: v_s_sqrt_f32 s5, exec_lo ; encoding: [0x05,0x00,0x88,0xd6,0x7e,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, exec_hi +// GFX13: v_s_sqrt_f32 s5, exec_hi ; encoding: [0x05,0x00,0x88,0xd6,0x7f,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, null +// GFX13: v_s_sqrt_f32 s5, null ; encoding: [0x05,0x00,0x88,0xd6,0x7c,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, -1 +// GFX13: v_s_sqrt_f32 s5, -1 ; encoding: [0x05,0x00,0x88,0xd6,0xc1,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, 0.5 +// GFX13: v_s_sqrt_f32 s5, 0.5 ; encoding: [0x05,0x00,0x88,0xd6,0xf0,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, src_scc +// GFX13: v_s_sqrt_f32 s5, src_scc ; encoding: [0x05,0x00,0x88,0xd6,0xfd,0x00,0x01,0x02] + +v_s_sqrt_f32 s105, 0xaf123456 +// GFX13: v_s_sqrt_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x88,0xd6,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_s_sqrt_f32 s5, -s1 +// GFX13: v_s_sqrt_f32 s5, -s1 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x22] + +v_s_sqrt_f32 s5, |s1| +// GFX13: v_s_sqrt_f32 s5, |s1| ; encoding: [0x05,0x01,0x88,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, s1 clamp +// GFX13: v_s_sqrt_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x88,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f32 s5, s1 mul:2 +// GFX13: v_s_sqrt_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x0a] + +v_s_sqrt_f32 s5, s1 mul:4 +// GFX13: v_s_sqrt_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x12] + +v_s_sqrt_f32 s5, s1 div:2 +// GFX13: v_s_sqrt_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x01,0x1a] + +v_s_sqrt_f16 s5, s1 +// GFX13: v_s_sqrt_f16 s5, s1 ; encoding: 
[0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, s105 +// GFX13: v_s_sqrt_f16 s5, s105 ; encoding: [0x05,0x00,0x89,0xd6,0x69,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, vcc_lo +// GFX13: v_s_sqrt_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x89,0xd6,0x6a,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, vcc_hi +// GFX13: v_s_sqrt_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x89,0xd6,0x6b,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, ttmp15 +// GFX13: v_s_sqrt_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x89,0xd6,0x7b,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, m0 +// GFX13: v_s_sqrt_f16 s5, m0 ; encoding: [0x05,0x00,0x89,0xd6,0x7d,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, exec_lo +// GFX13: v_s_sqrt_f16 s5, exec_lo ; encoding: [0x05,0x00,0x89,0xd6,0x7e,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, exec_hi +// GFX13: v_s_sqrt_f16 s5, exec_hi ; encoding: [0x05,0x00,0x89,0xd6,0x7f,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, null +// GFX13: v_s_sqrt_f16 s5, null ; encoding: [0x05,0x00,0x89,0xd6,0x7c,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, -1 +// GFX13: v_s_sqrt_f16 s5, -1 ; encoding: [0x05,0x00,0x89,0xd6,0xc1,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, 0.5 +// GFX13: v_s_sqrt_f16 s5, 0.5 ; encoding: [0x05,0x00,0x89,0xd6,0xf0,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, src_scc +// GFX13: v_s_sqrt_f16 s5, src_scc ; encoding: [0x05,0x00,0x89,0xd6,0xfd,0x00,0x01,0x02] + +v_s_sqrt_f16 s105, 0xaf12 +// GFX13: v_s_sqrt_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x89,0xd6,0xff,0x00,0x01,0x02,0x12,0xaf,0x00,0x00] + +v_s_sqrt_f16 s5, -s1 +// GFX13: v_s_sqrt_f16 s5, -s1 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x22] + +v_s_sqrt_f16 s5, |s1| +// GFX13: v_s_sqrt_f16 s5, |s1| ; encoding: [0x05,0x01,0x89,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, s1 clamp +// GFX13: v_s_sqrt_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x89,0xd6,0x01,0x00,0x01,0x02] + +v_s_sqrt_f16 s5, s1 mul:2 +// GFX13: v_s_sqrt_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x0a] + +v_s_sqrt_f16 s5, s1 mul:4 +// GFX13: v_s_sqrt_f16 s5, s1 mul:4 ; encoding: 
[0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x12] + +v_s_sqrt_f16 s5, s1 div:2 +// GFX13: v_s_sqrt_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x01,0x1a] + +v_cvt_sr_pk_f16_f32 v5, v1, v2, s3 +// GFX13: v_cvt_sr_pk_f16_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0xc3,0xd6,0x01,0x05,0x0e,0x00] + +v_cvt_sr_pk_f16_f32 v5, v255, s2, s105 +// GFX13: v_cvt_sr_pk_f16_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0xc3,0xd6,0xff,0x05,0xa4,0x01] + +v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi +// GFX13: v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0xc3,0xd6,0x01,0xfe,0xff,0x01] + +v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo +// GFX13: v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0xc3,0xd6,0x69,0xd2,0xf8,0x01] + +v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0xc3,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0xc3,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 +// GFX13: v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0xc3,0xd6,0x7b,0xfa,0xed,0x61] + +v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0 +// GFX13: v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0xc3,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0xc3,0xd6,0x7e,0x82,0xad,0x01] + +v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo +// GFX13: v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0xc3,0xd6,0x7f,0xf8,0xa8,0x21] + +v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0xc3,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc +// GFX13: 
v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0xc3,0xd6,0xc1,0xfe,0xf4,0x43] + +v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0xc3,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0xc3,0xd6,0xfd,0xd4,0x04,0x33] + +v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0xc3,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cvt_pk_f16_f32 v5, v1, v2 +// GFX13: v_cvt_pk_f16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x74,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pk_f16_f32 v5, v255, v255 +// GFX13: v_cvt_pk_f16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x74,0xd7,0xff,0xff,0x03,0x02] + +v_cvt_pk_f16_f32 v5, s1, s2 +// GFX13: v_cvt_pk_f16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x74,0xd7,0x01,0x04,0x00,0x02] + +v_cvt_pk_f16_f32 v5, s105, s105 +// GFX13: v_cvt_pk_f16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x74,0xd7,0x69,0xd2,0x00,0x02] + +v_cvt_pk_f16_f32 v5, vcc_lo, ttmp15 +// GFX13: v_cvt_pk_f16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x74,0xd7,0x6a,0xf6,0x00,0x02] + +v_cvt_pk_f16_f32 v5, vcc_hi, 0xaf123456 +// GFX13: v_cvt_pk_f16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x74,0xd7,0x6b,0xfe,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_pk_f16_f32 v5, ttmp15, src_scc +// GFX13: v_cvt_pk_f16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x74,0xd7,0x7b,0xfa,0x01,0x02] + +v_cvt_pk_f16_f32 v5, m0, 0.5 +// GFX13: v_cvt_pk_f16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x74,0xd7,0x7d,0xe0,0x01,0x02] + +v_cvt_pk_f16_f32 v5, exec_lo, -1 +// GFX13: v_cvt_pk_f16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x74,0xd7,0x7e,0x82,0x01,0x02] + +v_cvt_pk_f16_f32 v5, exec_hi, null +// GFX13: v_cvt_pk_f16_f32 v5, exec_hi, null ; encoding: 
[0x05,0x00,0x74,0xd7,0x7f,0xf8,0x00,0x02] + +v_cvt_pk_f16_f32 v5, null, exec_lo +// GFX13: v_cvt_pk_f16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x74,0xd7,0x7c,0xfc,0x00,0x02] + +v_cvt_pk_f16_f32 v5, -1, exec_hi +// GFX13: v_cvt_pk_f16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x74,0xd7,0xc1,0xfe,0x00,0x02] + +v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 +// GFX13: v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x74,0xd7,0xf0,0xfa,0x00,0x0a] + +v_cvt_pk_f16_f32 v5, src_scc, vcc_lo mul:4 +// GFX13: v_cvt_pk_f16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x74,0xd7,0xfd,0xd4,0x00,0x12] + +v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 +// GFX13: v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x74,0xd7,0xff,0xd6,0x00,0x3a,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0xc2,0xd6,0x01,0x05,0x0e,0x00] + +v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0xc2,0xd6,0xff,0x05,0xa4,0x01] + +v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi +// GFX13: v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0xc2,0xd6,0x01,0xfe,0xff,0x01] + +v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo +// GFX13: v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0xc2,0xd6,0x69,0xd2,0xf8,0x01] + +v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0xc2,0xd6,0x6a,0xf6,0x0c,0x04] + +v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0xc2,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0xc2,0xd6,0x7b,0xfa,0xed,0x61] + +v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0 +// GFX13: 
v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0xc2,0xd6,0x7d,0xe0,0xf5,0x01] + +v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi +// GFX13: v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0xc2,0xd6,0x7e,0x82,0xad,0x01] + +v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0xc2,0xd6,0x7f,0xf8,0xa8,0x21] + +v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0xc2,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf] + +v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0xc2,0xd6,0xc1,0xfe,0xf4,0x43] + +v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0xc2,0xd6,0xf0,0xfa,0xc0,0x4b] + +v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 +// GFX13: v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0xc2,0xd6,0xfd,0xd4,0x04,0x33] + +v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 +// GFX13: v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0xc2,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] + +v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 +// W32: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xa3,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 100.0 +// W32: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xa3,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 +// W32: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x02] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 100.0 +// W32: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 ; encoding: [0x0a,0x00,0xaf,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xaf,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0xb2,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb2,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 ; 
encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0x9b,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9b,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0x9e,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9e,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 ; encoding: 
[0x0a,0x00,0xae,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xae,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb1,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb1,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], 
v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 +// W32: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x09,0x22,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 100.0 +// W32: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xa2,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xa2,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 +// W32: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0x9d,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 100.0 +// W32: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0x9d,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 +// W32: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0x9c,0xd6,0x14,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 100.0 +// W32: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0x9c,0xd6,0x14,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x95,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x95,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x96,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x96,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], v8 +// W32: v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], v8 ; encoding: [0x14,0x00,0x97,0xd6,0x06,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], 100.0 +// W32: v_cvt_scalef32_pk32_bf6_f32 v[20:25], v[6:37], 0x42c80000 ; encoding: [0x14,0x00,0x97,0xd6,0x06,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x98,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x98,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 +// W32: v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x99,0xd6,0x0a,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], 100.0 +// W32: v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], 0x42c80000 ; encoding: [0x14,0x00,0x99,0xd6,0x0a,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], v8 +// W32: v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], v8 ; encoding: [0x14,0x00,0x9a,0xd6,0x06,0x11,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], 100.0 +// W32: v_cvt_scalef32_pk32_fp6_f32 v[20:25], v[6:37], 0x42c80000 ; encoding: [0x14,0x00,0x9a,0xd6,0x06,0xff,0x01,0x02,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa4,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: 
[0x00,0x00,0xa4,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa5,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa5,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 +// W32: v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 ; encoding: [0x00,0x00,0xa6,0xd6,0x06,0x4d,0x9e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa6,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa7,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa7,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], v22, v23 +// W32: v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], v22, v23 ; encoding: [0x00,0x00,0xa8,0xd6,0x06,0x2d,0x5e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f16 
v[0:5], v[6:21], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[6:21], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa8,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 +// W32: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 ; encoding: [0x00,0x00,0xa9,0xd6,0x06,0x4d,0x9e,0x04] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], s3, 100.0 +// W32: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], s3, 0x42c80000 ; encoding: [0x00,0x00,0xa9,0xd6,0x06,0x07,0xfc,0x03,0x00,0x00,0xc8,0x42] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_aliases.s new file mode 100644 index 0000000000000..b48bdb81c9616 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_aliases.s @@ -0,0 +1,54 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding %s | FileCheck --check-prefixes=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13 %s + +v_min3_f32 v5, v1, v2, v3 +// GFX13: v_min3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x04] + +v_max3_f32 v5, v1, v2, v3 +// GFX13: v_max3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x04] + +v_min3_f16 v5, v1, v2, v3 +// GFX13: v_min3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x51,0xd7,0x01,0x05,0x0e,0x04] + +v_max3_f16 v5, v1, v2, v3 +// GFX13: v_max3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x54,0xd7,0x01,0x05,0x0e,0x04] + +v_med3_f32 v5, v1, v2, v3 +// GFX13: v_med3_num_f32 v5, v1, v2, v3 ; encoding: 
[0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x04] + +v_med3_f16 v5, v1, v2, v3 +// GFX13: v_med3_num_f16 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x57,0xd7,0x01,0x05,0x0e,0x04] + +v_minmax_f32_e64_dpp v0, -v1, -v2, -v3 dpp8:[0,1,2,3,4,5,6,7] +// GFX13: v_minmax_num_f32_e64_dpp v0, -v1, -v2, -v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x00,0x68,0xd6,0xe9,0x04,0x0e,0xe4,0x01,0x88,0xc6,0xfa] + +v_maxmin_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] +// GFX13: v_maxmin_num_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x80,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0xc6,0xfa] + +v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i64_i32 v[5:6], s12, v1, v2, v[3:4] +// GFX13: v_mad_co_i64_i32 v[5:6], s12, v1, v2, v[3:4] ; encoding: [0x05,0x0c,0xf9,0xd6,0x01,0x05,0x0e,0x04] + +v_mad_u64_u32 v[5:6], s12, v1, v2, v[3:4] +// GFX13: v_mad_co_u64_u32 v[5:6], s12, v1, v2, v[3:4] ; encoding: [0x05,0x0c,0xf8,0xd6,0x01,0x05,0x0e,0x04] + +v_max_f64 v[5:6], s[2:3], s[4:5] +// GFX13: v_max_num_f64_e64 v[5:6], s[2:3], s[4:5] ; encoding: [0x05,0x00,0x0e,0xd5,0x02,0x08,0x00,0x02] + +v_min_f64 v[5:6], s[2:3], s[4:5] +// GFX13: v_min_num_f64_e64 v[5:6], s[2:3], s[4:5] ; encoding: [0x05,0x00,0x0d,0xd5,0x02,0x08,0x00,0x02] + +v_cvt_pknorm_i16_f16 v5, v1, v2 +// GFX13: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x02] + +v_cvt_pknorm_u16_f16 v5, v1, v2 +// GFX13: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x02] + +v_perm_pk4_b8_u8 v5, v1, v2, s3 +// GFX13: v_perm_b32 v5, v1, v2, s3 ; 
encoding: [0x05,0x00,0x44,0xd7,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16-fake16.s new file mode 100644 index 0000000000000..5bd92cfc0355a --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16-fake16.s @@ -0,0 +1,5574 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6d,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + 
+v_add3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_add3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_add3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6d,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_mirror +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 +// W32: 
v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 +// W32: v_add_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x69,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x6a,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// 
W32: v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6b,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7b,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: 
:[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x68,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_add_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x6a,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7a,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xfc,0x0f,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, 
vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x00,0x47,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 
row_shl:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// 
GFX13: v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x7f,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3 row_half_mirror 
+// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + 
+v_alignbit_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_alignbit_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_alignbit_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x16,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, 
v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_alignbyte_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x17,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_and_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_and_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_and_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_and_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_and_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_and_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_and_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_and_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x67,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// 
GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_and_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_and_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_and_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; 
encoding: [0xff,0x00,0x71,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_ashr_pk_i8_i32 v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 ; encoding: [0x02,0x40,0x90,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0x53] + +v_ashr_pk_i8_i32 v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x90,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0xff] + +v_ashr_pk_i8_i32 v2, v4, v7, v8 quad_perm:[1,2,3,1] +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, v8 quad_perm:[1,2,3,1] row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x90,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x79,0x00,0xff] + +v_ashr_pk_i8_i32 v2, v4, v7, v8 row_share:3 fi:1 +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, v8 row_share:3 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x02,0x00,0x90,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x53,0x05,0xff] + +v_ashr_pk_u8_i32 v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 ; encoding: [0x02,0x40,0x91,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0x53] + +v_ashr_pk_u8_i32 v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x91,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0xff] + +v_ashr_pk_u8_i32 v2, v4, v7, v8 quad_perm:[1,2,3,1] +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, v8 quad_perm:[1,2,3,1] row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x91,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x79,0x00,0xff] + +v_ashr_pk_u8_i32 v2, v4, v7, v8 row_share:3 fi:1 +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, v8 row_share:3 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x02,0x00,0x91,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x53,0x05,0xff] + 
+v_ashrrev_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_ashrrev_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_ashrrev_i16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_ashrrev_i16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x08,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: 
v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_bcnt_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_bfe_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_bfe_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfe_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x11,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, v255 
row_half_mirror +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_bfe_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 
fi:0 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_bfe_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfe_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x10,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// 
GFX13: v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_bfi_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_bfi_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfi_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x12,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:101 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:0x65 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x04,0x33,0xd6,0xfa,0xfe,0xf7,0xab,0xff,0x6f,0x05,0x30] + +v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:104 op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:0x68 op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x45,0x33,0xd6,0xfa,0xfe,0xf7,0x0b,0xff,0x6f,0x0d,0x30] + +v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:104 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:0x68 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x25,0x33,0xd6,0xfa,0x04,0x06,0x0b,0x01,0x60,0x01,0x13] + +v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:99 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:0x63 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x06,0x6b,0x01,0x60,0x09,0x13] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:102 op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5, 
v1, v2, exec_hi bitop3:0x66 op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x33,0xd6,0xfa,0x04,0xfe,0xc9,0x01,0x2f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:6 row_ror:15 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0xc1,0x01,0x2f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x33,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:103 op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:0x67 op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x33,0xd6,0xfa,0x04,0xfa,0xe9,0x01,0x50,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:77 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:0x4d row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x33,0xd6,0xfa,0x04,0xfa,0xa9,0x01,0x50,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:104 op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:0x68 op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x33,0xd6,0xfa,0x04,0xf2,0x09,0x01,0x5f,0x01,0x01] + +v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:88 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:0x58 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x03,0x33,0xd6,0xfa,0x04,0xf2,0x09,0x01,0x5f,0x01,0x01] + +v_bitop3_b16_e64_dpp v5, v1, v2, s105 bitop3:0x16 row_shl:15 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, s105 bitop3:0x16 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x33,0xd6,0xfa,0x04,0xa6,0xc1,0x01,0x0f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, ttmp15 bitop3:5 row_ror:1 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, ttmp15 bitop3:5 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xee,0xa1,0x01,0x21,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, v255 bitop3:0 row_shl:1 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0xe4,0x01,0x40,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:100 row_half_mirror +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0x64 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x8c,0x01,0x41,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:161 quad_perm:[0,1,2,3] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0xa1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x34,0x01,0xe4,0x00,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, vcc_hi bitop3:63 row_shr:1 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, vcc_hi bitop3:0x3f row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x07,0x33,0xd6,0xfa,0x04,0xae,0xe1,0x01,0x11,0x01,0xff] + +v_bitop3_b16_e64_dpp v5, v1, v2, vcc_lo 
bitop3:0x24 row_shr:15 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, vcc_lo bitop3:0x24 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0xaa,0x81,0x01,0x1f,0x01,0xff] + +v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:101 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:0x65 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x04,0x34,0xd6,0xfa,0xfe,0xf7,0xab,0xff,0x6f,0x05,0x30] + +v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:88 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:0x58 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x03,0x34,0xd6,0xfa,0x04,0x06,0x0b,0x01,0x5f,0x01,0x01] + +v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:99 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:0x63 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0xc2,0x6b,0x01,0x60,0x09,0x13] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:5 row_ror:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:5 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:6 row_ror:15 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:77 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:0x4d row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x34,0xd6,0xfa,0x04,0xf2,0xa9,0x01,0x50,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, s105 bitop3:0 row_shl:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x34,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x24 row_shr:15 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x24 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:100 row_half_mirror +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:0x64 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0xfe,0x8f,0x01,0x41,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0x0e,0xe4,0x01,0x40,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:161 quad_perm:[0,1,2,3] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:0xa1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0x0e,0x34,0x01,0xe4,0x00,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:0x15 row_shl:15 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:0x15 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x34,0xd6,0xfa,0x04,0xae,0xa1,0x01,0x0f,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo bitop3:63 row_shr:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo bitop3:0x3f row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x07,0x34,0xd6,0xfa,0x04,0xaa,0xe1,0x01,0x11,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:35: 
error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for 
instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + 
+v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, 
-|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0c,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0c,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0c,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + 
+v_cubema_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0f,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0f,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0f,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0f,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0f,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0f,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0f,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + 
+v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0d,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0d,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0d,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0d,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0d,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0d,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0d,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0e,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0e,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x04,0x0e,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0e,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0e,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0e,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6e,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: 
[0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, 
-v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f16 v1, v2, v3 quad_perm:[0,1,2,3] fi:1 +// GFX13: v_cvt_sr_bf8_f16_e64_dpp 
v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x04,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] 
row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f16 v1, v2, v3 quad_perm:[0,1,2,3] fi:1 +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x04,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe 
bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 
quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x21,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x21,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x21,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v255, 
v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: 
v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x12,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x12,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_mirror +// GFX13: 
v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 row_share:15 
row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x13,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x13,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x13,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x22,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x22,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x22,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_u16_u32_e64_dpp v5, 
v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6a,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x26,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: 
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0x26,0xd6,0xfa,0xfe,0xf7,0x23,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x12,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x12,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x68,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x68,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x68,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x13,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: 
v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x13,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x13,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x69,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x69,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x69,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_div_fixup_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x5f,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x5f,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x5f,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x5f,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x5f,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x5f,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x5f,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x4b,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x4b,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x4b,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x4b,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x4b,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: 
v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x4b,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x4b,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_fma_f32_e64_dpp v5, |v1|, v2, 
-ttmp15 row_shr:15 +// GFX13: v_fma_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x13,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x13,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x13,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x13,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x13,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x13,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x13,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 
quad_perm:[0,1,2,3] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13] + +v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x62,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: 
v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_lerp_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_lerp_u8_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lerp_u8_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x00,0x15,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x46,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x46,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6f,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_lshlrev_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_lshlrev_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_lshlrev_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshlrev_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x14,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_lshrrev_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// 
GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_lshrrev_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_lshrrev_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshrrev_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x39,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, s105 row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x5e,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + 
+v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + 
+v_mad_i32_i16_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_mad_i32_i16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x75,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v255 
row_half_mirror +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + 
+v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0a,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x40,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x40,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, v3 
quad_perm:[0,1,2,3] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf 
bank_mask:0xf +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x73,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, s105 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_u32_u24_e64_dpp v255, v255, v255, 
src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x01,0x54,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x54,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x54,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x54,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x54,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x54,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x54,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: 
v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: 
v_max3_i16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_max3_i16_e64_dpp v255, v255, v255, 
src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x55,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, exec_hi 
row_ror:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_max3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_max3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x56,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_max3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x56,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_lo 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_max3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_max3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1e,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_max_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + 
+v_max_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_max_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_max_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
+ +v_max_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_max_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_max_i16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max_i16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x0a,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_max_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_max_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_max_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_max_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_shr:1 
+// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_max_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_max_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] 
+// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + 
+v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x69,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x69,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x69,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x69,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x69,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + 
+v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x69,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x69,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo 
row_shr:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_maxmin_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x64,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x62,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_maxmin_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x62,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + 
+v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x66,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x65,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x57,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_med3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x57,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x57,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x57,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x57,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x57,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x57,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x57,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x31,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x31,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x31,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x31,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x31,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x06,0x31,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x31,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_med3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x58,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + 
+v_med3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_med3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_med3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x20,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_med3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x59,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_med3_u32_e64_dpp v5, v1, v2, v3 
quad_perm:[3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_lo 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x21,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x51,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x51,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x51,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x51,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x51,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x51,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x51,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x51,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x29,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x29,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x29,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x29,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x29,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x29,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x29,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] 
+// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_min3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x52,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + 
+v_min3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_min3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0x1a,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_min3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1a,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, v3 row_half_mirror +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, v255 row_shl:1 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x53,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_min3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x53,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, 
v3 row_mirror +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: 
v_min3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_min3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_min3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_min_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_min_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_min_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_min_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_min_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_min_i16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min_i16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x0c,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_min_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_min_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_min_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_min_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_min_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_min_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + 
+v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x68,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: 
v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x68,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x68,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x68,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x68,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x68,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x68,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x65,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_minmax_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_minmax_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x65,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi 
row_shl:15 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_minmax_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_minmax_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x63,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x39,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_msad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x39,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mul_lo_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_half_mirror +// 
GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_mul_lo_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_mul_lo_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mul_lo_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x05,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x18,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x18,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x18,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x18,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x18,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x18,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x18,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: 
v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_or3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_or3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_or3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x72,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_or_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_or_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_or_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_or_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_or_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_or_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_or_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_or_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_or_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_pack_b32_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_pack_b32_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x11,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_pack_b32_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x11,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x44,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_perm_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_perm_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: 
v_perm_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x44,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x23,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x24,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_u16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x24,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_lo 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x25,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + 
+v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, 
null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x22,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_mirror +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: 
:[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 +// W32: v_sub_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x69,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x6a,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid 
operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6b,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7b,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x68,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 
bank_mask:0x1 +// W64: v_sub_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6a,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7a,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xfc,0x10,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: 
v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_sub_nc_i32_e64_dpp v5, v1, v2 
quad_perm:[3,2,1,0] +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x76,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, 
v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v255, v255, v255 clamp 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_mirror +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 +// W32: v_subrev_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x69,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x6a,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6b,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7b,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + 
+v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x68,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6a,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7a,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand 
for instruction + +v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xfc,0x19,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x45,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_xad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_xad_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_xad_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x45,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 row_mirror +// 
GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, 
v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_xor3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_xor3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_xor3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x40,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_xor_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_xor_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x70,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: 
v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, 
|v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x12,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x0a,0x13,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x13,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x5f,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x5f,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x5f,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + 
+v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x5f,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x5f,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x4b,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x4b,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x4b,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x4b,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] 
clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x4b,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_mad_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x5e,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x5e,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x5e,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x5e,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_mad_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x5e,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_mad_i32_i16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0x75,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x01,0x13] + +v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: 
v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x75,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_mad_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x40,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x40,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x40,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x40,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_mad_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x40,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0x73,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x01,0x13] + +v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x73,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x54,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x54,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x54,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x54,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_max3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x78,0x55,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x55,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x55,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x55,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_max3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_max3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x55,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_max3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x56,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x56,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x10,0x56,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x56,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_max3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_max3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x56,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x57,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x57,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x57,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x57,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_med3_num_f16_e64_dpp 
v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x57,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_med3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x58,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x58,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x58,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x58,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_med3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_med3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x58,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_med3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x59,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: 
v_med3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x59,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x59,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x59,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x59,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x51,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x51,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x51,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 
row_mask:0x1 bank_mask:0x3 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x51,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x51,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_min3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x52,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x52,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x52,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x52,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_min3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_min3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0x40,0x52,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_min3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x53,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x53,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x53,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x53,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_min3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_min3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x53,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x0a,0x11,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + 
+v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_minimum_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_minimum_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_minimum_f32 v5, v1, v2 row_mirror +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_half_mirror +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shl:1 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shl:15 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shr:1 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shr:15 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_ror:1 +// GFX13: v_minimum_f32_e64_dpp v5, 
v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_ror:15 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_minimum_f32 v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3c,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_minimum_f32 v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x3c,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_minimum_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3c,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_maximum_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_maximum_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_maximum_f32 v5, v1, v2 row_mirror +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_half_mirror +// GFX13: 
v_maximum_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shl:1 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shl:15 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shr:1 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shr:15 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_ror:1 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_ror:15 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_maximum_f32 v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3d,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_maximum_f32 v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x02,0x3d,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_maximum_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3d,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_minimum_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_minimum_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_minimum_f16 v5, v1, v2 row_mirror +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_half_mirror +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_shl:1 +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_shl:15 +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_shr:1 +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_shr:15 +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_ror:1 +// GFX13: 
v_minimum_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_ror:15 +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_minimum_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_minimum_f16 v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3a,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_minimum_f16 v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x3a,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_minimum_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3a,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_maximum_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_maximum_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_maximum_f16 v5, v1, v2 row_mirror +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_half_mirror +// 
GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_shl:1 +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_shl:15 +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_shr:1 +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_shr:15 +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_ror:1 +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_ror:15 +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_maximum_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_maximum_f16 v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum_f16_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3b,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_maximum_f16 v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum_f16_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x02,0x3b,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_maximum_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3b,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_minimum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimum3_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimum3_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minimum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x01,0x2d,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimum3_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2d,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimum3_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minimum3_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2d,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimum3_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2d,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimum3_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2d,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minimum3_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2d,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2d,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maximum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maximum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximum3_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximum3_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maximum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2e,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximum3_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2e,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximum3_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maximum3_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2e,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximum3_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x03,0x2e,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximum3_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2e,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maximum3_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2e,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minimum3_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimum3_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimum3_f16 v5, v1, v2, v3 row_mirror +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimum3_f16 v5, v1, v2, v255 row_half_mirror +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimum3_f16 v5, v1, v2, s105 row_shl:1 +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2f,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimum3_f16 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimum3_f16 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimum3_f16 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minimum3_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2f,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimum3_f16 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minimum3_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2f,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimum3_f16 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minimum3_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2f,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimum3_f16 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum3_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2f,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimum3_f16 v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum3_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2f,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_minimum3_f16 v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum3_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2f,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_minimum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum3_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2f,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_maximum3_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maximum3_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximum3_f16 v5, v1, v2, v3 row_mirror +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximum3_f16 v5, v1, v2, v255 row_half_mirror +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximum3_f16 v5, v1, v2, s105 row_shl:1 +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximum3_f16 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximum3_f16 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximum3_f16 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maximum3_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x30,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximum3_f16 v5, v1, -|v2|, exec_hi 
row_ror:1 +// GFX13: v_maximum3_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x30,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximum3_f16 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maximum3_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x30,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximum3_f16 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum3_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x30,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximum3_f16 v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum3_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x30,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_maximum3_f16 v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum3_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x30,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_maximum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum3_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x30,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_maximumminimum_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximumminimum_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, v255 row_half_mirror +// 
GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximumminimum_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maximumminimum_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6d,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximumminimum_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6d,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximumminimum_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maximumminimum_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6d,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximumminimum_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6d,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximumminimum_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x05,0x6d,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maximumminimum_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6d,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximumminimum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6d,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimummaximum_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimummaximum_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minimummaximum_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimummaximum_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimummaximum_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minimummaximum_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimummaximum_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimummaximum_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6c,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minimummaximum_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6c,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimummaximum_f32_e64_dpp 
v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6c,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maximumminimum_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maximumminimum_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximumminimum_f16 v5, v1, v2, v3 row_mirror +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximumminimum_f16 v5, v1, v2, v255 row_half_mirror +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximumminimum_f16 v5, v1, v2, s105 row_shl:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximumminimum_f16 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximumminimum_f16 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximumminimum_f16 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maximumminimum_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6f,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + 
+v_maximumminimum_f16 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6f,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximumminimum_f16 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maximumminimum_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6f,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximumminimum_f16 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximumminimum_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6f,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximumminimum_f16 v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximumminimum_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6f,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_maximumminimum_f16 v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6f,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_maximumminimum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximumminimum_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6f,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_minimummaximum_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimummaximum_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimummaximum_f16 v5, v1, v2, v3 row_mirror +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimummaximum_f16 v5, v1, v2, v255 row_half_mirror +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimummaximum_f16 v5, v1, v2, s105 row_shl:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimummaximum_f16 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimummaximum_f16 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimummaximum_f16 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minimummaximum_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6e,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimummaximum_f16 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6e,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimummaximum_f16 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minimummaximum_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6e,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimummaximum_f16 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: 
v_minimummaximum_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6e,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimummaximum_f16 v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimummaximum_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6e,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_minimummaximum_f16 v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6e,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_minimummaximum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimummaximum_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6e,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + 
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0xc3,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0xc3,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0xc3,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0xc3,0xd6,0xfa,0x04,0x16,0x52,0x01,0x60,0x09,0x13] + +v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; 
encoding: [0xff,0x83,0xc3,0xd6,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + 
+v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13] + +v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x74,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + 
+v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0xc2,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0xc2,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0xc2,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x02,0xc2,0xd6,0xfa,0x04,0x16,0x52,0x01,0x60,0x09,0x13] + +v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0xc2,0xd6,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16.s new file mode 100644 index 0000000000000..e8b6c03eeae7d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp16.s @@ -0,0 +1,5587 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W64,W64-ASM %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W64,W64-DIS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_add3_u32_e64_dpp v5, v1, v2, v3 
quad_perm:[3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_lo 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_add3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_add3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_add3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6d,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_mirror +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + 
+v_add_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 +// W32: v_add_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x69,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf +// W32: v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x6a,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6b,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7b,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// 
W64: v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x68,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6a,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7a,0x0f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xfc,0x0f,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x47,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_add_lshl_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: 
v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x47,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x47,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + 
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_add_nc_i32_e64_dpp v5, v1, 
v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i32_e64_dpp v5, v1, 
v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x7f,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x7f,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_alignbit_b32_e64_dpp v5, v1, v2, 
v3.l quad_perm:[3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[0,1,2,3] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3.l row_mirror +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3.l row_half_mirror +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, v255.l row_shl:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_alignbit_b32_e64_dpp 
v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_alignbit_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_alignbit_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_alignbit_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_alignbit_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x16,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[0,1,2,3] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l row_mirror +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l 
row_half_mirror +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v255.l row_shl:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, s105 row_shl:15 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_alignbyte_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x17,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_alignbyte_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x17,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_and_b16_e64_dpp v5.l, 
v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x67,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_and_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_and_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x67,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + 
+v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, null 
row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_and_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_and_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x71,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_and_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_and_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x71,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_ashr_pk_i8_i32 v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 ; encoding: [0x02,0x40,0x90,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0x53] + +v_ashr_pk_i8_i32 v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x90,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0xff] + +v_ashr_pk_i8_i32 v2, v4, v7, v8 quad_perm:[1,2,3,1] +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, v8 quad_perm:[1,2,3,1] row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x90,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x79,0x00,0xff] + +v_ashr_pk_i8_i32 v2, v4, v7, v8 row_share:3 fi:1 +// GFX13: v_ashr_pk_i8_i32_e64_dpp v2, v4, v7, v8 row_share:3 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x02,0x00,0x90,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x53,0x05,0xff] + +v_ashr_pk_u8_i32 v2, v4, v7, 1 
op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, 1 op_sel:[0,0,0,1] row_share:0 row_mask:0x5 bank_mask:0x3 ; encoding: [0x02,0x40,0x91,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0x53] + +v_ashr_pk_u8_i32 v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, 1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x91,0xd6,0xfa,0x0e,0x06,0x02,0x04,0x50,0x01,0xff] + +v_ashr_pk_u8_i32 v2, v4, v7, v8 quad_perm:[1,2,3,1] +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, v8 quad_perm:[1,2,3,1] row_mask:0xf bank_mask:0xf ; encoding: [0x02,0x00,0x91,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x79,0x00,0xff] + +v_ashr_pk_u8_i32 v2, v4, v7, v8 row_share:3 fi:1 +// GFX13: v_ashr_pk_u8_i32_e64_dpp v2, v4, v7, v8 row_share:3 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x02,0x00,0x91,0xd6,0xfa,0x0e,0x22,0x04,0x04,0x53,0x05,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x08,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_ashrrev_i16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 +// GFX13: v_ashrrev_i16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x08,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x64,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_bcnt_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: 
v_bfe_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_bfe_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_bfe_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: 
v_bfe_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_bfe_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfe_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x11,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: 
v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_bfe_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_bfe_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_bfe_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfe_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x10,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_bfi_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_bfi_b32_e64_dpp 
v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_bfi_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_bfi_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfi_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x12,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, 
v2 row_shr:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_bfm_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_bitop3_b16_e64_dpp v255.l, v255.l, v255.l, src_scc bitop3:101 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bitop3_b16_e64_dpp v255.l, v255.l, v255.l, src_scc bitop3:0x65 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x04,0x33,0xd6,0xfa,0xfe,0xf7,0xab,0xff,0x6f,0x05,0x30] + +v_bitop3_b16_e64_dpp v255.h, v255.l, v255.l, src_scc bitop3:104 op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_bitop3_b16_e64_dpp v255.h, v255.l, v255.l, src_scc bitop3:0x68 op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x45,0x33,0xd6,0xfa,0xfe,0xf7,0x0b,0xff,0x6f,0x0d,0x30] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:104 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:0x68 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x25,0x33,0xd6,0xfa,0x04,0x06,0x0b,0x01,0x60,0x01,0x13] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:99 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:0x63 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x06,0x6b,0x01,0x60,0x09,0x13] + +v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi bitop3:102 op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi bitop3:0x66 op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x33,0xd6,0xfa,0x04,0xfe,0xc9,0x01,0x2f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_hi bitop3:6 row_ror:15 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_hi bitop3:6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0xc1,0x01,0x2f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x33,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.h, v2.l, exec_lo bitop3:103 op_sel:[1,0,0,0] row_share:0 row_mask:0xf 
bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.h, v2.l, exec_lo bitop3:0x67 op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x33,0xd6,0xfa,0x04,0xfa,0xe9,0x01,0x50,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo bitop3:77 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo bitop3:0x4d row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x33,0xd6,0xfa,0x04,0xfa,0xa9,0x01,0x50,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.h, null bitop3:104 op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.h, null bitop3:0x68 op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x33,0xd6,0xfa,0x04,0xf2,0x09,0x01,0x5f,0x01,0x01] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, null bitop3:88 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, null bitop3:0x58 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x03,0x33,0xd6,0xfa,0x04,0xf2,0x09,0x01,0x5f,0x01,0x01] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, s105 bitop3:0x16 row_shl:15 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, s105 bitop3:0x16 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x33,0xd6,0xfa,0x04,0xa6,0xc1,0x01,0x0f,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, ttmp15 bitop3:5 row_ror:1 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, ttmp15 bitop3:5 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xee,0xa1,0x01,0x21,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v255.l bitop3:0 row_shl:1 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0x27 row_mirror +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0x27 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0xe4,0x01,0x40,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:100 row_half_mirror +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0x64 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x8c,0x01,0x41,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:161 quad_perm:[0,1,2,3] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0xa1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x34,0x01,0xe4,0x00,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi bitop3:63 row_shr:1 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi bitop3:0x3f row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x07,0x33,0xd6,0xfa,0x04,0xae,0xe1,0x01,0x11,0x01,0xff] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo bitop3:0x24 row_shr:15 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo bitop3:0x24 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0xaa,0x81,0x01,0x1f,0x01,0xff] + +v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:101 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:0x65 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x04,0x34,0xd6,0xfa,0xfe,0xf7,0xab,0xff,0x6f,0x05,0x30] + +v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:88 row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:0x58 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x03,0x34,0xd6,0xfa,0x04,0x06,0x0b,0x01,0x5f,0x01,0x01] + +v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:99 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:0x63 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0xc2,0x6b,0x01,0x60,0x09,0x13] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:5 row_ror:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:5 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:6 row_ror:15 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:77 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:0x4d row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x34,0xd6,0xfa,0x04,0xf2,0xa9,0x01,0x50,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, s105 bitop3:0 row_shl:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x24 row_shr:15 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x24 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:100 row_half_mirror +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:0x64 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0xfe,0x8f,0x01,0x41,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror +// GFX13: 
v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0x0e,0xe4,0x01,0x40,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:161 quad_perm:[0,1,2,3] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 bitop3:0xa1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x34,0xd6,0xfa,0x04,0x0e,0x34,0x01,0xe4,0x00,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:0x15 row_shl:15 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:0x15 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x34,0xd6,0xfa,0x04,0xae,0xa1,0x01,0x0f,0x01,0xff] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo bitop3:63 row_shr:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo bitop3:0x3f row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x07,0x34,0xd6,0xfa,0x04,0xaa,0xe1,0x01,0x11,0x01,0xff] + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid 
operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] +// 
W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 +// W32-ERR: 
:[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] + +v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W32-ERR: :[[@LINE-1]]:38: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] + +v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32-ERR: :[[@LINE-1]]:38: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] + +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, 
-|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0c,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0c,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0c,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, s105 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0f,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0f,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0f,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0f,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0f,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x06,0x0f,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0f,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: 
v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0d,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0d,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0d,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0d,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0d,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0d,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0d,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cubetc_f32_e64_dpp v5, 
v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x0e,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x0e,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x0e,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null 
row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x0e,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x0e,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x0e,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6e,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] 
+// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp 
v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: 
v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x7b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + 
+v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x7a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 quad_perm:[0,1,2,3] fi:1 +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: 
[0x01,0x00,0x36,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x04,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 
quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x38,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 quad_perm:[0,1,2,3] fi:1 +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x00,0x35,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x04,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, 
v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x37,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, 
v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x21,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x21,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x21,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x21,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 
row_shl:15 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 +// GFX13: 
v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x12,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x12,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + 
+v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_mirror +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x13,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x13,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x13,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x22,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x22,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: 
v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x22,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6a,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] 
+ +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x26,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0x26,0xd6,0xfa,0xfe,0xf7,0x23,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x12,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x12,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x68,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x68,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x68,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_mirror +// GFX13: 
v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x13,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x13,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x13,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x13,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x69,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x69,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, 
-|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x69,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: 
v_div_fixup_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x5f,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x5f,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x5f,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x5f,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x5f,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x5f,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_div_fixup_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x5f,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x4b,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x4b,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x4b,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x4b,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x4b,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x4b,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x4b,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x4b,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v255 row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_fma_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_fma_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x13,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x13,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x13,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x13,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x13,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// 
GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x13,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x13,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: 
v_ldexp_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x62,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13] + +v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x62,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_lerp_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_lerp_u8_e64_dpp 
v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_lerp_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_lerp_u8_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lerp_u8_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x15,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_lshl_add_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x46,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x00,0x46,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6f,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_lshl_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x6f,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x6f,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x14,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_lshlrev_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshlrev_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x14,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l 
row_shr:1 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x39,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_lshrrev_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_lshrrev_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x39,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, 
exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x5e,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_mad_i16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_i16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x5e,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 row_mirror +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v255 row_half_mirror +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, 
v2.l, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, ttmp15 row_shr:15 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_hi row_ror:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_lo row_ror:15 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + 
+v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x75,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_i32_i16_e64_dpp v255, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x75,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mad_i32_i24_e64_dpp 
v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_i32_i24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0a,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x40,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_mad_u16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_u16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x40,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 row_mirror +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v255 row_half_mirror +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, 
v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, ttmp15 row_shr:15 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_hi row_ror:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_lo row_ror:15 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + 
+v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x73,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_u32_u16_e64_dpp v255, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255.l, v255.l, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x73,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mad_u32_u24_e64_dpp 
v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_mad_u32_u24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x54,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x54,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + 
+v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x54,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x54,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x54,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x54,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x54,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, 
v1.l, v2.l, s105 row_shl:15 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x55,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_max3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: 
v_max3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x55,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_max3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_max3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_max3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: 
v_max3_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x56,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_max3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x56,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x1e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_max3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_max3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_max3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1e,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// 
GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_max_i16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max_i16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x0a,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 
+// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x09,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_max_u16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_max_u16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, 
|v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x69,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x69,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maxmin_num_f32_e64_dpp 
v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x69,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x69,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x69,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x69,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x69,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, 
v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_maxmin_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_maxmin_i32_e64_dpp 
v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_maxmin_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x64,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x62,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_maxmin_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_maxmin_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maxmin_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x62,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + 
+v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x66,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x66,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x65,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_med3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x57,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x57,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_med3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x57,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x57,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x57,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x57,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_med3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x57,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + 
+v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x31,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x31,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x04,0x31,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x31,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x31,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x31,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x31,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: 
v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x58,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x58,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_med3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x58,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: 
v_med3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_med3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_med3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_med3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x20,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x59,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_med3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x59,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x21,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_med3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x21,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x51,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x51,0xd7,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x51,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x51,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x51,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x51,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x51,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x51,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 
quad_perm:[3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x29,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x29,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, 
-v1, v2, |exec_lo| row_ror:15 +// GFX13: v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x29,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x29,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x29,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x29,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x29,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: 
v_min3_i16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x52,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_min3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x52,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x1a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_min3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_min3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_min3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1a,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l 
quad_perm:[3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x41,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x0f,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xae,0x01,0x01,0x11,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x1f,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xee,0x01,0x01,0x21,0x01,0xff] 
+ +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, null row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x53,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x09,0x13] + +v_min3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x53,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, v255 
row_half_mirror +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_min3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_min3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 
bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_min3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_min_i16_e64_dpp 
v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_min_i16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min_i16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x0c,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l 
row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_min_u16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_min_u16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, 
vcc_hi row_shl:15 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + 
+v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x68,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x68,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x68,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x68,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x68,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x68,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x68,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x65,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_minmax_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_minmax_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_minmax_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x65,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + 
+v_minmax_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_minmax_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_minmax_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0x63,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_minmax_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minmax_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x63,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x39,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_msad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_msad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x39,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_mul_lo_u16_e64_dpp 
v5.l, v1.l, v2.l row_mirror +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x05,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_mul_lo_u16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mul_lo_u16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x05,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x18,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x18,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x18,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x18,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x18,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x18,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x18,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x18,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x72,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_or3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_or3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x72,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_or3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_or3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x72,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// 
GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_or_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_or_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_mirror +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_half_mirror +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shl:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shl:15 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shr:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shr:15 +// GFX13: 
v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_ror:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_ror:15 +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_pack_b32_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_pack_b32_f16_e64_dpp v5, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x11,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_pack_b32_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x11,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x44,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_perm_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_perm_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x44,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_perm_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_perm_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x44,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: 
v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_hi_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x80,0x23,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, 
v2, exec_lo row_ror:15 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_u16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_u16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x24,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; 
encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x25,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x22,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_sad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_sad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x22,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_mirror +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
+// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 +// W32: v_sub_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x69,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x6a,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6b,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7b,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror +// 
W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[12:13], 
v1, v2 row_ror:15 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x68,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6a,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7a,0x10,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xfc,0x10,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + 
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l 
row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x76,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x76,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_mirror +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + 
+v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 +// W32: v_subrev_co_u32_e64_dpp v5, s105, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x69,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + 
+v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x6a,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6b,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7b,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror +// W32-ERR: :[[@LINE-1]]:29: 
error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, 
s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x68,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x6a,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x7a,0x19,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xfc,0x19,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_xad_u32_e64_dpp v5, 
v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_xad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_xad_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, 0.5 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x45,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_xad_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_xad_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x45,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, 
ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x21,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x2f,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xf2,0x01,0x01,0x50,0x01,0xff] + +v_xor3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, -1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x06,0x03,0x01,0x5f,0x01,0x01] + +v_xor3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x60,0x09,0x13] + +v_xor3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_xor3_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x40,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l 
row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_xor_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_xor_b16_e64_dpp v255.l, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x70,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: 
v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x12,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x0a,0x13,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + 
+v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x13,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +v_div_fixup_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_div_fixup_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x5f,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x5f,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x5f,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x5f,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x5f,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| 
op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x4b,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x4b,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x4b,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x4b,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x4b,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x5e,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x08,0x5e,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x5e,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x5e,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_mad_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x5e,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_mad_i32_i16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0x75,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x01,0x13] + +v_mad_i32_i16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x75,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_mad_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_mad_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x40,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf 
bank_mask:0xf +// GFX13: v_mad_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x40,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x40,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x40,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_mad_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x40,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_mad_u32_u16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0x73,0xd7,0xfa,0x04,0xc2,0x03,0x01,0x60,0x01,0x13] + +v_mad_u32_u16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x73,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x7c,0x54,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x54,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x54,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x54,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x55,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x55,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] 
row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x55,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x55,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_max3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_max3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x55,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_max3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x56,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_max3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x56,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x56,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x56,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + 
+v_max3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_max3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x56,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_med3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x57,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x57,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x57,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x57,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_med3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_med3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x57,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_med3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] 
row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x58,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x58,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x58,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x58,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_med3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_med3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x58,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_med3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x59,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_med3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_med3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x59,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + 
+v_med3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x59,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x59,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x59,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x51,0xd7,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x51,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x51,0xd7,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| 
op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x51,0xd7,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13] + +v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x51,0xd7,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30] + +v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x52,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x52,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x52,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x52,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_min3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_min3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x52,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + 
+v_min3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x53,0xd7,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_min3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0x53,0xd7,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x10,0x53,0xd7,0xfa,0x04,0xf2,0x01,0x01,0x5f,0x01,0x01] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x20,0x53,0xd7,0xfa,0x04,0x06,0x03,0x01,0x60,0x01,0x13] + +v_min3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_min3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x53,0xd7,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30] + +v_pack_b32_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x0a,0x11,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] + +v_minimum_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_minimum_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_minimum_f32 v5, v1, v2 row_mirror +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_half_mirror +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shl:1 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shl:15 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shr:1 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_shr:15 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_ror:1 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_ror:15 +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_minimum_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3c,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_minimum_f32 v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3c,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_minimum_f32 v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x3c,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_minimum_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3c,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_maximum_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_maximum_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_maximum_f32 v5, v1, v2 row_mirror +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_half_mirror +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shl:1 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shl:15 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shr:1 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_shr:15 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_ror:1 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_ror:15 +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_maximum_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_maximum_f32 v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum_f32_e64_dpp v5, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3d,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_maximum_f32 v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: 
v_maximum_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x3d,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_maximum_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3d,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_minimum_f16 v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_mirror +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_shr:15 +// GFX13: 
v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_minimum_f16 v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_minimum_f16 v5.l, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum_f16_e64_dpp v5.l, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3a,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_minimum_f16 v5.l, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum_f16_e64_dpp v5.l, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x3a,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_minimum_f16 v255.l, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3a,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_maximum_f16 v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_mirror +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_half_mirror +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_shl:1 +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_shl:15 +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_shr:1 +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_shr:15 +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_ror:1 +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_ror:15 +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_maximum_f16 v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_maximum_f16 v5.l, |v1.l|, -v2.l 
row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum_f16_e64_dpp v5.l, |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x3b,0xd7,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_maximum_f16 v5.l, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum_f16_e64_dpp v5.l, -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x3b,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_maximum_f16 v255.l, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x3b,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_minimum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimum3_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x2d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimum3_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimum3_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minimum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2d,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimum3_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2d,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimum3_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minimum3_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2d,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimum3_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2d,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimum3_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2d,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minimum3_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2d,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x87,0x2d,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maximum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maximum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximum3_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximum3_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximum3_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maximum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2e,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximum3_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x02,0x2e,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximum3_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maximum3_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2e,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximum3_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2e,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximum3_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2e,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maximum3_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2e,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minimum3_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimum3_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimum3_f16 v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l 
row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimum3_f16 v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimum3_f16 v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimum3_f16 v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimum3_f16 v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimum3_f16 v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_minimum3_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2f,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimum3_f16 v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2f,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimum3_f16 v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_minimum3_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2f,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimum3_f16 v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimum3_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2f,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimum3_f16 v5.l, 
-|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimum3_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2f,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_minimum3_f16 v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2f,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_minimum3_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimum3_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2f,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_maximum3_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maximum3_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximum3_f16 v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximum3_f16 v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximum3_f16 v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x30,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximum3_f16 v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximum3_f16 v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximum3_f16 v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_maximum3_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x30,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximum3_f16 v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x30,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximum3_f16 v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_maximum3_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x30,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximum3_f16 v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximum3_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x30,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximum3_f16 v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximum3_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x30,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_maximum3_f16 v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x06,0x30,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_maximum3_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximum3_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x30,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_maximumminimum_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximumminimum_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximumminimum_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximumminimum_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_maximumminimum_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6d,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximumminimum_f32 v5, v1, 
-|v2|, exec_hi row_ror:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6d,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximumminimum_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_maximumminimum_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6d,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximumminimum_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6d,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximumminimum_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6d,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_maximumminimum_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6d,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximumminimum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6d,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimummaximum_f32 v5, v1, v2, v3 row_mirror +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, v255 row_half_mirror +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, s105 row_shl:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimummaximum_f32 v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimummaximum_f32 v5, |v1|, v2, -ttmp15 row_shr:15 +// GFX13: v_minimummaximum_f32_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimummaximum_f32 v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimummaximum_f32 v5, -v1, v2, |exec_lo| row_ror:15 +// GFX13: v_minimummaximum_f32_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimummaximum_f32 v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: 
v_minimummaximum_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimummaximum_f32 v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6c,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minimummaximum_f32 v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6c,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimummaximum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6c,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6f,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_maximumminimum_f16 v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6f,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_maximumminimum_f16 v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6f,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_maximumminimum_f16 v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6f,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_maximumminimum_f16 v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6f,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_maximumminimum_f16 v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x05,0x6f,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_maximumminimum_f16 v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6f,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + +v_maximumminimum_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_maximumminimum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6f,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, v3.l row_mirror +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX13: 
v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minimummaximum_f16 v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6e,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minimummaximum_f16 v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6e,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minimummaximum_f16 v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6e,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minimummaximum_f16 v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6e,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minimummaximum_f16 v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6e,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01] + +v_minimummaximum_f16 v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6e,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13] + 
+v_minimummaximum_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_minimummaximum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6e,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc3,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, 
-|v2|, exec_hi row_ror:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0xc3,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0xc3,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0xc3,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0xc3,0xd6,0xfa,0x04,0x16,0x52,0x01,0x60,0x09,0x13] + +v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0xc3,0xd6,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_mirror +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_half_mirror +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:15 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:15 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:15 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x08,0x01,0x5f,0x01,0x01] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x74,0xd7,0xfa,0x04,0x02,0x10,0x01,0x60,0x09,0x13] + +v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x74,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 row_mirror +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xc2,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc2,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0xc2,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0xc2,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0xc2,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0xc2,0xd6,0xfa,0x04,0x16,0x52,0x01,0x60,0x09,0x13] + +v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0xc2,0xd6,0xfa,0xfe,0xf7,0x7b,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8-fake16.s new file mode 100644 index 0000000000000..8989147174ca7 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8-fake16.s @@ -0,0 +1,3520 @@ +// NOTE: Assertions have 
been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6d,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_add_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// 
W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x69,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6b,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7b,0x0f,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x68,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_add_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: 
:[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7a,0x0f,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xfc,0x0f,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, 
v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x47,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x47,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x7f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x7f,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x7f,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + 
+v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x16,0xd6,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_alignbit_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x16,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp 
v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x17,0xd6,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x17,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_and_b16_e64_dpp v5, v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x67,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_and_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_and_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x67,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_and_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_and_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x67,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x71,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x71,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_and_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x71,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_ashr_pk_i8_i32 v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ashr_pk_i8_i32_e64_dpp v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x40,0x90,0xd6,0xea,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_ashr_pk_i8_i32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ashr_pk_i8_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x90,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_ashr_pk_u8_i32 v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ashr_pk_u8_i32_e64_dpp v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x40,0x91,0xd6,0xea,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_ashr_pk_u8_i32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ashr_pk_u8_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x91,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_ashrrev_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x08,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_ashrrev_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ashrrev_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x08,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_ashrrev_i16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_ashrrev_i16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x08,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x64,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bcnt_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bcnt_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// 
GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x11,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfe_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x11,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: 
v_bfe_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x10,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfe_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x10,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x12,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x12,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfi_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x12,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:103 op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:0x67 op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x44,0x33,0xd6,0xea,0xfe,0xf7,0xeb,0xff,0x00,0x00,0x00] + +v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:77 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:0x4d dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x01,0x33,0xd6,0xe9,0xfe,0xf7,0xab,0xff,0x00,0x00,0x00] + +v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:102 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:0x66 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x24,0x33,0xd6,0xe9,0x04,0x06,0xcb,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x33,0xd6,0xea,0x04,0x06,0xc3,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:63 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:0x3f dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x07,0x33,0xd6,0xe9,0x04,0xfe,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:88 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:0x58 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7b,0x33,0xd6,0xe9,0x04,0xfe,0x09,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x33,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xfa,0x81,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:99 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:0x63 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x33,0xd6,0xe9,0x04,0xfa,0x69,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x33,0xd6,0xe9,0x04,0xf2,0xa1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x33,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xa6,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, ttmp15 bitop3:15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, ttmp15 bitop3:0xf dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x33,0xd6,0xe9,0x04,0xee,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, v255 bitop3:161 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v255 bitop3:0xa1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xfe,0x37,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, vcc_hi bitop3:100 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, vcc_hi bitop3:0x64 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xae,0x89,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5, v1, v2, vcc_lo bitop3:0 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:88 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:0x58 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x34,0xd6,0xe9,0xfe,0xf7,0x0b,0xff,0x00,0x00,0x00] + +v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] 
+// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0x06,0xc3,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:77 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:0x4d dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x01,0x34,0xd6,0xea,0x04,0xc2,0xab,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:63 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:0x3f dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x07,0x34,0xd6,0xe9,0x04,0xfe,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xfa,0x81,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0xf2,0xa1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xa6,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x34,0xd6,0xe9,0x04,0xee,0xa1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:161 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:0xa1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xfe,0x37,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x34,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:100 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:0x64 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xae,0x89,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo bitop3:0 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] 
dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + 
+v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0c,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0c,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x87,0x0c,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0f,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0f,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0f,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x03,0x0f,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0f,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0f,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0f,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0d,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0d,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0d,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0d,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0d,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0d,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0d,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// 
GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0e,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0e,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0e,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0e,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0e,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0e,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6e,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6e,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x7a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x7a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x7a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x7a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 
+v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x7b,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x7b,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x7b,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f16 v1, v2, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 ; encoding: [0x01,0x00,0x36,0xd7,0xea,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x20,0x36,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x40,0x36,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x60,0x36,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 ; encoding: [0x01,0x00,0x35,0xd7,0xea,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:1 
dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x20,0x35,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x40,0x35,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x60,0x35,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x37,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x37,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x37,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x37,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x38,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x38,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x38,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x38,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x21,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x21,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x21,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6b,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x12,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x12,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x12,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x13,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x13,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x13,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x22,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x22,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x22,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6a,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6a,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x26,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x26,0xd6,0xe9,0xfe,0xf7,0x23,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x12,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x12,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x12,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x68,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x68,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x68,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x13,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x13,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x13,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_div_fixup_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, v2, 
s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5f,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5f,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x5f,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x5f,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x5f,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x5f,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + 
+v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x5f,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x4b,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x4b,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x4b,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x4b,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x4b,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x4b,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x4b,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, |v1|, v2, 
-ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x13,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x13,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x13,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x13,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x13,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x13,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x13,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_ldexp_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 
mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x62,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x62,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x15,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x15,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lerp_u8_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x15,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + 
+v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x46,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x46,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, 
s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6f,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: 
v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6f,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_lshlrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x14,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshlrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshlrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x14,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshlrev_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lshlrev_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x14,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_lshrrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshrrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshrrev_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x39,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshrrev_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lshrrev_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x39,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_mad_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x5e,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x5e,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x5e,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_i32_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, 
v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_i32_i16_e64_dpp 
v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x75,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x75,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0a,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0a,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x40,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x40,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x40,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u32_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp 
v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x73,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x73,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, -1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0b,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x54,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x54,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x54,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x54,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x54,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x54,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x54,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x87,0x2a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_max3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x55,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x55,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1d,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, ttmp15 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x56,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x56,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1e,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1e,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_i16_e64_dpp v5, v1, v2 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0a,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_i16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max_i16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x0a,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_max_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x09,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x09,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + 
+v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x69,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x69,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x69,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x05,0x69,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x69,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x69,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x64,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maxmin_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x64,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + 
+v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x62,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maxmin_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x62,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x66,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x66,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x66,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x65,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x65,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_med3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp 
v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x57,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x57,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x57,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x57,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x57,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x57,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x57,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, v255 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x31,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x31,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x31,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x31,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x31,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, 
-|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x31,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x31,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_med3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] 
+// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x58,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x58,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x20,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x20,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x20,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x59,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x59,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x21,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v255, v255, v255, 
src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x21,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x51,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x51,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x51,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x51,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x51,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x51,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x51,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, 
|v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x29,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x29,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x29,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x29,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x29,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x29,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x29,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_min3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x52,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x52,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x52,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_i32_e64_dpp v5, v1, 
v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x1a,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1a,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1a,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, 
exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x53,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x53,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1b,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0c,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_i16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min_i16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x0c,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_min_u16_e64_dpp v5, v1, v2 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0b,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x68,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x68,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x68,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x68,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x68,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x68,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x68,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 
dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x68,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp 
v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x65,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minmax_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x65,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x63,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x63,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minmax_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x63,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, 
v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x39,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x39,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mul_lo_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x05,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mul_lo_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mul_lo_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x05,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mul_lo_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mul_lo_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x05,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x18,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x18,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + 
+v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x18,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x18,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x18,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x18,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x18,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x72,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x72,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_or3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x72,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_or_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_or_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// 
GFX13: v_or_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x25,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_or_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_or_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_pack_b32_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x11,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x11,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x11,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x44,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x44,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_perm_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x44,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, 
v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 
; encoding: [0x05,0x00,0x23,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x23,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, 
v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x24,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x24,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x25,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x25,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x25,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x22,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x22,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x06,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x69,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6b,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7b,0x10,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x68,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_sub_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x6a,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7a,0x10,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xfc,0x10,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x76,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x76,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x76,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// 
GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x69,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6b,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7b,0x19,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for 
instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x68,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W64: v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7a,0x19,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xfc,0x19,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x45,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x45,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_xad_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x45,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x40,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x40,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_xor3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x40,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x70,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x70,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x70,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x12,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x13,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xff,0x13,0x13,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x5f,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x5f,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x5f,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x5f,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x5f,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x4b,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x4b,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x4b,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x4b,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x4b,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_mad_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x5e,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x5e,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x5e,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x5e,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x5e,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_i32_i16_e64_dpp v5, 
v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x75,0xd7,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x75,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x40,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x40,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x40,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x40,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x40,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x73,0xd7,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + 
+v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x73,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x54,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x54,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x54,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x54,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_max3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x55,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, 
exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x55,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x55,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x55,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_max3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x55,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x56,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x56,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x56,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x56,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_max3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x56,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x57,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x57,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x57,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x57,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x57,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_med3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x58,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x58,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x58,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x58,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_med3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x58,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x59,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x59,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x59,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x59,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x59,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x51,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x51,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x51,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x51,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x51,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_min3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x52,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x52,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x52,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x52,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_min3_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x52,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x53,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x53,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5, v1, v2, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x53,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_min3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x53,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x0a,0x11,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v255, v255, v255 
op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_minimum_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_minimum_f32 v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3c,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_minimum_f32 v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3c,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_minimum_f32 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3c,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_maximum_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_maximum_f32 v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3d,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_maximum_f32 v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3d,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_maximum_f32 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3d,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_minimum_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x3a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_minimum_f16 v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_minimum_f16 v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_minimum_f16 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_maximum_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_maximum_f16 v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f16_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3b,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_maximum_f16 v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum_f16_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3b,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_maximum_f16 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3b,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_minimum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_minimum3_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2d,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2d,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2d,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2d,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2d,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2d,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: 
v_minimum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2d,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maximum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2e,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2e,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2e,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x03,0x2e,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2e,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2e,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minimum3_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x01,0x2f,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2f,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2f,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2f,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2f,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum3_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2f,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum3_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2f,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_maximum3_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x30,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x30,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x30,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x30,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x30,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x30,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum3_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x30,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum3_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x87,0x30,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_maximumminimum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6d,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6d,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6d,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0x05,0x03,0x6d,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6d,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6d,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximumminimum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6d,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minimummaximum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, |v1|, v2, -ttmp15 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6c,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6c,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimummaximum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6c,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maximumminimum_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, v255 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6f,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6f,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6f,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6f,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6f,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: 
v_maximumminimum_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6f,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximumminimum_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6f,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_minimummaximum_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6e,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6e,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 
v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6e,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6e,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6e,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6e,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimummaximum_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6e,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0xc3,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0xc3,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0xc3,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0xc3,0xd6,0xea,0x04,0x16,0x52,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0xc3,0xd6,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x74,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x74,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, 
v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x74,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x74,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0xc2,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x03,0xc2,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0xc2,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0xc2,0xd6,0xea,0x04,0x16,0x52,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0xc2,0xd6,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8.s new file mode 100644 index 0000000000000..654bdbf28f752 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_dpp8.s @@ -0,0 +1,3524 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX13,W64,W64-ASM %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding | FileCheck 
--check-prefixes=GFX13,W64,W64-DIS %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x6d,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6d,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6d,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_add_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x69,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_add_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6b,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: 
invalid operand for instruction + +v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_add_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7b,0x0f,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x68,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x0f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_add_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7a,0x0f,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xfc,0x0f,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x47,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x47,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7f,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x7f,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x7f,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; 
encoding: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_alignbit_b32_e64_dpp v5, v1, v2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x16,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_alignbit_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x16,0xd6,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_alignbit_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_alignbit_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x16,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x17,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_alignbyte_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x17,0xd6,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_alignbyte_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x17,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x67,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_and_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_and_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x67,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_and_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_and_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x67,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x71,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x71,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v5, 
v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_and_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x71,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_and_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_and_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x71,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_ashr_pk_i8_i32 v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ashr_pk_i8_i32_e64_dpp v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x40,0x90,0xd6,0xea,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_ashr_pk_i8_i32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ashr_pk_i8_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x90,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_ashr_pk_u8_i32 v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ashr_pk_u8_i32_e64_dpp v5, v1, v2, s3 op_sel:[0,0,0,1] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x40,0x91,0xd6,0xea,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_ashr_pk_u8_i32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ashr_pk_u8_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x91,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x08,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ashrrev_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x08,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_ashrrev_i16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_ashrrev_i16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x08,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + 
+v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bcnt_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x64,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bcnt_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bcnt_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfe_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x11,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_bfe_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfe_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x11,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfe_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x10,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_bfe_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfe_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x10,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfi_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x12,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_bfi_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] 
fi:0 +// GFX13: v_bfi_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x12,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_bitop3_b16_e64_dpp v255.h, v255.l, v255.l, src_scc bitop3:103 op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_bitop3_b16_e64_dpp v255.h, v255.l, v255.l, src_scc bitop3:0x67 op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x44,0x33,0xd6,0xea,0xfe,0xf7,0xeb,0xff,0x00,0x00,0x00] + +v_bitop3_b16_e64_dpp v255.l, v255.l, v255.l, src_scc bitop3:77 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bitop3_b16_e64_dpp v255.l, v255.l, v255.l, src_scc bitop3:0x4d dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x33,0xd6,0xe9,0xfe,0xf7,0xab,0xff,0x00,0x00,0x00] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:102 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:0x66 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x24,0x33,0xd6,0xe9,0x04,0x06,0xcb,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x33,0xd6,0xea,0x04,0x06,0xc3,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_hi bitop3:63 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_hi 
bitop3:0x3f dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x07,0x33,0xd6,0xe9,0x04,0xfe,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi bitop3:88 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi bitop3:0x58 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7b,0x33,0xd6,0xe9,0x04,0xfe,0x09,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x33,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xfa,0x81,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.h, v2.l, exec_lo bitop3:99 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.h, v2.l, exec_lo bitop3:0x63 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x33,0xd6,0xe9,0x04,0xfa,0x69,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd6,0xe9,0x04,0xf2,0xa1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x33,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xa6,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, ttmp15 bitop3:15 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, ttmp15 bitop3:0xf dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x33,0xd6,0xe9,0x04,0xee,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v255.l bitop3:161 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v255.l bitop3:0xa1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xfe,0x37,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi bitop3:100 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi bitop3:0x64 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x33,0xd6,0xe9,0x04,0xae,0x89,0x01,0x77,0x39,0x05] + +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo bitop3:0 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:88 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_bitop3_b32_e64_dpp v255, v255, v255, src_scc bitop3:0x58 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x34,0xd6,0xe9,0xfe,0xf7,0x0b,0xff,0x00,0x00,0x00] + +v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, -1 bitop3:6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0x06,0xc3,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:77 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, 0.5 bitop3:0x4d dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x01,0x34,0xd6,0xea,0x04,0xc2,0xab,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi bitop3:63 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_hi 
bitop3:0x3f dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x07,0x34,0xd6,0xe9,0x04,0xfe,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, exec_lo bitop3:0x24 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xfa,0x81,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, null bitop3:5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0xf2,0xa1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, s105 bitop3:0x27 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xa6,0xe1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, ttmp15 bitop3:0x15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x34,0xd6,0xe9,0x04,0xee,0xa1,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:161 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v255 bitop3:0xa1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xfe,0x37,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:100 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_hi bitop3:0x64 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x34,0xd6,0xe9,0x04,0xae,0x89,0x01,0x77,0x39,0x05] + +v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo bitop3:0 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_bitop3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 
dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:35: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] + +v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:35: error: invalid 
operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] + +v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:38: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] + +v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32-ERR: :[[@LINE-1]]:38: error: invalid operand for instruction +// W64-ASM: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W64-DIS: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] + +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] + +v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] 
+// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubeid_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0c,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubeid_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0c,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp 
div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0c,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0f,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0f,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0f,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] 
+// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0f,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubema_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0f,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubema_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0f,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0f,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0d,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0d,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0d,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0d,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubesc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0d,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubesc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x0d,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0d,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x0e,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x0e,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x0e,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x0e,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cubetc_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x0e,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x06,0x0e,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x6e,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_bf16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6e,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x7a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x7a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x7a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX13: 
v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x7a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x7b,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x7b,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_pk_bf8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x7b,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 ; encoding: [0x01,0x00,0x36,0xd7,0xea,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x20,0x36,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x40,0x36,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x60,0x36,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 +// GFX13: 
v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 dpp8:[1,2,3,4,5,6,7,0] fi:1 ; encoding: [0x01,0x00,0x35,0xd7,0xea,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:1 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x20,0x35,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:2 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x40,0x35,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] +// GFX13: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:3 dpp8:[1,2,3,4,5,6,7,0] ; encoding: [0x01,0x60,0x35,0xd7,0xe9,0x06,0x02,0x00,0x02,0xd1,0x58,0x1f] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x37,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x37,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x37,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x37,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x38,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x38,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x38,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x38,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x21,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x21,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x21,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6b,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_i16_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x00,0x6b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x12,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x12,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x12,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x13,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x13,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x13,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x22,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x22,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x22,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6a,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6a,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x26,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: 
v_cvt_pk_u8_f32_e64_dpp v255, -|v255|, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x26,0xd6,0xe9,0xfe,0xf7,0x23,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x12,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x12,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x12,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x68,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x68,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_i16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x03,0x68,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x13,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x13,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x13,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] 
+// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5f,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5f,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x5f,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x03,0x5f,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x5f,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x5f,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_div_fixup_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x5f,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// 
GFX13: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x4b,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x4b,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x4b,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x4b,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x4b,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x4b,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x4b,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, 
s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x13,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x13,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x13,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x13,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x13,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_fma_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x13,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] 
fi:0 +// GFX13: v_fma_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x13,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_ldexp_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05] + +v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ldexp_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x62,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x62,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00] + +v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lerp_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x15,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_lerp_u8_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lerp_u8_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x15,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, 
v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshl_add_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x46,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] 
fi:0 +// GFX13: v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x46,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, null 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshl_or_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x6f,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lshl_or_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x6f,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x14,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshlrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x14,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshlrev_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lshlrev_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x14,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_lshrrev_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x39,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_lshrrev_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_lshrrev_b16_e64_dpp v255.l, v255.l, v255.l 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x39,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x5e,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x5e,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_i16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x5e,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, 
exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x75,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.l, v2.l, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x75,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v255, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x75,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, 
vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_i32_i24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0a,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0a,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v3.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 ; encoding: [0x05,0x00,0x40,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_u16_e64_dpp v255.l, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x40,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, 
v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x73,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.l, v2.l, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x73,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v255, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255.l, v255.l, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x73,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, vcc_lo 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0b,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x54,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x54,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x54,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x54,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x54,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, 
v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x54,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x54,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x02,0x2a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// 
GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x55,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x55,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x00,0x1d,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_max3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x56,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x56,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] 
+ +v_max3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1e,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_max3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1e,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0a,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_i16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max_i16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x0a,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_max_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x09,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_max_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x09,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_u16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_max_u16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp 
v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] 
+// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x69,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x69,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x69,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x69,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x69,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] 
+ +v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x69,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + 
+v_maxmin_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x64,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maxmin_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x64,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_maxmin_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maxmin_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x62,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_maxmin_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maxmin_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x62,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x66,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x66,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mbcnt_hi_u32_b32_e64_dpp v255, v255, v255 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x66,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x65,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x65,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_med3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x57,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x57,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x57,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x57,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x57,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x57,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_med3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x57,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x31,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x31,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x31,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x31,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x31,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x31,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x31,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// 
GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x58,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x58,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, ttmp15 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x20,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_med3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x20,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x59,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x59,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x59,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 +// GFX13: v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x21,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x21,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x51,0xd7,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x51,0xd7,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + 
+v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x51,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x51,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x51,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x51,0xd7,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x51,0xd7,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x29,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x29,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x29,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x29,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x29,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x29,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x29,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_min3_i16_e64_dpp v5.l, 
v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x52,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, 
v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x52,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_i16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x52,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, 
exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1a,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_min3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1a,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x53,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x53,0xd7,0xea,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_u16_e64_dpp v255.l, v255.l, v255.l, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x53,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x1b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x1b,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_min3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min_i16_e64_dpp v5.l, v1.l, 
v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0c,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_i16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min_i16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x0c,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_min_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0b,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_min_u16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_min_u16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] 
+// GFX13: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x68,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x68,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x68,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, 
-|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x68,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_num_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x68,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_num_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x68,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX13: v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x68,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x65,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_i32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x65,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_minmax_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minmax_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x65,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + 
+v_minmax_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minmax_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x63,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_minmax_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// 
GFX13: v_minmax_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x63,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x39,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_msad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x39,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_msad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x39,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x05,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mul_lo_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x05,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_mul_lo_u16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mul_lo_u16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x05,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + 
+v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x18,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x18,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x18,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x18,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mullit_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x18,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_mullit_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x18,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x87,0x18,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_or3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x72,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_or3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x72,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_or3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_or3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x72,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_or_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_or_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x25,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_or_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_or_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_pack_b32_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x11,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x11,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.l| 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x11,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, 
v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_perm_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x44,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_perm_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_perm_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x44,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_hi_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x23,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x23,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + 
+v_sad_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_u16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x24,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x24,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, v255 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x25,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_u32_e64_dpp v255, v255, v255, 
src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x25,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x22,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sad_u8_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x22,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sad_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x22,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_sub_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x69,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_sub_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6b,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_sub_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x7b,0x10,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction + +v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x68,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x10,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32-ERR: :[[@LINE-1]]:26: error: invalid operand for instruction +// W64: v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7a,0x10,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xfc,0x10,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l 
clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x76,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x76,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x76,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x69,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: 
:[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_subrev_co_u32_e64_dpp v5, vcc_hi, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6b,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_subrev_co_u32_e64_dpp v5, ttmp15, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7b,0x19,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction + +v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x68,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x6a,0x19,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32-ERR: :[[@LINE-1]]:29: error: invalid operand for instruction +// W64: v_subrev_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x7a,0x19,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xfc,0x19,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] 
+ +v_xad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_xad_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x45,0xd7,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_xad_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_xad_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x45,0xd7,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, ttmp15 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, exec_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, -1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_xor3_b32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x40,0xd6,0xea,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_xor3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_xor3_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x40,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x70,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_xor_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_xor_b16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x70,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_xor_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_xor_b16_e64_dpp v255.l, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x00,0x70,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_cvt_pk_norm_i16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x12,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x13,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_cvt_pk_norm_u16_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x13,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_div_fixup_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x5f,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x5f,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x5f,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + 
+v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x5f,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x5f,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x4b,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x4b,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x4b,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x4b,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x4b,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi 
op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x5e,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x5e,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x5e,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x5e,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x5e,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_i32_i16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_i32_i16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x75,0xd7,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_i32_i16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_i32_i16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x75,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x78,0x40,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x40,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x40,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x40,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_mad_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x40,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_mad_u32_u16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_mad_u32_u16_e64_dpp v5, v1.h, v2.l, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x73,0xd7,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05] + +v_mad_u32_u16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_mad_u32_u16_e64_dpp v255, v255.l, v255.h, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x73,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x54,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x54,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x54,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x54,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x55,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x55,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x55,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x20,0x55,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_max3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x55,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_max3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x56,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x56,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x56,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_max3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x56,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_max3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_max3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x56,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x57,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x57,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x57,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x57,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_med3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_med3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x57,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_med3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x58,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x58,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x58,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x20,0x58,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_med3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x58,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_med3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x59,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x59,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x59,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x59,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x59,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x51,0xd7,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] 
dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x51,0xd7,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x51,0xd7,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x51,0xd7,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x51,0xd7,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x52,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x52,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x52,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x20,0x52,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_min3_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x52,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_min3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd7,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.h, v2.l, exec_lo op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x53,0xd7,0xe9,0x04,0xfa,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.h, null op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x53,0xd7,0xe9,0x04,0xf2,0x01,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_min3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x53,0xd7,0xe9,0x04,0x06,0x03,0x01,0x77,0x39,0x05] + +v_min3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_min3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x53,0xd7,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] + +v_pack_b32_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_pack_b32_f16_e64_dpp v5, -v1.h, |v2.l| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x11,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 +// 
GFX13: v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX13: 
v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_minimum_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_minimum_f32 v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3c,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_minimum_f32 v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3c,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_minimum_f32 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3c,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_maximum_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_maximum_f32 v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3d,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_maximum_f32 v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3d,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_maximum_f32 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3d,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_minimum_f16 v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x3a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_minimum_f16 v5.l, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum_f16_e64_dpp v5.l, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_minimum_f16 v5.l, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum_f16_e64_dpp v5.l, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_minimum_f16 v255.l, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_maximum_f16 v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_maximum_f16 v5.l, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum_f16_e64_dpp v5.l, |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x3b,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_maximum_f16 v5.l, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum_f16_e64_dpp v5.l, -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x3b,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_maximum_f16 v255.l, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x3b,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_minimum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 
+ +v_minimum3_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2d,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2d,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2d,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2d,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2d,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2d,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minimum3_f32 v255, -|v255|, 
-|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2d,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maximum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2e,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2e,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2e,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_maximum3_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2e,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2e,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximum3_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2e,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum3_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minimum3_f16 v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimum3_f16 
v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2f,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2f,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2f,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2f,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimum3_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2f,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2f,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_minimum3_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimum3_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2f,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_maximum3_f16 v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, v255.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x30,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x30,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x30,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x30,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximum3_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x30,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: 
v_maximum3_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x30,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_maximum3_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximum3_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x30,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_maximumminimum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6d,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6d,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 
v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6d,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6d,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6d,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximumminimum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6d,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximumminimum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6d,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minimummaximum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_hi 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f32_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6c,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimummaximum_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6c,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimummaximum_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x87,0x6c,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6f,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6f,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6f,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, -|v1.l|, -|v2.l|, 
null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6f,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6f,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_maximumminimum_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6f,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_maximumminimum_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_maximumminimum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6f,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: 
v_minimummaximum_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6e,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6e,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6e,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6e,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6e,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_minimummaximum_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6e,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05] + +v_minimummaximum_f16 v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_minimummaximum_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6e,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, 
v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc3,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0xc3,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0xc3,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0xc3,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0xc3,0xd6,0xea,0x04,0x16,0x52,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_f16_f32_e64_dpp 
v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_sr_pk_f16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0xc3,0xd6,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x74,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x74,0xd7,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05] + +v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x74,0xd7,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05] + +v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x74,0xd7,0xe9,0xfe,0x03,0x38,0xff,0x00,0x00,0x00] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xc2,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc2,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0xc2,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0xc2,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0xc2,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0xc2,0xd6,0xea,0x04,0x16,0x52,0x01,0x77,0x39,0x05] + +v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_cvt_sr_pk_bf16_f32_e64_dpp v255, -|v255|, -|v255|, src_scc clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0xc2,0xd6,0xe9,0xfe,0xf7,0x7b,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1-fake16.s new file mode 100644 index 0000000000000..2f858bb7fa486 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1-fake16.s @@ -0,0 +1,4106 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -show-encoding %s | 
FileCheck --check-prefixes=GFX13,GFX13-ASM %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX13,GFX13-DIS %s + +v_bfrev_b32_e64 v5, v1 +// GFX13: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x01,0x02] + +v_bfrev_b32_e64 v5, v255 +// GFX13: v_bfrev_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb8,0xd5,0xff,0x01,0x01,0x02] + +v_bfrev_b32_e64 v5, s1 +// GFX13: v_bfrev_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, s105 +// GFX13: v_bfrev_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb8,0xd5,0x69,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, vcc_lo +// GFX13: v_bfrev_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x6a,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, vcc_hi +// GFX13: v_bfrev_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x6b,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, ttmp15 +// GFX13: v_bfrev_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb8,0xd5,0x7b,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, m0 +// GFX13: v_bfrev_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb8,0xd5,0x7d,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, exec_lo +// GFX13: v_bfrev_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x7e,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, exec_hi +// GFX13: v_bfrev_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x7f,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, null +// GFX13: v_bfrev_b32_e64 v5, null ; encoding: [0x05,0x00,0xb8,0xd5,0x7c,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, -1 +// GFX13: v_bfrev_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb8,0xd5,0xc1,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, 0.5 +// GFX13: v_bfrev_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb8,0xd5,0xf0,0x00,0x01,0x02] + +v_bfrev_b32_e64 v5, src_scc +// GFX13: v_bfrev_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb8,0xd5,0xfd,0x00,0x01,0x02] + +v_bfrev_b32_e64 v255, 0xaf123456 +// GFX13: 
v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ceil_f16_e64 v5, v1 +// GFX13: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x01,0x02] + +v_ceil_f16_e64 v5, v255 +// GFX13: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x01,0x02] + +v_ceil_f16_e64 v5, s1 +// GFX13: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, s105 +// GFX13: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, vcc_lo +// GFX13: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, vcc_hi +// GFX13: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, ttmp15 +// GFX13: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, m0 +// GFX13: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, exec_lo +// GFX13: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, exec_hi +// GFX13: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, null +// GFX13: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, -1 +// GFX13: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x01,0x02] + +v_ceil_f16_e64 v5, 0.5 mul:2 +// GFX13: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x01,0x0a] + +v_ceil_f16_e64 v5, src_scc mul:4 +// GFX13: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x01,0x12] + +v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_ceil_f32_e64 v5, v1 +// GFX13: v_ceil_f32_e64 v5, v1 
; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x01,0x01,0x02] + +v_ceil_f32_e64 v5, v255 +// GFX13: v_ceil_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa2,0xd5,0xff,0x01,0x01,0x02] + +v_ceil_f32_e64 v5, s1 +// GFX13: v_ceil_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, s105 +// GFX13: v_ceil_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa2,0xd5,0x69,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, vcc_lo +// GFX13: v_ceil_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x6a,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, vcc_hi +// GFX13: v_ceil_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x6b,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, ttmp15 +// GFX13: v_ceil_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa2,0xd5,0x7b,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, m0 +// GFX13: v_ceil_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa2,0xd5,0x7d,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, exec_lo +// GFX13: v_ceil_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x7e,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, exec_hi +// GFX13: v_ceil_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x7f,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, null +// GFX13: v_ceil_f32_e64 v5, null ; encoding: [0x05,0x00,0xa2,0xd5,0x7c,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, -1 +// GFX13: v_ceil_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa2,0xd5,0xc1,0x00,0x01,0x02] + +v_ceil_f32_e64 v5, 0.5 mul:2 +// GFX13: v_ceil_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa2,0xd5,0xf0,0x00,0x01,0x0a] + +v_ceil_f32_e64 v5, src_scc mul:4 +// GFX13: v_ceil_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa2,0xd5,0xfd,0x00,0x01,0x12] + +v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa2,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_ceil_f64_e64 v[5:6], v[1:2] +// GFX13: v_ceil_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0x98,0xd5,0x01,0x01,0x01,0x02] + +v_ceil_f64_e64 v[5:6], v[254:255] +// GFX13: v_ceil_f64_e64 v[5:6], v[254:255] ; 
encoding: [0x05,0x00,0x98,0xd5,0xfe,0x01,0x01,0x02] + +v_ceil_f64_e64 v[5:6], s[2:3] +// GFX13: v_ceil_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0x98,0xd5,0x02,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], s[104:105] +// GFX13: v_ceil_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0x98,0xd5,0x68,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], vcc +// GFX13: v_ceil_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_ceil_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0x98,0xd5,0x7a,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], exec +// GFX13: v_ceil_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0x98,0xd5,0x7e,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], null +// GFX13: v_ceil_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0x98,0xd5,0x7c,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], -1 +// GFX13: v_ceil_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0x98,0xd5,0xc1,0x00,0x01,0x02] + +v_ceil_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_ceil_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x98,0xd5,0xf0,0x00,0x01,0x0a] + +v_ceil_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_ceil_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0x98,0xd5,0xfd,0x00,0x01,0x32] + +v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x98,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cls_i32_e64 v5, v1 +// GFX13: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x01,0x02] + +v_cls_i32_e64 v5, v255 +// GFX13: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x01,0x02] + +v_cls_i32_e64 v5, s1 +// GFX13: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x01,0x02] + +v_cls_i32_e64 v5, s105 +// GFX13: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x01,0x02] + +v_cls_i32_e64 v5, vcc_lo +// GFX13: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x01,0x02] + +v_cls_i32_e64 
v5, vcc_hi +// GFX13: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x01,0x02] + +v_cls_i32_e64 v5, ttmp15 +// GFX13: v_cls_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbb,0xd5,0x7b,0x00,0x01,0x02] + +v_cls_i32_e64 v5, m0 +// GFX13: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x01,0x02] + +v_cls_i32_e64 v5, exec_lo +// GFX13: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x01,0x02] + +v_cls_i32_e64 v5, exec_hi +// GFX13: v_cls_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x7f,0x00,0x01,0x02] + +v_cls_i32_e64 v5, null +// GFX13: v_cls_i32_e64 v5, null ; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x01,0x02] + +v_cls_i32_e64 v5, -1 +// GFX13: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x01,0x02] + +v_cls_i32_e64 v5, 0.5 +// GFX13: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x01,0x02] + +v_cls_i32_e64 v5, src_scc +// GFX13: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x01,0x02] + +v_cls_i32_e64 v255, 0xaf123456 +// GFX13: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_clz_i32_u32_e64 v5, v1 +// GFX13: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x01,0x02] + +v_clz_i32_u32_e64 v5, v255 +// GFX13: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x01,0x02] + +v_clz_i32_u32_e64 v5, s1 +// GFX13: v_clz_i32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, s105 +// GFX13: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, vcc_lo +// GFX13: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, vcc_hi +// GFX13: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, ttmp15 +// GFX13: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xb9,0xd5,0x7b,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, m0 +// GFX13: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, exec_lo +// GFX13: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, exec_hi +// GFX13: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, null +// GFX13: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, -1 +// GFX13: v_clz_i32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, 0.5 +// GFX13: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v5, src_scc +// GFX13: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x01,0x02] + +v_clz_i32_u32_e64 v255, 0xaf123456 +// GFX13: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cos_bf16_e64 v5, -1 +// GFX13: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, exec_hi +// GFX13: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, exec_lo +// GFX13: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, m0 +// GFX13: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, null +// GFX13: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, s1 +// GFX13: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, s105 +// GFX13: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, ttmp15 +// GFX13: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x01,0x02] + 
+v_cos_bf16_e64 v5, v1 +// GFX13: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x01,0x02] + +v_cos_bf16_e64 v5, v255 +// GFX13: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x01,0x02] + +v_cos_bf16_e64 v5, vcc_hi +// GFX13: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x01,0x02] + +v_cos_bf16_e64 v5, vcc_lo +// GFX13: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x01,0x02] + +v_cos_f16_e64 v5, v1 +// GFX13: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x01,0x02] + +v_cos_f16_e64 v5, v255 +// GFX13: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x01,0x02] + +v_cos_f16_e64 v5, s1 +// GFX13: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x01,0x02] + +v_cos_f16_e64 v5, s105 +// GFX13: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x01,0x02] + +v_cos_f16_e64 v5, vcc_lo +// GFX13: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x01,0x02] + +v_cos_f16_e64 v5, vcc_hi +// GFX13: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x01,0x02] + +v_cos_f16_e64 v5, ttmp15 +// GFX13: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x01,0x02] + +v_cos_f16_e64 v5, m0 +// GFX13: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x01,0x02] + +v_cos_f16_e64 v5, exec_lo +// GFX13: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x01,0x02] + +v_cos_f16_e64 v5, exec_hi +// GFX13: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x01,0x02] + +v_cos_f16_e64 v5, null +// GFX13: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x01,0x02] + +v_cos_f16_e64 v5, -1 +// GFX13: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x01,0x02] + +v_cos_f16_e64 v5, 0.5 mul:2 +// GFX13: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x01,0x0a] + +v_cos_f16_e64 v5, src_scc 
mul:4 +// GFX13: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x01,0x12] + +v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_cos_f32_e64 v5, v1 +// GFX13: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x01,0x02] + +v_cos_f32_e64 v5, v255 +// GFX13: v_cos_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb6,0xd5,0xff,0x01,0x01,0x02] + +v_cos_f32_e64 v5, s1 +// GFX13: v_cos_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x00,0x01,0x02] + +v_cos_f32_e64 v5, s105 +// GFX13: v_cos_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb6,0xd5,0x69,0x00,0x01,0x02] + +v_cos_f32_e64 v5, vcc_lo +// GFX13: v_cos_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x6a,0x00,0x01,0x02] + +v_cos_f32_e64 v5, vcc_hi +// GFX13: v_cos_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x6b,0x00,0x01,0x02] + +v_cos_f32_e64 v5, ttmp15 +// GFX13: v_cos_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb6,0xd5,0x7b,0x00,0x01,0x02] + +v_cos_f32_e64 v5, m0 +// GFX13: v_cos_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb6,0xd5,0x7d,0x00,0x01,0x02] + +v_cos_f32_e64 v5, exec_lo +// GFX13: v_cos_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x7e,0x00,0x01,0x02] + +v_cos_f32_e64 v5, exec_hi +// GFX13: v_cos_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x7f,0x00,0x01,0x02] + +v_cos_f32_e64 v5, null +// GFX13: v_cos_f32_e64 v5, null ; encoding: [0x05,0x00,0xb6,0xd5,0x7c,0x00,0x01,0x02] + +v_cos_f32_e64 v5, -1 +// GFX13: v_cos_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb6,0xd5,0xc1,0x00,0x01,0x02] + +v_cos_f32_e64 v5, 0.5 mul:2 +// GFX13: v_cos_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb6,0xd5,0xf0,0x00,0x01,0x0a] + +v_cos_f32_e64 v5, src_scc mul:4 +// GFX13: v_cos_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb6,0xd5,0xfd,0x00,0x01,0x12] + +v_cos_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_cos_f32_e64 v255, 
-|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb6,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_ctz_i32_b32_e64 v5, v1 +// GFX13: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x01,0x02] + +v_ctz_i32_b32_e64 v5, v255 +// GFX13: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x01,0x02] + +v_ctz_i32_b32_e64 v5, s1 +// GFX13: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, s105 +// GFX13: v_ctz_i32_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xba,0xd5,0x69,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, vcc_lo +// GFX13: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, vcc_hi +// GFX13: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, ttmp15 +// GFX13: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, m0 +// GFX13: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, exec_lo +// GFX13: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, exec_hi +// GFX13: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, null +// GFX13: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, -1 +// GFX13: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, 0.5 +// GFX13: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xba,0xd5,0xf0,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v5, src_scc +// GFX13: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x01,0x02] + +v_ctz_i32_b32_e64 v255, 0xaf123456 +// GFX13: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_f16_bf8 v1, 
v2 byte_sel:2 +// GFX13: v_cvt_f16_bf8_e64 v1, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf8,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_bf8 v1, v2 byte_sel:1 +// GFX13: v_cvt_f16_bf8_e64 v1, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf8,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_bf8 v1, v2 byte_sel:3 +// GFX13: v_cvt_f16_bf8_e64 v1, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf8,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_bf8 v150, 0x1234 +// GFX13: v_cvt_f16_bf8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf8,0xd5,0xff,0x00,0x01,0x02,0x34,0x12,0x00,0x00] + +v_cvt_f16_bf8 v150, 2 +// GFX13: v_cvt_f16_bf8_e64 v150, 2 ; encoding: [0x96,0x00,0xf8,0xd5,0x82,0x00,0x01,0x02] + +v_cvt_f16_bf8 v150, s2 +// GFX13: v_cvt_f16_bf8_e64 v150, s2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_f16_bf8 v150, v2 +// GFX13: v_cvt_f16_bf8_e64 v150, v2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_fp8 v1, v2 byte_sel:2 +// GFX13: v_cvt_f16_fp8_e64 v1, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf7,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_fp8 v1, v2 byte_sel:1 +// GFX13: v_cvt_f16_fp8_e64 v1, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf7,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_fp8 v1, v2 byte_sel:3 +// GFX13: v_cvt_f16_fp8_e64 v1, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf7,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f16_fp8 v150, 0x1234 +// GFX13: v_cvt_f16_fp8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf7,0xd5,0xff,0x00,0x01,0x02,0x34,0x12,0x00,0x00] + +v_cvt_f16_fp8 v150, 2 +// GFX13: v_cvt_f16_fp8_e64 v150, 2 ; encoding: [0x96,0x00,0xf7,0xd5,0x82,0x00,0x01,0x02] + +v_cvt_f16_fp8 v150, s2 +// GFX13: v_cvt_f16_fp8_e64 v150, s2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_f16_fp8 v150, v2 +// GFX13: v_cvt_f16_fp8_e64 v150, v2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, -1 +// GFX13: v_cvt_f32_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf2,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, exec_hi +// GFX13: v_cvt_f32_bf16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xf2,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, exec_lo +// GFX13: v_cvt_f32_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, m0 +// GFX13: v_cvt_f32_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf2,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, null +// GFX13: v_cvt_f32_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf2,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, s1 +// GFX13: v_cvt_f32_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, s105 +// GFX13: v_cvt_f32_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf2,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, ttmp15 +// GFX13: v_cvt_f32_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf2,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, v1 +// GFX13: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, v255 +// GFX13: v_cvt_f32_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, vcc_hi +// GFX13: v_cvt_f32_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf2,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_bf16_e64 v5, vcc_lo +// GFX13: v_cvt_f32_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf2,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, s3 +// GFX13: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 +// GFX13: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 +// GFX13: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 +// GFX13: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX13: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 +// GFX13: 
v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 +// GFX13: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 +// GFX13: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX13: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 +// GFX13: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 +// GFX13: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 +// GFX13: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX13: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 +// GFX13: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 +// GFX13: v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 +// GFX13: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX13: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 +// GFX13: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 +// GFX13: v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 +// GFX13: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, v3 
+// GFX13: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 +// GFX13: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 +// GFX13: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 +// GFX13: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f16_bf8 v1, s2 op_sel:[1] +// GFX13: v_cvt_pk_f16_bf8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_pk_f16_bf8 v1, v150 +// GFX13: v_cvt_pk_f16_bf8 v1, v150 ; encoding: [0x01,0x00,0xf6,0xd5,0x96,0x01,0x01,0x02] + +v_cvt_pk_f16_bf8 v1, v150 op_sel:[1] +// GFX13: v_cvt_pk_f16_bf8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x96,0x01,0x01,0x02] + +v_cvt_pk_f16_bf8 v1, v2 op_sel:[1] +// GFX13: v_cvt_pk_f16_bf8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] +// GFX13: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_pk_f16_fp8 v1, v150 +// GFX13: v_cvt_pk_f16_fp8 v1, v150 ; encoding: [0x01,0x00,0xf5,0xd5,0x96,0x01,0x01,0x02] + +v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] +// GFX13: v_cvt_pk_f16_fp8 v1, v150 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x96,0x01,0x01,0x02] + +v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] +// GFX13: v_cvt_pk_f16_fp8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX13: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX13: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: 
[0x02,0x00,0xef,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX13: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX13: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX13: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX13: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX13: v_cvt_pk_f32_bf8_e64 v[3:4], s3 ; encoding: [0x03,0x00,0xef,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] ; encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] ; encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX13: v_cvt_pk_f32_bf8_e64 v[3:4], v3 ; encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX13: 
v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] ; encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX13: v_cvt_pk_f32_fp8_e64 v[3:4], s3 ; encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX13: v_cvt_pk_f32_fp8_e64 v[3:4], 3 ; encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] ; encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX13: v_cvt_pk_f32_fp8_e64 v[3:4], v3 ; encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX13: v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] ; encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x01,0x02] + +v_cvt_f16_f32_e64 v5, v1 +// GFX13: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f16_f32_e64 v5, v255 +// GFX13: v_cvt_f16_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8a,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f16_f32_e64 v5, s1 +// GFX13: v_cvt_f16_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, s105 +// GFX13: v_cvt_f16_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, vcc_lo +// GFX13: v_cvt_f16_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, vcc_hi +// GFX13: v_cvt_f16_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, ttmp15 +// GFX13: v_cvt_f16_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, m0 +// GFX13: v_cvt_f16_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, exec_lo +// GFX13: v_cvt_f16_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, exec_hi +// GFX13: v_cvt_f16_f32_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0x8a,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, null +// GFX13: v_cvt_f16_f32_e64 v5, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, -1 +// GFX13: v_cvt_f16_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f16_f32_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f16_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f16_f32_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f16_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_cvt_f16_i16_e64 v5, v1 +// GFX13: v_cvt_f16_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f16_i16_e64 v5, v255 +// GFX13: v_cvt_f16_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f16_i16_e64 v5, s1 +// GFX13: v_cvt_f16_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, s105 +// GFX13: v_cvt_f16_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, vcc_lo +// GFX13: v_cvt_f16_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, vcc_hi +// GFX13: v_cvt_f16_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, ttmp15 +// GFX13: v_cvt_f16_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, m0 +// GFX13: v_cvt_f16_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xd1,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, exec_lo +// GFX13: v_cvt_f16_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, exec_hi +// GFX13: v_cvt_f16_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, 
null +// GFX13: v_cvt_f16_i16_e64 v5, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, -1 +// GFX13: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f16_i16_e64 v5, 0.5 mul:2 +// GFX13-ASM: v_cvt_f16_i16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xf0,0x00,0x01,0x0a] +// GFX13-DIS: v_cvt_f16_i16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x01,0x0a,0x00,0x38,0x00,0x00] + +v_cvt_f16_i16_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 +// GFX13: v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_cvt_f16_u16_e64 v5, v1 +// GFX13: v_cvt_f16_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f16_u16_e64 v5, v255 +// GFX13: v_cvt_f16_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f16_u16_e64 v5, s1 +// GFX13: v_cvt_f16_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, s105 +// GFX13: v_cvt_f16_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xd0,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, vcc_lo +// GFX13: v_cvt_f16_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, vcc_hi +// GFX13: v_cvt_f16_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, ttmp15 +// GFX13: v_cvt_f16_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, m0 +// GFX13: v_cvt_f16_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, exec_lo +// GFX13: v_cvt_f16_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, exec_hi +// GFX13: v_cvt_f16_u16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xd0,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, null +// GFX13: v_cvt_f16_u16_e64 v5, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, -1 +// GFX13: v_cvt_f16_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f16_u16_e64 v5, 0.5 mul:2 +// GFX13-ASM: v_cvt_f16_u16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xf0,0x00,0x01,0x0a] +// GFX13-DIS: v_cvt_f16_u16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x01,0x0a,0x00,0x38,0x00,0x00] + +v_cvt_f16_u16_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 +// GFX13: v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x01,0x1a,0x0b,0xfe,0x00,0x00] + +v_cvt_f32_f16_e64 v5, v1 +// GFX13: v_cvt_f32_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_f16_e64 v5, v255 +// GFX13: v_cvt_f32_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x8b,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_f16_e64 v5, s1 +// GFX13: v_cvt_f32_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, s105 +// GFX13: v_cvt_f32_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x8b,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, vcc_lo +// GFX13: v_cvt_f32_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, vcc_hi +// GFX13: v_cvt_f32_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, ttmp15 +// GFX13: v_cvt_f32_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8b,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, m0 +// GFX13: v_cvt_f32_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x8b,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, exec_lo +// GFX13: v_cvt_f32_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, exec_hi +// 
GFX13: v_cvt_f32_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, null +// GFX13: v_cvt_f32_f16_e64 v5, null ; encoding: [0x05,0x00,0x8b,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, -1 +// GFX13: v_cvt_f32_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x8b,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_f16_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8b,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f32_f16_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f32_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8b,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0x8b,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_cvt_f32_f64_e64 v5, v[1:2] +// GFX13: v_cvt_f32_f64_e64 v5, v[1:2] ; encoding: [0x05,0x00,0x8f,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_f64_e64 v5, v[254:255] +// GFX13: v_cvt_f32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x8f,0xd5,0xfe,0x01,0x01,0x02] + +v_cvt_f32_f64_e64 v5, s[2:3] +// GFX13: v_cvt_f32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x8f,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, s[104:105] +// GFX13: v_cvt_f32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x8f,0xd5,0x68,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, vcc +// GFX13: v_cvt_f32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x8f,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, ttmp[14:15] +// GFX13: v_cvt_f32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x8f,0xd5,0x7a,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, exec +// GFX13: v_cvt_f32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x8f,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, null +// GFX13: v_cvt_f32_f64_e64 v5, null ; encoding: [0x05,0x00,0x8f,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, -1 +// GFX13: v_cvt_f32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x8f,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_f64_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_f64_e64 v5, 0.5 mul:2 
; encoding: [0x05,0x00,0x8f,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 +// GFX13: v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 ; encoding: [0x05,0x01,0x8f,0xd5,0xfd,0x00,0x01,0x32] + +v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x8f,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f32_i32_e64 v5, v1 +// GFX13: v_cvt_f32_i32_e64 v5, v1 ; encoding: [0x05,0x00,0x85,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_i32_e64 v5, v255 +// GFX13: v_cvt_f32_i32_e64 v5, v255 ; encoding: [0x05,0x00,0x85,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_i32_e64 v5, s1 +// GFX13: v_cvt_f32_i32_e64 v5, s1 ; encoding: [0x05,0x00,0x85,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, s105 +// GFX13: v_cvt_f32_i32_e64 v5, s105 ; encoding: [0x05,0x00,0x85,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, vcc_lo +// GFX13: v_cvt_f32_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, vcc_hi +// GFX13: v_cvt_f32_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, ttmp15 +// GFX13: v_cvt_f32_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, m0 +// GFX13: v_cvt_f32_i32_e64 v5, m0 ; encoding: [0x05,0x00,0x85,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, exec_lo +// GFX13: v_cvt_f32_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x85,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, exec_hi +// GFX13: v_cvt_f32_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x85,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, null +// GFX13: v_cvt_f32_i32_e64 v5, null ; encoding: [0x05,0x00,0x85,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, -1 +// GFX13: v_cvt_f32_i32_e64 v5, -1 ; encoding: [0x05,0x00,0x85,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_i32_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_i32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x85,0xd5,0xf0,0x00,0x01,0x0a] + 
+v_cvt_f32_i32_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f32_i32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x85,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x85,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f32_u32_e64 v5, v1 +// GFX13: v_cvt_f32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_u32_e64 v5, v255 +// GFX13: v_cvt_f32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0x86,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_u32_e64 v5, s1 +// GFX13: v_cvt_f32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, s105 +// GFX13: v_cvt_f32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0x86,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, vcc_lo +// GFX13: v_cvt_f32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, vcc_hi +// GFX13: v_cvt_f32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, ttmp15 +// GFX13: v_cvt_f32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x86,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, m0 +// GFX13: v_cvt_f32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0x86,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, exec_lo +// GFX13: v_cvt_f32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x86,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, exec_hi +// GFX13: v_cvt_f32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x86,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, null +// GFX13: v_cvt_f32_u32_e64 v5, null ; encoding: [0x05,0x00,0x86,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, -1 +// GFX13: v_cvt_f32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0x86,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_u32_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_u32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x86,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f32_u32_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f32_u32_e64 v5, 
src_scc mul:4 ; encoding: [0x05,0x00,0x86,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x86,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte0_e64 v5, v1 +// GFX13: v_cvt_f32_ubyte0_e64 v5, v1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, v255 +// GFX13: v_cvt_f32_ubyte0_e64 v5, v255 ; encoding: [0x05,0x00,0x91,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, s1 +// GFX13: v_cvt_f32_ubyte0_e64 v5, s1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, s105 +// GFX13: v_cvt_f32_ubyte0_e64 v5, s105 ; encoding: [0x05,0x00,0x91,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, vcc_lo +// GFX13: v_cvt_f32_ubyte0_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x91,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, vcc_hi +// GFX13: v_cvt_f32_ubyte0_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x91,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, ttmp15 +// GFX13: v_cvt_f32_ubyte0_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x91,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, m0 +// GFX13: v_cvt_f32_ubyte0_e64 v5, m0 ; encoding: [0x05,0x00,0x91,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, exec_lo +// GFX13: v_cvt_f32_ubyte0_e64 v5, exec_lo ; encoding: [0x05,0x00,0x91,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, exec_hi +// GFX13: v_cvt_f32_ubyte0_e64 v5, exec_hi ; encoding: [0x05,0x00,0x91,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, null +// GFX13: v_cvt_f32_ubyte0_e64 v5, null ; encoding: [0x05,0x00,0x91,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, -1 +// GFX13: v_cvt_f32_ubyte0_e64 v5, -1 ; encoding: [0x05,0x00,0x91,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x91,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 +// GFX13: 
v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x91,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x91,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte1_e64 v5, v1 +// GFX13: v_cvt_f32_ubyte1_e64 v5, v1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, v255 +// GFX13: v_cvt_f32_ubyte1_e64 v5, v255 ; encoding: [0x05,0x00,0x92,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, s1 +// GFX13: v_cvt_f32_ubyte1_e64 v5, s1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, s105 +// GFX13: v_cvt_f32_ubyte1_e64 v5, s105 ; encoding: [0x05,0x00,0x92,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, vcc_lo +// GFX13: v_cvt_f32_ubyte1_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x92,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, vcc_hi +// GFX13: v_cvt_f32_ubyte1_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x92,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, ttmp15 +// GFX13: v_cvt_f32_ubyte1_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x92,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, m0 +// GFX13: v_cvt_f32_ubyte1_e64 v5, m0 ; encoding: [0x05,0x00,0x92,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, exec_lo +// GFX13: v_cvt_f32_ubyte1_e64 v5, exec_lo ; encoding: [0x05,0x00,0x92,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, exec_hi +// GFX13: v_cvt_f32_ubyte1_e64 v5, exec_hi ; encoding: [0x05,0x00,0x92,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, null +// GFX13: v_cvt_f32_ubyte1_e64 v5, null ; encoding: [0x05,0x00,0x92,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, -1 +// GFX13: v_cvt_f32_ubyte1_e64 v5, -1 ; encoding: [0x05,0x00,0x92,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x92,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f32_ubyte1_e64 v5, src_scc 
mul:4 +// GFX13: v_cvt_f32_ubyte1_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x92,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x92,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte2_e64 v5, v1 +// GFX13: v_cvt_f32_ubyte2_e64 v5, v1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, v255 +// GFX13: v_cvt_f32_ubyte2_e64 v5, v255 ; encoding: [0x05,0x00,0x93,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, s1 +// GFX13: v_cvt_f32_ubyte2_e64 v5, s1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, s105 +// GFX13: v_cvt_f32_ubyte2_e64 v5, s105 ; encoding: [0x05,0x00,0x93,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, vcc_lo +// GFX13: v_cvt_f32_ubyte2_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x93,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, vcc_hi +// GFX13: v_cvt_f32_ubyte2_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x93,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, ttmp15 +// GFX13: v_cvt_f32_ubyte2_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x93,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, m0 +// GFX13: v_cvt_f32_ubyte2_e64 v5, m0 ; encoding: [0x05,0x00,0x93,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, exec_lo +// GFX13: v_cvt_f32_ubyte2_e64 v5, exec_lo ; encoding: [0x05,0x00,0x93,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, exec_hi +// GFX13: v_cvt_f32_ubyte2_e64 v5, exec_hi ; encoding: [0x05,0x00,0x93,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, null +// GFX13: v_cvt_f32_ubyte2_e64 v5, null ; encoding: [0x05,0x00,0x93,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, -1 +// GFX13: v_cvt_f32_ubyte2_e64 v5, -1 ; encoding: [0x05,0x00,0x93,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x93,0xd5,0xf0,0x00,0x01,0x0a] + 
+v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x93,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x93,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte3_e64 v5, v1 +// GFX13: v_cvt_f32_ubyte3_e64 v5, v1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, v255 +// GFX13: v_cvt_f32_ubyte3_e64 v5, v255 ; encoding: [0x05,0x00,0x94,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, s1 +// GFX13: v_cvt_f32_ubyte3_e64 v5, s1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, s105 +// GFX13: v_cvt_f32_ubyte3_e64 v5, s105 ; encoding: [0x05,0x00,0x94,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, vcc_lo +// GFX13: v_cvt_f32_ubyte3_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x94,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, vcc_hi +// GFX13: v_cvt_f32_ubyte3_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x94,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, ttmp15 +// GFX13: v_cvt_f32_ubyte3_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x94,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, m0 +// GFX13: v_cvt_f32_ubyte3_e64 v5, m0 ; encoding: [0x05,0x00,0x94,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, exec_lo +// GFX13: v_cvt_f32_ubyte3_e64 v5, exec_lo ; encoding: [0x05,0x00,0x94,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, exec_hi +// GFX13: v_cvt_f32_ubyte3_e64 v5, exec_hi ; encoding: [0x05,0x00,0x94,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, null +// GFX13: v_cvt_f32_ubyte3_e64 v5, null ; encoding: [0x05,0x00,0x94,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, -1 +// GFX13: v_cvt_f32_ubyte3_e64 v5, -1 ; encoding: [0x05,0x00,0x94,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 ; encoding: 
[0x05,0x00,0x94,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 +// GFX13: v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x94,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 +// GFX13: v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x94,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f64_f32_e64 v[5:6], v1 +// GFX13: v_cvt_f64_f32_e64 v[5:6], v1 ; encoding: [0x05,0x00,0x90,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], v255 +// GFX13: v_cvt_f64_f32_e64 v[5:6], v255 ; encoding: [0x05,0x00,0x90,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], s1 +// GFX13: v_cvt_f64_f32_e64 v[5:6], s1 ; encoding: [0x05,0x00,0x90,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], s105 +// GFX13: v_cvt_f64_f32_e64 v[5:6], s105 ; encoding: [0x05,0x00,0x90,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], vcc_lo +// GFX13: v_cvt_f64_f32_e64 v[5:6], vcc_lo ; encoding: [0x05,0x00,0x90,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], vcc_hi +// GFX13: v_cvt_f64_f32_e64 v[5:6], vcc_hi ; encoding: [0x05,0x00,0x90,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], ttmp15 +// GFX13: v_cvt_f64_f32_e64 v[5:6], ttmp15 ; encoding: [0x05,0x00,0x90,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], m0 +// GFX13: v_cvt_f64_f32_e64 v[5:6], m0 ; encoding: [0x05,0x00,0x90,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], exec_lo +// GFX13: v_cvt_f64_f32_e64 v[5:6], exec_lo ; encoding: [0x05,0x00,0x90,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], exec_hi +// GFX13: v_cvt_f64_f32_e64 v[5:6], exec_hi ; encoding: [0x05,0x00,0x90,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], null +// GFX13: v_cvt_f64_f32_e64 v[5:6], null ; encoding: [0x05,0x00,0x90,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], -1 +// GFX13: v_cvt_f64_f32_e64 v[5:6], -1 ; encoding: [0x05,0x00,0x90,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f64_f32_e64 v[5:6], 0.5 mul:2 +// GFX13: 
v_cvt_f64_f32_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x90,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f64_f32_e64 v[5:6], src_scc mul:4 +// GFX13: v_cvt_f64_f32_e64 v[5:6], src_scc mul:4 ; encoding: [0x05,0x00,0x90,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 +// GFX13: v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 ; encoding: [0xfe,0x81,0x90,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_cvt_f64_i32_e64 v[5:6], v1 +// GFX13: v_cvt_f64_i32_e64 v[5:6], v1 ; encoding: [0x05,0x00,0x84,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], v255 +// GFX13: v_cvt_f64_i32_e64 v[5:6], v255 ; encoding: [0x05,0x00,0x84,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], s1 +// GFX13: v_cvt_f64_i32_e64 v[5:6], s1 ; encoding: [0x05,0x00,0x84,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], s105 +// GFX13: v_cvt_f64_i32_e64 v[5:6], s105 ; encoding: [0x05,0x00,0x84,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], vcc_lo +// GFX13: v_cvt_f64_i32_e64 v[5:6], vcc_lo ; encoding: [0x05,0x00,0x84,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], vcc_hi +// GFX13: v_cvt_f64_i32_e64 v[5:6], vcc_hi ; encoding: [0x05,0x00,0x84,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], ttmp15 +// GFX13: v_cvt_f64_i32_e64 v[5:6], ttmp15 ; encoding: [0x05,0x00,0x84,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], m0 +// GFX13: v_cvt_f64_i32_e64 v[5:6], m0 ; encoding: [0x05,0x00,0x84,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], exec_lo +// GFX13: v_cvt_f64_i32_e64 v[5:6], exec_lo ; encoding: [0x05,0x00,0x84,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], exec_hi +// GFX13: v_cvt_f64_i32_e64 v[5:6], exec_hi ; encoding: [0x05,0x00,0x84,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], null +// GFX13: v_cvt_f64_i32_e64 v[5:6], null ; encoding: [0x05,0x00,0x84,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], -1 +// GFX13: v_cvt_f64_i32_e64 v[5:6], -1 ; encoding: 
[0x05,0x00,0x84,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f64_i32_e64 v[5:6], 0.5 mul:2 +// GFX13: v_cvt_f64_i32_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x84,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f64_i32_e64 v[5:6], src_scc mul:4 +// GFX13: v_cvt_f64_i32_e64 v[5:6], src_scc mul:4 ; encoding: [0x05,0x00,0x84,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x84,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_f64_u32_e64 v[5:6], v1 +// GFX13: v_cvt_f64_u32_e64 v[5:6], v1 ; encoding: [0x05,0x00,0x96,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], v255 +// GFX13: v_cvt_f64_u32_e64 v[5:6], v255 ; encoding: [0x05,0x00,0x96,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], s1 +// GFX13: v_cvt_f64_u32_e64 v[5:6], s1 ; encoding: [0x05,0x00,0x96,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], s105 +// GFX13: v_cvt_f64_u32_e64 v[5:6], s105 ; encoding: [0x05,0x00,0x96,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], vcc_lo +// GFX13: v_cvt_f64_u32_e64 v[5:6], vcc_lo ; encoding: [0x05,0x00,0x96,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], vcc_hi +// GFX13: v_cvt_f64_u32_e64 v[5:6], vcc_hi ; encoding: [0x05,0x00,0x96,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], ttmp15 +// GFX13: v_cvt_f64_u32_e64 v[5:6], ttmp15 ; encoding: [0x05,0x00,0x96,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], m0 +// GFX13: v_cvt_f64_u32_e64 v[5:6], m0 ; encoding: [0x05,0x00,0x96,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], exec_lo +// GFX13: v_cvt_f64_u32_e64 v[5:6], exec_lo ; encoding: [0x05,0x00,0x96,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], exec_hi +// GFX13: v_cvt_f64_u32_e64 v[5:6], exec_hi ; encoding: [0x05,0x00,0x96,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], null +// GFX13: v_cvt_f64_u32_e64 v[5:6], null ; encoding: [0x05,0x00,0x96,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], -1 
+// GFX13: v_cvt_f64_u32_e64 v[5:6], -1 ; encoding: [0x05,0x00,0x96,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_f64_u32_e64 v[5:6], 0.5 mul:2 +// GFX13: v_cvt_f64_u32_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x96,0xd5,0xf0,0x00,0x01,0x0a] + +v_cvt_f64_u32_e64 v[5:6], src_scc mul:4 +// GFX13: v_cvt_f64_u32_e64 v[5:6], src_scc mul:4 ; encoding: [0x05,0x00,0x96,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x96,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_cvt_floor_i32_f32_e64 v5, v1 +// GFX13: v_cvt_floor_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, v255 +// GFX13: v_cvt_floor_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8d,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, s1 +// GFX13: v_cvt_floor_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, s105 +// GFX13: v_cvt_floor_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8d,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, vcc_lo +// GFX13: v_cvt_floor_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, vcc_hi +// GFX13: v_cvt_floor_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, ttmp15 +// GFX13: v_cvt_floor_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8d,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, m0 +// GFX13: v_cvt_floor_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8d,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, exec_lo +// GFX13: v_cvt_floor_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, exec_hi +// GFX13: v_cvt_floor_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, null +// GFX13: v_cvt_floor_i32_f32_e64 
v5, null ; encoding: [0x05,0x00,0x8d,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, -1 +// GFX13: v_cvt_floor_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8d,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, 0.5 +// GFX13: v_cvt_floor_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8d,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v5, src_scc +// GFX13: v_cvt_floor_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8d,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| +// GFX13: v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x01,0x22,0x56,0x34,0x12,0xaf] + +v_cvt_i16_f16_e64 v5, v1 +// GFX13: v_cvt_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_i16_f16_e64 v5, v255 +// GFX13: v_cvt_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_i16_f16_e64 v5, s1 +// GFX13: v_cvt_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, s105 +// GFX13: v_cvt_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, vcc_lo +// GFX13: v_cvt_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, vcc_hi +// GFX13: v_cvt_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, ttmp15 +// GFX13: v_cvt_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd3,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, m0 +// GFX13: v_cvt_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, exec_lo +// GFX13: v_cvt_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, exec_hi +// GFX13: v_cvt_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, null +// GFX13: v_cvt_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x01,0x02] + 
+v_cvt_i16_f16_e64 v5, -1 +// GFX13: v_cvt_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, 0.5 +// GFX13: v_cvt_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v5, src_scc +// GFX13: v_cvt_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp +// GFX13: v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x01,0x22,0x0b,0xfe,0x00,0x00] + +v_cvt_i32_f32_e64 v5, v1 +// GFX13: v_cvt_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_i32_f32_e64 v5, v255 +// GFX13: v_cvt_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x88,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_i32_f32_e64 v5, s1 +// GFX13: v_cvt_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, s105 +// GFX13: v_cvt_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x88,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, vcc_lo +// GFX13: v_cvt_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x88,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, vcc_hi +// GFX13: v_cvt_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, ttmp15 +// GFX13: v_cvt_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, m0 +// GFX13: v_cvt_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x88,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, exec_lo +// GFX13: v_cvt_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x88,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, exec_hi +// GFX13: v_cvt_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x88,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, null +// GFX13: v_cvt_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x88,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, -1 +// GFX13: v_cvt_i32_f32_e64 v5, -1 ; encoding: 
[0x05,0x00,0x88,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, 0.5 +// GFX13: v_cvt_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x88,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v5, src_scc +// GFX13: v_cvt_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x88,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp +// GFX13: v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x88,0xd5,0xff,0x00,0x01,0x22,0x56,0x34,0x12,0xaf] + +v_cvt_i32_f64_e64 v5, v[1:2] +// GFX13: v_cvt_i32_f64_e64 v5, v[1:2] ; encoding: [0x05,0x00,0x83,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_i32_f64_e64 v5, v[254:255] +// GFX13: v_cvt_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x83,0xd5,0xfe,0x01,0x01,0x02] + +v_cvt_i32_f64_e64 v5, s[2:3] +// GFX13: v_cvt_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, s[104:105] +// GFX13: v_cvt_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x83,0xd5,0x68,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, vcc +// GFX13: v_cvt_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x83,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, ttmp[14:15] +// GFX13: v_cvt_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x83,0xd5,0x7a,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, exec +// GFX13: v_cvt_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x83,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, null +// GFX13: v_cvt_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0x83,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, -1 +// GFX13: v_cvt_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x83,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, 0.5 +// GFX13: v_cvt_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x83,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_i32_f64_e64 v5, -|src_scc| +// GFX13: v_cvt_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x83,0xd5,0xfd,0x00,0x01,0x22] + +v_cvt_i32_f64_e64 v255, 0xaf123456 clamp +// GFX13: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: 
[0xff,0x80,0x83,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_i32_i16_e64 v5, v1 +// GFX13: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_i32_i16_e64 v5, v255 +// GFX13: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_i32_i16_e64 v5, s1 +// GFX13: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, s105 +// GFX13: v_cvt_i32_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xea,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, vcc_lo +// GFX13: v_cvt_i32_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xea,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, vcc_hi +// GFX13: v_cvt_i32_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xea,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, ttmp15 +// GFX13: v_cvt_i32_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xea,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, m0 +// GFX13: v_cvt_i32_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xea,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, exec_lo +// GFX13: v_cvt_i32_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xea,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, exec_hi +// GFX13: v_cvt_i32_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xea,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, null +// GFX13: v_cvt_i32_i16_e64 v5, null ; encoding: [0x05,0x00,0xea,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, -1 +// GFX13: v_cvt_i32_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xea,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v5, 0.5 +// GFX13-ASM: v_cvt_i32_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xea,0xd5,0xf0,0x00,0x01,0x02] +// GFX13-DIS: v_cvt_i32_i16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00] + +v_cvt_i32_i16_e64 v5, src_scc +// GFX13: v_cvt_i32_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xea,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_i32_i16_e64 v255, 0xfe0b +// GFX13: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: 
[0xff,0x00,0xea,0xd5,0xff,0x00,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, v1 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, v255 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, s1 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, s105 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, vcc_lo +// GFX13: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, vcc_hi +// GFX13: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, ttmp15 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, m0 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, exec_lo +// GFX13: v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, exec_hi +// GFX13: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, null +// GFX13: v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, -1 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, 0.5 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v5, src_scc +// GFX13: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: 
[0x05,0x00,0x8c,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| +// GFX13: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x01,0x22,0x56,0x34,0x12,0xaf] + +v_cvt_norm_i16_f16_e64 v5, v1 +// GFX13: v_cvt_norm_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, v255 +// GFX13: v_cvt_norm_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, s1 +// GFX13: v_cvt_norm_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, s105 +// GFX13: v_cvt_norm_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, vcc_lo +// GFX13: v_cvt_norm_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, vcc_hi +// GFX13: v_cvt_norm_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, ttmp15 +// GFX13: v_cvt_norm_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, m0 +// GFX13: v_cvt_norm_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, exec_lo +// GFX13: v_cvt_norm_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, exec_hi +// GFX13: v_cvt_norm_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, null +// GFX13: v_cvt_norm_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, -1 +// GFX13: v_cvt_norm_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe3,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, 0.5 +// GFX13: v_cvt_norm_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v5, src_scc +// GFX13: 
v_cvt_norm_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| +// GFX13: v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x01,0x22,0x0b,0xfe,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, v1 +// GFX13: v_cvt_norm_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, v255 +// GFX13: v_cvt_norm_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, s1 +// GFX13: v_cvt_norm_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, s105 +// GFX13: v_cvt_norm_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, vcc_lo +// GFX13: v_cvt_norm_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, vcc_hi +// GFX13: v_cvt_norm_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, ttmp15 +// GFX13: v_cvt_norm_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, m0 +// GFX13: v_cvt_norm_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, exec_lo +// GFX13: v_cvt_norm_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, exec_hi +// GFX13: v_cvt_norm_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, null +// GFX13: v_cvt_norm_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, -1 +// GFX13: v_cvt_norm_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v5, 0.5 +// GFX13: v_cvt_norm_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x01,0x02] + 
+v_cvt_norm_u16_f16_e64 v5, src_scc +// GFX13: v_cvt_norm_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| +// GFX13: v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x01,0x22,0x0b,0xfe,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, v1 +// GFX13: v_cvt_off_f32_i4_e64 v5, v1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, v255 +// GFX13: v_cvt_off_f32_i4_e64 v5, v255 ; encoding: [0x05,0x00,0x8e,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, s1 +// GFX13: v_cvt_off_f32_i4_e64 v5, s1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, s105 +// GFX13: v_cvt_off_f32_i4_e64 v5, s105 ; encoding: [0x05,0x00,0x8e,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, vcc_lo +// GFX13: v_cvt_off_f32_i4_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, vcc_hi +// GFX13: v_cvt_off_f32_i4_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, ttmp15 +// GFX13: v_cvt_off_f32_i4_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8e,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, m0 +// GFX13: v_cvt_off_f32_i4_e64 v5, m0 ; encoding: [0x05,0x00,0x8e,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, exec_lo +// GFX13: v_cvt_off_f32_i4_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, exec_hi +// GFX13: v_cvt_off_f32_i4_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, null +// GFX13: v_cvt_off_f32_i4_e64 v5, null ; encoding: [0x05,0x00,0x8e,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, -1 +// GFX13: v_cvt_off_f32_i4_e64 v5, -1 ; encoding: [0x05,0x00,0x8e,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 +// GFX13: v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8e,0xd5,0xf0,0x00,0x01,0x0a] + 
+v_cvt_off_f32_i4_e64 v5, src_scc mul:4 +// GFX13: v_cvt_off_f32_i4_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8e,0xd5,0xfd,0x00,0x01,0x12] + +v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 +// GFX13: v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 ; encoding: [0xff,0x80,0x8e,0xd5,0xff,0x00,0x01,0x1a,0x4f,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, v1 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, v255 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, s1 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, s105 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, vcc_lo +// GFX13: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, vcc_hi +// GFX13: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, ttmp15 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, m0 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, exec_lo +// GFX13: v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, exec_hi +// GFX13: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, null +// GFX13: v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, -1 +// GFX13: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, 0.5 +// GFX13: 
v_cvt_nearest_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v5, src_scc +// GFX13: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8c,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_rpi_i32_f32_e64 v255, -|0xaf123456| +// GFX13: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x01,0x22,0x56,0x34,0x12,0xaf] + +v_cvt_u16_f16_e64 v5, v1 +// GFX13: v_cvt_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_u16_f16_e64 v5, v255 +// GFX13: v_cvt_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_u16_f16_e64 v5, s1 +// GFX13: v_cvt_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, s105 +// GFX13: v_cvt_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd2,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, vcc_lo +// GFX13: v_cvt_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, vcc_hi +// GFX13: v_cvt_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, ttmp15 +// GFX13: v_cvt_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, m0 +// GFX13: v_cvt_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, exec_lo +// GFX13: v_cvt_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, exec_hi +// GFX13: v_cvt_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, null +// GFX13: v_cvt_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, -1 +// GFX13: v_cvt_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v5, 0.5 +// GFX13: v_cvt_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x01,0x02] + 
+v_cvt_u16_f16_e64 v5, src_scc +// GFX13: v_cvt_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp +// GFX13: v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd2,0xd5,0xff,0x00,0x01,0x22,0x0b,0xfe,0x00,0x00] + +v_cvt_u32_f32_e64 v5, v1 +// GFX13: v_cvt_u32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_u32_f32_e64 v5, v255 +// GFX13: v_cvt_u32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x87,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_u32_f32_e64 v5, s1 +// GFX13: v_cvt_u32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, s105 +// GFX13: v_cvt_u32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x87,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, vcc_lo +// GFX13: v_cvt_u32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, vcc_hi +// GFX13: v_cvt_u32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, ttmp15 +// GFX13: v_cvt_u32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, m0 +// GFX13: v_cvt_u32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x87,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, exec_lo +// GFX13: v_cvt_u32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x87,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, exec_hi +// GFX13: v_cvt_u32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x87,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, null +// GFX13: v_cvt_u32_f32_e64 v5, null ; encoding: [0x05,0x00,0x87,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, -1 +// GFX13: v_cvt_u32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x87,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, 0.5 +// GFX13: v_cvt_u32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x87,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v5, src_scc +// GFX13: v_cvt_u32_f32_e64 v5, src_scc ; encoding: 
[0x05,0x00,0x87,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp +// GFX13: v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x87,0xd5,0xff,0x00,0x01,0x22,0x56,0x34,0x12,0xaf] + +v_cvt_u32_f64_e64 v5, v[1:2] +// GFX13: v_cvt_u32_f64_e64 v5, v[1:2] ; encoding: [0x05,0x00,0x95,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_u32_f64_e64 v5, v[254:255] +// GFX13: v_cvt_u32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x95,0xd5,0xfe,0x01,0x01,0x02] + +v_cvt_u32_f64_e64 v5, s[2:3] +// GFX13: v_cvt_u32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, s[104:105] +// GFX13: v_cvt_u32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x95,0xd5,0x68,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, vcc +// GFX13: v_cvt_u32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x95,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, ttmp[14:15] +// GFX13: v_cvt_u32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x95,0xd5,0x7a,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, exec +// GFX13: v_cvt_u32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x95,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, null +// GFX13: v_cvt_u32_f64_e64 v5, null ; encoding: [0x05,0x00,0x95,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, -1 +// GFX13: v_cvt_u32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x95,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, 0.5 +// GFX13: v_cvt_u32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x95,0xd5,0xf0,0x00,0x01,0x02] + +v_cvt_u32_f64_e64 v5, -|src_scc| +// GFX13: v_cvt_u32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x95,0xd5,0xfd,0x00,0x01,0x22] + +v_cvt_u32_f64_e64 v255, 0xaf123456 clamp +// GFX13: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_cvt_u32_u16_e64 v5, v1 +// GFX13: v_cvt_u32_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x01,0x02] + +v_cvt_u32_u16_e64 v5, v255 +// GFX13: v_cvt_u32_u16_e64 v5, v255 ; encoding: 
[0x05,0x00,0xeb,0xd5,0xff,0x01,0x01,0x02] + +v_cvt_u32_u16_e64 v5, s1 +// GFX13: v_cvt_u32_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, s105 +// GFX13: v_cvt_u32_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xeb,0xd5,0x69,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, vcc_lo +// GFX13: v_cvt_u32_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x6a,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, vcc_hi +// GFX13: v_cvt_u32_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x6b,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, ttmp15 +// GFX13: v_cvt_u32_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xeb,0xd5,0x7b,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, m0 +// GFX13: v_cvt_u32_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xeb,0xd5,0x7d,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, exec_lo +// GFX13: v_cvt_u32_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x7e,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, exec_hi +// GFX13: v_cvt_u32_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x7f,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, null +// GFX13: v_cvt_u32_u16_e64 v5, null ; encoding: [0x05,0x00,0xeb,0xd5,0x7c,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, -1 +// GFX13: v_cvt_u32_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xeb,0xd5,0xc1,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v5, 0.5 +// GFX13-ASM: v_cvt_u32_u16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xeb,0xd5,0xf0,0x00,0x01,0x02] +// GFX13-DIS: v_cvt_u32_u16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00] + +v_cvt_u32_u16_e64 v5, src_scc +// GFX13: v_cvt_u32_u16_e64 v5, src_scc ; encoding: [0x05,0x00,0xeb,0xd5,0xfd,0x00,0x01,0x02] + +v_cvt_u32_u16_e64 v255, 0xfe0b +// GFX13: v_cvt_u32_u16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_exp_bf16_e64 v5, -1 +// GFX13: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, exec_hi +// GFX13: v_exp_bf16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xfd,0xd5,0x7f,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, exec_lo +// GFX13: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, m0 +// GFX13: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, null +// GFX13: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, s1 +// GFX13: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, s105 +// GFX13: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, ttmp15 +// GFX13: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, v1 +// GFX13: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x01,0x02] + +v_exp_bf16_e64 v5, v255 +// GFX13: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x01,0x02] + +v_exp_bf16_e64 v5, vcc_hi +// GFX13: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x01,0x02] + +v_exp_bf16_e64 v5, vcc_lo +// GFX13: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x01,0x02] + +v_exp_f16_e64 v5, v1 +// GFX13: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x01,0x02] + +v_exp_f16_e64 v5, v255 +// GFX13: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x01,0x02] + +v_exp_f16_e64 v5, s1 +// GFX13: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x01,0x02] + +v_exp_f16_e64 v5, s105 +// GFX13: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x01,0x02] + +v_exp_f16_e64 v5, vcc_lo +// GFX13: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x01,0x02] + +v_exp_f16_e64 v5, vcc_hi +// GFX13: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x01,0x02] + +v_exp_f16_e64 v5, ttmp15 +// GFX13: v_exp_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xd8,0xd5,0x7b,0x00,0x01,0x02] + +v_exp_f16_e64 v5, m0 +// GFX13: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x01,0x02] + +v_exp_f16_e64 v5, exec_lo +// GFX13: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x01,0x02] + +v_exp_f16_e64 v5, exec_hi +// GFX13: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x01,0x02] + +v_exp_f16_e64 v5, null +// GFX13: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x01,0x02] + +v_exp_f16_e64 v5, -1 +// GFX13: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x01,0x02] + +v_exp_f16_e64 v5, 0.5 mul:2 +// GFX13: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x01,0x0a] + +v_exp_f16_e64 v5, src_scc mul:4 +// GFX13: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x01,0x12] + +v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_exp_f32_e64 v5, v1 +// GFX13: v_exp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x01,0x01,0x02] + +v_exp_f32_e64 v5, v255 +// GFX13: v_exp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa5,0xd5,0xff,0x01,0x01,0x02] + +v_exp_f32_e64 v5, s1 +// GFX13: v_exp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x00,0x01,0x02] + +v_exp_f32_e64 v5, s105 +// GFX13: v_exp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa5,0xd5,0x69,0x00,0x01,0x02] + +v_exp_f32_e64 v5, vcc_lo +// GFX13: v_exp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x6a,0x00,0x01,0x02] + +v_exp_f32_e64 v5, vcc_hi +// GFX13: v_exp_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x6b,0x00,0x01,0x02] + +v_exp_f32_e64 v5, ttmp15 +// GFX13: v_exp_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa5,0xd5,0x7b,0x00,0x01,0x02] + +v_exp_f32_e64 v5, m0 +// GFX13: v_exp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa5,0xd5,0x7d,0x00,0x01,0x02] + +v_exp_f32_e64 v5, exec_lo +// GFX13: 
v_exp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x7e,0x00,0x01,0x02] + +v_exp_f32_e64 v5, exec_hi +// GFX13: v_exp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x7f,0x00,0x01,0x02] + +v_exp_f32_e64 v5, null +// GFX13: v_exp_f32_e64 v5, null ; encoding: [0x05,0x00,0xa5,0xd5,0x7c,0x00,0x01,0x02] + +v_exp_f32_e64 v5, -1 +// GFX13: v_exp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa5,0xd5,0xc1,0x00,0x01,0x02] + +v_exp_f32_e64 v5, 0.5 mul:2 +// GFX13: v_exp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa5,0xd5,0xf0,0x00,0x01,0x0a] + +v_exp_f32_e64 v5, src_scc mul:4 +// GFX13: v_exp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa5,0xd5,0xfd,0x00,0x01,0x12] + +v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa5,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_ffbh_i32_e64 v5, v1 +// GFX13: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x01,0x02] + +v_ffbh_i32_e64 v5, v255 +// GFX13: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x01,0x02] + +v_ffbh_i32_e64 v5, s1 +// GFX13: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, s105 +// GFX13: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, vcc_lo +// GFX13: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, vcc_hi +// GFX13: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, ttmp15 +// GFX13: v_cls_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbb,0xd5,0x7b,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, m0 +// GFX13: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, exec_lo +// GFX13: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, exec_hi +// GFX13: v_cls_i32_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xbb,0xd5,0x7f,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, null +// GFX13: v_cls_i32_e64 v5, null ; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, -1 +// GFX13: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, 0.5 +// GFX13: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x01,0x02] + +v_ffbh_i32_e64 v5, src_scc +// GFX13: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x01,0x02] + +v_ffbh_i32_e64 v255, 0xaf123456 +// GFX13: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ffbh_u32_e64 v5, v1 +// GFX13: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x01,0x02] + +v_ffbh_u32_e64 v5, v255 +// GFX13: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x01,0x02] + +v_ffbh_u32_e64 v5, s1 +// GFX13: v_clz_i32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, s105 +// GFX13: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, vcc_lo +// GFX13: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, vcc_hi +// GFX13: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, ttmp15 +// GFX13: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb9,0xd5,0x7b,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, m0 +// GFX13: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, exec_lo +// GFX13: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, exec_hi +// GFX13: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, null +// GFX13: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, -1 
+// GFX13: v_clz_i32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, 0.5 +// GFX13: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x01,0x02] + +v_ffbh_u32_e64 v5, src_scc +// GFX13: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x01,0x02] + +v_ffbh_u32_e64 v255, 0xaf123456 +// GFX13: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_ffbl_b32_e64 v5, v1 +// GFX13: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x01,0x02] + +v_ffbl_b32_e64 v5, v255 +// GFX13: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x01,0x02] + +v_ffbl_b32_e64 v5, s1 +// GFX13: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, s105 +// GFX13: v_ctz_i32_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xba,0xd5,0x69,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, vcc_lo +// GFX13: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, vcc_hi +// GFX13: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, ttmp15 +// GFX13: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, m0 +// GFX13: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, exec_lo +// GFX13: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, exec_hi +// GFX13: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, null +// GFX13: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, -1 +// GFX13: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, 0.5 +// GFX13: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: 
[0x05,0x00,0xba,0xd5,0xf0,0x00,0x01,0x02] + +v_ffbl_b32_e64 v5, src_scc +// GFX13: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x01,0x02] + +v_ffbl_b32_e64 v255, 0xaf123456 +// GFX13: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_floor_f16_e64 v5, v1 +// GFX13: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x01,0x02] + +v_floor_f16_e64 v5, v255 +// GFX13: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x01,0x02] + +v_floor_f16_e64 v5, s1 +// GFX13: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x01,0x02] + +v_floor_f16_e64 v5, s105 +// GFX13: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x01,0x02] + +v_floor_f16_e64 v5, vcc_lo +// GFX13: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x01,0x02] + +v_floor_f16_e64 v5, vcc_hi +// GFX13: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x01,0x02] + +v_floor_f16_e64 v5, ttmp15 +// GFX13: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x01,0x02] + +v_floor_f16_e64 v5, m0 +// GFX13: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x01,0x02] + +v_floor_f16_e64 v5, exec_lo +// GFX13: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x01,0x02] + +v_floor_f16_e64 v5, exec_hi +// GFX13: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x01,0x02] + +v_floor_f16_e64 v5, null +// GFX13: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x01,0x02] + +v_floor_f16_e64 v5, -1 +// GFX13: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x01,0x02] + +v_floor_f16_e64 v5, 0.5 mul:2 +// GFX13: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x01,0x0a] + +v_floor_f16_e64 v5, src_scc mul:4 +// GFX13: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: 
[0x05,0x00,0xdb,0xd5,0xfd,0x00,0x01,0x12] + +v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_floor_f32_e64 v5, v1 +// GFX13: v_floor_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x01,0x01,0x02] + +v_floor_f32_e64 v5, v255 +// GFX13: v_floor_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa4,0xd5,0xff,0x01,0x01,0x02] + +v_floor_f32_e64 v5, s1 +// GFX13: v_floor_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x00,0x01,0x02] + +v_floor_f32_e64 v5, s105 +// GFX13: v_floor_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa4,0xd5,0x69,0x00,0x01,0x02] + +v_floor_f32_e64 v5, vcc_lo +// GFX13: v_floor_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x6a,0x00,0x01,0x02] + +v_floor_f32_e64 v5, vcc_hi +// GFX13: v_floor_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x6b,0x00,0x01,0x02] + +v_floor_f32_e64 v5, ttmp15 +// GFX13: v_floor_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa4,0xd5,0x7b,0x00,0x01,0x02] + +v_floor_f32_e64 v5, m0 +// GFX13: v_floor_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa4,0xd5,0x7d,0x00,0x01,0x02] + +v_floor_f32_e64 v5, exec_lo +// GFX13: v_floor_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x7e,0x00,0x01,0x02] + +v_floor_f32_e64 v5, exec_hi +// GFX13: v_floor_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x7f,0x00,0x01,0x02] + +v_floor_f32_e64 v5, null +// GFX13: v_floor_f32_e64 v5, null ; encoding: [0x05,0x00,0xa4,0xd5,0x7c,0x00,0x01,0x02] + +v_floor_f32_e64 v5, -1 +// GFX13: v_floor_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa4,0xd5,0xc1,0x00,0x01,0x02] + +v_floor_f32_e64 v5, 0.5 mul:2 +// GFX13: v_floor_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa4,0xd5,0xf0,0x00,0x01,0x0a] + +v_floor_f32_e64 v5, src_scc mul:4 +// GFX13: v_floor_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa4,0xd5,0xfd,0x00,0x01,0x12] + +v_floor_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_floor_f32_e64 v255, 
-|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa4,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_floor_f64_e64 v[5:6], v[1:2] +// GFX13: v_floor_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0x9a,0xd5,0x01,0x01,0x01,0x02] + +v_floor_f64_e64 v[5:6], v[254:255] +// GFX13: v_floor_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0x9a,0xd5,0xfe,0x01,0x01,0x02] + +v_floor_f64_e64 v[5:6], s[2:3] +// GFX13: v_floor_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0x9a,0xd5,0x02,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], s[104:105] +// GFX13: v_floor_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0x9a,0xd5,0x68,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], vcc +// GFX13: v_floor_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0x9a,0xd5,0x6a,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_floor_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0x9a,0xd5,0x7a,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], exec +// GFX13: v_floor_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0x9a,0xd5,0x7e,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], null +// GFX13: v_floor_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0x9a,0xd5,0x7c,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], -1 +// GFX13: v_floor_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0x9a,0xd5,0xc1,0x00,0x01,0x02] + +v_floor_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_floor_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x9a,0xd5,0xf0,0x00,0x01,0x0a] + +v_floor_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_floor_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0x9a,0xd5,0xfd,0x00,0x01,0x32] + +v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_fract_f16_e64 v5, v1 +// GFX13: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x01,0x02] + +v_fract_f16_e64 v5, v255 +// GFX13: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x01,0x02] + 
+v_fract_f16_e64 v5, s1 +// GFX13: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x01,0x02] + +v_fract_f16_e64 v5, s105 +// GFX13: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x01,0x02] + +v_fract_f16_e64 v5, vcc_lo +// GFX13: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x01,0x02] + +v_fract_f16_e64 v5, vcc_hi +// GFX13: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x01,0x02] + +v_fract_f16_e64 v5, ttmp15 +// GFX13: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x01,0x02] + +v_fract_f16_e64 v5, m0 +// GFX13: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x01,0x02] + +v_fract_f16_e64 v5, exec_lo +// GFX13: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x01,0x02] + +v_fract_f16_e64 v5, exec_hi +// GFX13: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x01,0x02] + +v_fract_f16_e64 v5, null +// GFX13: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x01,0x02] + +v_fract_f16_e64 v5, -1 +// GFX13: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x01,0x02] + +v_fract_f16_e64 v5, 0.5 mul:2 +// GFX13: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x01,0x0a] + +v_fract_f16_e64 v5, src_scc mul:4 +// GFX13: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x01,0x12] + +v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_fract_f32_e64 v5, v1 +// GFX13: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x01,0x02] + +v_fract_f32_e64 v5, v255 +// GFX13: v_fract_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa0,0xd5,0xff,0x01,0x01,0x02] + +v_fract_f32_e64 v5, s1 +// GFX13: v_fract_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x00,0x01,0x02] + +v_fract_f32_e64 v5, 
s105 +// GFX13: v_fract_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa0,0xd5,0x69,0x00,0x01,0x02] + +v_fract_f32_e64 v5, vcc_lo +// GFX13: v_fract_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x6a,0x00,0x01,0x02] + +v_fract_f32_e64 v5, vcc_hi +// GFX13: v_fract_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x6b,0x00,0x01,0x02] + +v_fract_f32_e64 v5, ttmp15 +// GFX13: v_fract_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa0,0xd5,0x7b,0x00,0x01,0x02] + +v_fract_f32_e64 v5, m0 +// GFX13: v_fract_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa0,0xd5,0x7d,0x00,0x01,0x02] + +v_fract_f32_e64 v5, exec_lo +// GFX13: v_fract_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x7e,0x00,0x01,0x02] + +v_fract_f32_e64 v5, exec_hi +// GFX13: v_fract_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x7f,0x00,0x01,0x02] + +v_fract_f32_e64 v5, null +// GFX13: v_fract_f32_e64 v5, null ; encoding: [0x05,0x00,0xa0,0xd5,0x7c,0x00,0x01,0x02] + +v_fract_f32_e64 v5, -1 +// GFX13: v_fract_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa0,0xd5,0xc1,0x00,0x01,0x02] + +v_fract_f32_e64 v5, 0.5 mul:2 +// GFX13: v_fract_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa0,0xd5,0xf0,0x00,0x01,0x0a] + +v_fract_f32_e64 v5, src_scc mul:4 +// GFX13: v_fract_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa0,0xd5,0xfd,0x00,0x01,0x12] + +v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa0,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_fract_f64_e64 v[5:6], v[1:2] +// GFX13: v_fract_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0xbe,0xd5,0x01,0x01,0x01,0x02] + +v_fract_f64_e64 v[5:6], v[254:255] +// GFX13: v_fract_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0xbe,0xd5,0xfe,0x01,0x01,0x02] + +v_fract_f64_e64 v[5:6], s[2:3] +// GFX13: v_fract_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0xbe,0xd5,0x02,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], s[104:105] +// GFX13: v_fract_f64_e64 v[5:6], s[104:105] ; encoding: 
[0x05,0x00,0xbe,0xd5,0x68,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], vcc +// GFX13: v_fract_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0xbe,0xd5,0x6a,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_fract_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0xbe,0xd5,0x7a,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], exec +// GFX13: v_fract_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0xbe,0xd5,0x7e,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], null +// GFX13: v_fract_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0xbe,0xd5,0x7c,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], -1 +// GFX13: v_fract_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0xbe,0xd5,0xc1,0x00,0x01,0x02] + +v_fract_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_fract_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0xbe,0xd5,0xf0,0x00,0x01,0x0a] + +v_fract_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_fract_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0xbe,0xd5,0xfd,0x00,0x01,0x32] + +v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbe,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_frexp_exp_i16_f16_e64 v5, v1 +// GFX13: v_frexp_exp_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, v255 +// GFX13: v_frexp_exp_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, s1 +// GFX13: v_frexp_exp_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, s105 +// GFX13: v_frexp_exp_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, vcc_lo +// GFX13: v_frexp_exp_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xda,0xd5,0x6a,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, vcc_hi +// GFX13: v_frexp_exp_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, ttmp15 
+// GFX13: v_frexp_exp_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, m0 +// GFX13: v_frexp_exp_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, exec_lo +// GFX13: v_frexp_exp_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, exec_hi +// GFX13: v_frexp_exp_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, null +// GFX13: v_frexp_exp_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, -1 +// GFX13: v_frexp_exp_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, 0.5 +// GFX13: v_frexp_exp_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v5, src_scc +// GFX13: v_frexp_exp_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xda,0xd5,0xfd,0x00,0x01,0x02] + +v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| +// GFX13: v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x01,0x22,0x0b,0xfe,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, v1 +// GFX13: v_frexp_exp_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x01,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, v255 +// GFX13: v_frexp_exp_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xbf,0xd5,0xff,0x01,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, s1 +// GFX13: v_frexp_exp_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, s105 +// GFX13: v_frexp_exp_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xbf,0xd5,0x69,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, vcc_lo +// GFX13: v_frexp_exp_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x6a,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, vcc_hi +// GFX13: v_frexp_exp_i32_f32_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xbf,0xd5,0x6b,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, ttmp15 +// GFX13: v_frexp_exp_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbf,0xd5,0x7b,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, m0 +// GFX13: v_frexp_exp_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xbf,0xd5,0x7d,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, exec_lo +// GFX13: v_frexp_exp_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x7e,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, exec_hi +// GFX13: v_frexp_exp_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbf,0xd5,0x7f,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, null +// GFX13: v_frexp_exp_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0xbf,0xd5,0x7c,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, -1 +// GFX13: v_frexp_exp_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xbf,0xd5,0xc1,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, 0.5 +// GFX13: v_frexp_exp_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbf,0xd5,0xf0,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v5, src_scc +// GFX13: v_frexp_exp_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbf,0xd5,0xfd,0x00,0x01,0x02] + +v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| +// GFX13: v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0xbf,0xd5,0xff,0x00,0x01,0x22,0x56,0x34,0x12,0xaf] + +v_frexp_exp_i32_f64_e64 v5, v[1:2] +// GFX13: v_frexp_exp_i32_f64_e64 v5, v[1:2] ; encoding: [0x05,0x00,0xbc,0xd5,0x01,0x01,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, v[254:255] +// GFX13: v_frexp_exp_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0xbc,0xd5,0xfe,0x01,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, s[2:3] +// GFX13: v_frexp_exp_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, s[104:105] +// GFX13: v_frexp_exp_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0xbc,0xd5,0x68,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, vcc +// GFX13: v_frexp_exp_i32_f64_e64 v5, vcc ; encoding: 
[0x05,0x00,0xbc,0xd5,0x6a,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] +// GFX13: v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0xbc,0xd5,0x7a,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, exec +// GFX13: v_frexp_exp_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0xbc,0xd5,0x7e,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, null +// GFX13: v_frexp_exp_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0xbc,0xd5,0x7c,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, -1 +// GFX13: v_frexp_exp_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0xbc,0xd5,0xc1,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, 0.5 +// GFX13: v_frexp_exp_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbc,0xd5,0xf0,0x00,0x01,0x02] + +v_frexp_exp_i32_f64_e64 v5, -|src_scc| +// GFX13: v_frexp_exp_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0xbc,0xd5,0xfd,0x00,0x01,0x22] + +v_frexp_exp_i32_f64_e64 v255, 0xaf123456 +// GFX13: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_frexp_mant_f16_e64 v5, v1 +// GFX13: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x01,0x02] + +v_frexp_mant_f16_e64 v5, v255 +// GFX13: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x01,0x02] + +v_frexp_mant_f16_e64 v5, s1 +// GFX13: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, s105 +// GFX13: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, vcc_lo +// GFX13: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, vcc_hi +// GFX13: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, ttmp15 +// GFX13: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, m0 +// GFX13: v_frexp_mant_f16_e64 
v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, exec_lo +// GFX13: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, exec_hi +// GFX13: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, null +// GFX13: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, -1 +// GFX13: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x01,0x02] + +v_frexp_mant_f16_e64 v5, 0.5 mul:2 +// GFX13: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x01,0x0a] + +v_frexp_mant_f16_e64 v5, src_scc mul:4 +// GFX13: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x01,0x12] + +v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f32_e64 v5, v1 +// GFX13: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x01,0x02] + +v_frexp_mant_f32_e64 v5, v255 +// GFX13: v_frexp_mant_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xc0,0xd5,0xff,0x01,0x01,0x02] + +v_frexp_mant_f32_e64 v5, s1 +// GFX13: v_frexp_mant_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, s105 +// GFX13: v_frexp_mant_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xc0,0xd5,0x69,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, vcc_lo +// GFX13: v_frexp_mant_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x6a,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, vcc_hi +// GFX13: v_frexp_mant_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x6b,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, ttmp15 +// GFX13: v_frexp_mant_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc0,0xd5,0x7b,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, m0 +// GFX13: 
v_frexp_mant_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xc0,0xd5,0x7d,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, exec_lo +// GFX13: v_frexp_mant_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x7e,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, exec_hi +// GFX13: v_frexp_mant_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x7f,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, null +// GFX13: v_frexp_mant_f32_e64 v5, null ; encoding: [0x05,0x00,0xc0,0xd5,0x7c,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, -1 +// GFX13: v_frexp_mant_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xc0,0xd5,0xc1,0x00,0x01,0x02] + +v_frexp_mant_f32_e64 v5, 0.5 mul:2 +// GFX13: v_frexp_mant_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xc0,0xd5,0xf0,0x00,0x01,0x0a] + +v_frexp_mant_f32_e64 v5, src_scc mul:4 +// GFX13: v_frexp_mant_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xc0,0xd5,0xfd,0x00,0x01,0x12] + +v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xc0,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_frexp_mant_f64_e64 v[5:6], v[1:2] +// GFX13: v_frexp_mant_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0xbd,0xd5,0x01,0x01,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], v[254:255] +// GFX13: v_frexp_mant_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0xbd,0xd5,0xfe,0x01,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], s[2:3] +// GFX13: v_frexp_mant_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0xbd,0xd5,0x02,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], s[104:105] +// GFX13: v_frexp_mant_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0xbd,0xd5,0x68,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], vcc +// GFX13: v_frexp_mant_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0xbd,0xd5,0x6a,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_frexp_mant_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0xbd,0xd5,0x7a,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], exec +// GFX13: 
v_frexp_mant_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0xbd,0xd5,0x7e,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], null +// GFX13: v_frexp_mant_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0xbd,0xd5,0x7c,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], -1 +// GFX13: v_frexp_mant_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0xbd,0xd5,0xc1,0x00,0x01,0x02] + +v_frexp_mant_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_frexp_mant_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0xbd,0xd5,0xf0,0x00,0x01,0x0a] + +v_frexp_mant_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_frexp_mant_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0xbd,0xd5,0xfd,0x00,0x01,0x32] + +v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_log_bf16_e64 v5, -1 +// GFX13: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x01,0x02] + +v_log_bf16_e64 v5, exec_hi +// GFX13: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x01,0x02] + +v_log_bf16_e64 v5, exec_lo +// GFX13: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x01,0x02] + +v_log_bf16_e64 v5, m0 +// GFX13: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x01,0x02] + +v_log_bf16_e64 v5, null +// GFX13: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x01,0x02] + +v_log_bf16_e64 v5, s1 +// GFX13: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x01,0x02] + +v_log_bf16_e64 v5, s105 +// GFX13: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x01,0x02] + +v_log_bf16_e64 v5, ttmp15 +// GFX13: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x01,0x02] + +v_log_bf16_e64 v5, v1 +// GFX13: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x01,0x02] + +v_log_bf16_e64 v5, v255 +// GFX13: v_log_bf16_e64 v5, v255 ; encoding: 
[0x05,0x00,0xfc,0xd5,0xff,0x01,0x01,0x02] + +v_log_bf16_e64 v5, vcc_hi +// GFX13: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x01,0x02] + +v_log_bf16_e64 v5, vcc_lo +// GFX13: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x01,0x02] + +v_log_f16_e64 v5, v1 +// GFX13: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x01,0x02] + +v_log_f16_e64 v5, v255 +// GFX13: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x01,0x02] + +v_log_f16_e64 v5, s1 +// GFX13: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x01,0x02] + +v_log_f16_e64 v5, s105 +// GFX13: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x01,0x02] + +v_log_f16_e64 v5, vcc_lo +// GFX13: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x01,0x02] + +v_log_f16_e64 v5, vcc_hi +// GFX13: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x01,0x02] + +v_log_f16_e64 v5, ttmp15 +// GFX13: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x01,0x02] + +v_log_f16_e64 v5, m0 +// GFX13: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x01,0x02] + +v_log_f16_e64 v5, exec_lo +// GFX13: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x01,0x02] + +v_log_f16_e64 v5, exec_hi +// GFX13: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x01,0x02] + +v_log_f16_e64 v5, null +// GFX13: v_log_f16_e64 v5, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x01,0x02] + +v_log_f16_e64 v5, -1 +// GFX13: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x01,0x02] + +v_log_f16_e64 v5, 0.5 mul:2 +// GFX13: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x01,0x0a] + +v_log_f16_e64 v5, src_scc mul:4 +// GFX13: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x01,0x12] + +v_log_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_log_f16_e64 v255, -|0xfe0b| 
clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_log_f32_e64 v5, v1 +// GFX13: v_log_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x01,0x01,0x02] + +v_log_f32_e64 v5, v255 +// GFX13: v_log_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa7,0xd5,0xff,0x01,0x01,0x02] + +v_log_f32_e64 v5, s1 +// GFX13: v_log_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x00,0x01,0x02] + +v_log_f32_e64 v5, s105 +// GFX13: v_log_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa7,0xd5,0x69,0x00,0x01,0x02] + +v_log_f32_e64 v5, vcc_lo +// GFX13: v_log_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x6a,0x00,0x01,0x02] + +v_log_f32_e64 v5, vcc_hi +// GFX13: v_log_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x6b,0x00,0x01,0x02] + +v_log_f32_e64 v5, ttmp15 +// GFX13: v_log_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa7,0xd5,0x7b,0x00,0x01,0x02] + +v_log_f32_e64 v5, m0 +// GFX13: v_log_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa7,0xd5,0x7d,0x00,0x01,0x02] + +v_log_f32_e64 v5, exec_lo +// GFX13: v_log_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x7e,0x00,0x01,0x02] + +v_log_f32_e64 v5, exec_hi +// GFX13: v_log_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x7f,0x00,0x01,0x02] + +v_log_f32_e64 v5, null +// GFX13: v_log_f32_e64 v5, null ; encoding: [0x05,0x00,0xa7,0xd5,0x7c,0x00,0x01,0x02] + +v_log_f32_e64 v5, -1 +// GFX13: v_log_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa7,0xd5,0xc1,0x00,0x01,0x02] + +v_log_f32_e64 v5, 0.5 mul:2 +// GFX13: v_log_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa7,0xd5,0xf0,0x00,0x01,0x0a] + +v_log_f32_e64 v5, src_scc mul:4 +// GFX13: v_log_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa7,0xd5,0xfd,0x00,0x01,0x12] + +v_log_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_log_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa7,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_mov_b32_e64 v5, v1 +// GFX13: v_mov_b32_e64 v5, v1 ; encoding: 
[0x05,0x00,0x81,0xd5,0x01,0x01,0x01,0x02] + +v_mov_b32_e64 v5, v255 +// GFX13: v_mov_b32_e64 v5, v255 ; encoding: [0x05,0x00,0x81,0xd5,0xff,0x01,0x01,0x02] + +v_mov_b32_e64 v5, s1 +// GFX13: v_mov_b32_e64 v5, s1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x00,0x01,0x02] + +v_mov_b32_e64 v5, s105 +// GFX13: v_mov_b32_e64 v5, s105 ; encoding: [0x05,0x00,0x81,0xd5,0x69,0x00,0x01,0x02] + +v_mov_b32_e64 v5, vcc_lo +// GFX13: v_mov_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd5,0x6a,0x00,0x01,0x02] + +v_mov_b32_e64 v5, vcc_hi +// GFX13: v_mov_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd5,0x6b,0x00,0x01,0x02] + +v_mov_b32_e64 v5, ttmp15 +// GFX13: v_mov_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd5,0x7b,0x00,0x01,0x02] + +v_mov_b32_e64 v5, m0 +// GFX13: v_mov_b32_e64 v5, m0 ; encoding: [0x05,0x00,0x81,0xd5,0x7d,0x00,0x01,0x02] + +v_mov_b32_e64 v5, exec_lo +// GFX13: v_mov_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x81,0xd5,0x7e,0x00,0x01,0x02] + +v_mov_b32_e64 v5, exec_hi +// GFX13: v_mov_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x81,0xd5,0x7f,0x00,0x01,0x02] + +v_mov_b32_e64 v5, null +// GFX13: v_mov_b32_e64 v5, null ; encoding: [0x05,0x00,0x81,0xd5,0x7c,0x00,0x01,0x02] + +v_mov_b32_e64 v5, -1 +// GFX13: v_mov_b32_e64 v5, -1 ; encoding: [0x05,0x00,0x81,0xd5,0xc1,0x00,0x01,0x02] + +v_mov_b32_e64 v5, 0.5 +// GFX13: v_mov_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x81,0xd5,0xf0,0x00,0x01,0x02] + +v_mov_b32_e64 v5, src_scc +// GFX13: v_mov_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0x81,0xd5,0xfd,0x00,0x01,0x02] + +v_mov_b32_e64 v255, 0xaf123456 +// GFX13: v_mov_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0x81,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_nop_e64 +// GFX13: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x80,0x00,0x01,0x02] + +v_not_b16_e64 v5, v1 +// GFX13: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x01,0x02] + +v_not_b16_e64 v5, v255 +// GFX13: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x01,0x02] + 
+v_not_b16_e64 v5, s1 +// GFX13: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x01,0x02] + +v_not_b16_e64 v5, s105 +// GFX13: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x01,0x02] + +v_not_b16_e64 v5, vcc_lo +// GFX13: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x01,0x02] + +v_not_b16_e64 v5, vcc_hi +// GFX13: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x01,0x02] + +v_not_b16_e64 v5, ttmp15 +// GFX13: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x01,0x02] + +v_not_b16_e64 v5, m0 +// GFX13: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x01,0x02] + +v_not_b16_e64 v5, exec_lo +// GFX13: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x01,0x02] + +v_not_b16_e64 v5, exec_hi +// GFX13: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x01,0x02] + +v_not_b16_e64 v5, null +// GFX13: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x01,0x02] + +v_not_b16_e64 v5, -1 +// GFX13: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x01,0x02] + +v_not_b16_e64 v5, 0.5 +// GFX13-ASM: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x01,0x02] +// GFX13-DIS: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00] + +v_not_b16_e64 v5, src_scc +// GFX13: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x01,0x02] + +v_not_b16_e64 v255, 0xfe0b +// GFX13: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_not_b32_e64 v5, v1 +// GFX13: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x01,0x02] + +v_not_b32_e64 v5, v255 +// GFX13: v_not_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb7,0xd5,0xff,0x01,0x01,0x02] + +v_not_b32_e64 v5, s1 +// GFX13: v_not_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x00,0x01,0x02] + +v_not_b32_e64 v5, 
s105 +// GFX13: v_not_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb7,0xd5,0x69,0x00,0x01,0x02] + +v_not_b32_e64 v5, vcc_lo +// GFX13: v_not_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x6a,0x00,0x01,0x02] + +v_not_b32_e64 v5, vcc_hi +// GFX13: v_not_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb7,0xd5,0x6b,0x00,0x01,0x02] + +v_not_b32_e64 v5, ttmp15 +// GFX13: v_not_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb7,0xd5,0x7b,0x00,0x01,0x02] + +v_not_b32_e64 v5, m0 +// GFX13: v_not_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb7,0xd5,0x7d,0x00,0x01,0x02] + +v_not_b32_e64 v5, exec_lo +// GFX13: v_not_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x7e,0x00,0x01,0x02] + +v_not_b32_e64 v5, exec_hi +// GFX13: v_not_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb7,0xd5,0x7f,0x00,0x01,0x02] + +v_not_b32_e64 v5, null +// GFX13: v_not_b32_e64 v5, null ; encoding: [0x05,0x00,0xb7,0xd5,0x7c,0x00,0x01,0x02] + +v_not_b32_e64 v5, -1 +// GFX13: v_not_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb7,0xd5,0xc1,0x00,0x01,0x02] + +v_not_b32_e64 v5, 0.5 +// GFX13: v_not_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb7,0xd5,0xf0,0x00,0x01,0x02] + +v_not_b32_e64 v5, src_scc +// GFX13: v_not_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb7,0xd5,0xfd,0x00,0x01,0x02] + +v_not_b32_e64 v255, 0xaf123456 +// GFX13: v_not_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb7,0xd5,0xff,0x00,0x01,0x02,0x56,0x34,0x12,0xaf] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:0 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:1 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_permlane16_swap_b32 v1, v2 fi:0 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 ; encoding: 
[0x01,0x00,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_permlane16_swap_b32 v1, v2 fi:1 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_permlane16_swap_b32_e64 v1, v2 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 +// GFX13: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x01,0x02] + +v_pipeflush_e64 +// GFX13: v_pipeflush ; encoding: [0x00,0x00,0x9b,0xd5,0x80,0x00,0x01,0x02] + +v_prng_b32_e64 v5, -1 +// GFX13: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x01,0x02] + +v_prng_b32_e64 v5, exec_hi +// GFX13: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x01,0x02] + +v_prng_b32_e64 v5, exec_lo +// GFX13: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x01,0x02] + +v_prng_b32_e64 v5, m0 +// GFX13: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x01,0x02] + +v_prng_b32_e64 v5, null +// GFX13: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x01,0x02] + +v_prng_b32_e64 v5, s1 +// GFX13: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x01,0x02] + +v_prng_b32_e64 v5, s105 +// GFX13: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x01,0x02] + +v_prng_b32_e64 v5, ttmp15 +// GFX13: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x01,0x02] + +v_prng_b32_e64 v5, v1 +// GFX13: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x01,0x02] + +v_prng_b32_e64 v5, v255 +// GFX13: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x01,0x02] + +v_prng_b32_e64 v5, vcc_hi +// GFX13: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x01,0x02] + +v_prng_b32_e64 v5, vcc_lo +// GFX13: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x01,0x02] + +v_rcp_bf16_e64 
v5, -1 +// GFX13: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, exec_hi +// GFX13: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, exec_lo +// GFX13: v_rcp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, m0 +// GFX13: v_rcp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, null +// GFX13: v_rcp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, s1 +// GFX13: v_rcp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, s105 +// GFX13: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, ttmp15 +// GFX13: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, v1 +// GFX13: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x01,0x02] + +v_rcp_bf16_e64 v5, v255 +// GFX13: v_rcp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x01,0x02] + +v_rcp_bf16_e64 v5, vcc_hi +// GFX13: v_rcp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x01,0x02] + +v_rcp_bf16_e64 v5, vcc_lo +// GFX13: v_rcp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, v1 +// GFX13: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x01,0x02] + +v_rcp_f16_e64 v5, v255 +// GFX13: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x01,0x02] + +v_rcp_f16_e64 v5, s1 +// GFX13: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, s105 +// GFX13: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, vcc_lo +// GFX13: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, vcc_hi +// GFX13: 
v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, ttmp15 +// GFX13: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, m0 +// GFX13: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, exec_lo +// GFX13: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, exec_hi +// GFX13: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, null +// GFX13: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, -1 +// GFX13: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x01,0x02] + +v_rcp_f16_e64 v5, 0.5 mul:2 +// GFX13: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x01,0x0a] + +v_rcp_f16_e64 v5, src_scc mul:4 +// GFX13: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x01,0x12] + +v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_rcp_f32_e64 v5, v1 +// GFX13: v_rcp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x01,0x01,0x02] + +v_rcp_f32_e64 v5, v255 +// GFX13: v_rcp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xaa,0xd5,0xff,0x01,0x01,0x02] + +v_rcp_f32_e64 v5, s1 +// GFX13: v_rcp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, s105 +// GFX13: v_rcp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xaa,0xd5,0x69,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, vcc_lo +// GFX13: v_rcp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x6a,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, vcc_hi +// GFX13: v_rcp_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x6b,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, ttmp15 +// GFX13: v_rcp_f32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xaa,0xd5,0x7b,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, m0 +// GFX13: v_rcp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xaa,0xd5,0x7d,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, exec_lo +// GFX13: v_rcp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x7e,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, exec_hi +// GFX13: v_rcp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x7f,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, null +// GFX13: v_rcp_f32_e64 v5, null ; encoding: [0x05,0x00,0xaa,0xd5,0x7c,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, -1 +// GFX13: v_rcp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xaa,0xd5,0xc1,0x00,0x01,0x02] + +v_rcp_f32_e64 v5, 0.5 mul:2 +// GFX13: v_rcp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xaa,0xd5,0xf0,0x00,0x01,0x0a] + +v_rcp_f32_e64 v5, src_scc mul:4 +// GFX13: v_rcp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xaa,0xd5,0xfd,0x00,0x01,0x12] + +v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xaa,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_rcp_f64_e64 v[5:6], v[1:2] +// GFX13: v_rcp_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0xaf,0xd5,0x01,0x01,0x01,0x02] + +v_rcp_f64_e64 v[5:6], v[254:255] +// GFX13: v_rcp_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0xaf,0xd5,0xfe,0x01,0x01,0x02] + +v_rcp_f64_e64 v[5:6], s[2:3] +// GFX13: v_rcp_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0xaf,0xd5,0x02,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], s[104:105] +// GFX13: v_rcp_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0xaf,0xd5,0x68,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], vcc +// GFX13: v_rcp_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0xaf,0xd5,0x6a,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_rcp_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0xaf,0xd5,0x7a,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], exec +// GFX13: v_rcp_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0xaf,0xd5,0x7e,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], null +// GFX13: 
v_rcp_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0xaf,0xd5,0x7c,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], -1 +// GFX13: v_rcp_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0xaf,0xd5,0xc1,0x00,0x01,0x02] + +v_rcp_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_rcp_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0xaf,0xd5,0xf0,0x00,0x01,0x0a] + +v_rcp_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_rcp_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0xaf,0xd5,0xfd,0x00,0x01,0x32] + +v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xaf,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_rcp_iflag_f32_e64 v5, v1 +// GFX13: v_rcp_iflag_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x01,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, v255 +// GFX13: v_rcp_iflag_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xab,0xd5,0xff,0x01,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, s1 +// GFX13: v_rcp_iflag_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, s105 +// GFX13: v_rcp_iflag_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xab,0xd5,0x69,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, vcc_lo +// GFX13: v_rcp_iflag_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xab,0xd5,0x6a,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, vcc_hi +// GFX13: v_rcp_iflag_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xab,0xd5,0x6b,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, ttmp15 +// GFX13: v_rcp_iflag_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xab,0xd5,0x7b,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, m0 +// GFX13: v_rcp_iflag_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xab,0xd5,0x7d,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, exec_lo +// GFX13: v_rcp_iflag_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xab,0xd5,0x7e,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, exec_hi +// GFX13: v_rcp_iflag_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xab,0xd5,0x7f,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, null +// GFX13: v_rcp_iflag_f32_e64 
v5, null ; encoding: [0x05,0x00,0xab,0xd5,0x7c,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, -1 +// GFX13: v_rcp_iflag_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xab,0xd5,0xc1,0x00,0x01,0x02] + +v_rcp_iflag_f32_e64 v5, 0.5 mul:2 +// GFX13: v_rcp_iflag_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xab,0xd5,0xf0,0x00,0x01,0x0a] + +v_rcp_iflag_f32_e64 v5, src_scc mul:4 +// GFX13: v_rcp_iflag_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xab,0xd5,0xfd,0x00,0x01,0x12] + +v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_rndne_f16_e64 v5, v1 +// GFX13: v_rndne_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x01,0x02] + +v_rndne_f16_e64 v5, v255 +// GFX13: v_rndne_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x01,0x02] + +v_rndne_f16_e64 v5, s1 +// GFX13: v_rndne_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, s105 +// GFX13: v_rndne_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, vcc_lo +// GFX13: v_rndne_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, vcc_hi +// GFX13: v_rndne_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, ttmp15 +// GFX13: v_rndne_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, m0 +// GFX13: v_rndne_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, exec_lo +// GFX13: v_rndne_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, exec_hi +// GFX13: v_rndne_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, null +// GFX13: v_rndne_f16_e64 v5, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, -1 +// 
GFX13: v_rndne_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x01,0x02] + +v_rndne_f16_e64 v5, 0.5 mul:2 +// GFX13: v_rndne_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x01,0x0a] + +v_rndne_f16_e64 v5, src_scc mul:4 +// GFX13: v_rndne_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x01,0x12] + +v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_rndne_f32_e64 v5, v1 +// GFX13: v_rndne_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x01,0x02] + +v_rndne_f32_e64 v5, v255 +// GFX13: v_rndne_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa3,0xd5,0xff,0x01,0x01,0x02] + +v_rndne_f32_e64 v5, s1 +// GFX13: v_rndne_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, s105 +// GFX13: v_rndne_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa3,0xd5,0x69,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, vcc_lo +// GFX13: v_rndne_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x6a,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, vcc_hi +// GFX13: v_rndne_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x6b,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, ttmp15 +// GFX13: v_rndne_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa3,0xd5,0x7b,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, m0 +// GFX13: v_rndne_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa3,0xd5,0x7d,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, exec_lo +// GFX13: v_rndne_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x7e,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, exec_hi +// GFX13: v_rndne_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x7f,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, null +// GFX13: v_rndne_f32_e64 v5, null ; encoding: [0x05,0x00,0xa3,0xd5,0x7c,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, -1 +// GFX13: v_rndne_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa3,0xd5,0xc1,0x00,0x01,0x02] + +v_rndne_f32_e64 v5, 0.5 mul:2 +// GFX13: 
v_rndne_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa3,0xd5,0xf0,0x00,0x01,0x0a] + +v_rndne_f32_e64 v5, src_scc mul:4 +// GFX13: v_rndne_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa3,0xd5,0xfd,0x00,0x01,0x12] + +v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa3,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_rndne_f64_e64 v[5:6], v[1:2] +// GFX13: v_rndne_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0x99,0xd5,0x01,0x01,0x01,0x02] + +v_rndne_f64_e64 v[5:6], v[254:255] +// GFX13: v_rndne_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0x99,0xd5,0xfe,0x01,0x01,0x02] + +v_rndne_f64_e64 v[5:6], s[2:3] +// GFX13: v_rndne_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0x99,0xd5,0x02,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], s[104:105] +// GFX13: v_rndne_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0x99,0xd5,0x68,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], vcc +// GFX13: v_rndne_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0x99,0xd5,0x6a,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_rndne_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0x99,0xd5,0x7a,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], exec +// GFX13: v_rndne_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0x99,0xd5,0x7e,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], null +// GFX13: v_rndne_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0x99,0xd5,0x7c,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], -1 +// GFX13: v_rndne_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0x99,0xd5,0xc1,0x00,0x01,0x02] + +v_rndne_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_rndne_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x99,0xd5,0xf0,0x00,0x01,0x0a] + +v_rndne_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_rndne_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0x99,0xd5,0xfd,0x00,0x01,0x32] + +v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: 
[0xfe,0x80,0x99,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_rsq_bf16_e64 v5, -1 +// GFX13: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, exec_hi +// GFX13: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, exec_lo +// GFX13: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, m0 +// GFX13: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, null +// GFX13: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, s1 +// GFX13: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, s105 +// GFX13: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, ttmp15 +// GFX13: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, v1 +// GFX13: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x01,0x02] + +v_rsq_bf16_e64 v5, v255 +// GFX13: v_rsq_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x01,0x02] + +v_rsq_bf16_e64 v5, vcc_hi +// GFX13: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x01,0x02] + +v_rsq_bf16_e64 v5, vcc_lo +// GFX13: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, v1 +// GFX13: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x01,0x02] + +v_rsq_f16_e64 v5, v255 +// GFX13: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x01,0x02] + +v_rsq_f16_e64 v5, s1 +// GFX13: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, s105 +// GFX13: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, vcc_lo +// GFX13: v_rsq_f16_e64 v5, vcc_lo ; encoding: 
[0x05,0x00,0xd6,0xd5,0x6a,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, vcc_hi +// GFX13: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, ttmp15 +// GFX13: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, m0 +// GFX13: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, exec_lo +// GFX13: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, exec_hi +// GFX13: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, null +// GFX13: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, -1 +// GFX13: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x01,0x02] + +v_rsq_f16_e64 v5, 0.5 mul:2 +// GFX13: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x01,0x0a] + +v_rsq_f16_e64 v5, src_scc mul:4 +// GFX13: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x01,0x12] + +v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_rsq_f32_e64 v5, v1 +// GFX13: v_rsq_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x01,0x01,0x02] + +v_rsq_f32_e64 v5, v255 +// GFX13: v_rsq_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xae,0xd5,0xff,0x01,0x01,0x02] + +v_rsq_f32_e64 v5, s1 +// GFX13: v_rsq_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, s105 +// GFX13: v_rsq_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xae,0xd5,0x69,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, vcc_lo +// GFX13: v_rsq_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xae,0xd5,0x6a,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, vcc_hi +// GFX13: v_rsq_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xae,0xd5,0x6b,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, ttmp15 +// 
GFX13: v_rsq_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xae,0xd5,0x7b,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, m0 +// GFX13: v_rsq_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xae,0xd5,0x7d,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, exec_lo +// GFX13: v_rsq_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xae,0xd5,0x7e,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, exec_hi +// GFX13: v_rsq_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xae,0xd5,0x7f,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, null +// GFX13: v_rsq_f32_e64 v5, null ; encoding: [0x05,0x00,0xae,0xd5,0x7c,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, -1 +// GFX13: v_rsq_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xae,0xd5,0xc1,0x00,0x01,0x02] + +v_rsq_f32_e64 v5, 0.5 mul:2 +// GFX13: v_rsq_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xae,0xd5,0xf0,0x00,0x01,0x0a] + +v_rsq_f32_e64 v5, src_scc mul:4 +// GFX13: v_rsq_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xae,0xd5,0xfd,0x00,0x01,0x12] + +v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xae,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_rsq_f64_e64 v[5:6], v[1:2] +// GFX13: v_rsq_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0xb1,0xd5,0x01,0x01,0x01,0x02] + +v_rsq_f64_e64 v[5:6], v[254:255] +// GFX13: v_rsq_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0xb1,0xd5,0xfe,0x01,0x01,0x02] + +v_rsq_f64_e64 v[5:6], s[2:3] +// GFX13: v_rsq_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0xb1,0xd5,0x02,0x00,0x01,0x02] + +v_rsq_f64_e64 v[5:6], s[104:105] +// GFX13: v_rsq_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0xb1,0xd5,0x68,0x00,0x01,0x02] + +v_rsq_f64_e64 v[5:6], vcc +// GFX13: v_rsq_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0xb1,0xd5,0x6a,0x00,0x01,0x02] + +v_rsq_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_rsq_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0xb1,0xd5,0x7a,0x00,0x01,0x02] + +v_rsq_f64_e64 v[5:6], exec +// GFX13: v_rsq_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0xb1,0xd5,0x7e,0x00,0x01,0x02] + 
+v_rsq_f64_e64 v[5:6], null +// GFX13: v_rsq_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0xb1,0xd5,0x7c,0x00,0x01,0x02] + +v_rsq_f64_e64 v[5:6], -1 +// GFX13: v_rsq_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0xb1,0xd5,0xc1,0x00,0x01,0x02] + +v_rsq_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_rsq_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0xb1,0xd5,0xf0,0x00,0x01,0x0a] + +v_rsq_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_rsq_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0xb1,0xd5,0xfd,0x00,0x01,0x32] + +v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_sat_pk4_i4_i8 v150, 0x1234 +// GFX13: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x01,0x02,0x34,0x12,0x00,0x00] + +v_sat_pk4_i4_i8 v150, 2 +// GFX13: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x01,0x02] + +v_sat_pk4_i4_i8 v150, s2 +// GFX13: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x01,0x02] + +v_sat_pk4_i4_i8 v150, v2 +// GFX13: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x01,0x02] + +v_sat_pk4_u4_u8 v150, 0x1234 +// GFX13: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x01,0x02,0x34,0x12,0x00,0x00] + +v_sat_pk4_u4_u8 v150, 2 +// GFX13: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x01,0x02] + +v_sat_pk4_u4_u8 v150, s2 +// GFX13: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x01,0x02] + +v_sat_pk4_u4_u8 v150, v2 +// GFX13: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, v1 +// GFX13: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, v255 +// GFX13: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x01,0x02] + 
+v_sat_pk_u8_i16_e64 v5, s1 +// GFX13: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, s105 +// GFX13: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, vcc_lo +// GFX13: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, vcc_hi +// GFX13: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, ttmp15 +// GFX13: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, m0 +// GFX13: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, exec_lo +// GFX13: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, exec_hi +// GFX13: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, null +// GFX13: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, -1 +// GFX13: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, 0.5 +// GFX13: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v5, src_scc +// GFX13: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x01,0x02] + +v_sat_pk_u8_i16_e64 v255, 0xfe0b +// GFX13: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x01,0x02,0x0b,0xfe,0x00,0x00] + +v_sin_bf16_e64 v5, -1 +// GFX13: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, exec_hi +// GFX13: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, exec_lo +// GFX13: v_sin_bf16_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0xfe,0xd5,0x7e,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, m0 +// GFX13: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, null +// GFX13: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, s1 +// GFX13: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, s105 +// GFX13: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, ttmp15 +// GFX13: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, v1 +// GFX13: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x01,0x02] + +v_sin_bf16_e64 v5, v255 +// GFX13: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x01,0x02] + +v_sin_bf16_e64 v5, vcc_hi +// GFX13: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x01,0x02] + +v_sin_bf16_e64 v5, vcc_lo +// GFX13: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x01,0x02] + +v_sin_f16_e64 v5, v1 +// GFX13: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x01,0x02] + +v_sin_f16_e64 v5, v255 +// GFX13: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x01,0x02] + +v_sin_f16_e64 v5, s1 +// GFX13: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x01,0x02] + +v_sin_f16_e64 v5, s105 +// GFX13: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x01,0x02] + +v_sin_f16_e64 v5, vcc_lo +// GFX13: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x01,0x02] + +v_sin_f16_e64 v5, vcc_hi +// GFX13: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x01,0x02] + +v_sin_f16_e64 v5, ttmp15 +// GFX13: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x01,0x02] + +v_sin_f16_e64 v5, m0 +// GFX13: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x01,0x02] + 
+v_sin_f16_e64 v5, exec_lo +// GFX13: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x01,0x02] + +v_sin_f16_e64 v5, exec_hi +// GFX13: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x01,0x02] + +v_sin_f16_e64 v5, null +// GFX13: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x01,0x02] + +v_sin_f16_e64 v5, -1 +// GFX13: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x01,0x02] + +v_sin_f16_e64 v5, 0.5 mul:2 +// GFX13: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x01,0x0a] + +v_sin_f16_e64 v5, src_scc mul:4 +// GFX13: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x01,0x12] + +v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_sin_f32_e64 v5, v1 +// GFX13: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x01,0x02] + +v_sin_f32_e64 v5, v255 +// GFX13: v_sin_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb5,0xd5,0xff,0x01,0x01,0x02] + +v_sin_f32_e64 v5, s1 +// GFX13: v_sin_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x00,0x01,0x02] + +v_sin_f32_e64 v5, s105 +// GFX13: v_sin_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb5,0xd5,0x69,0x00,0x01,0x02] + +v_sin_f32_e64 v5, vcc_lo +// GFX13: v_sin_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x6a,0x00,0x01,0x02] + +v_sin_f32_e64 v5, vcc_hi +// GFX13: v_sin_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x6b,0x00,0x01,0x02] + +v_sin_f32_e64 v5, ttmp15 +// GFX13: v_sin_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb5,0xd5,0x7b,0x00,0x01,0x02] + +v_sin_f32_e64 v5, m0 +// GFX13: v_sin_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb5,0xd5,0x7d,0x00,0x01,0x02] + +v_sin_f32_e64 v5, exec_lo +// GFX13: v_sin_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x7e,0x00,0x01,0x02] + +v_sin_f32_e64 v5, exec_hi +// GFX13: v_sin_f32_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xb5,0xd5,0x7f,0x00,0x01,0x02] + +v_sin_f32_e64 v5, null +// GFX13: v_sin_f32_e64 v5, null ; encoding: [0x05,0x00,0xb5,0xd5,0x7c,0x00,0x01,0x02] + +v_sin_f32_e64 v5, -1 +// GFX13: v_sin_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb5,0xd5,0xc1,0x00,0x01,0x02] + +v_sin_f32_e64 v5, 0.5 mul:2 +// GFX13: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x01,0x0a] + +v_sin_f32_e64 v5, src_scc mul:4 +// GFX13: v_sin_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb5,0xd5,0xfd,0x00,0x01,0x12] + +v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_sqrt_bf16_e64 v5, -1 +// GFX13: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, exec_hi +// GFX13: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, exec_lo +// GFX13: v_sqrt_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, m0 +// GFX13: v_sqrt_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, null +// GFX13: v_sqrt_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, s1 +// GFX13: v_sqrt_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, s105 +// GFX13: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, ttmp15 +// GFX13: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, v1 +// GFX13: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x01,0x02] + +v_sqrt_bf16_e64 v5, v255 +// GFX13: v_sqrt_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x01,0x02] + +v_sqrt_bf16_e64 v5, vcc_hi +// GFX13: v_sqrt_bf16_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xfa,0xd5,0x6b,0x00,0x01,0x02] + +v_sqrt_bf16_e64 v5, vcc_lo +// GFX13: v_sqrt_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, v1 +// GFX13: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x01,0x02] + +v_sqrt_f16_e64 v5, v255 +// GFX13: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x01,0x02] + +v_sqrt_f16_e64 v5, s1 +// GFX13: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, s105 +// GFX13: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, vcc_lo +// GFX13: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, vcc_hi +// GFX13: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, ttmp15 +// GFX13: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, m0 +// GFX13: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, exec_lo +// GFX13: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, exec_hi +// GFX13: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, null +// GFX13: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, -1 +// GFX13: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x01,0x02] + +v_sqrt_f16_e64 v5, 0.5 mul:2 +// GFX13: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x01,0x0a] + +v_sqrt_f16_e64 v5, src_scc mul:4 +// GFX13: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x01,0x12] + +v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + 
+v_sqrt_f32_e64 v5, v1 +// GFX13: v_sqrt_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x01,0x01,0x02] + +v_sqrt_f32_e64 v5, v255 +// GFX13: v_sqrt_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb3,0xd5,0xff,0x01,0x01,0x02] + +v_sqrt_f32_e64 v5, s1 +// GFX13: v_sqrt_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, s105 +// GFX13: v_sqrt_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb3,0xd5,0x69,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, vcc_lo +// GFX13: v_sqrt_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x6a,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, vcc_hi +// GFX13: v_sqrt_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x6b,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, ttmp15 +// GFX13: v_sqrt_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb3,0xd5,0x7b,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, m0 +// GFX13: v_sqrt_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb3,0xd5,0x7d,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, exec_lo +// GFX13: v_sqrt_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x7e,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, exec_hi +// GFX13: v_sqrt_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x7f,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, null +// GFX13: v_sqrt_f32_e64 v5, null ; encoding: [0x05,0x00,0xb3,0xd5,0x7c,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, -1 +// GFX13: v_sqrt_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb3,0xd5,0xc1,0x00,0x01,0x02] + +v_sqrt_f32_e64 v5, 0.5 mul:2 +// GFX13: v_sqrt_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb3,0xd5,0xf0,0x00,0x01,0x0a] + +v_sqrt_f32_e64 v5, src_scc mul:4 +// GFX13: v_sqrt_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb3,0xd5,0xfd,0x00,0x01,0x12] + +v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb3,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_sqrt_f64_e64 v[5:6], v[1:2] +// GFX13: v_sqrt_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0xb4,0xd5,0x01,0x01,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], 
v[254:255] +// GFX13: v_sqrt_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0xb4,0xd5,0xfe,0x01,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], s[2:3] +// GFX13: v_sqrt_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0xb4,0xd5,0x02,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], s[104:105] +// GFX13: v_sqrt_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0xb4,0xd5,0x68,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], vcc +// GFX13: v_sqrt_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0xb4,0xd5,0x6a,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_sqrt_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0xb4,0xd5,0x7a,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], exec +// GFX13: v_sqrt_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0xb4,0xd5,0x7e,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], null +// GFX13: v_sqrt_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0xb4,0xd5,0x7c,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], -1 +// GFX13: v_sqrt_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0xb4,0xd5,0xc1,0x00,0x01,0x02] + +v_sqrt_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_sqrt_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0xb4,0xd5,0xf0,0x00,0x01,0x0a] + +v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0xb4,0xd5,0xfd,0x00,0x01,0x32] + +v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] + +v_tanh_bf16_e64 v5, -1 +// GFX13: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, exec_hi +// GFX13: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, exec_lo +// GFX13: v_tanh_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, m0 +// GFX13: v_tanh_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xca,0xd5,0x7d,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, null +// GFX13: 
v_tanh_bf16_e64 v5, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, s1 +// GFX13: v_tanh_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, s105 +// GFX13: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, ttmp15 +// GFX13: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, v1 +// GFX13: v_tanh_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x01,0x02] + +v_tanh_bf16_e64 v5, v255 +// GFX13: v_tanh_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x01,0x02] + +v_tanh_bf16_e64 v5, vcc_hi +// GFX13: v_tanh_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x01,0x02] + +v_tanh_bf16_e64 v5, vcc_lo +// GFX13: v_tanh_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x01,0x02] + +v_tanh_f16_e64 v255, -|0x8000| clamp div:2 +// GFX13: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x01,0x3a,0x00,0x80,0x00,0x00] + +v_tanh_f16_e64 v5, -1 +// GFX13: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, 0.5 mul:2 +// GFX13: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x01,0x0a] + +v_tanh_f16_e64 v5, exec_hi +// GFX13: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, exec_lo +// GFX13: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, m0 +// GFX13: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, null +// GFX13: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, s1 +// GFX13: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, s105 +// GFX13: v_tanh_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0x9f,0xd5,0x69,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, src_scc mul:4 +// GFX13: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x01,0x12] + +v_tanh_f16_e64 v5, ttmp15 +// GFX13: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, v1 +// GFX13: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x01,0x02] + +v_tanh_f16_e64 v5, v255 +// GFX13: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x01,0x02] + +v_tanh_f16_e64 v5, vcc_hi +// GFX13: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x01,0x02] + +v_tanh_f16_e64 v5, vcc_lo +// GFX13: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x01,0x02] + +v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_tanh_f32_e64 v5, -1 +// GFX13: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, 0.5 mul:2 +// GFX13: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x01,0x0a] + +v_tanh_f32_e64 v5, exec_hi +// GFX13: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, exec_lo +// GFX13: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, m0 +// GFX13: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, null +// GFX13: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, s1 +// GFX13: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, s105 +// GFX13: v_tanh_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, src_scc mul:4 +// GFX13: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: 
[0x05,0x00,0x9e,0xd5,0xfd,0x00,0x01,0x12] + +v_tanh_f32_e64 v5, ttmp15 +// GFX13: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, v1 +// GFX13: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x01,0x02] + +v_tanh_f32_e64 v5, v255 +// GFX13: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x01,0x02] + +v_tanh_f32_e64 v5, vcc_hi +// GFX13: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x01,0x02] + +v_tanh_f32_e64 v5, vcc_lo +// GFX13: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, v1 +// GFX13: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x01,0x02] + +v_trunc_f16_e64 v5, v255 +// GFX13: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x01,0x02] + +v_trunc_f16_e64 v5, s1 +// GFX13: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, s105 +// GFX13: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, vcc_lo +// GFX13: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, vcc_hi +// GFX13: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, ttmp15 +// GFX13: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, m0 +// GFX13: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, exec_lo +// GFX13: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, exec_hi +// GFX13: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, null +// GFX13: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, -1 +// GFX13: v_trunc_f16_e64 v5, -1 
; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x01,0x02] + +v_trunc_f16_e64 v5, 0.5 mul:2 +// GFX13: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x01,0x0a] + +v_trunc_f16_e64 v5, src_scc mul:4 +// GFX13: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x01,0x12] + +v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX13: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x01,0x3a,0x0b,0xfe,0x00,0x00] + +v_trunc_f32_e64 v5, v1 +// GFX13: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x01,0x02] + +v_trunc_f32_e64 v5, v255 +// GFX13: v_trunc_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa1,0xd5,0xff,0x01,0x01,0x02] + +v_trunc_f32_e64 v5, s1 +// GFX13: v_trunc_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, s105 +// GFX13: v_trunc_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa1,0xd5,0x69,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, vcc_lo +// GFX13: v_trunc_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x6a,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, vcc_hi +// GFX13: v_trunc_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x6b,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, ttmp15 +// GFX13: v_trunc_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa1,0xd5,0x7b,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, m0 +// GFX13: v_trunc_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa1,0xd5,0x7d,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, exec_lo +// GFX13: v_trunc_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x7e,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, exec_hi +// GFX13: v_trunc_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x7f,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, null +// GFX13: v_trunc_f32_e64 v5, null ; encoding: [0x05,0x00,0xa1,0xd5,0x7c,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, -1 +// GFX13: v_trunc_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa1,0xd5,0xc1,0x00,0x01,0x02] + +v_trunc_f32_e64 v5, 0.5 mul:2 +// GFX13: v_trunc_f32_e64 v5, 0.5 mul:2 ; 
encoding: [0x05,0x00,0xa1,0xd5,0xf0,0x00,0x01,0x0a] + +v_trunc_f32_e64 v5, src_scc mul:4 +// GFX13: v_trunc_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa1,0xd5,0xfd,0x00,0x01,0x12] + +v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX13: v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa1,0xd5,0xff,0x00,0x01,0x3a,0x56,0x34,0x12,0xaf] + +v_trunc_f64_e64 v[5:6], v[1:2] +// GFX13: v_trunc_f64_e64 v[5:6], v[1:2] ; encoding: [0x05,0x00,0x97,0xd5,0x01,0x01,0x01,0x02] + +v_trunc_f64_e64 v[5:6], v[254:255] +// GFX13: v_trunc_f64_e64 v[5:6], v[254:255] ; encoding: [0x05,0x00,0x97,0xd5,0xfe,0x01,0x01,0x02] + +v_trunc_f64_e64 v[5:6], s[2:3] +// GFX13: v_trunc_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0x97,0xd5,0x02,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], s[104:105] +// GFX13: v_trunc_f64_e64 v[5:6], s[104:105] ; encoding: [0x05,0x00,0x97,0xd5,0x68,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], vcc +// GFX13: v_trunc_f64_e64 v[5:6], vcc ; encoding: [0x05,0x00,0x97,0xd5,0x6a,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], ttmp[14:15] +// GFX13: v_trunc_f64_e64 v[5:6], ttmp[14:15] ; encoding: [0x05,0x00,0x97,0xd5,0x7a,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], exec +// GFX13: v_trunc_f64_e64 v[5:6], exec ; encoding: [0x05,0x00,0x97,0xd5,0x7e,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], null +// GFX13: v_trunc_f64_e64 v[5:6], null ; encoding: [0x05,0x00,0x97,0xd5,0x7c,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], -1 +// GFX13: v_trunc_f64_e64 v[5:6], -1 ; encoding: [0x05,0x00,0x97,0xd5,0xc1,0x00,0x01,0x02] + +v_trunc_f64_e64 v[5:6], 0.5 mul:2 +// GFX13: v_trunc_f64_e64 v[5:6], 0.5 mul:2 ; encoding: [0x05,0x00,0x97,0xd5,0xf0,0x00,0x01,0x0a] + +v_trunc_f64_e64 v[5:6], -|src_scc| mul:4 +// GFX13: v_trunc_f64_e64 v[5:6], -|src_scc| mul:4 ; encoding: [0x05,0x01,0x97,0xd5,0xfd,0x00,0x01,0x32] + +v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX13: v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: 
[0xfe,0x80,0x97,0xd5,0xff,0x00,0x01,0x1a,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp16-fake16.s new file mode 100644 index 0000000000000..238e77c6f6dce --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp16-fake16.s @@ -0,0 +1,297 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -disassemble -show-encoding | FileCheck --check-prefix=GFX13 %s + +v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_ceil_f16_e64_dpp 
v5, v1 row_shr:1 +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_shr:15 +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: 
v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_floor_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shr:1 +// GFX13: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_shr:15 +// GFX13: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: 
v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_rcp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_rcp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_rcp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: v_rcp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_shr:1 +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_shr:15 +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_rcp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_rcp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_rcp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_rcp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_rcp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_sqrt_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_sqrt_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: v_sqrt_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_shr:1 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_shr:15 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_sqrt_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_sqrt_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_sqrt_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_rsq_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_rsq_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_rsq_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: v_rsq_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_shr:1 +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_shr:15 +// 
GFX13: v_rsq_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_rsq_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_rsq_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_rsq_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_rsq_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_rsq_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd6,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_log_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_log_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_log_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: v_log_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_log_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_log_f16_e64_dpp v5, v1 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_log_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_log_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_log_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_shr:1 +// GFX13: v_log_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_shr:15 +// GFX13: v_log_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: v_log_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_log_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_log_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_log_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_log_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_log_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_log_f16_e64_dpp v5, v1 mul:4 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_log_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_log_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd7,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX13: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_exp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX13: v_exp_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +v_exp_f16_e64_dpp v5, v1 row_mirror +// GFX13: v_exp_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_half_mirror +// GFX13: v_exp_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_shl:1 +// GFX13: v_exp_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_shl:15 +// GFX13: v_exp_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_shr:1 +// GFX13: v_exp_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_shr:15 +// GFX13: v_exp_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_ror:1 +// GFX13: 
v_exp_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_ror:15 +// GFX13: v_exp_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX13: v_exp_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +v_exp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX13: v_exp_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +v_exp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX13: v_exp_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +v_exp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX13: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd8,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp8-fake16.s new file mode 100644 index 0000000000000..80c9cad7159a0 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_vop3_from_vop1_dpp8-fake16.s @@ -0,0 +1,87 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 -disassemble 
-show-encoding | FileCheck --check-prefix=GFX13 %s + +v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_rcp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_rcp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_rcp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_rcp_f16_e64_dpp v5, v1 mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd4,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_rcp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_rcp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd4,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_rcp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_sqrt_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sqrt_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_sqrt_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_sqrt_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd5,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_sqrt_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_sqrt_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd5,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_sqrt_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd5,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_rsq_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_rsq_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_rsq_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_rsq_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd6,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_rsq_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_rsq_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd6,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_rsq_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd6,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_log_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_log_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_log_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_log_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd7,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_log_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_log_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd7,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_log_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_log_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd7,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +v_exp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX13: v_exp_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +v_exp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX13: v_exp_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd8,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +v_exp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX13: v_exp_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd8,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From c85f29f709da17fba95e2d91e915f5e854ca0690 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Tue, 12 May 2026 09:11:24 +0200 Subject: [PATCH 405/538] [PowerPC] Fix types when emitting 
ppc_altivec_vupklsw (#187789) When lowering BUILD_VECTOR, we produce this intrinsic node, but fail to adjust the input/output types to ensure ISel works. This patch simply adds the necessary bitcasts. Fixes: https://github.com/llvm/llvm-project/issues/175297 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 5 +- llvm/test/CodeGen/PowerPC/pr175297.ll | 92 +++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/PowerPC/pr175297.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0f43555b6bca4..68958a8cf32d5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9809,7 +9809,10 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, getCanonicalConstSplat(SextVal, UseSize, Op.getValueType(), DAG, dl); if (SplatSize != 8) return Res; - return BuildIntrinsicOp(Intrinsic::ppc_altivec_vupklsw, Res, DAG, dl); + SDValue IntrinsicOp = + BuildIntrinsicOp(Intrinsic::ppc_altivec_vupklsw, + DAG.getBitcast(MVT::v4i32, Res), DAG, dl, MVT::v2i64); + return DAG.getBitcast(Op.getValueType(), IntrinsicOp); } // Two instruction sequences. 
diff --git a/llvm/test/CodeGen/PowerPC/pr175297.ll b/llvm/test/CodeGen/PowerPC/pr175297.ll new file mode 100644 index 0000000000000..3c8179275407d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr175297.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s + +define <32 x i16> @backsmith_pure_7(<8 x i8> %conv) { +; CHECK-LABEL: backsmith_pure_7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-NEXT: vspltisw v3, 1 +; CHECK-NEXT: addis r4, r2, .LCPI0_5@toc@ha +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-NEXT: addi r4, r4, .LCPI0_5@toc@l +; CHECK-NEXT: rldimi r6, r5, 32, 0 +; CHECK-NEXT: vupklsw v5, v3 +; CHECK-NEXT: mtvsrd v6, r5 +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI0_3@toc@l +; CHECK-NEXT: lxvd2x vs1, 0, r3 +; CHECK-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; CHECK-NEXT: xxmrgld v2, v2, v5 +; CHECK-NEXT: addi r3, r3, .LCPI0_4@toc@l +; CHECK-NEXT: lxvd2x vs2, 0, r3 +; CHECK-NEXT: addis r3, r2, .LCPI0_6@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI0_6@toc@l +; CHECK-NEXT: lxvd2x vs3, 0, r3 +; CHECK-NEXT: xxswapd v4, vs0 +; CHECK-NEXT: xxspltd vs0, v5, 1 +; CHECK-NEXT: mffprwz r3, f0 +; CHECK-NEXT: xxmrgld v3, v4, v5 +; CHECK-NEXT: xxswapd v0, vs1 +; CHECK-NEXT: xxswapd vs1, v3 +; CHECK-NEXT: vperm v0, v2, v4, v0 +; CHECK-NEXT: xxpermdi v4, v4, v2, 2 +; CHECK-NEXT: xxswapd v1, vs2 +; CHECK-NEXT: lxvd2x vs2, 0, r4 +; CHECK-NEXT: mffprwz r4, f0 +; CHECK-NEXT: rldimi r4, r3, 32, 0 +; CHECK-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI0_1@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: mfvsrwz r3, v3 +; CHECK-NEXT: vperm v0, v5, v0, v1 +; CHECK-NEXT: mtvsrd v1, r5 +; 
CHECK-NEXT: addis r5, r2, .LCPI0_2@toc@ha +; CHECK-NEXT: addi r5, r5, .LCPI0_2@toc@l +; CHECK-NEXT: xxswapd v7, vs3 +; CHECK-NEXT: xxmrglw v8, v0, v0 +; CHECK-NEXT: xxmrghw v9, v0, v0 +; CHECK-NEXT: vperm v5, v2, v5, v7 +; CHECK-NEXT: xxswapd v7, v2 +; CHECK-NEXT: vpkudum v2, v2, v3 +; CHECK-NEXT: vmrghh v1, v1, v6 +; CHECK-NEXT: xxswapd v6, vs2 +; CHECK-NEXT: lxvd2x vs2, 0, r5 +; CHECK-NEXT: mffprwz r5, f1 +; CHECK-NEXT: vpkudum v4, v7, v4 +; CHECK-NEXT: vpkudum v7, v3, v7 +; CHECK-NEXT: mtfprd f1, r6 +; CHECK-NEXT: rldimi r5, r3, 32, 0 +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: vperm v6, v8, v0, v6 +; CHECK-NEXT: vmrghh v0, v9, v0 +; CHECK-NEXT: rldimi r3, r3, 32, 0 +; CHECK-NEXT: xxswapd v8, vs2 +; CHECK-NEXT: mtfprd f2, r5 +; CHECK-NEXT: xxswapd v3, vs0 +; CHECK-NEXT: mtfprd f0, r4 +; CHECK-NEXT: xxland v3, v4, v3 +; CHECK-NEXT: xxmrghd v9, vs1, vs0 +; CHECK-NEXT: mtfprd f0, r3 +; CHECK-NEXT: xxmrglw vs1, v0, v6 +; CHECK-NEXT: xxlxor v6, v6, v6 +; CHECK-NEXT: vperm v2, v6, v2, v8 +; CHECK-NEXT: vpkuwum v3, v3, v9 +; CHECK-NEXT: xxmrghd v0, vs0, vs2 +; CHECK-NEXT: xxspltw vs0, v1, 3 +; CHECK-NEXT: vperm v1, v6, v5, v8 +; CHECK-NEXT: vpkuwum v5, v2, v0 +; CHECK-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-NEXT: vpkuwum v4, v7, v1 +; CHECK-NEXT: blr +entry: + %0 = bitcast <8 x i8> %conv to i64 + %vecinit9 = insertelement <2 x i64> splat (i64 1), i64 %0, i64 1 + %shuffle = shufflevector <2 x i64> %vecinit9, <2 x i64> , <32 x i32> + %conv11 = trunc <32 x i64> %shuffle to <32 x i16> + ret <32 x i16> %conv11 +} From e51bb36f668be06089d27ba0f14d1aeedc413584 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 12 May 2026 09:16:49 +0200 Subject: [PATCH 406/538] [AA] Respect potential synchronization effects of inline asm (#196965) Respect potential synchronization effects of inline assembly calls on not-yet-escaped memory. We only do this if the call is both non-nosync and ModRefs "other" memory. 
This is consistent with the atomic memory effects established in https://github.com/llvm/llvm-project/pull/193768 and makes sure that things like readonly/argmemonly continue to work as expected even for frontends that do not emit nosync (which, right now, is all of them). The limitation to inline asm should not actually exist: The issue applies to all calls. This just fixes a particularly important case in a targeted way. (The fact that inline asm memory barrier do not work as expected is a problem for making optimizations of monotonic accesses more aggressive, e.g. it caused issues for https://github.com/llvm/llvm-project/pull/195015.) The ability of inline asm (with a `~{memory}` clobber) to synchronize was explicitly specified in https://github.com/llvm/llvm-project/pull/150191. --- llvm/include/llvm/Analysis/AliasAnalysis.h | 5 ++++ llvm/lib/Analysis/AliasAnalysis.cpp | 6 ++-- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 16 +++++++++- llvm/test/Analysis/BasicAA/atomics.ll | 35 ++++++++++++++++++++++ 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index 4997c41f37273..7e9b17bcfa4d9 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -995,6 +995,11 @@ LLVM_ABI bool isNotVisibleOnUnwind(const Value *Object, LLVM_ABI bool isWritableObject(const Value *Object, bool &ExplicitlyDereferenceableOnly); +/// Get ModRefInfo for a synchronizing operation, such as a fence or stronger +/// than monotonic atomic load/store. +LLVM_ABI ModRefInfo getSyncEffects(AAResults *AA, const MemoryLocation &Loc, + AAQueryInfo &AAQI); + /// A manager for alias analyses. 
/// /// This class can have analyses registered with it and when run, it will run diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 8f311cd0bfeac..9d89a6d90f706 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -458,10 +458,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) { // Helper method implementation //===----------------------------------------------------------------------===// -/// Get ModRefInfo for a synchronizing operation, such as a fence or stronger -/// than monotonic atomic load/store. -static ModRefInfo getSyncEffects(AAResults *AA, const MemoryLocation &Loc, - AAQueryInfo &AAQI) { +ModRefInfo llvm::getSyncEffects(AAResults *AA, const MemoryLocation &Loc, + AAQueryInfo &AAQI) { if (!Loc.Ptr) return ModRefInfo::ModRef; diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 2d54d9815eb61..8b120f0ad1e11 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -959,6 +959,20 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call, ModRefInfo ErrnoMR = ME.getModRef(IRMemLocation::ErrnoMem); ModRefInfo OtherMR = ME.getModRef(IRMemLocation::Other); + // Take into account potential synchronization effects of the call. + // We assume synchronization can not occur if the call does not read/write + // other memory (this in particular ensures that readonly/argmemonly continue + // to work as expected for frontends that do not emit nosync). + // FIXME: This should apply to all calls, but is limited to inline asm to + // limit impact. This ensures that inline asm memory barriers work correctly. 
+ ModRefInfo SyncMR = ModRefInfo::NoModRef; + if (isModAndRefSet(OtherMR) && Call->maySynchronize() && + Call->isInlineAsm()) { + SyncMR = getSyncEffects(&AAQI.AAR, Loc, AAQI); + if (isModAndRefSet(SyncMR)) + return SyncMR; + } + // An identified function-local object that does not escape can only be // accessed via call arguments. Reduce OtherMR (which includes accesses to // escaped memory) based on that. @@ -1002,7 +1016,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call, ArgMR = NewArgMR; } - ModRefInfo Result = ArgMR | OtherMR; + ModRefInfo Result = ArgMR | OtherMR | SyncMR; // Refine accesses to errno memory. if ((ErrnoMR | Result) != Result) { diff --git a/llvm/test/Analysis/BasicAA/atomics.ll b/llvm/test/Analysis/BasicAA/atomics.ll index 9a96b94a34c78..0aedd7f3816ba 100644 --- a/llvm/test/Analysis/BasicAA/atomics.ll +++ b/llvm/test/Analysis/BasicAA/atomics.ll @@ -2,6 +2,7 @@ declare void @escape(ptr) declare noalias ptr @malloc(i64) +declare void @call() ; CHECK-LABEL: Function: alloca_no_escape: ; CHECK: NoModRef: Ptr: i32* %a <-> %1 = atomicrmw add ptr %x, i32 1 monotonic, align 4 @@ -207,3 +208,37 @@ define ptr @malloc_escape_after(ptr %x) { ret ptr %a } + +; CHECK-LABEL: Function: inline_asm +; CHECK: Both ModRef: Ptr: i32* %a <-> call void asm sideeffect "", "~{memory}"() +; CHECK: NoModRef: Ptr: i32* %a <-> call void asm sideeffect "", "~{memory}"() #0 +; CHECK: NoModRef: Ptr: i32* %a <-> call void asm sideeffect "", "~{memory}"() #1 +; CHECK: NoModRef: Ptr: i32* %a <-> call void asm sideeffect "", "~{memory}"() #2 +define ptr @inline_asm() { + %a = call ptr @malloc(i64 4) + store i32 0, ptr %a + + call void asm sideeffect "", "~{memory}"() + call void asm sideeffect "", "~{memory}"() memory(read) + call void asm sideeffect "", "~{memory}"() memory(argmem: readwrite) + call void asm sideeffect "", "~{memory}"() nosync + + ret ptr %a +} + +; CHECK-LABEL: Function: arbitrary_call +; CHECK: NoModRef: Ptr: i32* %a <-> call void @call() +; 
CHECK: NoModRef: Ptr: i32* %a <-> call void @call() #0 +; CHECK: NoModRef: Ptr: i32* %a <-> call void @call() #1 +; CHECK: NoModRef: Ptr: i32* %a <-> call void @call() #2 +define ptr @arbitrary_call() { + %a = call ptr @malloc(i64 4) + store i32 0, ptr %a + + call void @call() + call void @call() memory(read) + call void @call() memory(argmem: readwrite) + call void @call() nosync + + ret ptr %a +} From 51893b4f238af1e0caaf8796042d6abc5e8b9616 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 12 May 2026 00:25:53 -0700 Subject: [PATCH 407/538] [MachineBlockPlacement] Fix use-after-erase (#197109) `ComputedEdges.erase(FoundEdge)` invalidates `FoundEdge`, but the function then returns `FoundEdge->second`. Read the bucket value into a local before erasing. --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 73d040bdaa19a..fffb5fd192771 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -1679,12 +1679,13 @@ MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB, // applicable. 
auto FoundEdge = ComputedEdges.find(BB); if (FoundEdge != ComputedEdges.end()) { - MachineBasicBlock *Succ = FoundEdge->second.BB; + BlockAndTailDupResult Result = FoundEdge->second; ComputedEdges.erase(FoundEdge); - BlockChain *SuccChain = BlockToChain[Succ]; - if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) && - SuccChain != &Chain && Succ == *SuccChain->begin()) - return FoundEdge->second; + BlockChain *SuccChain = BlockToChain[Result.BB]; + if (BB->isSuccessor(Result.BB) && + (!BlockFilter || BlockFilter->count(Result.BB)) && + SuccChain != &Chain && Result.BB == *SuccChain->begin()) + return Result; } // if BB is part of a trellis, Use the trellis to determine the optimal From 58a639d9962026083a00973d319eb2a5a28a611d Mon Sep 17 00:00:00 2001 From: Zeyi Xu Date: Tue, 12 May 2026 15:35:52 +0800 Subject: [PATCH 408/538] [clang-tidy] Remove hicpp module [3/4] (#197076) This is part three of removing the hicpp-* checks. RFC: https://discourse.llvm.org/t/rfc-regarding-the-current-status-of-hicpp-checks/89883 Part of https://github.com/llvm/llvm-project/issues/183462 --- .../clang-tidy/hicpp/HICPPTidyModule.cpp | 24 ----------------- .../tool/check_alphabetical_order_test.py | 16 ++++++----- clang-tools-extra/clangd/TidyFastChecks.inc | 8 ------ clang-tools-extra/docs/ReleaseNotes.rst | 27 +++++++++++-------- .../checks/hicpp/multiway-paths-covered.rst | 8 ------ .../clang-tidy/checks/hicpp/no-assembler.rst | 8 ------ .../clang-tidy/checks/hicpp/no-malloc.rst | 9 ------- .../checks/hicpp/special-member-functions.rst | 7 ----- .../clang-tidy/checks/hicpp/static-assert.rst | 8 ------ .../checks/hicpp/undelegated-constructor.rst | 23 ---------------- .../docs/clang-tidy/checks/hicpp/use-auto.rst | 8 ------ .../clang-tidy/checks/hicpp/use-emplace.rst | 8 ------ .../docs/clang-tidy/checks/list.rst | 8 ------ .../no-assembler-msvc.cpp | 4 +-- .../Inputs/config-file/config-file | 2 +- .../config-file/config-file-list-bracket | 6 ++--- 
.../Inputs/config-file/config-file-list-dash | 6 ++--- .../Inputs/config-file/config-file-spaces | 6 ++--- .../clang-tidy/infrastructure/config-file.cpp | 20 +++++++------- 19 files changed, 48 insertions(+), 158 deletions(-) delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/no-assembler.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/no-malloc.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/special-member-functions.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/static-assert.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/undelegated-constructor.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/use-auto.rst delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/hicpp/use-emplace.rst rename clang-tools-extra/test/clang-tidy/checkers/{hicpp => portability}/no-assembler-msvc.cpp (63%) diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp index 8d08e0cb87046..bc399308c4bc2 100644 --- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp @@ -8,20 +8,12 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" -#include "../bugprone/UndelegatedConstructorCheck.h" -#include "../bugprone/UnhandledCodePathsCheck.h" -#include "../cppcoreguidelines/NoMallocCheck.h" #include "../cppcoreguidelines/ProTypeVarargCheck.h" -#include "../cppcoreguidelines/SpecialMemberFunctionsCheck.h" -#include "../misc/StaticAssertCheck.h" -#include "../modernize/UseAutoCheck.h" -#include "../modernize/UseEmplaceCheck.h" #include "../modernize/UseEqualsDefaultCheck.h" #include "../modernize/UseEqualsDeleteCheck.h" #include "../modernize/UseNoexceptCheck.h" #include "../modernize/UseNullptrCheck.h" #include "../modernize/UseOverrideCheck.h" 
-#include "../portability/NoAssemblerCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" namespace clang::tidy { @@ -31,22 +23,6 @@ namespace { class HICPPModule : public ClangTidyModule { public: void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { - CheckFactories.registerCheck( - "hicpp-multiway-paths-covered"); - CheckFactories.registerCheck( - "hicpp-no-assembler"); - CheckFactories.registerCheck( - "hicpp-no-malloc"); - CheckFactories - .registerCheck( - "hicpp-special-member-functions"); - CheckFactories.registerCheck( - "hicpp-static-assert"); - CheckFactories.registerCheck("hicpp-use-auto"); - CheckFactories.registerCheck( - "hicpp-undelegated-constructor"); - CheckFactories.registerCheck( - "hicpp-use-emplace"); CheckFactories.registerCheck( "hicpp-use-equals-default"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/tool/check_alphabetical_order_test.py b/clang-tools-extra/clang-tidy/tool/check_alphabetical_order_test.py index 09e23ce55e00a..ae8b923be2753 100644 --- a/clang-tools-extra/clang-tidy/tool/check_alphabetical_order_test.py +++ b/clang-tools-extra/clang-tidy/tool/check_alphabetical_order_test.py @@ -368,9 +368,11 @@ def test_release_notes_handles_multiline_doc(self) -> None: `. The `performance-faster-string-find` name is kept as an alias. - - Renamed :doc:`hicpp-no-assembler ` - to :doc:`portability-no-assembler - `. The `hicpp-no-assembler` + - Renamed :doc:`google-explicit-constructor + ` + to :doc:`misc-explicit-constructor + `. The + `google-explicit-constructor` name is kept as an alias. """ @@ -383,9 +385,11 @@ def test_release_notes_handles_multiline_doc(self) -> None: Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ - - Renamed :doc:`hicpp-no-assembler ` - to :doc:`portability-no-assembler - `. The `hicpp-no-assembler` + - Renamed :doc:`google-explicit-constructor + ` + to :doc:`misc-explicit-constructor + `. The + `google-explicit-constructor` name is kept as an alias. 
- Renamed :doc:`performance-faster-string-find diff --git a/clang-tools-extra/clangd/TidyFastChecks.inc b/clang-tools-extra/clangd/TidyFastChecks.inc index e289e3cce99a5..c32a10ce0d9c5 100644 --- a/clang-tools-extra/clangd/TidyFastChecks.inc +++ b/clang-tools-extra/clangd/TidyFastChecks.inc @@ -272,15 +272,7 @@ FAST(google-runtime-float, 1.0) FAST(google-runtime-int, 2.0) FAST(google-runtime-operator, 1.0) FAST(google-upgrade-googletest-case, 1.0) -FAST(hicpp-multiway-paths-covered, -0.0) -FAST(hicpp-no-assembler, 1.0) -FAST(hicpp-no-malloc, 1.0) -FAST(hicpp-special-member-functions, -1.0) -FAST(hicpp-static-assert, 2.0) -FAST(hicpp-undelegated-constructor, 1.0) FAST(hicpp-uppercase-literal-suffix, 0.0) -FAST(hicpp-use-auto, -0.0) -FAST(hicpp-use-emplace, 2.0) FAST(hicpp-use-equals-default, 2.0) FAST(hicpp-use-equals-delete, -2.0) FAST(hicpp-use-noexcept, 2.0) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1b469cad4eb9c..ad3cbee4fe8b1 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -83,16 +83,32 @@ Potentially Breaking Changes ` ``hicpp-move-const-arg`` :doc:`performance-move-const-arg ` + ``hicpp-multiway-paths-covered`` :doc:`bugprone-unhandled-code-paths + ` ``hicpp-named-parameter`` :doc:`readability-named-parameter ` ``hicpp-new-delete-operators`` :doc:`misc-new-delete-overloads ` ``hicpp-no-array-decay`` :doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay ` + ``hicpp-no-assembler`` :doc:`portability-no-assembler + ` + ``hicpp-no-malloc`` :doc:`cppcoreguidelines-no-malloc + ` ``hicpp-noexcept-move`` :doc:`performance-noexcept-move-constructor ` ``hicpp-signed-bitwise`` :doc:`bugprone-signed-bitwise ` + ``hicpp-special-member-functions`` :doc:`cppcoreguidelines-special-member-functions + ` + ``hicpp-static-assert`` :doc:`misc-static-assert + ` + ``hicpp-undelegated-constructor`` :doc:`bugprone-undelegated-constructor + ` + ``hicpp-use-auto`` 
:doc:`modernize-use-auto + ` + ``hicpp-use-emplace`` :doc:`modernize-use-emplace + ` ================================== ======================================================================== Improvements to clangd @@ -270,17 +286,6 @@ New check aliases `. The `google-explicit-constructor` name is kept as an alias. -- Renamed :doc:`hicpp-multiway-paths-covered - ` - to :doc:`bugprone-unhandled-code-paths - `. - The `hicpp-multiway-paths-covered` name is kept as an alias. - -- Renamed :doc:`hicpp-no-assembler ` - to :doc:`portability-no-assembler - `. The `hicpp-no-assembler` - name is kept as an alias. - - Renamed :doc:`performance-faster-string-find ` to :doc:`performance-prefer-single-char-overloads diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst deleted file mode 100644 index e6cef1d302059..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/multiway-paths-covered.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-multiway-paths-covered - -hicpp-multiway-paths-covered -============================ - -The `hicpp-multiway-paths-covered` check is an alias, please see -`bugprone-unhandled-code-paths <../bugprone/unhandled-code-paths.html>`_ -for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-assembler.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-assembler.rst deleted file mode 100644 index 94ad0b01fae79..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-assembler.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-no-assembler - -hicpp-no-assembler -================== - -The `hicpp-no-assembler` check is an alias, please see -`portability-no-assembler <../portability/no-assembler.html>`_ for more -information. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-malloc.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-malloc.rst deleted file mode 100644 index 9ec8937f9e8fc..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/no-malloc.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. title:: clang-tidy - hicpp-no-malloc - -hicpp-no-malloc -=============== - -The `hicpp-no-malloc` check is an alias, please see -:doc:`cppcoreguidelines-no-malloc <../cppcoreguidelines/no-malloc>` -for more information. -It enforces the `rule 5.3.2 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/special-member-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/special-member-functions.rst deleted file mode 100644 index 740fd1af0f646..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/special-member-functions.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. title:: clang-tidy - hicpp-special-member-functions - -hicpp-special-member-functions -============================== - -This check is an alias for :doc:`cppcoreguidelines-special-member-functions <../cppcoreguidelines/special-member-functions>`. -Checks that special member functions have the correct signature, according to `rule 12.5.7 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/static-assert.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/static-assert.rst deleted file mode 100644 index c1dce38859929..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/static-assert.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-static-assert - -hicpp-static-assert -=================== - -The `hicpp-static-assert` check is an alias, please see -:doc:`misc-static-assert <../misc/static-assert>` for more information. -It enforces the `rule 7.1.10 `_. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/undelegated-constructor.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/undelegated-constructor.rst deleted file mode 100644 index 2eb1d9dada7ec..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/undelegated-constructor.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. title:: clang-tidy - hicpp-undelegated-constructor - -hicpp-undelegated-constructor -============================= - -This check is an alias for :doc:`bugprone-undelegated-constructor <../bugprone/undelegated-constructor>`. -Partially implements `rule 12.4.5 `_ -to find misplaced constructor calls inside a constructor. - -.. code-block:: c++ - - struct Ctor { - Ctor(); - Ctor(int); - Ctor(int, int); - Ctor(Ctor *i) { - // All Ctor() calls result in a temporary object - Ctor(); // did you intend to call a delegated constructor? - Ctor(0); // did you intend to call a delegated constructor? - Ctor(1, 2); // did you intend to call a delegated constructor? - foo(); - } - }; diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/use-auto.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/use-auto.rst deleted file mode 100644 index 7e088f41fb7fb..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/use-auto.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. title:: clang-tidy - hicpp-use-auto - -hicpp-use-auto -============== - -The `hicpp-use-auto` check is an alias, please see -:doc:`modernize-use-auto <../modernize/use-auto>` for more information. -It enforces the `rule 7.1.8 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/hicpp/use-emplace.rst b/clang-tools-extra/docs/clang-tidy/checks/hicpp/use-emplace.rst deleted file mode 100644 index 05aa6037c9a93..0000000000000 --- a/clang-tools-extra/docs/clang-tidy/checks/hicpp/use-emplace.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. 
title:: clang-tidy - hicpp-use-emplace - -hicpp-use-emplace -================= - -The `hicpp-use-emplace` check is an alias, please see -:doc:`modernize-use-emplace <../modernize/use-emplace>` for more information. -It enforces the `rule 17.4.2 `_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index f80df56d1a4ad..d21ed3f104a01 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -603,15 +603,7 @@ Check aliases :doc:`google-readability-casting `, :doc:`modernize-avoid-c-style-cast `, "Yes" :doc:`google-readability-function-size `, :doc:`readability-function-size `, :doc:`google-readability-namespace-comments `, :doc:`llvm-namespace-comment `, - :doc:`hicpp-multiway-paths-covered `, :doc:`bugprone-unhandled-code-paths `, - :doc:`hicpp-no-assembler `, :doc:`portability-no-assembler `, - :doc:`hicpp-no-malloc `, :doc:`cppcoreguidelines-no-malloc `, - :doc:`hicpp-special-member-functions `, :doc:`cppcoreguidelines-special-member-functions `, - :doc:`hicpp-static-assert `, :doc:`misc-static-assert `, "Yes" - :doc:`hicpp-undelegated-constructor `, :doc:`bugprone-undelegated-constructor `, :doc:`hicpp-uppercase-literal-suffix `, :doc:`readability-uppercase-literal-suffix `, "Yes" - :doc:`hicpp-use-auto `, :doc:`modernize-use-auto `, "Yes" - :doc:`hicpp-use-emplace `, :doc:`modernize-use-emplace `, "Yes" :doc:`hicpp-use-equals-default `, :doc:`modernize-use-equals-default `, "Yes" :doc:`hicpp-use-equals-delete `, :doc:`modernize-use-equals-delete `, "Yes" :doc:`hicpp-use-noexcept `, :doc:`modernize-use-noexcept `, "Yes" diff --git a/clang-tools-extra/test/clang-tidy/checkers/hicpp/no-assembler-msvc.cpp b/clang-tools-extra/test/clang-tidy/checkers/portability/no-assembler-msvc.cpp similarity index 63% rename from clang-tools-extra/test/clang-tidy/checkers/hicpp/no-assembler-msvc.cpp rename to 
clang-tools-extra/test/clang-tidy/checkers/portability/no-assembler-msvc.cpp index d29a9e93108b9..67abe0d722a9f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/hicpp/no-assembler-msvc.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/portability/no-assembler-msvc.cpp @@ -1,11 +1,11 @@ // REQUIRES: system-windows // FIXME: Re-enable test on windows (PR36855) // UNSUPPORTED: system-windows -// RUN: %check_clang_tidy %s hicpp-no-assembler %t +// RUN: %check_clang_tidy %s portability-no-assembler %t void f() { _asm { mov al, 2; - // CHECK-MESSAGES: :[[@LINE-2]]:3: warning: do not use inline assembler in safety-critical code [hicpp-no-assembler] + // CHECK-MESSAGES: :[[@LINE-2]]:3: warning: do not use inline assembler in safety-critical code [portability-no-assembler] } } diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file index 23bb65e0155b1..afe6d836bd537 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file @@ -1 +1 @@ -Checks: "-*,hicpp-uppercase-literal-suffix" +Checks: "-*,modernize-use-nullptr" diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-bracket b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-bracket index d2bfe57880c2b..e1c43ede8dd67 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-bracket +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-bracket @@ -1,6 +1,6 @@ Checks: [ "-*", - "hicpp-uppercase-literal-suffix", - "hicpp-use-auto", - "hicpp-use-emplace", + "modernize-use-auto", + "modernize-use-emplace", + "modernize-use-nullptr", ] diff --git 
a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-dash b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-dash index ce1c88d1148cb..43e83dbe50b70 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-dash +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-list-dash @@ -1,5 +1,5 @@ Checks: - "-*" - - "hicpp-uppercase-literal-suffix" - - "hicpp-use-auto" - - "hicpp-use-emplace" + - "modernize-use-auto" + - "modernize-use-emplace" + - "modernize-use-nullptr" diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-spaces b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-spaces index 4aa1f846ade65..337429dea6733 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-spaces +++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/config-file/config-file-spaces @@ -1,9 +1,9 @@ Checks: " -* , - hicpp-uppercase-literal-suffix - ,hicpp-use-auto + modernize-use-auto + ,modernize-use-emplace - , hicpp-use-emplace + , modernize-use-nullptr " diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/config-file.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/config-file.cpp index b929fb15193d1..8942c8b8e5fc7 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/config-file.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/config-file.cpp @@ -1,20 +1,20 @@ // RUN: clang-tidy -config-file=%S/Inputs/config-file/config-file -dump-config -- | FileCheck %s -check-prefix=CHECK-BASE -// CHECK-BASE: Checks: {{.*}}hicpp-uppercase-literal-suffix +// CHECK-BASE: Checks: {{.*}}modernize-use-nullptr // RUN: clang-tidy -config-file=%S/Inputs/config-file/config-file-spaces --list-checks -- | FileCheck %s -check-prefix=CHECK-SPACES // CHECK-SPACES: Enabled checks: -// CHECK-SPACES-NEXT: 
hicpp-uppercase-literal-suffix -// CHECK-SPACES-NEXT: hicpp-use-auto -// CHECK-SPACES-NEXT: hicpp-use-emplace +// CHECK-SPACES-NEXT: modernize-use-auto +// CHECK-SPACES-NEXT: modernize-use-emplace +// CHECK-SPACES-NEXT: modernize-use-nullptr // CHECK-SPACES-EMPTY: // RUN: clang-tidy -config-file=%S/Inputs/config-file/config-file-list-dash --list-checks -- | FileCheck %s -check-prefix=CHECK-LIST-DASH // CHECK-LIST-DASH: Enabled checks: -// CHECK-LIST-DASH-NEXT: hicpp-uppercase-literal-suffix -// CHECK-LIST-DASH-NEXT: hicpp-use-auto -// CHECK-LIST-DASH-NEXT: hicpp-use-emplace +// CHECK-LIST-DASH-NEXT: modernize-use-auto +// CHECK-LIST-DASH-NEXT: modernize-use-emplace +// CHECK-LIST-DASH-NEXT: modernize-use-nullptr // CHECK-LIST-DASH-EMPTY: // RUN: clang-tidy -config-file=%S/Inputs/config-file/config-file-list-bracket --list-checks -- | FileCheck %s -check-prefix=CHECK-LIST-BRACKET // CHECK-LIST-BRACKET: Enabled checks: -// CHECK-LIST-BRACKET-NEXT: hicpp-uppercase-literal-suffix -// CHECK-LIST-BRACKET-NEXT: hicpp-use-auto -// CHECK-LIST-BRACKET-NEXT: hicpp-use-emplace +// CHECK-LIST-BRACKET-NEXT: modernize-use-auto +// CHECK-LIST-BRACKET-NEXT: modernize-use-emplace +// CHECK-LIST-BRACKET-NEXT: modernize-use-nullptr // CHECK-LIST-BRACKET-EMPTY: From a469fe98f7e77f59efbfaf32fb1a1e3b6ad18792 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Tue, 12 May 2026 13:20:35 +0530 Subject: [PATCH 409/538] [LV] Regenerate skip-iterations checks (NFC) (#197105) --- .../LoopVectorize/skip-iterations.ll | 162 +++++++++++++++++- 1 file changed, 154 insertions(+), 8 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/skip-iterations.ll b/llvm/test/Transforms/LoopVectorize/skip-iterations.ll index 0cd27579b9ae2..96c79bb02da05 100644 --- a/llvm/test/Transforms/LoopVectorize/skip-iterations.ll +++ b/llvm/test/Transforms/LoopVectorize/skip-iterations.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none 
--version 6 ; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -20,9 +21,31 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; end: ; return 0; ; } -; CHECK-LABEL: test1( -; CHECK-NOT: <4 x i32> define i32 @test1(ptr nocapture %A, i32 %Length) { +; CHECK-LABEL: define i32 @test1( +; CHECK-SAME: ptr captures(none) [[A:%.*]], i32 [[LENGTH:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[LENGTH]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_ELSE:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0:![0-9]+]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 10 +; CHECK-NEXT: br i1 [[CMP1]], label %[[END_LOOPEXIT:.*]], label %[[IF_ELSE]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA0]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[LENGTH]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[END_LOOPEXIT]] +; CHECK: [[END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret i32 0 +; entry: %cmp8 = icmp sgt i32 %Length, 0 br i1 %cmp8, label %for.body.preheader, label %end @@ -63,9 +86,32 @@ end: ; the hard part of proving/speculating A[i:VF - 1] loads does not fault is handled by the ; compiler/hardware. 
-; CHECK-LABEL: test2( -; CHECK-NOT: <4 x i32> define i32 @test2(ptr nocapture %A, i32 %Length, i32 %K) { +; CHECK-LABEL: define i32 @test2( +; CHECK-SAME: ptr captures(none) [[A:%.*]], i32 [[LENGTH:%.*]], i32 [[K:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[LENGTH]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_ELSE:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LD]], [[K]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[END_LOOPEXIT:.*]], label %[[IF_ELSE]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TRUNC]], [[LENGTH]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[END_LOOPEXIT]] +; CHECK: [[END_LOOPEXIT]]: +; CHECK-NEXT: [[RESULT_LCSSA:%.*]] = phi i32 [ 1, %[[FOR_BODY]] ], [ 0, %[[IF_ELSE]] ] +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[RESULT_LCSSA]], %[[END_LOOPEXIT]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i32 [[RESULT]] +; entry: %cmp8 = icmp sgt i32 %Length, 0 br i1 %cmp8, label %for.body.preheader, label %end @@ -105,9 +151,33 @@ end: ; } ; TODO: Today we do not vectorize this, but we could teach the vectorizer (once ; we handle the speculation safety of the widened load). 
-; CHECK-LABEL: test3( -; CHECK-NOT: <4 x i32> define i32 @test3(ptr nocapture %A, i32 %Length, i32 %K) { +; CHECK-LABEL: define i32 @test3( +; CHECK-SAME: ptr captures(none) [[A:%.*]], i32 [[LENGTH:%.*]], i32 [[K:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[LENGTH]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_ELSE:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LD]], [[K]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[END_LOOPEXIT:.*]], label %[[IF_ELSE]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TRUNC]], [[LENGTH]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[END_LOOPEXIT]] +; CHECK: [[END_LOOPEXIT]]: +; CHECK-NEXT: [[RESULT_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[FOR_BODY]] ], [ -1, %[[IF_ELSE]] ] +; CHECK-NEXT: [[RES_TRUNC:%.*]] = trunc i64 [[RESULT_LCSSA]] to i32 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[RES_TRUNC]], %[[END_LOOPEXIT]] ], [ -1, %[[ENTRY]] ] +; CHECK-NEXT: ret i32 [[RESULT]] +; entry: %cmp8 = icmp sgt i32 %Length, 0 br i1 %cmp8, label %for.body.preheader, label %end @@ -145,9 +215,85 @@ end: ; } ; } ; For this test, we vectorize and generate predicated stores to A[i]. 
-; CHECK-LABEL: test4( -; CHECK: <4 x i32> define void @test4(ptr nocapture %A, i32 %Length, i32 %K, i32 %J) { +; CHECK-LABEL: define void @test4( +; CHECK-SAME: ptr captures(none) [[A:%.*]], i32 [[LENGTH:%.*]], i32 [[K:%.*]], i32 [[J:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[LENGTH]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[END_LOOPEXIT:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: store i32 [[J]], ptr [[TMP1]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; CHECK: [[PRED_STORE_IF1]]: +; 
CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: store i32 [[J]], ptr [[TMP6]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_CONTINUE2]]: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; CHECK: [[PRED_STORE_IF3]]: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i32 [[J]], ptr [[TMP9]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; CHECK: [[PRED_STORE_CONTINUE4]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] +; CHECK: [[PRED_STORE_IF5]]: +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: store i32 [[J]], ptr [[TMP12]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; CHECK: [[PRED_STORE_CONTINUE6]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[END_LOOPEXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LD]], [[K]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[LATCH]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: store i32 [[J]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TRUNC]], [[LENGTH]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[END_LOOPEXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[END_LOOPEXIT]] +; CHECK: [[END_LOOPEXIT]]: +; CHECK-NEXT: ret void +; entry: %cmp8 = icmp sgt i32 %Length, 0 br i1 %cmp8, label %for.body.preheader, label %end.loopexit From 9f3d3048715a733b4ceb94d39722cf68752d6683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Tue, 12 May 2026 10:04:00 +0200 Subject: [PATCH 410/538] [clang-tidy] Add new check 'misc-static-initialization-cycle' (#175342) --- .../clang-tidy/misc/CMakeLists.txt | 1 + .../clang-tidy/misc/MiscTidyModule.cpp | 3 + .../misc/StaticInitializationCycleCheck.cpp | 395 ++++++++++++++++++ .../misc/StaticInitializationCycleCheck.h | 31 ++ clang-tools-extra/docs/ReleaseNotes.rst | 5 + .../docs/clang-tidy/checks/list.rst | 1 + .../misc/static-initialization-cycle.rst | 63 +++ .../misc/static-initialization-cycle.cpp | 277 ++++++++++++ 8 files changed, 776 insertions(+) create mode 100644 clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp create mode 100644 clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.h create mode 100644 clang-tools-extra/docs/clang-tidy/checks/misc/static-initialization-cycle.rst create mode 100644 clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp 
diff --git a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt index 83a23b65f86db..ec7d132352ee9 100644 --- a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt @@ -39,6 +39,7 @@ add_clang_library(clangTidyMiscModule STATIC PredictableRandCheck.cpp RedundantExpressionCheck.cpp StaticAssertCheck.cpp + StaticInitializationCycleCheck.cpp ThrowByValueCatchByReferenceCheck.cpp UnconventionalAssignOperatorCheck.cpp UniqueptrResetReleaseCheck.cpp diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp index 5a716606495db..52d3b4297ba26 100644 --- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp @@ -28,6 +28,7 @@ #include "PredictableRandCheck.h" #include "RedundantExpressionCheck.h" #include "StaticAssertCheck.h" +#include "StaticInitializationCycleCheck.h" #include "ThrowByValueCatchByReferenceCheck.h" #include "UnconventionalAssignOperatorCheck.h" #include "UniqueptrResetReleaseCheck.h" @@ -79,6 +80,8 @@ class MiscModule : public ClangTidyModule { CheckFactories.registerCheck( "misc-redundant-expression"); CheckFactories.registerCheck("misc-static-assert"); + CheckFactories.registerCheck( + "misc-static-initialization-cycle"); CheckFactories.registerCheck( "misc-throw-by-value-catch-by-reference"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp new file mode 100644 index 0000000000000..eb230983c8a7a --- /dev/null +++ b/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp @@ -0,0 +1,395 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "StaticInitializationCycleCheck.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/DynamicRecursiveASTVisitor.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/Analysis/CallGraph.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SCCIterator.h" + +using namespace clang; +using namespace clang::ast_matchers; + +// Check if a reference to a static variable (that was reached while traversing +// a function declaration) should be ignored by the check. This returns true +// if the value of the variable has no effect on the return value of the +// function, or the reference is ignored for some other reason to eliminate FP +// results. +// A reference is ignored if the variable appears on the LHS of an assignment +// or it appears inside a compile-time constant expression (like 'sizeof'). +// A reference is also ignored if it appears in a lambda function that is not +// called immediately. +static bool shouldIgnoreRef(const DeclRefExpr *DRE, const Decl *ParentD) { + ASTContext &ACtx = ParentD->getASTContext(); + ParentMapContext &PMC = ACtx.getParentMapContext(); + DynTypedNodeList Parents = PMC.getParents(*DRE); + // While going upwards on the parent graph, this stores the last encountered + // lambda expression that did not appear (until now) as the callee of an + // 'operator ()'.
+ const LambdaExpr *ParentLambda = nullptr; + while (!Parents.empty()) { + if (Parents.size() > 1) + return true; + if (const Expr *E = Parents[0].get()) { + if (!E->getType().isNull() && !E->isValueDependent() && + E->isIntegerConstantExpr(ACtx)) + return true; + if (const auto *ParentBO = dyn_cast(E)) { + if (ParentBO->isAssignmentOp() && + ParentBO->getLHS()->IgnoreParenCasts() == DRE) + return true; + } else if (const auto *LambdaE = dyn_cast(E)) { + // Found another lambda while the last one found does not appear to be + // called by '()'. + if (ParentLambda) + return true; + ParentLambda = LambdaE; + } else if (const auto *OpCallE = dyn_cast(E)) { + // Check if the last found lambda is called with this 'operator ()'. + if (ParentLambda && + OpCallE->getOperator() == OverloadedOperatorKind::OO_Call && + OpCallE->getCalleeDecl() == ParentLambda->getCallOperator()) + ParentLambda = nullptr; + } + } else if (const Decl *D = Parents[0].get()) { + // Check if we reached the root of the context (variable or function + // declaration) to check. + if ([D, ParentD]() { + if (const auto *ParentF = dyn_cast(ParentD)) { + if (const auto *FD = dyn_cast(D)) + return FD == ParentF->getDefinition(); + return false; + } + return D->getCanonicalDecl() == ParentD->getCanonicalDecl(); + }()) + return ParentLambda != nullptr; + } + Parents = PMC.getParents(Parents[0]); + } + llvm_unreachable("declaration of ParentD should be reached"); + return false; +} + +namespace { + +class VarUseNode; + +// Store the reference to a variable or the call location of a function. +// 'Ref' is a DeclRefExpr or a CallExpr. +// 'Node' contains information about corresponding VarDecl or FunctionDecl. +struct VarUseRecord { + const Expr *Ref; + VarUseNode *Node; + + VarUseRecord() = default; + VarUseRecord(const Expr *Ref, VarUseNode *N) : Ref(Ref), Node(N) {} + operator VarUseNode *() const { return Node; } +}; + +// One node in the variable usage graph.
+// If 'D' is a VarDecl: +// 'Uses' contains all static variables and global function calls in the +// initializer expression. +// If 'D' is a FunctionDecl: +// 'Uses' contains all static variable references and global function calls in +// the function body. +class VarUseNode { + const NamedDecl *D; + llvm::SmallVector Uses; + +public: + VarUseNode(const NamedDecl *D) : D(D) {} + + const NamedDecl *getDecl() const { return D; } + bool isVar() const { return isa(D); } + bool isFunction() const { return isa(D); } + const VarDecl *getVar() const { return cast(D); } + const FunctionDecl *getFunction() const { return cast(D); } + + using const_iterator = llvm::SmallVectorImpl::const_iterator; + + const_iterator begin() const { return Uses.begin(); } + const_iterator end() const { return Uses.end(); } + + llvm::iterator_range uses() const { + return llvm::make_range(begin(), end()); + } + + bool empty() const { return Uses.empty(); } + unsigned size() const { return Uses.size(); } + + friend class VarUseCollector; + friend class VarUseGraphBuilder; + friend class VarUseGraph; +}; + +// "Variable usage graph": +// Stores dependencies of variables from other variables or function calls, +// and dependencies of function results from variables or functions. +// Only static variables (static member, static local variable, or global +// variable) and global or static functions are stored. +// Stored are the canonical declarations of variables and definitions of +// functions. +class VarUseGraph { + using UseMapTy = llvm::DenseMap>; + + UseMapTy UseMap; + +public: + VarUseGraph() { + // A special "root" is added at nullptr location. + // It contains edges to all other nodes, without a "Ref" expression. + // This is used by the SCC algorithm. 
+ UseMap[nullptr] = std::make_unique(nullptr); + } + + VarUseNode *addNode(const NamedDecl *D) { + std::unique_ptr &N = UseMap[D]; + if (N) + return N.get(); + N = std::make_unique(D); + UseMap[nullptr]->Uses.emplace_back(nullptr, N.get()); + return N.get(); + } + + using const_iterator = UseMapTy::const_iterator; + + const_iterator begin() const { return UseMap.begin(); } + const_iterator end() const { return UseMap.end(); } + + unsigned size() const { return UseMap.size(); } + + VarUseNode *getRoot() { return UseMap[nullptr].get(); } + + friend class VarUseGraphBuilder; +}; + +// Collect static variable references and static function calls. +// This is used with initializer expressions and function body statements. +// At initializer expressions only statements (and expressions) should be +// traversed. But for functions declarations are needed too (to reach +// initializations of variables) (only inside the given function). +class VarUseCollector : public DynamicRecursiveASTVisitor { + VarUseNode *Node; + VarUseGraph &G; + const DeclContext *DC; + +public: + VarUseCollector(VarUseNode *N, VarUseGraph &G) + : Node(N), G(G), DC(N->isFunction() ? 
N->getFunction() : nullptr) {} + + bool TraverseType(QualType T, bool TraverseQualifier) override { + return true; + } + bool TraverseTypeLoc(TypeLoc TL, bool TraverseQualifier) override { + return true; + } + bool TraverseAttr(Attr *At) override { return true; } + bool TraverseDecl(Decl *D) override { + if (DC && DC->containsDecl(D)) + return DynamicRecursiveASTVisitor::TraverseDecl(D); + return true; + } + + bool VisitDeclRefExpr(DeclRefExpr *DRE) override { + if (const auto *VarD = dyn_cast(DRE->getDecl())) { + if (!shouldIgnoreRef(DRE, Node->getDecl()) && + (VarD->hasGlobalStorage() || VarD->isStaticLocal())) + Node->Uses.emplace_back(DRE, G.addNode(VarD->getCanonicalDecl())); + } + return true; + } + + bool VisitCallExpr(CallExpr *CE) override { + if (const FunctionDecl *F = CE->getDirectCallee()) { + if (F->isGlobal() || F->isStatic()) { + const FunctionDecl *Def = F->getDefinition(); + if (Def) + Node->Uses.emplace_back(CE, G.addNode(Def)); + } + } + return true; + } +}; + +// Build the complete graph by visiting all static variables and functions and +// add all "usages" (children in the graph) to it. +// Every variable and function is visited once (at canonical declaration or the +// definition). When visiting an object, a node for it may already exist +// (without added children) if a reference to it was found already. 
+class VarUseGraphBuilder : public DynamicRecursiveASTVisitor { + VarUseGraph &G; + +public: + VarUseGraphBuilder(VarUseGraph &G) : G(G) {} + + bool VisitVarDecl(VarDecl *VD) override { + if ((VD->hasGlobalStorage() || VD->isStaticLocal()) && + VD->isCanonicalDecl()) { + if (VarDecl *InitD = VD->getInitializingDeclaration()) { + VarUseNode *N = G.addNode(VD); + VarUseCollector Collector(N, G); + Collector.TraverseStmt(InitD->getInit()); + } + } + return true; + } + + bool VisitFunctionDecl(FunctionDecl *FD) override { + if (FD->isGlobal() || FD->isStatic()) { + if (Stmt *Body = FD->getBody()) { + VarUseNode *N = G.addNode(FD); + VarUseCollector Collector(N, G); + Collector.TraverseStmt(Body); + } + } + return true; + } +}; + +} // namespace + +namespace llvm { + +// These structures are required by scc_iterator. + +template <> struct GraphTraits { + using NodeType = const VarUseNode; + using NodeRef = const VarUseNode *; + using ChildIteratorType = NodeType::const_iterator; + + static NodeType *getEntryNode(const VarUseNode *N) { return N; } + static ChildIteratorType + child_begin(NodeType *N) { // NOLINT(readability-identifier-naming) + return N->begin(); + } + static ChildIteratorType + child_end(NodeType *N) { // NOLINT(readability-identifier-naming) + return N->end(); + } +}; + +template <> +struct GraphTraits + : public GraphTraits { + static NodeType *getEntryNode(const VarUseGraph *G) { + return const_cast(G)->getRoot(); + } + + static VarUseNode *getValue(VarUseGraph::const_iterator::value_type &P) { + return P.second.get(); + } + + using nodes_iterator = + mapped_iterator; + + static nodes_iterator + nodes_begin(const VarUseGraph *G) { // NOLINT(readability-identifier-naming) + return {G->begin(), &getValue}; + } + + static nodes_iterator + nodes_end(const VarUseGraph *G) { // NOLINT(readability-identifier-naming) + return {G->end(), &getValue}; + } + + static unsigned size(const VarUseGraph *G) { return G->size(); } +}; + +} // namespace llvm + +static 
void +reportCycles(ArrayRef SCC, + clang::tidy::misc::StaticInitializationCycleCheck &Chk) { + // Check if the SCC contains any variable, otherwise it is a function + // recursion. + auto NodeIsVar = [](const VarUseNode *N) { return N->isVar(); }; + const auto *VarNode = llvm::find_if(SCC, NodeIsVar); + if (VarNode == SCC.end()) + return; + + Chk.diag((*VarNode)->getDecl()->getLocation(), + "static variable initialization cycle detected involving %0") + << (*VarNode)->getDecl(); + + // SCC may contain multiple cycles. + // Find one path with the front node as start. + + // Lookup if a node is part of current SCC. + const llvm::SmallPtrSet SCCElts(SCC.begin(), + SCC.end()); + + // Visit all paths in the SCC until we reach the front again. + llvm::DenseMap NextNode; + llvm::SmallVector FoundPath; + FoundPath.push_back(SCC.front()); + while (!FoundPath.empty()) { + if (!NextNode.contains(FoundPath.back())) { + NextNode[FoundPath.back()] = FoundPath.back()->begin(); + } else { + NextNode[FoundPath.back()]++; + if (NextNode[FoundPath.back()] == FoundPath.back()->end()) { + FoundPath.pop_back(); + continue; + } + } + const VarUseNode *N = (*NextNode[FoundPath.back()]).Node; + if (N == SCC.front()) + break; + if (!SCCElts.contains(N) || NextNode.contains(N)) + continue; + FoundPath.push_back(N); + } + + std::string OutStr; + llvm::raw_string_ostream CycleOs(OutStr); + + for (const VarUseNode *N : FoundPath) { + const VarUseRecord &U = *NextNode[N]; + // 'U' is the source of the value, 'N->getDecl()' is the destination + Chk.diag(U.Ref->getBeginLoc(), + "%select{result|value}2 of %0 may be used to %select{compute " + "result of|initialize variable}3 %1 here", + DiagnosticIDs::Note) + << U.Node->getDecl() << N->getDecl() << U.Node->isVar() << N->isVar(); + + CycleOs << *N->getDecl() << " -> "; + } + CycleOs << *(FoundPath.front()->getDecl()); + + Chk.diag((*VarNode)->getDecl()->getLocation(), + "possible cyclical initialization: %0", DiagnosticIDs::Note) + << CycleOs.str(); 
+} + +namespace clang::tidy::misc { + +void StaticInitializationCycleCheck::registerMatchers(MatchFinder *Finder) { + Finder->addMatcher(translationUnitDecl().bind("TUDecl"), this); +} + +void StaticInitializationCycleCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *TU = Result.Nodes.getNodeAs("TUDecl"); + + VarUseGraph Uses; + VarUseGraphBuilder Builder(Uses); + Builder.TraverseDecl(const_cast(TU)); + + for (llvm::scc_iterator SCCI = + llvm::scc_begin(const_cast(&Uses)); + !SCCI.isAtEnd(); ++SCCI) { + if (!SCCI.hasCycle()) + continue; + reportCycles(*SCCI, *this); + } +} + +} // namespace clang::tidy::misc diff --git a/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.h b/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.h new file mode 100644 index 0000000000000..5bc95c8c00efe --- /dev/null +++ b/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.h @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_STATICINITIALIZATIONCYCLECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_STATICINITIALIZATIONCYCLECHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::misc { + +/// Finds cyclical initialization of static variables. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/static-initialization-cycle.html +class StaticInitializationCycleCheck : public ClangTidyCheck { +public: + StaticInitializationCycleCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; +}; + +} // namespace clang::tidy::misc + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_STATICINITIALIZATIONCYCLECHECK_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ad3cbee4fe8b1..34d9d90c70a47 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -220,6 +220,11 @@ New checks ``llvm::to_vector(llvm::make_filter_range(...))`` that can be replaced with ``llvm::map_to_vector`` and ``llvm::filter_to_vector``. +- New :doc:`misc-static-initialization-cycle + ` check. + + Finds cyclical initialization of static variables. + - New :doc:`modernize-use-std-bit ` check. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index d21ed3f104a01..96b8231d2b618 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -279,6 +279,7 @@ Clang-Tidy Checks :doc:`misc-predictable-rand `, :doc:`misc-redundant-expression `, "Yes" :doc:`misc-static-assert `, "Yes" + :doc:`misc-static-initialization-cycle `, :doc:`misc-throw-by-value-catch-by-reference `, :doc:`misc-unconventional-assign-operator `, :doc:`misc-uniqueptr-reset-release `, "Yes" diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/static-initialization-cycle.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/static-initialization-cycle.rst new file mode 100644 index 0000000000000..7a50428b53df7 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/static-initialization-cycle.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - misc-static-initialization-cycle + +misc-static-initialization-cycle +================================ + +Finds cyclical initialization of static variables. + +The cycle can come from reference to static variables or from (static) function +calls during initialization. Such cycles can cause undefined behavior. In this +context "static" means C++ ``static`` class members, global variables, global +functions, and ``static`` variables inside functions. + +For the purpose of this check, the initialization of a static variable +*uses* another static variable or function if it appears in the initializer +expression. A function *uses* a static variable or function if the variable +or function appears at any place in the function code (except if the variable +is assigned to). The check can detect cycles in this "usage graph". + +The check does not consider conditions in function code and does not follow the +value of static variables (if assigned to another variable). For this reason it +can produce false positives in some cases. 
+ +Examples +-------- + +.. code-block:: c++ + + struct S { static int A; }; + int B = S::A; + int S::A = B; + +Cycle in variable initialization. + +.. code-block:: c++ + + int f1(int X, int Y); + + struct S { static int A; }; + + int B = S::A + 1; + int S::A = f1(B, 2); + +Cyclical initialization: ``B`` uses value of ``S::A``, and ``S::A`` may use +value of ``B`` (the check always gives a warning regardless of the code of +``f1``). + +.. code-block:: c++ + + struct S { static int A; }; + int f1() { + return S::A; + } + int S::A = f1(); + +This code results in initialization of ``S::A`` with itself through a function +call. The check would emit a warning in any case when ``S::A`` appears in +``f1`` (even if the return value is not affected by it). + +References +---------- + +* CERT C++ Coding Standard rule `DCL56-CPP. Avoid cycles during initialization + of static objects `_. diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp new file mode 100644 index 0000000000000..2e5af81b6af8c --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp @@ -0,0 +1,277 @@ +// RUN: %check_clang_tidy %s misc-static-initialization-cycle %t -- -- -fno-delayed-template-parsing + +namespace simple_cycle { +struct S { static int A; }; + +int B = S::A; +int S::A = B; +} +// CHECK-NOTES: :[[@LINE-3]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-4]]:9: note: value of 'A' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-4]]:12: note: value of 'B' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:5: note: possible cyclical initialization: B -> A -> B + +namespace self_init { +struct S { static int A; }; +int S::A = S::A; +} +// CHECK-NOTES: :[[@LINE-3]]:23: warning: static variable initialization cycle detected involving 'A' +// 
CHECK-NOTES: :[[@LINE-3]]:12: note: value of 'A' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-5]]:23: note: possible cyclical initialization: A -> A + +namespace cycle_at_end { +struct S { static int A; }; + +int B = 1; +int C = B + S::A; +int S::A = C; +} +// CHECK-NOTES: :[[@LINE-3]]:5: warning: static variable initialization cycle detected involving 'C' +// CHECK-NOTES: :[[@LINE-4]]:13: note: value of 'A' may be used to initialize variable 'C' here +// CHECK-NOTES: :[[@LINE-4]]:12: note: value of 'C' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:5: note: possible cyclical initialization: C -> A -> C + +namespace cycle_at_start { +struct S { static int A; }; + +int B = S::A; +int S::A = B; +int C = B + 1; +} +// CHECK-NOTES: :[[@LINE-4]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-5]]:9: note: value of 'A' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-5]]:12: note: value of 'B' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-7]]:5: note: possible cyclical initialization: B -> A -> B + +namespace multiple_cycle { +struct S { static int A; }; + +int B = S::A; +int C = S::A; +int S::A = B + C; +} +// CHECK-NOTES: :[[@LINE-3]]:5: warning: static variable initialization cycle detected involving 'C' +// CHECK-NOTES: :[[@LINE-4]]:9: note: value of 'A' may be used to initialize variable 'C' here +// CHECK-NOTES: :[[@LINE-4]]:16: note: value of 'C' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:5: note: possible cyclical initialization: C -> A -> C + +namespace long_cycle { +struct S { static int A; }; + +int B = S::A; +int C = B + 1; +int S::A = C; +} +// CHECK-NOTES: :[[@LINE-4]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-5]]:9: note: value of 'A' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-4]]:12: note: value of 
'C' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:9: note: value of 'B' may be used to initialize variable 'C' here +// CHECK-NOTES: :[[@LINE-8]]:5: note: possible cyclical initialization: B -> A -> C -> B + +namespace no_cycle { +int A = 2; +int B = A; +int C = B + A; +} + +namespace init_expr { +struct S { static int A; }; +int f1(int X, int Y); + +int B = S::A + 1; +int S::A = f1(B, 2); +} +// CHECK-NOTES: :[[@LINE-3]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-4]]:9: note: value of 'A' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-4]]:15: note: value of 'B' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:5: note: possible cyclical initialization: B -> A -> B + +namespace func_static_ref_1 { +struct S { static int A; }; +int f1() { + return S::A; +} +int S::A = f1(); +} +// CHECK-NOTES: :[[@LINE-6]]:23: warning: static variable initialization cycle detected involving 'A' +// CHECK-NOTES: :[[@LINE-5]]:10: note: value of 'A' may be used to compute result of 'f1' +// CHECK-NOTES: :[[@LINE-4]]:12: note: result of 'f1' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-9]]:23: note: possible cyclical initialization: f1 -> A -> f1 + +namespace func_static_ref_2 { +struct S { static int A; }; +int f1() { + static int X = S::A; + return 1; +} +int S::A = f1(); +} +// CHECK-NOTES: :[[@LINE-7]]:23: warning: static variable initialization cycle detected involving 'A' +// CHECK-NOTES: :[[@LINE-6]]:18: note: value of 'A' may be used to compute result of 'f1' +// CHECK-NOTES: :[[@LINE-4]]:12: note: result of 'f1' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-10]]:23: note: possible cyclical initialization: f1 -> A -> f1 + +namespace func_static_ref_3 { +struct S { static int A; }; +int f1() { + S::A = 3; + return 34; +} +int S::A = f1(); +} + +namespace recursive_calls { +int f2(); +int f1() { + return 
f2(); +} +int f2() { + return f1(); +} +int A = f1(); +} + +namespace use_static_compile_time { +int f() { + static int A = f(); + static decltype(A) B = 2; + return sizeof(A) + B; +} +} + +namespace static_var_recursive_init { +int f(int i) { + static int A = f(1); + if (i == 1) + return 1; + return A + i; +} +} +// CHECK-NOTES: :[[@LINE-6]]:14: warning: static variable initialization cycle detected involving 'A' +// CHECK-NOTES: :[[@LINE-7]]:18: note: result of 'f' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-5]]:10: note: value of 'A' may be used to compute result of 'f' +// CHECK-NOTES: :[[@LINE-9]]:14: note: possible cyclical initialization: A -> f -> A + +namespace singleton { +struct S { int X; }; + +S *get_S() { + static S *TheS; + if (!TheS) { + TheS = new S; + } + return TheS; +} +} + +namespace compound_assign_in_func { +struct S { static int A; }; +int f() { + int local = 0; + local += S::A; + return local; +} +int S::A = f(); +} +// CHECK-NOTES: :[[@LINE-8]]:23: warning: static variable initialization cycle detected involving 'A' +// CHECK-NOTES: :[[@LINE-6]]:12: note: value of 'A' may be used to compute result of 'f' +// CHECK-NOTES: :[[@LINE-4]]:12: note: result of 'f' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-11]]:23: note: possible cyclical initialization: f -> A -> f + +namespace compound_assign_lhs { +struct S { static int A; }; +int f(); +int B = S::A + f(); +int f() { + S::A -= B; + return 1; +} +} +// CHECK-NOTES: :[[@LINE-6]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-5]]:11: note: value of 'B' may be used to compute result of 'f' +// CHECK-NOTES: :[[@LINE-8]]:16: note: result of 'f' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-9]]:5: note: possible cyclical initialization: f -> B -> f + +namespace template_test { +template +struct S { + static T f1(); + static T A; +}; +template +T S::A = f1(); +template +T 
S::f1() { + return A; +} + +S X; +} +// CHECK-NOTES: :[[@LINE-11]]:12: warning: static variable initialization cycle detected involving 'A' +// CHECK-NOTES: :[[@LINE-6]]:10: note: value of 'A' may be used to compute result of 'f1' +// CHECK-NOTES: :[[@LINE-10]]:13: note: result of 'f1' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-14]]:12: note: possible cyclical initialization: f1 -> A -> f1 + +namespace test_lambda_1 { +struct S { static int A; }; +int B = []() { return S::A; }(); +int S::A = B; +} +// CHECK-NOTES: :[[@LINE-3]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-4]]:23: note: value of 'A' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-4]]:12: note: value of 'B' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:5: note: possible cyclical initialization: B -> A -> B + +namespace test_lambda_2 { +struct S { static int A; }; +auto B = []() { return S::A; }; +int S::A = B(); +} +// this is not found by the check +// value of 'A' is not needed to initialize 'B' +// the check does not maintain values of variables (to find the stored +// lambda and relation to 'A') + +namespace test_lambda_3 { +struct S { static int A; }; +int f() { + return []() { return 2 * S::A; }() + 3; +} +int B = f(); +int S::A = B; +} +// CHECK-NOTES: :[[@LINE-3]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-6]]:28: note: value of 'A' may be used to compute result of 'f' +// CHECK-NOTES: :[[@LINE-4]]:12: note: value of 'B' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:9: note: result of 'f' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-7]]:5: note: possible cyclical initialization: f -> A -> B -> f + +namespace test_lambda_4 { +template +void f1(L) {}; + +struct S { static int A; }; + +int f() { + f1([]() { return 2 * S::A; }); + return 1; +} +int B = f(); +int 
S::A = B; +} + +namespace mixed_cycle { +struct S { static int A; }; +int B = S::A; +int f_b() { return B + 1; } +int C = f_b(); +int D = C + 1; +int S::A = []() { return D + 1; }(); +} +// CHECK-NOTES: :[[@LINE-6]]:5: warning: static variable initialization cycle detected involving 'B' +// CHECK-NOTES: :[[@LINE-7]]:9: note: value of 'A' may be used to initialize variable 'B' here +// CHECK-NOTES: :[[@LINE-4]]:26: note: value of 'D' may be used to initialize variable 'A' here +// CHECK-NOTES: :[[@LINE-6]]:9: note: value of 'C' may be used to initialize variable 'D' here +// CHECK-NOTES: :[[@LINE-8]]:9: note: result of 'f_b' may be used to initialize variable 'C' here +// CHECK-NOTES: :[[@LINE-10]]:20: note: value of 'B' may be used to compute result of 'f_b' +// CHECK-NOTES: :[[@LINE-12]]:5: note: possible cyclical initialization: B -> A -> D -> C -> f_b -> B From fb18fe73ea1b9f24b20aed6b480563b6bd5fc774 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 12 May 2026 09:13:14 +0100 Subject: [PATCH 411/538] [AArch64] Guard against vector invalidation in EmitAArch64CpuSupports. (#196909) This prevents the Vector from being invalidated while iterating over it. As far as I can tell we were adding elements twice. 
Fixes #196789 --- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 5 +++-- clang/test/CodeGen/AArch64/cpu-supports-target.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 8bfacc5580bd6..647c3ff44928a 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -7306,9 +7306,10 @@ Value *CodeGenFunction::EmitAArch64CpuInit() { Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) { const Expr *ArgExpr = E->getArg(0)->IgnoreParenCasts(); StringRef ArgStr = cast(ArgExpr)->getString(); + llvm::SmallVector OrigFeatures; + ArgStr.split(OrigFeatures, "+"); llvm::SmallVector Features; - ArgStr.split(Features, "+"); - for (auto &Feature : Features) { + for (StringRef Feature : OrigFeatures) { Feature = Feature.trim(); if (!llvm::AArch64::parseFMVExtension(Feature)) return Builder.getFalse(); diff --git a/clang/test/CodeGen/AArch64/cpu-supports-target.c b/clang/test/CodeGen/AArch64/cpu-supports-target.c index 3d26fd6f2bd03..1cd86210095b4 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports-target.c +++ b/clang/test/CodeGen/AArch64/cpu-supports-target.c @@ -217,6 +217,21 @@ int test_versions() { return code(); } +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define dso_local i32 @test_long( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 577586773744664575 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 577586773744664575 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TMP3]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +int test_long(void) { + return 
__builtin_cpu_supports("rng+flagm+flagm2+fp16fml+dotprod+sm4+rdm+lse+fp+simd+aes+bf16+bti+crc+cssc+dit+dotprod+f32mm+f64mm+flagm+fp16fml+fp16+i8mm+mops+sha2+sha3+sm4+sve2"); +} + //. // CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } From c9560139d3053fe5eb49131c0354ed6a67177fbc Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Tue, 12 May 2026 11:15:09 +0300 Subject: [PATCH 412/538] [lld] Remove unused argument of DataExtractor constructor (NFC) (#196361) `AddressSize` parameter is not used by `DataExtractor` and will be removed in the future. See #190519 for more context. --- lld/ELF/SyntheticSections.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 805d0530f12ad..c4807e1e150b5 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -3062,8 +3062,7 @@ DebugNamesSection::DebugNamesSection(Ctx &ctx) ELFT::Is64Bits ? 8 : 4); // .debug_str is needed to get symbol names from string offsets. DataExtractor strExtractor(dobj.getStrSection(), - ELFT::Endianness == endianness::little, - ELFT::Is64Bits ? 8 : 4); + ELFT::Endianness == endianness::little); inputChunk.section = dobj.getNamesSection(); inputChunk.llvmDebugNames.emplace(namesExtractor, strExtractor); From 9ca55c0c99830cce8e21f5477596c97d86e0c879 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 12 May 2026 10:51:13 +0200 Subject: [PATCH 413/538] [InstCombine] Relax the requirements for (X ^ C2) + C -> (C2 + C) - X (#196897) If (C2 - X) has no borrow between bits, it is equivalent to (X ^ C2). A borrow would occur when c2_bit=0 and x_bit=1. It follows that c2_bit=1 or x_bit=0 means no borrow. Remove an artificial condition that C2 must be a low bits mask. 
Proof: https://alive2.llvm.org/ce/z/uNMsg_ --- .../InstCombine/InstCombineAddSub.cpp | 13 ++-- llvm/test/Transforms/InstCombine/sub-xor.ll | 78 +++++++++++++++++++ 2 files changed, 84 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 192355b12c4da..90ea19541653e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -951,13 +951,12 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { if (C2->isSignMask()) return BinaryOperator::CreateAdd(X, ConstantInt::get(Ty, *C2 ^ *C)); - // If X has no high-bits set above an xor mask: - // add (xor X, LowMaskC), C --> sub (LowMaskC + C), X - if (C2->isMask()) { - KnownBits LHSKnown = computeKnownBits(X, &Add); - if ((*C2 | LHSKnown.Zero).isAllOnes()) - return BinaryOperator::CreateSub(ConstantInt::get(Ty, *C2 + *C), X); - } + // If X has no bits set other than an xor mask, + // xor is equivalent to sub with no borrow between bits: + // add (xor X, C2), C --> sub (C2 + C), X + KnownBits LHSKnown = computeKnownBits(X, &Add); + if ((*C2 | LHSKnown.Zero).isAllOnes()) + return BinaryOperator::CreateSub(ConstantInt::get(Ty, *C2 + *C), X); // Look for a math+logic pattern that corresponds to sext-in-register of a // value with cleared high bits. 
Convert that into a pair of shifts: diff --git a/llvm/test/Transforms/InstCombine/sub-xor.ll b/llvm/test/Transforms/InstCombine/sub-xor.ll index a4135e0b51453..48e6d830c41f6 100644 --- a/llvm/test/Transforms/InstCombine/sub-xor.ll +++ b/llvm/test/Transforms/InstCombine/sub-xor.ll @@ -158,6 +158,84 @@ define <2 x i8> @xor_add_splat_undef(<2 x i8> %x) { ret <2 x i8> %add } +define i32 @xor_notmask_add(i32 %x) { +; CHECK-LABEL: @xor_notmask_add( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 28 +; CHECK-NEXT: [[ADD:%.*]] = sub nuw nsw i32 51, [[AND]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %and = and i32 %x, 28 + %xor = xor i32 %and, 61 + %add = add i32 %xor, -10 + ret i32 %add +} + +define i32 @xor_notmask_add_multiuse(i32 %x) { +; CHECK-LABEL: @xor_notmask_add_multiuse( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 28 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[AND]], 29 +; CHECK-NEXT: call void @use(i32 [[XOR]]) +; CHECK-NEXT: [[ADD:%.*]] = sub nuw nsw i32 39, [[AND]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %and = and i32 %x, 28 + %xor = xor i32 %and, 29 + call void @use(i32 %xor) + %add = add i32 %xor, 10 + ret i32 %add +} + +define <2 x i8> @xor_notmask_add_splat(<2 x i8> %x) { +; CHECK-LABEL: @xor_notmask_add_splat( +; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[X:%.*]], splat (i8 24) +; CHECK-NEXT: [[ADD:%.*]] = sub nuw nsw <2 x i8> splat (i8 103), [[AND]] +; CHECK-NEXT: ret <2 x i8> [[ADD]] +; + %and = and <2 x i8> %x, + %xor = xor <2 x i8> %and, + %add = add <2 x i8> %xor, + ret <2 x i8> %add +} + +define <2 x i8> @xor_notmask_add_splat_poison(<2 x i8> %x) { +; CHECK-LABEL: @xor_notmask_add_splat_poison( +; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[X:%.*]], splat (i8 24) +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[AND]], +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i8> [[XOR]], splat (i8 42) +; CHECK-NEXT: ret <2 x i8> [[ADD]] +; + %and = and <2 x i8> %x, + %xor = xor <2 x i8> %and, + %add = add <2 x i8> %xor, + ret <2 x i8> %add +} + +define <2 x i8> 
@xor_notmask_add_non_splat(<2 x i8> %x) { +; CHECK-LABEL: @xor_notmask_add_non_splat( +; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[X:%.*]], splat (i8 24) +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[AND]], +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i8> [[XOR]], splat (i8 42) +; CHECK-NEXT: ret <2 x i8> [[ADD]] +; + %and = and <2 x i8> %x, + %xor = xor <2 x i8> %and, + %add = add <2 x i8> %xor, + ret <2 x i8> %add +} + +define i32 @xor_notmask_add_negative(i32 %x) { +; CHECK-LABEL: @xor_notmask_add_negative( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 28 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[AND]], 23 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[XOR]], 10 +; CHECK-NEXT: ret i32 [[ADD]] +; + %and = and i32 %x, 28 + %xor = xor i32 %and, 23 + %add = add i32 %xor, 10 + ret i32 %add +} + ; Make sure we don't convert sub to xor using dominating condition. That makes ; it hard for other passe to reverse. define i32 @xor_dominating_cond(i32 %x) { From b4aa4d4dcb6f1c8a00d1d1e53d2b353c97ec98b7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 12 May 2026 10:15:39 +0100 Subject: [PATCH 414/538] [X86] combineINSERT_SUBVECTOR - only fold vXi1 zero-widening if scalar mask source has one use (#197125) Fixes infinite loop reported on #192699 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- .../CodeGen/X86/avx512-skx-insert-subvec.ll | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3be7d35a08b6d..aa5b864df5936 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -61204,8 +61204,8 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, // See if were inserting into a zero vXi1 vector and the subvector was // bitcast from a gpr that could be zero-extended directly. 
- if (IsI1Vector && TLI.isTypeLegal(OpVT)) { - SDValue SubInt = peekThroughBitcasts(SubVec); + if (IsI1Vector && TLI.isTypeLegal(OpVT) && SubVec.hasOneUse()) { + SDValue SubInt = peekThroughOneUseBitcasts(SubVec); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VecNumElts); if (TLI.isTypeLegal(IntVT) && SubInt.getValueType().isScalarInteger()) { SubInt = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, SubInt); diff --git a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 2fd2eba42315e..a05c0727a4522 100644 --- a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -209,3 +209,24 @@ define i8 @test15(<2 x i64> %x) { %c = bitcast <8 x i1> %b to i8 ret i8 %c } + +; Ensure multiple uses of mask source doesn't cause infinite loop +define <16 x i8> @test_insert_subvector_v8i1_v16i1(i8 %a0) { +; CHECK-LABEL: test_insert_subvector_v8i1_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorb $1, %dil +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: kmovb %k1, %k2 +; CHECK-NEXT: vmovdqu8 0, %xmm0 {%k2} {z} +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, 0 {%k1} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %xor = xor i8 %a0, 1 + %zext = zext i8 %xor to i16 + %mask16 = bitcast i16 %zext to <16 x i1> + %load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr null, <16 x i1> %mask16, <16 x i8> zeroinitializer) + %mask8 = bitcast i8 %xor to <8 x i1> + call void @llvm.masked.store.v8i64.p0(<8 x i64> zeroinitializer, ptr null, <8 x i1> %mask8) + ret <16 x i8> %load +} From bfe5d5bed983fcdd5004a304d8c54b5f4c6ce3a2 Mon Sep 17 00:00:00 2001 From: Zeyi Xu Date: Tue, 12 May 2026 17:59:27 +0800 Subject: [PATCH 415/538] [LifetimeSafety] Diagnose invalidated-field (#196680) Teach lifetime safety invalidation diagnostics to handle origins that escape through fields before the referenced object is invalidated. Previously they were skipped. 
Partially addresses https://github.com/llvm/llvm-project/issues/195706 --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 6 ++ .../clang/Basic/DiagnosticSemaKinds.td | 4 ++ clang/lib/Analysis/LifetimeSafety/Checker.cpp | 21 ++++++- clang/lib/Sema/SemaLifetimeSafety.h | 32 ++++++++++ .../warn-lifetime-safety-invalidations.cpp | 60 +++++++++++++++++++ 5 files changed, 122 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 7b0799d923f40..37ffa36fbe865 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -88,6 +88,12 @@ class LifetimeSafetySemaHelper { virtual void reportUseAfterInvalidation(const ParmVarDecl *PVD, const Expr *UseExpr, const Expr *InvalidationExpr) {} + virtual void reportInvalidatedField(const Expr *IssueExpr, + const FieldDecl *Field, + const Expr *InvalidationExpr) {} + virtual void reportInvalidatedField(const ParmVarDecl *PVD, + const FieldDecl *Field, + const Expr *InvalidationExpr) {} using EscapingTarget = llvm::PointerUnion; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f687e759d1267..8c549f121e032 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10985,6 +10985,10 @@ def warn_lifetime_safety_invalidation : Warning<"%select{object whose reference is captured|parameter}0 is later invalidated">, InGroup, DefaultIgnore; +def warn_lifetime_safety_invalidated_field + : Warning<"%select{object whose reference|parameter which}0 escapes to a field is later invalidated">, + InGroup, + DefaultIgnore; def warn_lifetime_safety_dangling_field : Warning<"address of stack memory escapes to a field">, diff --git a/clang/lib/Analysis/LifetimeSafety/Checker.cpp 
b/clang/lib/Analysis/LifetimeSafety/Checker.cpp index fc77ed3097602..bad17e88f0b9b 100644 --- a/clang/lib/Analysis/LifetimeSafety/Checker.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Checker.cpp @@ -264,7 +264,26 @@ class LifetimeChecker { MovedExpr, ExpiryLoc); } else if (const auto *OEF = CausingFact.dyn_cast()) { - if (const auto *RetEscape = dyn_cast(OEF)) + if (Warning.InvalidatedByExpr) { + if (const auto *FieldEscape = dyn_cast(OEF)) { + // Invalidated object escapes to a field. + if (IssueExpr) + // Invalidated object on stack escapes to a field. + SemaHelper->reportInvalidatedField(IssueExpr, + FieldEscape->getFieldDecl(), + Warning.InvalidatedByExpr); + else if (InvalidatedPVD) + // Invalidated parameter escapes to a field. + SemaHelper->reportInvalidatedField(InvalidatedPVD, + FieldEscape->getFieldDecl(), + Warning.InvalidatedByExpr); + } else if (isa(OEF)) { + // FIXME: Diagnose invalidated global escapes separately. + } else if (isa(OEF)) { + // FIXME: Diagnose invalidated return escapes separately. + } else + llvm_unreachable("Unhandled OriginEscapesFact type"); + } else if (const auto *RetEscape = dyn_cast(OEF)) // Return stack address. SemaHelper->reportUseAfterReturn( IssueExpr, RetEscape->getReturnExpr(), MovedExpr, ExpiryLoc); diff --git a/clang/lib/Sema/SemaLifetimeSafety.h b/clang/lib/Sema/SemaLifetimeSafety.h index 1ef28d8ba2cee..4d20c4c337b0f 100644 --- a/clang/lib/Sema/SemaLifetimeSafety.h +++ b/clang/lib/Sema/SemaLifetimeSafety.h @@ -148,6 +148,38 @@ class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper { << UseExpr->getSourceRange(); } + void reportInvalidatedField(const Expr *IssueExpr, + const FieldDecl *DanglingField, + const Expr *InvalidationExpr) override { + auto InvalidationDiag = isa(InvalidationExpr) + ? 
diag::note_lifetime_safety_freed_here + : diag::note_lifetime_safety_invalidated_here; + S.Diag(IssueExpr->getExprLoc(), + diag::warn_lifetime_safety_invalidated_field) + << false << IssueExpr->getSourceRange(); + S.Diag(InvalidationExpr->getExprLoc(), InvalidationDiag) + << InvalidationExpr->getSourceRange(); + S.Diag(DanglingField->getLocation(), + diag::note_lifetime_safety_dangling_field_here) + << DanglingField->getEndLoc(); + } + + void reportInvalidatedField(const ParmVarDecl *PVD, + const FieldDecl *DanglingField, + const Expr *InvalidationExpr) override { + auto InvalidationDiag = isa(InvalidationExpr) + ? diag::note_lifetime_safety_freed_here + : diag::note_lifetime_safety_invalidated_here; + S.Diag(PVD->getSourceRange().getBegin(), + diag::warn_lifetime_safety_invalidated_field) + << true << PVD->getSourceRange(); + S.Diag(InvalidationExpr->getExprLoc(), InvalidationDiag) + << InvalidationExpr->getSourceRange(); + S.Diag(DanglingField->getLocation(), + diag::note_lifetime_safety_dangling_field_here) + << DanglingField->getEndLoc(); + } + void suggestLifetimeboundToParmVar(SuggestionScope Scope, const ParmVarDecl *ParmToAnnotate, EscapingTarget Target) override { diff --git a/clang/test/Sema/warn-lifetime-safety-invalidations.cpp b/clang/test/Sema/warn-lifetime-safety-invalidations.cpp index df9f7288144b1..1c7bdf1df3528 100644 --- a/clang/test/Sema/warn-lifetime-safety-invalidations.cpp +++ b/clang/test/Sema/warn-lifetime-safety-invalidations.cpp @@ -447,6 +447,66 @@ void ChangingRegionOwnedByContainerIsOk() { } // namespace ContainersAsFields +namespace InvalidatedField { +std::string StableString; + +struct S { + std::string_view FieldFromLocalVector; // expected-note {{this field dangles}} + std::string_view FieldFromByValueParamVector; // expected-note {{this field dangles}} + std::string_view FieldFromLocalString; // expected-note {{this field dangles}} + std::string_view FieldFromByValueParamString; // expected-note {{this field dangles}} + 
std::string_view FieldFromRefParamString; // expected-note {{this field dangles}} + int *FieldFromNew; // expected-note {{this field dangles}} + int *FieldFromPointerParam; // expected-note {{this field dangles}} + std::string_view FieldReassigned; + + void InvalidatedFieldLocalVector() { + std::vector strings; + FieldFromLocalVector = *strings.begin(); // expected-warning {{object whose reference escapes to a field is later invalidated}} + strings.push_back("1"); // expected-note {{invalidated here}} + } + + void InvalidatedFieldByValueParamVector(std::vector strings) { + FieldFromByValueParamVector = *strings.begin(); // expected-warning {{object whose reference escapes to a field is later invalidated}} + strings.push_back("1"); // expected-note {{invalidated here}} + } + + void InvalidatedFieldLocalString() { + std::string s; + FieldFromLocalString = s; // expected-warning {{object whose reference escapes to a field is later invalidated}} + s.clear(); // expected-note {{invalidated here}} + } + + void InvalidatedFieldByValueParamString(std::string s) { + FieldFromByValueParamString = s; // expected-warning {{object whose reference escapes to a field is later invalidated}} + s.clear(); // expected-note {{invalidated here}} + } + + void InvalidatedFieldRefParamString(std::string &s) { // expected-warning {{parameter which escapes to a field is later invalidated}} + FieldFromRefParamString = s; + s.~basic_string(); // expected-note {{invalidated here}} + } + + void InvalidatedFieldDelete() { + int *p = new int; // expected-warning {{object whose reference escapes to a field is later invalidated}} + FieldFromNew = p; + delete p; // expected-note {{freed here}} + } + + void InvalidatedFieldDeleteParam(int *p) { // expected-warning {{parameter which escapes to a field is later invalidated}} + FieldFromPointerParam = p; + delete p; // expected-note {{freed here}} + } + + void FieldReassignedBeforeInvalidation() { + std::vector strings; + FieldReassigned = 
*strings.begin(); + FieldReassigned = StableString; + strings.push_back("1"); + } +}; +} // namespace InvalidatedField + namespace AssociativeContainers { void SetInsertDoesNotInvalidate() { std::set s; From 9346acd7806aee8a675089d5b5bcbd8ce98147e0 Mon Sep 17 00:00:00 2001 From: jofrn Date: Tue, 12 May 2026 03:03:54 -0700 Subject: [PATCH 416/538] [TableGen] Add submulticlass typechecking to template arg values (#197128) Some typechecking was missing when parsing a submulticlass reference. Add the CheckTemplateArgValues call in ParseSubMultiClassReference. Resolves https://github.com/llvm/llvm-project/issues/84910. --- llvm/lib/TableGen/TGParser.cpp | 5 +++++ llvm/test/TableGen/submulticlass-leteq.td | 21 +++++++++++++++++++ llvm/test/TableGen/submulticlass-typecheck.td | 12 +++++++++++ 3 files changed, 38 insertions(+) create mode 100644 llvm/test/TableGen/submulticlass-leteq.td create mode 100644 llvm/test/TableGen/submulticlass-typecheck.td diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 25405eef60366..c44e067a9da9f 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -863,6 +863,11 @@ TGParser::ParseSubMultiClassReference(MultiClass *CurMC) { return Result; } + if (CheckTemplateArgValues(Result.TemplateArgs, ArgLocs, &Result.MC->Rec)) { + Result.MC = nullptr; // Error checking value list. 
+ return Result; + } + Result.RefRange.End = Lex.getLoc(); return Result; diff --git a/llvm/test/TableGen/submulticlass-leteq.td b/llvm/test/TableGen/submulticlass-leteq.td new file mode 100644 index 0000000000000..eeaa523d870ec --- /dev/null +++ b/llvm/test/TableGen/submulticlass-leteq.td @@ -0,0 +1,21 @@ +// RUN: llvm-tblgen %s | FileCheck %s +// XFAIL: vg_leak +// CHECK: def X0 { // C +// CHECK-NEXT: bit x = 1; +// CHECK-NEXT: } +// CHECK-NEXT: def X1 { // C +// CHECK-NEXT: bit x = 0; +// CHECK-NEXT: } +class C { + bit x; +} +multiclass M0 Val> { + let x = !eq(Val, !cast>(-1)) in def NAME : C; +} +multiclass M1 Val> { + let x = !eq(Val, -1) in def NAME : C; +} +multiclass M2_0 : M0<-1>; +multiclass M2_1 : M1<-1>; +defm X0 : M2_0; +defm X1 : M2_1; diff --git a/llvm/test/TableGen/submulticlass-typecheck.td b/llvm/test/TableGen/submulticlass-typecheck.td new file mode 100644 index 0000000000000..2abc892a7a211 --- /dev/null +++ b/llvm/test/TableGen/submulticlass-typecheck.td @@ -0,0 +1,12 @@ +// RUN: not llvm-tblgen %s 2>&1 | FileCheck %s +// XFAIL: vg_leak +// CHECK: {{.*}}:11:30: error: Value specified for template argument 'B::op' is of type bits<4>; expected type bits<8>: C::op +// CHECK-NEXT: multiclass C op> : B; +class A op> { + bits<8> f = op; +} +multiclass B op> { + def : A; +} +multiclass C op> : B; +defm D : C<0>; From d176a1ee7b34260da8fdb461b479c872f70b54af Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Tue, 12 May 2026 11:18:58 +0100 Subject: [PATCH 417/538] [GlobalISel][AMDGPU][AArch64] Fix GlobalISel copy propagation (#188781) Disallow propagation of sub-registers after GlobalISel, as the current code is blindly dropping any sub-register information. This also fixes bugs in AArch64 and AMDGPU back-end that rely on the incorrect behavior and would fail with the fix: * Update `selectG_UNMERGE_VALUES` in AMDGPU so instead of generating `hi16` for SGPR it shifts higher bits into the destination register using `lshr`. 
* Prevent AArch64 back-end from generating spurious `sub_32:gpr32all` when selecting copy. * Test changes: `fpto[s/u]i-sat-vector.ll`: The correct number of conversions is now generated as higher 16-bits are handled correctly; however, it introduces `lshr` instructions. This should be resolved in #188287 by enabling `s_cvt_hi_*`. --- .../CodeGen/GlobalISel/InstructionSelect.cpp | 16 +- .../GISel/AArch64InstructionSelector.cpp | 7 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 12 +- llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll | 177 +++++++++++++----- llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll | 157 ++++++++++++---- 5 files changed, 276 insertions(+), 93 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 2649b18d5a698..c750f643e99e2 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -242,13 +242,15 @@ bool InstructionSelect::selectMachineFunction(MachineFunction &MF) { continue; Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); - if (SrcReg.isVirtual() && DstReg.isVirtual()) { - auto SrcRC = MRI.getRegClass(SrcReg); - auto DstRC = MRI.getRegClass(DstReg); - if (SrcRC == DstRC) { - MRI.replaceRegWith(DstReg, SrcReg); - MI.eraseFromParent(); - } + unsigned SrcSubIdx = MI.getOperand(1).getSubReg(); + if (!SrcReg.isVirtual() || !DstReg.isVirtual() || SrcSubIdx) + continue; + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + if (SrcRC == DstRC) { + MRI.replaceRegWith(DstReg, SrcReg); + MI.eraseFromParent(); } } } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 1a6749b9e5c0c..5d2b4bc7ac6d3 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ 
b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1053,8 +1053,15 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC); const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC); + unsigned SrcSubReg = I.getOperand(1).getSubReg(); unsigned SubReg; + if (SrcSubReg) { + if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) + return false; + return true; + } + // If the source bank doesn't support a subregister copy small enough, // then we first need to copy to the destination bank. if (getMinSizeForRegBank(SrcRegBank) > DstSize) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index f3c4f559e3615..c157a88694c39 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -730,8 +730,16 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { ArrayRef SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); for (int I = 0, E = NumDst; I != E; ++I) { MachineOperand &Dst = MI.getOperand(I); - BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) - .addReg(SrcReg, {}, SubRegs[I]); + // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits. + if (SrcBank->getID() == AMDGPU::SGPRRegBankID && + SubRegs[I] == AMDGPU::hi16) { + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg()) + .addReg(SrcReg) + .addImm(16); + } else { + BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) + .addReg(SrcReg, {}, SubRegs[I]); + } // Make sure the subregister index is valid for the source register. 
SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]); diff --git a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll index 597547ac8345e..8674f748a3f73 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll @@ -3346,20 +3346,31 @@ define <4 x i1> @test_s_signed_v4f16_v4i1(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 ; GFX12-GI-NEXT: s_min_i32 s0, s0, 0 -; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_i32 s2, s2, 0 +; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 +; GFX12-GI-NEXT: s_min_i32 s3, s3, 0 ; GFX12-GI-NEXT: s_max_i32 s0, s0, -1 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_max_i32 s2, s2, -1 ; GFX12-GI-NEXT: s_max_i32 s1, s1, -1 +; GFX12-GI-NEXT: s_max_i32 s3, s3, -1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -3595,20 +3606,31 @@ define <4 x i8> @test_s_signed_v4f16_v4i8(<4 x half> 
inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 ; GFX12-GI-NEXT: s_min_i32 s0, s0, 0x7f -; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_i32 s2, s2, 0x7f +; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f +; GFX12-GI-NEXT: s_min_i32 s3, s3, 0x7f ; GFX12-GI-NEXT: s_max_i32 s0, s0, 0xffffff80 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_max_i32 s2, s2, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s1, s1, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s3, s3, 0xffffff80 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i8> @llvm.fptosi.sat.v4f16.v4i8(<4 x half> %f) ret <4 x i8> %x @@ -3755,10 +3777,13 @@ define <4 x i16> @test_s_signed_v4f16_v4i16(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v0.l, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v0.h, s2 ; 
GFX12-GI-NEXT: v_cvt_i16_f16_e32 v1.l, s1 -; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX12-GI-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v1.h, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i16> @llvm.fptosi.sat.v4f16.v4i16(<4 x half> %f) ret <4 x i16> %x @@ -3876,19 +3901,23 @@ define <4 x i64> @test_s_signed_v4f16_v4i64(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-GI-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 -; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s0 -; GFX12-GI-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GI-NEXT: v_mov_b32_e32 v4, s1 -; GFX12-GI-NEXT: v_mov_b32_e32 v6, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i64> @llvm.fptosi.sat.v4f16.v4i64(<4 x half> %f) ret <4 x i64> %x @@ -5461,30 +5490,51 @@ define <8 x i1> @test_s_signed_v8f16_v8i1(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 
16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 ; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 ; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 ; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 ; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s4, s4 ; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s5, s5 ; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s6, s6 ; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s7, s7 ; GFX12-GI-NEXT: s_min_i32 s0, s0, 0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_i32 s4, s4, 0 ; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 +; GFX12-GI-NEXT: s_min_i32 s5, s5, 0 ; GFX12-GI-NEXT: s_min_i32 s2, s2, 0 +; GFX12-GI-NEXT: s_min_i32 s6, s6, 0 ; GFX12-GI-NEXT: s_min_i32 s3, s3, 0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_i32 s7, s7, 0 ; GFX12-GI-NEXT: s_max_i32 s0, s0, -1 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_max_i32 s4, s4, -1 ; GFX12-GI-NEXT: s_max_i32 s1, s1, -1 +; GFX12-GI-NEXT: s_max_i32 s5, s5, -1 ; GFX12-GI-NEXT: s_max_i32 s2, s2, -1 +; GFX12-GI-NEXT: s_max_i32 s6, s6, -1 ; GFX12-GI-NEXT: s_max_i32 s3, s3, -1 +; GFX12-GI-NEXT: s_max_i32 s7, s7, -1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s4 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f) ret <8 x i1> %x @@ -5889,30 +5939,51 @@ define <8 x i8> @test_s_signed_v8f16_v8i8(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 ; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 ; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 ; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 ; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s4, s4 ; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s5, s5 ; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s6, s6 ; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s7, s7 ; GFX12-GI-NEXT: s_min_i32 s0, s0, 0x7f +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_i32 s4, s4, 0x7f ; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f +; GFX12-GI-NEXT: s_min_i32 s5, s5, 0x7f ; GFX12-GI-NEXT: s_min_i32 s2, s2, 0x7f +; GFX12-GI-NEXT: s_min_i32 s6, s6, 0x7f ; GFX12-GI-NEXT: s_min_i32 s3, s3, 0x7f -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_i32 s7, s7, 0x7f ; GFX12-GI-NEXT: s_max_i32 s0, s0, 0xffffff80 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; 
GFX12-GI-NEXT: s_max_i32 s4, s4, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s1, s1, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s5, s5, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s2, s2, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s6, s6, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s3, s3, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s7, s7, 0xffffff80 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i8> @llvm.fptosi.sat.v8f16.v8i8(<8 x half> %f) ret <8 x i8> %x @@ -6146,14 +6217,19 @@ define <8 x i16> @test_s_signed_v8f16_v8i16(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v0.l, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v0.h, s4 ; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v1.l, s1 +; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v1.h, s5 ; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v2.l, s2 +; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v2.h, s6 ; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v3.l, s3 -; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX12-GI-NEXT: v_mov_b16_e32 v1.h, v1.l -; GFX12-GI-NEXT: v_mov_b16_e32 v2.h, v2.l -; GFX12-GI-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-GI-NEXT: v_cvt_i16_f16_e32 v3.h, s7 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i16> 
@llvm.fptosi.sat.v8f16.v8i16(<8 x half> %f) ret <8 x i16> %x @@ -6343,24 +6419,37 @@ define <8 x i64> @test_s_signed_v8f16_v8i64(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 ; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 ; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 ; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 ; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s4, s4 ; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s5, s5 ; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s6, s6 ; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_i32_f32 s7, s7 ; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, 0 ; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, 0 ; GFX12-GI-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, 0 ; GFX12-GI-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v15, 0 +; GFX12-GI-NEXT: 
v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f) ret <8 x i64> %x diff --git a/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll index af1a8872f2995..613871827584a 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll @@ -3117,17 +3117,26 @@ define <4 x i1> @test_s_unsigned_v4f16_v4i1(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 ; GFX12-GI-NEXT: s_min_u32 s0, s0, 1 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_u32 s2, s2, 1 ; GFX12-GI-NEXT: s_min_u32 s1, s1, 1 +; GFX12-GI-NEXT: s_min_u32 s3, s3, 1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i1> @llvm.fptoui.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -3331,17 +3340,26 @@ define <4 x i8> @test_s_unsigned_v4f16_v4i8(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 ; GFX12-GI-NEXT: s_min_u32 s0, s0, 0xff +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_u32 s2, s2, 0xff ; GFX12-GI-NEXT: s_min_u32 s1, s1, 0xff +; GFX12-GI-NEXT: s_min_u32 s3, s3, 0xff ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i8> @llvm.fptoui.sat.v4f16.v4i8(<4 x half> %f) ret <4 x i8> %x @@ -3478,10 +3496,13 @@ define <4 x i16> @test_s_unsigned_v4f16_v4i16(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v0.l, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v0.h, s2 ; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v1.l, s1 -; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX12-GI-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v1.h, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i16> @llvm.fptoui.sat.v4f16.v4i16(<4 x half> %f) ret <4 x i16> %x @@ -3594,19 +3615,23 @@ define <4 x i64> @test_s_unsigned_v4f16_v4i64(<4 x half> inreg %f) { ; 
GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-GI-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 -; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s0 -; GFX12-GI-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GI-NEXT: v_mov_b32_e32 v4, s1 -; GFX12-GI-NEXT: v_mov_b32_e32 v6, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i64> @llvm.fptoui.sat.v4f16.v4i64(<4 x half> %f) ret <4 x i64> %x @@ -5110,25 +5135,42 @@ define <8 x i1> @test_s_unsigned_v8f16_v8i1(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 ; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 ; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 ; GFX12-GI-NEXT: 
s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 ; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s4, s4 ; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s5, s5 ; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s6, s6 ; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s7, s7 ; GFX12-GI-NEXT: s_min_u32 s0, s0, 1 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_u32 s4, s4, 1 ; GFX12-GI-NEXT: s_min_u32 s1, s1, 1 +; GFX12-GI-NEXT: s_min_u32 s5, s5, 1 ; GFX12-GI-NEXT: s_min_u32 s2, s2, 1 +; GFX12-GI-NEXT: s_min_u32 s6, s6, 1 ; GFX12-GI-NEXT: s_min_u32 s3, s3, 1 +; GFX12-GI-NEXT: s_min_u32 s7, s7, 1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i1> @llvm.fptoui.sat.v8f16.v8i1(<8 x half> %f) ret <8 x i1> %x @@ -5486,25 +5528,42 @@ define <8 x i8> @test_s_unsigned_v8f16_v8i8(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: 
s_cvt_f32_f16 s4, s4 ; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 ; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 ; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 ; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s4, s4 ; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s5, s5 ; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s6, s6 ; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s7, s7 ; GFX12-GI-NEXT: s_min_u32 s0, s0, 0xff +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_min_u32 s4, s4, 0xff ; GFX12-GI-NEXT: s_min_u32 s1, s1, 0xff +; GFX12-GI-NEXT: s_min_u32 s5, s5, 0xff ; GFX12-GI-NEXT: s_min_u32 s2, s2, 0xff +; GFX12-GI-NEXT: s_min_u32 s6, s6, 0xff ; GFX12-GI-NEXT: s_min_u32 s3, s3, 0xff +; GFX12-GI-NEXT: s_min_u32 s7, s7, 0xff ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s1 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i8> @llvm.fptoui.sat.v8f16.v8i8(<8 x half> %f) ret <8 x i8> %x @@ -5722,14 +5781,19 @@ define <8 x i16> @test_s_unsigned_v8f16_v8i16(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; 
GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v0.l, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v0.h, s4 ; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v1.l, s1 +; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v1.h, s5 ; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v2.l, s2 +; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v2.h, s6 ; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v3.l, s3 -; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX12-GI-NEXT: v_mov_b16_e32 v1.h, v1.l -; GFX12-GI-NEXT: v_mov_b16_e32 v2.h, v2.l -; GFX12-GI-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-GI-NEXT: v_cvt_u16_f16_e32 v3.h, s7 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i16> @llvm.fptoui.sat.v8f16.v8i16(<8 x half> %f) ret <8 x i16> %x @@ -5910,24 +5974,37 @@ define <8 x i64> @test_s_unsigned_v8f16_v8i64(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 +; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 +; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 +; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 ; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 ; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 ; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 ; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 ; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s4, s4 ; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s5, s5 ; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s6, s6 ; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: s_cvt_u32_f32 s7, s7 ; GFX12-GI-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, 0 ; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, 0 ; GFX12-GI-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, 0 ; GFX12-GI-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v15, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i64> @llvm.fptoui.sat.v8f16.v8i64(<8 x half> %f) ret <8 x i64> %x From 65a206f2ec552cccf7c96c5306147f0437832ec7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 12 May 2026 11:38:32 +0100 Subject: [PATCH 418/538] [X86] fold-int-pow2-with-fmul-or-fdiv.ll - regenerate with (V)PADD asm comments (#197137) Reduce diff in #197097 --- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index a8a6786d97ea0..b3c46bc865b25 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -13,7 +13,7 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) { ; CHECK-SSE-LABEL: fmul_pow2_4xfloat: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1091567616,1091567616,1091567616,1091567616] ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: 
fmul_pow2_4xfloat: @@ -911,19 +911,19 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: psllq $52, %xmm0 -; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4629137466983448576,4629137466983448576] ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4629137466983448576,4629137466983448576] ; CHECK-AVX2-NEXT: retq ; ; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec: ; CHECK-ONLY-AVX512F: # %bb.0: ; CHECK-ONLY-AVX512F-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-ONLY-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4629137466983448576,4629137466983448576] ; CHECK-ONLY-AVX512F-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow_shl_cnt_vec: @@ -941,7 +941,7 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1092616192,1092616192,1092616192,1092616192] ; CHECK-SSE-NEXT: addps %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; @@ -980,7 +980,7 @@ define <4 x float> @fmul_pow_shl_cnt_vec_no_fma(<4 x i32> %cnt, <4 x float> %add ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_no_fma: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1092616192,1092616192,1092616192,1092616192] ; CHECK-SSE-NEXT: addps %xmm1, 
%xmm0 ; CHECK-SSE-NEXT: retq ; @@ -1017,13 +1017,13 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: psllq $52, %xmm0 -; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4629137466983448576,4628574517030027264] ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: ; CHECK-AVX: # %bb.0: ; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4629137466983448576,4628574517030027264] ; CHECK-AVX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -1035,13 +1035,13 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: psllq $52, %xmm0 -; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4629137466983448576,4624633867356078080] ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: ; CHECK-AVX: # %bb.0: ; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4629137466983448576,4624633867356078080] ; CHECK-AVX-NEXT: retq %shl = shl nsw nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> @@ -1056,7 +1056,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-SSE-NEXT: pslld $23, %xmm0 -; CHECK-SSE-NEXT: paddd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1065353216,1065353216,1065353216,1065353216] ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u] From f7f911f72f050f2a796590f415fe738f21246d73 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 12 May 2026 12:59:52 +0200 Subject: [PATCH 419/538] [libc] Add some types to netinet/in.h (#196932) Not including more types because I need to fix in_addr definition first. This exposes stdint macros and types through the header, but POSIX permits that behavior (and explicitly requires that we define uint8_t and uint32_t). No test as this is just adding a typedef, and I don't *think* we have tests for that, but I can add a "check that type is defined" test if that is desirable. --- libc/include/CMakeLists.txt | 5 ++++- libc/include/llvm-libc-types/CMakeLists.txt | 1 + libc/include/llvm-libc-types/in_port_t.h | 16 ++++++++++++++++ libc/include/netinet/in.yaml | 5 ++++- 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 libc/include/llvm-libc-types/in_port_t.h diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index b1b4a4fd20982..5f836e0f480e1 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -216,8 +216,11 @@ add_header_macro( ../libc/include/netinet/in.yaml netinet/in.h DEPENDS - .llvm_libc_common_h .llvm-libc-macros.netinet_in_macros + .llvm-libc-types.in_addr_t + .llvm-libc-types.in_port_t + .llvm-libc-types.sa_family_t + .llvm_libc_common_h ) add_header_macro( diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index e967b43d81df0..aad6650d90428 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -57,6 +57,7 @@ add_header(gid_t HDR gid_t.h) add_header(uid_t 
HDR uid_t.h) add_header(imaxdiv_t HDR imaxdiv_t.h) add_header(in_addr_t HDR in_addr_t.h) +add_header(in_port_t HDR in_port_t.h DEPENDS libc.include.llvm-libc-macros.stdint_macros) add_header(struct_in_addr HDR struct_in_addr.h DEPENDS .in_addr_t) add_header(ino_t HDR ino_t.h) add_header(key_t HDR key_t.h) diff --git a/libc/include/llvm-libc-types/in_port_t.h b/libc/include/llvm-libc-types/in_port_t.h new file mode 100644 index 0000000000000..daf33f91d95af --- /dev/null +++ b/libc/include/llvm-libc-types/in_port_t.h @@ -0,0 +1,16 @@ +//===-- Definition of in_port_t type --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_IN_PORT_T_H +#define LLVM_LIBC_TYPES_IN_PORT_T_H + +#include "../llvm-libc-types/stdint-macros.h" + +typedef uint16_t in_port_t; + +#endif // LLVM_LIBC_TYPES_IN_PORT_T_H diff --git a/libc/include/netinet/in.yaml b/libc/include/netinet/in.yaml index e7b33dc87334f..8d9749f36ab78 100644 --- a/libc/include/netinet/in.yaml +++ b/libc/include/netinet/in.yaml @@ -14,7 +14,10 @@ macros: macro_header: netinet-in-macros.h - macro_name: IPPROTO_RAW macro_header: netinet-in-macros.h -types: [] +types: + - type_name: in_port_t + - type_name: in_addr_t + - type_name: sa_family_t enums: [] objects: [] functions: [] From 26cae628b42c974597fc12caf1d5a80d7c83156c Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 12 May 2026 13:00:33 +0200 Subject: [PATCH 420/538] Reapply "[lldb] Do not refcount breakpoints in lldb-server" (#195858) (#196891) This reapplies #195858 with a fix for 32-bit arm (and generally, any architecture that uses software single-stepping). 
The problem was that the temporary breakpoints used for single-stepping were interfering with the breakpoints set by the client. The fix is to check for existing breakpoints before setting the temporary ones. To achieve this, I've separated the notion of "next PC candidates for a thread" from "step breakpoints we've actually set". The freebsd code had some software single stepping code, but: - this was [introduced](https://reviews.llvm.org/D95802) for mips64 support, which was [removed](https://github.com/llvm/llvm-project/pull/179582) earlier this year - AFAICT, this never worked since the original patch only checked `m_threads_stepping_with_breakpoint`, but never set it to anything. This is why I'm removing the remnants of the single step support instead of trying to adapt it. The original commit message was: We did not say so explictly, but I'd argue that via https://github.com/llvm/llvm-project/pull/195815, we are supporting stubs which do not refcount breakpoints. In these stubs the set/clear breakpoint packets are idempotent: - setting a breakpoint for the second time is a no-op (returns OK) - clearing a breakpoint clears it, regardless of how many times it has been set - clearing a non-existent breakpoint (either because it was already cleared, or because it was never set) returns an error This makes lldb-server one of those stubs, which makes the code slightly simpler, but more importantly, ensures we do not regress this behavior. 
--- .../lldb/Host/common/NativeProcessProtocol.h | 5 ++- .../Host/common/NativeProcessProtocol.cpp | 14 ++----- .../Process/FreeBSD/NativeProcessFreeBSD.cpp | 17 +------- .../Process/FreeBSD/NativeProcessFreeBSD.h | 3 +- .../Process/Linux/NativeProcessLinux.cpp | 13 +++--- .../NativeProcessSoftwareSingleStep.cpp | 40 +++++++++---------- .../Utility/NativeProcessSoftwareSingleStep.h | 7 +++- .../multi-breakpoint/TestMultiBreakpoint.py | 13 +++++- 8 files changed, 51 insertions(+), 61 deletions(-) diff --git a/lldb/include/lldb/Host/common/NativeProcessProtocol.h b/lldb/include/lldb/Host/common/NativeProcessProtocol.h index 06b36c2cc9eb5..b9c0120016a2d 100644 --- a/lldb/include/lldb/Host/common/NativeProcessProtocol.h +++ b/lldb/include/lldb/Host/common/NativeProcessProtocol.h @@ -158,6 +158,10 @@ class NativeProcessProtocol { virtual Status RemoveBreakpoint(lldb::addr_t addr, bool hardware = false); + bool HasSoftwareBreakpoint(lldb::addr_t addr) { + return m_software_breakpoints.find(addr) != m_software_breakpoints.end(); + } + // Hardware Breakpoint functions virtual const HardwareBreakpointMap &GetHardwareBreakpointMap() const; @@ -419,7 +423,6 @@ class NativeProcessProtocol { protected: struct SoftwareBreakpoint { - uint32_t ref_count; llvm::SmallVector saved_opcodes; llvm::ArrayRef breakpoint_opcodes; }; diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index 196f54b93538d..dbffdc619ef42 100644 --- a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -344,10 +344,8 @@ Status NativeProcessProtocol::SetSoftwareBreakpoint(lldb::addr_t addr, LLDB_LOG(log, "addr = {0:x}, size_hint = {1}", addr, size_hint); auto it = m_software_breakpoints.find(addr); - if (it != m_software_breakpoints.end()) { - ++it->second.ref_count; + if (it != m_software_breakpoints.end()) return Status(); - } auto expected_bkpt = EnableSoftwareBreakpoint(addr, size_hint); 
if (!expected_bkpt) return Status::FromError(expected_bkpt.takeError()); @@ -362,14 +360,10 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { auto it = m_software_breakpoints.find(addr); if (it == m_software_breakpoints.end()) return Status::FromErrorString("Breakpoint not found."); - assert(it->second.ref_count > 0); - if (--it->second.ref_count > 0) - return Status(); // Remove the entry from m_software_breakpoints rightaway, so that we don't - // leave behind an entry with ref_count == 0 in case one of the following - // conditions returns an error. The breakpoint is moved so that it can be - // accessed below. + // leave behind an entry in case one of the following conditions returns an + // error. The breakpoint is moved so that it can be accessed below. SoftwareBreakpoint bkpt = std::move(it->second); m_software_breakpoints.erase(it); @@ -503,7 +497,7 @@ NativeProcessProtocol::EnableSoftwareBreakpoint(lldb::addr_t addr, } LLDB_LOG(log, "addr = {0:x}: SUCCESS", addr); - return SoftwareBreakpoint{1, saved_opcode_bytes, *expected_trap}; + return SoftwareBreakpoint{saved_opcode_bytes, *expected_trap}; } llvm::Expected> diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp index 46e9ac1cfd6fa..4853ab2827d9e 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp @@ -316,22 +316,7 @@ void NativeProcessFreeBSD::MonitorSIGTRAP(lldb::pid_t pid) { info.pl_siginfo.si_addr); if (thread) { - auto ®ctx = static_cast( - thread->GetRegisterContext()); - auto thread_info = - m_threads_stepping_with_breakpoint.find(thread->GetID()); - if (thread_info != m_threads_stepping_with_breakpoint.end() && - llvm::is_contained(thread_info->second, regctx.GetPC())) { - thread->SetStoppedByTrace(); - for (auto &&bp_addr : thread_info->second) { - Status brkpt_error = RemoveBreakpoint(bp_addr); - 
if (brkpt_error.Fail()) - LLDB_LOG(log, "pid = {0} remove stepping breakpoint: {1}", - thread_info->first, brkpt_error); - } - m_threads_stepping_with_breakpoint.erase(thread_info); - } else - thread->SetStoppedByBreakpoint(); + thread->SetStoppedByBreakpoint(); FixupBreakpointPCAsNeeded(*thread); SetCurrentThreadID(thread->GetID()); } diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.h b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.h index 4a3da9e987e3c..7e8bdc527f420 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.h +++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.h @@ -27,8 +27,7 @@ namespace process_freebsd { /// for debugging. /// /// Changes in the inferior process state are broadcasted. -class NativeProcessFreeBSD : public NativeProcessELF, - private NativeProcessSoftwareSingleStep { +class NativeProcessFreeBSD : public NativeProcessELF { public: class Manager : public NativeProcessProtocol::Manager { public: diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 1ad57bd0c19e1..80f1b5662ba61 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1931,14 +1931,13 @@ void NativeProcessLinux::SignalIfAllThreadsStopped() { // Clear any temporary breakpoints we used to implement software single // stepping. 
- for (const auto &thread_info : m_threads_stepping_with_breakpoint) { - for (auto &&bp_addr : thread_info.second) { - Status error = RemoveBreakpoint(bp_addr); - if (error.Fail()) - LLDB_LOG(log, "pid = {0} remove stepping breakpoint: {1}", - thread_info.first, error); - } + for (addr_t bp_addr : m_step_breakpoints) { + Status error = RemoveBreakpoint(bp_addr); + if (error.Fail()) + LLDB_LOG(log, "pid = {0} remove stepping breakpoint: {1}", bp_addr, + error); } + m_step_breakpoints.clear(); m_threads_stepping_with_breakpoint.clear(); // Notify the delegate about the stop diff --git a/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp b/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp index eddf4b97babae..e962b5ae939b0 100644 --- a/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp +++ b/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp @@ -87,21 +87,6 @@ static size_t WriteMemoryCallback(EmulateInstruction *instruction, void *baton, return length; } -static Status SetSoftwareBreakpoint(lldb::addr_t bp_addr, unsigned bp_size, - NativeProcessProtocol &process) { - Status error; - error = process.SetBreakpoint(bp_addr, bp_size, /*hardware=*/false); - - // If setting the breakpoint fails because pc is out of the address - // space, ignore it and let the debugee segfault. 
- if (error.GetError() == EIO || error.GetError() == EFAULT) - return Status(); - if (error.Fail()) - return error; - - return Status(); -} - Status NativeProcessSoftwareSingleStep::SetupSoftwareSingleStepping( NativeThreadProtocol &thread) { Status error; @@ -122,24 +107,37 @@ Status NativeProcessSoftwareSingleStep::SetupSoftwareSingleStepping( emulator_up->SetWriteMemCallback(&WriteMemoryCallback); emulator_up->SetWriteRegCallback(&WriteRegisterCallback); - auto bp_locaions_predictor = + auto bp_locations_predictor = EmulateInstruction::CreateBreakpointLocationPredictor( std::move(emulator_up)); - auto bp_locations = bp_locaions_predictor->GetBreakpointLocations(error); + BreakpointLocations candidates = + bp_locations_predictor->GetBreakpointLocations(error); if (error.Fail()) return error; - for (auto &&bp_addr : bp_locations) { - auto bp_size = bp_locaions_predictor->GetBreakpointSize(bp_addr); + for (addr_t bp_addr : candidates) { + if (process.HasSoftwareBreakpoint(bp_addr)) + continue; + auto bp_size = bp_locations_predictor->GetBreakpointSize(bp_addr); if (auto err = bp_size.takeError()) return Status(toString(std::move(err))); - error = SetSoftwareBreakpoint(bp_addr, *bp_size, process); + error = process.SetBreakpoint(bp_addr, *bp_size, /*hardware=*/false); + + // If setting the breakpoint fails because pc is out of the address + // space, ignore it and let the debugee segfault. 
+ if (error.GetError() == EIO || error.GetError() == EFAULT) { + error.Clear(); + continue; + } if (error.Fail()) return error; + + m_step_breakpoints.emplace(bp_addr); } - m_threads_stepping_with_breakpoint.insert({thread.GetID(), bp_locations}); + m_threads_stepping_with_breakpoint.emplace(thread.GetID(), + std::move(candidates)); return error; } diff --git a/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.h b/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.h index 4e3fca30684fa..0e4d7c5656bb5 100644 --- a/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.h +++ b/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.h @@ -11,8 +11,8 @@ #include "lldb/Host/common/NativeProcessProtocol.h" #include "lldb/Host/common/NativeThreadProtocol.h" - #include +#include namespace lldb_private { @@ -22,9 +22,12 @@ class NativeProcessSoftwareSingleStep { protected: // List of thread ids stepping with a breakpoint with the address of - // the relevan breakpoint + // next PC candidates. std::map> m_threads_stepping_with_breakpoint; + + // The list of stepping breakpoints. 
+ std::set m_step_breakpoints; }; } // namespace lldb_private diff --git a/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py b/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py index eb9e2952d5a49..a4d6351e05d65 100644 --- a/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py +++ b/lldb/test/API/functionalities/multi-breakpoint/TestMultiBreakpoint.py @@ -17,6 +17,8 @@ # Runs on systems where we can always predict the software break size @skipIf(archs=no_match(["x86_64", "arm64", "aarch64"])) class TestMultiBreakpoint(TestBase): + NO_DEBUG_INFO_TESTCASE = True + def check_invalid_packet(self, packet_str): reply = lldbutil.send_packet_get_reply(self, packet_str) if reply.startswith("E"): @@ -60,6 +62,9 @@ def get_function_address(self, name): return f"{addr:x}" def test_multi_breakpoint(self): + # Debugserver uses refcounted breakpoints + breakpoints_are_refcounted = self.platformIsDarwin() + self.build() source_file = lldb.SBFileSpec("main.c") self.target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( @@ -151,7 +156,9 @@ def make_packet(array): # Clean up both. array = [f"z0,{addr_a},{bp_kind}", f"z0,{addr_a},{bp_kind}"] reply = self.send_packet(make_packet(array)) - self.assertMultiResponse(reply, ["OK", "OK"]) + self.assertMultiResponse( + reply, ["OK", "OK" if breakpoints_are_refcounted else "error"] + ) # --- Set the same breakpoint twice, but remove it thrice. 
array = [f"Z0,{addr_a},{bp_kind}", f"Z0,{addr_a},{bp_kind}"] @@ -163,7 +170,9 @@ def make_packet(array): f"z0,{addr_a},{bp_kind}", ] reply = self.send_packet(make_packet(array)) - self.assertMultiResponse(reply, ["OK", "OK", "error"]) + self.assertMultiResponse( + reply, ["OK", "OK" if breakpoints_are_refcounted else "error", "error"] + ) # --- Set and remove the same address in a single packet --- # The spec requires requests to be executed in order, so the set From 215bd25a7bec346445402ae6535d42d71bf2cc18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Tue, 12 May 2026 13:08:37 +0200 Subject: [PATCH 421/538] [analyzer] Clean up evalBind, fix bad logic (#196313) This commit refactors `ExprEngine::evalBind` to eliminate the use of a `NodeBuilder` and fix incorrect logic that was apparently introduced because the `NodeBuilder` had obfuscated the underlying set operations. In the special case when the engine is binding to an `Unknown` or `Undefined` memory location, with the old code on each execution path _either_ only the `check::Bind` checkers _or_ only the pointer escape checkers were invoked. This commit ensures that on each execution path _both_ the `check::Bind` checkers _and then_ the pointer escape checkers get a chance to activate. I'm pretty sure that the bad logic did not cause incorrect behavior of the analyzer, because there are no `checkBind` checkers that generate non-sink transitions when the location is `Unknown` or `Undefined`. I also added an assertion that the location argument of `evalBind` cannot be a `NonLoc`, because this is a common sense precondition, seems to be actually true and makes it easier to reason about the behavior of this function. 
--- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 52 +++++++------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 1efe7e6f84b23..123207f4a1e0d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -3713,52 +3713,38 @@ ExprEngine::notifyCheckersOfPointerEscape(ProgramStateRef State, } /// evalBind - Handle the semantics of binding a value to a specific location. -/// This method is used by evalStore and (soon) VisitDeclStmt, and others. +/// This method is used by evalStore, VisitDeclStmt, and others. void ExprEngine::evalBind(ExplodedNodeSet &Dst, const Stmt *StoreE, - ExplodedNode *Pred, SVal location, SVal Val, + ExplodedNode *Pred, SVal Location, SVal Val, bool AtDeclInit, const ProgramPoint *PP) { + // It may be a Loc, UnknownVal or perhaps UndefinedVal. + assert(!isa(Location) && "evalBind location should not be NonLoc!"); + const LocationContext *LC = Pred->getLocationContext(); - PostStmt PS(StoreE, LC); + PostStmt DefaultPP(StoreE, LC); if (!PP) - PP = &PS; + PP = &DefaultPP; // Do a previsit of the bind. ExplodedNodeSet CheckedSet; - getCheckerManager().runCheckersForBind(CheckedSet, Pred, location, Val, + getCheckerManager().runCheckersForBind(CheckedSet, Pred, Location, Val, StoreE, AtDeclInit, *this, *PP); - NodeBuilder Bldr(CheckedSet, Dst, *currBldrCtx); - - // If the location is not a 'Loc', it will already be handled by - // the checkers. There is nothing left to do. 
- if (!isa(location)) { - const ProgramPoint L = PostStore(StoreE, LC, /*Loc*/nullptr, - /*tag*/nullptr); - ProgramStateRef state = Pred->getState(); - state = processPointerEscapedOnBind(state, location, Val, LC); - Bldr.generateNode(L, state, Pred); - return; - } - - for (const auto PredI : CheckedSet) { - ProgramStateRef state = PredI->getState(); - - state = processPointerEscapedOnBind(state, location, Val, LC); + for (ExplodedNode *PredI : CheckedSet) { + ProgramStateRef State = PredI->getState(); - // When binding the value, pass on the hint that this is a initialization. - // For initializations, we do not need to inform clients of region - // changes. - state = state->bindLoc(location.castAs(), Val, LC, - /* notifyChanges = */ !AtDeclInit); + // Check and record that 'Val' may escape: + State = processPointerEscapedOnBind(State, Location, Val, LC); - const MemRegion *LocReg = nullptr; - if (std::optional LocRegVal = - location.getAs()) { - LocReg = LocRegVal->getRegion(); + if (auto AsLoc = Location.getAs()) { + // When binding the value, pass on the hint that this is a + // initialization. For initializations, we do not need to inform clients + // of region changes. + State = State->bindLoc(*AsLoc, Val, LC, /*notifyChanges=*/!AtDeclInit); } - const ProgramPoint L = PostStore(StoreE, LC, LocReg, nullptr); - Bldr.generateNode(L, state, PredI); + PostStore PS(StoreE, LC, Location.getAsRegion(), /*tag=*/nullptr); + Dst.insert(Engine.makeNode(PS, State, PredI)); } } From c2d51a2719026acd30d505cf11d7f33bc27adafd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 12 May 2026 12:22:04 +0100 Subject: [PATCH 422/538] [VPlan] Add Type* and getType() to VPSymbolicValue (NFC) (#195183) Add a Type* field to VPSymbolicValue, along with a getType() methods to query the stored scalar type. This makes it easier to retrieve the type of various symbolic values, and also simplifies VPTypeAnalysis construction. 
PR: https://github.com/llvm/llvm-project/pull/195183 --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 11 +++++--- llvm/lib/Transforms/Vectorize/VPlan.h | 24 +++++++++++------ .../Transforms/Vectorize/VPlanAnalysis.cpp | 26 +++---------------- llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 8 ++---- .../Vectorize/VPlanConstruction.cpp | 7 ++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 8 +++++- .../Transforms/Vectorize/VPlanTest.cpp | 2 +- .../Transforms/Vectorize/VPlanTestBase.h | 3 ++- 11 files changed, 45 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 3c8f3362ae93a..2ec1b002f56f8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -862,7 +862,8 @@ VPInstruction *VPRegionBlock::getOrCreateCanonicalIVIncrement() { CanIV->getDebugLoc(), "index.next"); } -VPlan::VPlan(Loop *L) { +VPlan::VPlan(Loop *L, Type *IdxTy) + : VectorTripCount(IdxTy), VF(IdxTy), UF(IdxTy), VFxUF(IdxTy) { setEntry(createVPIRBasicBlock(L->getLoopPreheader())); ScalarHeader = createVPIRBasicBlock(L->getHeader()); @@ -873,7 +874,7 @@ VPlan::VPlan(Loop *L) { } VPlan::~VPlan() { - VPSymbolicValue DummyValue; + VPSymbolicValue DummyValue(nullptr); for (auto *VPB : CreatedBlocks) { if (auto *VPBB = dyn_cast(VPB)) { @@ -1236,7 +1237,8 @@ VPlan *VPlan::duplicate() { NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); } // Create VPlan, clone live-ins and remap operands in the cloned blocks. 
- auto *NewPlan = new VPlan(cast(NewEntry), NewScalarHeader); + auto *NewPlan = + new VPlan(cast(NewEntry), NewScalarHeader, getIndexType()); DenseMap Old2NewVPValues; for (VPIRValue *OldLiveIn : getLiveIns()) Old2NewVPValues[OldLiveIn] = NewPlan->getOrAddLiveIn(OldLiveIn); @@ -1258,7 +1260,8 @@ VPlan *VPlan::duplicate() { "All VPSymbolicValues must be handled below"); if (BackedgeTakenCount) - NewPlan->BackedgeTakenCount = new VPSymbolicValue(); + NewPlan->BackedgeTakenCount = + new VPSymbolicValue(BackedgeTakenCount->getType()); // Map and propagate materialized state for symbolic values. for (auto [OldSV, NewSV] : diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 63436c79e9a98..f6e77092e016c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4559,9 +4559,11 @@ class VPlan { SmallVector CreatedBlocks; /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader - /// wrapping the original header of the scalar loop. - VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) - : Entry(Entry), ScalarHeader(ScalarHeader) { + /// wrapping the original header of the scalar loop. The vector loop will have + /// index type \p IdxTy. + VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader, Type *IdxTy) + : Entry(Entry), ScalarHeader(ScalarHeader), VectorTripCount(IdxTy), + VF(IdxTy), UF(IdxTy), VFxUF(IdxTy) { Entry->setPlan(this); assert(ScalarHeader->getNumSuccessors() == 0 && "scalar header must be a leaf node"); @@ -4570,12 +4572,14 @@ class VPlan { public: /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the /// original preheader and scalar header of \p L, to be used as entry and - /// scalar header blocks of the new VPlan. - VPlan(Loop *L); + /// scalar header blocks of the new VPlan. The vector loop will have index + /// type \p IdxTy. 
+ VPlan(Loop *L, Type *IdxTy); /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock - /// wrapping \p ScalarHeaderBB and a trip count of \p TC. - VPlan(BasicBlock *ScalarHeaderBB) { + /// wrapping \p ScalarHeaderBB and vector loop index of type \p IdxTy. + VPlan(BasicBlock *ScalarHeaderBB, Type *IdxTy) + : VectorTripCount(IdxTy), VF(IdxTy), UF(IdxTy), VFxUF(IdxTy) { setEntry(createVPBasicBlock("preheader")); ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB); } @@ -4678,8 +4682,9 @@ class VPlan { /// The backedge taken count of the original loop. VPValue *getOrCreateBackedgeTakenCount() { + // BTC shares the canonical IV type with VectorTripCount. if (!BackedgeTakenCount) - BackedgeTakenCount = new VPSymbolicValue(); + BackedgeTakenCount = new VPSymbolicValue(VectorTripCount.getType()); return BackedgeTakenCount; } VPValue *getBackedgeTakenCount() const { return BackedgeTakenCount; } @@ -4907,6 +4912,9 @@ class VPlan { return ScalarPH && is_contained(ScalarPH->getPredecessors(), getMiddleBlock()); } + + /// The type of the canonical induction variable of the vector loop. + Type *getIndexType() const { return VF.getType(); } }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index a42b631cd3304..ddfd528d42217 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -24,23 +24,6 @@ using namespace VPlanPatternMatch; #define DEBUG_TYPE "vplan" -VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) - : Ctx(Plan.getContext()), DL(Plan.getDataLayout()) { - if (auto LoopRegion = Plan.getVectorLoopRegion()) { - CanonicalIVTy = LoopRegion->getCanonicalIVType(); - return; - } - - // If there's no loop region, retrieve the type from the trip count - // expression. 
- auto *TC = Plan.getTripCount(); - if (auto *TCIRV = dyn_cast(TC)) { - CanonicalIVTy = TCIRV->getType(); - return; - } - CanonicalIVTy = cast(TC)->getSCEV()->getType(); -} - Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) { Type *ResTy = inferScalarType(R->getIncomingValue(0)); for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) { @@ -128,7 +111,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::LastActiveLane: // Assume that the maximum possible number of elements in a vector fits // within the index type for the default address space. - return DL.getIndexType(Ctx, 0); + return R->getParent()->getPlan()->getDataLayout().getIndexType(Ctx, 0); case VPInstruction::LogicalAnd: case VPInstruction::LogicalOr: assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) && @@ -286,11 +269,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { if (auto *IRV = dyn_cast(V)) return IRV->getType(); - if (isa(V)) { - // All VPValues without any underlying IR value (like the vector trip count - // or the backedge-taken count) have the same type as the canonical IV. - return CanonicalIVTy; - } + if (auto *SymbolicV = dyn_cast(V)) + return SymbolicV->getType(); if (auto *RegionV = dyn_cast(V)) return RegionV->getType(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index c1c9075420d1c..b73636c31f0a7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -9,6 +9,7 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H #define LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H +#include "VPlan.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" @@ -45,12 +46,7 @@ struct VPCostContext; /// of the previously inferred types. class VPTypeAnalysis { DenseMap CachedTypes; - /// Type of the canonical induction variable. 
Used for all VPValues without - /// any underlying IR value (like the vector trip count or the backedge-taken - /// count). - Type *CanonicalIVTy; LLVMContext &Ctx; - const DataLayout &DL; Type *inferScalarTypeForRecipe(const VPBlendRecipe *R); Type *inferScalarTypeForRecipe(const VPInstruction *R); @@ -60,7 +56,7 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); public: - VPTypeAnalysis(const VPlan &Plan); + VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) {} /// Infer the type of \p V. Returns the scalar type of \p V. Type *inferScalarType(const VPValue *V); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index e20d5d947ac54..c2b05984db272 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -78,8 +78,9 @@ class PlainCFGBuilder { void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); public: - PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer) - : TheLoop(Lp), LI(LI), LVer(LVer), Plan(std::make_unique(Lp)) {} + PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer, Type *IdxTy) + : TheLoop(Lp), LI(LI), LVer(LVer), + Plan(std::make_unique(Lp, IdxTy)) {} /// Build plain CFG for TheLoop and connect it to Plan's entry. 
std::unique_ptr buildPlainCFG(); @@ -635,7 +636,7 @@ std::unique_ptr VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, LoopVersioning *LVer) { - PlainCFGBuilder Builder(TheLoop, &LI, LVer); + PlainCFGBuilder Builder(TheLoop, &LI, LVer, InductionTy); std::unique_ptr VPlan0 = Builder.buildPlainCFG(); addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); simplifyLiveInsWithSCEV(*VPlan0, PSE); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 11a91dcd46867..6d5db90436c79 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3017,7 +3017,7 @@ VPExpressionRecipe::VPExpressionRecipe( if (Def && ExpressionRecipesAsSetOfUsers.contains(Def)) continue; addOperand(Op); - LiveInPlaceholders.push_back(new VPSymbolicValue()); + LiveInPlaceholders.push_back(new VPSymbolicValue(nullptr)); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 32d89a34105a4..c836a280eac19 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2678,7 +2678,7 @@ void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) { // Detach all unreachable blocks from their successors, removing their recipes // and incoming values from phi recipes. 
- VPSymbolicValue Tmp; + VPSymbolicValue Tmp(nullptr); for (VPBlockBase *B : AllBlocks) { if (Reachable.contains(B)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index f1b9efae08377..30dd61ba5b232 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -345,7 +345,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPBuilder Builder(VPR); const DataLayout &DL = Plan.getDataLayout(); Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR)); - Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF()); + Type *VFTy = Plan.getVF().getType(); VPValue *VF = Builder.createScalarZExtOrTrunc( &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown()); // VFxUF does not wrap, so VF * Part also cannot wrap. diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 8f9806adf774e..777da17c904f3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -278,12 +278,15 @@ struct VPConstantInt : public VPIRValue { /// A symbolic live-in VPValue, used for values like vector trip count, VF, and /// VFxUF. struct VPSymbolicValue : public VPValue { - VPSymbolicValue() : VPValue(VPVSymbolicSC, nullptr) {} + VPSymbolicValue(Type *Ty) : VPValue(VPVSymbolicSC, nullptr), Ty(Ty) {} static bool classof(const VPValue *V) { return V->getVPValueID() == VPVSymbolicSC; } + /// Returns the scalar type of this symbolic value. + Type *getType() const { return Ty; } + /// Returns true if this symbolic value has been materialized. bool isMaterialized() const { return Materialized; } @@ -294,6 +297,9 @@ struct VPSymbolicValue : public VPValue { } private: + /// The scalar type of this symbolic value. + Type *Ty; + /// Track whether this symbolic value has been materialized (replaced). /// After materialization, accessing users should trigger an assertion. 
bool Materialized = false; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 61acb5846a9cb..1dceed39b2d03 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -53,7 +53,7 @@ define void @f(i32 %x) { Loop *L = LI->getLoopFor(LoopHeader); PredicatedScalarEvolution PSE(*SE, *L); - VPlan Plan(LoopHeader); + VPlan Plan(LoopHeader, IntegerType::get(*Ctx, 32)); Argument *X = F->getArg(0); VPValue *Op = Plan.getOrAddLiveIn(X); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index 2c1797a5a724e..4f8f3581afa8c 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -133,7 +133,8 @@ class VPlanTestBase : public testing::Test { } VPlan &getPlan() { - Plans.push_back(std::make_unique(ScalarHeader)); + Plans.push_back( + std::make_unique(ScalarHeader, IntegerType::get(C, 64))); VPlan &Plan = *Plans.back(); VPValue *DefaultTC = Plan.getConstantInt(32, 1024); Plan.setTripCount(DefaultTC); From 41e493aec0b9db0c9758c2c10449583d9ea297fb Mon Sep 17 00:00:00 2001 From: Romanov Vlad Date: Tue, 12 May 2026 13:35:19 +0200 Subject: [PATCH 423/538] [RegAlloc] Trace through COPYs to find rematerializable definitions (#190955) After live range splitting, successful rematerialization in one split interval can remove the original defining instruction, leaving only COPY instructions in other split intervals. When attempting to rematerialize uses in those intervals, the code fails to find the original definition and gives up. This patch traces backwards through COPY chains to recover the original rematerializable definition instead of giving up. 
--- llvm/lib/CodeGen/InlineSpiller.cpp | 43 +++++++++- .../CodeGen/AMDGPU/remat-through-copy.mir | 82 +++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/remat.ll | 38 ++------- 3 files changed, 129 insertions(+), 34 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/remat-through-copy.mir diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 5544941d66b63..ae5f374539707 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -688,10 +688,45 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { // live interval; this happens if we rematted to all uses, and // then further split one of those live ranges. if (!DefMI) { - markValueUsed(&VirtReg, ParentVNI); - LLVM_DEBUG(dbgs() << "\tcannot remat missing def for " << UseIdx << '\t' - << MI); - return false; + // Try to find the rematerializable definition by tracing through COPY + // chains. + LiveInterval &LI = LIS.getInterval(VirtReg.reg()); + VNInfo *CurVNI = LI.getVNInfoAt(UseIdx); + MachineInstr *CurDef = nullptr; + + LLVM_DEBUG(dbgs() << "\ttracing COPY chain from " + << printReg(VirtReg.reg(), &TRI) << "\n"); + + // Trace backwards through COPY chain using VNInfo + while (CurVNI) { + CurDef = LIS.getInstructionFromIndex(CurVNI->def); + + LLVM_DEBUG(dbgs() << "\t -> def at " << CurVNI->def << ": " + << (CurDef ? 
TII.getName(CurDef->getOpcode()) : "null") + << "\n"); + + if (!CurDef || !CurDef->isFullCopy()) + break; + + Register SrcReg = CurDef->getOperand(1).getReg(); + if (!SrcReg.isVirtual()) + break; + LLVM_DEBUG(dbgs() << "\t -> tracing through COPY to " + << printReg(SrcReg, &TRI) << "\n"); + LiveInterval &SrcLI = LIS.getInterval(SrcReg); + CurVNI = SrcLI.getVNInfoBefore(CurVNI->def); + } + if (CurDef && TII.isReMaterializable(*CurDef)) { + DefMI = CurDef; + LLVM_DEBUG(dbgs() << "\tFound remat possibility through COPY chain: " + << *DefMI); + } + if (!DefMI) { + markValueUsed(&VirtReg, ParentVNI); + LLVM_DEBUG(dbgs() << "\tcannot remat missing def for " << UseIdx << '\t' + << MI); + return false; + } } LiveRangeEdit::Remat RM(ParentVNI); diff --git a/llvm/test/CodeGen/AMDGPU/remat-through-copy.mir b/llvm/test/CodeGen/AMDGPU/remat-through-copy.mir new file mode 100644 index 0000000000000..44cedf20640d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat-through-copy.mir @@ -0,0 +1,82 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --stress-regalloc=3 -run-pass=greedy,virtregrewriter -o - %s | FileCheck %s + +# Test that remat is successful when there are artificial COPY instructions due to live ranges splitting + +--- +name: test_remat_s_add_i32 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_remat_s_add_i32 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr0, $vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_XOR_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; CHECK-NEXT: renamable $sgpr2 = S_MOV_B32 0 + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $sgpr2, $vgpr0, $vgpr2, 
$sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5, implicit renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5, implicit renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5, implicit renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5, implicit renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5, implicit renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5, implicit renamable $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $sgpr2 = S_ADD_U32 killed renamable $sgpr2, 1, implicit-def $scc + ; CHECK-NEXT: S_CMP_LT_U32 renamable $sgpr2, 10, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr2, implicit killed renamable $vgpr0 + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1 + + %in1:vgpr_32 = COPY $vgpr0 + %in2:vgpr_32 = COPY $vgpr1 + %vec0:vgpr_32 = V_OR_B32_e32 %in1, %in2, implicit $exec + %vec1:vgpr_32 = V_XOR_B32_e32 %in1, %in2, implicit $exec + %17:sgpr_64 = COPY $sgpr0_sgpr1 + + %new_var:sreg_64 = V_CMP_EQ_U32_e64 %vec1:vgpr_32, %vec0:vgpr_32, implicit $exec + %new_var2:sreg_64 = V_CMP_EQ_U32_e64 %vec1:vgpr_32, 
%vec0:vgpr_32, implicit $exec + + %i:sgpr_32 = S_MOV_B32 0 + S_BRANCH %bb.1 + + bb.1: + + S_NOP 0, implicit %new_var, implicit %17 + + S_NOP 0, implicit %new_var2, implicit %17 + + S_NOP 0, implicit %new_var, implicit %17 + + S_NOP 0, implicit %new_var2, implicit %17 + + S_NOP 0, implicit %new_var, implicit %17 + + S_NOP 0, implicit %new_var2, implicit %17 + + %i:sgpr_32 = S_ADD_U32 %i:sgpr_32, 1, implicit-def $scc + S_CMP_LT_U32 %i:sgpr_32, 10, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_NOP 0, implicit %vec0, implicit %vec1 + S_ENDPGM 0 +... + diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 57f1977c27b82..430db5b1290fb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -314,13 +314,10 @@ define i64 @dual_remat(i64 %0, %1, %2, p ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 @@ -338,12 +335,6 @@ define i64 @dual_remat(i64 %0, %1, %2, p ; CHECK-NEXT: .LBB8_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a5, a4, 4 -; CHECK-NEXT: add a4, a5, a4 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vand.vv v16, v0, v8 ; CHECK-NEXT: 
vmv8r.v v8, v24 ; CHECK-NEXT: vmsne.vi v24, v16, 0 @@ -353,12 +344,7 @@ define i64 @dual_remat(i64 %0, %1, %2, p ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a5, a4, 4 -; CHECK-NEXT: add a4, a5, a4 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vand.vv v16, v24, v8 ; CHECK-NEXT: vmsne.vi v8, v16, 0 ; CHECK-NEXT: csrr a4, vlenb @@ -370,17 +356,12 @@ define i64 @dual_remat(i64 %0, %1, %2, p ; CHECK-NEXT: vslideup.vx v9, v8, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma ; CHECK-NEXT: vcpop.m a4, v9 -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a6, a5, 4 -; CHECK-NEXT: add a5, a6, a5 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vs8r.v v8, (a3) ; CHECK-NEXT: vs8r.v v8, (a2) ; CHECK-NEXT: addi a5, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; CHECK-NEXT: vor.vv v0, v0, v8 ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: slli a5, a5, 3 @@ -392,11 +373,8 @@ define i64 @dual_remat(i64 %0, %1, %2, p ; CHECK-NEXT: # %bb.2: # %middle.block ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add sp, sp, a1 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 From a464577bb084061ffa2ed14c16adcd514abec8cf Mon Sep 17 00:00:00 2001 From: paperchalice Date: Tue, 12 May 2026 19:44:12 +0800 Subject: [PATCH 424/538] [IR] Preserve samesign when cloning ICmpInst 
(#197118) Clone should preserve IR flags faithfully. --- llvm/lib/IR/Instructions.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index f940893ab0296..93ec59846d360 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4360,7 +4360,9 @@ FCmpInst *FCmpInst::cloneImpl() const { } ICmpInst *ICmpInst::cloneImpl() const { - return new ICmpInst(getPredicate(), Op<0>(), Op<1>()); + auto *Result = new ICmpInst(getPredicate(), Op<0>(), Op<1>()); + Result->setSameSign(hasSameSign()); + return Result; } ExtractValueInst *ExtractValueInst::cloneImpl() const { From 06615e07e1829c49d6633acbd9f7115548f5e200 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 12 May 2026 12:44:44 +0100 Subject: [PATCH 425/538] [X86] vector-reduce-ctpop.ll - add 32-bit test coverage (#197149) --- llvm/test/CodeGen/X86/vector-reduce-ctpop.ll | 5616 +++++++++++++++--- 1 file changed, 4692 insertions(+), 924 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll index 29366f74da12a..e936f1e4faf21 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll @@ -1,50 +1,179 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX512VPOPCNT +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck 
%s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+avx512vl,+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX512VPOPCNT ; ; Reductions of per-element ctpop results (count all bits in a vector) ; -define i64 @reduce_ctpop_v2i64(<2 x i64> %a0) { -; SSE42-LABEL: reduce_ctpop_v2i64: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pand %xmm1, %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: pshufb %xmm2, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm1, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm3 -; SSE42-NEXT: paddb %xmm4, %xmm3 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: psadbw %xmm3, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE42-NEXT: paddq %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, %rax -; SSE42-NEXT: retq -; -; AVX2-LABEL: reduce_ctpop_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: retq +define i64 @reduce_ctpop_v2i64(<2 x i64> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v2i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v2i64: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: 
movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: paddb %xmm2, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v2i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: pand %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm4 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE4-NEXT: paddb %xmm4, %xmm3 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pand %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm4 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X64-SSE4-NEXT: paddb %xmm4, %xmm3 +; X64-SSE4-NEXT: pxor %xmm0, %xmm0 +; X64-SSE4-NEXT: psadbw %xmm3, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v2i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v2i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: reduce_ctpop_v2i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; 
X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: reduce_ctpop_v2i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X64-AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v2i64: ; AVX512VL: # %bb.0: @@ -75,31 +204,139 @@ define i64 @reduce_ctpop_v2i64(<2 x i64> %a0) { ret i64 %r0 } -define i32 @reduce_ctpop_v4i32(<4 x i32> %a0) { -; SSE42-LABEL: reduce_ctpop_v4i32: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pand %xmm1, %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: pshufb %xmm2, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm1, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm3 -; SSE42-NEXT: paddb %xmm4, %xmm3 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero 
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE42-NEXT: psadbw %xmm0, %xmm3 -; SSE42-NEXT: psadbw %xmm0, %xmm1 -; SSE42-NEXT: packuswb %xmm3, %xmm1 -; SSE42-NEXT: packuswb %xmm3, %xmm3 -; SSE42-NEXT: paddd %xmm1, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE42-NEXT: paddd %xmm3, %xmm0 -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: retq +define i32 @reduce_ctpop_v4i32(<4 x i32> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v4i32: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE2-NEXT: psadbw %xmm0, %xmm2 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: packuswb %xmm2, %xmm1 +; X86-SSE2-NEXT: packuswb %xmm2, %xmm2 +; X86-SSE2-NEXT: paddd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: paddd %xmm2, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v4i32: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: paddb %xmm2, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X64-SSE2-NEXT: psadbw %xmm0, %xmm2 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE2-NEXT: packuswb %xmm2, %xmm1 +; X64-SSE2-NEXT: packuswb %xmm2, %xmm2 +; X64-SSE2-NEXT: paddd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X64-SSE2-NEXT: paddd %xmm2, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v4i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: pand %xmm2, %xmm3 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE4-NEXT: pshufb %xmm3, %xmm4 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm2, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X86-SSE4-NEXT: paddb %xmm4, %xmm1 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; X86-SSE4-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm2 +; X86-SSE4-NEXT: packuswb %xmm1, %xmm2 +; X86-SSE4-NEXT: packuswb %xmm1, %xmm1 +; X86-SSE4-NEXT: paddd %xmm2, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE4-NEXT: paddd %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; 
X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v4i32: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pand %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm4 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X64-SSE4-NEXT: paddb %xmm4, %xmm3 +; X64-SSE4-NEXT: pxor %xmm0, %xmm0 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-SSE4-NEXT: psadbw %xmm0, %xmm3 +; X64-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE4-NEXT: packuswb %xmm3, %xmm1 +; X64-SSE4-NEXT: packuswb %xmm3, %xmm3 +; X64-SSE4-NEXT: paddd %xmm1, %xmm3 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; X64-SSE4-NEXT: paddd %xmm3, %xmm0 +; X64-SSE4-NEXT: movd %xmm0, %eax +; X64-SSE4-NEXT: retq +; +; AVX1-LABEL: reduce_ctpop_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd 
%xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: reduce_ctpop_v4i32: ; AVX2: # %bb.0: @@ -122,7 +359,7 @@ define i32 @reduce_ctpop_v4i32(<4 x i32> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: reduce_ctpop_v4i32: ; AVX512VL: # %bb.0: @@ -158,29 +395,103 @@ define i32 @reduce_ctpop_v4i32(<4 x i32> %a0) { ret i32 %r0 } -define i16 @reduce_ctpop_v8i16(<8 x i16> %a0) { -; SSE42-LABEL: reduce_ctpop_v8i16: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pand %xmm1, %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: pshufb %xmm2, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm1, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm3 -; SSE42-NEXT: paddb %xmm4, %xmm3 -; SSE42-NEXT: movdqa %xmm3, %xmm0 -; SSE42-NEXT: psllw $8, %xmm0 -; SSE42-NEXT: paddb %xmm3, %xmm0 -; SSE42-NEXT: psrlw $8, %xmm0 -; SSE42-NEXT: packuswb %xmm0, %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: psadbw %xmm0, %xmm1 -; SSE42-NEXT: movd %xmm1, %eax -; SSE42-NEXT: # kill: def $ax killed $ax killed $eax -; SSE42-NEXT: retq +define i16 @reduce_ctpop_v8i16(<8 x i16> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v8i16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; 
X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psllw $8, %xmm0 +; X86-SSE2-NEXT: paddb %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm0, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v8i16: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: paddb %xmm2, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psllw $8, %xmm0 +; X64-SSE2-NEXT: paddb %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm0, %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; SSE4-LABEL: reduce_ctpop_v8i16: +; SSE4: # %bb.0: +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: pand %xmm1, %xmm2 +; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE4-NEXT: movdqa %xmm3, %xmm4 +; SSE4-NEXT: pshufb %xmm2, %xmm4 +; SSE4-NEXT: psrlw $4, %xmm0 +; SSE4-NEXT: pand %xmm1, %xmm0 +; SSE4-NEXT: pshufb %xmm0, %xmm3 +; 
SSE4-NEXT: paddb %xmm4, %xmm3 +; SSE4-NEXT: movdqa %xmm3, %xmm0 +; SSE4-NEXT: psllw $8, %xmm0 +; SSE4-NEXT: paddb %xmm3, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: packuswb %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: psadbw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} +; +; AVX1-LABEL: reduce_ctpop_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: reduce_ctpop_v8i16: ; AVX2: # %bb.0: @@ -200,7 +511,7 @@ define i16 @reduce_ctpop_v8i16(<8 x i16> %a0) { ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: reduce_ctpop_v8i16: ; AVX512VL: # %bb.0: @@ -238,26 +549,92 @@ define i16 @reduce_ctpop_v8i16(<8 x i16> %a0) { ret i16 %r0 } -define i8 @reduce_ctpop_v16i8(<16 x i8> %a0) { -; SSE42-LABEL: reduce_ctpop_v16i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pand %xmm1, %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm3, %xmm4 -; 
SSE42-NEXT: pshufb %xmm2, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm1, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm3 -; SSE42-NEXT: paddb %xmm4, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE42-NEXT: paddb %xmm3, %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: psadbw %xmm0, %xmm1 -; SSE42-NEXT: movd %xmm1, %eax -; SSE42-NEXT: # kill: def $al killed $al killed $eax -; SSE42-NEXT: retq +define i8 @reduce_ctpop_v16i8(<16 x i8> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v16i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddb %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v16i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: paddb %xmm2, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $4, 
%xmm1 +; X64-SSE2-NEXT: paddb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: paddb %xmm1, %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE2-NEXT: retq +; +; SSE4-LABEL: reduce_ctpop_v16i8: +; SSE4: # %bb.0: +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: pand %xmm1, %xmm2 +; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE4-NEXT: movdqa %xmm3, %xmm4 +; SSE4-NEXT: pshufb %xmm2, %xmm4 +; SSE4-NEXT: psrlw $4, %xmm0 +; SSE4-NEXT: pand %xmm1, %xmm0 +; SSE4-NEXT: pshufb %xmm0, %xmm3 +; SSE4-NEXT: paddb %xmm4, %xmm3 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE4-NEXT: paddb %xmm3, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: psadbw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $al killed $al killed $eax +; SSE4-NEXT: ret{{[l|q]}} +; +; AVX1-LABEL: reduce_ctpop_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: reduce_ctpop_v16i8: ; AVX2: # %bb.0: @@ -275,7 +652,7 @@ define i8 @reduce_ctpop_v16i8(<16 x i8> %a0) { ; 
AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: reduce_ctpop_v16i8: ; AVX512VL: # %bb.0: @@ -313,56 +690,241 @@ define i8 @reduce_ctpop_v16i8(<16 x i8> %a0) { ret i8 %r0 } -define i64 @reduce_ctpop_v4i64(<4 x i64> %a0) { -; SSE42-LABEL: reduce_ctpop_v4i64: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pand %xmm2, %xmm3 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm4, %xmm5 -; SSE42-NEXT: pshufb %xmm3, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm2, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm3 -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: paddb %xmm5, %xmm3 -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pand %xmm2, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm5 -; SSE42-NEXT: pshufb %xmm1, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm2, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm4 -; SSE42-NEXT: paddb %xmm5, %xmm4 -; SSE42-NEXT: paddb %xmm3, %xmm4 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: psadbw %xmm4, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE42-NEXT: paddq %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, %rax -; SSE42-NEXT: retq -; -; AVX2-LABEL: reduce_ctpop_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @reduce_ctpop_v4i64(<4 x i64> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v4i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $1, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: psubb %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddb %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: 
retl +; +; X64-SSE2-LABEL: reduce_ctpop_v4i64: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlw $1, %xmm2 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-SSE2-NEXT: pand %xmm3, %xmm2 +; X64-SSE2-NEXT: psubb %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: psrlw $2, %xmm1 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: paddb %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: psrlw $4, %xmm4 +; X64-SSE2-NEXT: paddb %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE2-NEXT: pand %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE2-NEXT: psrlw $1, %xmm5 +; X64-SSE2-NEXT: pand %xmm3, %xmm5 +; X64-SSE2-NEXT: psubb %xmm5, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: paddb %xmm3, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: psrlw $4, %xmm2 +; X64-SSE2-NEXT: paddb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: paddb %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v4i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm4, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; 
X86-SSE4-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm4 +; X86-SSE4-NEXT: paddb %xmm5, %xmm4 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: paddb %xmm5, %xmm2 +; X86-SSE4-NEXT: paddb %xmm4, %xmm2 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v4i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE4-NEXT: pand %xmm2, %xmm3 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm2, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm3 +; X64-SSE4-NEXT: paddb %xmm5, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE4-NEXT: pand %xmm2, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm2, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm4 +; X64-SSE4-NEXT: paddb %xmm5, %xmm4 +; X64-SSE4-NEXT: paddb %xmm3, %xmm4 +; X64-SSE4-NEXT: pxor %xmm0, %xmm0 +; X64-SSE4-NEXT: psadbw %xmm4, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v4i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: reduce_ctpop_v4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: reduce_ctpop_v4i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; X64-AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; X64-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v4i64: ; AVX512VL: # %bb.0: @@ -398,46 +960,212 @@ define i64 @reduce_ctpop_v4i64(<4 x i64> %a0) { ret i64 %r0 } -define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) { -; SSE42-LABEL: reduce_ctpop_v8i32: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm2, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm4 -; SSE42-NEXT: pshufb %xmm1, %xmm4 -; SSE42-NEXT: paddb %xmm5, %xmm4 -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: psadbw %xmm1, %xmm4 -; SSE42-NEXT: psadbw %xmm1, %xmm5 -; SSE42-NEXT: packuswb %xmm4, %xmm5 -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pshufb %xmm4, %xmm6 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm3, %xmm0 -; SSE42-NEXT: pshufb %xmm0, %xmm2 -; SSE42-NEXT: paddb %xmm6, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE42-NEXT: psadbw %xmm1, %xmm2 -; SSE42-NEXT: psadbw %xmm1, %xmm0 -; SSE42-NEXT: packuswb %xmm2, %xmm0 -; SSE42-NEXT: paddd %xmm5, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE42-NEXT: paddd %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE42-NEXT: paddd %xmm1, %xmm0 -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: 
retq +define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) nounwind { +; SSE2-LABEL: reduce_ctpop_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: psubb %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psrlw $1, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: psubb %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: paddb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm3 +; SSE2-NEXT: paddb %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd 
%xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: ret{{[l|q]}} +; +; X86-SSE4-LABEL: reduce_ctpop_v8i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm4, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm6 +; X86-SSE4-NEXT: paddb %xmm5, %xmm6 +; X86-SSE4-NEXT: pxor %xmm4, %xmm4 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero +; X86-SSE4-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; X86-SSE4-NEXT: psadbw %xmm4, %xmm6 +; X86-SSE4-NEXT: psadbw %xmm4, %xmm1 +; X86-SSE4-NEXT: packuswb %xmm6, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE4-NEXT: pand %xmm3, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm6 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: paddb %xmm6, %xmm2 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; X86-SSE4-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE4-NEXT: psadbw %xmm4, %xmm2 +; X86-SSE4-NEXT: psadbw %xmm4, %xmm0 +; X86-SSE4-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE4-NEXT: paddd %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddd %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE4-NEXT: paddd %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v8i32: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE4-NEXT: pand %xmm3, %xmm4 +; 
X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm3, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm4 +; X64-SSE4-NEXT: paddb %xmm5, %xmm4 +; X64-SSE4-NEXT: pxor %xmm1, %xmm1 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X64-SSE4-NEXT: psadbw %xmm1, %xmm4 +; X64-SSE4-NEXT: psadbw %xmm1, %xmm5 +; X64-SSE4-NEXT: packuswb %xmm4, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE4-NEXT: pand %xmm3, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm6 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm3, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X64-SSE4-NEXT: paddb %xmm6, %xmm2 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE4-NEXT: psadbw %xmm1, %xmm2 +; X64-SSE4-NEXT: psadbw %xmm1, %xmm0 +; X64-SSE4-NEXT: packuswb %xmm2, %xmm0 +; X64-SSE4-NEXT: paddd %xmm5, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE4-NEXT: paddd %xmm1, %xmm0 +; X64-SSE4-NEXT: movd %xmm0, %eax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm1, %xmm3, %xmm4 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; X86-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; X86-AVX1-NEXT: vpaddb 
%xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v8i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} 
xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq ; ; AVX2-LABEL: reduce_ctpop_v8i32: ; AVX2: # %bb.0: @@ -464,7 +1192,7 @@ define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: reduce_ctpop_v8i32: ; AVX512VL: # %bb.0: @@ -504,83 +1232,395 @@ define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) { ret i32 %r0 } -define i64 @reduce_ctpop_v8i64(<8 x i64> %a0) { -; SSE42-LABEL: reduce_ctpop_v8i64: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm4, %xmm7 -; SSE42-NEXT: pshufb %xmm6, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm2, %xmm6 -; SSE42-NEXT: paddb %xmm7, %xmm6 -; 
SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm7 -; SSE42-NEXT: pshufb %xmm2, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm4, %xmm2 -; SSE42-NEXT: pshufb %xmm0, %xmm2 -; SSE42-NEXT: paddb %xmm7, %xmm2 -; SSE42-NEXT: paddb %xmm6, %xmm2 -; SSE42-NEXT: movdqa %xmm3, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm0, %xmm6 -; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: movdqa %xmm4, %xmm0 -; SSE42-NEXT: pshufb %xmm3, %xmm0 -; SSE42-NEXT: paddb %xmm6, %xmm0 -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm3, %xmm6 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: pshufb %xmm1, %xmm4 -; SSE42-NEXT: paddb %xmm6, %xmm4 -; SSE42-NEXT: paddb %xmm0, %xmm4 -; SSE42-NEXT: paddb %xmm2, %xmm4 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: psadbw %xmm4, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE42-NEXT: paddq %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, %rax -; SSE42-NEXT: retq -; -; AVX2-LABEL: reduce_ctpop_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddb 
%ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @reduce_ctpop_v8i64(<8 x i64> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v8i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: paddb %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: psubb %xmm6, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm6, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlw $4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm0, %xmm6 +; X86-SSE2-NEXT: pand %xmm2, %xmm6 +; X86-SSE2-NEXT: paddb %xmm7, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: psrlw 
$1, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: paddb %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: psrlw $2, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddb %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: paddb %xmm6, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v8i64: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: psrlw $1, %xmm4 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-SSE2-NEXT: pand %xmm5, %xmm4 +; X64-SSE2-NEXT: psubb %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pand %xmm4, %xmm6 +; X64-SSE2-NEXT: psrlw $2, %xmm2 +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: paddb %xmm6, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X64-SSE2-NEXT: psrlw $4, %xmm7 +; X64-SSE2-NEXT: paddb 
%xmm2, %xmm7 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE2-NEXT: pand %xmm2, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE2-NEXT: psrlw $1, %xmm6 +; X64-SSE2-NEXT: pand %xmm5, %xmm6 +; X64-SSE2-NEXT: psubb %xmm6, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE2-NEXT: pand %xmm4, %xmm6 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: paddb %xmm6, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE2-NEXT: psrlw $4, %xmm6 +; X64-SSE2-NEXT: paddb %xmm0, %xmm6 +; X64-SSE2-NEXT: pand %xmm2, %xmm6 +; X64-SSE2-NEXT: paddb %xmm7, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrlw $1, %xmm0 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: psubb %xmm0, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: psrlw $2, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm0, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrlw $4, %xmm0 +; X64-SSE2-NEXT: paddb %xmm3, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: psrlw $1, %xmm3 +; X64-SSE2-NEXT: pand %xmm5, %xmm3 +; X64-SSE2-NEXT: psubb %xmm3, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm1 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm3, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: psrlw $4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: paddb %xmm0, %xmm3 +; X64-SSE2-NEXT: paddb %xmm6, %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v8i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: 
movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE4-NEXT: pand %xmm4, %xmm6 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm6 +; X86-SSE4-NEXT: paddb %xmm7, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: paddb %xmm7, %xmm2 +; X86-SSE4-NEXT: paddb %xmm6, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm6 +; X86-SSE4-NEXT: psrlw $4, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm0 +; X86-SSE4-NEXT: paddb %xmm6, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm6 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm3 +; X86-SSE4-NEXT: paddb %xmm6, %xmm3 +; X86-SSE4-NEXT: paddb %xmm0, %xmm3 +; X86-SSE4-NEXT: paddb %xmm2, %xmm3 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp 
+; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v8i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE4-NEXT: pand %xmm5, %xmm6 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm2 +; X64-SSE4-NEXT: pand %xmm5, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm6 +; X64-SSE4-NEXT: paddb %xmm7, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pand %xmm5, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm5, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X64-SSE4-NEXT: paddb %xmm7, %xmm2 +; X64-SSE4-NEXT: paddb %xmm6, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE4-NEXT: pand %xmm5, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm6 +; X64-SSE4-NEXT: psrlw $4, %xmm3 +; X64-SSE4-NEXT: pand %xmm5, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm0 +; X64-SSE4-NEXT: paddb %xmm6, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE4-NEXT: pand %xmm5, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm6 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm5, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm4 +; X64-SSE4-NEXT: paddb %xmm6, %xmm4 +; X64-SSE4-NEXT: paddb %xmm0, %xmm4 +; X64-SSE4-NEXT: paddb %xmm2, %xmm4 +; X64-SSE4-NEXT: pxor %xmm0, %xmm0 +; X64-SSE4-NEXT: psadbw %xmm4, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v8i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpand %xmm2, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v8i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; X64-AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; X64-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm6 +; X64-AVX1-NEXT: vpand %xmm2, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm5 +; X64-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: reduce_ctpop_v8i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; X86-AVX2-NEXT: 
vpshufb %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: reduce_ctpop_v8i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; X64-AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v8i64: ; AVX512VL: # %bb.0: @@ -616,76 +1656,406 @@ define i64 @reduce_ctpop_v8i64(<8 x i64> %a0) { ret i64 %r0 } -define i32 @reduce_ctpop_v16i32(<16 x i32> %a0) { -; SSE42-LABEL: reduce_ctpop_v16i32: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm4, %xmm7 -; SSE42-NEXT: pshufb %xmm6, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm5, %xmm2 -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm2, %xmm6 -; SSE42-NEXT: paddb %xmm7, %xmm6 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE42-NEXT: psadbw %xmm2, %xmm6 -; SSE42-NEXT: psadbw %xmm2, %xmm7 -; SSE42-NEXT: packuswb %xmm6, %xmm7 -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa %xmm4, %xmm8 -; SSE42-NEXT: pshufb %xmm6, %xmm8 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm5, %xmm0 -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm0, %xmm6 -; SSE42-NEXT: paddb %xmm8, %xmm6 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE42-NEXT: psadbw %xmm2, %xmm6 -; SSE42-NEXT: psadbw %xmm2, %xmm0 -; SSE42-NEXT: packuswb %xmm6, %xmm0 -; SSE42-NEXT: paddd %xmm7, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa %xmm4, %xmm7 -; SSE42-NEXT: pshufb %xmm6, %xmm7 -; SSE42-NEXT: psrlw $4, 
%xmm3 -; SSE42-NEXT: pand %xmm5, %xmm3 -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pshufb %xmm3, %xmm6 -; SSE42-NEXT: paddb %xmm7, %xmm6 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE42-NEXT: psadbw %xmm2, %xmm6 -; SSE42-NEXT: psadbw %xmm2, %xmm3 -; SSE42-NEXT: packuswb %xmm6, %xmm3 -; SSE42-NEXT: movdqa %xmm1, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa %xmm4, %xmm7 -; SSE42-NEXT: pshufb %xmm6, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm5, %xmm1 -; SSE42-NEXT: pshufb %xmm1, %xmm4 -; SSE42-NEXT: paddb %xmm7, %xmm4 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE42-NEXT: psadbw %xmm2, %xmm4 -; SSE42-NEXT: psadbw %xmm2, %xmm1 -; SSE42-NEXT: packuswb %xmm4, %xmm1 -; SSE42-NEXT: paddd %xmm3, %xmm1 -; SSE42-NEXT: paddd %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE42-NEXT: paddd %xmm1, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE42-NEXT: paddd %xmm0, %xmm1 -; SSE42-NEXT: movd %xmm1, %eax -; SSE42-NEXT: retq +define i32 @reduce_ctpop_v16i32(<16 x i32> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v16i32: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: paddb %xmm5, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; 
X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm5, %xmm7 +; X86-SSE2-NEXT: pxor %xmm2, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm6 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm6 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm7 +; X86-SSE2-NEXT: packuswb %xmm6, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: psubb %xmm6, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm6, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlw $4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm0, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm6 +; X86-SSE2-NEXT: packuswb %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm0 +; X86-SSE2-NEXT: paddd %xmm7, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: psrlw $1, %xmm7 +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: psubb %xmm7, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: pand %xmm3, %xmm7 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm7, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm0, %xmm7 +; X86-SSE2-NEXT: pand %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE2-NEXT: 
psadbw %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm7 +; X86-SSE2-NEXT: packuswb %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm5, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-SSE2-NEXT: psadbw %xmm2, %xmm3 +; X86-SSE2-NEXT: packuswb %xmm0, %xmm3 +; X86-SSE2-NEXT: paddd %xmm7, %xmm3 +; X86-SSE2-NEXT: paddd %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: paddd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: paddd %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v16i32: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: psrlw $1, %xmm4 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-SSE2-NEXT: pand %xmm5, %xmm4 +; X64-SSE2-NEXT: psubb %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pand %xmm4, %xmm6 +; X64-SSE2-NEXT: psrlw $2, %xmm2 +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: paddb %xmm6, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm8 +; X64-SSE2-NEXT: psrlw $4, %xmm8 +; X64-SSE2-NEXT: paddb 
%xmm2, %xmm8 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE2-NEXT: pand %xmm6, %xmm8 +; X64-SSE2-NEXT: pxor %xmm2, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm7 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm7 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm8 +; X64-SSE2-NEXT: packuswb %xmm7, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: psrlw $1, %xmm7 +; X64-SSE2-NEXT: pand %xmm5, %xmm7 +; X64-SSE2-NEXT: psubb %xmm7, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pand %xmm4, %xmm7 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: paddb %xmm7, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: psrlw $4, %xmm7 +; X64-SSE2-NEXT: paddb %xmm0, %xmm7 +; X64-SSE2-NEXT: pand %xmm6, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm7 +; X64-SSE2-NEXT: packuswb %xmm0, %xmm7 +; X64-SSE2-NEXT: paddd %xmm8, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrlw $1, %xmm0 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: psubb %xmm0, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: psrlw $2, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm0, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrlw $4, %xmm0 +; X64-SSE2-NEXT: paddb %xmm3, %xmm0 +; X64-SSE2-NEXT: pand %xmm6, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm3 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; 
X64-SSE2-NEXT: psadbw %xmm2, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm3, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: psrlw $1, %xmm3 +; X64-SSE2-NEXT: pand %xmm5, %xmm3 +; X64-SSE2-NEXT: psubb %xmm3, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm1 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm3, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: psrlw $4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm1 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-SSE2-NEXT: psadbw %xmm2, %xmm3 +; X64-SSE2-NEXT: packuswb %xmm1, %xmm3 +; X64-SSE2-NEXT: paddd %xmm0, %xmm3 +; X64-SSE2-NEXT: paddd %xmm7, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X64-SSE2-NEXT: paddd %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: paddd %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v16i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm6, %xmm3 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm3 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X86-SSE4-NEXT: paddb %xmm3, %xmm7 +; X86-SSE4-NEXT: pxor %xmm2, %xmm2 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero +; X86-SSE4-NEXT: punpckhdq 
{{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; X86-SSE4-NEXT: psadbw %xmm2, %xmm7 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm5 +; X86-SSE4-NEXT: packuswb %xmm7, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: pand %xmm4, %xmm3 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm3, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm3 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm0 +; X86-SSE4-NEXT: paddb %xmm7, %xmm3 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero +; X86-SSE4-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE4-NEXT: psadbw %xmm2, %xmm3 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm7 +; X86-SSE4-NEXT: packuswb %xmm3, %xmm7 +; X86-SSE4-NEXT: paddd %xmm5, %xmm7 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: pand %xmm4, %xmm3 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm3, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm3 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE4-NEXT: paddb %xmm5, %xmm3 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; X86-SSE4-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE4-NEXT: psadbw %xmm2, %xmm3 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm5 +; X86-SSE4-NEXT: packuswb %xmm3, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm3 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm6 +; X86-SSE4-NEXT: paddb %xmm3, %xmm6 +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero +; X86-SSE4-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X86-SSE4-NEXT: psadbw %xmm2, %xmm6 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm0 +; X86-SSE4-NEXT: packuswb %xmm6, %xmm0 +; 
X86-SSE4-NEXT: paddd %xmm5, %xmm0 +; X86-SSE4-NEXT: paddd %xmm7, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddd %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE4-NEXT: paddd %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v16i32: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE4-NEXT: pand %xmm5, %xmm6 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm2 +; X64-SSE4-NEXT: pand %xmm5, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm6 +; X64-SSE4-NEXT: paddb %xmm7, %xmm6 +; X64-SSE4-NEXT: pxor %xmm2, %xmm2 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X64-SSE4-NEXT: psadbw %xmm2, %xmm6 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm7 +; X64-SSE4-NEXT: packuswb %xmm6, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE4-NEXT: pand %xmm5, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm8 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm8 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm5, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm6 +; X64-SSE4-NEXT: paddb %xmm8, %xmm6 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X64-SSE4-NEXT: psadbw %xmm2, %xmm6 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm0 +; X64-SSE4-NEXT: packuswb %xmm6, %xmm0 +; X64-SSE4-NEXT: paddd %xmm7, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X64-SSE4-NEXT: pand %xmm5, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X64-SSE4-NEXT: 
pshufb %xmm6, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm3 +; X64-SSE4-NEXT: pand %xmm5, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm6 +; X64-SSE4-NEXT: paddb %xmm7, %xmm6 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; X64-SSE4-NEXT: psadbw %xmm2, %xmm6 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm3 +; X64-SSE4-NEXT: packuswb %xmm6, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE4-NEXT: pand %xmm5, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm5, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm4 +; X64-SSE4-NEXT: paddb %xmm7, %xmm4 +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero +; X64-SSE4-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X64-SSE4-NEXT: psadbw %xmm2, %xmm4 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm1 +; X64-SSE4-NEXT: packuswb %xmm4, %xmm1 +; X64-SSE4-NEXT: paddd %xmm3, %xmm1 +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: paddd %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: movd %xmm1, %eax +; X64-SSE4-NEXT: retq +; +; AVX1-LABEL: reduce_ctpop_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = 
xmm5[0],zero,xmm5[1],zero +; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm7 +; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: reduce_ctpop_v16i32: ; AVX2: # %bb.0: @@ -724,7 +2094,7 @@ define i32 @reduce_ctpop_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: reduce_ctpop_v16i32: ; AVX512VL: # %bb.0: @@ -768,137 +2138,708 @@ define i32 @reduce_ctpop_v16i32(<16 x i32> %a0) { ret i32 %r0 } -define i64 @reduce_ctpop_v16i64(<16 x i64> %a0) { -; SSE42-LABEL: reduce_ctpop_v16i64: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm5, %xmm10 -; SSE42-NEXT: pand %xmm9, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm8, %xmm11 -; SSE42-NEXT: pshufb %xmm10, %xmm11 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: pand %xmm9, %xmm5 -; SSE42-NEXT: movdqa %xmm8, %xmm10 -; SSE42-NEXT: pshufb %xmm5, %xmm10 -; SSE42-NEXT: paddb %xmm11, %xmm10 -; SSE42-NEXT: movdqa %xmm1, %xmm5 -; SSE42-NEXT: pand %xmm9, %xmm5 -; SSE42-NEXT: movdqa %xmm8, %xmm11 -; SSE42-NEXT: pshufb %xmm5, %xmm11 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm9, %xmm1 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm1, %xmm5 -; SSE42-NEXT: paddb %xmm11, %xmm5 -; SSE42-NEXT: paddb %xmm10, %xmm5 -; SSE42-NEXT: movdqa %xmm7, %xmm1 -; SSE42-NEXT: pand %xmm9, %xmm1 -; SSE42-NEXT: movdqa %xmm8, %xmm10 -; SSE42-NEXT: pshufb %xmm1, %xmm10 -; SSE42-NEXT: psrlw $4, %xmm7 -; SSE42-NEXT: pand %xmm9, %xmm7 -; SSE42-NEXT: movdqa %xmm8, %xmm11 -; SSE42-NEXT: pshufb %xmm7, %xmm11 -; SSE42-NEXT: paddb %xmm10, %xmm11 -; SSE42-NEXT: movdqa %xmm3, %xmm1 -; SSE42-NEXT: pand %xmm9, %xmm1 -; SSE42-NEXT: movdqa %xmm8, %xmm7 -; SSE42-NEXT: pshufb %xmm1, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm9, %xmm3 -; SSE42-NEXT: 
movdqa %xmm8, %xmm1 -; SSE42-NEXT: pshufb %xmm3, %xmm1 -; SSE42-NEXT: paddb %xmm7, %xmm1 -; SSE42-NEXT: paddb %xmm11, %xmm1 -; SSE42-NEXT: paddb %xmm5, %xmm1 -; SSE42-NEXT: movdqa %xmm4, %xmm3 -; SSE42-NEXT: pand %xmm9, %xmm3 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm3, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm4 -; SSE42-NEXT: pand %xmm9, %xmm4 -; SSE42-NEXT: movdqa %xmm8, %xmm7 -; SSE42-NEXT: pshufb %xmm4, %xmm7 -; SSE42-NEXT: paddb %xmm5, %xmm7 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: pand %xmm9, %xmm3 -; SSE42-NEXT: movdqa %xmm8, %xmm4 -; SSE42-NEXT: pshufb %xmm3, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: movdqa %xmm8, %xmm3 -; SSE42-NEXT: pshufb %xmm0, %xmm3 -; SSE42-NEXT: paddb %xmm4, %xmm3 -; SSE42-NEXT: paddb %xmm7, %xmm3 -; SSE42-NEXT: movdqa %xmm6, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: movdqa %xmm8, %xmm4 -; SSE42-NEXT: pshufb %xmm0, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm6 -; SSE42-NEXT: pand %xmm9, %xmm6 -; SSE42-NEXT: movdqa %xmm8, %xmm0 -; SSE42-NEXT: pshufb %xmm6, %xmm0 -; SSE42-NEXT: paddb %xmm4, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm4 -; SSE42-NEXT: pand %xmm9, %xmm4 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm9, %xmm2 -; SSE42-NEXT: pshufb %xmm2, %xmm8 -; SSE42-NEXT: paddb %xmm5, %xmm8 -; SSE42-NEXT: paddb %xmm0, %xmm8 -; SSE42-NEXT: paddb %xmm3, %xmm8 -; SSE42-NEXT: paddb %xmm1, %xmm8 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: psadbw %xmm8, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE42-NEXT: paddq %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, %rax -; SSE42-NEXT: retq -; -; AVX2-LABEL: reduce_ctpop_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-NEXT: vpaddb %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpaddb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm3 -; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @reduce_ctpop_v16i64(<16 x i64> %a0) nounwind { +; X86-SSE2-LABEL: reduce_ctpop_v16i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm6 +; 
X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: psrlw $2, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: paddb %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm6, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: psrlw $1, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: psubb %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: psrlw $2, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddb %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: psrlw $4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm1, %xmm7 +; X86-SSE2-NEXT: pand %xmm1, %xmm6 +; X86-SSE2-NEXT: paddb %xmm7, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $2, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: paddb %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $2, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: paddb %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; 
X86-SSE2-NEXT: paddb %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm7, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm5 +; X86-SSE2-NEXT: paddb %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm6 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: psubb %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $2, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: paddb %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlw $4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm7, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: psrlw $4, 
%xmm7 +; X86-SSE2-NEXT: paddb %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm2, %xmm7 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: paddb %xmm7, %xmm0 +; X86-SSE2-NEXT: paddb %xmm6, %xmm0 +; X86-SSE2-NEXT: paddb %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v16i64: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm5, %xmm8 +; X64-SSE2-NEXT: psrlw $1, %xmm8 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-SSE2-NEXT: pand %xmm9, %xmm8 +; X64-SSE2-NEXT: psubb %xmm8, %xmm5 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm5, %xmm10 +; X64-SSE2-NEXT: pand %xmm8, %xmm10 +; X64-SSE2-NEXT: psrlw $2, %xmm5 +; X64-SSE2-NEXT: pand %xmm8, %xmm5 +; X64-SSE2-NEXT: paddb %xmm10, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm11 +; X64-SSE2-NEXT: psrlw $4, %xmm11 +; X64-SSE2-NEXT: paddb %xmm5, %xmm11 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE2-NEXT: pand %xmm5, %xmm11 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm10 +; 
X64-SSE2-NEXT: psrlw $1, %xmm10 +; X64-SSE2-NEXT: pand %xmm9, %xmm10 +; X64-SSE2-NEXT: psubb %xmm10, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm10 +; X64-SSE2-NEXT: pand %xmm8, %xmm10 +; X64-SSE2-NEXT: psrlw $2, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: paddb %xmm10, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm10 +; X64-SSE2-NEXT: psrlw $4, %xmm10 +; X64-SSE2-NEXT: paddb %xmm1, %xmm10 +; X64-SSE2-NEXT: pand %xmm5, %xmm10 +; X64-SSE2-NEXT: paddb %xmm11, %xmm10 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: psrlw $2, %xmm7 +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: paddb %xmm1, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm11 +; X64-SSE2-NEXT: psrlw $4, %xmm11 +; X64-SSE2-NEXT: paddb %xmm7, %xmm11 +; X64-SSE2-NEXT: pand %xmm5, %xmm11 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: psrlw $2, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: paddb %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: paddb %xmm11, %xmm1 +; X64-SSE2-NEXT: paddb %xmm10, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE2-NEXT: psrlw $1, %xmm3 +; X64-SSE2-NEXT: pand %xmm9, %xmm3 +; X64-SSE2-NEXT: psubb %xmm3, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: paddb %xmm3, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm7 +; X64-SSE2-NEXT: psrlw $4, %xmm7 +; X64-SSE2-NEXT: paddb %xmm4, %xmm7 +; X64-SSE2-NEXT: pand %xmm5, %xmm7 +; X64-SSE2-NEXT: movdqa 
%xmm0, %xmm3 +; X64-SSE2-NEXT: psrlw $1, %xmm3 +; X64-SSE2-NEXT: pand %xmm9, %xmm3 +; X64-SSE2-NEXT: psubb %xmm3, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm8, %xmm0 +; X64-SSE2-NEXT: paddb %xmm3, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: psrlw $4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm0, %xmm3 +; X64-SSE2-NEXT: pand %xmm5, %xmm3 +; X64-SSE2-NEXT: paddb %xmm7, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: psrlw $1, %xmm0 +; X64-SSE2-NEXT: pand %xmm9, %xmm0 +; X64-SSE2-NEXT: psubb %xmm0, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: pand %xmm8, %xmm0 +; X64-SSE2-NEXT: psrlw $2, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: paddb %xmm0, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: psrlw $4, %xmm0 +; X64-SSE2-NEXT: paddb %xmm6, %xmm0 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: psrlw $1, %xmm4 +; X64-SSE2-NEXT: pand %xmm9, %xmm4 +; X64-SSE2-NEXT: psubb %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: psrlw $2, %xmm2 +; X64-SSE2-NEXT: pand %xmm8, %xmm2 +; X64-SSE2-NEXT: paddb %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: psrlw $4, %xmm4 +; X64-SSE2-NEXT: paddb %xmm2, %xmm4 +; X64-SSE2-NEXT: pand %xmm5, %xmm4 +; X64-SSE2-NEXT: paddb %xmm0, %xmm4 +; X64-SSE2-NEXT: paddb %xmm3, %xmm4 +; X64-SSE2-NEXT: paddb %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v16i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $32, %esp +; X86-SSE4-NEXT: movaps %xmm2, (%esp) # 
16-byte Spill +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE4-NEXT: pand %xmm4, %xmm6 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm6 +; X86-SSE4-NEXT: paddb %xmm7, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm1 +; X86-SSE4-NEXT: paddb %xmm7, %xmm5 +; X86-SSE4-NEXT: paddb %xmm6, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE4-NEXT: pand %xmm4, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm6 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm1 +; X86-SSE4-NEXT: paddb %xmm7, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE4-NEXT: pand %xmm4, %xmm7 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm7, %xmm0 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm7 +; X86-SSE4-NEXT: paddb %xmm0, %xmm7 +; X86-SSE4-NEXT: movdqa 24(%ebp), %xmm0 +; X86-SSE4-NEXT: paddb %xmm6, %xmm7 +; X86-SSE4-NEXT: paddb %xmm5, %xmm7 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: psrlw 
$4, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm6 +; X86-SSE4-NEXT: paddb %xmm5, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm5 +; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm2 +; X86-SSE4-NEXT: paddb %xmm1, %xmm5 +; X86-SSE4-NEXT: paddb %xmm6, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm0 +; X86-SSE4-NEXT: paddb %xmm1, %xmm0 +; X86-SSE4-NEXT: movdqa (%esp), %xmm6 # 16-byte Reload +; X86-SSE4-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm6 +; X86-SSE4-NEXT: pand %xmm4, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm3 +; X86-SSE4-NEXT: paddb %xmm2, %xmm3 +; X86-SSE4-NEXT: paddb %xmm0, %xmm3 +; X86-SSE4-NEXT: paddb %xmm5, %xmm3 +; X86-SSE4-NEXT: paddb %xmm7, %xmm3 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v16i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm5, %xmm10 +; X64-SSE4-NEXT: pand %xmm9, %xmm10 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm8 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 
+; X64-SSE4-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm10, %xmm11 +; X64-SSE4-NEXT: psrlw $4, %xmm5 +; X64-SSE4-NEXT: pand %xmm9, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm10 +; X64-SSE4-NEXT: pshufb %xmm5, %xmm10 +; X64-SSE4-NEXT: paddb %xmm11, %xmm10 +; X64-SSE4-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE4-NEXT: pand %xmm9, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm5, %xmm11 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm9, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X64-SSE4-NEXT: paddb %xmm11, %xmm5 +; X64-SSE4-NEXT: paddb %xmm10, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE4-NEXT: pand %xmm9, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm10 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm10 +; X64-SSE4-NEXT: psrlw $4, %xmm7 +; X64-SSE4-NEXT: pand %xmm9, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm7, %xmm11 +; X64-SSE4-NEXT: paddb %xmm10, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE4-NEXT: pand %xmm9, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm3 +; X64-SSE4-NEXT: pand %xmm9, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm1 +; X64-SSE4-NEXT: paddb %xmm7, %xmm1 +; X64-SSE4-NEXT: paddb %xmm11, %xmm1 +; X64-SSE4-NEXT: paddb %xmm5, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE4-NEXT: pand %xmm9, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm4 +; X64-SSE4-NEXT: pand %xmm9, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm7 +; X64-SSE4-NEXT: paddb %xmm5, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE4-NEXT: pand %xmm9, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm4 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm9, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm3 +; X64-SSE4-NEXT: 
pshufb %xmm0, %xmm3 +; X64-SSE4-NEXT: paddb %xmm4, %xmm3 +; X64-SSE4-NEXT: paddb %xmm7, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE4-NEXT: pand %xmm9, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm4 +; X64-SSE4-NEXT: psrlw $4, %xmm6 +; X64-SSE4-NEXT: pand %xmm9, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm0 +; X64-SSE4-NEXT: paddb %xmm4, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE4-NEXT: pand %xmm9, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm2 +; X64-SSE4-NEXT: pand %xmm9, %xmm2 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm8 +; X64-SSE4-NEXT: paddb %xmm5, %xmm8 +; X64-SSE4-NEXT: paddb %xmm0, %xmm8 +; X64-SSE4-NEXT: paddb %xmm3, %xmm8 +; X64-SSE4-NEXT: paddb %xmm1, %xmm8 +; X64-SSE4-NEXT: pxor %xmm0, %xmm0 +; X64-SSE4-NEXT: psadbw %xmm8, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v16i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $96, %esp +; X86-AVX1-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm7 +; X86-AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; X86-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; 
X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm7 +; X86-AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; X86-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; X86-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm7, %xmm7 +; X86-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 +; X86-AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpaddb {{[-0-9]+}}(%e{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %ymm6 # 32-byte Reload +; X86-AVX1-NEXT: vpand 
%xmm3, %xmm6, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v16i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm7 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; X64-AVX1-NEXT: vpand %xmm4, %xmm7, %xmm8 +; X64-AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; X64-AVX1-NEXT: vpsrlw $4, %xmm7, %xmm7 +; X64-AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 +; X64-AVX1-NEXT: vpand %xmm4, %xmm7, %xmm8 +; X64-AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; X64-AVX1-NEXT: vpsrlw $4, %xmm7, %xmm7 +; X64-AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7 +; 
X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; X64-AVX1-NEXT: vpand %xmm4, %xmm8, %xmm9 +; X64-AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm9 +; X64-AVX1-NEXT: vpsrlw $4, %xmm8, %xmm8 +; X64-AVX1-NEXT: vpand %xmm4, %xmm8, %xmm8 +; X64-AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; X64-AVX1-NEXT: vpaddb %xmm9, %xmm8, %xmm8 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm2, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpand %xmm4, %xmm0, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; X64-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: reduce_ctpop_v16i64: +; X86-AVX2: # %bb.0: +; 
X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm5 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm6 +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; X86-AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; X86-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpaddb %ymm6, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm6 +; X86-AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; X86-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm6, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm3, %ymm5, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpsrlw $4, %ymm5, %ymm5 +; X86-AVX2-NEXT: vpand %ymm3, %ymm5, %ymm5 +; X86-AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm5, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm5 +; X86-AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; X86-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; 
X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: reduce_ctpop_v16i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; X64-AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; X64-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; X64-AVX2-NEXT: vpaddb %ymm5, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm5 +; X64-AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; X64-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; X64-AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; X64-AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm3 +; X64-AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; X64-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, 
%xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v16i64: ; AVX512VL: # %bb.0: @@ -955,145 +2896,736 @@ define i64 @reduce_ctpop_v16i64(<16 x i64> %a0) { ; define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i64> %a3) nounwind { -; SSE42-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm1, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm9, %xmm11 -; SSE42-NEXT: pshufb %xmm0, %xmm11 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm10, %xmm1 -; SSE42-NEXT: movdqa %xmm9, %xmm12 -; SSE42-NEXT: pshufb %xmm1, %xmm12 -; SSE42-NEXT: paddb %xmm11, %xmm12 -; SSE42-NEXT: movdqa %xmm8, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: movdqa %xmm9, %xmm1 -; SSE42-NEXT: pshufb %xmm0, %xmm1 -; SSE42-NEXT: psrlw $4, %xmm8 -; SSE42-NEXT: pand %xmm10, %xmm8 -; SSE42-NEXT: movdqa %xmm9, %xmm0 -; SSE42-NEXT: pshufb %xmm8, %xmm0 -; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb %xmm12, %xmm0 -; SSE42-NEXT: movdqa %xmm3, %xmm1 -; SSE42-NEXT: pand %xmm10, %xmm1 -; SSE42-NEXT: movdqa %xmm9, %xmm8 -; SSE42-NEXT: pshufb %xmm1, %xmm8 -; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm10, %xmm3 -; SSE42-NEXT: movdqa %xmm9, %xmm1 -; SSE42-NEXT: pshufb %xmm3, %xmm1 -; SSE42-NEXT: paddb %xmm8, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm3 -; SSE42-NEXT: pand %xmm10, %xmm3 -; SSE42-NEXT: movdqa %xmm9, %xmm8 -; SSE42-NEXT: pshufb %xmm3, %xmm8 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm10, %xmm2 -; SSE42-NEXT: movdqa %xmm9, %xmm3 -; SSE42-NEXT: pshufb %xmm2, %xmm3 -; SSE42-NEXT: paddb %xmm8, %xmm3 -; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: movdqa %xmm5, %xmm1 -; SSE42-NEXT: pand %xmm10, 
%xmm1 -; SSE42-NEXT: movdqa %xmm9, %xmm2 -; SSE42-NEXT: pshufb %xmm1, %xmm2 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: pand %xmm10, %xmm5 -; SSE42-NEXT: movdqa %xmm9, %xmm8 -; SSE42-NEXT: pshufb %xmm5, %xmm8 -; SSE42-NEXT: paddb %xmm2, %xmm8 -; SSE42-NEXT: movdqa %xmm4, %xmm1 -; SSE42-NEXT: pand %xmm10, %xmm1 -; SSE42-NEXT: movdqa %xmm9, %xmm2 -; SSE42-NEXT: pshufb %xmm1, %xmm2 -; SSE42-NEXT: psrlw $4, %xmm4 -; SSE42-NEXT: pand %xmm10, %xmm4 -; SSE42-NEXT: movdqa %xmm9, %xmm1 -; SSE42-NEXT: pshufb %xmm4, %xmm1 -; SSE42-NEXT: paddb %xmm2, %xmm1 -; SSE42-NEXT: paddb %xmm8, %xmm1 -; SSE42-NEXT: movdqa %xmm7, %xmm2 -; SSE42-NEXT: pand %xmm10, %xmm2 -; SSE42-NEXT: movdqa %xmm9, %xmm4 -; SSE42-NEXT: pshufb %xmm2, %xmm4 -; SSE42-NEXT: psrlw $4, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: movdqa %xmm9, %xmm2 -; SSE42-NEXT: pshufb %xmm7, %xmm2 -; SSE42-NEXT: paddb %xmm4, %xmm2 -; SSE42-NEXT: movdqa %xmm6, %xmm4 -; SSE42-NEXT: pand %xmm10, %xmm4 -; SSE42-NEXT: movdqa %xmm9, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm6 -; SSE42-NEXT: pand %xmm10, %xmm6 -; SSE42-NEXT: pshufb %xmm6, %xmm9 -; SSE42-NEXT: paddb %xmm5, %xmm9 -; SSE42-NEXT: paddb %xmm2, %xmm9 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: psadbw %xmm2, %xmm0 -; SSE42-NEXT: psadbw %xmm2, %xmm3 -; SSE42-NEXT: psadbw %xmm2, %xmm1 -; SSE42-NEXT: psadbw %xmm2, %xmm9 -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE42-NEXT: paddq %xmm2, %xmm0 -; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; SSE42-NEXT: paddq %xmm2, %xmm1 -; SSE42-NEXT: retq -; -; AVX2-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand 
%ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpsadbw %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm7 -; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpaddb %ymm7, %ymm1, %ymm1 -; AVX2-NEXT: vpsadbw %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm7 -; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-NEXT: vpaddb %ymm7, %ymm2, %ymm2 -; AVX2-NEXT: vpsadbw %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm7 -; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpaddb %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpsadbw %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: retq +; X86-SSE2-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; 
X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: paddb %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: paddb %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm7 +; X86-SSE2-NEXT: pand %xmm3, %xmm7 +; X86-SSE2-NEXT: paddb %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm7, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: 
movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: paddb %xmm1, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $2, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: paddb %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: paddb %xmm0, %xmm6 +; 
X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: psrlw $4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm6 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6 +; X86-SSE2-NEXT: psubb %xmm6, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddb %xmm6, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pand %xmm5, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm4 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm7 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; X86-SSE2-NEXT: paddq %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa %xmm1, %xmm8 +; X64-SSE2-NEXT: psrlw $1, %xmm8 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-SSE2-NEXT: pand %xmm9, %xmm8 +; X64-SSE2-NEXT: psubb %xmm8, %xmm1 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm10 +; X64-SSE2-NEXT: pand %xmm8, %xmm10 +; X64-SSE2-NEXT: psrlw $2, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: paddb 
%xmm1, %xmm10 +; X64-SSE2-NEXT: movdqa %xmm10, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm10, %xmm1 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE2-NEXT: pand %xmm10, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm11 +; X64-SSE2-NEXT: psrlw $1, %xmm11 +; X64-SSE2-NEXT: pand %xmm9, %xmm11 +; X64-SSE2-NEXT: psubb %xmm11, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm11 +; X64-SSE2-NEXT: pand %xmm8, %xmm11 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm8, %xmm0 +; X64-SSE2-NEXT: paddb %xmm11, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm11 +; X64-SSE2-NEXT: psrlw $4, %xmm11 +; X64-SSE2-NEXT: paddb %xmm11, %xmm0 +; X64-SSE2-NEXT: pand %xmm10, %xmm0 +; X64-SSE2-NEXT: paddb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: psrlw $2, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: paddb %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm10, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: psrlw $1, %xmm3 +; X64-SSE2-NEXT: pand %xmm9, %xmm3 +; X64-SSE2-NEXT: psubb %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm2 +; X64-SSE2-NEXT: pand %xmm8, %xmm2 +; X64-SSE2-NEXT: paddb %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: psrlw $4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm10, %xmm3 +; X64-SSE2-NEXT: paddb %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: psrlw 
$2, %xmm5 +; X64-SSE2-NEXT: pand %xmm8, %xmm5 +; X64-SSE2-NEXT: paddb %xmm5, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlw $4, %xmm2 +; X64-SSE2-NEXT: paddb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm10, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pand %xmm8, %xmm5 +; X64-SSE2-NEXT: psrlw $2, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: paddb %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm5, %xmm1 +; X64-SSE2-NEXT: pand %xmm10, %xmm1 +; X64-SSE2-NEXT: paddb %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X64-SSE2-NEXT: psrlw $1, %xmm2 +; X64-SSE2-NEXT: pand %xmm9, %xmm2 +; X64-SSE2-NEXT: psubb %xmm2, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X64-SSE2-NEXT: pand %xmm8, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm7 +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: paddb %xmm2, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X64-SSE2-NEXT: psrlw $4, %xmm2 +; X64-SSE2-NEXT: paddb %xmm7, %xmm2 +; X64-SSE2-NEXT: pand %xmm10, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE2-NEXT: psrlw $1, %xmm4 +; X64-SSE2-NEXT: pand %xmm9, %xmm4 +; X64-SSE2-NEXT: psubb %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: psrlw $2, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: paddb %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE2-NEXT: psrlw $4, %xmm4 +; X64-SSE2-NEXT: paddb %xmm6, %xmm4 +; X64-SSE2-NEXT: pand %xmm10, %xmm4 +; X64-SSE2-NEXT: paddb %xmm2, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm2 +; X64-SSE2-NEXT: psadbw %xmm2, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm2, %xmm3 +; X64-SSE2-NEXT: psadbw %xmm2, %xmm1 +; X64-SSE2-NEXT: psadbw %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = 
xmm2[1],xmm3[1] +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; X64-SSE2-NEXT: paddq %xmm2, %xmm1 +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm7, %xmm0 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm7, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm7 +; X86-SSE4-NEXT: paddb %xmm2, %xmm7 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: pand %xmm2, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X86-SSE4-NEXT: psrlw $4, %xmm3 +; X86-SSE4-NEXT: pand %xmm2, %xmm3 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm3, %xmm0 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm2 +; X86-SSE4-NEXT: paddb %xmm1, %xmm0 +; X86-SSE4-NEXT: paddb %xmm7, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: pand %xmm7, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm3 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm7, %xmm2 +; X86-SSE4-NEXT: movdqa 
%xmm4, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm1 +; X86-SSE4-NEXT: paddb %xmm3, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE4-NEXT: pand %xmm7, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm7, %xmm3 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pand %xmm3, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm3 +; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE4-NEXT: paddb %xmm7, %xmm3 +; X86-SSE4-NEXT: paddb %xmm1, %xmm3 +; X86-SSE4-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE4-NEXT: pand %xmm2, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm5 +; X86-SSE4-NEXT: pand %xmm2, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm2 +; X86-SSE4-NEXT: paddb %xmm7, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: pand %xmm7, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm6 +; X86-SSE4-NEXT: pand %xmm7, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm1 +; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm6 +; X86-SSE4-NEXT: paddb %xmm5, %xmm1 +; X86-SSE4-NEXT: paddb %xmm2, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE4-NEXT: pand %xmm7, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm6 +; X86-SSE4-NEXT: pand %xmm7, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm2 +; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm6 +; X86-SSE4-NEXT: paddb %xmm5, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE4-NEXT: pand %xmm7, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm6 +; X86-SSE4-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm4 +; X86-SSE4-NEXT: paddb %xmm7, %xmm4 +; X86-SSE4-NEXT: paddb %xmm2, %xmm4 +; X86-SSE4-NEXT: pxor %xmm2, %xmm2 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm3 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm1 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm4 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; X86-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE4-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; X86-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; X86-SSE4-NEXT: paddq %xmm2, %xmm1 +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa %xmm0, %xmm8 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE4-NEXT: pand %xmm10, %xmm0 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm9, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm11 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm10, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm12 +; X64-SSE4-NEXT: paddb %xmm11, %xmm12 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm0 +; X64-SSE4-NEXT: pand %xmm10, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X64-SSE4-NEXT: psrlw $4, %xmm8 +; X64-SSE4-NEXT: pand %xmm10, %xmm8 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm0 +; X64-SSE4-NEXT: pshufb %xmm8, %xmm0 +; X64-SSE4-NEXT: paddb %xmm1, %xmm0 +; X64-SSE4-NEXT: paddb %xmm12, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE4-NEXT: pand %xmm10, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm8 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm8 +; X64-SSE4-NEXT: psrlw $4, %xmm3 +; 
X64-SSE4-NEXT: pand %xmm10, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm1 +; X64-SSE4-NEXT: paddb %xmm8, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE4-NEXT: pand %xmm10, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm8 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm8 +; X64-SSE4-NEXT: psrlw $4, %xmm2 +; X64-SSE4-NEXT: pand %xmm10, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm3 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm3 +; X64-SSE4-NEXT: paddb %xmm8, %xmm3 +; X64-SSE4-NEXT: paddb %xmm1, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm5, %xmm1 +; X64-SSE4-NEXT: pand %xmm10, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm2 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm2 +; X64-SSE4-NEXT: psrlw $4, %xmm5 +; X64-SSE4-NEXT: pand %xmm10, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm8 +; X64-SSE4-NEXT: pshufb %xmm5, %xmm8 +; X64-SSE4-NEXT: paddb %xmm2, %xmm8 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE4-NEXT: pand %xmm10, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm2 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm2 +; X64-SSE4-NEXT: psrlw $4, %xmm4 +; X64-SSE4-NEXT: pand %xmm10, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm1 +; X64-SSE4-NEXT: paddb %xmm2, %xmm1 +; X64-SSE4-NEXT: paddb %xmm8, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm7, %xmm2 +; X64-SSE4-NEXT: pand %xmm10, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm4 +; X64-SSE4-NEXT: psrlw $4, %xmm7 +; X64-SSE4-NEXT: pand %xmm10, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm2 +; X64-SSE4-NEXT: pshufb %xmm7, %xmm2 +; X64-SSE4-NEXT: paddb %xmm4, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE4-NEXT: pand %xmm10, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm6 +; X64-SSE4-NEXT: pand %xmm10, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm9 +; X64-SSE4-NEXT: paddb %xmm5, %xmm9 +; X64-SSE4-NEXT: paddb %xmm2, %xmm9 +; X64-SSE4-NEXT: pxor %xmm2, %xmm2 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm0 +; X64-SSE4-NEXT: psadbw 
%xmm2, %xmm3 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm1 +; X64-SSE4-NEXT: psadbw %xmm2, %xmm9 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; X64-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE4-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; X64-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; X64-SSE4-NEXT: paddq %xmm2, %xmm1 +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, 
%xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm5 +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm7 +; X86-AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; X86-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm7 +; X86-AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; X86-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 +; X86-AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpaddb %xmm7, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm3[1] +; X86-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm0[1],xmm1[1] +; X86-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, 
%xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm7 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm0, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm0, %xmm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm1, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm1, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, 
%xmm2, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; X64-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpsadbw %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm3[1] +; X64-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm0[1],xmm1[1] +; X64-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm3 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm6 +; 
X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; X86-AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm6 +; X86-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm6, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; X86-AVX2-NEXT: vpsadbw %ymm6, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm7 +; X86-AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; X86-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm7, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpsadbw %ymm6, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm7 +; X86-AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; X86-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; X86-AVX2-NEXT: vpaddb %ymm7, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpsadbw %ymm6, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm7 +; X86-AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; X86-AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; X86-AVX2-NEXT: vpaddb %ymm7, %ymm3, %ymm3 +; X86-AVX2-NEXT: vpsadbw %ymm6, %ymm3, %ymm3 +; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] +; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; X86-AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; X86-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; 
X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm5 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; X64-AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; X64-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpsadbw %ymm5, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm7 +; X64-AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; X64-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm7, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpsadbw %ymm5, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm7 +; X64-AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; X64-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; X64-AVX2-NEXT: vpaddb %ymm7, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpsadbw %ymm5, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm7 +; X64-AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; X64-AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; X64-AVX2-NEXT: vpaddb %ymm7, %ymm3, %ymm3 +; X64-AVX2-NEXT: vpsadbw %ymm5, %ymm3, %ymm3 +; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] +; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: 
vpaddq %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; X64-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: ; AVX512VL: # %bb.0: @@ -1172,310 +3704,1541 @@ define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64> } define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i64> %a3, <4 x i64> %a4, <4 x i64> %a5, <4 x i64> %a6, <4 x i64> %a7) nounwind { -; SSE42-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE42-NEXT: movdqa %xmm1, %xmm11 -; SSE42-NEXT: pand %xmm10, %xmm11 -; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm11, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm1 -; SSE42-NEXT: pand %xmm10, %xmm1 -; SSE42-NEXT: movdqa %xmm8, %xmm11 -; SSE42-NEXT: pshufb %xmm1, %xmm11 -; SSE42-NEXT: paddb %xmm12, %xmm11 -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pand %xmm10, %xmm1 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm1, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: movdqa %xmm8, %xmm1 -; SSE42-NEXT: pshufb %xmm0, %xmm1 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; SSE42-NEXT: paddb %xmm12, %xmm1 -; SSE42-NEXT: paddb %xmm11, %xmm1 -; SSE42-NEXT: movdqa %xmm3, %xmm11 -; SSE42-NEXT: pand %xmm10, %xmm11 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm11, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm3 -; SSE42-NEXT: pand %xmm10, %xmm3 -; SSE42-NEXT: movdqa %xmm8, %xmm13 -; SSE42-NEXT: pshufb %xmm3, %xmm13 -; SSE42-NEXT: paddb %xmm12, %xmm13 -; SSE42-NEXT: movdqa %xmm2, %xmm3 -; SSE42-NEXT: pand %xmm10, %xmm3 
-; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm3, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm2 -; SSE42-NEXT: pand %xmm10, %xmm2 -; SSE42-NEXT: movdqa %xmm8, %xmm3 -; SSE42-NEXT: pshufb %xmm2, %xmm3 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE42-NEXT: paddb %xmm12, %xmm3 -; SSE42-NEXT: paddb %xmm13, %xmm3 -; SSE42-NEXT: movdqa %xmm5, %xmm2 -; SSE42-NEXT: pand %xmm10, %xmm2 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm2, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm5 -; SSE42-NEXT: pand %xmm10, %xmm5 -; SSE42-NEXT: movdqa %xmm8, %xmm13 -; SSE42-NEXT: pshufb %xmm5, %xmm13 -; SSE42-NEXT: paddb %xmm12, %xmm13 -; SSE42-NEXT: movdqa %xmm4, %xmm2 -; SSE42-NEXT: pand %xmm10, %xmm2 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm2, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm4 -; SSE42-NEXT: pand %xmm10, %xmm4 -; SSE42-NEXT: movdqa %xmm8, %xmm2 -; SSE42-NEXT: pshufb %xmm4, %xmm2 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE42-NEXT: paddb %xmm5, %xmm2 -; SSE42-NEXT: paddb %xmm13, %xmm2 -; SSE42-NEXT: movdqa %xmm7, %xmm4 -; SSE42-NEXT: pand %xmm10, %xmm4 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: movdqa %xmm8, %xmm13 -; SSE42-NEXT: pshufb %xmm7, %xmm13 -; SSE42-NEXT: paddb %xmm5, %xmm13 -; SSE42-NEXT: movdqa %xmm6, %xmm4 -; SSE42-NEXT: pand %xmm10, %xmm4 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm4, %xmm5 -; SSE42-NEXT: psrlw $4, %xmm6 -; SSE42-NEXT: pand %xmm10, %xmm6 -; SSE42-NEXT: movdqa %xmm8, %xmm4 -; SSE42-NEXT: pshufb %xmm6, %xmm4 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 -; SSE42-NEXT: paddb %xmm5, %xmm4 -; SSE42-NEXT: paddb %xmm13, %xmm4 -; SSE42-NEXT: movdqa %xmm6, %xmm5 -; SSE42-NEXT: pand %xmm10, %xmm5 -; SSE42-NEXT: movdqa %xmm8, %xmm7 -; SSE42-NEXT: pshufb %xmm5, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm6 -; SSE42-NEXT: pand %xmm10, %xmm6 -; SSE42-NEXT: movdqa %xmm8, %xmm13 -; SSE42-NEXT: pshufb %xmm6, 
%xmm13 -; SSE42-NEXT: paddb %xmm7, %xmm13 -; SSE42-NEXT: movdqa %xmm12, %xmm5 -; SSE42-NEXT: pand %xmm10, %xmm5 -; SSE42-NEXT: movdqa %xmm8, %xmm6 -; SSE42-NEXT: pshufb %xmm5, %xmm6 -; SSE42-NEXT: psrlw $4, %xmm12 -; SSE42-NEXT: pand %xmm10, %xmm12 -; SSE42-NEXT: movdqa %xmm8, %xmm5 -; SSE42-NEXT: pshufb %xmm12, %xmm5 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE42-NEXT: paddb %xmm6, %xmm5 -; SSE42-NEXT: paddb %xmm13, %xmm5 -; SSE42-NEXT: movdqa %xmm7, %xmm6 -; SSE42-NEXT: pand %xmm10, %xmm6 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm6, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: movdqa %xmm8, %xmm13 -; SSE42-NEXT: pshufb %xmm7, %xmm13 -; SSE42-NEXT: paddb %xmm12, %xmm13 -; SSE42-NEXT: movdqa %xmm11, %xmm6 -; SSE42-NEXT: pand %xmm10, %xmm6 -; SSE42-NEXT: movdqa %xmm8, %xmm7 -; SSE42-NEXT: pshufb %xmm6, %xmm7 -; SSE42-NEXT: psrlw $4, %xmm11 -; SSE42-NEXT: pand %xmm10, %xmm11 -; SSE42-NEXT: movdqa %xmm8, %xmm6 -; SSE42-NEXT: pshufb %xmm11, %xmm6 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE42-NEXT: paddb %xmm7, %xmm6 -; SSE42-NEXT: paddb %xmm13, %xmm6 -; SSE42-NEXT: movdqa %xmm11, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm7, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm11 -; SSE42-NEXT: pand %xmm10, %xmm11 -; SSE42-NEXT: movdqa %xmm8, %xmm13 -; SSE42-NEXT: pshufb %xmm11, %xmm13 -; SSE42-NEXT: paddb %xmm12, %xmm13 -; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: pand %xmm10, %xmm7 -; SSE42-NEXT: movdqa %xmm8, %xmm11 -; SSE42-NEXT: pshufb %xmm7, %xmm11 -; SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: movdqa %xmm8, %xmm7 -; SSE42-NEXT: pshufb %xmm0, %xmm7 -; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; SSE42-NEXT: paddb %xmm11, %xmm7 -; SSE42-NEXT: paddb %xmm13, %xmm7 -; SSE42-NEXT: movdqa %xmm0, %xmm11 -; SSE42-NEXT: pand %xmm10, %xmm11 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm11, %xmm12 -; 
SSE42-NEXT: psrlw $4, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: movdqa %xmm8, %xmm11 -; SSE42-NEXT: pshufb %xmm0, %xmm11 -; SSE42-NEXT: paddb %xmm12, %xmm11 -; SSE42-NEXT: movdqa %xmm9, %xmm0 -; SSE42-NEXT: pand %xmm10, %xmm0 -; SSE42-NEXT: movdqa %xmm8, %xmm12 -; SSE42-NEXT: pshufb %xmm0, %xmm12 -; SSE42-NEXT: psrlw $4, %xmm9 -; SSE42-NEXT: pand %xmm10, %xmm9 -; SSE42-NEXT: pshufb %xmm9, %xmm8 -; SSE42-NEXT: paddb %xmm12, %xmm8 -; SSE42-NEXT: paddb %xmm11, %xmm8 -; SSE42-NEXT: pxor %xmm9, %xmm9 -; SSE42-NEXT: psadbw %xmm9, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE42-NEXT: paddq %xmm1, %xmm0 -; SSE42-NEXT: psadbw %xmm9, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE42-NEXT: paddq %xmm3, %xmm1 -; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE42-NEXT: psadbw %xmm9, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE42-NEXT: paddq %xmm2, %xmm1 -; SSE42-NEXT: psadbw %xmm9, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE42-NEXT: paddq %xmm4, %xmm2 -; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: psadbw %xmm9, %xmm5 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE42-NEXT: paddq %xmm5, %xmm1 -; SSE42-NEXT: psadbw %xmm9, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE42-NEXT: paddq %xmm6, %xmm2 -; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE42-NEXT: psadbw %xmm9, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE42-NEXT: paddq %xmm7, %xmm2 -; SSE42-NEXT: psadbw %xmm9, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] -; SSE42-NEXT: paddq %xmm8, %xmm3 -; SSE42-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE42-NEXT: retq -; -; AVX2-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: 
vpbroadcastb {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm0, %ymm8, %ymm10 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm10 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm10, %ymm0 -; AVX2-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX2-NEXT: vpsadbw %ymm0, %ymm10, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm10, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-NEXT: vpaddb %ymm2, %ymm11, %ymm2 -; AVX2-NEXT: vpsadbw %ymm2, %ymm10, %ymm2 -; AVX2-NEXT: vpand %ymm3, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX2-NEXT: vpand %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX2-NEXT: vpaddb %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm10, %ymm3 -; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm4, %ymm4 -; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX2-NEXT: vpaddb %ymm4, %ymm11, %ymm4 -; AVX2-NEXT: vpsadbw %ymm4, %ymm10, %ymm4 -; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm5, %ymm5 -; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpshufb %ymm5, %ymm9, %ymm5 -; AVX2-NEXT: vpaddb %ymm5, 
%ymm11, %ymm5 -; AVX2-NEXT: vpsadbw %ymm5, %ymm10, %ymm5 -; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm6, %ymm6 -; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm6 -; AVX2-NEXT: vpaddb %ymm6, %ymm11, %ymm6 -; AVX2-NEXT: vpsadbw %ymm6, %ymm10, %ymm6 -; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX2-NEXT: vpsrlw $4, %ymm7, %ymm7 -; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm7 -; AVX2-NEXT: vpaddb %ymm7, %ymm11, %ymm7 -; AVX2-NEXT: vpsadbw %ymm7, %ymm10, %ymm7 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-NEXT: vpaddq %xmm0, %xmm8, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-NEXT: vpaddq %xmm1, %xmm9, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX2-NEXT: vpaddq %xmm2, %xmm10, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX2-NEXT: vpaddq %xmm3, %xmm11, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-NEXT: vpaddq %xmm4, %xmm12, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-NEXT: vpaddq %xmm5, %xmm13, %xmm5 -; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX2-NEXT: vpaddq %xmm6, %xmm14, %xmm6 -; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm15 -; AVX2-NEXT: vpaddq %xmm7, %xmm15, %xmm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm0, %xmm8, %xmm0 -; AVX2-NEXT: vpaddq %xmm1, %xmm9, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: vpaddq %xmm2, %xmm10, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: vpaddq %xmm3, %xmm11, %xmm1 -; AVX2-NEXT: 
vmovd %xmm1, %edx -; AVX2-NEXT: vpaddq %xmm4, %xmm12, %xmm1 -; AVX2-NEXT: vpaddq %xmm5, %xmm13, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %esi -; AVX2-NEXT: vpaddq %xmm6, %xmm14, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %edi -; AVX2-NEXT: vpaddq %xmm7, %xmm15, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %r8d -; AVX2-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $2, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; X86-SSE2-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $80, %esp +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm7, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand %xmm7, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} 
xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm0 +; X86-SSE2-NEXT: paddb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: psrlw $2, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: paddb %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: psubb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm2 +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm3 +; X86-SSE2-NEXT: paddb %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: psrlw $1, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, 
%xmm2 +; X86-SSE2-NEXT: psubb %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm6, %xmm3 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm2 +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm6 +; X86-SSE2-NEXT: paddb %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm0 +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movdqa 104(%ebp), %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 
+; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 88(%ebp), %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand %xmm5, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: psrlw $4, %xmm7 +; X86-SSE2-NEXT: paddb %xmm2, %xmm7 +; X86-SSE2-NEXT: pand %xmm6, %xmm0 +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa 136(%ebp), %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 120(%ebp), %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: psrlw $1, %xmm2 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: psubb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $4, %xmm6 +; X86-SSE2-NEXT: paddb %xmm3, %xmm6 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm6 +; X86-SSE2-NEXT: paddb %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 168(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrlw $1, %xmm2 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: 
psubb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa 152(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrlw $1, %xmm2 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: psubb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrlw $4, %xmm2 +; X86-SSE2-NEXT: paddb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: paddb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa 200(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand %xmm5, %xmm3 +; X86-SSE2-NEXT: psubb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: paddb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa 184(%ebp), %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: psubb %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm5, %xmm4 +; X86-SSE2-NEXT: paddb %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: paddb %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload +; X86-SSE2-NEXT: psadbw %xmm0, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm4 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa (%esp), %xmm0 # 16-byte Reload +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm7, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm6, %xmm4 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; X86-SSE2-NEXT: psadbw %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm2, %xmm4 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm3, %xmm2 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; 
X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm9 +; X64-SSE2-NEXT: psrlw $1, %xmm9 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm10 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-SSE2-NEXT: pand %xmm10, %xmm9 +; X64-SSE2-NEXT: psubb %xmm9, %xmm1 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm12 +; X64-SSE2-NEXT: pand %xmm9, %xmm12 +; X64-SSE2-NEXT: psrlw $2, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: paddb %xmm1, %xmm12 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm13 +; X64-SSE2-NEXT: psrlw $4, %xmm13 +; X64-SSE2-NEXT: paddb %xmm12, %xmm13 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $1, %xmm1 +; X64-SSE2-NEXT: pand %xmm10, %xmm1 +; X64-SSE2-NEXT: psubb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm9, %xmm1 +; X64-SSE2-NEXT: psrlw $2, %xmm0 +; X64-SSE2-NEXT: pand %xmm9, %xmm0 +; X64-SSE2-NEXT: paddb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: paddb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE2-NEXT: pand %xmm1, %xmm13 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: paddb %xmm13, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm12 +; X64-SSE2-NEXT: psrlw $1, %xmm12 +; X64-SSE2-NEXT: pand %xmm10, %xmm12 +; X64-SSE2-NEXT: psubb %xmm12, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm12 +; X64-SSE2-NEXT: pand %xmm9, %xmm12 +; X64-SSE2-NEXT: psrlw $2, %xmm3 +; X64-SSE2-NEXT: pand %xmm9, %xmm3 +; X64-SSE2-NEXT: paddb %xmm12, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm13 +; X64-SSE2-NEXT: psrlw $4, %xmm13 +; X64-SSE2-NEXT: paddb %xmm3, %xmm13 +; X64-SSE2-NEXT: movdqa 
%xmm2, %xmm3 +; X64-SSE2-NEXT: psrlw $1, %xmm3 +; X64-SSE2-NEXT: pand %xmm10, %xmm3 +; X64-SSE2-NEXT: psubb %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm9, %xmm3 +; X64-SSE2-NEXT: psrlw $2, %xmm2 +; X64-SSE2-NEXT: pand %xmm9, %xmm2 +; X64-SSE2-NEXT: paddb %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: psrlw $4, %xmm3 +; X64-SSE2-NEXT: paddb %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; X64-SSE2-NEXT: pand %xmm1, %xmm13 +; X64-SSE2-NEXT: pand %xmm1, %xmm3 +; X64-SSE2-NEXT: paddb %xmm13, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X64-SSE2-NEXT: psrlw $1, %xmm2 +; X64-SSE2-NEXT: pand %xmm10, %xmm2 +; X64-SSE2-NEXT: psubb %xmm2, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X64-SSE2-NEXT: pand %xmm9, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm5 +; X64-SSE2-NEXT: pand %xmm9, %xmm5 +; X64-SSE2-NEXT: paddb %xmm2, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm14 +; X64-SSE2-NEXT: psrlw $4, %xmm14 +; X64-SSE2-NEXT: paddb %xmm5, %xmm14 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE2-NEXT: psrlw $1, %xmm2 +; X64-SSE2-NEXT: pand %xmm10, %xmm2 +; X64-SSE2-NEXT: psubb %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE2-NEXT: pand %xmm9, %xmm2 +; X64-SSE2-NEXT: psrlw $2, %xmm4 +; X64-SSE2-NEXT: pand %xmm9, %xmm4 +; X64-SSE2-NEXT: paddb %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE2-NEXT: psrlw $4, %xmm2 +; X64-SSE2-NEXT: paddb %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; X64-SSE2-NEXT: pand %xmm1, %xmm14 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: paddb %xmm14, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE2-NEXT: psrlw $1, %xmm4 +; X64-SSE2-NEXT: pand %xmm10, %xmm4 +; X64-SSE2-NEXT: psubb %xmm4, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE2-NEXT: pand %xmm9, %xmm4 +; X64-SSE2-NEXT: psrlw $2, %xmm7 +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: paddb %xmm4, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm5 +; X64-SSE2-NEXT: psrlw 
$4, %xmm5 +; X64-SSE2-NEXT: paddb %xmm7, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE2-NEXT: psrlw $1, %xmm4 +; X64-SSE2-NEXT: pand %xmm10, %xmm4 +; X64-SSE2-NEXT: psubb %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE2-NEXT: pand %xmm9, %xmm4 +; X64-SSE2-NEXT: psrlw $2, %xmm6 +; X64-SSE2-NEXT: pand %xmm9, %xmm6 +; X64-SSE2-NEXT: paddb %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE2-NEXT: psrlw $4, %xmm4 +; X64-SSE2-NEXT: paddb %xmm6, %xmm4 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pand %xmm1, %xmm4 +; X64-SSE2-NEXT: paddb %xmm5, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm5 +; X64-SSE2-NEXT: psrlw $1, %xmm5 +; X64-SSE2-NEXT: pand %xmm10, %xmm5 +; X64-SSE2-NEXT: psubb %xmm5, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm5 +; X64-SSE2-NEXT: pand %xmm9, %xmm5 +; X64-SSE2-NEXT: psrlw $2, %xmm6 +; X64-SSE2-NEXT: pand %xmm9, %xmm6 +; X64-SSE2-NEXT: paddb %xmm5, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: psrlw $4, %xmm7 +; X64-SSE2-NEXT: paddb %xmm6, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm13, %xmm5 +; X64-SSE2-NEXT: psrlw $1, %xmm5 +; X64-SSE2-NEXT: pand %xmm10, %xmm5 +; X64-SSE2-NEXT: psubb %xmm5, %xmm13 +; X64-SSE2-NEXT: movdqa %xmm13, %xmm5 +; X64-SSE2-NEXT: pand %xmm9, %xmm5 +; X64-SSE2-NEXT: psrlw $2, %xmm13 +; X64-SSE2-NEXT: pand %xmm9, %xmm13 +; X64-SSE2-NEXT: paddb %xmm5, %xmm13 +; X64-SSE2-NEXT: movdqa %xmm13, %xmm5 +; X64-SSE2-NEXT: psrlw $4, %xmm5 +; X64-SSE2-NEXT: paddb %xmm13, %xmm5 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: paddb %xmm7, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: psrlw $1, %xmm7 +; X64-SSE2-NEXT: pand %xmm10, %xmm7 +; X64-SSE2-NEXT: psubb %xmm7, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: psrlw $2, %xmm6 +; X64-SSE2-NEXT: pand %xmm9, %xmm6 +; X64-SSE2-NEXT: paddb %xmm7, 
%xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: psrlw $4, %xmm7 +; X64-SSE2-NEXT: paddb %xmm6, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm6 +; X64-SSE2-NEXT: psrlw $1, %xmm6 +; X64-SSE2-NEXT: pand %xmm10, %xmm6 +; X64-SSE2-NEXT: psubb %xmm6, %xmm12 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm6 +; X64-SSE2-NEXT: pand %xmm9, %xmm6 +; X64-SSE2-NEXT: psrlw $2, %xmm12 +; X64-SSE2-NEXT: pand %xmm9, %xmm12 +; X64-SSE2-NEXT: paddb %xmm6, %xmm12 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm6 +; X64-SSE2-NEXT: psrlw $4, %xmm6 +; X64-SSE2-NEXT: paddb %xmm12, %xmm6 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: pand %xmm1, %xmm6 +; X64-SSE2-NEXT: paddb %xmm7, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm7 +; X64-SSE2-NEXT: psrlw $1, %xmm7 +; X64-SSE2-NEXT: pand %xmm10, %xmm7 +; X64-SSE2-NEXT: psubb %xmm7, %xmm12 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm7 +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: psrlw $2, %xmm12 +; X64-SSE2-NEXT: pand %xmm9, %xmm12 +; X64-SSE2-NEXT: paddb %xmm7, %xmm12 +; X64-SSE2-NEXT: movdqa %xmm12, %xmm13 +; X64-SSE2-NEXT: psrlw $4, %xmm13 +; X64-SSE2-NEXT: paddb %xmm12, %xmm13 +; X64-SSE2-NEXT: movdqa %xmm11, %xmm7 +; X64-SSE2-NEXT: psrlw $1, %xmm7 +; X64-SSE2-NEXT: pand %xmm10, %xmm7 +; X64-SSE2-NEXT: psubb %xmm7, %xmm11 +; X64-SSE2-NEXT: movdqa %xmm11, %xmm7 +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: psrlw $2, %xmm11 +; X64-SSE2-NEXT: pand %xmm9, %xmm11 +; X64-SSE2-NEXT: paddb %xmm7, %xmm11 +; X64-SSE2-NEXT: movdqa %xmm11, %xmm7 +; X64-SSE2-NEXT: psrlw $4, %xmm7 +; X64-SSE2-NEXT: paddb %xmm11, %xmm7 +; X64-SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; X64-SSE2-NEXT: pand %xmm1, %xmm13 +; X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: paddb %xmm13, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm11, %xmm12 +; X64-SSE2-NEXT: psrlw $1, %xmm12 +; X64-SSE2-NEXT: pand %xmm10, %xmm12 +; X64-SSE2-NEXT: psubb %xmm12, %xmm11 +; X64-SSE2-NEXT: movdqa %xmm11, %xmm12 +; X64-SSE2-NEXT: pand %xmm9, %xmm12 
+; X64-SSE2-NEXT: psrlw $2, %xmm11 +; X64-SSE2-NEXT: pand %xmm9, %xmm11 +; X64-SSE2-NEXT: paddb %xmm12, %xmm11 +; X64-SSE2-NEXT: movdqa %xmm11, %xmm12 +; X64-SSE2-NEXT: psrlw $4, %xmm12 +; X64-SSE2-NEXT: paddb %xmm11, %xmm12 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE2-NEXT: psrlw $1, %xmm11 +; X64-SSE2-NEXT: pand %xmm10, %xmm11 +; X64-SSE2-NEXT: psubb %xmm11, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm10 +; X64-SSE2-NEXT: pand %xmm9, %xmm10 +; X64-SSE2-NEXT: psrlw $2, %xmm8 +; X64-SSE2-NEXT: pand %xmm9, %xmm8 +; X64-SSE2-NEXT: paddb %xmm10, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm9 +; X64-SSE2-NEXT: psrlw $4, %xmm9 +; X64-SSE2-NEXT: paddb %xmm8, %xmm9 +; X64-SSE2-NEXT: pand %xmm1, %xmm12 +; X64-SSE2-NEXT: pand %xmm1, %xmm9 +; X64-SSE2-NEXT: paddb %xmm12, %xmm9 +; X64-SSE2-NEXT: pxor %xmm8, %xmm8 +; X64-SSE2-NEXT: psadbw %xmm8, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm1, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm8, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm3, %xmm1 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE2-NEXT: psadbw %xmm8, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm2, %xmm1 +; X64-SSE2-NEXT: psadbw %xmm8, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm4, %xmm2 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE2-NEXT: psadbw %xmm8, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm5, %xmm1 +; X64-SSE2-NEXT: psadbw %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm6, %xmm2 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE2-NEXT: psadbw %xmm8, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; X64-SSE2-NEXT: 
paddq %xmm7, %xmm2 +; X64-SSE2-NEXT: psadbw %xmm8, %xmm9 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm9, %xmm3 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $80, %esp +; X86-SSE4-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE4-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: paddb %xmm7, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm1 +; X86-SSE4-NEXT: paddb %xmm7, %xmm0 +; X86-SSE4-NEXT: paddb %xmm5, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm0 +; X86-SSE4-NEXT: paddb %xmm5, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; 
X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm1 +; X86-SSE4-NEXT: paddb %xmm5, %xmm7 +; X86-SSE4-NEXT: paddb %xmm0, %xmm7 +; X86-SSE4-NEXT: movdqa %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm0 +; X86-SSE4-NEXT: paddb %xmm2, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm6 +; X86-SSE4-NEXT: pand %xmm4, %xmm6 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm6, %xmm1 +; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE4-NEXT: paddb %xmm2, %xmm1 +; X86-SSE4-NEXT: paddb %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE4-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm0 +; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE4-NEXT: paddb %xmm2, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X86-SSE4-NEXT: psrlw $4, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm1 +; X86-SSE4-NEXT: paddb %xmm7, %xmm1 +; X86-SSE4-NEXT: movdqa 104(%ebp), %xmm2 +; X86-SSE4-NEXT: paddb 
%xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, (%esp) # 16-byte Spill +; X86-SSE4-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm0 +; X86-SSE4-NEXT: paddb %xmm5, %xmm0 +; X86-SSE4-NEXT: movdqa 88(%ebp), %xmm2 +; X86-SSE4-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE4-NEXT: pand %xmm4, %xmm5 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufb %xmm5, %xmm1 +; X86-SSE4-NEXT: psrlw $4, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm7 +; X86-SSE4-NEXT: paddb %xmm1, %xmm7 +; X86-SSE4-NEXT: paddb %xmm0, %xmm7 +; X86-SSE4-NEXT: movdqa 136(%ebp), %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm0 +; X86-SSE4-NEXT: paddb %xmm2, %xmm0 +; X86-SSE4-NEXT: movdqa 120(%ebp), %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE4-NEXT: pand %xmm4, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm5 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm6 +; X86-SSE4-NEXT: paddb %xmm5, %xmm6 +; X86-SSE4-NEXT: paddb %xmm0, %xmm6 +; X86-SSE4-NEXT: movdqa 168(%ebp), %xmm0 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm5 +; X86-SSE4-NEXT: paddb %xmm2, %xmm5 +; X86-SSE4-NEXT: movdqa 
152(%ebp), %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm4 +; X86-SSE4-NEXT: paddb %xmm2, %xmm4 +; X86-SSE4-NEXT: paddb %xmm5, %xmm4 +; X86-SSE4-NEXT: movdqa 200(%ebp), %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE4-NEXT: pand %xmm0, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm2, %xmm0 +; X86-SSE4-NEXT: psrlw $4, %xmm1 +; X86-SSE4-NEXT: pand %xmm5, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm5 +; X86-SSE4-NEXT: paddb %xmm0, %xmm5 +; X86-SSE4-NEXT: movdqa 184(%ebp), %xmm0 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: pand %xmm2, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufb %xmm1, %xmm2 +; X86-SSE4-NEXT: psrlw $4, %xmm0 +; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: pshufb %xmm0, %xmm3 +; X86-SSE4-NEXT: paddb %xmm2, %xmm3 +; X86-SSE4-NEXT: paddb %xmm5, %xmm3 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm5 +; X86-SSE4-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE4-NEXT: psadbw %xmm0, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm2, %xmm1 +; X86-SSE4-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; X86-SSE4-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE4-NEXT: psadbw %xmm0, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm2, %xmm1 +; 
X86-SSE4-NEXT: movdqa (%esp), %xmm0 # 16-byte Reload +; X86-SSE4-NEXT: pxor %xmm2, %xmm2 +; X86-SSE4-NEXT: psadbw %xmm2, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm2 +; X86-SSE4-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE4-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm7 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm7, %xmm1 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm6 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm6, %xmm2 +; X86-SSE4-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE4-NEXT: psadbw %xmm0, %xmm4 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm4, %xmm2 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm3, %xmm4 +; X86-SSE4-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; X86-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE4-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-SSE4-NEXT: movdqa %xmm1, %xmm11 +; X64-SSE4-NEXT: pand %xmm10, %xmm11 +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm8 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm11, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm1 +; X64-SSE4-NEXT: pand %xmm10, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm11 +; X64-SSE4-NEXT: paddb %xmm12, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE4-NEXT: pand %xmm10, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm8, 
%xmm12 +; X64-SSE4-NEXT: pshufb %xmm1, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm10, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm1 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm1 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; X64-SSE4-NEXT: paddb %xmm12, %xmm1 +; X64-SSE4-NEXT: paddb %xmm11, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm3, %xmm11 +; X64-SSE4-NEXT: pand %xmm10, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm11, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm3 +; X64-SSE4-NEXT: pand %xmm10, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm13 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm13 +; X64-SSE4-NEXT: paddb %xmm12, %xmm13 +; X64-SSE4-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE4-NEXT: pand %xmm10, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm3, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm2 +; X64-SSE4-NEXT: pand %xmm10, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm3 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm3 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; X64-SSE4-NEXT: paddb %xmm12, %xmm3 +; X64-SSE4-NEXT: paddb %xmm13, %xmm3 +; X64-SSE4-NEXT: movdqa %xmm5, %xmm2 +; X64-SSE4-NEXT: pand %xmm10, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm5 +; X64-SSE4-NEXT: pand %xmm10, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm13 +; X64-SSE4-NEXT: pshufb %xmm5, %xmm13 +; X64-SSE4-NEXT: paddb %xmm12, %xmm13 +; X64-SSE4-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE4-NEXT: pand %xmm10, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm2, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm4 +; X64-SSE4-NEXT: pand %xmm10, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm2 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm2 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; X64-SSE4-NEXT: paddb %xmm5, %xmm2 +; X64-SSE4-NEXT: paddb %xmm13, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE4-NEXT: pand %xmm10, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm4, 
%xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm7 +; X64-SSE4-NEXT: pand %xmm10, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm13 +; X64-SSE4-NEXT: pshufb %xmm7, %xmm13 +; X64-SSE4-NEXT: paddb %xmm5, %xmm13 +; X64-SSE4-NEXT: movdqa %xmm6, %xmm4 +; X64-SSE4-NEXT: pand %xmm10, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm4, %xmm5 +; X64-SSE4-NEXT: psrlw $4, %xmm6 +; X64-SSE4-NEXT: pand %xmm10, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm4 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm4 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; X64-SSE4-NEXT: paddb %xmm5, %xmm4 +; X64-SSE4-NEXT: paddb %xmm13, %xmm4 +; X64-SSE4-NEXT: movdqa %xmm6, %xmm5 +; X64-SSE4-NEXT: pand %xmm10, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm5, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm6 +; X64-SSE4-NEXT: pand %xmm10, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm13 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm13 +; X64-SSE4-NEXT: paddb %xmm7, %xmm13 +; X64-SSE4-NEXT: movdqa %xmm12, %xmm5 +; X64-SSE4-NEXT: pand %xmm10, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm6 +; X64-SSE4-NEXT: pshufb %xmm5, %xmm6 +; X64-SSE4-NEXT: psrlw $4, %xmm12 +; X64-SSE4-NEXT: pand %xmm10, %xmm12 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm5 +; X64-SSE4-NEXT: pshufb %xmm12, %xmm5 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; X64-SSE4-NEXT: paddb %xmm6, %xmm5 +; X64-SSE4-NEXT: paddb %xmm13, %xmm5 +; X64-SSE4-NEXT: movdqa %xmm7, %xmm6 +; X64-SSE4-NEXT: pand %xmm10, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm7 +; X64-SSE4-NEXT: pand %xmm10, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm13 +; X64-SSE4-NEXT: pshufb %xmm7, %xmm13 +; X64-SSE4-NEXT: paddb %xmm12, %xmm13 +; X64-SSE4-NEXT: movdqa %xmm11, %xmm6 +; X64-SSE4-NEXT: pand %xmm10, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm6, %xmm7 +; X64-SSE4-NEXT: psrlw $4, %xmm11 +; X64-SSE4-NEXT: pand %xmm10, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm6 +; 
X64-SSE4-NEXT: pshufb %xmm11, %xmm6 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; X64-SSE4-NEXT: paddb %xmm7, %xmm6 +; X64-SSE4-NEXT: paddb %xmm13, %xmm6 +; X64-SSE4-NEXT: movdqa %xmm11, %xmm7 +; X64-SSE4-NEXT: pand %xmm10, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm7, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm11 +; X64-SSE4-NEXT: pand %xmm10, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm13 +; X64-SSE4-NEXT: pshufb %xmm11, %xmm13 +; X64-SSE4-NEXT: paddb %xmm12, %xmm13 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE4-NEXT: pand %xmm10, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm7, %xmm11 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm10, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm7 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm7 +; X64-SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; X64-SSE4-NEXT: paddb %xmm11, %xmm7 +; X64-SSE4-NEXT: paddb %xmm13, %xmm7 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm11 +; X64-SSE4-NEXT: pand %xmm10, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm11, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm0 +; X64-SSE4-NEXT: pand %xmm10, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm11 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm11 +; X64-SSE4-NEXT: paddb %xmm12, %xmm11 +; X64-SSE4-NEXT: movdqa %xmm9, %xmm0 +; X64-SSE4-NEXT: pand %xmm10, %xmm0 +; X64-SSE4-NEXT: movdqa %xmm8, %xmm12 +; X64-SSE4-NEXT: pshufb %xmm0, %xmm12 +; X64-SSE4-NEXT: psrlw $4, %xmm9 +; X64-SSE4-NEXT: pand %xmm10, %xmm9 +; X64-SSE4-NEXT: pshufb %xmm9, %xmm8 +; X64-SSE4-NEXT: paddb %xmm12, %xmm8 +; X64-SSE4-NEXT: paddb %xmm11, %xmm8 +; X64-SSE4-NEXT: pxor %xmm9, %xmm9 +; X64-SSE4-NEXT: psadbw %xmm9, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm1, %xmm0 +; X64-SSE4-NEXT: psadbw %xmm9, %xmm3 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm3, %xmm1 +; X64-SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; 
X64-SSE4-NEXT: psadbw %xmm9, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm2, %xmm1 +; X64-SSE4-NEXT: psadbw %xmm9, %xmm4 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm4, %xmm2 +; X64-SSE4-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE4-NEXT: psadbw %xmm9, %xmm5 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm5, %xmm1 +; X64-SSE4-NEXT: psadbw %xmm9, %xmm6 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm6, %xmm2 +; X64-SSE4-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE4-NEXT: psadbw %xmm9, %xmm7 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm7, %xmm2 +; X64-SSE4-NEXT: psadbw %xmm9, %xmm8 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm8, %xmm3 +; X64-SSE4-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $96, %esp +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: 
vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm0 +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; 
X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX1-NEXT: vmovdqa 56(%ebp), %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa 40(%ebp), %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm6, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm6 +; X86-AVX1-NEXT: vmovdqa 88(%ebp), %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa 72(%ebp), %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm7 +; X86-AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm7, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm7 +; X86-AVX1-NEXT: vmovdqa 120(%ebp), %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb 
%xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa 104(%ebp), %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm5 +; X86-AVX1-NEXT: vmovdqa 152(%ebp), %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vmovdqa 136(%ebp), %xmm2 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsadbw %xmm0, %xmm6, %xmm1 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm0, %xmm7, %xmm2 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-AVX1-NEXT: vpsadbw %xmm0, %xmm5, %xmm1 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; X86-AVX1-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; X86-AVX1-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; X86-AVX1-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; X86-AVX1-NEXT: vpsadbw %xmm0, %xmm3, 
%xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 +; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm0, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 +; X64-AVX1-NEXT: vpshufb %xmm0, %xmm9, 
%xmm0 +; X64-AVX1-NEXT: vpaddb %xmm0, %xmm11, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm0, %xmm10, %xmm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm1, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 +; X64-AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm11, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm10, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm2, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 +; X64-AVX1-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm11, %xmm2 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm3, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; X64-AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm11, %xmm3 +; 
X64-AVX1-NEXT: vpaddb %xmm3, %xmm10, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm4, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4 +; X64-AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; X64-AVX1-NEXT: vpaddb %xmm4, %xmm11, %xmm4 +; X64-AVX1-NEXT: vpaddb %xmm4, %xmm10, %xmm4 +; X64-AVX1-NEXT: vextractf128 $1, %ymm5, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm5, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm5, %xmm5 +; X64-AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; X64-AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm11, %xmm5 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm10, %xmm5 +; X64-AVX1-NEXT: vextractf128 $1, %ymm6, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm6, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 +; X64-AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm11, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm10, %xmm6 +; X64-AVX1-NEXT: 
vextractf128 $1, %ymm7, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm8, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; X64-AVX1-NEXT: vpaddb %xmm11, %xmm10, %xmm10 +; X64-AVX1-NEXT: vpand %xmm7, %xmm8, %xmm11 +; X64-AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; X64-AVX1-NEXT: vpsrlw $4, %xmm7, %xmm7 +; X64-AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; X64-AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm11, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm7, %xmm10, %xmm7 +; X64-AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; X64-AVX1-NEXT: vpsadbw %xmm0, %xmm8, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm0 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm8, %xmm1 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm2, %xmm8, %xmm2 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm9, %xmm2 +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm8, %xmm3 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm9, %xmm3 +; X64-AVX1-NEXT: vpsadbw %xmm4, %xmm8, %xmm4 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm9, %xmm4 +; X64-AVX1-NEXT: vpsadbw %xmm5, %xmm8, %xmm5 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm9, %xmm5 +; X64-AVX1-NEXT: vpsadbw %xmm6, %xmm8, %xmm6 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm6, %xmm9, %xmm6 +; X64-AVX1-NEXT: vpsadbw %xmm7, %xmm8, %xmm7 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm7, %xmm8, %xmm7 +; X64-AVX1-NEXT: vmovd %xmm1, %eax +; X64-AVX1-NEXT: vmovd %xmm2, %ecx +; X64-AVX1-NEXT: vmovd %xmm3, %edx +; X64-AVX1-NEXT: vmovd %xmm5, %esi +; 
X64-AVX1-NEXT: vmovd %xmm6, %edi +; X64-AVX1-NEXT: vmovd %xmm7, %r8d +; X64-AVX1-NEXT: vpinsrd $1, %esi, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpinsrd $2, %edi, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $192, %esp +; X86-AVX2-NEXT: vmovdqa 40(%ebp), %ymm6 +; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm5 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm7 +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; X86-AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; X86-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X86-AVX2-NEXT: vpaddb %ymm7, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X86-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X86-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; 
X86-AVX2-NEXT: vmovdqa %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX2-NEXT: vpand %ymm3, %ymm5, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpsrlw $4, %ymm5, %ymm5 +; X86-AVX2-NEXT: vpand %ymm3, %ymm5, %ymm5 +; X86-AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm5, %ymm5 +; X86-AVX2-NEXT: vpand %ymm3, %ymm6, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpsrlw $4, %ymm6, %ymm6 +; X86-AVX2-NEXT: vpand %ymm3, %ymm6, %ymm6 +; X86-AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm6, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, (%esp) # 32-byte Spill +; X86-AVX2-NEXT: vmovdqa 72(%ebp), %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm7 +; X86-AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; X86-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpaddb %ymm7, %ymm2, %ymm6 +; X86-AVX2-NEXT: vmovdqa 104(%ebp), %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 +; X86-AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm7 +; X86-AVX2-NEXT: vmovdqa 136(%ebp), %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; X86-AVX2-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; X86-AVX2-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; X86-AVX2-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; X86-AVX2-NEXT: 
vpsadbw %ymm0, %ymm5, %ymm5 +; X86-AVX2-NEXT: vpsadbw (%esp), %ymm0, %ymm4 # 32-byte Folded Reload +; X86-AVX2-NEXT: vpsadbw %ymm0, %ymm6, %ymm6 +; X86-AVX2-NEXT: vpsadbw %ymm0, %ymm7, %ymm7 +; X86-AVX2-NEXT: vmovdqa %ymm7, (%esp) # 32-byte Spill +; X86-AVX2-NEXT: vpsadbw {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; X86-AVX2-NEXT: vmovdqa %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; X86-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; X86-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm3 +; X86-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2 +; X86-AVX2-NEXT: vpaddq %xmm2, %xmm5, %xmm7 +; X86-AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 +; X86-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm1 +; X86-AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 +; X86-AVX2-NEXT: vpaddq %xmm2, %xmm6, %xmm6 +; X86-AVX2-NEXT: vmovdqa (%esp), %ymm1 # 32-byte Reload +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm4 +; X86-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %ymm1 # 32-byte Reload +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm5, %xmm0, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm5, %xmm0, %xmm5 +; X86-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm5, %xmm7, %xmm5 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm7, %xmm3, %xmm1 +; 
X86-AVX2-NEXT: vpbroadcastd %xmm5, %xmm5 +; X86-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; X86-AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm6, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; X86-AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 +; X86-AVX2-NEXT: vpbroadcastd %xmm3, %xmm3 +; X86-AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-AVX2-NEXT: vpand %ymm0, %ymm8, %ymm10 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X64-AVX2-NEXT: # ymm9 = mem[0,1,0,1] +; X64-AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm10 +; X64-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm0, %ymm8, %ymm0 +; X64-AVX2-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm0, %ymm10, %ymm0 +; X64-AVX2-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; X64-AVX2-NEXT: 
vpsadbw %ymm0, %ymm10, %ymm0 +; X64-AVX2-NEXT: vpand %ymm1, %ymm8, %ymm11 +; X64-AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm1, %ymm8, %ymm1 +; X64-AVX2-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm1, %ymm11, %ymm1 +; X64-AVX2-NEXT: vpsadbw %ymm1, %ymm10, %ymm1 +; X64-AVX2-NEXT: vpand %ymm2, %ymm8, %ymm11 +; X64-AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm2, %ymm8, %ymm2 +; X64-AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm11, %ymm2 +; X64-AVX2-NEXT: vpsadbw %ymm2, %ymm10, %ymm2 +; X64-AVX2-NEXT: vpand %ymm3, %ymm8, %ymm11 +; X64-AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vpand %ymm3, %ymm8, %ymm3 +; X64-AVX2-NEXT: vpshufb %ymm3, %ymm9, %ymm3 +; X64-AVX2-NEXT: vpaddb %ymm3, %ymm11, %ymm3 +; X64-AVX2-NEXT: vpsadbw %ymm3, %ymm10, %ymm3 +; X64-AVX2-NEXT: vpand %ymm4, %ymm8, %ymm11 +; X64-AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm4, %ymm4 +; X64-AVX2-NEXT: vpand %ymm4, %ymm8, %ymm4 +; X64-AVX2-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; X64-AVX2-NEXT: vpaddb %ymm4, %ymm11, %ymm4 +; X64-AVX2-NEXT: vpsadbw %ymm4, %ymm10, %ymm4 +; X64-AVX2-NEXT: vpand %ymm5, %ymm8, %ymm11 +; X64-AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm5, %ymm5 +; X64-AVX2-NEXT: vpand %ymm5, %ymm8, %ymm5 +; X64-AVX2-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; X64-AVX2-NEXT: vpaddb %ymm5, %ymm11, %ymm5 +; X64-AVX2-NEXT: vpsadbw %ymm5, %ymm10, %ymm5 +; X64-AVX2-NEXT: vpand %ymm6, %ymm8, %ymm11 +; X64-AVX2-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm6, %ymm6 +; X64-AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6 +; X64-AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm6 +; X64-AVX2-NEXT: vpaddb %ymm6, %ymm11, %ymm6 +; X64-AVX2-NEXT: vpsadbw %ymm6, %ymm10, %ymm6 +; X64-AVX2-NEXT: vpand %ymm7, %ymm8, %ymm11 +; X64-AVX2-NEXT: 
vpshufb %ymm11, %ymm9, %ymm11 +; X64-AVX2-NEXT: vpsrlw $4, %ymm7, %ymm7 +; X64-AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7 +; X64-AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; X64-AVX2-NEXT: vpaddb %ymm7, %ymm11, %ymm7 +; X64-AVX2-NEXT: vpsadbw %ymm7, %ymm10, %ymm7 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8 +; X64-AVX2-NEXT: vpaddq %xmm0, %xmm8, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm9, %xmm1 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm10 +; X64-AVX2-NEXT: vpaddq %xmm2, %xmm10, %xmm2 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm3, %xmm11 +; X64-AVX2-NEXT: vpaddq %xmm3, %xmm11, %xmm3 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 +; X64-AVX2-NEXT: vpaddq %xmm4, %xmm12, %xmm4 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13 +; X64-AVX2-NEXT: vpaddq %xmm5, %xmm13, %xmm5 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm6, %xmm14 +; X64-AVX2-NEXT: vpaddq %xmm6, %xmm14, %xmm6 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] +; X64-AVX2-NEXT: vextracti128 $1, %ymm7, %xmm15 +; X64-AVX2-NEXT: vpaddq %xmm7, %xmm15, %xmm7 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm0, %xmm8, %xmm0 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm9, %xmm1 +; X64-AVX2-NEXT: vmovd %xmm1, %eax +; X64-AVX2-NEXT: vpaddq %xmm2, %xmm10, %xmm1 +; X64-AVX2-NEXT: vmovd %xmm1, %ecx +; X64-AVX2-NEXT: vpaddq %xmm3, %xmm11, %xmm1 +; X64-AVX2-NEXT: vmovd %xmm1, %edx +; X64-AVX2-NEXT: vpaddq %xmm4, %xmm12, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm5, %xmm13, %xmm2 +; X64-AVX2-NEXT: vmovd %xmm2, %esi +; X64-AVX2-NEXT: vpaddq %xmm6, %xmm14, %xmm2 +; X64-AVX2-NEXT: vmovd %xmm2, %edi +; X64-AVX2-NEXT: vpaddq %xmm7, 
%xmm15, %xmm2 +; X64-AVX2-NEXT: vmovd %xmm2, %r8d +; X64-AVX2-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpinsrd $2, %edi, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: ; AVX512VL: # %bb.0: @@ -1644,3 +5407,8 @@ declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} +; SSE: {{.*}} +; X64-SSE: {{.*}} +; X86-SSE: {{.*}} From ab87cb0598da192cf5f4e60b5fbcda28a82dd5d0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 12 May 2026 12:46:34 +0100 Subject: [PATCH 426/538] [X86] vector-reduce-add-*.ll - add 32-bit test coverage (#197152) --- .../CodeGen/X86/vector-reduce-add-mask.ll | 2538 +++++++++++------ .../CodeGen/X86/vector-reduce-add-sext.ll | 2387 ++++++++++------ .../CodeGen/X86/vector-reduce-add-zext.ll | 653 +++-- llvm/test/CodeGen/X86/vector-reduce-add.ll | 1713 +++++++---- 4 files changed, 4834 insertions(+), 2457 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 3d85d5587a45f..43bc6f5ef8429 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -1,48 +1,84 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 -; RUN: llc 
< %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X86-AVX1,X86-AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X64-AVX1,X64-AVX1-SLOW +; RUN: llc < %s -mtriple=i686-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X86-AVX1,X86-AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X64-AVX1,X64-AVX1-FAST +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck 
%s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL ; ; vXi64 ; -define i64 @test_v2i64_v2i32(<2 x i64> %a0) { -; SSE2-LABEL: test_v2i64_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v2i64_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v2i64_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: retq +define i64 @test_v2i64_v2i32(<2 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v2i64_v2i32: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v2i64_v2i32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v2i64_v2i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: 
pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v2i64_v2i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v2i64_v2i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i64_v2i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i64_v2i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64_v2i32: ; AVX512BW: # %bb.0: @@ -64,48 +100,85 @@ define i64 @test_v2i64_v2i32(<2 x i64> %a0) { ret i64 %2 } -define i64 @test_v4i64_v4i16(<4 x i64> %a0) { -; SSE2-LABEL: test_v4i64_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: retq -; -; 
SSE41-LABEL: test_v4i64_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v4i64_v4i16(<4 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v4i64_v4i16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v4i64_v4i16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v4i64_v4i16: +; 
X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v4i64_v4i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v4i64_v4i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v4i64_v4i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v4i64_v4i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; 
X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v4i64_v4i16: ; AVX512BW: # %bb.0: @@ -133,64 +206,127 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) { ret i64 %2 } -define i64 @test_v8i64_v8i8(<8 x i64> %a0) { -; SSE2-LABEL: test_v8i64_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $60, %xmm2 -; SSE2-NEXT: psrlq $60, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: psrlq $60, %xmm3 -; SSE2-NEXT: psrlq $60, %xmm1 -; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i64_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: psrlq $60, %xmm2 -; SSE41-NEXT: psrlq $60, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: psrlq $60, %xmm3 -; SSE41-NEXT: psrlq $60, %xmm1 -; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i64_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $60, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v8i64_v8i8(<8 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v8i64_v8i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: psrlq $60, %xmm2 +; X86-SSE2-NEXT: psrlq $60, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlq $60, %xmm3 +; X86-SSE2-NEXT: psrlq $60, %xmm1 +; X86-SSE2-NEXT: paddq %xmm3, %xmm1 +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v8i64_v8i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: psrlq $60, %xmm2 +; X64-SSE-NEXT: psrlq $60, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: psrlq $60, %xmm3 +; X64-SSE-NEXT: psrlq $60, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: movq %xmm0, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v8i64_v8i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE4-NEXT: psrlq $60, %xmm2 +; X86-SSE4-NEXT: psrlq $60, %xmm0 +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: psrlq $60, %xmm3 +; X86-SSE4-NEXT: psrlq $60, %xmm1 +; X86-SSE4-NEXT: paddq %xmm3, %xmm1 +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: movd 
%xmm0, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v8i64_v8i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v8i64_v8i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i64_v8i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlq $60, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: 
retl +; +; X64-AVX2-LABEL: test_v8i64_v8i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlq $60, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64_v8i8: ; AVX512: # %bb.0: @@ -206,94 +342,219 @@ define i64 @test_v8i64_v8i8(<8 x i64> %a0) { ret i64 %2 } -define i64 @test_v16i64_v16i8(<16 x i64> %a0) { -; SSE2-LABEL: test_v16i64_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,1] -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: paddq %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: paddq %xmm7, %xmm3 -; SSE2-NEXT: paddq %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: paddq %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: paddq %xmm6, %xmm2 -; SSE2-NEXT: paddq %xmm0, %xmm2 -; SSE2-NEXT: paddq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i64_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm8 = [1,1] -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: paddq %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm3 -; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: paddq %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: paddq %xmm6, %xmm2 -; SSE41-NEXT: paddq %xmm0, %xmm2 -; SSE41-NEXT: paddq %xmm3, %xmm2 -; SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1] -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v16i64_v16i8(<16 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v16i64_v16i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,0,1,0] +; X86-SSE2-NEXT: pand 
%xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: paddq %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddq %xmm5, %xmm1 +; X86-SSE2-NEXT: paddq %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: paddq %xmm0, %xmm4 +; X86-SSE2-NEXT: pand 56(%ebp), %xmm3 +; X86-SSE2-NEXT: paddq %xmm2, %xmm3 +; X86-SSE2-NEXT: paddq %xmm4, %xmm3 +; X86-SSE2-NEXT: paddq %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm3, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v16i64_v16i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,1] +; X64-SSE2-NEXT: pand %xmm8, %xmm5 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: paddq %xmm5, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: paddq %xmm7, %xmm3 +; X64-SSE2-NEXT: paddq %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm0 +; X64-SSE2-NEXT: paddq %xmm4, %xmm0 +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm2 +; X64-SSE2-NEXT: paddq %xmm6, %xmm2 +; X64-SSE2-NEXT: paddq %xmm0, %xmm2 +; X64-SSE2-NEXT: paddq %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v16i64_v16i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; 
X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1] +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: pand %xmm3, %xmm2 +; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: paddq %xmm1, %xmm4 +; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE4-NEXT: pand %xmm3, %xmm5 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: paddq %xmm5, %xmm1 +; X86-SSE4-NEXT: paddq %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa 24(%ebp), %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: paddq %xmm0, %xmm4 +; X86-SSE4-NEXT: pand 56(%ebp), %xmm3 +; X86-SSE4-NEXT: paddq %xmm2, %xmm3 +; X86-SSE4-NEXT: paddq %xmm4, %xmm3 +; X86-SSE4-NEXT: paddq %xmm1, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm3, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v16i64_v16i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm8 = [1,1] +; X64-SSE4-NEXT: pand %xmm8, %xmm5 +; X64-SSE4-NEXT: pand %xmm8, %xmm1 +; X64-SSE4-NEXT: paddq %xmm5, %xmm1 +; X64-SSE4-NEXT: pand %xmm8, %xmm7 +; X64-SSE4-NEXT: pand %xmm8, %xmm3 +; X64-SSE4-NEXT: paddq %xmm7, %xmm3 +; X64-SSE4-NEXT: paddq %xmm1, %xmm3 +; X64-SSE4-NEXT: pand %xmm8, %xmm4 +; X64-SSE4-NEXT: pand %xmm8, %xmm0 +; X64-SSE4-NEXT: paddq %xmm4, %xmm0 +; X64-SSE4-NEXT: pand %xmm8, %xmm6 +; X64-SSE4-NEXT: pand %xmm8, %xmm2 +; X64-SSE4-NEXT: paddq %xmm6, %xmm2 +; X64-SSE4-NEXT: paddq %xmm0, %xmm2 +; X64-SSE4-NEXT: paddq %xmm3, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: movq %xmm0, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v16i64_v16i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: 
subl $32, %esp +; X86-AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1,0,1,0,1,0,1,0] +; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vandps 8(%ebp), %ymm3, %ymm3 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v16i64_v16i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1] +; X64-AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq 
%xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v16i64_v16i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,0,1,0,1,0,1,0] +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand 8(%ebp), %ymm3, %ymm2 +; X86-AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v16i64_v16i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v16i64_v16i8: ; AVX512BW: # %bb.0: @@ -331,45 +592,68 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) { ; vXi32 ; -define i32 @test_v2i32_v2i16(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i32_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i32_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v2i32_v2i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v2i32_v2i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v2i32_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +define i32 @test_v2i32_v2i16(<2 x i32> %a0) nounwind { +; X86-SSE-LABEL: test_v2i32_v2i16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: paddd %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v2i32_v2i16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE-NEXT: paddd %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-SLOW-LABEL: test_v2i32_v2i16: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: test_v2i32_v2i16: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v2i32_v2i16: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: retl +; +; X64-AVX1-FAST-LABEL: test_v2i32_v2i16: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i32_v2i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i32_v2i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2i32_v2i16: ; AVX512: # %bb.0: @@ -383,26 +667,16 @@ define i32 @test_v2i32_v2i16(<2 x i32> %a0) { ret i32 %2 } -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: psrld $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: 
movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: psrld $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +define i32 @test_v4i32(<4 x i32> %a0) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: psrld $31, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i32: ; AVX1-SLOW: # %bb.0: @@ -412,7 +686,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: @@ -420,7 +694,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: @@ -430,7 +704,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512BW-LABEL: test_v4i32: ; AVX512BW: # %bb.0: @@ -454,67 +728,104 @@ define i32 @test_v4i32(<4 x i32> %a0) { ret i32 %2 } -define i32 @test_v8i32_v8i8(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: 
paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v8i32_v8i8: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v8i32_v8i8: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v8i32_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i32 
@test_v8i32_v8i8(<8 x i32> %a0) nounwind { +; X86-SSE-LABEL: test_v8i32_v8i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: por %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE-NEXT: paddd %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE-NEXT: paddd %xmm1, %xmm0 +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v8i32_v8i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: por %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddd %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE-NEXT: paddd %xmm1, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-SLOW-LABEL: test_v8i32_v8i8: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: vzeroupper +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: test_v8i32_v8i8: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, 
%eax +; X64-AVX1-SLOW-NEXT: vzeroupper +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v8i32_v8i8: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: vzeroupper +; X86-AVX1-FAST-NEXT: retl +; +; X64-AVX1-FAST-LABEL: test_v8i32_v8i8: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: vzeroupper +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i32_v8i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v8i32_v8i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; 
AVX512BW-LABEL: test_v8i32_v8i8: ; AVX512BW: # %bb.0: @@ -540,40 +851,86 @@ define i32 @test_v8i32_v8i8(<8 x i32> %a0) { ret i32 %2 } -define i32 @test_v16i32_v16i8(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +define i32 @test_v16i32_v16i8(<16 x i32> %a0) nounwind { +; X86-SSE2-LABEL: test_v16i32_v16i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: paddd %xmm2, %xmm0 +; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: paddd %xmm1, %xmm3 +; X86-SSE2-NEXT: paddd %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: paddd 
%xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: paddd %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v16i32_v16i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: paddd %xmm2, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: paddd %xmm3, %xmm1 +; X64-SSE2-NEXT: paddd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: paddd %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: paddd %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v16i32_v16i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = [255,255,255,255] +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm2 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: paddd %xmm2, %xmm0 +; X86-SSE4-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE4-NEXT: paddd %xmm1, %xmm3 +; X86-SSE4-NEXT: paddd %xmm0, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE4-NEXT: paddd %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: paddd %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v16i32_v16i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] +; X64-SSE4-NEXT: pand %xmm4, %xmm2 +; X64-SSE4-NEXT: pand %xmm4, %xmm0 +; X64-SSE4-NEXT: paddd %xmm2, %xmm0 +; X64-SSE4-NEXT: pand %xmm4, %xmm3 +; X64-SSE4-NEXT: pand %xmm4, %xmm1 +; 
X64-SSE4-NEXT: paddd %xmm3, %xmm1 +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: paddd %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: movd %xmm1, %eax +; X64-SSE4-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16i32_v16i8: ; AVX1-SLOW: # %bb.0: @@ -591,7 +948,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v16i32_v16i8: ; AVX1-FAST: # %bb.0: @@ -608,7 +965,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v16i32_v16i8: ; AVX2: # %bb.0: @@ -624,7 +981,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v16i32_v16i8: ; AVX512: # %bb.0: @@ -641,127 +998,286 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ret i32 %2 } -define i32 @test_v32i32_v32i8(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = [255,255,255,255] -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: paddd %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm3 -; SSE41-NEXT: paddd %xmm7, %xmm3 -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: paddd %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: paddd %xmm6, %xmm2 -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v32i32_v32i8: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] -; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; 
AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v32i32_v32i8: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] -; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v32i32_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddd 
%xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i32 @test_v32i32_v32i8(<32 x i32> %a0) nounwind { +; X86-SSE2-LABEL: test_v32i32_v32i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: paddd %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: paddd %xmm5, %xmm1 +; X86-SSE2-NEXT: paddd %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: paddd %xmm0, %xmm4 +; X86-SSE2-NEXT: pand 56(%ebp), %xmm3 +; X86-SSE2-NEXT: paddd %xmm2, %xmm3 +; X86-SSE2-NEXT: paddd %xmm4, %xmm3 +; X86-SSE2-NEXT: paddd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: paddd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: paddd %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v32i32_v32i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X64-SSE2-NEXT: pand %xmm8, %xmm5 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: paddd %xmm5, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: paddd %xmm7, %xmm3 +; X64-SSE2-NEXT: paddd %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm0 +; X64-SSE2-NEXT: paddd %xmm4, %xmm0 +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; 
X64-SSE2-NEXT: pand %xmm8, %xmm2 +; X64-SSE2-NEXT: paddd %xmm6, %xmm2 +; X64-SSE2-NEXT: paddd %xmm0, %xmm2 +; X64-SSE2-NEXT: paddd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X64-SSE2-NEXT: paddd %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: paddd %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v32i32_v32i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = [255,255,255,255] +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: pand %xmm3, %xmm2 +; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: paddd %xmm1, %xmm4 +; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE4-NEXT: pand %xmm3, %xmm5 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: paddd %xmm5, %xmm1 +; X86-SSE4-NEXT: paddd %xmm4, %xmm1 +; X86-SSE4-NEXT: movdqa 24(%ebp), %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: paddd %xmm0, %xmm4 +; X86-SSE4-NEXT: pand 56(%ebp), %xmm3 +; X86-SSE4-NEXT: paddd %xmm2, %xmm3 +; X86-SSE4-NEXT: paddd %xmm4, %xmm3 +; X86-SSE4-NEXT: paddd %xmm1, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE4-NEXT: paddd %xmm3, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: paddd %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v32i32_v32i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovzxbd {{.*#+}} xmm8 = [255,255,255,255] +; X64-SSE4-NEXT: pand %xmm8, %xmm5 +; X64-SSE4-NEXT: pand %xmm8, %xmm1 +; X64-SSE4-NEXT: paddd %xmm5, %xmm1 +; X64-SSE4-NEXT: pand %xmm8, %xmm7 +; X64-SSE4-NEXT: pand %xmm8, %xmm3 +; X64-SSE4-NEXT: paddd %xmm7, %xmm3 
+; X64-SSE4-NEXT: paddd %xmm1, %xmm3 +; X64-SSE4-NEXT: pand %xmm8, %xmm4 +; X64-SSE4-NEXT: pand %xmm8, %xmm0 +; X64-SSE4-NEXT: paddd %xmm4, %xmm0 +; X64-SSE4-NEXT: pand %xmm8, %xmm6 +; X64-SSE4-NEXT: pand %xmm8, %xmm2 +; X64-SSE4-NEXT: paddd %xmm6, %xmm2 +; X64-SSE4-NEXT: paddd %xmm0, %xmm2 +; X64-SSE4-NEXT: paddd %xmm3, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X64-SSE4-NEXT: paddd %xmm2, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: movd %xmm1, %eax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-SLOW-LABEL: test_v32i32_v32i8: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: pushl %ebp +; X86-AVX1-SLOW-NEXT: movl %esp, %ebp +; X86-AVX1-SLOW-NEXT: andl $-32, %esp +; X86-AVX1-SLOW-NEXT: subl $32, %esp +; X86-AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255] +; X86-AVX1-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-SLOW-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX1-SLOW-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX1-SLOW-NEXT: vandps 8(%ebp), %ymm3, %ymm3 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm2 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: movl %ebp, %esp +; X86-AVX1-SLOW-NEXT: popl %ebp +; 
X86-AVX1-SLOW-NEXT: vzeroupper +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: test_v32i32_v32i8: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; X64-AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X64-AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX1-SLOW-NEXT: vzeroupper +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v32i32_v32i8: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: pushl %ebp +; X86-AVX1-FAST-NEXT: movl %esp, %ebp +; X86-AVX1-FAST-NEXT: andl $-32, %esp +; X86-AVX1-FAST-NEXT: subl $32, %esp +; X86-AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255] +; X86-AVX1-FAST-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-FAST-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX1-FAST-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX1-FAST-NEXT: vandps 8(%ebp), %ymm3, %ymm3 +; X86-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-FAST-NEXT: 
vpaddd %xmm2, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm2 +; X86-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X86-AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X86-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; X86-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: movl %ebp, %esp +; X86-AVX1-FAST-NEXT: popl %ebp +; X86-AVX1-FAST-NEXT: vzeroupper +; X86-AVX1-FAST-NEXT: retl +; +; X64-AVX1-FAST-LABEL: test_v32i32_v32i8: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; X64-AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X64-AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3 +; X64-AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; X64-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; X64-AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; X64-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: vzeroupper +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v32i32_v32i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp 
+; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand 8(%ebp), %ymm3, %ymm2 +; X86-AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v32i32_v32i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i32_v32i8: ; AVX512: # %bb.0: @@ -786,52 +1302,78 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) 
{ ; vXi16 ; -define i16 @test_v2i16_v2i8(<2 x i16> %a0) { -; SSE2-LABEL: test_v2i16_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v2i16_v2i8: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v2i16_v2i8: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v2i16_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +define i16 @test_v2i16_v2i8(<2 x i16> %a0) nounwind { +; X86-SSE-LABEL: test_v2i16_v2i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: paddw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; 
X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v2i16_v2i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: paddw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-SLOW-LABEL: test_v2i16_v2i8: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: test_v2i16_v2i8: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v2i16_v2i8: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-FAST-NEXT: retl +; +; X64-AVX1-FAST-LABEL: test_v2i16_v2i8: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i16_v2i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; 
X86-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i16_v2i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2i16_v2i8: ; AVX512: # %bb.0: @@ -846,70 +1388,120 @@ define i16 @test_v2i16_v2i8(<2 x i16> %a0) { ret i16 %2 } -define i16 @test_v4i16_v4i8(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i16_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0] -; SSE41-NEXT: pmulhuw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v4i16_v4i8: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; 
AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v4i16_v4i8: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] -; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v4i16_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +define i16 @test_v4i16_v4i8(<4 x i16> %a0) nounwind { +; X86-SSE2-LABEL: test_v4i16_v4i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v4i16_v4i8: +; X64-SSE2: # 
%bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u] +; X64-SSE2-NEXT: por %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; SSE4-LABEL: test_v4i16_v4i8: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0] +; SSE4-NEXT: pmulhuw %xmm0, %xmm1 +; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $16, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} +; +; X86-AVX1-SLOW-LABEL: test_v4i16_v4i8: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] +; X86-AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: test_v4i16_v4i8: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] +; X64-AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v4i16_v4i8: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] +; X86-AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X86-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-FAST-NEXT: retl +; +; X64-AVX1-FAST-LABEL: test_v4i16_v4i8: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] +; X64-AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X64-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v4i16_v4i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] +; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v4i16_v4i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: 
vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,u,u,u,u] +; X64-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v4i16_v4i8: ; AVX512BW: # %bb.0: @@ -940,101 +1532,201 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ret i16 %2 } -define i16 @test_v8i16_v8i8(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +define i16 @test_v8i16_v8i8(<8 x i16> %a0) nounwind { +; X86-SSE-LABEL: test_v8i16_v8i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: packuswb %xmm0, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; 
X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v8i16_v8i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: packuswb %xmm0, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_v8i16_v8i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v8i16_v8i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i16_v8i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v8i16_v8i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = and <8 x i16> %a0, %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) ret i16 %2 } -define i16 @test_v16i16_v16i8(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # 
kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i16 @test_v16i16_v16i8(<16 x i16> %a0) nounwind { +; X86-SSE2-LABEL: test_v16i16_v16i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v16i16_v16i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v16i16_v16i8: +; X86-SSE4: # %bb.0: +; 
X86-SSE4-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X86-SSE4-NEXT: pand %xmm2, %xmm1 +; X86-SSE4-NEXT: pand %xmm2, %xmm0 +; X86-SSE4-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: pxor %xmm1, %xmm1 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v16i16_v16i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X64-SSE4-NEXT: pand %xmm2, %xmm1 +; X64-SSE4-NEXT: pand %xmm2, %xmm0 +; X64-SSE4-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE4-NEXT: pxor %xmm1, %xmm1 +; X64-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm1, %xmm0 +; X64-SSE4-NEXT: movd %xmm0, %eax +; X64-SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v16i16_v16i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v16i16_v16i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v16i16_v16i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v16i16_v16i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v16i16_v16i8: ; AVX512BW: # %bb.0: @@ -1067,42 +1759,49 @@ define i16 @test_v16i16_v16i8(<16 x i16> %a0) { ret i16 %2 } -define i16 @test_v32i16_v32i8(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: psrlw 
$8, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psadbw %xmm1, %xmm2 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psadbw %xmm1, %xmm2 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +define i16 @test_v32i16_v32i8(<32 x i16> %a0) nounwind { +; X86-SSE-LABEL: test_v32i16_v32i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE-NEXT: psrlw $8, %xmm1 +; X86-SSE-NEXT: psrlw $8, %xmm0 +; X86-SSE-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE-NEXT: psrlw $8, %xmm3 +; X86-SSE-NEXT: psrlw $8, %xmm2 +; X86-SSE-NEXT: packuswb %xmm3, %xmm2 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: psadbw %xmm1, %xmm2 +; X86-SSE-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE-NEXT: paddq %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v32i16_v32i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: psrlw $8, %xmm1 +; X64-SSE-NEXT: psrlw $8, 
%xmm0 +; X64-SSE-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE-NEXT: psrlw $8, %xmm3 +; X64-SSE-NEXT: psrlw $8, %xmm2 +; X64-SSE-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: psadbw %xmm1, %xmm2 +; X64-SSE-NEXT: psadbw %xmm1, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i16_v32i8: ; AVX1: # %bb.0: @@ -1123,7 +1822,7 @@ define i16 @test_v32i16_v32i8(<32 x i16> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i16_v32i8: ; AVX2: # %bb.0: @@ -1140,7 +1839,7 @@ define i16 @test_v32i16_v32i8(<32 x i16> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: @@ -1161,110 +1860,246 @@ define i16 @test_v32i16_v32i8(<32 x i16> %a0) { ret i16 %2 } -define i16 @test_v64i16_v64i8(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: paddb %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packuswb %xmm7, %xmm6 -; SSE2-NEXT: paddb %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm6 -; SSE2-NEXT: psadbw %xmm0, %xmm4 -; SSE2-NEXT: paddq %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm4[2,3,2,3] -; SSE2-NEXT: paddq %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] -; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: packuswb %xmm5, %xmm4 -; SSE41-NEXT: paddb %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: packuswb %xmm7, %xmm6 -; SSE41-NEXT: paddb %xmm2, %xmm6 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm0, %xmm6 -; SSE41-NEXT: psadbw %xmm0, %xmm4 -; SSE41-NEXT: paddq %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE41-NEXT: paddq %xmm4, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i16_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i16 @test_v64i16_v64i8(<64 x i16> %a0) nounwind { +; X86-SSE2-LABEL: test_v64i16_v64i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: packuswb %xmm4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm0 +; 
X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pand 56(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm0, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm3 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: paddq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v64i16_v64i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: pand %xmm8, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm8, %xmm5 +; X64-SSE2-NEXT: pand %xmm8, %xmm4 +; X64-SSE2-NEXT: packuswb %xmm5, %xmm4 +; X64-SSE2-NEXT: paddb %xmm0, %xmm4 +; X64-SSE2-NEXT: pand %xmm8, %xmm3 +; X64-SSE2-NEXT: pand %xmm8, %xmm2 +; X64-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: packuswb %xmm7, %xmm6 +; X64-SSE2-NEXT: paddb %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: psadbw %xmm0, %xmm6 +; X64-SSE2-NEXT: psadbw %xmm0, %xmm4 +; X64-SSE2-NEXT: paddq %xmm6, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm4, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v64i16_v64i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127] +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE4-NEXT: pand %xmm3, 
%xmm2 +; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE4-NEXT: pand %xmm3, %xmm4 +; X86-SSE4-NEXT: movdqa 24(%ebp), %xmm1 +; X86-SSE4-NEXT: pand %xmm3, %xmm1 +; X86-SSE4-NEXT: packuswb %xmm4, %xmm1 +; X86-SSE4-NEXT: paddb %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm0 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: packuswb %xmm0, %xmm2 +; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm0 +; X86-SSE4-NEXT: pand %xmm3, %xmm0 +; X86-SSE4-NEXT: pand 56(%ebp), %xmm3 +; X86-SSE4-NEXT: packuswb %xmm0, %xmm3 +; X86-SSE4-NEXT: paddb %xmm2, %xmm3 +; X86-SSE4-NEXT: pxor %xmm0, %xmm0 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm3 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE4-NEXT: paddq %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v64i16_v64i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] +; X64-SSE4-NEXT: pand %xmm8, %xmm1 +; X64-SSE4-NEXT: pand %xmm8, %xmm0 +; X64-SSE4-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE4-NEXT: pand %xmm8, %xmm5 +; X64-SSE4-NEXT: pand %xmm8, %xmm4 +; X64-SSE4-NEXT: packuswb %xmm5, %xmm4 +; X64-SSE4-NEXT: paddb %xmm0, %xmm4 +; X64-SSE4-NEXT: pand %xmm8, %xmm3 +; X64-SSE4-NEXT: pand %xmm8, %xmm2 +; X64-SSE4-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE4-NEXT: pand %xmm8, %xmm7 +; X64-SSE4-NEXT: pand %xmm8, %xmm6 +; X64-SSE4-NEXT: packuswb %xmm7, %xmm6 +; X64-SSE4-NEXT: paddb %xmm2, %xmm6 +; X64-SSE4-NEXT: pxor %xmm0, %xmm0 +; X64-SSE4-NEXT: psadbw %xmm0, %xmm6 +; X64-SSE4-NEXT: psadbw %xmm0, %xmm4 +; X64-SSE4-NEXT: paddq %xmm6, %xmm4 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm4, %xmm0 +; X64-SSE4-NEXT: movd %xmm0, %eax +; X64-SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE4-NEXT: retq +; +; 
X86-AVX1-LABEL: test_v64i16_v64i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vandps 8(%ebp), %ymm3, %ymm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; X86-AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X86-AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v64i16_v64i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X64-AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 +; 
X64-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; X64-AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v64i16_v64i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpand 8(%ebp), %ymm3, %ymm1 +; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v64i16_v64i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm1 +; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i16_v64i8: ; AVX512: # %bb.0: @@ -1314,3 +2149,6 @@ declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} +; SSE2: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll index dc0ebe6b2e2ef..3b041304e252d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll @@ -1,95 +1,210 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,SSE41,X86-SSE4,X86-SSE41 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,SSE41,X64-SSE4,X64-SSE41 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,SSE42,X86-SSE4,X86-SSE42 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,SSE42,X64-SSE4,X64-SSE42 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s 
--check-prefixes=AVX,AVX1,AVX1-SLOW,X86-AVX1,X86-AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X64-AVX1,X64-AVX1-SLOW +; RUN: llc < %s -mtriple=i686-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X86-AVX1,X86-AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X64-AVX1,X64-AVX1-FAST +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL ; ; vXi64 ; -define i64 @test_v2i64_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i64_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxdq %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +define i64 @test_v2i64_v2i32(<2 x i32> %a0) nounwind { +; X86-SSE2-LABEL: test_v2i64_v2i32: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v2i64_v2i32: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v2i64_v2i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovsxdq %xmm0, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v2i64_v2i32: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxdq %xmm0, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v2i64_v2i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v2i64_v2i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i64_v2i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: 
retl +; +; X64-AVX2-LABEL: test_v2i64_v2i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i64_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq %1 = sext <2 x i32> %a0 to <2 x i64> %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1) ret i64 %2 } -define i64 @test_v4i64_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i64_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i64_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxwq %xmm0, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64_v4i16: -; AVX2: # 
%bb.0: -; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v4i64_v4i16(<4 x i16> %a0) nounwind { +; X86-SSE2-LABEL: test_v4i64_v4i16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: psrad $16, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v4i64_v4i16: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: psrad $16, %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v4i64_v4i16: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovsxwq %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: pmovsxwq %xmm0, %xmm0 +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; 
X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v4i64_v4i16: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxwq %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: pmovsxwq %xmm0, %xmm0 +; X64-SSE4-NEXT: paddq %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v4i64_v4i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v4i64_v4i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v4i64_v4i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v4i64_v4i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64_v4i16: ; AVX512: # %bb.0: @@ -106,80 +221,159 @@ define i64 @test_v4i64_v4i16(<4 x i16> %a0) { ret i64 %2 } -define i64 @test_v8i64_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i64_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i64_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 -; SSE41-NEXT: paddq %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlq $48, %xmm1 -; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: 
retq -; -; AVX1-LABEL: test_v8i64_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm3 -; AVX1-NEXT: vpmovsxbq %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v8i64_v8i8(<8 x i8> %a0) nounwind { +; X86-SSE2-LABEL: test_v8i64_v8i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE2-NEXT: psrad $24, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm2 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: psrad $24, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; 
X86-SSE2-NEXT: paddq %xmm4, %xmm5 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: paddq %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v8i64_v8i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-SSE2-NEXT: psrad $24, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm2 +; X64-SSE2-NEXT: pxor %xmm3, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: psrad $24, %xmm0 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; X64-SSE2-NEXT: paddq %xmm4, %xmm5 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE2-NEXT: paddq %xmm1, %xmm0 +; X64-SSE2-NEXT: paddq %xmm5, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v8i64_v8i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovsxbq %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: pmovsxbq %xmm2, %xmm2 +; X86-SSE4-NEXT: paddq %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: psrlq $48, %xmm1 +; 
X86-SSE4-NEXT: pmovsxbq %xmm1, %xmm1 +; X86-SSE4-NEXT: psrld $16, %xmm0 +; X86-SSE4-NEXT: pmovsxbq %xmm0, %xmm0 +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v8i64_v8i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbq %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: pmovsxbq %xmm2, %xmm2 +; X64-SSE4-NEXT: paddq %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE4-NEXT: psrlq $48, %xmm1 +; X64-SSE4-NEXT: pmovsxbq %xmm1, %xmm1 +; X64-SSE4-NEXT: psrld $16, %xmm0 +; X64-SSE4-NEXT: pmovsxbq %xmm0, %xmm0 +; X64-SSE4-NEXT: paddq %xmm1, %xmm0 +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v8i64_v8i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X86-AVX1-NEXT: vpsrld $16, %xmm2, %xmm3 +; X86-AVX1-NEXT: vpmovsxbq %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v8i64_v8i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X64-AVX1-NEXT: vpsrld $16, %xmm2, %xmm3 +; X64-AVX1-NEXT: vpmovsxbq 
%xmm3, %xmm3 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i64_v8i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; X86-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v8i64_v8i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64_v8i8: ; AVX512: # %bb.0: @@ -198,134 +392,275 @@ define i64 @test_v8i64_v8i8(<8 x i8> %a0) { ret i64 %2 } -define i64 @test_v16i64_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i64_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE2-NEXT: paddq %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE2-NEXT: paddq %xmm9, %xmm10 -; SSE2-NEXT: paddq %xmm8, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE2-NEXT: paddq %xmm4, %xmm6 -; SSE2-NEXT: paddq %xmm0, %xmm6 -; SSE2-NEXT: paddq %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE2-NEXT: paddq %xmm6, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i64_v16i8: -; SSE41: # %bb.0: -; 
SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pmovsxbq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovsxbq %xmm2, %xmm0 -; SSE41-NEXT: psrlq $48, %xmm3 -; SSE41-NEXT: pmovsxbq %xmm3, %xmm2 -; SSE41-NEXT: paddq %xmm0, %xmm2 -; SSE41-NEXT: paddq %xmm1, %xmm2 -; SSE41-NEXT: pmovsxbq %xmm5, %xmm0 -; SSE41-NEXT: paddq %xmm4, %xmm0 -; SSE41-NEXT: pmovsxbq %xmm6, %xmm1 -; SSE41-NEXT: pmovsxbq %xmm7, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm3 -; SSE41-NEXT: paddq %xmm0, %xmm3 -; SSE41-NEXT: paddq %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] -; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] -; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v16i64_v16i8(<16 x i8> %a0) nounwind { +; X86-SSE2-LABEL: test_v16i64_v16i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: subl $48, %esp +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-SSE2-NEXT: psrad $24, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; X86-SSE2-NEXT: psrad $24, %xmm7 +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; X86-SSE2-NEXT: movdqu %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X86-SSE2-NEXT: paddq %xmm4, %xmm1 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: psrad $24, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqu %xmm4, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; X86-SSE2-NEXT: psrad $24, %xmm4 +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; X86-SSE2-NEXT: paddq %xmm6, %xmm0 +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE2-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; X86-SSE2-NEXT: paddq %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-SSE2-NEXT: paddq %xmm3, %xmm4 +; X86-SSE2-NEXT: paddq %xmm7, %xmm4 +; X86-SSE2-NEXT: paddq %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm4, %xmm0 +; X86-SSE2-NEXT: 
movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $48, %esp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v16i64_v16i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE2-NEXT: psrad $24, %xmm2 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pxor %xmm3, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; X64-SSE2-NEXT: psrad $24, %xmm0 +; X64-SSE2-NEXT: pxor %xmm7, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm8 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; X64-SSE2-NEXT: paddq %xmm5, %xmm8 +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: psrad $24, %xmm4 +; X64-SSE2-NEXT: pxor %xmm5, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm9 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: psrad $24, %xmm6 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm10 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; X64-SSE2-NEXT: paddq %xmm9, %xmm10 +; X64-SSE2-NEXT: paddq %xmm8, %xmm10 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; X64-SSE2-NEXT: paddq %xmm4, %xmm6 +; X64-SSE2-NEXT: paddq %xmm0, %xmm6 +; X64-SSE2-NEXT: paddq %xmm10, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm6, %xmm0 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v16i64_v16i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: pmovsxbq %xmm0, %xmm4 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE4-NEXT: pmovsxbq %xmm0, %xmm0 +; X86-SSE4-NEXT: psrld $16, %xmm1 +; X86-SSE4-NEXT: pmovsxbq %xmm1, %xmm1 +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE4-NEXT: pmovsxbq %xmm2, %xmm0 +; X86-SSE4-NEXT: psrlq $48, %xmm3 +; X86-SSE4-NEXT: pmovsxbq %xmm3, %xmm2 +; X86-SSE4-NEXT: paddq %xmm0, %xmm2 +; X86-SSE4-NEXT: paddq %xmm1, %xmm2 +; X86-SSE4-NEXT: pmovsxbq %xmm5, %xmm0 +; X86-SSE4-NEXT: paddq %xmm4, %xmm0 +; X86-SSE4-NEXT: pmovsxbq %xmm6, %xmm1 +; X86-SSE4-NEXT: pmovsxbq %xmm7, %xmm3 +; X86-SSE4-NEXT: paddq %xmm1, %xmm3 +; X86-SSE4-NEXT: paddq %xmm0, %xmm3 +; X86-SSE4-NEXT: paddq %xmm2, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm3, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE4-NEXT: retl +; +; 
X64-SSE4-LABEL: test_v16i64_v16i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE4-NEXT: pmovsxbq %xmm0, %xmm4 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE4-NEXT: pmovsxbq %xmm0, %xmm0 +; X64-SSE4-NEXT: psrld $16, %xmm1 +; X64-SSE4-NEXT: pmovsxbq %xmm1, %xmm1 +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE4-NEXT: pmovsxbq %xmm2, %xmm0 +; X64-SSE4-NEXT: psrlq $48, %xmm3 +; X64-SSE4-NEXT: pmovsxbq %xmm3, %xmm2 +; X64-SSE4-NEXT: paddq %xmm0, %xmm2 +; X64-SSE4-NEXT: paddq %xmm1, %xmm2 +; X64-SSE4-NEXT: pmovsxbq %xmm5, %xmm0 +; X64-SSE4-NEXT: paddq %xmm4, %xmm0 +; X64-SSE4-NEXT: pmovsxbq %xmm6, %xmm1 +; X64-SSE4-NEXT: pmovsxbq %xmm7, %xmm3 +; X64-SSE4-NEXT: paddq %xmm1, %xmm3 +; X64-SSE4-NEXT: paddq %xmm0, %xmm3 +; X64-SSE4-NEXT: paddq %xmm2, %xmm3 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm3, %xmm0 +; X64-SSE4-NEXT: movq %xmm0, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v16i64_v16i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmovsxbq %xmm1, %xmm2 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpmovsxbw %xmm1, %xmm3 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; X86-AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 +; X86-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] +; X86-AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] +; X86-AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; 
X86-AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X86-AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v16i64_v16i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; X64-AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] +; X64-AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] +; X64-AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X64-AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X64-AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, 
%xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v16i64_v16i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; X86-AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; X86-AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; X86-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v16i64_v16i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; X64-AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; X64-AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; X64-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i64_v16i8: ; AVX512: # %bb.0: @@ -352,7 +687,7 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) { ; vXi32 ; -define i32 @test_v2i32_v2i16(<2 x i16> %a0) { +define i32 
@test_v2i32_v2i16(<2 x i16> %a0) nounwind { ; SSE2-LABEL: test_v2i32_v2i16: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] @@ -360,15 +695,15 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; -; SSE41-LABEL: test_v2i32_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +; SSE4-LABEL: test_v2i32_v2i16: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: paddd %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v2i32_v2i16: ; AVX1-SLOW: # %bb.0: @@ -376,14 +711,14 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v2i32_v2i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v2i32_v2i16: ; AVX2: # %bb.0: @@ -391,7 +726,7 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i32_v2i16: ; AVX512: # %bb.0: @@ -405,7 +740,7 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ret i32 %2 } -define i32 @test_v4i32(<4 x i8> %a0) { +define i32 @test_v4i32(<4 x i8> %a0) nounwind { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -416,17 +751,17 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v4i32: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-NEXT: paddd %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i32: ; AVX1-SLOW: # %bb.0: @@ -436,7 +771,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: @@ -444,7 +779,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: @@ -454,7 +789,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: @@ -470,7 +805,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ret i32 %2 } -define i32 @test_v8i32_v8i8(<8 x i8> %a0) { +define i32 @test_v8i32_v8i8(<8 x i8> %a0) nounwind { ; SSE2-LABEL: 
test_v8i32_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -484,20 +819,20 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v8i32_v8i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE4-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-NEXT: paddd %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v8i32_v8i8: ; AVX1-SLOW: # %bb.0: @@ -510,7 +845,7 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v8i32_v8i8: ; AVX1-FAST: # %bb.0: @@ -521,7 +856,7 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v8i32_v8i8: ; AVX2: # %bb.0: @@ -534,7 +869,7 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v8i32_v8i8: ; AVX512: # %bb.0: @@ -553,7 +888,7 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) { ret i32 %2 } -define i32 @test_v16i32_v16i8(<16 x i8> %a0) { +define i32 @test_v16i32_v16i8(<16 x i8> %a0) nounwind { ; SSE2-LABEL: test_v16i32_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -574,26 +909,26 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE41-NEXT: paddd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v16i32_v16i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE4-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE4-NEXT: paddd %xmm1, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE4-NEXT: pmovsxbd %xmm1, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE4-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: paddd %xmm2, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-NEXT: paddd 
%xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v16i32_v16i8: ; AVX1-SLOW: # %bb.0: @@ -612,7 +947,7 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v16i32_v16i8: ; AVX1-FAST: # %bb.0: @@ -630,7 +965,7 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v16i32_v16i8: ; AVX2: # %bb.0: @@ -646,7 +981,7 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v16i32_v16i8: ; AVX512: # %bb.0: @@ -667,72 +1002,138 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ret i32 %2 } -define i32 @test_v32i32_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i32_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: psrad $24, %xmm5 -; SSE2-NEXT: paddd %xmm3, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: psrad $24, %xmm6 -; SSE2-NEXT: paddd %xmm3, %xmm6 -; SSE2-NEXT: paddd %xmm5, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 -; SSE41-NEXT: paddd %xmm2, %xmm4 -; SSE41-NEXT: paddd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd %xmm1, 
%xmm1 -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: paddd %xmm3, %xmm0 -; SSE41-NEXT: paddd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +define i32 @test_v32i32_v32i8(<32 x i8> %a0) nounwind { +; X86-SSE2-LABEL: test_v32i32_v32i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE2-NEXT: psrad $24, %xmm4 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; X86-SSE2-NEXT: psrad $24, %xmm5 +; X86-SSE2-NEXT: paddd %xmm4, %xmm5 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; X86-SSE2-NEXT: psrad $24, %xmm6 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE2-NEXT: psrad $24, %xmm0 +; X86-SSE2-NEXT: paddd %xmm6, %xmm0 +; X86-SSE2-NEXT: paddd %xmm5, %xmm0 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: psrad $24, %xmm2 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: psrad $24, %xmm3 +; 
X86-SSE2-NEXT: paddd %xmm2, %xmm3 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: psrad $24, %xmm1 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE2-NEXT: psrad $24, %xmm2 +; X86-SSE2-NEXT: paddd %xmm1, %xmm2 +; X86-SSE2-NEXT: paddd %xmm3, %xmm2 +; X86-SSE2-NEXT: paddd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X86-SSE2-NEXT: paddd %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: paddd %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v32i32_v32i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-SSE2-NEXT: psrad $24, %xmm3 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; X64-SSE2-NEXT: psrad $24, %xmm5 +; X64-SSE2-NEXT: paddd %xmm3, %xmm5 +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; X64-SSE2-NEXT: psrad $24, %xmm3 +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; X64-SSE2-NEXT: psrad $24, %xmm6 +; X64-SSE2-NEXT: paddd %xmm3, %xmm6 +; X64-SSE2-NEXT: paddd %xmm5, %xmm6 +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: psrad 
$24, %xmm2 +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE2-NEXT: psrad $24, %xmm3 +; X64-SSE2-NEXT: paddd %xmm2, %xmm3 +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: psrad $24, %xmm1 +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: psrad $24, %xmm0 +; X64-SSE2-NEXT: paddd %xmm1, %xmm0 +; X64-SSE2-NEXT: paddd %xmm3, %xmm0 +; X64-SSE2-NEXT: paddd %xmm6, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE2-NEXT: paddd %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v32i32_v32i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X86-SSE4-NEXT: pmovsxbd %xmm2, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: pmovsxbd %xmm3, %xmm3 +; X86-SSE4-NEXT: paddd %xmm2, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X86-SSE4-NEXT: pmovsxbd %xmm2, %xmm4 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; X86-SSE4-NEXT: pmovsxbd %xmm2, %xmm2 +; X86-SSE4-NEXT: paddd %xmm4, %xmm2 +; X86-SSE4-NEXT: paddd %xmm3, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: pmovsxbd %xmm3, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: pmovsxbd %xmm4, %xmm4 +; X86-SSE4-NEXT: paddd %xmm3, %xmm4 +; X86-SSE4-NEXT: pmovsxbd %xmm1, %xmm1 +; X86-SSE4-NEXT: pmovsxbd %xmm0, %xmm0 +; X86-SSE4-NEXT: paddd %xmm1, %xmm0 +; X86-SSE4-NEXT: paddd %xmm4, %xmm0 +; X86-SSE4-NEXT: paddd %xmm2, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddd %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE4-NEXT: paddd %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: 
test_v32i32_v32i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X64-SSE4-NEXT: pmovsxbd %xmm2, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: pmovsxbd %xmm3, %xmm3 +; X64-SSE4-NEXT: paddd %xmm2, %xmm3 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X64-SSE4-NEXT: pmovsxbd %xmm2, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; X64-SSE4-NEXT: pmovsxbd %xmm4, %xmm4 +; X64-SSE4-NEXT: paddd %xmm2, %xmm4 +; X64-SSE4-NEXT: paddd %xmm3, %xmm4 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: pmovsxbd %xmm2, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: pmovsxbd %xmm3, %xmm3 +; X64-SSE4-NEXT: paddd %xmm2, %xmm3 +; X64-SSE4-NEXT: pmovsxbd %xmm1, %xmm1 +; X64-SSE4-NEXT: pmovsxbd %xmm0, %xmm0 +; X64-SSE4-NEXT: paddd %xmm1, %xmm0 +; X64-SSE4-NEXT: paddd %xmm3, %xmm0 +; X64-SSE4-NEXT: paddd %xmm4, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddd %xmm0, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE4-NEXT: paddd %xmm1, %xmm0 +; X64-SSE4-NEXT: movd %xmm0, %eax +; X64-SSE4-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v32i32_v32i8: ; AVX1-SLOW: # %bb.0: @@ -764,7 +1165,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v32i32_v32i8: ; AVX1-FAST: # %bb.0: @@ -795,7 +1196,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i32_v32i8: ; AVX2: # %bb.0: @@ -817,7 +1218,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper 
-; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i32_v32i8: ; AVX512: # %bb.0: @@ -845,7 +1246,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; vXi16 ; -define i16 @test_v2i16_v2i8(<2 x i8> %a0) { +define i16 @test_v2i16_v2i8(<2 x i8> %a0) nounwind { ; SSE2-LABEL: test_v2i16_v2i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -855,17 +1256,17 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v2i16_v2i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $16, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v2i16_v2i8: ; AVX1-SLOW: # %bb.0: @@ -874,7 +1275,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v2i16_v2i8: ; AVX1-FAST: # %bb.0: @@ -882,7 +1283,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v2i16_v2i8: ; AVX2: # %bb.0: @@ -891,7 +1292,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; AVX2-NEXT: 
vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i16_v2i8: ; AVX512: # %bb.0: @@ -906,7 +1307,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ret i16 %2 } -define i16 @test_v4i16_v4i8(<4 x i8> %a0) { +define i16 @test_v4i16_v4i8(<4 x i8> %a0) nounwind { ; SSE2-LABEL: test_v4i16_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -918,19 +1319,19 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v4i16_v4i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrld $16, %xmm0 +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i16_v4i8: ; AVX1-SLOW: # %bb.0: @@ -941,7 +1342,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i16_v4i8: ; AVX1-FAST: # %bb.0: @@ -951,7 +1352,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; 
AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i16_v4i8: ; AVX2: # %bb.0: @@ -962,7 +1363,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i16_v4i8: ; AVX512: # %bb.0: @@ -980,7 +1381,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { } -define i16 @test_v8i16_v8i8(<8 x i8> %a0) { +define i16 @test_v8i16_v8i8(<8 x i8> %a0) nounwind { ; SSE2-LABEL: test_v8i16_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -994,21 +1395,21 @@ define i16 @test_v8i16_v8i8(<8 x i8> %a0) { ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v8i16_v8i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $16, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v8i16_v8i8: ; AVX1-SLOW: # %bb.0: 
@@ -1021,7 +1422,7 @@ define i16 @test_v8i16_v8i8(<8 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v8i16_v8i8: ; AVX1-FAST: # %bb.0: @@ -1031,7 +1432,7 @@ define i16 @test_v8i16_v8i8(<8 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v8i16_v8i8: ; AVX2: # %bb.0: @@ -1044,7 +1445,7 @@ define i16 @test_v8i16_v8i8(<8 x i8> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v8i16_v8i8: ; AVX512: # %bb.0: @@ -1063,7 +1464,7 @@ define i16 @test_v8i16_v8i8(<8 x i8> %a0) { ret i16 %2 } -define i16 @test_v16i16_v16i8(<16 x i8> %a0) { +define i16 @test_v16i16_v16i8(<16 x i8> %a0) nounwind { ; SSE2-LABEL: test_v16i16_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -1080,24 +1481,24 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; 
SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v16i16_v16i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbw %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $16, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v16i16_v16i8: ; AVX1-SLOW: # %bb.0: @@ -1113,7 +1514,7 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v16i16_v16i8: ; AVX1-FAST: # %bb.0: @@ -1126,7 +1527,7 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v16i16_v16i8: ; AVX2: # %bb.0: @@ -1142,7 +1543,7 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v16i16_v16i8: ; AVX512: # %bb.0: @@ -1164,7 +1565,7 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ret i16 %2 } -define i16 @test_v32i16_v32i8(<32 x i8> %a0) { +define i16 @test_v32i16_v32i8(<32 x i8> %a0) nounwind { ; SSE2-LABEL: test_v32i16_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -1187,29 +1588,29 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: paddw %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v32i16_v32i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE4-NEXT: pmovsxbw %xmm2, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE4-NEXT: pmovsxbw %xmm3, %xmm3 +; SSE4-NEXT: paddw %xmm2, %xmm3 +; SSE4-NEXT: pmovsxbw %xmm1, %xmm1 +; SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: paddw %xmm3, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $16, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v32i16_v32i8: ; AVX1-SLOW: # 
%bb.0: @@ -1232,7 +1633,7 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v32i16_v32i8: ; AVX1-FAST: # %bb.0: @@ -1254,7 +1655,7 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i16_v32i8: ; AVX2: # %bb.0: @@ -1273,7 +1674,7 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: @@ -1297,74 +1698,156 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ret i16 %2 } -define i16 @test_v64i16_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i16_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: paddw %xmm4, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: paddw %xmm4, %xmm6 -; SSE2-NEXT: 
paddw %xmm5, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 -; SSE41-NEXT: paddw %xmm4, %xmm5 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm6 -; SSE41-NEXT: paddw %xmm4, %xmm6 -; SSE41-NEXT: paddw %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: paddw %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddw 
%xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +define i16 @test_v64i16_v64i8(<64 x i8> %a0) nounwind { +; X86-SSE2-LABEL: test_v64i16_v64i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; X86-SSE2-NEXT: psraw $8, %xmm4 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; X86-SSE2-NEXT: psraw $8, %xmm5 +; X86-SSE2-NEXT: paddw %xmm4, %xmm5 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; X86-SSE2-NEXT: psraw $8, %xmm6 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X86-SSE2-NEXT: psraw $8, %xmm4 +; X86-SSE2-NEXT: paddw %xmm6, %xmm4 +; X86-SSE2-NEXT: paddw %xmm5, %xmm4 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: psraw $8, %xmm2 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: psraw $8, %xmm0 +; X86-SSE2-NEXT: paddw %xmm2, %xmm0 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X86-SSE2-NEXT: psraw $8, %xmm2 +; X86-SSE2-NEXT: punpcklbw 
{{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: psraw $8, %xmm1 +; X86-SSE2-NEXT: paddw %xmm2, %xmm1 +; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: paddw %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v64i16_v64i8: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; X64-SSE2-NEXT: psraw $8, %xmm4 +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; X64-SSE2-NEXT: psraw $8, %xmm5 +; X64-SSE2-NEXT: paddw %xmm4, %xmm5 +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; X64-SSE2-NEXT: psraw $8, %xmm4 +; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] +; X64-SSE2-NEXT: psraw $8, %xmm6 +; X64-SSE2-NEXT: paddw %xmm4, %xmm6 +; X64-SSE2-NEXT: paddw %xmm5, %xmm6 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: psraw $8, %xmm2 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: psraw $8, %xmm0 +; X64-SSE2-NEXT: paddw %xmm2, %xmm0 +; 
X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; X64-SSE2-NEXT: psraw $8, %xmm2 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: psraw $8, %xmm1 +; X64-SSE2-NEXT: paddw %xmm2, %xmm1 +; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: paddw %xmm6, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v64i16_v64i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm4 +; X86-SSE4-NEXT: pmovsxbw %xmm2, %xmm3 +; X86-SSE4-NEXT: pmovsxbw %xmm0, %xmm5 +; X86-SSE4-NEXT: paddw %xmm3, %xmm5 +; X86-SSE4-NEXT: pmovsxbw %xmm4, %xmm6 +; X86-SSE4-NEXT: pmovsxbw %xmm1, %xmm3 +; X86-SSE4-NEXT: paddw %xmm6, %xmm3 +; X86-SSE4-NEXT: paddw %xmm5, %xmm3 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X86-SSE4-NEXT: pmovsxbw %xmm2, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; X86-SSE4-NEXT: paddw %xmm2, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; X86-SSE4-NEXT: pmovsxbw %xmm2, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: pmovsxbw %xmm1, %xmm1 +; X86-SSE4-NEXT: paddw %xmm2, %xmm1 +; X86-SSE4-NEXT: paddw %xmm0, %xmm1 +; X86-SSE4-NEXT: paddw %xmm3, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddw %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: paddw %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE4-NEXT: psrld $16, %xmm0 +; X86-SSE4-NEXT: paddw %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v64i16_v64i8: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbw %xmm2, %xmm4 +; X64-SSE4-NEXT: pmovsxbw %xmm0, %xmm5 +; X64-SSE4-NEXT: paddw %xmm4, %xmm5 +; X64-SSE4-NEXT: pmovsxbw %xmm3, %xmm4 +; X64-SSE4-NEXT: pmovsxbw %xmm1, %xmm6 +; X64-SSE4-NEXT: paddw %xmm4, %xmm6 +; X64-SSE4-NEXT: paddw %xmm5, %xmm6 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X64-SSE4-NEXT: pmovsxbw %xmm2, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: pmovsxbw %xmm0, %xmm0 +; X64-SSE4-NEXT: paddw %xmm2, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; X64-SSE4-NEXT: pmovsxbw %xmm2, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: pmovsxbw %xmm1, %xmm1 +; X64-SSE4-NEXT: paddw %xmm2, %xmm1 +; X64-SSE4-NEXT: paddw %xmm0, %xmm1 +; X64-SSE4-NEXT: paddw %xmm6, %xmm1 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE4-NEXT: paddw %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: paddw %xmm0, %xmm1 +; X64-SSE4-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE4-NEXT: psrld $16, %xmm0 +; X64-SSE4-NEXT: paddw %xmm1, %xmm0 +; X64-SSE4-NEXT: movd %xmm0, %eax +; X64-SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE4-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v64i16_v64i8: ; AVX1-SLOW: # %bb.0: @@ -1398,7 +1881,7 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) { ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v64i16_v64i8: ; AVX1-FAST: # %bb.0: @@ -1431,7 
+1914,7 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) { ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v64i16_v64i8: ; AVX2: # %bb.0: @@ -1455,7 +1938,7 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v64i16_v64i8: ; AVX512: # %bb.0: @@ -1487,63 +1970,132 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) { ; vXi1 - sum of extended bool vectors ; -define i64 @test_v2i64_v2i1(<2 x i64> %a0) { -; SSE2-LABEL: test_v2i64_v2i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64_v2i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +define i64 @test_v2i64_v2i1(<2 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v2i64_v2i1: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v2i64_v2i1: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm1, %xmm0 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE41-LABEL: test_v2i64_v2i1: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: pxor %xmm1, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE41-NEXT: paddq %xmm1, %xmm0 +; X86-SSE41-NEXT: movd %xmm0, %eax +; X86-SSE41-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE41-NEXT: retl +; +; X64-SSE41-LABEL: test_v2i64_v2i1: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE41-NEXT: pxor %xmm1, %xmm1 +; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE41-NEXT: paddq %xmm1, %xmm0 +; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: retq +; +; X86-SSE42-LABEL: test_v2i64_v2i1: +; X86-SSE42: # %bb.0: +; X86-SSE42-NEXT: pxor %xmm1, %xmm1 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: paddq %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE42-NEXT: retl +; +; X64-SSE42-LABEL: test_v2i64_v2i1: +; X64-SSE42: # %bb.0: +; X64-SSE42-NEXT: pxor %xmm1, %xmm1 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE42-NEXT: paddq %xmm1, %xmm0 +; X64-SSE42-NEXT: movq %xmm0, %rax +; X64-SSE42-NEXT: retq +; +; X86-AVX1-LABEL: test_v2i64_v2i1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 
= xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v2i64_v2i1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i64_v2i1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i64_v2i1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i64_v2i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq %1 = icmp slt <2 x i64> %a0, zeroinitializer %2 = sext <2 x i1> %1 to <2 x i64> %3 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %2) ret i64 %3 } -define i32 @test_v4i32_v4i1(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: 
test_v4i32_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +define i32 @test_v4i32_v4i1(<4 x i32> %a0) nounwind { +; SSE-LABEL: test_v4i32_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i32_v4i1: ; AVX1-SLOW: # %bb.0: @@ -1554,7 +2106,7 @@ define i32 @test_v4i32_v4i1(<4 x i32> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i32_v4i1: ; AVX1-FAST: # %bb.0: @@ -1563,7 +2115,7 @@ define i32 @test_v4i32_v4i1(<4 x i32> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i32_v4i1: ; AVX2: # %bb.0: @@ -1574,7 +2126,7 @@ define i32 @test_v4i32_v4i1(<4 x i32> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i32_v4i1: ; AVX512: # %bb.0: @@ -1592,36 +2144,21 @@ define i32 @test_v4i32_v4i1(<4 x i32> %a0) { ret i32 %3 } -define i16 @test_v8i16_v8i1(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: 
paddw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +define i16 @test_v8i16_v8i1(<8 x i16> %a0) nounwind { +; SSE-LABEL: test_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v8i16_v8i1: ; AVX1-SLOW: # %bb.0: @@ -1635,7 +2172,7 @@ define i16 @test_v8i16_v8i1(<8 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v8i16_v8i1: ; AVX1-FAST: # %bb.0: @@ -1646,7 +2183,7 @@ define i16 @test_v8i16_v8i1(<8 x i16> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; 
AVX2-LABEL: test_v8i16_v8i1: ; AVX2: # %bb.0: @@ -1660,7 +2197,7 @@ define i16 @test_v8i16_v8i1(<8 x i16> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v8i16_v8i1: ; AVX512: # %bb.0: @@ -1681,30 +2218,18 @@ define i16 @test_v8i16_v8i1(<8 x i16> %a0) { ret i16 %3 } -define i8 @test_v16i8_v16i1(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE41-NEXT: paddb %xmm2, %xmm0 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +define i8 @test_v16i8_v16i1(<16 x i8> %a0) nounwind { +; SSE-LABEL: test_v16i8_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: paddb %xmm2, %xmm0 +; SSE-NEXT: psadbw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v16i8_v16i1: ; AVX: # %bb.0: @@ -1715,43 +2240,28 @@ define i8 @test_v16i8_v16i1(<16 x i8> %a0) { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = icmp slt <16 x i8> %a0, zeroinitializer %2 = sext <16 x i1> %1 to 
<16 x i8> %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) ret i8 %3 } -define i8 @test_v32i8_v32i1(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psadbw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: psadbw %xmm2, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +define i8 @test_v32i8_v32i1(<32 x i8> %a0) nounwind { +; SSE-LABEL: test_v32i8_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: paddb %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: psadbw %xmm2, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_v32i8_v32i1: ; AVX1: # %bb.0: @@ -1766,7 +2276,7 @@ define i8 @test_v32i8_v32i1(<32 x i8> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i8_v32i1: ; AVX2: # %bb.0: @@ -1781,7 +2291,7 @@ define i8 @test_v32i8_v32i1(<32 x i8> %a0) { ; 
AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i8_v32i1: ; AVX512: # %bb.0: @@ -1803,48 +2313,54 @@ define i8 @test_v32i8_v32i1(<32 x i8> %a0) { ret i8 %3 } -define i8 @test_v64i8_v64i1(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8_v64i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE2-NEXT: paddb %xmm3, %xmm0 -; SSE2-NEXT: psadbw %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8_v64i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtb %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE41-NEXT: paddb %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtb %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE41-NEXT: paddb %xmm3, %xmm0 -; SSE41-NEXT: psadbw %xmm4, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +define i8 @test_v64i8_v64i1(<64 x i8> %a0) nounwind { +; X86-SSE-LABEL: test_v64i8_v64i1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: pxor %xmm3, %xmm3 +; X86-SSE-NEXT: pxor %xmm4, 
%xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm4 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE-NEXT: paddb %xmm1, %xmm2 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: pcmpgtb 8(%ebp), %xmm0 +; X86-SSE-NEXT: paddb %xmm4, %xmm0 +; X86-SSE-NEXT: paddb %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE-NEXT: paddb %xmm0, %xmm1 +; X86-SSE-NEXT: psadbw %xmm3, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v64i8_v64i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: pxor %xmm5, %xmm5 +; X64-SSE-NEXT: pcmpgtb %xmm2, %xmm5 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE-NEXT: paddb %xmm5, %xmm2 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: pcmpgtb %xmm3, %xmm0 +; X64-SSE-NEXT: pxor %xmm3, %xmm3 +; X64-SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; X64-SSE-NEXT: paddb %xmm0, %xmm3 +; X64-SSE-NEXT: paddb %xmm2, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X64-SSE-NEXT: paddb %xmm3, %xmm0 +; X64-SSE-NEXT: psadbw %xmm4, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: test_v64i8_v64i1: ; AVX1: # %bb.0: @@ -1864,7 +2380,7 @@ define i8 @test_v64i8_v64i1(<64 x i8> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v64i8_v64i1: ; AVX2: # %bb.0: @@ -1881,7 +2397,7 @@ define i8 @test_v64i8_v64i1(<64 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v64i8_v64i1: ; AVX512: # 
%bb.0: @@ -1905,123 +2421,191 @@ define i8 @test_v64i8_v64i1(<64 x i8> %a0) { ret i8 %3 } -define i8 @test_v128i8_v128i1(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8_v128i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm9 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE2-NEXT: paddb %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 -; SSE2-NEXT: paddb %xmm0, %xmm6 -; SSE2-NEXT: paddb %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psadbw %xmm8, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8_v128i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: pxor %xmm9, %xmm9 -; SSE41-NEXT: pcmpgtb %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE41-NEXT: paddb %xmm9, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtb %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pcmpgtb %xmm2, %xmm6 -; SSE41-NEXT: paddb %xmm0, %xmm6 -; SSE41-NEXT: paddb %xmm4, %xmm6 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtb %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE41-NEXT: paddb %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpgtb %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm3, %xmm1 -; 
SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: psadbw %xmm8, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8_v128i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm7 -; AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8_v128i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtb %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpcmpgtb %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i8 @test_v128i8_v128i1(<128 x i8> %a0) nounwind { +; X86-SSE-LABEL: test_v128i8_v128i1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: pxor %xmm3, %xmm3 +; X86-SSE-NEXT: pxor %xmm4, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm0, %xmm4 +; X86-SSE-NEXT: pxor %xmm5, %xmm5 +; X86-SSE-NEXT: pcmpgtb %xmm2, %xmm5 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: pcmpgtb 24(%ebp), %xmm1 +; X86-SSE-NEXT: paddb %xmm4, %xmm1 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: pcmpgtb 56(%ebp), %xmm0 +; X86-SSE-NEXT: paddb %xmm5, %xmm0 +; X86-SSE-NEXT: paddb %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: pcmpgtb 40(%ebp), %xmm1 +; X86-SSE-NEXT: paddb %xmm2, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb 72(%ebp), %xmm2 +; X86-SSE-NEXT: pxor %xmm4, %xmm4 +; X86-SSE-NEXT: pcmpgtb 8(%ebp), %xmm4 +; X86-SSE-NEXT: paddb %xmm2, %xmm4 +; X86-SSE-NEXT: paddb %xmm1, %xmm4 +; X86-SSE-NEXT: paddb %xmm0, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X86-SSE-NEXT: paddb %xmm4, %xmm0 +; X86-SSE-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v128i8_v128i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm8, %xmm8 +; X64-SSE-NEXT: pxor %xmm9, %xmm9 +; X64-SSE-NEXT: pcmpgtb %xmm4, %xmm9 +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: pcmpgtb %xmm0, %xmm4 +; X64-SSE-NEXT: paddb %xmm9, %xmm4 +; X64-SSE-NEXT: 
pxor %xmm0, %xmm0 +; X64-SSE-NEXT: pcmpgtb %xmm6, %xmm0 +; X64-SSE-NEXT: pxor %xmm6, %xmm6 +; X64-SSE-NEXT: pcmpgtb %xmm2, %xmm6 +; X64-SSE-NEXT: paddb %xmm0, %xmm6 +; X64-SSE-NEXT: paddb %xmm4, %xmm6 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: pcmpgtb %xmm5, %xmm0 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE-NEXT: paddb %xmm0, %xmm2 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: pcmpgtb %xmm7, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: pcmpgtb %xmm3, %xmm1 +; X64-SSE-NEXT: paddb %xmm0, %xmm1 +; X64-SSE-NEXT: paddb %xmm2, %xmm1 +; X64-SSE-NEXT: paddb %xmm6, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE-NEXT: paddb %xmm1, %xmm0 +; X64-SSE-NEXT: psadbw %xmm8, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_v128i8_v128i1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm6 +; X86-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpgtb 8(%ebp), %xmm3, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 +; X86-AVX1-NEXT: vpcmpgtb 24(%ebp), %xmm3, %xmm4 +; X86-AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v128i8_v128i1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5 +; X64-AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; X64-AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm6 +; X64-AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm7 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; X64-AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v128i8_v128i1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm1 +; X86-AVX2-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vpcmpgtb %ymm0, %ymm3, %ymm0 +; X86-AVX2-NEXT: vpaddb 
%ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpcmpgtb 8(%ebp), %ymm3, %ymm2 +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v128i8_v128i1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpcmpgtb %ymm2, %ymm4, %ymm2 +; X64-AVX2-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm2 +; X64-AVX2-NEXT: vpcmpgtb %ymm1, %ymm4, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v128i8_v128i1: ; AVX512: # %bb.0: @@ -2073,3 +2657,12 @@ declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX512BW: {{.*}} +; AVX512BWVL: {{.*}} +; SSE41: {{.*}} +; SSE42: {{.*}} +; X64-AVX1-FAST: {{.*}} +; X64-AVX1-SLOW: {{.*}} +; X86-AVX1-FAST: {{.*}} +; X86-AVX1-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll index d99b200385585..9d13de751812f 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll @@ -1,92 +1,204 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4,SSE41 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4,SSE41 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s 
--check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X86-AVX1,X86-AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X64-AVX1,X64-AVX1-SLOW +; RUN: llc < %s -mtriple=i686-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X86-AVX1,X86-AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X64-AVX1,X64-AVX1-FAST +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL ; ; vXi64 ; -define i64 @test_v2i64_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i64_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +define i64 @test_v2i64_v2i32(<2 x i32> %a0) nounwind { +; X86-SSE2-LABEL: test_v2i64_v2i32: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 
= xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v2i64_v2i32: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: xorps %xmm1, %xmm1 +; X64-SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-SSE2-NEXT: psrlq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm1, %xmm0 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v2i64_v2i32: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v2i64_v2i32: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v2i64_v2i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v2i64_v2i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i64_v2i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: 
vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i64_v2i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i64_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq %1 = zext <2 x i32> %a0 to <2 x i64> %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1) ret i64 %2 } -define i64 @test_v4i64_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i64_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i64_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v4i64_v4i16(<4 x i16> %a0) nounwind { +; X86-SSE2-LABEL: test_v4i64_v4i16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: test_v4i64_v4i16: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE2-NEXT: paddq %xmm0, %xmm1 +; 
X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: test_v4i64_v4i16: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE4-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: test_v4i64_v4i16: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE4-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X64-SSE4-NEXT: paddq %xmm1, %xmm0 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE4-NEXT: paddq %xmm0, %xmm1 +; X64-SSE4-NEXT: movq %xmm1, %rax +; X64-SSE4-NEXT: retq +; +; X86-AVX1-LABEL: test_v4i64_v4i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v4i64_v4i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v4i64_v4i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v4i64_v4i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64_v4i16: ; AVX512: # %bb.0: @@ -103,43 +215,148 @@ define i64 @test_v4i64_v4i16(<4 x i16> %a0) { ret i64 %2 } -define i64 @test_v8i64_v8i8(<8 x i8> %a0) { -; SSE-LABEL: test_v8i64_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psadbw %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v8i64_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +define i64 @test_v8i64_v8i8(<8 x i8> %a0) nounwind { +; X86-SSE2-LABEL: test_v8i64_v8i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v8i64_v8i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: 
pxor %xmm1, %xmm1 +; X64-SSE-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v8i64_v8i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pxor %xmm1, %xmm1 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v8i64_v8i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v8i64_v8i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i64_v8i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v8i64_v8i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i64> %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1) ret i64 %2 } -define i64 @test_v16i64_v16i8(<16 x i8> %a0) { -; SSE-LABEL: test_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psadbw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v16i64_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +define i64 @test_v16i64_v16i8(<16 x i8> %a0) nounwind { +; X86-SSE2-LABEL: test_v16i64_v16i8: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v16i64_v16i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: movq %xmm0, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v16i64_v16i8: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pxor %xmm1, %xmm1 +; X86-SSE4-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v16i64_v16i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v16i64_v16i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v16i64_v16i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v16i64_v16i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq %1 = zext <16 x i8> %a0 to <16 x i64> %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1) ret i64 %2 @@ -149,7 +366,7 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) { ; vXi32 ; -define i32 @test_v2i32_v2i16(<2 x i16> %a0) { +define i32 @test_v2i32_v2i16(<2 x i16> %a0) nounwind { ; SSE2-LABEL: test_v2i32_v2i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -158,15 +375,15 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; -; SSE41-LABEL: test_v2i32_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE4-LABEL: test_v2i32_v2i16: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE4-NEXT: psrld $16, %xmm0 +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v2i32_v2i16: ; AVX1-SLOW: # %bb.0: @@ -174,14 +391,14 @@ 
define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v2i32_v2i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v2i32_v2i16: ; AVX2: # %bb.0: @@ -189,7 +406,7 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i32_v2i16: ; AVX512: # %bb.0: @@ -203,22 +420,22 @@ define i32 @test_v2i32_v2i16(<2 x i16> %a0) { ret i32 %2 } -define i32 @test_v4i32(<4 x i8> %a0) { +define i32 @test_v4i32(<4 x i8> %a0) nounwind { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE4-LABEL: test_v4i32: +; SSE4: # %bb.0: +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE4-NEXT: psadbw %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_v4i32: ; AVX1: # %bb.0: @@ -226,7 +443,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: 
retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: @@ -234,7 +451,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: @@ -249,26 +466,26 @@ define i32 @test_v4i32(<4 x i8> %a0) { ret i32 %2 } -define i32 @test_v8i32_v8i8(<8 x i8> %a0) { +define i32 @test_v8i32_v8i8(<8 x i8> %a0) nounwind { ; SSE-LABEL: test_v8i32_v8i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v8i32_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = zext <8 x i8> %a0 to <8 x i32> %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) ret i32 %2 } -define i32 @test_v16i32_v16i8(<16 x i8> %a0) { +define i32 @test_v16i32_v16i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: test_v16i32_v16i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 @@ -276,7 +493,7 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v16i32_v16i8: ; AVX: # %bb.0: @@ -285,13 +502,13 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) { ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = zext <16 x i8> %a0 to <16 x i32> %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) ret i32 %2 } -define i32 @test_v32i32_v32i8(<32 x i8> %a0) { +define i32 @test_v32i32_v32i8(<32 x i8> %a0) nounwind { ; SSE-LABEL: test_v32i32_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, 
%xmm2 @@ -301,7 +518,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_v32i32_v32i8: ; AVX1: # %bb.0: @@ -314,7 +531,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i32_v32i8: ; AVX2: # %bb.0: @@ -326,7 +543,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i32_v32i8: ; AVX512: # %bb.0: @@ -348,7 +565,7 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) { ; vXi16 ; -define i16 @test_v2i16_v2i8(<2 x i8> %a0) { +define i16 @test_v2i16_v2i8(<2 x i8> %a0) nounwind { ; SSE2-LABEL: test_v2i16_v2i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -358,17 +575,17 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v2i16_v2i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $16, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: 
movd %xmm1, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v2i16_v2i8: ; AVX1-SLOW: # %bb.0: @@ -377,7 +594,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v2i16_v2i8: ; AVX1-FAST: # %bb.0: @@ -385,7 +602,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v2i16_v2i8: ; AVX2: # %bb.0: @@ -394,7 +611,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i16_v2i8: ; AVX512: # %bb.0: @@ -409,7 +626,7 @@ define i16 @test_v2i16_v2i8(<2 x i8> %a0) { ret i16 %2 } -define i16 @test_v4i16_v4i8(<4 x i8> %a0) { +define i16 @test_v4i16_v4i8(<4 x i8> %a0) nounwind { ; SSE2-LABEL: test_v4i16_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -421,19 +638,19 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; 
SSE41-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: test_v4i16_v4i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrld $16, %xmm0 +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: # kill: def $ax killed $ax killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i16_v4i8: ; AVX1-SLOW: # %bb.0: @@ -444,7 +661,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i16_v4i8: ; AVX1-FAST: # %bb.0: @@ -454,7 +671,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i16_v4i8: ; AVX2: # %bb.0: @@ -465,7 +682,7 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i16_v4i8: ; AVX512: # %bb.0: @@ -483,14 +700,14 @@ define i16 @test_v4i16_v4i8(<4 x i8> %a0) { } -define i16 @test_v8i16_v8i8(<8 x i8> %a0) { +define i16 @test_v8i16_v8i8(<8 x i8> %a0) nounwind { ; SSE-LABEL: test_v8i16_v8i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v8i16_v8i8: ; AVX: # %bb.0: @@ -498,13 +715,13 @@ define i16 
@test_v8i16_v8i8(<8 x i8> %a0) { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = zext <8 x i8> %a0 to <8 x i16> %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) ret i16 %2 } -define i16 @test_v16i16_v16i8(<16 x i8> %a0) { +define i16 @test_v16i16_v16i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: test_v16i16_v16i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 @@ -513,7 +730,7 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v16i16_v16i8: ; AVX: # %bb.0: @@ -523,13 +740,13 @@ define i16 @test_v16i16_v16i8(<16 x i8> %a0) { ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = zext <16 x i8> %a0 to <16 x i16> %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) ret i16 %2 } -define i16 @test_v32i16_v32i8(<32 x i8> %a0) { +define i16 @test_v32i16_v32i8(<32 x i8> %a0) nounwind { ; SSE-LABEL: test_v32i16_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 @@ -540,7 +757,7 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_v32i16_v32i8: ; AVX1: # %bb.0: @@ -554,7 +771,7 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i16_v32i8: ; AVX2: # %bb.0: @@ -567,7 +784,7 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: 
vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: @@ -586,22 +803,44 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) { ret i16 %2 } -define i16 @test_v64i16_v64i8(<64 x i8> %a0) { -; SSE-LABEL: test_v64i16_v64i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: psadbw %xmm4, %xmm3 -; SSE-NEXT: psadbw %xmm4, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: psadbw %xmm4, %xmm2 -; SSE-NEXT: psadbw %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +define i16 @test_v64i16_v64i8(<64 x i8> %a0) nounwind { +; X86-SSE-LABEL: test_v64i16_v64i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: pxor %xmm3, %xmm3 +; X86-SSE-NEXT: psadbw %xmm3, %xmm2 +; X86-SSE-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: psadbw %xmm3, %xmm1 +; X86-SSE-NEXT: psadbw 8(%ebp), %xmm3 +; X86-SSE-NEXT: paddq %xmm1, %xmm3 +; X86-SSE-NEXT: paddq %xmm0, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v64i16_v64i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: psadbw %xmm4, %xmm3 +; X64-SSE-NEXT: psadbw %xmm4, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: psadbw %xmm4, %xmm2 +; X64-SSE-NEXT: psadbw %xmm4, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movd 
%xmm1, %eax +; X64-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: test_v64i16_v64i8: ; AVX1: # %bb.0: @@ -620,7 +859,7 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v64i16_v64i8: ; AVX2: # %bb.0: @@ -635,7 +874,7 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v64i16_v64i8: ; AVX512: # %bb.0: @@ -682,3 +921,11 @@ declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512BW: {{.*}} +; AVX512BWVL: {{.*}} +; SSE41: {{.*}} +; X64-AVX1-FAST: {{.*}} +; X64-AVX1-SLOW: {{.*}} +; X86-AVX1-FAST: {{.*}} +; X86-AVX1-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index aed4e023e340c..ff5de0e587505 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -1,30 +1,77 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE4,X86-SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE4,X64-SSE4 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X86-AVX1,X86-AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW,X64-AVX1,X64-AVX1-SLOW +; RUN: llc < %s -mtriple=i686-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X86-AVX1,X86-AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST,X64-AVX1,X64-AVX1-FAST +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 ; ; vXi64 ; -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE-LABEL: test_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +define i64 @test_v2i64(<2 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v2i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v2i64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v2i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v2i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v2i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; 
X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2i64: ; AVX512: # %bb.0: @@ -36,34 +83,75 @@ define i64 @test_v2i64(<2 x i64> %a0) { ret i64 %1 } -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE-LABEL: test_v4i64: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v4i64(<4 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v4i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v4i64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v4i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: movd %xmm1, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; 
X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v4i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v4i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: @@ -78,40 +166,101 @@ define i64 @test_v4i64(<4 x i64> %a0) { ret i64 %1 } -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE-LABEL: test_v8i64: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: 
vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v8i64(<8 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v8i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: paddq 8(%ebp), %xmm1 +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v8i64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v8i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: paddq 8(%ebp), %xmm1 +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE4-NEXT: movl %ebp, %esp +; 
X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v8i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v8i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v8i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: @@ -128,52 +277,142 @@ define i64 @test_v8i64(<8 x i64> %a0) { ret i64 %1 } -define i64 
@test_v16i64(<16 x i64> %a0) { -; SSE-LABEL: test_v16i64: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i64 @test_v16i64(<16 x i64> %a0) nounwind { +; X86-SSE2-LABEL: test_v16i64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: paddq 56(%ebp), %xmm2 +; X86-SSE2-NEXT: paddq 24(%ebp), %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: paddq 72(%ebp), %xmm3 +; X86-SSE2-NEXT: paddq 
40(%ebp), %xmm1 +; X86-SSE2-NEXT: paddq %xmm3, %xmm1 +; X86-SSE2-NEXT: paddq %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: paddq %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X64-SSE-LABEL: test_v16i64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddq %xmm6, %xmm2 +; X64-SSE-NEXT: paddq %xmm4, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: paddq %xmm7, %xmm3 +; X64-SSE-NEXT: paddq %xmm5, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: paddq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE-NEXT: paddq %xmm1, %xmm0 +; X64-SSE-NEXT: movq %xmm0, %rax +; X64-SSE-NEXT: retq +; +; X86-SSE4-LABEL: test_v16i64: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE4-NEXT: paddq 56(%ebp), %xmm2 +; X86-SSE4-NEXT: paddq 24(%ebp), %xmm0 +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: paddq 72(%ebp), %xmm3 +; X86-SSE4-NEXT: paddq 40(%ebp), %xmm1 +; X86-SSE4-NEXT: paddq %xmm3, %xmm1 +; X86-SSE4-NEXT: paddq %xmm0, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE4-NEXT: paddq %xmm1, %xmm0 +; X86-SSE4-NEXT: movd %xmm0, %eax +; X86-SSE4-NEXT: pextrd $1, %xmm0, %edx +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: retl +; +; X86-AVX1-LABEL: test_v16i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq 
8(%ebp), %xmm1, %xmm2 +; X86-AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vpaddq 24(%ebp), %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v16i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v16i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddq 8(%ebp), %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; 
X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v16i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: @@ -195,33 +434,33 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; vXi32 ; -define i32 @test_v2i32(<2 x i32> %a0) { +define i32 @test_v2i32(<2 x i32> %a0) nounwind { ; SSE-LABEL: test_v2i32: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v2i32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v2i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i32: ; AVX512: # %bb.0: @@ -233,7 +472,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { ret i32 %1 } -define i32 @test_v4i32(<4 x i32> %a0) { +define i32 @test_v4i32(<4 x i32> %a0) nounwind { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -241,7 +480,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: paddd %xmm1, %xmm0 
; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i32: ; AVX1-SLOW: # %bb.0: @@ -250,14 +489,14 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: @@ -266,7 +505,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: @@ -280,7 +519,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ret i32 %1 } -define i32 @test_v8i32(<8 x i32> %a0) { +define i32 @test_v8i32(<8 x i32> %a0) nounwind { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm1, %xmm0 @@ -289,7 +528,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v8i32: ; AVX1-SLOW: # %bb.0: @@ -301,7 +540,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v8i32: ; AVX1-FAST: # %bb.0: @@ -311,7 +550,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v8i32: ; AVX2: # 
%bb.0: @@ -323,7 +562,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: @@ -340,18 +579,36 @@ define i32 @test_v8i32(<8 x i32> %a0) { ret i32 %1 } -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE-LABEL: test_v16i32: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq +define i32 @test_v16i32(<16 x i32> %a0) nounwind { +; X86-SSE-LABEL: test_v16i32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: paddd %xmm2, %xmm0 +; X86-SSE-NEXT: paddd 8(%ebp), %xmm1 +; X86-SSE-NEXT: paddd %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE-NEXT: paddd %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: paddd %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v16i32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddd %xmm3, %xmm1 +; X64-SSE-NEXT: paddd %xmm2, %xmm0 +; X64-SSE-NEXT: paddd %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddd %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE-NEXT: paddd %xmm1, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16i32: ; AVX1-SLOW: # %bb.0: @@ -366,7 +623,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: vzeroupper -; 
AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v16i32: ; AVX1-FAST: # %bb.0: @@ -380,7 +637,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v16i32: ; AVX2: # %bb.0: @@ -393,7 +650,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: @@ -412,78 +669,173 @@ define i32 @test_v16i32(<16 x i32> %a0) { ret i32 %1 } -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE-LABEL: test_v32i32: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: paddd %xmm4, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm7, %xmm3 -; SSE-NEXT: paddd %xmm5, %xmm1 -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v32i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: 
vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v32i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i32 @test_v32i32(<32 x i32> %a0) nounwind { +; X86-SSE-LABEL: test_v32i32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE-NEXT: paddd 56(%ebp), %xmm2 +; X86-SSE-NEXT: paddd 24(%ebp), %xmm0 +; X86-SSE-NEXT: paddd %xmm2, %xmm0 +; X86-SSE-NEXT: paddd 72(%ebp), %xmm3 +; X86-SSE-NEXT: paddd 40(%ebp), %xmm1 +; X86-SSE-NEXT: paddd %xmm3, %xmm1 +; X86-SSE-NEXT: paddd %xmm0, %xmm1 +; 
X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE-NEXT: paddd %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: paddd %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v32i32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddd %xmm6, %xmm2 +; X64-SSE-NEXT: paddd %xmm4, %xmm0 +; X64-SSE-NEXT: paddd %xmm2, %xmm0 +; X64-SSE-NEXT: paddd %xmm7, %xmm3 +; X64-SSE-NEXT: paddd %xmm5, %xmm1 +; X64-SSE-NEXT: paddd %xmm3, %xmm1 +; X64-SSE-NEXT: paddd %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE-NEXT: paddd %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE-NEXT: paddd %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-SLOW-LABEL: test_v32i32: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: pushl %ebp +; X86-AVX1-SLOW-NEXT: movl %esp, %ebp +; X86-AVX1-SLOW-NEXT: andl $-32, %esp +; X86-AVX1-SLOW-NEXT: subl $32, %esp +; X86-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm3 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddd 8(%ebp), %xmm1, %xmm2 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddd 24(%ebp), %xmm1, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: movl %ebp, %esp +; X86-AVX1-SLOW-NEXT: popl %ebp +; X86-AVX1-SLOW-NEXT: vzeroupper +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: 
test_v32i32: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX1-SLOW-NEXT: vzeroupper +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v32i32: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: pushl %ebp +; X86-AVX1-FAST-NEXT: movl %esp, %ebp +; X86-AVX1-FAST-NEXT: andl $-32, %esp +; X86-AVX1-FAST-NEXT: subl $32, %esp +; X86-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm3 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddd 8(%ebp), %xmm1, %xmm2 +; X86-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-FAST-NEXT: vpaddd 24(%ebp), %xmm1, %xmm1 +; X86-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; X86-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: movl %ebp, %esp +; X86-AVX1-FAST-NEXT: popl %ebp +; X86-AVX1-FAST-NEXT: vzeroupper +; X86-AVX1-FAST-NEXT: retl +; 
+; X64-AVX1-FAST-LABEL: test_v32i32: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; X64-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; X64-AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; X64-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: vzeroupper +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v32i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v32i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: @@ -507,7 +859,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; vXi16 ; -define i16 @test_v2i16(<2 x i16> %a0) { +define i16 @test_v2i16(<2 x i16> %a0) nounwind { ; SSE-LABEL: test_v2i16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -515,7 +867,7 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v2i16: ; AVX1-SLOW: # %bb.0: @@ -523,14 +875,14 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v2i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v2i16: ; AVX2: # %bb.0: @@ -538,7 +890,7 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i16: ; AVX512: # %bb.0: @@ -551,7 +903,7 @@ define i16 @test_v2i16(<2 x i16> %a0) { ret i16 %1 } -define i16 @test_v4i16(<4 x i16> %a0) { +define i16 @test_v4i16(<4 x i16> %a0) nounwind { ; SSE-LABEL: test_v4i16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -561,7 +913,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: movd 
%xmm0, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v4i16: ; AVX1-SLOW: # %bb.0: @@ -571,7 +923,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v4i16: ; AVX1-FAST: # %bb.0: @@ -580,7 +932,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i16: ; AVX2: # %bb.0: @@ -590,7 +942,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i16: ; AVX512: # %bb.0: @@ -605,7 +957,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { ret i16 %1 } -define i16 @test_v8i16(<8 x i16> %a0) { +define i16 @test_v8i16(<8 x i16> %a0) nounwind { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -617,7 +969,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v8i16: ; AVX1-SLOW: # %bb.0: @@ -629,7 +981,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v8i16: ; AVX1-FAST: # %bb.0: @@ -638,7 +990,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; 
AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v8i16: ; AVX2: # %bb.0: @@ -650,7 +1002,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v8i16: ; AVX512: # %bb.0: @@ -667,7 +1019,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ret i16 %1 } -define i16 @test_v16i16(<16 x i16> %a0) { +define i16 @test_v16i16(<16 x i16> %a0) nounwind { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -680,7 +1032,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-SLOW-LABEL: test_v16i16: ; AVX1-SLOW: # %bb.0: @@ -695,7 +1047,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v16i16: ; AVX1-FAST: # %bb.0: @@ -707,7 +1059,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: @@ -722,7 +1074,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: @@ -742,22 +1094,44 @@ define i16 @test_v16i16(<16 x i16> %a0) { ret i16 %1 } -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE-LABEL: test_v32i16: -; SSE: # 
%bb.0: -; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq +define i16 @test_v32i16(<32 x i16> %a0) nounwind { +; X86-SSE-LABEL: test_v32i16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: paddw %xmm2, %xmm0 +; X86-SSE-NEXT: paddw 8(%ebp), %xmm1 +; X86-SSE-NEXT: paddw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE-NEXT: paddw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: paddw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: paddw %xmm1, %xmm0 +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v32i16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddw %xmm3, %xmm1 +; X64-SSE-NEXT: paddw %xmm2, %xmm0 +; X64-SSE-NEXT: paddw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE-NEXT: paddw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: paddw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v32i16: ; AVX1-SLOW: # %bb.0: @@ -775,7 +1149,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax ; AVX1-SLOW-NEXT: # 
kill: def $ax killed $ax killed $eax ; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq +; AVX1-SLOW-NEXT: ret{{[l|q]}} ; ; AVX1-FAST-LABEL: test_v32i16: ; AVX1-FAST: # %bb.0: @@ -792,7 +1166,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq +; AVX1-FAST-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i16: ; AVX2: # %bb.0: @@ -808,7 +1182,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: @@ -830,91 +1204,199 @@ define i16 @test_v32i16(<32 x i16> %a0) { ret i16 %1 } -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE-LABEL: test_v64i16: -; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm6, %xmm2 -; SSE-NEXT: paddw %xmm4, %xmm0 -; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm7, %xmm3 -; SSE-NEXT: paddw %xmm5, %xmm1 -; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v64i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4 -; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm5 -; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm5, %xmm4 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddw %xmm1, 
%xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm4, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v64i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4 -; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm5 -; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm5, %xmm4 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm4, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, 
%xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i16 @test_v64i16(<64 x i16> %a0) nounwind { +; X86-SSE-LABEL: test_v64i16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE-NEXT: paddw 56(%ebp), %xmm2 +; X86-SSE-NEXT: paddw 24(%ebp), %xmm0 +; X86-SSE-NEXT: paddw %xmm2, %xmm0 +; X86-SSE-NEXT: paddw 72(%ebp), %xmm3 +; X86-SSE-NEXT: paddw 40(%ebp), %xmm1 +; X86-SSE-NEXT: paddw %xmm3, %xmm1 +; X86-SSE-NEXT: paddw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE-NEXT: paddw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: paddw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: paddw %xmm1, %xmm0 +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v64i16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddw %xmm6, %xmm2 +; X64-SSE-NEXT: paddw %xmm4, %xmm0 +; X64-SSE-NEXT: paddw %xmm2, %xmm0 +; X64-SSE-NEXT: paddw %xmm7, %xmm3 +; X64-SSE-NEXT: paddw %xmm5, %xmm1 +; X64-SSE-NEXT: paddw %xmm3, %xmm1 +; X64-SSE-NEXT: paddw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE-NEXT: paddw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE-NEXT: paddw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: paddw %xmm1, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-SLOW-LABEL: test_v64i16: +; X86-AVX1-SLOW: # %bb.0: +; X86-AVX1-SLOW-NEXT: pushl %ebp +; X86-AVX1-SLOW-NEXT: movl %esp, %ebp +; X86-AVX1-SLOW-NEXT: 
andl $-32, %esp +; X86-AVX1-SLOW-NEXT: subl $32, %esp +; X86-AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm3 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddw 8(%ebp), %xmm1, %xmm2 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; X86-AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddw 24(%ebp), %xmm1, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X86-AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-SLOW-NEXT: movl %ebp, %esp +; X86-AVX1-SLOW-NEXT: popl %ebp +; X86-AVX1-SLOW-NEXT: vzeroupper +; X86-AVX1-SLOW-NEXT: retl +; +; X64-AVX1-SLOW-LABEL: test_v64i16: +; X64-AVX1-SLOW: # %bb.0: +; X64-AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm5 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm5, %xmm4 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm4, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: 
vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-SLOW-NEXT: vzeroupper +; X64-AVX1-SLOW-NEXT: retq +; +; X86-AVX1-FAST-LABEL: test_v64i16: +; X86-AVX1-FAST: # %bb.0: +; X86-AVX1-FAST-NEXT: pushl %ebp +; X86-AVX1-FAST-NEXT: movl %esp, %ebp +; X86-AVX1-FAST-NEXT: andl $-32, %esp +; X86-AVX1-FAST-NEXT: subl $32, %esp +; X86-AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm3 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddw 8(%ebp), %xmm1, %xmm2 +; X86-AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; X86-AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-FAST-NEXT: vpaddw 24(%ebp), %xmm1, %xmm1 +; X86-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; X86-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X86-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X86-AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX1-FAST-NEXT: movl %ebp, %esp +; X86-AVX1-FAST-NEXT: popl %ebp +; X86-AVX1-FAST-NEXT: vzeroupper +; X86-AVX1-FAST-NEXT: retl +; +; X64-AVX1-FAST-LABEL: test_v64i16: +; X64-AVX1-FAST: # %bb.0: +; X64-AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; X64-AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm5 +; X64-AVX1-FAST-NEXT: vpaddw %xmm4, %xmm5, %xmm4 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X64-AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X64-AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddw %xmm2, 
%xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpaddw %xmm0, %xmm4, %xmm0 +; X64-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X64-AVX1-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX1-FAST-NEXT: vzeroupper +; X64-AVX1-FAST-NEXT: retq +; +; X86-AVX2-LABEL: test_v64i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddw 8(%ebp), %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v64i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; 
X64-AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: @@ -941,7 +1423,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; vXi8 ; -define i8 @test_v2i8(<2 x i8> %a0) { +define i8 @test_v2i8(<2 x i8> %a0) nounwind { ; SSE-LABEL: test_v2i8: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -949,7 +1431,7 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: @@ -957,7 +1439,7 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v2i8: ; AVX512: # %bb.0: @@ -970,27 +1452,71 @@ define i8 @test_v2i8(<2 x i8> %a0) { ret i8 %1 } -define i8 @test_v2i8_load(ptr %p) { -; SSE-LABEL: test_v2i8_load: -; SSE: # %bb.0: -; SSE-NEXT: movzwl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i8_load: -; AVX: # %bb.0: -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +define i8 @test_v2i8_load(ptr %p) nounwind { +; X86-SSE-LABEL: test_v2i8_load: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzwl (%eax), %eax +; X86-SSE-NEXT: movd %eax, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrlw $8, %xmm1 +; X86-SSE-NEXT: 
paddb %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v2i8_load: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movzwl (%rdi), %eax +; X64-SSE-NEXT: movd %eax, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrlw $8, %xmm1 +; X64-SSE-NEXT: paddb %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_v2i8_load: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movzwl (%eax), %eax +; X86-AVX1-NEXT: vmovd %eax, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v2i8_load: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: movzwl (%rdi), %eax +; X64-AVX1-NEXT: vmovd %eax, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v2i8_load: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movzwl (%eax), %eax +; X86-AVX2-NEXT: vmovd %eax, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v2i8_load: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movzwl (%rdi), %eax +; X64-AVX2-NEXT: vmovd %eax, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2i8_load: ; AVX512: # %bb.0: @@ -1006,7 +1532,7 @@ define 
i8 @test_v2i8_load(ptr %p) { ret i8 %1 } -define i8 @test_v4i8(<4 x i8> %a0) { +define i8 @test_v4i8(<4 x i8> %a0) nounwind { ; SSE2-LABEL: test_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -1014,16 +1540,16 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq +; SSE4-LABEL: test_v4i8: +; SSE4: # %bb.0: +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE4-NEXT: psadbw %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax +; SSE4-NEXT: # kill: def $al killed $al killed $eax +; SSE4-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_v4i8: ; AVX1: # %bb.0: @@ -1032,7 +1558,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v4i8: ; AVX2: # %bb.0: @@ -1041,7 +1567,7 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: @@ -1056,24 +1582,63 @@ define i8 @test_v4i8(<4 x i8> %a0) { ret i8 %1 } -define i8 @test_v4i8_load(ptr %p) { -; SSE-LABEL: test_v4i8_load: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psadbw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: 
test_v4i8_load: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +define i8 @test_v4i8_load(ptr %p) nounwind { +; X86-SSE-LABEL: test_v4i8_load: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v4i8_load: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_v4i8_load: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v4i8_load: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v4i8_load: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $al killed 
$al killed $eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v4i8_load: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i8_load: ; AVX512: # %bb.0: @@ -1088,14 +1653,14 @@ define i8 @test_v4i8_load(ptr %p) { ret i8 %1 } -define i8 @test_v8i8(<8 x i8> %a0) { +define i8 @test_v8i8(<8 x i8> %a0) nounwind { ; SSE-LABEL: test_v8i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: @@ -1103,7 +1668,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: @@ -1116,24 +1681,63 @@ define i8 @test_v8i8(<8 x i8> %a0) { ret i8 %1 } -define i8 @test_v8i8_load(ptr %p) { -; SSE-LABEL: test_v8i8_load: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psadbw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v8i8_load: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +define i8 @test_v8i8_load(ptr %p) nounwind { +; X86-SSE-LABEL: test_v8i8_load: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; 
X86-SSE-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v8i8_load: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: psadbw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_v8i8_load: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v8i8_load: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v8i8_load: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v8i8_load: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i8_load: ; AVX512: # %bb.0: @@ -1148,7 +1752,7 @@ define i8 @test_v8i8_load(ptr %p) { ret i8 %1 } -define i8 @test_v16i8(<16 x i8> %a0) { +define i8 @test_v16i8(<16 x i8> 
%a0) nounwind { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1157,7 +1761,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; SSE-NEXT: psadbw %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: @@ -1167,7 +1771,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v16i8: ; AVX512: # %bb.0: @@ -1182,7 +1786,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ret i8 %1 } -define i8 @test_v32i8(<32 x i8> %a0) { +define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm1, %xmm0 @@ -1192,7 +1796,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE-NEXT: psadbw %xmm1, %xmm0 ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: @@ -1205,7 +1809,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: @@ -1218,7 +1822,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: @@ -1236,19 +1840,38 @@ define i8 @test_v32i8(<32 x i8> %a0) { ret i8 %1 } -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE-LABEL: test_v64i8: -; SSE: # %bb.0: -; SSE-NEXT: paddb %xmm3, %xmm1 -; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddb %xmm0, %xmm1 -; 
SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: psadbw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +define i8 @test_v64i8(<64 x i8> %a0) nounwind { +; X86-SSE-LABEL: test_v64i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: paddb %xmm2, %xmm0 +; X86-SSE-NEXT: paddb 8(%ebp), %xmm1 +; X86-SSE-NEXT: paddb %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE-NEXT: paddb %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: psadbw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v64i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddb %xmm3, %xmm1 +; X64-SSE-NEXT: paddb %xmm2, %xmm0 +; X64-SSE-NEXT: paddb %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddb %xmm0, %xmm1 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: psadbw %xmm1, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: test_v64i8: ; AVX1: # %bb.0: @@ -1264,7 +1887,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: @@ -1278,7 +1901,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: @@ -1298,61 +1921,134 @@ define i8 @test_v64i8(<64 x i8> %a0) { ret i8 %1 } -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE-LABEL: test_v128i8: -; SSE: # %bb.0: -; 
SSE-NEXT: paddb %xmm7, %xmm3 -; SSE-NEXT: paddb %xmm5, %xmm1 -; SSE-NEXT: paddb %xmm3, %xmm1 -; SSE-NEXT: paddb %xmm6, %xmm2 -; SSE-NEXT: paddb %xmm4, %xmm0 -; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: psadbw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +define i8 @test_v128i8(<128 x i8> %a0) nounwind { +; X86-SSE-LABEL: test_v128i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl 
%esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE-NEXT: paddb 72(%ebp), %xmm3 +; X86-SSE-NEXT: paddb 40(%ebp), %xmm1 +; X86-SSE-NEXT: paddb %xmm3, %xmm1 +; X86-SSE-NEXT: paddb 56(%ebp), %xmm2 +; X86-SSE-NEXT: paddb 24(%ebp), %xmm0 +; X86-SSE-NEXT: paddb %xmm2, %xmm0 +; X86-SSE-NEXT: paddb %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE-NEXT: paddb %xmm0, %xmm1 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: # kill: def $al killed $al killed $eax +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_v128i8: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddb %xmm7, %xmm3 +; X64-SSE-NEXT: paddb %xmm5, %xmm1 +; X64-SSE-NEXT: paddb %xmm3, %xmm1 +; X64-SSE-NEXT: paddb %xmm6, %xmm2 +; X64-SSE-NEXT: paddb %xmm4, %xmm0 +; X64-SSE-NEXT: paddb %xmm2, %xmm0 +; X64-SSE-NEXT: paddb %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-SSE-NEXT: paddb %xmm0, %xmm1 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: psadbw %xmm1, %xmm0 +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: # kill: def $al killed $al killed $eax +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_v128i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X86-AVX1-NEXT: vpaddb 24(%ebp), %xmm4, %xmm4 +; X86-AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb 8(%ebp), %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_v128i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X64-AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X64-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; X64-AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X86-AVX2-LABEL: test_v128i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddb 8(%ebp), %ymm1, %ymm1 +; X86-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: movl %ebp, %esp +; 
X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: test_v128i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: @@ -1398,3 +2094,6 @@ declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; X64-SSE2: {{.*}} +; X64-SSE4: {{.*}} From 84812fd1e7036ba28fb2136839b7a0a0d9010a63 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 12 May 2026 12:53:16 +0100 Subject: [PATCH 427/538] [NFC][DAG] scalarizeExtractedBinOp - pull out constant build vector detection into isExtractFree helper (#197155) Prep work for #196493 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 14bf2b704c4da..c265d5d5ec982 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24897,13 +24897,14 @@ static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG, // Extracting an element of a vector constant is constant-folded, so this // transform is just replacing a vector op with a scalar op while moving the // extract. + auto IsExtractFree = [](SDValue Op) { + APInt SplatVal; + return isAnyConstantBuildVector(Op, true) || + ISD::isConstantSplatVector(Op.getNode(), SplatVal); + }; SDValue Op0 = Vec.getOperand(0); SDValue Op1 = Vec.getOperand(1); - APInt SplatVal; - if (!isAnyConstantBuildVector(Op0, true) && - !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) && - !isAnyConstantBuildVector(Op1, true) && - !ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) + if (!IsExtractFree(Op0) && !IsExtractFree(Op1)) return SDValue(); // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C' From 14bf8e76c021c200b6e37b9df996dbfc0b5dbd4e Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 12 May 2026 07:04:11 -0500 Subject: [PATCH 428/538] [libclc] Base the build around `add_sources` instead of source list (#197034) Summary: The current build uses a curated + deduplicated source list. This PR seeks to simplify this a little bit and canonicalize the behavior. 
Now we create the targets up-front, `clc` and `opencl`. We add the directories which add sources to these targets. We normalize the architecture to the variants. We always add target-specific versions first. When we add sources we check if the file already exists and defer to the architecture-specific one. This normalizes the behavior; the directories are now laid out like `clc/lib/<arch>/`. We normalize these to `amdgpu`, `nvptx`, and `spirv` respectively. We use the OS for the newly created vulkan target. We now control variants by checking if the directory for that variant exists, so it's nested more naturally. Hopefully this makes more sense; the goal is to exercise the fact that we have individual builds now. Previously this did not work because you could not add_subdirectory more than once. --- libclc/CMakeLists.txt | 141 +++++++---------- libclc/clc/CMakeLists.txt | 5 + libclc/clc/lib/amdgpu/CMakeLists.txt | 8 +- libclc/clc/lib/generic/CMakeLists.txt | 13 +- .../{ptx-nvidiacl => nvptx}/CMakeLists.txt | 3 +- .../{ptx-nvidiacl => nvptx}/math/clc_log.cl | 0 .../{ptx-nvidiacl => nvptx}/math/clc_rsqrt.cl | 0 .../{ptx-nvidiacl => nvptx}/math/clc_sinpi.cl | 0 .../{ptx-nvidiacl => nvptx}/math/clc_sqrt.cl | 0 .../relational/clc_isinf.cl | 0 .../synchronization/clc_work_group_barrier.cl | 0 .../workitem/clc_get_global_id.cl | 0 .../workitem/clc_get_global_size.cl | 0 .../workitem/clc_get_group_id.cl | 0 .../workitem/clc_get_local_id.cl | 0 .../workitem/clc_get_local_size.cl | 0 .../workitem/clc_get_max_sub_group_size.cl | 0 .../workitem/clc_get_num_groups.cl | 0 .../workitem/clc_get_sub_group_local_id.cl | 0 libclc/clc/lib/spirv/CMakeLists.txt | 7 +- libclc/clc/lib/spirv/vulkan/CMakeLists.txt | 4 + .../{ => spirv}/vulkan/integer/clc_mul_hi.cl | 0 .../lib/{ => spirv}/vulkan/math/clc_sw_fma.cl | 0 libclc/clc/lib/vulkan/CMakeLists.txt | 5 - libclc/cmake/modules/AddLibclc.cmake | 144 +++++++++--------- libclc/opencl/CMakeLists.txt | 9 ++ libclc/opencl/lib/amdgpu/CMakeLists.txt | 3 +-
libclc/opencl/lib/generic/CMakeLists.txt | 5 +- libclc/opencl/lib/spirv/CMakeLists.txt | 13 +- .../lib/{ => spirv}/vulkan/CMakeLists.txt | 12 +- .../vulkan/conversion/convert_float.inc | 0 .../vulkan/conversion/convert_float2float.cl | 0 .../vulkan/conversion/convert_float2int.cl | 0 .../vulkan/conversion/convert_int2float.cl | 0 .../vulkan/conversion/convert_integer.cl | 0 .../vulkan/conversion/convert_integer.inc | 0 .../opencl/lib/{ => spirv}/vulkan/math/fma.cl | 0 .../{ => spirv}/vulkan/shared/vstore_half.cl | 0 .../{ => spirv}/vulkan/shared/vstore_half.inc | 0 39 files changed, 171 insertions(+), 201 deletions(-) create mode 100644 libclc/clc/CMakeLists.txt rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/CMakeLists.txt (83%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/math/clc_log.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/math/clc_rsqrt.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/math/clc_sinpi.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/math/clc_sqrt.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/relational/clc_isinf.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/synchronization/clc_work_group_barrier.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_global_id.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_global_size.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_group_id.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_local_id.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_local_size.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_max_sub_group_size.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_num_groups.cl (100%) rename libclc/clc/lib/{ptx-nvidiacl => nvptx}/workitem/clc_get_sub_group_local_id.cl (100%) create mode 100644 libclc/clc/lib/spirv/vulkan/CMakeLists.txt rename libclc/clc/lib/{ => spirv}/vulkan/integer/clc_mul_hi.cl 
(100%) rename libclc/clc/lib/{ => spirv}/vulkan/math/clc_sw_fma.cl (100%) delete mode 100644 libclc/clc/lib/vulkan/CMakeLists.txt create mode 100644 libclc/opencl/CMakeLists.txt rename libclc/opencl/lib/{ => spirv}/vulkan/CMakeLists.txt (85%) rename libclc/opencl/lib/{ => spirv}/vulkan/conversion/convert_float.inc (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/conversion/convert_float2float.cl (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/conversion/convert_float2int.cl (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/conversion/convert_int2float.cl (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/conversion/convert_integer.cl (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/conversion/convert_integer.inc (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/math/fma.cl (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/shared/vstore_half.cl (100%) rename libclc/opencl/lib/{ => spirv}/vulkan/shared/vstore_half.inc (100%) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 3f84458336950..334faa1a02667 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -24,8 +24,8 @@ option( # List of all supported architectures. set( LIBCLC_ARCHS_ALL amdgpu amdgcn nvptx64 ) -set( LIBCLC_ARCHS_SPIRV spirv spirv32 spirv64) -list( APPEND LIBCLC_ARCHS_ALL ${LIBCLC_ARCHS_SPIRV}) +set( LIBCLC_ARCHS_SPIRV spirv spirv32 spirv64 ) +list( APPEND LIBCLC_ARCHS_ALL ${LIBCLC_ARCHS_SPIRV} ) set(LIBCLC_TARGET ${LLVM_DEFAULT_TARGET_TRIPLE}) @@ -33,10 +33,13 @@ if(NOT LIBCLC_TARGET) message(FATAL_ERROR "libclc target is empty\n") endif() -string( REPLACE "-" ";" _target_components ${LIBCLC_TARGET} ) -list(GET _target_components 0 _target_arch) -if(NOT "${_target_arch}" IN_LIST LIBCLC_ARCHS_ALL) - message(FATAL_ERROR "Unknown libclc target architecture: ${_target_arch}\n" +# Parse the target triple into arch and OS components. 
+string(REPLACE "-" ";" triple_components ${LIBCLC_TARGET}) +list(GET triple_components 0 LIBCLC_TARGET_ARCH) +list(GET triple_components 2 LIBCLC_TARGET_OS) + +if(NOT "${LIBCLC_TARGET_ARCH}" IN_LIST LIBCLC_ARCHS_ALL) + message(FATAL_ERROR "Unknown libclc target architecture: ${LIBCLC_TARGET_ARCH}\n" "Target was: ${LIBCLC_TARGET}\n" "Valid architectures are: ${LIBCLC_ARCHS_ALL}\n") endif() @@ -98,12 +101,17 @@ endif() message(STATUS "libclc target '${LIBCLC_TARGET}' is enabled") -string( REPLACE "-" ";" TRIPLE ${LIBCLC_TARGET} ) -list(GET TRIPLE 0 ARCH) -list(GET TRIPLE 2 OS) +# Map the LLVM target architecture to the standard directory name. +if(LIBCLC_TARGET_ARCH STREQUAL amdgcn OR LIBCLC_TARGET_ARCH STREQUAL amdgpu) + set(LIBCLC_ARCH_DIR amdgpu) +elseif(LIBCLC_TARGET_ARCH STREQUAL nvptx64) + set(LIBCLC_ARCH_DIR nvptx) +elseif(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV) + set(LIBCLC_ARCH_DIR spirv) +endif() -if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) - if(NOT OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND AND NOT llvm-spirv_exe) +if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV) + if(NOT LIBCLC_TARGET_OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND AND NOT llvm-spirv_exe) message(FATAL_ERROR "SPIR-V backend or llvm-spirv is required for libclc ${LIBCLC_TARGET}") endif() endif() @@ -114,35 +122,12 @@ foreach( tool IN ITEMS opt llvm-link ) endif() endforeach() -add_subdirectory(clc/lib/generic) -add_subdirectory(opencl/lib/generic) - -if(ARCH STREQUAL amdgcn) - add_subdirectory(clc/lib/amdgpu) - add_subdirectory(opencl/lib/amdgpu) -elseif(ARCH STREQUAL nvptx64) - add_subdirectory(clc/lib/ptx-nvidiacl) -elseif(ARCH STREQUAL spirv OR ARCH STREQUAL spirv32 OR ARCH STREQUAL spirv64) - if(OS STREQUAL vulkan) - add_subdirectory(clc/lib/vulkan) - add_subdirectory(opencl/lib/vulkan) - else() - add_subdirectory(clc/lib/spirv) - add_subdirectory(opencl/lib/spirv) - endif() -endif() - -add_custom_target( libclc ALL ) - -add_custom_target( libclc-opencl-builtins COMMENT 
"Build libclc OpenCL builtins" ) -add_dependencies( libclc libclc-opencl-builtins ) - # Determine the clang target triple. Vulkan and SPIR-V backend targets use the # triple directly; other SPIR-V targets fall back to the legacy SPIR target. set(clang_triple ${LIBCLC_TARGET}) -if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) - if(NOT OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND) - if(ARCH STREQUAL spirv) +if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV) + if(NOT LIBCLC_TARGET_OS STREQUAL vulkan AND NOT LIBCLC_USE_SPIRV_BACKEND) + if(LIBCLC_TARGET_ARCH STREQUAL spirv) set(clang_triple spir--) else() set(clang_triple spir64--) @@ -153,10 +138,10 @@ endif() # Address space values. set(private_addrspace_val 0) set(generic_addrspace_val 0) -if(ARCH STREQUAL amdgcn) +if(LIBCLC_TARGET_ARCH STREQUAL amdgcn) set(private_addrspace_val 5) endif() -if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV AND NOT OS STREQUAL vulkan) +if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV AND NOT LIBCLC_TARGET_OS STREQUAL vulkan) set(generic_addrspace_val 4) endif() @@ -165,50 +150,18 @@ set(target_compile_flags) set(target_extra_defines) set(opt_flags -O3) -if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) - if(OS STREQUAL vulkan) +if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV) + if(LIBCLC_TARGET_OS STREQUAL vulkan) list(APPEND target_compile_flags -Wno-unknown-assumption -U__opencl_c_int64) else() list(APPEND target_compile_flags -O0 -finline-hint-functions) list(APPEND target_extra_defines CLC_SPIRV) set(opt_flags) endif() -elseif(ARCH STREQUAL amdgcn) +elseif(LIBCLC_TARGET_ARCH STREQUAL amdgcn) list(APPEND target_compile_flags "SHELL:-Xclang -mcode-object-version=none") endif() -# Collect CLC sources; target-specific sources override generic ones by basename. 
-set(_clc_overrides) -if(ARCH STREQUAL amdgcn) - list(APPEND _clc_overrides ${CLC_AMDGPU_SOURCES}) -elseif(ARCH STREQUAL nvptx64 AND (OS STREQUAL nvidiacl OR OS STREQUAL cuda)) - list(APPEND _clc_overrides ${CLC_PTX_NVIDIACL_SOURCES}) -elseif(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) - if(OS STREQUAL vulkan) - list(APPEND _clc_overrides ${CLC_VULKAN_SOURCES}) - else() - list(APPEND _clc_overrides ${CLC_SPIRV_SOURCES}) - endif() -endif() -libclc_merge_sources(clc_sources ${CLC_GENERIC_SOURCES} ${_clc_overrides}) - -# Collect OpenCL sources. SPIR-V and Vulkan targets use self-contained -# subsets while others merge with target-specific overrides. -if(ARCH IN_LIST LIBCLC_ARCHS_SPIRV) - if(OS STREQUAL vulkan) - set(opencl_sources ${OPENCL_VULKAN_SOURCES}) - else() - set(opencl_sources ${OPENCL_SPIRV_SOURCES}) - endif() -else() - set(_opencl_overrides) - if(ARCH STREQUAL amdgcn) - list(APPEND _opencl_overrides ${OPENCL_AMDGCN_SOURCES}) - endif() - libclc_merge_sources(opencl_sources - ${OPENCL_GENERIC_SOURCES} ${_opencl_overrides}) -endif() - # Common compile options shared by CLC and OpenCL libraries. set(compile_flags -flto @@ -227,35 +180,45 @@ set(compile_flags ${target_compile_flags} ) -set(_common_defs +set(common_defs ${target_extra_defines} __CLC_PRIVATE_ADDRSPACE_VAL=${private_addrspace_val} __CLC_GENERIC_ADDRSPACE_VAL=${generic_addrspace_val} ) -# Build the CLC internal builtins library. -string(REPLACE "-" "_" lib_suffix ${LIBCLC_TARGET}) -set(clc_lib clc_builtins_${lib_suffix}) -add_libclc_builtin_library(${clc_lib} - SOURCES ${clc_sources} +add_custom_target(libclc ALL) +add_custom_target(libclc-opencl-builtins COMMENT "Build libclc OpenCL builtins") +add_dependencies(libclc libclc-opencl-builtins) + +# Configure the CLC internal builtins library. 
+set(LIBCLC_CLC_TARGET clc) +libclc_add_builtin_library(${LIBCLC_CLC_TARGET} COMPILE_OPTIONS ${compile_flags} INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/clc/include - COMPILE_DEFINITIONS ${_common_defs} + COMPILE_DEFINITIONS ${common_defs} FOLDER "libclc/Device IR/CLC" ) -# Build, link, and install the final OpenCL builtins library. -add_libclc_library(libclc-${LIBCLC_TARGET} - ARCH ${ARCH} - TRIPLE ${clang_triple} - TARGET_TRIPLE ${LIBCLC_TARGET} - SOURCES ${opencl_sources} +set(LIBCLC_OPENCL_TARGET opencl) +libclc_add_builtin_library(${LIBCLC_OPENCL_TARGET} COMPILE_OPTIONS ${compile_flags} "SHELL:-Xclang -fdeclare-opencl-builtins" INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/clc/include ${CMAKE_CURRENT_SOURCE_DIR}/opencl/include - COMPILE_DEFINITIONS ${_common_defs} - INTERNALIZE_LIBRARIES ${clc_lib} + COMPILE_DEFINITIONS ${common_defs} + FOLDER "libclc/Device IR/Intermediate" +) + +add_subdirectory(clc) +add_subdirectory(opencl) + +# Link and install the final OpenCL builtins library. +libclc_add_library(libclc-${LIBCLC_TARGET} + ARCH ${LIBCLC_TARGET_ARCH} + TRIPLE ${clang_triple} + TARGET_TRIPLE ${LIBCLC_TARGET} + LIBRARIES ${LIBCLC_OPENCL_TARGET} + INTERNALIZE_LIBRARIES ${LIBCLC_CLC_TARGET} OPT_FLAGS ${opt_flags} OUTPUT_FILENAME libclc PARENT_TARGET libclc-opencl-builtins diff --git a/libclc/clc/CMakeLists.txt b/libclc/clc/CMakeLists.txt new file mode 100644 index 0000000000000..bd75adcf75243 --- /dev/null +++ b/libclc/clc/CMakeLists.txt @@ -0,0 +1,5 @@ +# Add the target specific files first so they can override the generic fallback. 
+if(LIBCLC_ARCH_DIR AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/lib/${LIBCLC_ARCH_DIR}) + add_subdirectory(lib/${LIBCLC_ARCH_DIR}) +endif() +add_subdirectory(lib/generic) diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt index a5cd47fab4462..910a0cf1765df 100644 --- a/libclc/clc/lib/amdgpu/CMakeLists.txt +++ b/libclc/clc/lib/amdgpu/CMakeLists.txt @@ -1,5 +1,4 @@ -libclc_configure_source_list(CLC_AMDGPU_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} +libclc_add_sources(${LIBCLC_CLC_TARGET} FILES address_space/clc_qualifier.cl math/clc_cbrt.cl math/clc_exp.cl @@ -45,9 +44,10 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES workitem/clc_get_num_sub_groups.cl workitem/clc_get_sub_group_id.cl workitem/clc_get_sub_group_size.cl - workitem/clc_get_work_dim.cl) + workitem/clc_get_work_dim.cl +) -libclc_configure_source_options(${CMAKE_CURRENT_SOURCE_DIR} -fapprox-func +libclc_set_source_options(-fapprox-func math/clc_native_exp.cl math/clc_native_exp2.cl math/clc_native_log10.cl diff --git a/libclc/clc/lib/generic/CMakeLists.txt b/libclc/clc/lib/generic/CMakeLists.txt index 168a0f1ff1e84..40261545fce91 100644 --- a/libclc/clc/lib/generic/CMakeLists.txt +++ b/libclc/clc/lib/generic/CMakeLists.txt @@ -1,5 +1,4 @@ -libclc_configure_source_list(CLC_GENERIC_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} +libclc_add_sources(${LIBCLC_CLC_TARGET} FILES async/clc_prefetch.cl atomic/clc_atomic_compare_exchange.cl atomic/clc_atomic_dec.cl @@ -209,7 +208,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES workitem/clc_get_sub_group_size.cl ) -libclc_configure_source_options(${CMAKE_CURRENT_SOURCE_DIR} -fapprox-func +libclc_set_source_options(-fapprox-func math/clc_native_cos.cl math/clc_native_divide.cl math/clc_native_exp.cl @@ -226,8 +225,10 @@ libclc_configure_source_options(${CMAKE_CURRENT_SOURCE_DIR} -fapprox-func math/clc_native_tan.cl math/clc_div_fast.cl math/clc_recip_fast.cl - math/clc_sqrt_fast.cl) + math/clc_sqrt_fast.cl +) 
-libclc_configure_source_options(${CMAKE_CURRENT_SOURCE_DIR} -cl-fp32-correctly-rounded-divide-sqrt +libclc_set_source_options(-cl-fp32-correctly-rounded-divide-sqrt math/clc_div_cr.cl - math/clc_sqrt_cr.cl) + math/clc_sqrt_cr.cl +) diff --git a/libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt b/libclc/clc/lib/nvptx/CMakeLists.txt similarity index 83% rename from libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt rename to libclc/clc/lib/nvptx/CMakeLists.txt index 6eb0baab1c0bb..2345d5aeed77b 100644 --- a/libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt +++ b/libclc/clc/lib/nvptx/CMakeLists.txt @@ -1,5 +1,4 @@ -libclc_configure_source_list(CLC_PTX_NVIDIACL_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} +libclc_add_sources(${LIBCLC_CLC_TARGET} FILES math/clc_log.cl math/clc_rsqrt.cl math/clc_sinpi.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_log.cl b/libclc/clc/lib/nvptx/math/clc_log.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/math/clc_log.cl rename to libclc/clc/lib/nvptx/math/clc_log.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_rsqrt.cl b/libclc/clc/lib/nvptx/math/clc_rsqrt.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/math/clc_rsqrt.cl rename to libclc/clc/lib/nvptx/math/clc_rsqrt.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_sinpi.cl b/libclc/clc/lib/nvptx/math/clc_sinpi.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/math/clc_sinpi.cl rename to libclc/clc/lib/nvptx/math/clc_sinpi.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_sqrt.cl b/libclc/clc/lib/nvptx/math/clc_sqrt.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/math/clc_sqrt.cl rename to libclc/clc/lib/nvptx/math/clc_sqrt.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/relational/clc_isinf.cl b/libclc/clc/lib/nvptx/relational/clc_isinf.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/relational/clc_isinf.cl rename to libclc/clc/lib/nvptx/relational/clc_isinf.cl diff --git 
a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl b/libclc/clc/lib/nvptx/synchronization/clc_work_group_barrier.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl rename to libclc/clc/lib/nvptx/synchronization/clc_work_group_barrier.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_global_id.cl b/libclc/clc/lib/nvptx/workitem/clc_get_global_id.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_global_id.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_global_id.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_global_size.cl b/libclc/clc/lib/nvptx/workitem/clc_get_global_size.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_global_size.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_global_size.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_group_id.cl b/libclc/clc/lib/nvptx/workitem/clc_get_group_id.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_group_id.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_group_id.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_id.cl b/libclc/clc/lib/nvptx/workitem/clc_get_local_id.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_id.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_local_id.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_size.cl b/libclc/clc/lib/nvptx/workitem/clc_get_local_size.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_size.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_local_size.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_max_sub_group_size.cl b/libclc/clc/lib/nvptx/workitem/clc_get_max_sub_group_size.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_max_sub_group_size.cl rename to 
libclc/clc/lib/nvptx/workitem/clc_get_max_sub_group_size.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_num_groups.cl b/libclc/clc/lib/nvptx/workitem/clc_get_num_groups.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_num_groups.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_num_groups.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_sub_group_local_id.cl b/libclc/clc/lib/nvptx/workitem/clc_get_sub_group_local_id.cl similarity index 100% rename from libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_sub_group_local_id.cl rename to libclc/clc/lib/nvptx/workitem/clc_get_sub_group_local_id.cl diff --git a/libclc/clc/lib/spirv/CMakeLists.txt b/libclc/clc/lib/spirv/CMakeLists.txt index b7481615b9414..5f361c95c4003 100644 --- a/libclc/clc/lib/spirv/CMakeLists.txt +++ b/libclc/clc/lib/spirv/CMakeLists.txt @@ -1,5 +1,8 @@ -libclc_configure_source_list(CLC_SPIRV_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} +if(LIBCLC_TARGET_OS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBCLC_TARGET_OS}) + add_subdirectory(${LIBCLC_TARGET_OS}) +endif() + +libclc_add_sources(${LIBCLC_CLC_TARGET} FILES math/clc_fmax.cl math/clc_fmin.cl subnormal_config.cl diff --git a/libclc/clc/lib/spirv/vulkan/CMakeLists.txt b/libclc/clc/lib/spirv/vulkan/CMakeLists.txt new file mode 100644 index 0000000000000..499ce274a500c --- /dev/null +++ b/libclc/clc/lib/spirv/vulkan/CMakeLists.txt @@ -0,0 +1,4 @@ +libclc_add_sources(${LIBCLC_CLC_TARGET} FILES + integer/clc_mul_hi.cl + math/clc_sw_fma.cl +) diff --git a/libclc/clc/lib/vulkan/integer/clc_mul_hi.cl b/libclc/clc/lib/spirv/vulkan/integer/clc_mul_hi.cl similarity index 100% rename from libclc/clc/lib/vulkan/integer/clc_mul_hi.cl rename to libclc/clc/lib/spirv/vulkan/integer/clc_mul_hi.cl diff --git a/libclc/clc/lib/vulkan/math/clc_sw_fma.cl b/libclc/clc/lib/spirv/vulkan/math/clc_sw_fma.cl similarity index 100% rename from libclc/clc/lib/vulkan/math/clc_sw_fma.cl rename to 
libclc/clc/lib/spirv/vulkan/math/clc_sw_fma.cl diff --git a/libclc/clc/lib/vulkan/CMakeLists.txt b/libclc/clc/lib/vulkan/CMakeLists.txt deleted file mode 100644 index 172e3be32d65c..0000000000000 --- a/libclc/clc/lib/vulkan/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -libclc_configure_source_list(CLC_VULKAN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} - integer/clc_mul_hi.cl - math/clc_sw_fma.cl -) diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 41297e8eb1e92..2d547e13a4e04 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -1,67 +1,71 @@ -# Converts a list of relative source paths to absolute paths and exports -# it to the parent scope. -macro(libclc_configure_source_list variable path) - set(${variable} ${ARGN}) - list(TRANSFORM ${variable} PREPEND "${path}/") - set(${variable} ${${variable}} PARENT_SCOPE) -endmacro() - -# Appends a compile option to the given source files. Paths are relative -# to `path` and the property is set in the top-level libclc directory scope. -macro(libclc_configure_source_options path option) - set(_option_srcs ${ARGN}) - list(TRANSFORM _option_srcs PREPEND "${path}/") - set_property(SOURCE ${_option_srcs} - DIRECTORY ${LIBCLC_SOURCE_DIR} - APPEND PROPERTY COMPILE_OPTIONS ${option} - ) -endmacro() - -# Merges OpenCL C source file lists with priority deduplication. +# Adds source files to a libclc builtin library target with deduplication. If a +# source with the same basename already exists in the target's SOURCES property +# the new file is skipped. This enables target-specific directories to override +# generic implementations when they are included first. # -# All arguments after the output variable name are treated as source file -# paths. When multiple files share the same basename, the last occurrence -# wins. This allows target-specific files to automatically override generic -# ones. 
-function(libclc_merge_sources output) - set(all_sources ${ARGN}) - set(result) - set(seen_names) - - list(REVERSE all_sources) - foreach(f ${all_sources}) - get_filename_component(name "${f}" NAME) - if(NOT name IN_LIST seen_names) - list(APPEND seen_names "${name}") - list(PREPEND result "${f}") +# Sources are specified as paths relative to CMAKE_CURRENT_SOURCE_DIR, or +# relative to BASE_DIR if provided. +function(libclc_add_sources target) + cmake_parse_arguments(ARG "" "BASE_DIR" "FILES" ${ARGN}) + if(NOT ARG_BASE_DIR) + set(ARG_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + get_target_property(existing ${target} SOURCES) + + set(seen) + foreach(file IN LISTS existing) + get_filename_component(name "${file}" NAME) + list(APPEND seen "${name}") + endforeach() + + set(new_sources) + foreach(rel_src IN LISTS ARG_FILES) + get_filename_component(name "${rel_src}" NAME) + if(NOT name IN_LIST seen) + list(APPEND new_sources "${ARG_BASE_DIR}/${rel_src}") + list(APPEND seen "${name}") endif() endforeach() - set(${output} ${result} PARENT_SCOPE) + if(new_sources) + target_sources(${target} PRIVATE ${new_sources}) + set(inc_dirs) + foreach(file IN LISTS new_sources) + get_filename_component(dir "${file}" DIRECTORY) + list(APPEND inc_dirs "${dir}") + endforeach() + list(REMOVE_DUPLICATES inc_dirs) + target_include_directories(${target} PRIVATE ${inc_dirs}) + endif() endfunction() -# Creates a static library target for libclc builtins. Derives include -# directories to locate `.inc` files in the same directory. -function(add_libclc_builtin_library target_name) +# Appends a compile option to the given source files. Source paths are +# relative to CMAKE_CURRENT_SOURCE_DIR. The property is set in the +# top-level libclc directory scope. 
+function(libclc_set_source_options option) + set(srcs ${ARGN}) + list(TRANSFORM srcs PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/") + set_property(SOURCE ${srcs} + DIRECTORY ${LIBCLC_SOURCE_DIR} + APPEND PROPERTY COMPILE_OPTIONS ${option} + ) +endfunction() + +# Creates a static library target for libclc builtins and configures its +# compile options, include directories, and definitions. Subdirectories +# populate sources via libclc_add_sources() after this call. +function(libclc_add_builtin_library target_name) cmake_parse_arguments(ARG "" "FOLDER" - "SOURCES;COMPILE_OPTIONS;INCLUDE_DIRS;COMPILE_DEFINITIONS" + "COMPILE_OPTIONS;INCLUDE_DIRS;COMPILE_DEFINITIONS" ${ARGN} ) - set(_inc_dirs) - foreach(f ${ARG_SOURCES}) - get_filename_component(dir ${f} DIRECTORY) - list(APPEND _inc_dirs ${dir}) - endforeach() - list(REMOVE_DUPLICATES _inc_dirs) - - add_library(${target_name} STATIC ${ARG_SOURCES}) + add_library(${target_name} STATIC) target_compile_options(${target_name} PRIVATE ${ARG_COMPILE_OPTIONS}) - target_include_directories(${target_name} PRIVATE - ${ARG_INCLUDE_DIRS} ${_inc_dirs} - ) + target_include_directories(${target_name} PRIVATE ${ARG_INCLUDE_DIRS}) target_compile_definitions(${target_name} PRIVATE ${ARG_COMPILE_DEFINITIONS}) set_target_properties(${target_name} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} @@ -71,7 +75,7 @@ endfunction() # Links one or more libclc builtin libraries together, optionally # internalizing dependencies, then produces a final .bc or .spv file. 
-function(link_libclc_builtin_library target_name) +function(libclc_link_library target_name) cmake_parse_arguments(ARG "" "ARCH;TRIPLE;TARGET_TRIPLE;FOLDER;OUTPUT_FILENAME" @@ -80,10 +84,10 @@ function(link_libclc_builtin_library target_name) ) if(NOT ARG_OUTPUT_FILENAME) - message(FATAL_ERROR "OUTPUT_FILENAME is required for link_libclc_builtin_library") + message(FATAL_ERROR "OUTPUT_FILENAME is required for libclc_link_library") endif() if(NOT ARG_LIBRARIES) - message(FATAL_ERROR "LIBRARIES is required for link_libclc_builtin_library") + message(FATAL_ERROR "LIBRARIES is required for libclc_link_library") endif() set(library_dir ${LIBCLC_OUTPUT_LIBRARY_DIR}/${ARG_TARGET_TRIPLE}) @@ -113,7 +117,7 @@ function(link_libclc_builtin_library target_name) string(REPLACE "-" ";" triple_parts "${ARG_TRIPLE}") list(GET triple_parts 2 triple_os) - if((ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv32 OR ARG_ARCH STREQUAL spirv64) AND NOT triple_os STREQUAL vulkan) + if(ARG_ARCH IN_LIST LIBCLC_ARCHS_SPIRV AND NOT triple_os STREQUAL vulkan) # SPIR-V targets produce a .spv file from the linked bitcode. set(builtins_lib ${library_dir}/${ARG_OUTPUT_FILENAME}.spv) if(LIBCLC_USE_SPIRV_BACKEND) @@ -148,41 +152,31 @@ function(link_libclc_builtin_library target_name) ) endfunction() -# Builds a builtins library from sources, links it with any internalized -# dependencies via link_libclc_builtin_library, and adds a verification test -# for unresolved symbols. -function(add_libclc_library target_name) +# Links builtin library targets, produces the final output file, and +# registers it for installation. 
+function(libclc_add_library target_name) cmake_parse_arguments(ARG "" "ARCH;TRIPLE;TARGET_TRIPLE;OUTPUT_FILENAME;PARENT_TARGET" - "SOURCES;COMPILE_OPTIONS;INCLUDE_DIRS;COMPILE_DEFINITIONS;INTERNALIZE_LIBRARIES;OPT_FLAGS" + "LIBRARIES;INTERNALIZE_LIBRARIES;OPT_FLAGS" ${ARGN} ) if(NOT ARG_OUTPUT_FILENAME) - message(FATAL_ERROR "OUTPUT_FILENAME is required for add_libclc_library") + message(FATAL_ERROR "OUTPUT_FILENAME is required for libclc_add_library") endif() if(NOT ARG_PARENT_TARGET) - message(FATAL_ERROR "PARENT_TARGET is required for add_libclc_library") + message(FATAL_ERROR "PARENT_TARGET is required for libclc_add_library") endif() - if(NOT ARG_SOURCES) - message(FATAL_ERROR "SOURCES is required for add_libclc_library") + if(NOT ARG_LIBRARIES) + message(FATAL_ERROR "LIBRARIES is required for libclc_add_library") endif() - set(builtins_target ${target_name}_clc_builtins) - add_libclc_builtin_library(${builtins_target} - SOURCES ${ARG_SOURCES} - COMPILE_OPTIONS ${ARG_COMPILE_OPTIONS} - INCLUDE_DIRS ${ARG_INCLUDE_DIRS} - COMPILE_DEFINITIONS ${ARG_COMPILE_DEFINITIONS} - FOLDER "libclc/Device IR/Intermediate" - ) - - link_libclc_builtin_library(${target_name} + libclc_link_library(${target_name} ARCH ${ARG_ARCH} TRIPLE ${ARG_TRIPLE} TARGET_TRIPLE ${ARG_TARGET_TRIPLE} - LIBRARIES ${builtins_target} + LIBRARIES ${ARG_LIBRARIES} INTERNALIZE_LIBRARIES ${ARG_INTERNALIZE_LIBRARIES} OPT_FLAGS ${ARG_OPT_FLAGS} OUTPUT_FILENAME "${ARG_OUTPUT_FILENAME}" diff --git a/libclc/opencl/CMakeLists.txt b/libclc/opencl/CMakeLists.txt new file mode 100644 index 0000000000000..8b4ac40ea827a --- /dev/null +++ b/libclc/opencl/CMakeLists.txt @@ -0,0 +1,9 @@ +# Add the target specific files first so they can override the generic fallback. +if(LIBCLC_ARCH_DIR AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/lib/${LIBCLC_ARCH_DIR}) + add_subdirectory(lib/${LIBCLC_ARCH_DIR}) +endif() + +# SPIR-V targets cannot use the generic list of generic functions yet. 
+if(NOT LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV) + add_subdirectory(lib/generic) +endif() diff --git a/libclc/opencl/lib/amdgpu/CMakeLists.txt b/libclc/opencl/lib/amdgpu/CMakeLists.txt index a42f2751906bf..16a75349d8a4a 100644 --- a/libclc/opencl/lib/amdgpu/CMakeLists.txt +++ b/libclc/opencl/lib/amdgpu/CMakeLists.txt @@ -1,5 +1,4 @@ -libclc_configure_source_list(OPENCL_AMDGCN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} +libclc_add_sources(${LIBCLC_OPENCL_TARGET} FILES async/wait_group_events.cl printf/__printf_alloc.cl ) diff --git a/libclc/opencl/lib/generic/CMakeLists.txt b/libclc/opencl/lib/generic/CMakeLists.txt index 4ad60248139ae..5768ba9446ff1 100644 --- a/libclc/opencl/lib/generic/CMakeLists.txt +++ b/libclc/opencl/lib/generic/CMakeLists.txt @@ -1,5 +1,4 @@ -libclc_configure_source_list(OPENCL_GENERIC_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR} +libclc_add_sources(${LIBCLC_OPENCL_TARGET} FILES address_space/qualifier.cl async/async_work_group_copy.cl async/async_work_group_strided_copy.cl @@ -227,7 +226,7 @@ libclc_configure_source_list(OPENCL_GENERIC_SOURCES workitem/get_work_dim.cl ) -libclc_configure_source_options(${CMAKE_CURRENT_SOURCE_DIR} -fapprox-func +libclc_set_source_options(-fapprox-func math/native_cos.cl math/native_divide.cl math/native_exp.cl diff --git a/libclc/opencl/lib/spirv/CMakeLists.txt b/libclc/opencl/lib/spirv/CMakeLists.txt index ab6fea0692c09..ea35940f60d70 100644 --- a/libclc/opencl/lib/spirv/CMakeLists.txt +++ b/libclc/opencl/lib/spirv/CMakeLists.txt @@ -1,9 +1,12 @@ -set(_gen ${CMAKE_CURRENT_SOURCE_DIR}/../generic) +if(LIBCLC_TARGET_OS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBCLC_TARGET_OS}) + add_subdirectory(${LIBCLC_TARGET_OS}) + return() +endif() -# SPIR-V uses a curated subset of generic builtins, so this list is -# self-contained rather than merged with the generic set. -libclc_configure_source_list(OPENCL_SPIRV_SOURCES - ${_gen} +# Non-Vulkan SPIR-V uses a curated subset of generic builtins. 
+libclc_add_sources(${LIBCLC_OPENCL_TARGET} + BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../generic + FILES async/async_work_group_strided_copy.cl async/wait_group_events.cl common/degrees.cl diff --git a/libclc/opencl/lib/vulkan/CMakeLists.txt b/libclc/opencl/lib/spirv/vulkan/CMakeLists.txt similarity index 85% rename from libclc/opencl/lib/vulkan/CMakeLists.txt rename to libclc/opencl/lib/spirv/vulkan/CMakeLists.txt index 5fa2e7a367678..d04959027f0de 100644 --- a/libclc/opencl/lib/vulkan/CMakeLists.txt +++ b/libclc/opencl/lib/spirv/vulkan/CMakeLists.txt @@ -1,9 +1,6 @@ -set(_gen ${CMAKE_CURRENT_SOURCE_DIR}/../generic) - # Vulkan uses a curated subset of generic builtins plus its own overrides, # so this list is self-contained rather than merged with the generic set. -libclc_configure_source_list(_vulkan_sources - ${CMAKE_CURRENT_SOURCE_DIR} +libclc_add_sources(${LIBCLC_OPENCL_TARGET} FILES conversion/convert_float2float.cl conversion/convert_float2int.cl conversion/convert_int2float.cl @@ -12,8 +9,9 @@ libclc_configure_source_list(_vulkan_sources shared/vstore_half.cl ) -libclc_configure_source_list(_gen_sources - ${_gen} +libclc_add_sources(${LIBCLC_OPENCL_TARGET} + BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../generic + FILES geometric/distance.cl geometric/length.cl math/acos.cl @@ -84,5 +82,3 @@ libclc_configure_source_list(_gen_sources math/tanpi.cl math/tgamma.cl ) - -set(OPENCL_VULKAN_SOURCES ${_vulkan_sources} ${_gen_sources} PARENT_SCOPE) diff --git a/libclc/opencl/lib/vulkan/conversion/convert_float.inc b/libclc/opencl/lib/spirv/vulkan/conversion/convert_float.inc similarity index 100% rename from libclc/opencl/lib/vulkan/conversion/convert_float.inc rename to libclc/opencl/lib/spirv/vulkan/conversion/convert_float.inc diff --git a/libclc/opencl/lib/vulkan/conversion/convert_float2float.cl b/libclc/opencl/lib/spirv/vulkan/conversion/convert_float2float.cl similarity index 100% rename from libclc/opencl/lib/vulkan/conversion/convert_float2float.cl rename to 
libclc/opencl/lib/spirv/vulkan/conversion/convert_float2float.cl diff --git a/libclc/opencl/lib/vulkan/conversion/convert_float2int.cl b/libclc/opencl/lib/spirv/vulkan/conversion/convert_float2int.cl similarity index 100% rename from libclc/opencl/lib/vulkan/conversion/convert_float2int.cl rename to libclc/opencl/lib/spirv/vulkan/conversion/convert_float2int.cl diff --git a/libclc/opencl/lib/vulkan/conversion/convert_int2float.cl b/libclc/opencl/lib/spirv/vulkan/conversion/convert_int2float.cl similarity index 100% rename from libclc/opencl/lib/vulkan/conversion/convert_int2float.cl rename to libclc/opencl/lib/spirv/vulkan/conversion/convert_int2float.cl diff --git a/libclc/opencl/lib/vulkan/conversion/convert_integer.cl b/libclc/opencl/lib/spirv/vulkan/conversion/convert_integer.cl similarity index 100% rename from libclc/opencl/lib/vulkan/conversion/convert_integer.cl rename to libclc/opencl/lib/spirv/vulkan/conversion/convert_integer.cl diff --git a/libclc/opencl/lib/vulkan/conversion/convert_integer.inc b/libclc/opencl/lib/spirv/vulkan/conversion/convert_integer.inc similarity index 100% rename from libclc/opencl/lib/vulkan/conversion/convert_integer.inc rename to libclc/opencl/lib/spirv/vulkan/conversion/convert_integer.inc diff --git a/libclc/opencl/lib/vulkan/math/fma.cl b/libclc/opencl/lib/spirv/vulkan/math/fma.cl similarity index 100% rename from libclc/opencl/lib/vulkan/math/fma.cl rename to libclc/opencl/lib/spirv/vulkan/math/fma.cl diff --git a/libclc/opencl/lib/vulkan/shared/vstore_half.cl b/libclc/opencl/lib/spirv/vulkan/shared/vstore_half.cl similarity index 100% rename from libclc/opencl/lib/vulkan/shared/vstore_half.cl rename to libclc/opencl/lib/spirv/vulkan/shared/vstore_half.cl diff --git a/libclc/opencl/lib/vulkan/shared/vstore_half.inc b/libclc/opencl/lib/spirv/vulkan/shared/vstore_half.inc similarity index 100% rename from libclc/opencl/lib/vulkan/shared/vstore_half.inc rename to libclc/opencl/lib/spirv/vulkan/shared/vstore_half.inc From 
e9942d89bf7b513802c61018131fd505a93c45d9 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Tue, 12 May 2026 20:32:32 +0800 Subject: [PATCH 429/538] [DAGCombiner] Use KnownBits in `combineFMulOrFDivWithIntPow2` (#197097) Use `computeKnownBits` to tighten the high bit width bound via `countMaxActiveBits()`, which accounts for known leading zeros. Co-Authored-By: Simon Pilgrim --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 154 ++++++++++++++++++ 2 files changed, 161 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c265d5d5ec982..4d441b844ebdc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19029,16 +19029,17 @@ SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) { ConstOp = peekThroughBitcasts(N->getOperand(ConstOpIdx)); Pow2Op = N->getOperand(1 - ConstOpIdx); - if (Pow2Op.getOpcode() != ISD::UINT_TO_FP && - (Pow2Op.getOpcode() != ISD::SINT_TO_FP || - !DAG.computeKnownBits(Pow2Op).isNonNegative())) + unsigned Pow2Opc = Pow2Op.getOpcode(); + if (Pow2Opc != ISD::UINT_TO_FP && Pow2Opc != ISD::SINT_TO_FP) return false; Pow2Op = Pow2Op.getOperand(0); - // `Log2(Pow2Op) < Pow2Op.getScalarSizeInBits()`. - // TODO: We could use knownbits to make this bound more precise. 
- int MaxExpChange = Pow2Op.getValueType().getScalarSizeInBits(); + KnownBits Pow2OpKnownBits = DAG.computeKnownBits(Pow2Op); + if (Pow2Opc == ISD::SINT_TO_FP && !Pow2OpKnownBits.isNonNegative()) + return false; + + int MaxExpChange = Pow2OpKnownBits.countMaxActiveBits(); auto IsFPConstValid = [N, MaxExpChange, &Mantissa](ConstantFPSDNode *CFP) { if (CFP == nullptr) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index b3c46bc865b25..c757e67d1b1c8 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1712,3 +1712,157 @@ define x86_fp80 @pr128528(i1 %cond) { %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800 ret x86_fp80 %mul } + +define double @fmul_pow_shl_cnt_knownbits(i64 %cnt) nounwind { +; CHECK-SSE-LABEL: fmul_pow_shl_cnt_knownbits: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: andl $15, %edi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 +; CHECK-SSE-NEXT: addq %rdi, %rax +; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: fmul_pow_shl_cnt_knownbits: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: andl $15, %edi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 +; CHECK-AVX-NEXT: addq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq + %cnt_masked = and i64 %cnt, 15 + %shl = shl nuw i64 1, %cnt_masked + %conv = uitofp i64 %shl to double + %mul = fmul double 9.745314e+288, %conv + ret double %mul +} + +define double @fdiv_pow_shl_cnt_knownbits(i64 %cnt) nounwind { +; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_knownbits: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: andl $15, %edi +; CHECK-SSE-NEXT: shlq $52, %rdi +; CHECK-SSE-NEXT: movabsq $243194378260042637, %rax # imm = 0x35FFFFF9F8FD38D +; CHECK-SSE-NEXT: subq %rdi, %rax +; 
CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_knownbits: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: andl $15, %edi +; CHECK-AVX-NEXT: shlq $52, %rdi +; CHECK-AVX-NEXT: movabsq $243194378260042637, %rax # imm = 0x35FFFFF9F8FD38D +; CHECK-AVX-NEXT: subq %rdi, %rax +; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: retq + %cnt_masked = and i64 %cnt, 15 + %shl = shl nuw i64 1, %cnt_masked + %conv = uitofp i64 %shl to double + %mul = fdiv double 2.004168e-292, %conv + ret double %mul +} + +; Negative: 1010+16=1026 >= 1023 +define double @fmul_pow_shl_cnt_fail_knownbits_bad_exp(i64 %cnt) nounwind { +; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movq %rdi, %rcx +; CHECK-SSE-NEXT: andb $15, %cl +; CHECK-SSE-NEXT: movl $1, %eax +; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-SSE-NEXT: shll %cl, %eax +; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rdi, %rcx +; CHECK-AVX2-NEXT: andb $15, %cl +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shll %cl, %eax +; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: andb $15, %cl +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; 
CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: andb $15, %dil +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq + %cnt_masked = and i64 %cnt, 15 + %shl = shl nuw i64 1, %cnt_masked + %conv = uitofp i64 %shl to double + %mul = fmul double 0x7F18000000000000, %conv + ret double %mul +} + +; Negative: -1008-16=-1024 <= -1022 +define double @fdiv_pow_shl_cnt_fail_knownbits_bad_exp(i64 %cnt) nounwind { +; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movq %rdi, %rcx +; CHECK-SSE-NEXT: andb $15, %cl +; CHECK-SSE-NEXT: movl $1, %eax +; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-SSE-NEXT: shll %cl, %eax +; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-SSE-NEXT: movsd {{.*#+}} xmm0 = [5.4683415146672981E-304,0.0E+0] +; CHECK-SSE-NEXT: divsd %xmm1, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rdi, %rcx +; CHECK-AVX2-NEXT: andb $15, %cl +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shll %cl, %eax +; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 +; CHECK-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [5.4683415146672981E-304,0.0E+0] +; CHECK-AVX2-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-ONLY-AVX512F-LABEL: fdiv_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: movq %rdi, %rcx +; CHECK-ONLY-AVX512F-NEXT: andb $15, %cl +; CHECK-ONLY-AVX512F-NEXT: movl $1, %eax +; CHECK-ONLY-AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-ONLY-AVX512F-NEXT: shll %cl, %eax +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 
+; CHECK-ONLY-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = [5.4683415146672981E-304,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fdiv_pow_shl_cnt_fail_knownbits_bad_exp: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: andb $15, %dil +; CHECK-SKX-NEXT: movl $1, %eax +; CHECK-SKX-NEXT: shlxq %rdi, %rax, %rax +; CHECK-SKX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vmovsd {{.*#+}} xmm1 = [5.4683415146672981E-304,0.0E+0] +; CHECK-SKX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq + %cnt_masked = and i64 %cnt, 15 + %shl = shl nuw i64 1, %cnt_masked + %conv = uitofp i64 %shl to double + %mul = fdiv double 0x00F8000000000000, %conv + ret double %mul +} From 333a54663af3bd57216f9e0f71cfd88d12227334 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Tue, 12 May 2026 20:32:58 +0800 Subject: [PATCH 430/538] [NFC][InstCombine] fix duplicate `CreateNot` in ((A^C)^B) & (B^A) fold (#197163) It obviously should use the `NotC` created 4 lines above --- llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index c9f2418fae6fd..5f9fecc7c5675 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2754,7 +2754,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { ? 
Builder.CreateNot(C) : getFreelyInverted(C, C->hasOneUse(), &Builder); if (NotC != nullptr) - return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); + return BinaryOperator::CreateAnd(Op1, NotC); } // (A | B) & (~A ^ B) -> A & B From d82b5630fd27ae4312c2aef68b7475f83050f413 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Tue, 12 May 2026 08:34:50 -0400 Subject: [PATCH 431/538] Add LIT_UNSUPPORTED support to lit testing (#193766) Add LIT_UNSUPPORTED support to lit, mirroring the existing LIT_XFAIL implementation. This allows tests to be marked as UNSUPPORTED via command line arguments (--unsupported, --unsupported-not) or environment variables (LIT_UNSUPPORTED, LIT_UNSUPPORTED_NOT). This feature enables users to dynamically mark tests as unsupported without modifying test files, useful for CI/CD pipelines and platform-specific test filtering. Assisted by AI. --- llvm/docs/CommandGuide/lit.rst | 21 ++++++++++++++++++++ llvm/utils/lit/lit/Test.py | 6 ++++++ llvm/utils/lit/lit/cl_arguments.py | 14 +++++++++++++ llvm/utils/lit/lit/main.py | 13 +++++++++++++ llvm/utils/lit/tests/unsupported-cl.py | 27 ++++++++++++++++++++++++++ 5 files changed, 81 insertions(+) create mode 100644 llvm/utils/lit/tests/unsupported-cl.py diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst index 2e14f53d110cb..016e33d06b636 100644 --- a/llvm/docs/CommandGuide/lit.rst +++ b/llvm/docs/CommandGuide/lit.rst @@ -366,6 +366,27 @@ The timing data is stored in the `test_exec_root` in a file named primary purpose is to suppress an ``XPASS`` result without modifying a test case that uses the ``XFAIL`` directive. +.. option:: --unsupported LIST + + Treat those tests whose name is in the semicolon separated list ``LIST`` as + ``UNSUPPORTED``. This can be helpful when one does not want to modify the test + suite. 
The environment variable ``LIT_UNSUPPORTED`` can be also used in place + of this option, which is especially useful in environments where the call to + ``lit`` is issued indirectly. + + The syntax for specifying test names is the same as for :option:`--xfail` and + ``LIT_XFAIL``. A test name can be specified as a file name relative to the + test suite directory or as the full test name reported in LIT output. + +.. option:: --unsupported-not LIST + + Do not treat the specified tests as ``UNSUPPORTED``. The environment variable + ``LIT_UNSUPPORTED_NOT`` can also be used in place of this option. The syntax + is the same as for :option:`--unsupported` and ``LIT_UNSUPPORTED``. + :option:`--unsupported-not` and ``LIT_UNSUPPORTED_NOT`` always override all + other ``UNSUPPORTED`` specifications, including an :option:`--unsupported` + appearing later on the command line. + .. option:: --exclude-xfail ``XFAIL`` tests won't be run, unless they are listed in the ``--xfail-not`` diff --git a/llvm/utils/lit/lit/Test.py b/llvm/utils/lit/lit/Test.py index daba14a898c4c..6f87ecfc82d15 100644 --- a/llvm/utils/lit/lit/Test.py +++ b/llvm/utils/lit/lit/Test.py @@ -262,6 +262,9 @@ def __init__( # If true, ignore all items in self.xfails. self.xfail_not = False + # If true, ignore all items in self.unsupported. + self.unsupported_not = False + # A list of conditions that must be satisfied before running the test. # Each condition is a boolean expression of features. All of them # must be True for the test to run. @@ -423,6 +426,9 @@ def getUnsupportedFeatures(self): Throws ValueError if an UNSUPPORTED line has a syntax error. 
""" + if self.unsupported_not: + return [] + features = self.config.available_features try: diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py index bebde4b762b0e..6225ac57abfd0 100644 --- a/llvm/utils/lit/lit/cl_arguments.py +++ b/llvm/utils/lit/lit/cl_arguments.py @@ -464,6 +464,20 @@ def parse_args(): default=False, action="store_true", ) + selection_group.add_argument( + "--unsupported", + metavar="LIST", + type=_semicolon_list, + help="Mark tests with paths in the semicolon separated list as UNSUPPORTED", + default=os.environ.get("LIT_UNSUPPORTED", ""), + ) + selection_group.add_argument( + "--unsupported-not", + metavar="LIST", + type=_semicolon_list, + help="Do not mark tests with paths in the semicolon separated list as UNSUPPORTED", + default=os.environ.get("LIT_UNSUPPORTED_NOT", ""), + ) selection_group.add_argument( "--num-shards", dest="numShards", diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py index 77b23bf560c6e..d1e16b28bea26 100755 --- a/llvm/utils/lit/lit/main.py +++ b/llvm/utils/lit/lit/main.py @@ -122,6 +122,7 @@ def main(builtin_params={}): selected_tests = selected_tests[: opts.max_tests] mark_xfail(discovered_tests, opts) + mark_unsupported(discovered_tests, opts) mark_excluded(discovered_tests, selected_tests) @@ -248,6 +249,18 @@ def mark_xfail(selected_tests, opts): t.exclude_xfail = True +def mark_unsupported(selected_tests, opts): + for t in selected_tests: + test_file = os.sep.join(t.path_in_suite) + test_full_name = t.getFullName() + if test_file in opts.unsupported or test_full_name in opts.unsupported: + # Add a special feature that's always present to mark as unsupported. 
+ t.config.available_features.add("lit-unsupported-marker") + t.unsupported.append("lit-unsupported-marker") + if test_file in opts.unsupported_not or test_full_name in opts.unsupported_not: + t.unsupported_not = True + + def mark_excluded(discovered_tests, selected_tests): excluded_tests = set(discovered_tests) - set(selected_tests) result = lit.Test.Result(lit.Test.EXCLUDED) diff --git a/llvm/utils/lit/tests/unsupported-cl.py b/llvm/utils/lit/tests/unsupported-cl.py new file mode 100644 index 0000000000000..83528e84ddc94 --- /dev/null +++ b/llvm/utils/lit/tests/unsupported-cl.py @@ -0,0 +1,27 @@ +# Check that marking tests as UNSUPPORTED works via command line or env var. + +# RUN: %{lit} --unsupported 'true.txt' \ +# RUN: %{inputs}/xfail-cl/true.txt \ +# RUN: | FileCheck --check-prefix=CHECK-UNSUPPORTED %s + +# RUN: env LIT_UNSUPPORTED='true.txt' \ +# RUN: %{lit} %{inputs}/xfail-cl/true.txt \ +# RUN: | FileCheck --check-prefix=CHECK-UNSUPPORTED %s + +# Check that --unsupported-not and LIT_UNSUPPORTED_NOT override --unsupported. + +# RUN: %{lit} --unsupported 'true.txt' --unsupported-not 'true.txt' \ +# RUN: %{inputs}/xfail-cl/true.txt \ +# RUN: | FileCheck --check-prefix=CHECK-NOT-UNSUPPORTED %s + +# RUN: env LIT_UNSUPPORTED='true.txt' LIT_UNSUPPORTED_NOT='true.txt' \ +# RUN: %{lit} %{inputs}/xfail-cl/true.txt \ +# RUN: | FileCheck --check-prefix=CHECK-NOT-UNSUPPORTED %s + +# END. + +# CHECK-UNSUPPORTED: Testing: 1 tests, {{[0-9]*}} workers +# CHECK-UNSUPPORTED: {{^}}UNSUPPORTED: top-level-suite :: true.txt + +# CHECK-NOT-UNSUPPORTED: Testing: 1 tests, {{[0-9]*}} workers +# CHECK-NOT-UNSUPPORTED: {{^}}PASS: top-level-suite :: true.txt From 0122142e0d51efb3bd8fc116e981feada63447f5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 12 May 2026 14:37:05 +0200 Subject: [PATCH 432/538] [clang][bytecode][NFC] Use proper format function in Program::dump() (#197160) This was using format() instead of formatv() by accident. 
--- clang/lib/AST/ByteCode/Disasm.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index 36a94dd937011..cf7afcd6d4b1e 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -28,6 +28,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/ExprCXX.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/FormatVariadic.h" using namespace clang; using namespace clang::interp; @@ -328,9 +329,9 @@ static std::string formatBytes(size_t B) { if (B < (1u << 10u)) SS << B << " B"; else if (B < (1u << 20u)) - SS << llvm::format("{0:F2}", B / 1024.) << " KB"; + SS << llvm::formatv("{0:F2}", B / 1024.) << " KB"; else - SS << llvm::format("{0:F2}", B / 1024. / 1024.) << " MB"; + SS << llvm::formatv("{0:F2}", B / 1024. / 1024.) << " MB"; return Result; } From 23626452644be5f5ede751baf82a6ab506af215f Mon Sep 17 00:00:00 2001 From: walkerkd Date: Tue, 12 May 2026 13:37:17 +0100 Subject: [PATCH 433/538] [AArch64] Add C1-Nano scheduling model (#182316) Instead of using the Cortex-A510 scheduling model, C1-Nano now uses its own scheduling model, based off of the C1-Nano Software Optimization Guide: https://developer.arm.com/documentation/109590/0001 --- llvm/lib/Target/AArch64/AArch64.td | 1 + llvm/lib/Target/AArch64/AArch64Processors.td | 2 +- llvm/lib/Target/AArch64/AArch64SchedA53.td | 2 +- llvm/lib/Target/AArch64/AArch64SchedA57.td | 2 +- llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 2 +- .../Target/AArch64/AArch64SchedAmpere1B.td | 3 +- llvm/lib/Target/AArch64/AArch64SchedC1Nano.td | 1860 +++++ .../Target/AArch64/AArch64SchedC1Premium.td | 3 +- .../lib/Target/AArch64/AArch64SchedC1Ultra.td | 3 +- .../lib/Target/AArch64/AArch64SchedCyclone.td | 2 +- .../Target/AArch64/AArch64SchedExynosM3.td | 2 +- .../Target/AArch64/AArch64SchedExynosM4.td | 2 +- .../Target/AArch64/AArch64SchedExynosM5.td | 2 +- llvm/lib/Target/AArch64/AArch64SchedFalkor.td | 2 
+- llvm/lib/Target/AArch64/AArch64SchedKryo.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseN1.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseN2.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseN3.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseV1.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseV2.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseV3.td | 2 +- .../AArch64/AArch64SchedNeoverseV3AE.td | 2 +- .../lib/Target/AArch64/AArch64SchedOlympus.td | 2 +- llvm/lib/Target/AArch64/AArch64SchedOryon.td | 2 +- .../Target/AArch64/AArch64SchedPredicates.td | 42 +- llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 2 +- .../Target/AArch64/AArch64SchedThunderX.td | 2 +- .../AArch64/AArch64SchedThunderX2T99.td | 3 +- .../AArch64/AArch64SchedThunderX3T110.td | 2 +- llvm/lib/Target/AArch64/AArch64Schedule.td | 4 + .../Cortex/C1Nano-basic-instructions.s | 2512 ++++++ .../AArch64/Cortex/C1Nano-bf16-instructions.s | 56 + .../Cortex/C1Nano-complxnum-instructions.s | 42 + .../Cortex/C1Nano-crypto-instructions.s | 96 + .../C1Nano-flag-manipulation-instructions.s | 53 + .../AArch64/Cortex/C1Nano-forwarding.s | 285 + .../Cortex/C1Nano-fp16fml-instructions.s | 67 + .../Cortex/C1Nano-fptoint-instructions.s | 59 + .../AArch64/Cortex/C1Nano-i8mm-instructions.s | 53 + .../AArch64/Cortex/C1Nano-js-instructions.s | 36 + .../AArch64/Cortex/C1Nano-mops-instructions.s | 323 + .../AArch64/Cortex/C1Nano-mte-instructions.s | 244 + .../AArch64/Cortex/C1Nano-neon-instructions.s | 3152 ++++++++ .../Cortex/C1Nano-ptraut-instructions.s | 146 + .../Cortex/C1Nano-rcpc-immo-instructions.s | 61 + .../AArch64/Cortex/C1Nano-sve-instructions.s | 6862 +++++++++++++++++ .../AArch64/Inputs/crypto-instructions.s | 34 + .../llvm-mca/AArch64/Inputs/js-instructions.s | 1 + .../AArch64/Inputs/mops-instructions.s | 138 + .../AArch64/Inputs/ptraut-instructions.s | 56 + .../AArch64/Inputs/sve-while-instructions.s | 6 + mops-instructions.s | 138 + 52 files changed, 16355 insertions(+), 28 deletions(-) create mode 
100644 llvm/lib/Target/AArch64/AArch64SchedC1Nano.td create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-basic-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-bf16-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-complxnum-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-crypto-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-flag-manipulation-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-forwarding.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fp16fml-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fptoint-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-i8mm-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-js-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mops-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mte-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-neon-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-ptraut-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-rcpc-immo-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-sve-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Inputs/crypto-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Inputs/js-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Inputs/mops-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Inputs/ptraut-instructions.s create mode 100644 llvm/test/tools/llvm-mca/AArch64/Inputs/sve-while-instructions.s create mode 100644 mops-instructions.s diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 
20dea4b1af84e..bdb80e7ffdd6d 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -119,6 +119,7 @@ include "AArch64SchedA53.td" include "AArch64SchedA55.td" include "AArch64SchedA510.td" include "AArch64SchedA57.td" +include "AArch64SchedC1Nano.td" include "AArch64SchedC1Ultra.td" include "AArch64SchedC1Premium.td" include "AArch64SchedCyclone.td" diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index b33ffdafbf2cc..6c24290fb681c 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -1439,7 +1439,7 @@ def : ProcessorModel<"cortex-a520", CortexA510Model, ProcessorFeatures.A520, [TuneA520]>; def : ProcessorModel<"cortex-a520ae", CortexA510Model, ProcessorFeatures.A520AE, [TuneA520AE]>; -def : ProcessorModel<"c1-nano", CortexA510Model, +def : ProcessorModel<"c1-nano", C1NanoModel, ProcessorFeatures.C1Nano, [TuneC1Nano]>; def : ProcessorModel<"cortex-a57", CortexA57Model, ProcessorFeatures.A53, [TuneA57]>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index 66715b9d1db8b..291f78582d94a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -29,7 +29,7 @@ def CortexA53Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index a558022904cf5..9260379fb5b27 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -34,7 +34,7 @@ def CortexA57Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index d6fe84a2c9c9b..6de7fde1951ab 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -23,7 +23,7 @@ def A64FXModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F, [HasMTE, HasMatMulInt8, HasBF16, HasPAuth, HasPAuthLR, HasCPA, - HasCSSC]); + HasCSSC, HasMOPS, HasMOPS_GO]); let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td index 67f8593f1577a..d77c48126af39 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1B.td @@ -26,7 +26,8 @@ def Ampere1BModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, SMEUnsupported.F, - PAUnsupported.F); + PAUnsupported.F, + [HasMOPS, HasMOPS_GO]); } let SchedModel = Ampere1BModel in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedC1Nano.td b/llvm/lib/Target/AArch64/AArch64SchedC1Nano.td new file mode 100644 index 0000000000000..2992236b84eb4 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedC1Nano.td @@ -0,0 +1,1860 @@ +//==- AArch64Sched1Nano.td - ARM C1-Nano Scheduling Definitions -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM C1-Nano processor. 
+// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// C1-Nano machine model for scheduling and other instruction cost heuristics. +def C1NanoModel : SchedMachineModel { + let MicroOpBufferSize = 0; // The C1-Nano is an in-order processor + let IssueWidth = 3; // It dual-issues under most circumstances + let LoadLatency = 3; // Cycles for loads to access the cache. + // 2 is best case, 4 is normal case. + // 3 seems to be a good tradeoff + let PostRAScheduler = 1; // Enable PostRA scheduler pass. + let CompleteModel = 1; // Covers instructions applicable to C1-Nano. + + list UnsupportedFeatures = !listconcat(SMEUnsupported.F, + [HasSVE2p1, HasSVEB16B16, + HasCPA, HasCSSC]); +} + + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types + +let SchedModel = C1NanoModel in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. + +// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the +// C1-Nano is in-order. 
+let BufferSize = 0 in { + def C1NanoUnitALU0 : ProcResource<1>; // Int ALU0 + def C1NanoUnitALU1 : ProcResource<1>; // Int ALU1 + def C1NanoUnitMAC : ProcResource<1>; // Int MAC, 64-bit wide + def C1NanoUnitDiv : ProcResource<1>; // Int Division, not pipelined + // There are 2 LS pipes, 1 for Load/Store; 1 for Load only + def C1NanoUnitLdSt : ProcResource<1>; // Load/Store shared pipe + def C1NanoUnitLd1 : ProcResource<1>; // Load pipe + def C1NanoUnitB : ProcResource<1>; // Branch + def C1NanoUnitPAC : ProcResource<1>; // Pointer Authentication (PAC) pipe + + // The FP DIV/SQRT instructions execute totally differently from the FP ALU + // instructions, which can mostly be dual-issued; that's why for now we model + // them with 2 resources. + def C1NanoUnitVALU0 : ProcResource<1>; // SIMD/FP/SVE ALU0 + def C1NanoUnitVALU1 : ProcResource<1>; // SIMD/FP/SVE ALU1 + def C1NanoUnitVMAC0 : ProcResource<1>; // SIMD/FP/SVE MAC0 + def C1NanoUnitVMAC1 : ProcResource<1>; // SIMD/FP/SVE MAC1 + def C1NanoUnitVMC : ProcResource<1>; // SIMD/FP/SVE multicycle instrs (e.g Div, SQRT, cryptography) +} + +def C1NanoUnitLd : ProcResGroup<[C1NanoUnitLdSt, C1NanoUnitLd1]>; +def C1NanoUnitVALU : ProcResGroup<[C1NanoUnitVALU0, C1NanoUnitVALU1]>; +def C1NanoUnitALU : ProcResGroup<[C1NanoUnitALU0, C1NanoUnitALU1]>; +def C1NanoUnitVMAC : ProcResGroup<[C1NanoUnitVMAC0, C1NanoUnitVMAC1]>; + +// Workaround for throughput being doubled on a single resource +def C1NanoUnit2VMC : ProcResGroup<[C1NanoUnitVMC, C1NanoUnitVMC]>; +def C1NanoUnit4VMC : ProcResGroup<[C1NanoUnitVMC, C1NanoUnitVMC, C1NanoUnitVMC, C1NanoUnitVMC]>; +def C1NanoUnit2VALU0: ProcResGroup<[C1NanoUnitVALU0, C1NanoUnitVALU0]>; + +// These latencies are modeled without taking into account forwarding paths +// (the software optimisation guide lists latencies taking into account +// typical forwarding paths). 
+def : WriteRes { let Latency = 1; } // MOVN, MOVZ +def : WriteRes { let Latency = 1; } // ALU +def : WriteRes { let Latency = 1; } // ALU of Shifted-Reg +def : WriteRes { let Latency = 1; } // ALU of Extended-Reg +def : WriteRes { let Latency = 2; } // EXTR from a reg pair +def : WriteRes { let Latency = 2; } // Shift/Scale + +// MAC +def : WriteRes { let Latency = 3; } // 32-bit Multiply +def : WriteRes { let Latency = 4; let ReleaseAtCycles = [2];} // 64-bit Multiply + +// Div +def : WriteRes { + let Latency = 12; let ReleaseAtCycles = [12]; +} +def : WriteRes { + let Latency = 20; let ReleaseAtCycles = [20]; +} + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the C1-Nano + +//===----------------------------------------------------------------------===// +class C1NanoWrite : SchedWriteRes<[res]> { + let Latency = n; +} + +class C1NanoMCWrite : SchedWriteRes<[res]> { + let Latency = n; + let ReleaseAtCycles = [m]; + let BeginGroup = 1; +} + +// This is a "workaround" for the case where the throughput is +// 1 or less, and the resource in question is actually a pair +// of resources which normally results in the throughput being +// divided by 2. 
+class C1NanoMC2Write : SchedWriteRes<[res, res]> { + let Latency = n; + let ReleaseAtCycles = [m, m]; + let BeginGroup = 1; +} + +class C1NanoMC_RC0Write : SchedWriteRes<[res]> { + let Latency = n; + let BeginGroup = 1; +} + +//===----------------------------------------------------------------------===// +// Define generic 2 micro-op types +def C1NanoWrite_10cyc_1VMAC_1VALU : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVMAC]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def C1NanoWrite_10cyc_1VMAC_1VALU_A : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVALU, C1NanoUnitVMAC, C1NanoUnitVMAC]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def C1NanoWrite_14cyc_1VMAC_1VALU_B : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVALU, C1NanoUnitVMAC, C1NanoUnitVMAC]> { + let Latency = 14; + let NumMicroOps = 2; +} + +class C1NanoWrite_PAC_B : SchedWriteRes<[C1NanoUnitPAC, C1NanoUnitB]> { + let Latency = lat; + let NumMicroOps = 2; +} + +// FEAT_MOPS instructions use both ALU and Load/Store pipelines. +class C1NanoWriteMOPS : SchedWriteRes<[C1NanoUnitALU, C1NanoUnitLdSt]> { + let Latency = lat; + let ReleaseAtCycles = [release, release]; +} + +// Note: For some "Main" MOPS instructions, the SWOG latency depends on the runtime +// value in Xn; we model the base latency here. +class C1NanoWriteMOPSDynamic : SchedWriteRes<[C1NanoUnitALU, C1NanoUnitLdSt]> { + let Latency = lat; + let ReleaseAtCycles = [release, release]; +} + +// Load +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } + +// Pre/Post Indexing - Performed as part of address generation +def : WriteRes { let Latency = 0; } + +// Store +let RetireOOO = 1 in { +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +} +def : WriteRes { let Latency = 3; } + +// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. 
+def : WriteRes { let Latency = 5; + let ReleaseAtCycles = [2];} + +def : WriteRes { let Unsupported = 1; } + +// Branch +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// FP ALU +// As WriteF result is produced in F5 and it can be mostly forwarded +// to consumer at F1, the effectively Latency is set as 4. +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } + +class C1NanoVSt : SchedWriteRes<[C1NanoUnitLdSt]> { + let RetireOOO = 1; + let ReleaseAtCycles = [n]; +} + +def C1NanoVSt0 : SchedWriteRes<[C1NanoUnitLdSt]> { + let RetireOOO = 1; +} + +def : SchedAlias>; +def : SchedAlias>; + +// FP VALU specific new schedwrite definitions +def C1NanoWriteVALU_F2 : SchedWriteRes<[C1NanoUnitVALU]> { let Latency = 2;} +def C1NanoWriteVALU_F3 : SchedWriteRes<[C1NanoUnitVALU]> { let Latency = 3;} +def C1NanoWriteVALU_F4 : SchedWriteRes<[C1NanoUnitVALU]> { let Latency = 4;} +def C1NanoWriteVALU0_F3 : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVALU]> { let Latency = 3;} +def C1NanoWriteVALU1_F4 : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVALU]> { let Latency = 4;} + +// FP Mul, Div, Sqrt. 
Div/Sqrt are not pipelined +def : WriteRes { let Latency = 4; } + +let RetireOOO = 1 in { +def : WriteRes { let Latency = 22; + let ReleaseAtCycles = [29]; } +def C1NanoWriteVMAC : SchedWriteRes<[C1NanoUnitVMAC]> { let Latency = 4; } +} + +class C1NanoWriteVMACBypass : SchedWriteRes<[C1NanoUnitVMAC]> { + let Latency = 4; +} + +def C1NanoWriteVMACB64 : C1NanoWriteVMACBypass; +def C1NanoWriteVMACB128 : C1NanoWriteVMACBypass; +def C1NanoWriteVMACH64 : C1NanoWriteVMACBypass; +def C1NanoWriteVMACH128 : C1NanoWriteVMACBypass; +def C1NanoWriteVMACS64 : C1NanoWriteVMACBypass; +def C1NanoWriteVMACS128 : C1NanoWriteVMACBypass; +def C1NanoWriteVMACD128 : C1NanoWriteVMACBypass; +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + + +// MUL +def : ReadAdvance; +def : ReadAdvance; + +// Div +def : ReadAdvance; + +// Forwarded types +def C1NanoWr_ADRP : SchedWriteRes<[C1NanoUnitALU]> { let Latency = 1; } +def C1NanoWr_LDR : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; } +def C1NanoRd_LDR : SchedReadAdvance<2, [C1NanoWr_ADRP, C1NanoWr_LDR]>; +def : SchedAlias; + +// SIMD MAC forwarding - reduces latency to 1 cycle when conditions are met +// for accumulator dependencies with matching destination element/register size. 
+def C1NanoReadVMACB64 : SchedReadAdvance<3, [C1NanoWriteVMACB64]>; +def C1NanoReadVMACB128 : SchedReadAdvance<3, [C1NanoWriteVMACB128]>; +def C1NanoReadVMACH64 : SchedReadAdvance<3, [C1NanoWriteVMACH64]>; +def C1NanoReadVMACH128 : SchedReadAdvance<3, [C1NanoWriteVMACH128]>; +def C1NanoReadVMACS64 : SchedReadAdvance<3, [C1NanoWriteVMACS64]>; +def C1NanoReadVMACS128 : SchedReadAdvance<3, [C1NanoWriteVMACS128]>; +def C1NanoReadVMACD128 : SchedReadAdvance<3, [C1NanoWriteVMACD128]>; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. + +def C1NanoWriteALU0 : SchedWriteRes<[C1NanoUnitALU, C1NanoUnitALU]> { + let Latency = 1; +} +def C1NanoWriteALU1 : SchedWriteRes<[C1NanoUnitALU, C1NanoUnitALU]> { + let Latency = 1; +} +def C1NanoWriteVALU0 : SchedWriteRes<[C1NanoUnitVALU, C1NanoUnitVALU]> { + let Latency = 1; +} + +// Address generation +def : InstRW<[C1NanoWr_ADRP], (instrs ADR, ADRP)>; + +// CMP/CMN are implemented via ADDS/SUBS with Rd = ZR. +def C1NanoWriteISRegCmp : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +// Arithmetic, basic, flagset +def : InstRW<[C1NanoWriteALU1], (instregex "^(ADCS|SBCS)(W|X)(r|i)$")>; + +// Conditional compare +def : InstRW<[C1NanoWriteALU0], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; + +// Variable shift +def : InstRW<[WriteI], (instregex "(ASR|LSL|LSR|ROR)V[WX]r")>; + +// ROR (immediate) is implemented as an EXTR (EXTR Rd, Rs, Rs, #Imm). +// Keep EXTR's base latency, but model the ROR (immediate) as 1-cycle. +def C1NanoWriteExtrRORImm : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[C1NanoWriteExtrRORImm], (instrs EXTRWrri, EXTRXrri)>; + +// LSL/LSR (immediate) and UXTB are implemented as UBFM aliases. +// Keep UBFM's base latency, but model these aliases as 1-cycle. 
+def C1NanoWriteISFastUBFMImm : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[C1NanoWriteISFastUBFMImm], (instrs UBFMWri, UBFMXri)>; + +// ASR (immediate) is implemented as an SBFM alias. +// Keep SBFM's base latency, but model ASR (immediate) as 1-cycle. +def C1NanoWriteISFastSBFMImm : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[C1NanoWriteISFastSBFMImm], (instrs SBFMWri, SBFMXri)>; + +// FP conditional compare +// def : InstRW<[C1NanoMC2Write<5, 5, C1NanoUnitVALU>], (instregex "FCCMP")>; + +def C1NanoWriteISReg : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[C1NanoWriteISReg], (instregex ".*rs$")>; + +// Reverse bits +def : InstRW<[WriteI], (instrs RBITWr, RBITXr)>; + +// Multiply accumulate long +def : InstRW<[C1NanoWrite<2, C1NanoUnitMAC>], (instregex "[SU]M(ADD|SUB)L")>; + +// Multiply high +def : InstRW<[C1NanoMCWrite<6, 4, C1NanoUnitMAC>], (instregex "[SU]MULHr")>; + +// ASIMD FP compare +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], + (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v|16|32|64)")>; + +// ASIMD FP, complex add +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCADDv")>; + +// ASIMD reverse bits. +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "RBITv")>; + +// ASIMD count +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^(CLS|CLZ|CNT)v")>; + +// ASIMD scalar DUP (asm mnemonic is "mov", e.g. "mov b0, v0.b[1]"). +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "DUPi")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]MOVvi")>; + +// ASIMD transfer from vector element to GPR with sign-extension. +// ASIMD move between vector elements (asm mnemonic is "mov", e.g. "mov v2.b[0], v0.b[0]"). 
+def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "INSvi(8|16|32|64)lane")>; + +// ASIMD transfer, gen reg to element +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "INSvi(8|16|32|64)gpr")>; + +// FP scalar load instructions +// ----------------------------------------------------------------------------- + +// Load vector reg, literal +def : InstRW<[C1NanoWrite<3, C1NanoUnitLd>], (instrs LDRSl, LDRDl, LDRQl)>; + +// Load vector reg, unscaled immediate +def : InstRW<[C1NanoWrite<3, C1NanoUnitLd>], (instregex "LDUR[BHSDQ]i")>; + +// Load vector register, unsigned immediate +def : InstRW<[C1NanoWrite<3, C1NanoUnitLd>], (instregex "LDR[BHSDQ]ui")>; + +// Load vector register, register offset +def : InstRW<[C1NanoWrite<3, C1NanoUnitLd>], (instregex "LDR[BHSDQ]ro[WX]")>; + +// FP scalar store instructions +// ----------------------------------------------------------------------------- + +// Store vector pair, immediate offset, Q-form +def : InstRW<[C1NanoMCWrite<1, 2, C1NanoUnitLdSt>], (instregex "STN?PQ(i|post|pre)")>; + +// Pointer Authentication Instructions (v8.3 PAC) +// ----------------------------------------------------------------------------- + +// Compute pointer authentication code, using generic key +def : InstRW<[C1NanoWrite<5, C1NanoUnitPAC>], (instrs PACGA)>; +// Authenticate data address +// Authenticate instruction address +// Compute pointer authentication code for data address +// Compute pointer authentication code for instruction address +def : InstRW<[C1NanoWrite<4, C1NanoUnitPAC>], (instregex "^AUT", "^PAC[DI]")>; + +// Branch and link, register, with pointer authentication +// Branch, register, with pointer authentication +// Branch, return, with pointer authentication +def : InstRW<[C1NanoWrite_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA, + BRAAZ, BRAB, BRABZ, RETAA, RETAB, + ERETAA, ERETAB)>; + +// Load register, with pointer authentication +def : InstRW<[C1NanoWrite<2, C1NanoUnitPAC>], (instregex 
"^LDRA[AB](indexed|writeback)")>; + +// Strip pointer authentication code +def : InstRW<[C1NanoWrite<4, C1NanoUnitPAC>], (instrs XPACD, XPACI, XPACLRI)>; + +// Miscellaneous data-processing instructions +// ----------------------------------------------------------------------------- + +// Convert floating-point condition flags +def : InstRW<[C1NanoMC2Write<1, 2, C1NanoUnitALU>], (instregex "^(AX|XA)FLAG")>; + +// Flag set instructions +def : InstRW<[C1NanoMC2Write<2, 2, C1NanoUnitALU>], (instregex "^SETF(8|16)")>; + +// Flag manipulation instructions, rotate and select +def : InstRW<[C1NanoMC2Write<1, 1, C1NanoUnitALU>], (instrs RMIF)>; + +// Flag manipulation instructions, invert carry +def : InstRW<[C1NanoMC2Write<1, 2, C1NanoUnitALU>], (instrs CFINV)>; + +// Load instructions +// ----------------------------------------------------------------------------- +def C1NanoWriteVLD1 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; } +def C1NanoWriteVLD1SI : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; let SingleIssue = 1; } +def C1NanoWriteLDP1 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 2; } +def C1NanoWriteLDP3 : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 3; } +def C1NanoWriteLDPFP : SchedWriteRes<[C1NanoUnitLd]> { let Latency = 3; } + +// Integer loads, immediate and register offset. 
+def : InstRW<[C1NanoWr_LDR, C1NanoRd_LDR], (instregex "^LDR[WX]ui$")>; +def : InstRW<[C1NanoWr_LDR, C1NanoRd_LDR], (instregex "^LDR[WX]ro[WX]$")>; + +// Load register, immediate pre-index / post-index +def : InstRW<[WriteAdr, C1NanoWr_LDR, C1NanoWr_LDR], (instregex "LDR[WX](pre|post)")>; + +// Load vector register, immediate pre-index / post-index +def : InstRW<[WriteAdr, C1NanoWriteLDPFP, C1NanoWriteLDPFP], (instregex "LDR[BHSDQ](pre|post)")>; + +def : InstRW<[C1NanoWriteLDP1], (instregex "LDPSWi")>; +def : InstRW<[C1NanoWriteLDP1], (instregex "LDN?P[WX]i")>; +def : InstRW<[C1NanoWriteLDP3,C1NanoWriteLDP3], (instregex "LDN?P[SDQ]i")>; +def : InstRW<[WriteAdr, C1NanoWriteLDP1, C1NanoWriteLDP1], (instregex "LDPSW(pre|post)")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD1SI, C1NanoWriteLDP1], (instregex "LDPW(pre|post)")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD1, C1NanoWriteLDP1], (instregex "LDPX(pre|post)")>; +def : InstRW<[WriteAdr, C1NanoWriteLDPFP, C1NanoWriteLDP1], (instregex "LDP[SDQ](pre|post)")>; +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads - 128-bit per cycle +//--- +// 1-element structures +def C1NanoWriteVLD1Latency3Release1: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [1]; } +def C1NanoWriteVLD1Latency3: SchedWriteRes<[C1NanoUnitLd]> { let Latency = 3; } +def C1NanoWriteVLD1Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; } +def C1NanoWriteVLD1Latency5Release3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; } +def C1NanoWriteVLD1Latency6Release5: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [5]; } + +def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[C1NanoWriteVLD1Latency3Release1], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[C1NanoWriteVLD1Latency4Release2], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : 
InstRW<[C1NanoWriteVLD1Latency4Release2], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1i(8|16|32|64)$")>; // single element +def : InstRW<[C1NanoWriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate + +def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3Release1], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency4Release2], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency4Release2], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1i(8|16|32|64)_POST$")>; // single element +def : InstRW<[WriteAdr, C1NanoWriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // replicate + +// 2-element structures +def C1NanoWriteVLD2Latency3Release1: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [1]; } +def C1NanoWriteVLD2Latency3Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [2]; } +def C1NanoWriteVLD2Latency4Release1: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [1]; } +def C1NanoWriteVLD2Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; } +def C1NanoWriteVLD2Latency4Release4: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [4]; } + +def : InstRW<[C1NanoWriteVLD2Latency4Release1], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[C1NanoWriteVLD2Latency4Release4], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVLD2Latency3Release1], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVLD2Latency4Release1], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : 
InstRW<[WriteAdr, C1NanoWriteVLD2Latency4Release4], (instregex "LD2i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD2Latency3Release1], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; + +// 3-element structures +def C1NanoWriteVLD3Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; } +def C1NanoWriteVLD3Latency5Release3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; } +def C1NanoWriteVLD3Latency5Release5: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [5]; } + +def : InstRW<[C1NanoWriteVLD3Latency5Release3], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[C1NanoWriteVLD3Latency5Release5], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVLD3Latency4Release2], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVLD3Latency5Release3], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD3Latency5Release5], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD3Latency4Release2], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; + +// 4-element structures +def C1NanoWriteVLD4Latency4Release2: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; } +def C1NanoWriteVLD4Latency5Release3: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; } +def C1NanoWriteVLD4Latency6Release5: SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [5]; } + +def : InstRW<[C1NanoWriteVLD4Latency5Release3], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; +def : InstRW<[C1NanoWriteVLD4Latency6Release5], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVLD4Latency4Release2], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVLD4Latency5Release3], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; +def : 
InstRW<[WriteAdr, C1NanoWriteVLD4Latency6Release5], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVLD4Latency4Release2], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>; + +//--- +// Vector Stores +//--- +// 1 Element structures +def C1NanoWriteVST1Release1 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; + let ReleaseAtCycles = [1]; } +def C1NanoWriteVST1Release2 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; + let ReleaseAtCycles = [2]; } +def C1NanoWriteVST1Release3 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; + let ReleaseAtCycles = [3]; } +def C1NanoWriteVST1Release4 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 4; + let ReleaseAtCycles = [4]; } +def C1NanoWriteVST2Release1 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [1]; } +def C1NanoWriteVST2Release2 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [2]; } +def C1NanoWriteVST3Release4 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [4]; } +def C1NanoWriteVST3Release6 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [6]; } +def C1NanoWriteVST4Release2 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [2]; } +def C1NanoWriteVST4Release4 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [4]; } +def C1NanoWriteVST4Release8 : SchedWriteRes<[C1NanoUnitLdSt]> { let Latency = 5; + let ReleaseAtCycles = [8]; } + +def : InstRW<[C1NanoWriteVST1Release1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVST1Release1], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[C1NanoWriteVST1Release1], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[C1NanoWriteVST1Release1], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[C1NanoWriteVST1Release2], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +// TODO: Handle the special case of ASIMD store, 1 element, multiple, 3 reg, D-form when: +// 
Throughput=1/3 when the access is aligned and crosses 16B boundary, one more cycle is needed +def : InstRW<[C1NanoWriteVST1Release2], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[C1NanoWriteVST1Release3], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[C1NanoWriteVST1Release2], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[C1NanoWriteVST1Release4], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVST1Release1], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release1], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release1], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release1], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release2], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +// TODO: Handle the special case of ASIMD store, 1 element, multiple, 3 reg, D-form when: +// Throughput=1/3 when the access is aligned and crosses 16B boundary, one more cycle is needed +def : InstRW<[WriteAdr, C1NanoWriteVST1Release2], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release3], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release2], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST1Release4], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +// 2 Element structures +def : InstRW<[C1NanoWriteVST2Release2], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVST2Release1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[C1NanoWriteVST4Release2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVST2Release2], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST2Release1], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST4Release2], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + 
+// 3 Element structures +def : InstRW<[C1NanoWriteVST3Release4], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVST3Release4], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[C1NanoWriteVST3Release6], (instregex "ST3Threev(16b|8h|4s|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVST3Release4], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST3Release4], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST3Release6], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; + +// 4 Element structures +def : InstRW<[C1NanoWriteVST4Release8], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[C1NanoWriteVST4Release4], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[C1NanoWriteVST4Release8], (instregex "ST4Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[WriteAdr, C1NanoWriteVST4Release8], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST4Release4], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[WriteAdr, C1NanoWriteVST4Release8], (instregex "ST4Fourv(16b|8h|4s|2d)_POST$")>; + +//--- +// Floating Point Conversions, MAC, DIV, SQRT +//--- +def : InstRW<[C1NanoWriteVALU_F3], (instregex "^DUP(v2i64|v2i32|v4i32|v4i16|v8i16|v8i8|v16i8)")>; +def : InstRW<[C1NanoWriteVALU_F4], (instregex "^XTN")>; + +// FP convert, from vec to gen reg +def : InstRW<[C1NanoWriteVALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>; +def : InstRW<[C1NanoWriteVALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>; + +// FP convert, Javascript from vec to gen reg +def : InstRW<[C1NanoWriteVALU1_F4], (instrs FJCVTZS)>; + +def : InstRW<[C1NanoWriteVALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>; +def : InstRW<[C1NanoWriteVALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>; +def : InstRW<[C1NanoWriteVALU_F4], (instregex "^(S|U)CVTFv")>; + +// MOPS instructions +// ----------------------------------------------------------------------------- + +//Memory Copy Forward-only Prologue +def : InstRW<[C1NanoWriteMOPS<2, 
2>], (instregex "^CPYFP")>; + +// Memory Copy Forward-only Main +def : InstRW<[C1NanoWriteMOPSDynamic<1, 1>], (instregex "^CPYFM")>; + +// Memory Copy Forward-only Epilogue +def : InstRW<[C1NanoWriteMOPS<1, 1>], (instregex "^CPYFE")>; + +// Memory Copy Prologue +def : InstRW<[C1NanoWriteMOPS<3, 3>], (instregex "^CPYP")>; + +// Memory Copy Main +def : InstRW<[C1NanoWriteMOPSDynamic<1, 1>], (instregex "^CPYM")>; + +// Memory Copy Epilogue +def : InstRW<[C1NanoWriteMOPS<1, 1>], (instregex "^CPYE")>; + +// Memory Set Prologue +def : InstRW<[C1NanoWriteMOPS<2, 2>], (instregex "^SETP")>; + +// Memory Set Main +def : InstRW<[C1NanoWriteMOPS<1, 1>], (instregex "^SETM")>; + +// Memory Set Epilogue +def : InstRW<[C1NanoWriteMOPS<1, 1>], (instregex "^SETE")>; + +// Memory Set with tag setting Prologue +def : InstRW<[C1NanoWriteMOPS<2, 2>], (instregex "^SETGP")>; + +// Memory Set with tag setting Main +def : InstRW<[C1NanoWriteMOPS<1, 1>], (instregex "^SETGM")>; + +// Memory Set with tag setting Epilogue +def : InstRW<[C1NanoWriteMOPS<1, 1>], (instregex "^MOPSSETGE")>; + +// FP scalar data processing instructions +// ----------------------------------------------------------------------------- + +// FP divide, H-form +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnit4VMC>], (instrs FDIVHrr)>; + +// FP divide, S-form +def : InstRW<[C1NanoMCWrite<13, 10, C1NanoUnit4VMC>], (instrs FDIVSrr)>; + +// FP divide, D-form +def : InstRW<[C1NanoMCWrite<22, 19, C1NanoUnit4VMC>], (instrs FDIVDrr)>; + +// FP square root, H-form +def : InstRW<[C1NanoMCWrite<11, 5, C1NanoUnit4VMC>], (instrs FSQRTHr)>; + +// FP square root, S-form +def : InstRW<[C1NanoMCWrite<14, 9, C1NanoUnit4VMC>], (instrs FSQRTSr)>; + +// FP square root, D-form +def : InstRW<[C1NanoMCWrite<25, 19, C1NanoUnit4VMC>], (instrs FSQRTDr)>; + +// ASIMD FP data processing instructions +// ----------------------------------------------------------------------------- + +// ASIMD FP, arith, normal +def : InstRW<[C1NanoWriteVMAC], (instregex 
"^FN?M(ADD|SUB)")>; + +// ASIMD FP, complex multiply add +def : InstRW<[C1NanoWriteVMAC], (instregex "^FCMLAv")>; + +// ASIMD FP, multiply accumulate +def : InstRW<[C1NanoWriteVMAC], (instregex "^FML(A|S)v")>; + +// ASIMD FP divide, D-form, F61 +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnit2VMC>], (instrs FDIVv4f16)>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[C1NanoMC2Write<13, 5, C1NanoUnit2VMC>], (instrs FDIVv2f32)>; + +// ASIMD FP divide, Q-form, F16 +def : InstRW<[C1NanoMC2Write<8, 5, C1NanoUnit2VMC>], (instrs FDIVv8f16)>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[C1NanoMC2Write<13, 10, C1NanoUnit2VMC>], (instrs FDIVv4f32)>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[C1NanoMC2Write<22, 19, C1NanoUnit2VMC>], (instrs FDIVv2f64)>; + +// ASIMD FP square root, D-form, F16 +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnit2VMC>], (instrs FSQRTv4f16)>; + +// ASIMD FP square root, D-form, F32 +def : InstRW<[C1NanoMCWrite<12, 9, C1NanoUnit2VMC>], (instrs FSQRTv2f32)>; + +// ASIMD FP square root, Q-form, F16 +def : InstRW<[C1NanoMC2Write<8, 5, C1NanoUnit2VMC>], (instrs FSQRTv8f16)>; + +// ASIMD FP square root, Q-form, F32 +def : InstRW<[C1NanoMC2Write<12, 9, C1NanoUnit2VMC>], (instrs FSQRTv4f32)>; + +// ASIMD FP square root, Q-form, F64 +def : InstRW<[C1NanoMC2Write<22, 19, C1NanoUnit2VMC>], (instrs FSQRTv2f64)>; + +def : InstRW<[C1NanoWriteVALU0_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr)>; + +// ASIMD FP multiply +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "FMULX?(16|32|64|v)")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "FML[AS]L2?(v|lane)")>; + +// ASIMD miscellaneous instructions +// ----------------------------------------------------------------------------- + +// ASIMD move, FP immediate +def : InstRW<[C1NanoWriteVALU_F2], (instrs FMOVSr, FMOVDr)>; + +// ASIMD move, FP transfer, from reg to vec reg +def : InstRW<[C1NanoWriteVALU_F3], (instrs FMOVv2f32_ns, FMOVv4f32_ns, 
FMOVv2f64_ns, FMOVv4f16_ns, FMOVv8f16_ns)>; + +// ASIMD reciprocal estimate +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "FRECP[EX]v", "URECPEv", "[FU]RSQRTEv")>; + +// ASIMD reciprocal step +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "FR(ECPS|SQRTS)(16|32|64|v)")>; + +// ASIMD integer instructions +// ----------------------------------------------------------------------------- + +// ASIMD absolute diff +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; +// ASIMD move, integer immediate +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^MOVI(v|D)")>; +// ASIMD reverse +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^REV(16|32|64)v")>; +// ASIMD absolute diff accum +def : InstRW<[C1NanoMC2Write<5, 3, C1NanoUnitVALU>], (instregex "[SU]ABAL?v")>; +// ASIMD absolute diff long +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ABDLv")>; +// ASIMD arith, basic +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "(ABS|ADD|SUB|NEG)v", + "[SU](HADDv|HSUBv)")>; +// ASIMD, arith, basic, long, saturate +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", + "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>; +// ASIMD, arith, complex +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "ADDHNv", "SUBHNv")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>; +// ASIMD, arith, complex, rounding, add and subtract +def : InstRW<[C1NanoMC2Write<6, 3, C1NanoUnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; +// ASIMD, arith, complex, rounding halving addition +def : InstRW<[C1NanoWrite<2, C1NanoUnitVALU>], (instregex "[SU]RHADDv")>; +// ASIMD, arith, pair-wise +//def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]ADDLPv", "ADDPv")>; +def : InstRW<[C1NanoWrite<3, 
C1NanoUnitVALU>], (instregex "[SU]ADDLPv", "ADDPv")>; +// ASIMD arith, reduce +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitVALU>], (instregex "ADDVv(8|16)")>; +// ASIMD, arith, reduce 4H/4S +def : InstRW<[C1NanoMC2Write<4, 1, C1NanoUnitVALU>], (instregex "SADDLVv", "UADDLVv", "ADDVv4")>; +// ASIMD compare +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; +// ASIMD compare test +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>; +// ASIMD unzip/zip/transpose +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^(TRN|UZP|ZIP)[12]v")>; +// ASIMD extract +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^EXTv")>; +// ASIMD table lookup / table lookup extension +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs TBLv8i8One, TBLv16i8One)>; +def : InstRW<[C1NanoMC2Write<5, 2, C1NanoUnitVALU>], (instrs TBLv8i8Two, TBLv16i8Two)>; +def : InstRW<[C1NanoMC2Write<6, 3, C1NanoUnitVALU>], (instrs TBLv8i8Three, TBLv16i8Three)>; +def : InstRW<[C1NanoMC2Write<7, 4, C1NanoUnitVALU>], (instrs TBLv8i8Four, TBLv16i8Four)>; +def : InstRW<[C1NanoMC2Write<5, 2, C1NanoUnitVALU>], (instrs TBXv8i8One, TBXv16i8One)>; +def : InstRW<[C1NanoMC2Write<6, 3, C1NanoUnitVALU>], (instrs TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[C1NanoMC2Write<7, 4, C1NanoUnitVALU>], (instrs TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[C1NanoMC2Write<8, 5, C1NanoUnitVALU>], (instrs TBXv8i8Four, TBXv16i8Four)>; +// ASIMD logical +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8", + "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8", + "(ORR|BIC)v(16i8|4i32|8i16)$", 
"MVNIv(4i32|4s|8i16)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^(BIF|BIT|BSL)v")>; +// ASIMD max/min, basic +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i32|8i16)")>; +// ASIMD max/min, reduce +def : InstRW<[C1NanoMC2Write<4, 1, C1NanoUnitVALU>], (instregex "[SU](MAX|MIN)Vv")>; +// ASIMD FP max/min, reduce. +def : InstRW<[C1NanoMC2Write<4, 1, C1NanoUnitVALU>], (instregex "^F(MAX|MIN)(NM)?Vv")>; +// ASIMD multiply, by element +def : InstRW<[C1NanoWriteVMACH64], (instregex "^MULv4i16_indexed$")>; +def : InstRW<[C1NanoWriteVMACH128], (instregex "^MULv8i16_indexed$")>; +def : InstRW<[C1NanoWriteVMACS64], (instregex "^MULv2i32_indexed$")>; +def : InstRW<[C1NanoWriteVMACS128], (instregex "^MULv4i32_indexed$")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], + (instregex "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)(_indexed)?$")>; +// ASIMD multiply +def : InstRW<[C1NanoWriteVMACB64], (instregex "^MULv8i8$")>; +def : InstRW<[C1NanoWriteVMACB128], (instregex "^MULv16i8$")>; +def : InstRW<[C1NanoWriteVMACH64], (instregex "^MULv4i16$")>; +def : InstRW<[C1NanoWriteVMACH128], (instregex "^MULv8i16$")>; +def : InstRW<[C1NanoWriteVMACS64], (instregex "^MULv2i32$")>; +def : InstRW<[C1NanoWriteVMACS128], (instregex "^MULv4i32$")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^PMULv(8i8|16i8)")>; +// ASIMD multiply accumulate +def : InstRW<[C1NanoWriteVMACB64, C1NanoReadVMACB64], (instregex "^ML[AS]v8i8$")>; +def : InstRW<[C1NanoWriteVMACB128, C1NanoReadVMACB128], (instregex "^ML[AS]v16i8$")>; +def : InstRW<[C1NanoWriteVMACH64, C1NanoReadVMACH64], (instregex "^ML[AS]v4i16$")>; +def : InstRW<[C1NanoWriteVMACH128, C1NanoReadVMACH128], (instregex "^ML[AS]v8i16$")>; +def : InstRW<[C1NanoWriteVMACS64, C1NanoReadVMACS64], (instregex "^ML[AS]v2i32$")>; +def : InstRW<[C1NanoWriteVMACS128, C1NanoReadVMACS128], (instregex 
"^ML[AS]v4i32$")>; +def : InstRW<[C1NanoWriteVMACH64, C1NanoReadVMACH64], (instregex "^ML[AS]v4i16_indexed$")>; +def : InstRW<[C1NanoWriteVMACH128, C1NanoReadVMACH128], (instregex "^ML[AS]v8i16_indexed$")>; +def : InstRW<[C1NanoWriteVMACS64, C1NanoReadVMACS64], (instregex "^ML[AS]v2i32_indexed$")>; +def : InstRW<[C1NanoWriteVMACS128, C1NanoReadVMACS128], (instregex "^ML[AS]v4i32_indexed$")>; +// ASIMD multiply accumulate half +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>, ReadVMACAccum], (instregex "SQRDML[AS]H[vi]")>; +// ASIMD multiply accumulate long +def : InstRW<[C1NanoWriteVMACH128, C1NanoReadVMACH128], + (instregex "^[SU]ML[AS]Lv(8i8|16i8)_(v8i16|indexed)$")>; +def : InstRW<[C1NanoWriteVMACS128, C1NanoReadVMACS128], + (instregex "^[SU]ML[AS]Lv(4i16|8i16)_(v4i32|indexed)$")>; +def : InstRW<[C1NanoWriteVMACD128, C1NanoReadVMACD128], + (instregex "^[SU]ML[AS]Lv(2i32|4i32)_(v2i64|indexed)$")>; +// ASIMD multiply accumulate long #2 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>, ReadVMACAccum], (instregex "SQDML[AS]L[iv]")>; +// ASIMD dot product +def : InstRW<[C1NanoWriteVMACS64, C1NanoReadVMACS64], (instregex "^(S|U|SU|US)DOTv8i8$")>; +def : InstRW<[C1NanoWriteVMACS128, C1NanoReadVMACS128], (instregex "^(S|U|SU|US)DOTv16i8$")>; +// ASIMD dot product, by scalar +def : InstRW<[C1NanoWriteVMACS64, C1NanoReadVMACS64], (instregex "^(S|U|SU|US)DOTlanev8i8$")>; +def : InstRW<[C1NanoWriteVMACS128, C1NanoReadVMACS128], (instregex "^(S|U|SU|US)DOTlanev16i8$")>; +// ASIMD multiply long +def : InstRW<[C1NanoWriteVMACH128], (instregex "^[SU]MULLv(8i8|16i8)_(v8i16|indexed)$")>; +def : InstRW<[C1NanoWriteVMACS128], (instregex "^[SU]MULLv(4i16|8i16)_(v4i32|indexed)$")>; +def : InstRW<[C1NanoWriteVMACD128], (instregex "^[SU]MULLv(2i32|4i32)_(v2i64|indexed)$")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "SQDMULL[iv]")>; +// ASIMD polynomial (8x8) multiply long +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>; +// ASIMD pairwise add 
and accumulate +def : InstRW<[C1NanoMC2Write<5, 3, C1NanoUnitVALU>], (instregex "[SU]ADALPv")>; +// ASIMD shift accumulate +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +// ASIMD shift accumulate #2 +def : InstRW<[C1NanoMC2Write<5, 3, C1NanoUnitVALU>], (instregex "[SU]RSRA[vd]")>; +// ASIMD shift by immed +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "SHLd$", "SHLv", + "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv(8i8|4i16|2i32)")>; +// ASIMD shift by immediate and insert, basic +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^SLIv.*_shift", "^SRIv.*_shift")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "SHRNv(16i8|8i16|4i32)")>; +// ASIMD shift by immed +// SXTL and UXTL are aliases for SHLL +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[US]?SHLLv")>; +// ASIMD shift by immed #2 +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)", + "[SU]RSHRv(16i8|2i64|4i32|8i16)")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)", + "RSHRNv(16i8|4i32|8i16)")>; +// ASIMD shift by register +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>; +// ASIMD shift by register #2 +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>; + +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>; + +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], 
(instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>; + +// ASIMD BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// ASIMD dot product +def : InstRW<[C1NanoWrite_10cyc_1VMAC_1VALU], (instregex "^BFDOTv", "^BF16DOT")>; + +// ASIMD matrix multiply accumulate +def : InstRW<[C1NanoWrite_14cyc_1VMAC_1VALU_B], (instregex "^BFMMLA$")>; + +// ASIMD multiply accumulate long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^BFMLAL[BT]$", "^BFMLAL[BT]Idx$")>; + +// Cryptography extensions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[C1NanoMCWrite<3, 2, C1NanoUnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 hash acceleration op +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitVALU>], (instregex "^SHA1H")>; + +// Crypto SHA1 schedule acceleration ops +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^SHA1(SU0|SU1)")>; + +// Crypto SHA1 hash acceleration ops +// Crypto SHA256 hash acceleration ops +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; + +// Crypto SHA256 schedule acceleration ops +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^SHA256SU[01]")>; + +// Crypto SHA512 hash acceleration ops +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; + +// Crypto SHA3 ops +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instrs BCAX, EOR3, RAX1)>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs XAR)>; + + +// Crypto SM3 ops +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", + "^SM3TT[12][AB]$")>; + +// Crypto SM4 ops +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitVMC>], (instrs SM4E, SM4ENCKEY)>; + +// CRC +// 
----------------------------------------------------------------------------- + +def : InstRW<[C1NanoWrite<2, C1NanoUnitMAC>], (instregex "^CRC32")>; + +// SVE Predicate instructions +// ----------------------------------------------------------------------------- + +// Note Correction to what is stated in the Arm C1-Nano Core Software Optimization +// Guide issue 4, section 22.4 SVE Predicate instructions: +// - PALU should be ALU0 + +// Loop control, based on predicate +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instrs BRKA_PPmP, BRKA_PPzP, + BRKB_PPmP, BRKB_PPzP)>; + +// Loop control, based on predicate and flag setting +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; + +// Loop control, propagating +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; + +// Loop control, propagating and flag setting +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instrs BRKNS_PPzP, BRKPAS_PPzPP, BRKPBS_PPzPP)>; + +// Loop control, based on GPR +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], + (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; + +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; + +// Loop terminate +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU1>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; + +// Predicate counting scalar +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; + +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], + (instregex "^CNT[BHWD]_XPiI")>; + +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU1>], + (instregex "^(INC|DEC)[BHWD]_XPiI")>; + +def : InstRW<[C1NanoWrite<5, C1NanoUnitALU0>], + (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>; + +// Predicate counting scalar, active predicate +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], + (instregex "^CNTP_XPP_[BHSD]")>; + +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], + (instregex "^(DEC|INC)P_XP_[BHSD]")>; + +def : 
InstRW<[C1NanoMC2Write<2, 1, C1NanoUnitVALU>], + (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", + "^(UQDEC|UQINC)P_WP_[BHSD]")>; + +// Predicate counting scalar, active predicate, saturating, 32-bit. +def : InstRW<[C1NanoMC2Write<1, 1, C1NanoUnitVALU>], + (instregex "^(SQDEC|SQINC)P_XPWd_[BHSD]")>; + +// Predicate counting vector, active predicate +def : InstRW<[C1NanoWrite<3, C1NanoUnitALU0>], + (instregex "^(DEC|INC)P_ZP_[HSD]")>; + +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; + +// Predicate logical +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], + (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; + +// Predicate logical, flag setting +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], + (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; + +// Predicate reverse +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], (instregex "^REV_PP_[BHSD]")>; + +// Predicate select +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instrs SEL_PPPP)>; + +// Predicate set +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; + +// Predicate set/initialize, set flags +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instregex "^PTRUES_[BHSD]")>; + +// Predicate find first/next +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; + +// Predicate test +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], (instrs PTEST_PP, PTEST_PP_ANY, PTEST_PP_FIRST)>; + +// Predicate transpose +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; + +// Predicate unpack and widen +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; + +// Predicate zip/unzip +def : InstRW<[C1NanoWrite<1, C1NanoUnitALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; + + +// Tag Data processing +// ----------------------------------------------------------------------------- +// Arithmetic, immediate to logical address tag +def : 
InstRW<[C1NanoWrite<2, C1NanoUnitALU>], (instrs ADDG, SUBG)>; + +// Insert Random Tags +def : InstRW<[C1NanoMC2Write<4, 3, C1NanoUnitALU>], (instrs IRG, IRGstack)>; + +// Insert Tag Mask +// Subtract Pointer +// Subtract Pointer, flagset +def : InstRW<[C1NanoWrite<2, C1NanoUnitALU>], (instrs GMI, SUBP, SUBPS)>; + +// Tag Load instructions +// ----------------------------------------------------------------------------- +// Load allocation tag +def : InstRW<[C1NanoWrite<2, C1NanoUnitLd>], (instrs LDG)>; + +// Load multiple allocation tags +def : InstRW<[C1NanoMC2Write<2, 4, C1NanoUnitLd>], (instrs LDGM)>; + +// Tag store instructions +// ----------------------------------------------------------------------------- +// Store allocation tags to one granule, post-index +// Store allocation tags to one granule, pre-index +// Store allocation tag to one granule, zeroing, post-index +// Store Allocation Tag to one granule, zeroing, pre-index +// Store allocation tags to one granule, signed offset +// Store allocation tag and reg pair to memory, signed offset +// Store allocation tag and reg pair to memory, post-Index +// Store allocation tag and reg pair to memory, pre-Index +// Store multiple allocation tags +def : InstRW<[C1NanoWrite<1, C1NanoUnitLdSt>], (instrs STGPreIndex, STGPostIndex, + STZGPreIndex, STZGPostIndex, + STGPpre, STGPpost, + STGi, STZGi, + STGPi, STGM, STZGM)>; +// Store allocation tags to two granules, post-index +// Store allocation tags to two granules, pre-index +// Store allocation tag to two granules, zeroing, post-index +// Store Allocation Tag to two granules, zeroing, pre-index +// Store allocation tag to two granules, zeroing, signed offset +def : InstRW<[C1NanoMCWrite<1, 2, C1NanoUnitLdSt>], (instrs ST2GPreIndex, ST2GPostIndex, + STZ2GPreIndex, STZ2GPostIndex, + ST2Gi, STZ2Gi)>; + +// SVE integer instructions +// ----------------------------------------------------------------------------- +// Arithmetic, absolute diff +def : 
InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>; + +// Arithmetic, absolute diff accum +def : InstRW<[C1NanoMC2Write<5, 3, C1NanoUnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; + +// Arithmetic, absolute diff accum long +def : InstRW<[C1NanoMC2Write<5, 3, C1NanoUnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; + +// Arithmetic, absolute diff long +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; + +// Arithmetic, basic +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], + (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]", + "^(ADD|SUB)_ZPmZ_[BHSD]", + "^(ADD|SUB)_ZPZZ_[BHSD]", + "^(ADD|SUB)_ZZZ_[BHSD]", + "^(ADD|SUB)_ZI_[BHSD]", + "^ADR_[SU]XTW_ZZZ_D_[0123]", + "^ADR_LSL_ZZZ_[SD]_[0123]", + "^[SU]H(ADD|SUB|SUBR)_(ZPmZ|ZPZZ)_[BHSD]", + "^UADDW[BT]_ZZZ_[HSD]", + "^[SU]RHADD_ZPmZ_[BHSD]")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^SADDW[BT]_ZZZ_[HSD]", + "^[SU]ADDL[BT]_ZZZ_[HSD]", + "^[SU]SUB[LW][BT]_ZZZ_[HSD]", + "^SADDLBT_ZZZ_[HSD]", + "^SSUBL(BT|TB)_ZZZ_[HSD]", + "^SUBR_(ZPmZ|ZPZZ|ZI)_[BHSD]")>; + +// Arithmetic, complex +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]", + "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]", + "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", + "^[SU]Q(ADD|SUB)_ZI_[BHSD]", + "^(ADD|SUB)HN[BT]_ZZZ_[BHS]", + "^(SUQ|UQ|USQ)ADD_ZPmZ_[BHSD]", + "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[C1NanoMC2Write<6, 3, C1NanoUnitVALU>], + (instregex "^R(ADD|SUB)HN[BT]_ZZZ_[BHS]")>; + +// Arithmetic, large integer +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; + +// Arithmetic, pairwise add +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>; + +// Arithmetic, pairwise add and accum long +def : InstRW<[C1NanoMC2Write<6, 4, C1NanoUnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>; + +// Arithmetic, shift +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], + (instregex 
"^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]", + "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]", + "^(ASR|LSL|LSR)_ZPmI_[BHSD]", + "^(ASR|LSL|LSR)_ZPZI_[BHSD]", + "^(ASR|LSL|LSR)_ZPmZ_[BHSD]", + "^(ASR|LSL|LSR)_ZPZZ_[BHSD]", + "^(ASR|LSL|LSR)_ZZI_[BHSD]", + "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; +// Arithmetic, shift right for divide +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^ASRD_ZPmI_[BHSD]", + "^ASRD_ZPZI_[BHSD]")>; + +// Arithmetic, shift and accumulate +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], + (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>; + +def : InstRW<[C1NanoMC2Write<5, 3, C1NanoUnitVALU>], + (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>; + + +// Arithmetic, shift by immediate +// Arithmetic, shift by immediate and insert +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], + (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>; + +// Arithmetic, shift complex +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]", + "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]", + "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]", + "^SQSHRU?N[BT]_ZZI_[BHS]", + "^UQR?SHRN[BT]_ZZI_[BHS]")>; + +// Arithmetic, shift rounding +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]", + "^[SU]RSHR_ZPmI_[BHSD]")>; + +// Bit manipulation +def : InstRW<[C1NanoMCWrite<13, 11, C1NanoUnitVMC>], + (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>; + +def : InstRW<[C1NanoMCWrite<21, 19, C1NanoUnitVMC>], + (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>; + +def : InstRW<[C1NanoMCWrite<37, 35, C1NanoUnitVMC>], + (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>; + +def : InstRW<[C1NanoMCWrite<68, 66, C1NanoUnitVMC>], + (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>; + + +// Bitwise select +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>; + +// Count/reverse bits +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex 
"^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; +def : InstRW<[C1NanoMC2Write<6, 4, C1NanoUnitVALU>], (instregex "^CNT_ZPmZ_S")>; +def : InstRW<[C1NanoMC2Write<9, 7, C1NanoUnitVALU>], (instregex "^CNT_ZPmZ_D")>; +// Broadcast logical bitmask immediate to vector. +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs DUPM_ZI)>; + +// Compare and set flags +def : InstRW<[C1NanoMC2Write<5, 1, C1NanoUnitVALU>], + (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", + "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; + +// Complex add +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>; + +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>; + +// Complex dot product 8-bit element +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; + +// Complex dot product 16-bit element +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; + +// Complex multiply-add B, H, S element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]", + "^CMLA_ZZZI_[HS]")>; + +// Complex multiply-add D element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instrs CMLA_ZZZ_D)>; + +// Conditional extract operations, scalar form +def : InstRW<[C1NanoMC2Write<4, 4, C1NanoUnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>; + +// Conditional extract operations, SIMD&FP scalar and vector forms +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]", + "^COMPACT_ZPZ_[SD]", + "^SPLICE_ZPZZ?_[BHSD]")>; + +// Convert to floating point, 64b to float or convert to double +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>; + +// Convert to floating point, 64b to half +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>; + +// Convert to floating point, 32b to single or half +def : 
InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>; + +// Convert to floating point, 32b to double +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>; + +// Convert to floating point, 16b to half +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>; + +// Copy, scalar +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>],(instregex "^CPY_ZPmR_[BHSD]")>; + +// Copy, scalar SIMD&FP or imm +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]", + "^CPY_ZPzI_[BHSD]")>; + +// Divides, 32 bit +def : InstRW<[C1NanoMCWrite<15, 12, C1NanoUnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>; + +// Divides, 64 bit +def : InstRW<[C1NanoMCWrite<26, 23, C1NanoUnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>; + +// Dot product, 8 bit +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>, ReadVMACAccum], (instregex "^[SU]DOT_ZZZI?_BtoS")>; + +// Dot product, 8 bit, using signed and unsigned integers +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>, ReadVMACAccum], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; + +// Dot product, 16 bit +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>, ReadVMACAccum], (instregex "^[SU]DOT_ZZZI?_HtoD")>; + +// Duplicate, immediate and indexed form +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^DUP_ZI_[BHSD]", + "^DUP_ZZI_[BHSDQ]")>; + +// Duplicate, scalar form +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^DUP_ZR_[BHSD]")>; + +// Extend, sign or zero +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]", + "^[SU]XTH_ZPmZ_[SD]", + "^[SU]XTW_ZPmZ_[D]")>; + +// Extract +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instrs EXT_ZZI, EXT_ZZI_CONSTRUCTIVE, EXT_ZZI_B)>; + +// Extract narrow saturating +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]", + "^SQXTUN[BT]_ZZ_[BHS]")>; + +// Extract/insert operation, SIMD and FP scalar form +def : InstRW<[C1NanoWrite<4, 
C1NanoUnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]", + "^INSR_ZV_[BHSD]")>; + +// Extract/insert operation, scalar +def : InstRW<[C1NanoMC2Write<8, 4, C1NanoUnitVALU>], (instregex "^LAST[AB]_RPZ_[BHSD]")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^INSR_ZR_[BHSD]")>; + +// Histogram operations +def : InstRW<[C1NanoMCWrite<6, 4, C1NanoUnitVALU0>], (instregex "^HISTCNT_ZPzZZ_[SD]", + "^HISTSEG_ZZZ")>; + +// Horizontal operations, B, H, S form, immediate operands only +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^INDEX_II_[BHS]")>; + +// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar +// operands only / immediate, scalar operands +def : InstRW<[C1NanoMC2Write<4, 1, C1NanoUnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>; + +// Horizontal operations, D form, immediate operands only +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instrs INDEX_II_D)>; + +// Horizontal operations, D form, scalar, immediate operands / scalar operands +// only / immediate, scalar operands +def : InstRW<[C1NanoMC2Write<4, 1, C1NanoUnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>; + +// Logical +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], + (instregex "^(AND|EOR|ORR)_ZI", + "^(AND|BIC|EOR|EON|ORR|NAND|NOR)_ZZZ", + "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]", + "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>; + +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>; + +// Max/min, basic and pairwise +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]", + "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>; + +// Matching operations +def : InstRW<[C1NanoMC2Write<8, 4, C1NanoUnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>; + +// Matrix multiply-accumulate +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; + +// Move prefix +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]", + "^MOVPRFX_ZZ")>; + +// Multiply, B, H, S element 
size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]", + "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>; + +// Multiply, D element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D", + "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>; + +// Multiply long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]", + "^[SU]MULL[BT]_ZZZ_[HSD]")>; + +// Multiply accumulate, B, H, S element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]", + "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>; + +// Multiply accumulate, D element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D", + "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; + +// Multiply accumulate long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]", + "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>; + +// Multiply accumulate saturating doubling long regular +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]", + "^SQDML[AS](LB|LT)_ZZZI_[SD]")>; + +// Multiply saturating doubling high, B, H, S element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^SQDMULH_ZZZ_[BHS]", + "^SQDMULH_ZZZI_[HS]")>; + +// Multiply saturating doubling high, D element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>; + +// Multiply saturating doubling long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^SQDMULL[BT]_ZZZ_[HSD]", + "^SQDMULL[BT]_ZZZI_[SD]")>; + +// Multiply saturating rounding doubling regular/complex accumulate, B, H, S +// element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^SQRDML[AS]H_ZZZ_[BHS]", + "^SQRDCMLAH_ZZZ_[BHS]", + "^SQRDML[AS]H_ZZZI_[HS]", + "^SQRDCMLAH_ZZZI_[HS]")>; + +// Multiply saturating rounding doubling regular/complex accumulate, D element +// size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex 
"^SQRDML[AS]H_ZZZI?_D", + "^SQRDCMLAH_ZZZ_D")>; + +// Multiply saturating rounding doubling regular/complex, B, H, S element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^SQRDMULH_ZZZ_[BHS]", + "^SQRDMULH_ZZZI_[HS]")>; + +// Multiply saturating rounding doubling regular/complex, D element size +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>; + +// Multiply/multiply long, (8x8) polynomial +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^PMUL_ZZZ_B", "^PMULL[BT]_ZZZ_H")>; + +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitVMC>], (instregex "^PMULL[BT]_ZZZ_[DQ]")>; + + +// Predicate counting vector +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^(DEC|INC)[HWD]_ZPiI")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; + +// Reciprocal estimate +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; + +// Reduction, arithmetic, B form +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; + +// Reduction, arithmetic, H form +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; + +// Reduction, arithmetic, S form +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; + +// Reduction, arithmetic, D form +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU0>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>; + +// Reduction, logical +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>; + +// Reverse, vector +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^REV_ZZ_[BHSD]", + "^REVB_ZPmZ_[HSD]", + "^REVH_ZPmZ_[SD]", + "^REVW_ZPmZ_D")>; + +// Select, vector form +def : InstRW<[C1NanoWrite<2, C1NanoUnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>; + +// Table lookup +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^TBL_ZZZ_[BHSD]")>; + +// Table lookup, double 
table +def : InstRW<[C1NanoMC2Write<8, 5, C1NanoUnitVALU>], (instregex "^TBL_ZZZZ_[BHSD]")>; + +// Table lookup extension +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>; + +// Transpose, vector form +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; + +// Unpack and extend +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; + +// Zip/unzip +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; + +// SVE floating-point instructions +// ----------------------------------------------------------------------------- + +// Floating point absolute value/difference +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FAB[SD]_ZPmZ_[HSD]", + "^FAB[SD]_ZPZZ_[HSD]")>; + +// Floating point arithmetic +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]", + "^FADDP_ZPmZZ_[HSD]", + "^FNEG_ZPmZ_[HSD]", + "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>; + +// Floating point associative add, F16 +def : InstRW<[C1NanoMC2Write<32, 25, C1NanoUnitVALU>], (instrs FADDA_VPZ_H)>; + +// Floating point associative add, F32 +def : InstRW<[C1NanoMC2Write<16, 9, C1NanoUnitVALU>], (instrs FADDA_VPZ_S)>; + +// Floating point associative add, F64 +// RThoughput should be 5/2 but we cannot have fractional values so using 3 +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnitVALU>], (instrs FADDA_VPZ_D)>; + +// Floating point compare +def : InstRW<[C1NanoMC2Write<4, 1, C1NanoUnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]", + "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]", + "^FCM(LE|LT)_PPzZ0_[HSD]", + "^FCMUO_PPzZZ_[HSD]")>; + +// Floating point complex add +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>; + +// Floating point complex multiply add +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FCMLA_ZPmZZ_[HSD]", + "^FCMLA_ZZZI_[HS]")>; + +// Floating point convert, long or narrow (F16 to F32 
or F32 to F16) +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCVT_ZPmZ_(HtoS|StoH)", + "^FCVTLT_ZPmZ_HtoS", + "^FCVTNT_ZPmZ_StoH")>; + +// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 +// or F64 to F16) +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)", + "^FCVTLT_ZPmZ_StoD", + "^FCVTNT_ZPmZ_DtoS")>; + +// Floating point convert, round to odd +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "FCVTXNT_ZPmZ_DtoS")>; + +// Floating point base2 log, F16 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>; + +// Floating point base2 log, F32 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>; + +// Floating point base2 log, F64 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>; + +// Floating point convert to integer, F16 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>; + +// Floating point convert to integer, F32 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>; + +// Floating point convert to integer, F64 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], + (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>; + +// Floating point copy +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^FCPY_ZPmI_[HSD]", + "^FDUP_ZI_[HSD]")>; + +// Floating point divide, F16 +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>; + +// Floating point divide, F32 +def : InstRW<[C1NanoMCWrite<13, 10, C1NanoUnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>; + +// Floating point divide, F64 +def : InstRW<[C1NanoMCWrite<22, 19, C1NanoUnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>; + +// Floating point min/max pairwise +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>; + +// Floating point min/max +def : 
InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>; + +// Floating point multiply +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]", + "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>; + +// Floating point multiply accumulate +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], + (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]", + "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>; + +// Floating point multiply add/sub accumulate long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>; + +// Floating point reciprocal estimate, F16 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H", + "^FRSQRTE_ZZ_H")>; + +// Floating point reciprocal estimate, F32 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S", + "^FRSQRTE_ZZ_S")>; +// Floating point reciprocal estimate, F64 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>],(instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D", + "^FRSQRTE_ZZ_D")>; + +// Floating point reciprocal step +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; + +// Floating point reduction +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU0>], (instregex "^F(MAX|MIN)(NM)?V_VPZ_[HSD]")>; + +// Floating point reduction, F16 +def : InstRW<[C1NanoMCWrite<12, 5, C1NanoUnitVALU0>], (instregex "^FADDV_VPZ_H")>; + +// Floating point reduction, F32 +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnit2VALU0>], (instregex "^FADDV_VPZ_S")>; + +// Floating point reduction, F64 +def : InstRW<[C1NanoMCWrite<4, 1, C1NanoUnit2VALU0>], (instregex "^FADDV_VPZ_D")>; + + +// Floating point round to integral, F16 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>; + +// Floating point round to integral, F32 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>; + +// Floating point round to 
integral, F64 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>; + +// Floating point square root, F16 +def : InstRW<[C1NanoMCWrite<8, 5, C1NanoUnitVMC>], (instregex "^FSQRT_ZPmZ_H")>; + +// Floating point square root, F32 +def : InstRW<[C1NanoMCWrite<12, 9, C1NanoUnitVMC>], (instregex "^FSQRT_ZPmZ_S")>; + +// Floating point square root, F64 +def : InstRW<[C1NanoMCWrite<22, 19, C1NanoUnitVMC>], (instregex "^FSQRT_ZPmZ_D")>; + +// Floating point trigonometric exponentiation +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>; + +// Floating point trigonometric multiply add +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>; + +// Floating point trigonometric, miscellaneous +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>; +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; + + +// SVE BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// Convert, F32 to BF16 +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; + +// Dot product +def : InstRW<[C1NanoWrite_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; + +// Matrix multiply accumulate +def : InstRW<[C1NanoWrite_14cyc_1VMAC_1VALU_B], (instregex "^BFMMLA_ZZZ")>; + +// Multiply accumulate long +def : InstRW<[C1NanoWrite<4, C1NanoUnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>; + +// SVE Load instructions +// ----------------------------------------------------------------------------- + +// Load vector +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instrs LDR_ZXI)>; + +// Load predicate +def : InstRW<[C1NanoWrite<3, C1NanoUnitLdSt>], (instrs LDR_PXI)>; + +// Contiguous load, scalar + imm +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LD1[BHWD]_IMM$", + "^LD1S?B_[HSD]_IMM$", + "^LD1S?H_[SD]_IMM$", + "^LD1S?W_D_IMM$" )>; +// Contiguous load, scalar + 
scalar +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LD1[BHWD]$", + "^LD1S?B_[HSD]$", + "^LD1S?H_[SD]$", + "^LD1S?W_D$" )>; + +// Contiguous load broadcast, scalar + imm +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LD1R[BHWD]_IMM$", + "^LD1RSW_IMM$", + "^LD1RS?B_[HSD]_IMM$", + "^LD1RS?H_[SD]_IMM$", + "^LD1RS?W_D_IMM$", + "^LD1RQ_[BHWD]_IMM$")>; + +// Contiguous load broadcast, scalar + scalar +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LD1RQ_[BHWD]$")>; + +// Non temporal load, scalar + imm +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LDNT1[BHWD]_ZRI$")>; + +// Non temporal load, scalar + scalar +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LDNT1[BHWD]_ZRR$")>; + +// Non temporal gather load, vector + scalar 32-bit element size +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitLdSt>], (instregex "^LDNT1[BHW]_ZZR_S$", + "^LDNT1S[BH]_ZZR_S$")>; + +// Non temporal gather load, vector + scalar 64-bit element size +def : InstRW<[C1NanoMCWrite<7, 6, C1NanoUnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D$")>; +def : InstRW<[C1NanoMCWrite<7, 6, C1NanoUnitLdSt>], (instrs LDNT1D_ZZR_D)>; + +// Contiguous first faulting load, scalar + scalar +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LDFF1[BHWD]$", + "^LDFF1S?B_[HSD]$", + "^LDFF1S?H_[SD]$", + "^LDFF1S?W_D$")>; + +// Contiguous non faulting load, scalar + imm +def : InstRW<[C1NanoMC2Write<3, 1, C1NanoUnitLd>], (instregex "^LDNF1[BHWD]_IMM$", + "^LDNF1S?B_[HSD]_IMM$", + "^LDNF1S?H_[SD]_IMM$", + "^LDNF1S?W_D_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + imm +def : InstRW<[C1NanoWrite<3, C1NanoUnitLdSt>], (instregex "^LD2[BHWD]_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + scalar +def : InstRW<[C1NanoMCWrite<3, 2, C1NanoUnitLdSt>], (instregex "^LD2[BHWD]$")>; + +// Contiguous Load three structures to three vectors, scalar + imm +def : InstRW<[C1NanoMCWrite<5, 3, 
C1NanoUnitLdSt>], (instregex "^LD3[BHWD]_IMM$")>; + +// Contiguous Load three structures to three vectors, scalar + scalar +def : InstRW<[C1NanoMCWrite<5, 4, C1NanoUnitLdSt>], (instregex "^LD3[BHWD]$")>; + +// Contiguous Load four structures to four vectors, scalar + imm +def : InstRW<[C1NanoMCWrite<5, 3, C1NanoUnitLdSt>], (instregex "^LD4[BHWD]_IMM$")>; + +// Contiguous Load four structures to four vectors, scalar + scalar +def : InstRW<[C1NanoMCWrite<5, 4, C1NanoUnitLdSt>], (instregex "^LD4[BHWD]$")>; + +// Gather load, vector + imm, 32-bit element size +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_IMM$", + "^GLD(FF)?1W_IMM$")>; + +// Gather load, vector + imm, 64-bit element size +def : InstRW<[C1NanoMCWrite<7, 6, C1NanoUnitLdSt>], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$", + "^GLD(FF)?1D_IMM$")>; + +// Gather load, 64-bit element size +def : InstRW<[C1NanoMCWrite<7, 6, C1NanoUnitLdSt>], + (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW(_SCALED)?$", + "^GLD(FF)?1S?[BHW]_D(_SCALED)?$", + "^GLD(FF)?1D_[SU]XTW(_SCALED)?$", + "^GLD(FF)?1D(_SCALED)?$")>; + +// Gather load, 32-bit scaled offset +def : InstRW<[C1NanoMCWrite<7, 7, C1NanoUnitLdSt>], + (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$", + "^GLD(FF)?1W_[SU]XTW_SCALED")>; + +// Gather load, 32-bit unpacked unscaled offset +def : InstRW<[C1NanoMCWrite<7, 6, C1NanoUnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", + "^GLD(FF)?1W_[SU]XTW$")>; + +def : InstRW<[C1NanoWrite<0, C1NanoUnitVALU>], (instregex "^PRF(B|H|W|D).*")>; +// SVE Store instructions +// ----------------------------------------------------------------------------- + +// Store from predicate reg +def : InstRW<[C1NanoVSt0], (instrs STR_PXI)>; + +// Store from vector reg +def : InstRW<[C1NanoVSt0], (instrs STR_ZXI)>; + +// Contiguous store, scalar + imm +def : InstRW<[C1NanoVSt0], (instregex "^ST1[BHWD]_IMM$", + "^ST1B_[HSD]_IMM$", + "^ST1H_[SD]_IMM$", + "^ST1W_D_IMM$")>; + +// Contiguous store, scalar + scalar +def : 
InstRW<[C1NanoVSt0], (instregex "^ST1H(_[SD])?$")>; +def : InstRW<[C1NanoVSt0], (instregex "^ST1[BWD]$", + "^ST1B_[HSD]$", + "^ST1W_D$")>; + +// Contiguous store two structures from two vectors, scalar + imm +def : InstRW<[C1NanoVSt<2>], (instregex "^ST2[BHWD]_IMM$")>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[C1NanoVSt<2>], (instrs ST2H)>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[C1NanoVSt<2>], (instregex "^ST2[BWD]$")>; + +// Contiguous store three structures from three vectors, scalar + imm +def : InstRW<[C1NanoVSt<6>], (instregex "^ST3[BHW]_IMM$")>; +def : InstRW<[C1NanoVSt<3>], (instregex "^ST3D_IMM$")>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[C1NanoVSt<6>], (instregex "^ST3[BHW]$")>; +def : InstRW<[C1NanoVSt<3>], (instregex "^ST3D$")>; + +// Contiguous store four structures from four vectors, scalar + imm +def : InstRW<[C1NanoVSt<8>], (instregex "^ST4[BHW]_IMM$")>; +def : InstRW<[C1NanoVSt<4>], (instregex "^ST4D_IMM$")>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[C1NanoVSt<8>], (instregex "^ST4[BHW]$")>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[C1NanoVSt<4>], (instregex "^ST4D$")>; + +// Non temporal store, scalar + imm +def : InstRW<[C1NanoVSt0], (instregex "^STNT1[BHWD]_ZRI$")>; + +// Non temporal store, scalar + scalar +def : InstRW<[C1NanoVSt0], (instrs STNT1H_ZRR)>; +def : InstRW<[C1NanoVSt0], (instregex "^STNT1[BWD]_ZRR$")>; + +// Scatter non temporal store, vector + scalar 32-bit element size +def : InstRW<[C1NanoVSt<9>], (instregex "^STNT1[BHW]_ZZR_S")>; + +// Scatter non temporal store, vector + scalar 64-bit element size +def : InstRW<[C1NanoVSt<7>], (instregex "^STNT1[BHWD]_ZZR_D")>; + +// Scatter store vector + imm 32-bit element size +def : InstRW<[C1NanoVSt<9>], (instregex "^SST1[BH]_S_IMM$", + "^SST1W_IMM$")>; + 
+// Scatter store vector + imm 64-bit element size +def : InstRW<[C1NanoVSt<7>], (instregex "^SST1[BHW]_D_IMM$", + "^SST1D_IMM$")>; + +// Scatter store, 32-bit scaled offset +def : InstRW<[C1NanoVSt<9>], + (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unpacked unscaled offset +def : InstRW<[C1NanoVSt<7>], (instregex "^SST1[BHW]_D_[SU]XTW$", + "^SST1D_[SU]XTW$")>; + +// Scatter store, 32-bit unpacked scaled offset +def : InstRW<[C1NanoVSt<7>], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", + "^SST1D_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unscaled offset +def : InstRW<[C1NanoVSt<9>], (instregex "^SST1[BH]_S_[SU]XTW$", + "^SST1W_[SU]XTW$")>; + +// Scatter store, 64-bit scaled offset +def : InstRW<[C1NanoVSt<7>], (instregex "^SST1[HW]_D_SCALED$", + "^SST1D_SCALED$")>; + +// Scatter store, 64-bit unscaled offset +def : InstRW<[C1NanoVSt<7>], (instregex "^SST1[BHW]_D$", + "^SST1D$")>; + +// SVE Miscellaneous instructions +// ----------------------------------------------------------------------------- + +// Read first fault register, unpredicated +def : InstRW<[C1NanoWrite<1, C1NanoUnitLdSt>], (instrs RDFFR_P)>; + +// Read first fault register, predicated +def : InstRW<[C1NanoMCWrite<3, 1, C1NanoUnitLdSt>], (instrs RDFFR_PPz)>; + +// Read first fault register and set flags +def : InstRW<[C1NanoMCWrite<3, 1, C1NanoUnitLdSt>], (instrs RDFFRS_PPz)>; + +// Set first fault register +// Write to first fault register +def : InstRW<[C1NanoWrite<1, C1NanoUnitLdSt>], (instrs SETFFR, WRFFR)>; + +// SVE Cryptographic instructions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^AES[DE]_ZZZ_B$", + "^AESI?MC_ZZ_B$")>; + +// Crypto SHA3 ops +def : InstRW<[C1NanoWrite<3, C1NanoUnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$")>; +def : InstRW<[C1NanoWrite<4, C1NanoUnitVALU>], (instregex "^XAR_ZZZI_[BHSD]$")>; + +def : InstRW<[C1NanoMC_RC0Write<3, 
C1NanoUnitVALU>], (instregex "^RAX1_ZZZ_D$")>; + +// Crypto SM4 ops +def : InstRW<[C1NanoMCWrite<9, 7, C1NanoUnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; + +} diff --git a/llvm/lib/Target/AArch64/AArch64SchedC1Premium.td b/llvm/lib/Target/AArch64/AArch64SchedC1Premium.td index 5a8de63618cee..19a9dc75d560f 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedC1Premium.td +++ b/llvm/lib/Target/AArch64/AArch64SchedC1Premium.td @@ -23,7 +23,8 @@ def C1PremiumModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, [HasSVE2p1, HasSVEB16B16, - HasCPA, HasCSSC]); + HasCPA, HasCSSC, + HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedC1Ultra.td b/llvm/lib/Target/AArch64/AArch64SchedC1Ultra.td index ec23324008b6f..4ba79ce3df3cf 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedC1Ultra.td +++ b/llvm/lib/Target/AArch64/AArch64SchedC1Ultra.td @@ -23,7 +23,8 @@ def C1UltraModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, [HasSVE2p1, HasSVEB16B16, - HasCPA, HasCSSC]); + HasCPA, HasMTE, HasCSSC, + HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 48324654949c0..2340e7dc48771 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -21,7 +21,7 @@ def CycloneModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index 6fc4ec3ae41b7..3644617928dc2 100644 --- 
a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -27,7 +27,7 @@ def ExynosM3Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index b75264602dbc1..e72d91101582a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -27,7 +27,7 @@ def ExynosM4Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 6b5a6da76b3a8..35a787bdd8fc7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -27,7 +27,7 @@ def ExynosM5Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index e9172e82b099d..778a2621618a7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -26,7 +26,7 @@ def FalkorModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); // FIXME: 
Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index 258b34c38898c..eec8f37906fd4 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -30,7 +30,7 @@ def KryoModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td index 3b101e0b7655e..8847f32a9da9a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td @@ -27,7 +27,7 @@ def NeoverseN1Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(PAUnsupported.F, SMEUnsupported.F, SVEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index afa4e4744a997..39987b81ee9a4 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel { let CompleteModel = 1; list UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasSVEB16B16, HasPAuthLR, HasCPA, HasCSSC]); + [HasSVE2p1, HasSVEB16B16, HasPAuthLR, HasCPA, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td index 3759dc99cea9f..ad1f62ad6aa53 100644 --- 
a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td @@ -19,7 +19,7 @@ def NeoverseN3Model : SchedMachineModel { let CompleteModel = 1; list UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasSVEB16B16, HasPAuthLR, HasCPA, HasCSSC]); + [HasSVE2p1, HasSVEB16B16, HasPAuthLR, HasCPA, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index 6c40304e7cd6b..e2f6a6cc85bcc 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -31,7 +31,7 @@ def NeoverseV1Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVE2Unsupported.F, SMEUnsupported.F, [HasMTE, HasCPA, - HasCSSC]); + HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index 275bbaf5fba4c..46c41660604f3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -25,7 +25,7 @@ def NeoverseV2Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, [HasSVE2p1, HasSVEB16B16, - HasCPA, HasCSSC]); + HasCPA, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td index 3e210f088c45b..b6e5bc308ceb6 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td @@ -23,7 +23,7 @@ def NeoverseV3Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, [HasSVE2p1, HasSVEB16B16, - 
HasCPA, HasCSSC]); + HasCPA, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td index 470385ae23c93..606e2a3fd1c38 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td @@ -23,7 +23,7 @@ def NeoverseV3AEModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, [HasSVE2p1, HasSVEB16B16, - HasCPA, HasCSSC]); + HasCPA, HasCSSC, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedOlympus.td b/llvm/lib/Target/AArch64/AArch64SchedOlympus.td index 9d843a9490618..e4854aa0a1899 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedOlympus.td +++ b/llvm/lib/Target/AArch64/AArch64SchedOlympus.td @@ -26,7 +26,7 @@ def OlympusModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SMEUnsupported.F, [HasSVE2p1, HasSVEB16B16, HasCPA, HasCSSC, - HasMatMulFP64]); + HasMatMulFP64, HasMOPS, HasMOPS_GO]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td index 435eaf99c6175..e113bf4525274 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedOryon.td +++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td @@ -27,7 +27,7 @@ def OryonModel : SchedMachineModel { SMEUnsupported.F, MTEUnsupported.F, PAUnsupported.F, - [HasPAuth, HasCSSC]); + [HasPAuth, HasCSSC, HasMOPS, HasMOPS_GO]); } let SchedModel = OryonModel in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index ee6c7e72bae29..2b97aef92574b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td 
@@ -61,13 +61,14 @@ let FunctionMapper = "AArch64_AM::getShiftType" in { // Generic predicates. // Check for ZR in a register operand. -foreach I = {1-3} in { +foreach I = {0-3} in { def CheckIsReg#I#Zero : CheckAll< [CheckIsRegOperand, CheckAny< [CheckRegOperand, CheckRegOperand]>]>; } +def IsReg0ZeroPred : MCSchedPredicate; def IsReg1ZeroPred : MCSchedPredicate; def IsReg2ZeroPred : MCSchedPredicate; def IsReg3ZeroPred : MCSchedPredicate; @@ -350,6 +351,45 @@ def IsRORImmIdiomPred : MCSchedPredicate< // EXTR Rd, Rs, Rs, #Imm CheckAll<[CheckOpcode<[EXTRWrri, EXTRXrri]>, CheckSameRegOperand<1, 2>]>>; +// LSL Rd, Rn, #s alias of UBFM Rd, Rn, #(N-s), #(N-s-1) +class IsUBFMLSLImm : CheckAll<[ + CheckOpcode<[Opc]>, + CheckAny, + CheckImmOperand<3, !sub(!sub(Width, Shift), 1)>]>)>]>; + +// LSR Rd, Rn, #s alias of UBFM Rd, Rn, #s, #(N-1) +class IsUBFMLSRImm : CheckAll<[ + CheckOpcode<[Opc]>, + CheckImmOperandRange<2, 0, !sub(Width, 1)>, + CheckImmOperand<3, !sub(Width, 1)>]>; + +// UXTB Wd, Wn alias of UBFM Wd, Wn, #0, #7 +class IsUBFMUXTBImm : CheckAll<[ + CheckOpcode<[Opc]>, + CheckImmOperand<2, 0>, + CheckImmOperand<3, 7>]>; + +// ASR Rd, Rn, #s alias of SBFM Rd, Rn, #s, #(N-1) +class IsSBFMASRImm : CheckAll<[ + CheckOpcode<[Opc]>, + CheckImmOperandRange<2, 0, !sub(Width, 1)>, + CheckImmOperand<3, !sub(Width, 1)>]>; + +// Identify UBFM as the alias for LSL/LSR/UXTB (immediate). +def IsFastUBFMImmPred : MCSchedPredicate, + IsUBFMLSLImm, + IsUBFMLSRImm, + IsUBFMLSRImm, + IsUBFMUXTBImm, + IsUBFMUXTBImm ]>>; + +// Identify SBFM as the alias for ASR (immediate). +def IsFastSBFMImmPred : MCSchedPredicate, + IsSBFMASRImm ]>>; + // Identify whether destination operand is a W-form register. 
def CheckIsWRegOp0 : CheckAll<[ CheckIsRegOperand<0>, diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 1c577a25bf739..160f51f63f197 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -27,7 +27,7 @@ def TSV110Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); } // Define each kind of processor resource and number available on the TSV110, diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index 8df3f56e45738..e9f2d7618e199 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -28,7 +28,7 @@ def ThunderXT8XModel : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index ef4baa3dedff9..5d2adb938d687 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -28,7 +28,7 @@ def ThunderX2T99Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } @@ -1865,4 +1865,3 @@ def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; } // SchedModel = ThunderX2T99Model - diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td index 796bd4b8b5c9a..80f37d306a48a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -27,7 +27,7 @@ def ThunderX3T110Model : SchedMachineModel { list UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, SMEUnsupported.F, - [HasMTE, HasCSSC]); + [HasMTE, HasCSSC, HasMOPS, HasMOPS_GO]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64Schedule.td b/llvm/lib/Target/AArch64/AArch64Schedule.td index b8572c9b45723..06dfcb4154404 100644 --- a/llvm/lib/Target/AArch64/AArch64Schedule.td +++ b/llvm/lib/Target/AArch64/AArch64Schedule.td @@ -94,3 +94,7 @@ def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteVq, WriteVq]>; // Store a shuffled vector. 
def WriteVSTShuffle : WriteSequence<[WriteVq, WriteVST]>; def WriteVSTPairShuffle : WriteSequence<[WriteVq, WriteVq, WriteVST]>; + +// SIMD MAC forwarding reads +def ReadVMAC : SchedRead; // SIMD MAC operand read +def ReadVMACAccum : SchedRead; // SIMD MAC accumulator/destination read diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-basic-instructions.s new file mode 100644 index 0000000000000..ad19cad01f7c5 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-basic-instructions.s @@ -0,0 +1,2512 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/basic-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 add w2, w3, #4095 +# CHECK-NEXT: 1 1 0.50 add w30, w29, #1, lsl #12 +# CHECK-NEXT: 1 1 0.50 add w13, w5, #4095, lsl #12 +# CHECK-NEXT: 1 1 0.50 add x5, x7, #1638 +# CHECK-NEXT: 1 1 0.50 add w20, wsp, #801 +# CHECK-NEXT: 1 1 0.50 add wsp, wsp, #1104 +# CHECK-NEXT: 1 1 0.50 add wsp, w30, #4084 +# CHECK-NEXT: 1 1 0.50 add x0, x24, #291 +# CHECK-NEXT: 1 1 0.50 add x3, x24, #4095, lsl #12 +# CHECK-NEXT: 1 1 0.50 add x8, sp, #1074 +# CHECK-NEXT: 1 1 0.50 add sp, x29, #3816 +# CHECK-NEXT: 1 1 0.50 sub w0, wsp, #4077 +# CHECK-NEXT: 1 1 0.50 sub w4, w20, #546, lsl #12 +# CHECK-NEXT: 1 1 0.50 sub sp, sp, #288 +# CHECK-NEXT: 1 1 0.50 sub wsp, w19, #16 +# CHECK-NEXT: 1 1 0.50 adds w13, w23, #291, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmn w2, #4095 +# CHECK-NEXT: 1 1 0.50 adds w20, wsp, #0 +# CHECK-NEXT: 1 1 0.50 cmn x3, #1, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmp wsp, #2342 +# CHECK-NEXT: 1 1 0.50 cmp sp, #20, lsl #12 +# 
CHECK-NEXT: 1 1 0.50 cmp x30, #4095 +# CHECK-NEXT: 1 1 0.50 subs x4, sp, #3822 +# CHECK-NEXT: 1 1 0.50 cmn w3, #291, lsl #12 +# CHECK-NEXT: 1 1 0.50 cmn wsp, #1365 +# CHECK-NEXT: 1 1 0.50 cmn sp, #1092, lsl #12 +# CHECK-NEXT: 1 1 0.50 mov x10, #-63432 +# CHECK-NEXT: 1 1 0.50 add wsp, wsp, w10 +# CHECK-NEXT: 1 1 0.50 add x25, x9, w25, uxtb +# CHECK-NEXT: 1 1 0.50 add w3, w5, w7 +# CHECK-NEXT: 1 1 0.50 add wzr, w3, w5 +# CHECK-NEXT: 1 1 0.50 add w20, wzr, w4 +# CHECK-NEXT: 1 1 0.50 add w4, w6, wzr +# CHECK-NEXT: 1 1 0.50 add w11, w13, w15 +# CHECK-NEXT: 1 1 0.50 add w9, w3, wzr, lsl #1 +# CHECK-NEXT: 1 1 0.50 add w17, w29, w20, lsl #31 +# CHECK-NEXT: 1 1 0.50 add w21, w22, w23, lsr #0 +# CHECK-NEXT: 1 1 0.50 add w24, w25, w26, lsr #18 +# CHECK-NEXT: 1 1 0.50 add w27, w28, w29, lsr #31 +# CHECK-NEXT: 1 1 0.50 add w2, w3, w4, asr #0 +# CHECK-NEXT: 1 1 0.50 add w5, w6, w7, asr #21 +# CHECK-NEXT: 1 1 0.50 add w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.50 add x3, x5, x7 +# CHECK-NEXT: 1 1 0.50 add xzr, x3, x5 +# CHECK-NEXT: 1 1 0.50 add x20, xzr, x4 +# CHECK-NEXT: 1 1 0.50 add x4, x6, xzr +# CHECK-NEXT: 1 1 0.50 add x11, x13, x15 +# CHECK-NEXT: 1 1 0.50 add x9, x3, xzr, lsl #10 +# CHECK-NEXT: 1 1 0.50 add x17, x29, x20, lsl #3 +# CHECK-NEXT: 1 1 0.50 add x21, x22, x23, lsr #0 +# CHECK-NEXT: 1 1 0.50 add x24, x25, x26, lsr #18 +# CHECK-NEXT: 1 1 0.50 add x27, x28, x29, lsr #63 +# CHECK-NEXT: 1 1 0.50 add x2, x3, x4, asr #0 +# CHECK-NEXT: 1 1 0.50 add x5, x6, x7, asr #21 +# CHECK-NEXT: 1 1 0.50 add x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.50 adds w3, w5, w7 +# CHECK-NEXT: 1 1 0.50 adds w17, wsp, w25 +# CHECK-NEXT: 1 1 0.50 adds x13, x23, w8, uxtb +# CHECK-NEXT: 1 1 0.50 cmn w3, w5 +# CHECK-NEXT: 1 1 0.50 adds w20, wzr, w4 +# CHECK-NEXT: 1 1 0.50 adds w4, w6, wzr +# CHECK-NEXT: 1 1 0.50 adds w11, w13, w15 +# CHECK-NEXT: 1 1 0.50 adds w9, w3, wzr, lsl #1 +# CHECK-NEXT: 1 1 0.50 adds w17, w29, w20, lsl #31 +# CHECK-NEXT: 1 1 0.50 adds w21, w22, w23, lsr #0 +# CHECK-NEXT: 1 1 
0.50 adds w24, w25, w26, lsr #18 +# CHECK-NEXT: 1 1 0.50 adds w27, w28, w29, lsr #31 +# CHECK-NEXT: 1 1 0.50 adds w2, w3, w4, asr #0 +# CHECK-NEXT: 1 1 0.50 adds w5, w6, w7, asr #21 +# CHECK-NEXT: 1 1 0.50 adds w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.50 adds x3, x5, x7 +# CHECK-NEXT: 1 1 0.50 cmn x3, x5 +# CHECK-NEXT: 1 1 0.50 adds x20, xzr, x4 +# CHECK-NEXT: 1 1 0.50 adds x4, x6, xzr +# CHECK-NEXT: 1 1 0.50 adds x11, x13, x15 +# CHECK-NEXT: 1 1 0.50 adds x9, x3, xzr, lsl #10 +# CHECK-NEXT: 1 1 0.50 adds x17, x29, x20, lsl #3 +# CHECK-NEXT: 1 1 0.50 adds x21, x22, x23, lsr #0 +# CHECK-NEXT: 1 1 0.50 adds x24, x25, x26, lsr #18 +# CHECK-NEXT: 1 1 0.50 adds x27, x28, x29, lsr #63 +# CHECK-NEXT: 1 1 0.50 adds x2, x3, x4, asr #0 +# CHECK-NEXT: 1 1 0.50 adds x5, x6, x7, asr #21 +# CHECK-NEXT: 1 1 0.50 adds x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.50 sub w3, w5, w7 +# CHECK-NEXT: 1 1 0.50 sub wzr, w3, w5 +# CHECK-NEXT: 1 1 0.50 sub w4, w6, wzr +# CHECK-NEXT: 1 1 0.50 sub w11, w13, w15 +# CHECK-NEXT: 1 1 0.50 sub w9, w3, wzr, lsl #1 +# CHECK-NEXT: 1 1 0.50 sub w17, w29, w20, lsl #31 +# CHECK-NEXT: 1 1 0.50 sub w21, w22, w23, lsr #0 +# CHECK-NEXT: 1 1 0.50 sub w24, w25, w26, lsr #18 +# CHECK-NEXT: 1 1 0.50 sub w27, w28, w29, lsr #31 +# CHECK-NEXT: 1 1 0.50 sub w2, w3, w4, asr #0 +# CHECK-NEXT: 1 1 0.50 sub w5, w6, w7, asr #21 +# CHECK-NEXT: 1 1 0.50 sub w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.50 sub x3, x5, x7 +# CHECK-NEXT: 1 1 0.50 sub xzr, x3, x5 +# CHECK-NEXT: 1 1 0.50 sub x4, x6, xzr +# CHECK-NEXT: 1 1 0.50 sub x11, x13, x15 +# CHECK-NEXT: 1 1 0.50 sub x9, x3, xzr, lsl #10 +# CHECK-NEXT: 1 1 0.50 sub x17, x29, x20, lsl #3 +# CHECK-NEXT: 1 1 0.50 sub x21, x22, x23, lsr #0 +# CHECK-NEXT: 1 1 0.50 sub x24, x25, x26, lsr #18 +# CHECK-NEXT: 1 1 0.50 sub x27, x28, x29, lsr #63 +# CHECK-NEXT: 1 1 0.50 sub x2, x3, x4, asr #0 +# CHECK-NEXT: 1 1 0.50 sub x5, x6, x7, asr #21 +# CHECK-NEXT: 1 1 0.50 sub x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.50 sub w13, wsp, w10 +# 
CHECK-NEXT: 1 1 0.50 sub x16, x2, w19, uxtb +# CHECK-NEXT: 1 1 0.50 subs x13, x15, x14, sxtx #1 +# CHECK-NEXT: 1 1 0.50 subs w3, w5, w7 +# CHECK-NEXT: 1 1 0.50 cmp w3, w5 +# CHECK-NEXT: 1 1 0.50 subs w4, w6, wzr +# CHECK-NEXT: 1 1 0.50 subs w11, w13, w15 +# CHECK-NEXT: 1 1 0.50 subs w9, w3, wzr, lsl #1 +# CHECK-NEXT: 1 1 0.50 subs w17, w29, w20, lsl #31 +# CHECK-NEXT: 1 1 0.50 subs w21, w22, w23, lsr #0 +# CHECK-NEXT: 1 1 0.50 subs w24, w25, w26, lsr #18 +# CHECK-NEXT: 1 1 0.50 subs w27, w28, w29, lsr #31 +# CHECK-NEXT: 1 1 0.50 subs w2, w3, w4, asr #0 +# CHECK-NEXT: 1 1 0.50 subs w5, w6, w7, asr #21 +# CHECK-NEXT: 1 1 0.50 subs w8, w9, w10, asr #31 +# CHECK-NEXT: 1 1 0.50 subs x3, x5, x7 +# CHECK-NEXT: 1 1 0.50 cmp x3, x5 +# CHECK-NEXT: 1 1 0.50 subs x4, x6, xzr +# CHECK-NEXT: 1 1 0.50 subs x11, x13, x15 +# CHECK-NEXT: 1 1 0.50 subs x9, x3, xzr, lsl #10 +# CHECK-NEXT: 1 1 0.50 subs x17, x29, x20, lsl #3 +# CHECK-NEXT: 1 1 0.50 subs x21, x22, x23, lsr #0 +# CHECK-NEXT: 1 1 0.50 subs x24, x25, x26, lsr #18 +# CHECK-NEXT: 1 1 0.50 subs x27, x28, x29, lsr #63 +# CHECK-NEXT: 1 1 0.50 subs x2, x3, x4, asr #0 +# CHECK-NEXT: 1 1 0.50 subs x5, x6, x7, asr #21 +# CHECK-NEXT: 1 1 0.50 subs x8, x9, x10, asr #63 +# CHECK-NEXT: 1 1 0.50 cmn wzr, w4 +# CHECK-NEXT: 1 1 0.50 cmn w5, wzr +# CHECK-NEXT: 1 1 0.50 cmn w6, w7 +# CHECK-NEXT: 1 1 0.50 cmn w8, w9, lsl #1 +# CHECK-NEXT: 1 1 0.50 cmn w10, w11, lsl #31 +# CHECK-NEXT: 1 1 0.50 cmn w12, w13, lsr #0 +# CHECK-NEXT: 1 1 0.50 cmn w14, w15, lsr #21 +# CHECK-NEXT: 1 1 0.50 cmn w16, w17, lsr #31 +# CHECK-NEXT: 1 1 0.50 cmn w18, w19, asr #0 +# CHECK-NEXT: 1 1 0.50 cmn w20, w21, asr #22 +# CHECK-NEXT: 1 1 0.50 cmn w22, w23, asr #31 +# CHECK-NEXT: 1 1 0.50 cmn x0, x3 +# CHECK-NEXT: 1 1 0.50 cmn xzr, x4 +# CHECK-NEXT: 1 1 0.50 cmn x5, xzr +# CHECK-NEXT: 1 1 0.50 cmn x6, x7 +# CHECK-NEXT: 1 1 0.50 cmn x8, x9, lsl #15 +# CHECK-NEXT: 1 1 0.50 cmn x10, x11, lsl #3 +# CHECK-NEXT: 1 1 0.50 cmn x12, x13, lsr #0 +# CHECK-NEXT: 1 1 0.50 cmn x14, 
x15, lsr #41 +# CHECK-NEXT: 1 1 0.50 cmn x16, x17, lsr #63 +# CHECK-NEXT: 1 1 0.50 cmn x18, x19, asr #0 +# CHECK-NEXT: 1 1 0.50 cmn x20, x21, asr #55 +# CHECK-NEXT: 1 1 0.50 cmn x22, x23, asr #63 +# CHECK-NEXT: 1 1 0.50 cmp w0, w3 +# CHECK-NEXT: 1 1 0.50 cmp wzr, w4 +# CHECK-NEXT: 1 1 0.50 cmp w5, wzr +# CHECK-NEXT: 1 1 0.50 cmp w6, w7 +# CHECK-NEXT: 1 1 0.50 cmp w8, w9, lsl #1 +# CHECK-NEXT: 1 1 0.50 cmp w10, w11, lsl #31 +# CHECK-NEXT: 1 1 0.50 cmp w12, w13, lsr #0 +# CHECK-NEXT: 1 1 0.50 cmp w14, w15, lsr #21 +# CHECK-NEXT: 1 1 0.50 cmp w18, w19, asr #0 +# CHECK-NEXT: 1 1 0.50 cmp w20, w21, asr #22 +# CHECK-NEXT: 1 1 0.50 cmp w22, w23, asr #31 +# CHECK-NEXT: 1 1 0.50 cmp wsp, w26 +# CHECK-NEXT: 1 1 0.50 cmp x16, w27, uxtb +# CHECK-NEXT: 1 1 0.50 cmp x0, x3 +# CHECK-NEXT: 1 1 0.50 cmp xzr, x4 +# CHECK-NEXT: 1 1 0.50 cmp x5, xzr +# CHECK-NEXT: 1 1 0.50 cmp x6, x7 +# CHECK-NEXT: 1 1 0.50 cmp x8, x9, lsl #15 +# CHECK-NEXT: 1 1 0.50 cmp x10, x11, lsl #3 +# CHECK-NEXT: 1 1 0.50 cmp x12, x13, lsr #0 +# CHECK-NEXT: 1 1 0.50 cmp x14, x15, lsr #41 +# CHECK-NEXT: 1 1 0.50 cmp x16, x17, lsr #63 +# CHECK-NEXT: 1 1 0.50 cmp x18, x19, asr #0 +# CHECK-NEXT: 1 1 0.50 cmp x20, x21, asr #55 +# CHECK-NEXT: 1 1 0.50 cmp x22, x23, asr #63 +# CHECK-NEXT: 1 1 0.50 cmp wzr, w0 +# CHECK-NEXT: 1 1 0.50 cmp xzr, x0 +# CHECK-NEXT: 1 1 0.50 mov sp, x30 +# CHECK-NEXT: 1 1 0.50 mov wsp, w20 +# CHECK-NEXT: 1 1 0.50 mov x11, sp +# CHECK-NEXT: 1 1 0.50 mov w24, wsp +# CHECK-NEXT: 1 1 0.50 adc w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 adc wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 adc w9, wzr, w10 +# CHECK-NEXT: 1 1 0.50 adc w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 adc x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 adc xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 adc x9, xzr, x10 +# CHECK-NEXT: 1 1 0.50 adc x20, x0, xzr +# CHECK-NEXT: 1 1 1.00 adcs w29, w27, w25 +# CHECK-NEXT: 1 1 1.00 adcs wzr, w3, w4 +# CHECK-NEXT: 1 1 1.00 adcs w9, wzr, w10 +# CHECK-NEXT: 1 1 1.00 adcs w20, w0, wzr +# CHECK-NEXT: 1 1 1.00 adcs x29, x27, x25 +# 
CHECK-NEXT: 1 1 1.00 adcs xzr, x3, x4 +# CHECK-NEXT: 1 1 1.00 adcs x9, xzr, x10 +# CHECK-NEXT: 1 1 1.00 adcs x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 sbc w29, w27, w25 +# CHECK-NEXT: 1 1 0.50 sbc wzr, w3, w4 +# CHECK-NEXT: 1 1 0.50 ngc w9, w10 +# CHECK-NEXT: 1 1 0.50 sbc w20, w0, wzr +# CHECK-NEXT: 1 1 0.50 sbc x29, x27, x25 +# CHECK-NEXT: 1 1 0.50 sbc xzr, x3, x4 +# CHECK-NEXT: 1 1 0.50 ngc x9, x10 +# CHECK-NEXT: 1 1 0.50 sbc x20, x0, xzr +# CHECK-NEXT: 1 1 1.00 sbcs w29, w27, w25 +# CHECK-NEXT: 1 1 1.00 sbcs wzr, w3, w4 +# CHECK-NEXT: 1 1 1.00 ngcs w9, w10 +# CHECK-NEXT: 1 1 1.00 sbcs w20, w0, wzr +# CHECK-NEXT: 1 1 1.00 sbcs x29, x27, x25 +# CHECK-NEXT: 1 1 1.00 sbcs xzr, x3, x4 +# CHECK-NEXT: 1 1 1.00 ngcs x9, x10 +# CHECK-NEXT: 1 1 1.00 sbcs x20, x0, xzr +# CHECK-NEXT: 1 1 0.50 ngc w3, w12 +# CHECK-NEXT: 1 1 0.50 ngc wzr, w9 +# CHECK-NEXT: 1 1 0.50 ngc w23, wzr +# CHECK-NEXT: 1 1 0.50 ngc x29, x30 +# CHECK-NEXT: 1 1 0.50 ngc xzr, x0 +# CHECK-NEXT: 1 1 0.50 ngc x0, xzr +# CHECK-NEXT: 1 1 1.00 ngcs w3, w12 +# CHECK-NEXT: 1 1 1.00 ngcs wzr, w9 +# CHECK-NEXT: 1 1 1.00 ngcs w23, wzr +# CHECK-NEXT: 1 1 1.00 ngcs x29, x30 +# CHECK-NEXT: 1 1 1.00 ngcs xzr, x0 +# CHECK-NEXT: 1 1 1.00 ngcs x0, xzr +# CHECK-NEXT: 1 2 0.50 sbfx x1, x2, #3, #2 +# CHECK-NEXT: 1 1 0.50 asr x3, x4, #63 +# CHECK-NEXT: 1 1 0.50 asr wzr, wzr, #31 +# CHECK-NEXT: 1 2 0.50 sbfx w12, w9, #0, #1 +# CHECK-NEXT: 1 2 0.50 ubfiz x4, x5, #52, #11 +# CHECK-NEXT: 1 2 0.50 ubfx xzr, x4, #0, #1 +# CHECK-NEXT: 1 2 0.50 ubfiz x4, xzr, #1, #6 +# CHECK-NEXT: 1 1 0.50 lsr x5, x6, #12 +# CHECK-NEXT: 1 2 0.50 bfi x4, x5, #52, #11 +# CHECK-NEXT: 1 2 0.50 bfxil xzr, x4, #0, #1 +# CHECK-NEXT: 1 2 0.50 bfc x4, #1, #6 +# CHECK-NEXT: 1 2 0.50 bfxil x5, x6, #12, #52 +# CHECK-NEXT: 1 2 0.50 sxtb w1, w2 +# CHECK-NEXT: 1 2 0.50 sxtb xzr, w3 +# CHECK-NEXT: 1 2 0.50 sxth w9, w10 +# CHECK-NEXT: 1 2 0.50 sxth x0, w1 +# CHECK-NEXT: 1 2 0.50 sxtw x3, w30 +# CHECK-NEXT: 1 1 0.50 uxtb w1, w2 +# CHECK-NEXT: 1 2 0.50 uxth w9, w10 +# 
CHECK-NEXT: 1 2 0.50 ubfx x3, x30, #0, #32 +# CHECK-NEXT: 1 1 0.50 asr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 asr x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 asr w1, wzr, #3 +# CHECK-NEXT: 1 1 0.50 lsr w3, w2, #0 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 lsr x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 lsr wzr, wzr, #3 +# CHECK-NEXT: 1 1 0.50 lsl w9, w10, #31 +# CHECK-NEXT: 1 1 0.50 lsl x20, x21, #63 +# CHECK-NEXT: 1 1 0.50 lsl w1, wzr, #3 +# CHECK-NEXT: 1 2 0.50 sbfiz x2, x3, #63, #1 +# CHECK-NEXT: 1 2 0.50 sbfiz x9, x10, #5, #59 +# CHECK-NEXT: 1 2 0.50 sbfiz w11, w12, #31, #1 +# CHECK-NEXT: 1 2 0.50 sbfiz w13, w14, #29, #3 +# CHECK-NEXT: 1 2 0.50 sbfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 1 2 0.50 sbfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 asr x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 asr x19, x20, #0 +# CHECK-NEXT: 1 1 0.50 asr x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 asr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 asr w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 asr w13, w14, #29 +# CHECK-NEXT: 1 2 0.50 sbfx xzr, xzr, #10, #11 +# CHECK-NEXT: 1 2 0.50 bfi x2, x3, #63, #1 +# CHECK-NEXT: 1 2 0.50 bfi x9, x10, #5, #59 +# CHECK-NEXT: 1 2 0.50 bfi w11, w12, #31, #1 +# CHECK-NEXT: 1 2 0.50 bfi w13, w14, #29, #3 +# CHECK-NEXT: 1 2 0.50 bfc xzr, #10, #11 +# CHECK-NEXT: 1 2 0.50 bfxil w9, w10, #0, #1 +# CHECK-NEXT: 1 2 0.50 bfxil x2, x3, #63, #1 +# CHECK-NEXT: 1 2 0.50 bfxil x19, x20, #0, #64 +# CHECK-NEXT: 1 2 0.50 bfxil x9, x10, #5, #59 +# CHECK-NEXT: 1 2 0.50 bfxil w9, w10, #0, #32 +# CHECK-NEXT: 1 2 0.50 bfxil w11, w12, #31, #1 +# CHECK-NEXT: 1 2 0.50 bfxil w13, w14, #29, #3 +# CHECK-NEXT: 1 2 0.50 bfxil xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 0.50 lsl x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 lsl x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 lsl w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 lsl w13, w14, #29 +# CHECK-NEXT: 1 2 0.50 ubfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 1 2 0.50 ubfx w9, w10, #0, #1 +# CHECK-NEXT: 1 1 0.50 lsr x2, x3, #63 +# CHECK-NEXT: 1 1 0.50 lsr x19, 
x20, #0 +# CHECK-NEXT: 1 1 0.50 lsr x9, x10, #5 +# CHECK-NEXT: 1 1 0.50 lsr w9, w10, #0 +# CHECK-NEXT: 1 1 0.50 lsr w11, w12, #31 +# CHECK-NEXT: 1 1 0.50 lsr w13, w14, #29 +# CHECK-NEXT: 1 2 0.50 ubfx xzr, xzr, #10, #11 +# CHECK-NEXT: 1 1 1.00 cbz w5, #4 +# CHECK-NEXT: 1 1 1.00 cbz x5, #0 +# CHECK-NEXT: 1 1 1.00 cbnz x2, #-4 +# CHECK-NEXT: 1 1 1.00 cbnz x26, #1048572 +# CHECK-NEXT: 1 1 1.00 cbz wzr, #0 +# CHECK-NEXT: 1 1 1.00 cbnz xzr, #0 +# CHECK-NEXT: 1 1 1.00 cbnz w21, test +# CHECK-NEXT: 1 1 1.00 b.ne #4 +# CHECK-NEXT: 1 1 1.00 b.ge #1048572 +# CHECK-NEXT: 1 1 1.00 b.ge #-4 +# CHECK-NEXT: 1 1 1.00 ccmp w1, #31, #0, eq +# CHECK-NEXT: 1 1 1.00 ccmp w3, #0, #15, hs +# CHECK-NEXT: 1 1 1.00 ccmp wzr, #15, #13, hs +# CHECK-NEXT: 1 1 1.00 ccmp x9, #31, #0, le +# CHECK-NEXT: 1 1 1.00 ccmp x3, #0, #15, gt +# CHECK-NEXT: 1 1 1.00 ccmp xzr, #5, #7, ne +# CHECK-NEXT: 1 1 1.00 ccmn w1, #31, #0, eq +# CHECK-NEXT: 1 1 1.00 ccmn w3, #0, #15, hs +# CHECK-NEXT: 1 1 1.00 ccmn wzr, #15, #13, hs +# CHECK-NEXT: 1 1 1.00 ccmn x9, #31, #0, le +# CHECK-NEXT: 1 1 1.00 ccmn x3, #0, #15, gt +# CHECK-NEXT: 1 1 1.00 ccmn xzr, #5, #7, ne +# CHECK-NEXT: 1 1 1.00 ccmp w1, wzr, #0, eq +# CHECK-NEXT: 1 1 1.00 ccmp w3, w0, #15, hs +# CHECK-NEXT: 1 1 1.00 ccmp wzr, w15, #13, hs +# CHECK-NEXT: 1 1 1.00 ccmp x9, xzr, #0, le +# CHECK-NEXT: 1 1 1.00 ccmp x3, x0, #15, gt +# CHECK-NEXT: 1 1 1.00 ccmp xzr, x5, #7, ne +# CHECK-NEXT: 1 1 1.00 ccmn w1, wzr, #0, eq +# CHECK-NEXT: 1 1 1.00 ccmn w3, w0, #15, hs +# CHECK-NEXT: 1 1 1.00 ccmn wzr, w15, #13, hs +# CHECK-NEXT: 1 1 1.00 ccmn x9, xzr, #0, le +# CHECK-NEXT: 1 1 1.00 ccmn x3, x0, #15, gt +# CHECK-NEXT: 1 1 1.00 ccmn xzr, x5, #7, ne +# CHECK-NEXT: 1 1 0.50 csel w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csel wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csel w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csel w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csel x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csel xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csel x5, xzr, x6, hs +# 
CHECK-NEXT: 1 1 0.50 csel x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csinc w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csinc wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csinc w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csinc w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csinc x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csinc xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csinc x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csinc x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csinv w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csinv wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csinv w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csinv w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csinv x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csinv xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csinv x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csinv x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 csneg w1, w0, w19, ne +# CHECK-NEXT: 1 1 0.50 csneg wzr, w5, w9, eq +# CHECK-NEXT: 1 1 0.50 csneg w9, wzr, w30, gt +# CHECK-NEXT: 1 1 0.50 csneg w1, w28, wzr, mi +# CHECK-NEXT: 1 1 0.50 csneg x19, x23, x29, lt +# CHECK-NEXT: 1 1 0.50 csneg xzr, x3, x4, ge +# CHECK-NEXT: 1 1 0.50 csneg x5, xzr, x6, hs +# CHECK-NEXT: 1 1 0.50 csneg x7, x8, xzr, lo +# CHECK-NEXT: 1 1 0.50 cset w3, eq +# CHECK-NEXT: 1 1 0.50 cset x9, pl +# CHECK-NEXT: 1 1 0.50 csetm w20, ne +# CHECK-NEXT: 1 1 0.50 csetm x30, ge +# CHECK-NEXT: 1 1 0.50 csinc w2, wzr, wzr, al +# CHECK-NEXT: 1 1 0.50 csinv x3, xzr, xzr, nv +# CHECK-NEXT: 1 1 0.50 cinc w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cinc wzr, w4, le +# CHECK-NEXT: 1 1 0.50 cset w9, lt +# CHECK-NEXT: 1 1 0.50 cinc x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cinc xzr, x4, le +# CHECK-NEXT: 1 1 0.50 cset x9, lt +# CHECK-NEXT: 1 1 0.50 csinc w5, w6, w6, nv +# CHECK-NEXT: 1 1 0.50 csinc x1, x2, x2, al +# CHECK-NEXT: 1 1 0.50 cinv w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cinv wzr, w4, le +# CHECK-NEXT: 1 1 0.50 csetm w9, lt +# CHECK-NEXT: 1 1 0.50 cinv x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cinv xzr, x4, le +# CHECK-NEXT: 1 1 0.50 csetm x9, lt +# CHECK-NEXT: 1 1 0.50 csinv x1, x0, 
x0, al +# CHECK-NEXT: 1 1 0.50 csinv w9, w8, w8, nv +# CHECK-NEXT: 1 1 0.50 cneg w3, w5, gt +# CHECK-NEXT: 1 1 0.50 cneg wzr, w4, le +# CHECK-NEXT: 1 1 0.50 cneg w9, wzr, lt +# CHECK-NEXT: 1 1 0.50 cneg x3, x5, gt +# CHECK-NEXT: 1 1 0.50 cneg xzr, x4, le +# CHECK-NEXT: 1 1 0.50 cneg x9, xzr, lt +# CHECK-NEXT: 1 1 0.50 csneg x4, x8, x8, al +# CHECK-NEXT: 1 1 0.50 rbit w0, w7 +# CHECK-NEXT: 1 1 0.50 rbit x18, x3 +# CHECK-NEXT: 1 1 0.50 rev16 w17, w1 +# CHECK-NEXT: 1 1 0.50 rev16 x5, x2 +# CHECK-NEXT: 1 1 0.50 rev w18, w0 +# CHECK-NEXT: 1 1 0.50 rev32 x20, x1 +# CHECK-NEXT: 1 1 0.50 rev x22, x2 +# CHECK-NEXT: 1 1 0.50 clz w24, w3 +# CHECK-NEXT: 1 1 0.50 clz x26, x4 +# CHECK-NEXT: 1 1 0.50 cls w3, w5 +# CHECK-NEXT: 1 1 0.50 cls x20, x5 +# CHECK-NEXT: 1 12 12.00 udiv w0, w7, w10 +# CHECK-NEXT: 1 20 20.00 udiv x9, x22, x4 +# CHECK-NEXT: 1 12 12.00 sdiv w12, w21, w0 +# CHECK-NEXT: 1 20 20.00 sdiv x13, x2, x1 +# CHECK-NEXT: 1 1 0.50 lsl w11, w12, w13 +# CHECK-NEXT: 1 1 0.50 lsl x14, x15, x16 +# CHECK-NEXT: 1 1 0.50 lsr w17, w18, w19 +# CHECK-NEXT: 1 1 0.50 lsr x20, x21, x22 +# CHECK-NEXT: 1 1 0.50 asr w23, w24, w25 +# CHECK-NEXT: 1 1 0.50 asr x26, x27, x28 +# CHECK-NEXT: 1 1 0.50 ror w0, w1, w2 +# CHECK-NEXT: 1 1 0.50 ror x3, x4, x5 +# CHECK-NEXT: 1 1 0.50 lsl w6, w7, w8 +# CHECK-NEXT: 1 1 0.50 lsl x9, x10, x11 +# CHECK-NEXT: 1 1 0.50 lsr w12, w13, w14 +# CHECK-NEXT: 1 1 0.50 lsr x15, x16, x17 +# CHECK-NEXT: 1 1 0.50 asr w18, w19, w20 +# CHECK-NEXT: 1 1 0.50 asr x21, x22, x23 +# CHECK-NEXT: 1 1 0.50 ror w24, w25, w26 +# CHECK-NEXT: 1 1 0.50 ror x27, x28, x29 +# CHECK-NEXT: 1 2 1.00 crc32cb w30, w23, w15 +# CHECK-NEXT: 1 2 1.00 crc32cb wzr, w12, w14 +# CHECK-NEXT: 1 2 1.00 crc32cb w28, w10, w11 +# CHECK-NEXT: 1 2 1.00 crc32b w27, w12, w15 +# CHECK-NEXT: 1 2 1.00 crc32h w3, w15, w21 +# CHECK-NEXT: 1 2 1.00 crc32w w9, w18, w24 +# CHECK-NEXT: 1 2 1.00 crc32x w19, w6, x25 +# CHECK-NEXT: 1 2 1.00 crc32ch w25, w26, w16 +# CHECK-NEXT: 1 2 1.00 crc32cw w27, w12, w23 +# CHECK-NEXT: 
1 2 1.00 crc32cx w21, w28, x5 +# CHECK-NEXT: 1 6 4.00 smulh x30, x29, x28 +# CHECK-NEXT: 1 6 4.00 smulh xzr, x27, x26 +# CHECK-NEXT: 1 6 4.00 umulh x30, x29, x28 +# CHECK-NEXT: 1 6 4.00 umulh x23, x30, xzr +# CHECK-NEXT: 1 3 1.00 madd w1, w3, w7, w4 +# CHECK-NEXT: 1 3 1.00 madd wzr, w0, w9, w11 +# CHECK-NEXT: 1 3 1.00 madd w13, wzr, w4, w4 +# CHECK-NEXT: 1 3 1.00 madd w19, w30, wzr, w29 +# CHECK-NEXT: 1 3 1.00 mul w4, w5, w6 +# CHECK-NEXT: 1 4 2.00 madd x1, x3, x7, x4 +# CHECK-NEXT: 1 4 2.00 madd xzr, x0, x9, x11 +# CHECK-NEXT: 1 4 2.00 madd x13, xzr, x4, x4 +# CHECK-NEXT: 1 4 2.00 madd x19, x30, xzr, x29 +# CHECK-NEXT: 1 4 2.00 mul x4, x5, x6 +# CHECK-NEXT: 1 3 1.00 msub w1, w3, w7, w4 +# CHECK-NEXT: 1 3 1.00 msub wzr, w0, w9, w11 +# CHECK-NEXT: 1 3 1.00 msub w13, wzr, w4, w4 +# CHECK-NEXT: 1 3 1.00 msub w19, w30, wzr, w29 +# CHECK-NEXT: 1 3 1.00 mneg w4, w5, w6 +# CHECK-NEXT: 1 4 2.00 msub x1, x3, x7, x4 +# CHECK-NEXT: 1 4 2.00 msub xzr, x0, x9, x11 +# CHECK-NEXT: 1 4 2.00 msub x13, xzr, x4, x4 +# CHECK-NEXT: 1 4 2.00 msub x19, x30, xzr, x29 +# CHECK-NEXT: 1 4 2.00 mneg x4, x5, x6 +# CHECK-NEXT: 1 2 1.00 smaddl x3, w5, w2, x9 +# CHECK-NEXT: 1 2 1.00 smaddl xzr, w10, w11, x12 +# CHECK-NEXT: 1 2 1.00 smaddl x13, wzr, w14, x15 +# CHECK-NEXT: 1 2 1.00 smaddl x16, w17, wzr, x18 +# CHECK-NEXT: 1 2 1.00 smull x19, w20, w21 +# CHECK-NEXT: 1 2 1.00 smsubl x3, w5, w2, x9 +# CHECK-NEXT: 1 2 1.00 smsubl xzr, w10, w11, x12 +# CHECK-NEXT: 1 2 1.00 smsubl x13, wzr, w14, x15 +# CHECK-NEXT: 1 2 1.00 smsubl x16, w17, wzr, x18 +# CHECK-NEXT: 1 2 1.00 smnegl x19, w20, w21 +# CHECK-NEXT: 1 2 1.00 umaddl x3, w5, w2, x9 +# CHECK-NEXT: 1 2 1.00 umaddl xzr, w10, w11, x12 +# CHECK-NEXT: 1 2 1.00 umaddl x13, wzr, w14, x15 +# CHECK-NEXT: 1 2 1.00 umaddl x16, w17, wzr, x18 +# CHECK-NEXT: 1 2 1.00 umull x19, w20, w21 +# CHECK-NEXT: 1 2 1.00 umsubl x3, w5, w2, x9 +# CHECK-NEXT: 1 2 1.00 umsubl x16, w17, wzr, x18 +# CHECK-NEXT: 1 2 1.00 umnegl x19, w20, w21 +# CHECK-NEXT: 1 6 4.00 smulh x23, 
x22, xzr +# CHECK-NEXT: 1 6 4.00 umulh x23, x22, xzr +# CHECK-NEXT: 1 4 2.00 mul x19, x20, xzr +# CHECK-NEXT: 1 3 1.00 mneg w21, w22, w23 +# CHECK-NEXT: 1 2 1.00 smull x11, w13, w17 +# CHECK-NEXT: 1 2 1.00 umull x11, w13, w17 +# CHECK-NEXT: 1 2 1.00 smnegl x11, w13, w17 +# CHECK-NEXT: 1 2 1.00 umnegl x11, w13, w17 +# CHECK-NEXT: 1 2 0.50 extr w3, w5, w7, #0 +# CHECK-NEXT: 1 2 0.50 extr w11, w13, w17, #31 +# CHECK-NEXT: 1 2 0.50 extr x3, x5, x7, #15 +# CHECK-NEXT: 1 2 0.50 extr x11, x13, x17, #63 +# CHECK-NEXT: 1 1 0.50 ror x19, x23, #24 +# CHECK-NEXT: 1 1 0.50 ror x29, xzr, #63 +# CHECK-NEXT: 1 1 0.50 ror w9, w13, #31 +# CHECK-NEXT: 1 3 1.00 fcmp h5, h21 +# CHECK-NEXT: 1 3 1.00 fcmp h5, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe h22, h21 +# CHECK-NEXT: 1 3 1.00 fcmpe h13, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp s3, s5 +# CHECK-NEXT: 1 3 1.00 fcmp s31, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe s29, s30 +# CHECK-NEXT: 1 3 1.00 fcmpe s15, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmp d4, d12 +# CHECK-NEXT: 1 3 1.00 fcmp d23, #0.0 +# CHECK-NEXT: 1 3 1.00 fcmpe d26, d22 +# CHECK-NEXT: 1 3 1.00 fcmpe d29, #0.0 +# CHECK-NEXT: 1 3 1.00 fccmp s1, s31, #0, eq +# CHECK-NEXT: 1 3 1.00 fccmp s3, s0, #15, hs +# CHECK-NEXT: 1 3 1.00 fccmp s31, s15, #13, hs +# CHECK-NEXT: 1 3 1.00 fccmp d9, d31, #0, le +# CHECK-NEXT: 1 3 1.00 fccmp d3, d0, #15, gt +# CHECK-NEXT: 1 3 1.00 fccmp d31, d5, #7, ne +# CHECK-NEXT: 1 3 1.00 fccmp h31, h3, #11, hs +# CHECK-NEXT: 1 3 1.00 fccmpe h6, h1, #12, ne +# CHECK-NEXT: 1 3 1.00 fccmpe s1, s31, #0, eq +# CHECK-NEXT: 1 3 1.00 fccmpe s3, s0, #15, hs +# CHECK-NEXT: 1 3 1.00 fccmpe s31, s15, #13, hs +# CHECK-NEXT: 1 3 1.00 fccmpe d9, d31, #0, le +# CHECK-NEXT: 1 3 1.00 fccmpe d3, d0, #15, gt +# CHECK-NEXT: 1 3 1.00 fccmpe d31, d5, #7, ne +# CHECK-NEXT: 1 3 1.00 fcsel s3, s20, s9, pl +# CHECK-NEXT: 1 3 1.00 fcsel d9, d10, d11, mi +# CHECK-NEXT: 1 3 1.00 fcsel h26, h2, h11, hs +# CHECK-NEXT: 1 4 0.50 fmov h18, h28 +# CHECK-NEXT: 1 2 0.50 fmov s0, s1 +# CHECK-NEXT: 1 4 0.50 fabs s2, s3 +# 
CHECK-NEXT: 1 4 0.50 fneg h2, h9 +# CHECK-NEXT: 1 4 0.50 fneg s4, s5 +# CHECK-NEXT: 1 14 4.50 fsqrt s6, s7 +# CHECK-NEXT: 1 4 0.50 fcvt d8, s9 +# CHECK-NEXT: 1 4 0.50 fcvt h10, s11 +# CHECK-NEXT: 1 4 0.50 frintn h12, h3 +# CHECK-NEXT: 1 4 0.50 frintn s12, s13 +# CHECK-NEXT: 1 4 0.50 frintp h17, h31 +# CHECK-NEXT: 1 4 0.50 frintp s14, s15 +# CHECK-NEXT: 1 4 0.50 frintm h0, h21 +# CHECK-NEXT: 1 4 0.50 frintm s16, s17 +# CHECK-NEXT: 1 4 0.50 frintz h10, h29 +# CHECK-NEXT: 1 4 0.50 frintz s18, s19 +# CHECK-NEXT: 1 4 0.50 frinta h22, h10 +# CHECK-NEXT: 1 4 0.50 frinta s20, s21 +# CHECK-NEXT: 1 4 0.50 frintx h4, h5 +# CHECK-NEXT: 1 4 0.50 frintx s22, s23 +# CHECK-NEXT: 1 4 0.50 frinti s24, s25 +# CHECK-NEXT: 1 4 0.50 frinti h31, h14 +# CHECK-NEXT: 1 2 0.50 fmov d0, d1 +# CHECK-NEXT: 1 4 0.50 fabs d2, d3 +# CHECK-NEXT: 1 4 0.50 fneg d4, d5 +# CHECK-NEXT: 1 11 2.50 fsqrt h13, h24 +# CHECK-NEXT: 1 25 9.50 fsqrt d6, d7 +# CHECK-NEXT: 1 4 0.50 fcvt s8, d9 +# CHECK-NEXT: 1 4 0.50 fcvt h10, d11 +# CHECK-NEXT: 1 4 0.50 frintn d12, d13 +# CHECK-NEXT: 1 4 0.50 frintp d14, d15 +# CHECK-NEXT: 1 4 0.50 frintm d16, d17 +# CHECK-NEXT: 1 4 0.50 frintz d18, d19 +# CHECK-NEXT: 1 4 0.50 frinta d20, d21 +# CHECK-NEXT: 1 4 0.50 frintx d22, d23 +# CHECK-NEXT: 1 4 0.50 frinti d24, d25 +# CHECK-NEXT: 1 4 0.50 fcvt s26, h27 +# CHECK-NEXT: 1 4 0.50 fcvt d28, h29 +# CHECK-NEXT: 1 4 0.50 fmul s20, s19, s17 +# CHECK-NEXT: 1 8 2.50 fdiv h1, h26, h23 +# CHECK-NEXT: 1 13 5.00 fdiv s1, s2, s3 +# CHECK-NEXT: 1 4 0.50 fadd h23, h27, h22 +# CHECK-NEXT: 1 4 0.50 fadd s4, s5, s6 +# CHECK-NEXT: 1 4 0.50 fsub h20, h11, h18 +# CHECK-NEXT: 1 4 0.50 fsub s7, s8, s9 +# CHECK-NEXT: 1 4 0.50 fmax s10, s11, s12 +# CHECK-NEXT: 1 4 0.50 fmax h8, h7, h11 +# CHECK-NEXT: 1 4 0.50 fmin s13, s14, s15 +# CHECK-NEXT: 1 4 0.50 fmaxnm h29, h13, h14 +# CHECK-NEXT: 1 4 0.50 fmaxnm s16, s17, s18 +# CHECK-NEXT: 1 4 0.50 fminnm s19, s20, s21 +# CHECK-NEXT: 1 4 0.50 fnmul h3, h15, h7 +# CHECK-NEXT: 1 4 0.50 fnmul s22, s23, s2 +# 
CHECK-NEXT: 1 4 0.50 fmul d20, d19, d17 +# CHECK-NEXT: 1 22 9.50 fdiv d1, d2, d3 +# CHECK-NEXT: 1 4 0.50 fadd d4, d5, d6 +# CHECK-NEXT: 1 4 0.50 fsub d7, d8, d9 +# CHECK-NEXT: 1 4 0.50 fmax d10, d11, d12 +# CHECK-NEXT: 1 4 0.50 fmin d13, d14, d15 +# CHECK-NEXT: 1 4 0.50 fmin h4, h13, h17 +# CHECK-NEXT: 1 4 0.50 fmaxnm d16, d17, d18 +# CHECK-NEXT: 1 4 0.50 fminnm d19, d20, d21 +# CHECK-NEXT: 1 4 0.50 fminnm h29, h23, h17 +# CHECK-NEXT: 1 4 0.50 fnmul d22, d23, d24 +# CHECK-NEXT: 1 4 0.50 fmadd h27, h0, h6, h28 +# CHECK-NEXT: 1 4 0.50 fmadd s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fmadd d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fmsub h25, h28, h12, h24 +# CHECK-NEXT: 1 4 0.50 fmsub s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fmsub d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmadd h3, h18, h31, h24 +# CHECK-NEXT: 1 4 0.50 fnmadd s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fnmadd d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmsub s3, s5, s6, s31 +# CHECK-NEXT: 1 4 0.50 fnmsub d3, d13, d0, d23 +# CHECK-NEXT: 1 4 0.50 fnmsub h3, h29, h24, h17 +# CHECK-NEXT: 1 4 0.50 fcvtzs w3, h5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs wzr, h20, #13 +# CHECK-NEXT: 1 4 0.50 fcvtzs w19, h0, #32 +# CHECK-NEXT: 1 4 0.50 fcvtzs x3, h5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs x12, h30, #45 +# CHECK-NEXT: 1 4 0.50 fcvtzs x19, h0, #64 +# CHECK-NEXT: 1 4 0.50 fcvtzs w3, s5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs wzr, s20, #13 +# CHECK-NEXT: 1 4 0.50 fcvtzs w19, s0, #32 +# CHECK-NEXT: 1 4 0.50 fcvtzs x3, s5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs x12, s30, #45 +# CHECK-NEXT: 1 4 0.50 fcvtzs x19, s0, #64 +# CHECK-NEXT: 1 4 0.50 fcvtzs w3, d5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs wzr, d20, #13 +# CHECK-NEXT: 1 4 0.50 fcvtzs w19, d0, #32 +# CHECK-NEXT: 1 4 0.50 fcvtzs x3, d5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs x12, d30, #45 +# CHECK-NEXT: 1 4 0.50 fcvtzs x19, d0, #64 +# CHECK-NEXT: 1 4 0.50 fcvtzu w3, h5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu wzr, h20, #13 +# CHECK-NEXT: 1 4 0.50 fcvtzu w19, h0, #32 +# CHECK-NEXT: 1 4 0.50 fcvtzu x3, h5, #1 +# 
CHECK-NEXT: 1 4 0.50 fcvtzu x12, h30, #45 +# CHECK-NEXT: 1 4 0.50 fcvtzu x19, h0, #64 +# CHECK-NEXT: 1 4 0.50 fcvtzu w3, s5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu wzr, s20, #13 +# CHECK-NEXT: 1 4 0.50 fcvtzu w19, s0, #32 +# CHECK-NEXT: 1 4 0.50 fcvtzu x3, s5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu x12, s30, #45 +# CHECK-NEXT: 1 4 0.50 fcvtzu x19, s0, #64 +# CHECK-NEXT: 1 4 0.50 fcvtzu w3, d5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu wzr, d20, #13 +# CHECK-NEXT: 1 4 0.50 fcvtzu w19, d0, #32 +# CHECK-NEXT: 1 4 0.50 fcvtzu x3, d5, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu x12, d30, #45 +# CHECK-NEXT: 1 4 0.50 fcvtzu x19, d0, #64 +# CHECK-NEXT: 1 4 0.50 scvtf h23, w19, #1 +# CHECK-NEXT: 1 4 0.50 scvtf h31, wzr, #20 +# CHECK-NEXT: 1 4 0.50 scvtf h14, w0, #32 +# CHECK-NEXT: 1 4 0.50 scvtf h23, x19, #1 +# CHECK-NEXT: 1 4 0.50 scvtf h31, xzr, #20 +# CHECK-NEXT: 1 4 0.50 scvtf h14, x0, #64 +# CHECK-NEXT: 1 4 0.50 scvtf s23, w19, #1 +# CHECK-NEXT: 1 4 0.50 scvtf s31, wzr, #20 +# CHECK-NEXT: 1 4 0.50 scvtf s14, w0, #32 +# CHECK-NEXT: 1 4 0.50 scvtf s23, x19, #1 +# CHECK-NEXT: 1 4 0.50 scvtf s31, xzr, #20 +# CHECK-NEXT: 1 4 0.50 scvtf s14, x0, #64 +# CHECK-NEXT: 1 4 0.50 scvtf d23, w19, #1 +# CHECK-NEXT: 1 4 0.50 scvtf d31, wzr, #20 +# CHECK-NEXT: 1 4 0.50 scvtf d14, w0, #32 +# CHECK-NEXT: 1 4 0.50 scvtf d23, x19, #1 +# CHECK-NEXT: 1 4 0.50 scvtf d31, xzr, #20 +# CHECK-NEXT: 1 4 0.50 scvtf d14, x0, #64 +# CHECK-NEXT: 1 4 0.50 ucvtf h23, w19, #1 +# CHECK-NEXT: 1 4 0.50 ucvtf h31, wzr, #20 +# CHECK-NEXT: 1 4 0.50 ucvtf h14, w0, #32 +# CHECK-NEXT: 1 4 0.50 ucvtf h23, x19, #1 +# CHECK-NEXT: 1 4 0.50 ucvtf h31, xzr, #20 +# CHECK-NEXT: 1 4 0.50 ucvtf h14, x0, #64 +# CHECK-NEXT: 1 4 0.50 ucvtf s23, w19, #1 +# CHECK-NEXT: 1 4 0.50 ucvtf s31, wzr, #20 +# CHECK-NEXT: 1 4 0.50 ucvtf s14, w0, #32 +# CHECK-NEXT: 1 4 0.50 ucvtf s23, x19, #1 +# CHECK-NEXT: 1 4 0.50 ucvtf s31, xzr, #20 +# CHECK-NEXT: 1 4 0.50 ucvtf s14, x0, #64 +# CHECK-NEXT: 1 4 0.50 ucvtf d23, w19, #1 +# CHECK-NEXT: 1 4 0.50 ucvtf d31, wzr, #20 +# 
CHECK-NEXT: 1 4 0.50 ucvtf d14, w0, #32 +# CHECK-NEXT: 1 4 0.50 ucvtf d23, x19, #1 +# CHECK-NEXT: 1 4 0.50 ucvtf d31, xzr, #20 +# CHECK-NEXT: 1 4 0.50 ucvtf d14, x0, #64 +# CHECK-NEXT: 1 4 0.50 fcvtns w3, h31 +# CHECK-NEXT: 1 4 0.50 fcvtns xzr, h12 +# CHECK-NEXT: 1 4 0.50 fcvtnu wzr, h12 +# CHECK-NEXT: 1 4 0.50 fcvtnu x0, h0 +# CHECK-NEXT: 1 4 0.50 fcvtps wzr, h9 +# CHECK-NEXT: 1 4 0.50 fcvtps x12, h20 +# CHECK-NEXT: 1 4 0.50 fcvtpu w30, h23 +# CHECK-NEXT: 1 4 0.50 fcvtpu x29, h3 +# CHECK-NEXT: 1 4 0.50 fcvtms w2, h3 +# CHECK-NEXT: 1 4 0.50 fcvtms x4, h5 +# CHECK-NEXT: 1 4 0.50 fcvtmu w6, h7 +# CHECK-NEXT: 1 4 0.50 fcvtmu x8, h9 +# CHECK-NEXT: 1 4 0.50 fcvtzs w10, h11 +# CHECK-NEXT: 1 4 0.50 fcvtzs x12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtzu w14, h15 +# CHECK-NEXT: 1 4 0.50 fcvtzu x15, h16 +# CHECK-NEXT: 1 4 0.50 scvtf h17, w18 +# CHECK-NEXT: 1 4 0.50 scvtf h19, x20 +# CHECK-NEXT: 1 4 0.50 ucvtf h21, w22 +# CHECK-NEXT: 1 4 0.50 scvtf h23, x24 +# CHECK-NEXT: 1 4 0.50 fcvtas w25, h26 +# CHECK-NEXT: 1 4 0.50 fcvtas x27, h28 +# CHECK-NEXT: 1 4 0.50 fcvtau w29, h30 +# CHECK-NEXT: 1 4 0.50 fcvtau xzr, h0 +# CHECK-NEXT: 1 4 0.50 fcvtns w3, s31 +# CHECK-NEXT: 1 4 0.50 fcvtns xzr, s12 +# CHECK-NEXT: 1 4 0.50 fcvtnu wzr, s12 +# CHECK-NEXT: 1 4 0.50 fcvtnu x0, s0 +# CHECK-NEXT: 1 4 0.50 fcvtps wzr, s9 +# CHECK-NEXT: 1 4 0.50 fcvtps x12, s20 +# CHECK-NEXT: 1 4 0.50 fcvtpu w30, s23 +# CHECK-NEXT: 1 4 0.50 fcvtpu x29, s3 +# CHECK-NEXT: 1 4 0.50 fcvtms w2, s3 +# CHECK-NEXT: 1 4 0.50 fcvtms x4, s5 +# CHECK-NEXT: 1 4 0.50 fcvtmu w6, s7 +# CHECK-NEXT: 1 4 0.50 fcvtmu x8, s9 +# CHECK-NEXT: 1 4 0.50 fcvtzs w10, s11 +# CHECK-NEXT: 1 4 0.50 fcvtzs x12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtzu w14, s15 +# CHECK-NEXT: 1 4 0.50 fcvtzu x15, s16 +# CHECK-NEXT: 1 4 0.50 scvtf s17, w18 +# CHECK-NEXT: 1 4 0.50 scvtf s19, x20 +# CHECK-NEXT: 1 4 0.50 ucvtf s21, w22 +# CHECK-NEXT: 1 4 0.50 scvtf s23, x24 +# CHECK-NEXT: 1 4 0.50 fcvtas w25, s26 +# CHECK-NEXT: 1 4 0.50 fcvtas x27, s28 +# CHECK-NEXT: 1 4 0.50 
fcvtau w29, s30 +# CHECK-NEXT: 1 4 0.50 fcvtau xzr, s0 +# CHECK-NEXT: 1 4 0.50 fcvtns w3, d31 +# CHECK-NEXT: 1 4 0.50 fcvtns xzr, d12 +# CHECK-NEXT: 1 4 0.50 fcvtnu wzr, d12 +# CHECK-NEXT: 1 4 0.50 fcvtnu x0, d0 +# CHECK-NEXT: 1 4 0.50 fcvtps wzr, d9 +# CHECK-NEXT: 1 4 0.50 fcvtps x12, d20 +# CHECK-NEXT: 1 4 0.50 fcvtpu w30, d23 +# CHECK-NEXT: 1 4 0.50 fcvtpu x29, d3 +# CHECK-NEXT: 1 4 0.50 fcvtms w2, d3 +# CHECK-NEXT: 1 4 0.50 fcvtms x4, d5 +# CHECK-NEXT: 1 4 0.50 fcvtmu w6, d7 +# CHECK-NEXT: 1 4 0.50 fcvtmu x8, d9 +# CHECK-NEXT: 1 4 0.50 fcvtzs w10, d11 +# CHECK-NEXT: 1 4 0.50 fcvtzs x12, d13 +# CHECK-NEXT: 1 4 0.50 fcvtzu w14, d15 +# CHECK-NEXT: 1 4 0.50 fcvtzu x15, d16 +# CHECK-NEXT: 1 4 0.50 scvtf d17, w18 +# CHECK-NEXT: 1 4 0.50 scvtf d19, x20 +# CHECK-NEXT: 1 4 0.50 ucvtf d21, w22 +# CHECK-NEXT: 1 4 0.50 ucvtf d23, x24 +# CHECK-NEXT: 1 4 0.50 fcvtas w25, d26 +# CHECK-NEXT: 1 4 0.50 fcvtas x27, d28 +# CHECK-NEXT: 1 4 0.50 fcvtau w29, d30 +# CHECK-NEXT: 1 4 0.50 fcvtau xzr, d0 +# CHECK-NEXT: 1 3 0.50 fmov h6, w5 +# CHECK-NEXT: 1 3 0.50 fmov h16, x27 +# CHECK-NEXT: 1 3 0.50 fmov w15, h31 +# CHECK-NEXT: 1 3 0.50 fmov w3, s9 +# CHECK-NEXT: 1 3 0.50 fmov s9, w3 +# CHECK-NEXT: 1 3 0.50 fmov x21, h14 +# CHECK-NEXT: 1 3 0.50 fmov x20, d31 +# CHECK-NEXT: 1 3 0.50 fmov d1, x15 +# CHECK-NEXT: 1 3 0.50 fmov x3, v12.d[1] +# CHECK-NEXT: 1 3 0.50 fmov v1.d[1], x19 +# CHECK-NEXT: 1 3 0.50 fmov h29, #0.50000000 +# CHECK-NEXT: 1 3 0.50 fmov s2, #0.12500000 +# CHECK-NEXT: 1 3 0.50 fmov s3, #1.00000000 +# CHECK-NEXT: 1 3 0.50 fmov d30, #16.00000000 +# CHECK-NEXT: 1 3 0.50 fmov s4, #1.06250000 +# CHECK-NEXT: 1 3 0.50 fmov d10, #1.93750000 +# CHECK-NEXT: 1 3 0.50 fmov s12, #-1.00000000 +# CHECK-NEXT: 1 3 0.50 fmov d16, #8.50000000 +# CHECK-NEXT: 1 2 0.50 * ldr w3, #0 +# CHECK-NEXT: 1 2 0.50 * ldr x29, #4 +# CHECK-NEXT: 1 2 0.50 * ldrsw xzr, #-4 +# CHECK-NEXT: 1 3 0.50 * ldr s0, #8 +# CHECK-NEXT: 1 3 0.50 * ldr d0, #1048572 +# CHECK-NEXT: 1 3 0.50 * ldr q0, #-1048576 +# CHECK-NEXT: 
1 2 0.50 U prfm pldl1strm, #0 +# CHECK-NEXT: 1 2 0.50 U prfm #25, #0 +# CHECK-NEXT: 2 3 1.00 * * U stxrb w18, w8, [sp] +# CHECK-NEXT: 2 3 1.00 * * U stxrh w24, w15, [x16] +# CHECK-NEXT: 2 3 1.00 * * U stxr w5, w6, [x17] +# CHECK-NEXT: 2 3 1.00 * * U stxr w1, x10, [x21] +# CHECK-NEXT: 1 2 0.50 * * U ldxrb w30, [x0] +# CHECK-NEXT: 1 2 0.50 * * U ldxrh w17, [x4] +# CHECK-NEXT: 1 2 0.50 * * U ldxr w22, [sp] +# CHECK-NEXT: 1 2 0.50 * * U ldxr x11, [x29] +# CHECK-NEXT: 2 3 1.00 * * U stxp w12, w11, w10, [sp] +# CHECK-NEXT: 2 3 1.00 * * U stxp wzr, x27, x9, [x12] +# CHECK-NEXT: 2 2 1.00 * * U ldxp w0, wzr, [sp] +# CHECK-NEXT: 2 2 1.00 * * U ldxp x17, x0, [x18] +# CHECK-NEXT: 2 3 1.00 * * U stlxrb w12, w22, [x0] +# CHECK-NEXT: 2 3 1.00 * * U stlxrh w10, w1, [x1] +# CHECK-NEXT: 2 3 1.00 * * U stlxr w9, w2, [x2] +# CHECK-NEXT: 2 3 1.00 * * U stlxr w9, x3, [sp] +# CHECK-NEXT: 1 2 0.50 * * U ldaxrb w8, [x4] +# CHECK-NEXT: 1 2 0.50 * * U ldaxrh w7, [x5] +# CHECK-NEXT: 1 2 0.50 * * U ldaxr w6, [sp] +# CHECK-NEXT: 1 2 0.50 * * U ldaxr x5, [x6] +# CHECK-NEXT: 2 3 1.00 * * U stlxp w4, w5, w6, [sp] +# CHECK-NEXT: 2 3 1.00 * * U stlxp wzr, x6, x7, [x1] +# CHECK-NEXT: 2 2 1.00 * * U ldaxp w5, w18, [sp] +# CHECK-NEXT: 2 2 1.00 * * U ldaxp x6, x19, [x22] +# CHECK-NEXT: 1 1 1.00 * U stlrb w24, [sp] +# CHECK-NEXT: 1 1 1.00 * U stlrh w25, [x30] +# CHECK-NEXT: 1 1 1.00 * U stlr w26, [x29] +# CHECK-NEXT: 1 1 1.00 * U stlr x27, [x28] +# CHECK-NEXT: 1 2 0.50 * U ldarb w16, [x21] +# CHECK-NEXT: 1 2 0.50 * U ldarb w23, [sp] +# CHECK-NEXT: 1 2 0.50 * U ldarh w22, [x30] +# CHECK-NEXT: 1 2 0.50 * U ldar wzr, [x29] +# CHECK-NEXT: 1 2 0.50 * U ldar x21, [x28] +# CHECK-NEXT: 1 1 1.00 * sturb w9, [sp] +# CHECK-NEXT: 1 1 1.00 * sturh wzr, [x12, #255] +# CHECK-NEXT: 1 1 1.00 * stur w16, [x0, #-256] +# CHECK-NEXT: 1 1 1.00 * stur x28, [x14, #1] +# CHECK-NEXT: 1 2 0.50 * ldurb w1, [x20, #255] +# CHECK-NEXT: 1 2 0.50 * ldurh w20, [x1, #255] +# CHECK-NEXT: 1 2 0.50 * ldur w12, [sp, #255] +# CHECK-NEXT: 1 2 
0.50 * ldur xzr, [x12, #255] +# CHECK-NEXT: 1 2 0.50 * ldursb x9, [x7, #-256] +# CHECK-NEXT: 1 2 0.50 * ldursh x17, [x19, #-256] +# CHECK-NEXT: 1 2 0.50 * ldursw x20, [x15, #-256] +# CHECK-NEXT: 1 2 0.50 U prfum pldl2keep, [sp, #-256] +# CHECK-NEXT: 1 2 0.50 * ldursb w19, [x1, #-256] +# CHECK-NEXT: 1 2 0.50 * ldursh w15, [x21, #-256] +# CHECK-NEXT: 1 1 1.00 * stur b0, [sp, #1] +# CHECK-NEXT: 1 1 1.00 * stur h12, [x12, #-1] +# CHECK-NEXT: 1 1 1.00 * stur s15, [x0, #255] +# CHECK-NEXT: 1 1 1.00 * stur d31, [x5, #25] +# CHECK-NEXT: 1 1 1.00 * stur q9, [x5] +# CHECK-NEXT: 1 3 0.50 * ldur b3, [sp] +# CHECK-NEXT: 1 3 0.50 * ldur h5, [x4, #-256] +# CHECK-NEXT: 1 3 0.50 * ldur s7, [x12, #-1] +# CHECK-NEXT: 1 3 0.50 * ldur d11, [x19, #4] +# CHECK-NEXT: 1 3 0.50 * ldur q13, [x1, #2] +# CHECK-NEXT: 2 1 1.00 * strb w9, [x2], #255 +# CHECK-NEXT: 2 1 1.00 * strb w10, [x3], #1 +# CHECK-NEXT: 2 1 1.00 * strb w10, [x3], #-256 +# CHECK-NEXT: 2 1 1.00 * strh w9, [x2], #255 +# CHECK-NEXT: 2 1 1.00 * strh w9, [x2], #1 +# CHECK-NEXT: 2 1 1.00 * strh w10, [x3], #-256 +# CHECK-NEXT: 2 1 1.00 * str w19, [sp], #255 +# CHECK-NEXT: 2 1 1.00 * str w20, [x30], #1 +# CHECK-NEXT: 2 1 1.00 * str w21, [x12], #-256 +# CHECK-NEXT: 2 1 1.00 * str xzr, [x9], #255 +# CHECK-NEXT: 2 1 1.00 * str x2, [x3], #1 +# CHECK-NEXT: 2 1 1.00 * str x19, [x12], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrb w9, [x2], #255 +# CHECK-NEXT: 2 2 0.50 * ldrb w10, [x3], #1 +# CHECK-NEXT: 2 2 0.50 * ldrb w10, [x3], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrh w9, [x2], #255 +# CHECK-NEXT: 2 2 0.50 * ldrh w9, [x2], #1 +# CHECK-NEXT: 2 2 0.50 * ldrh w10, [x3], #-256 +# CHECK-NEXT: 3 2 1.00 * ldr w19, [sp], #255 +# CHECK-NEXT: 3 2 1.00 * ldr w20, [x30], #1 +# CHECK-NEXT: 3 2 1.00 * ldr w21, [x12], #-256 +# CHECK-NEXT: 3 2 1.00 * ldr xzr, [x9], #255 +# CHECK-NEXT: 3 2 1.00 * ldr x2, [x3], #1 +# CHECK-NEXT: 3 2 1.00 * ldr x19, [x12], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrsb xzr, [x9], #255 +# CHECK-NEXT: 2 2 0.50 * ldrsb x2, [x3], #1 +# CHECK-NEXT: 2 
2 0.50 * ldrsb x19, [x12], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrsh xzr, [x9], #255 +# CHECK-NEXT: 2 2 0.50 * ldrsh x2, [x3], #1 +# CHECK-NEXT: 2 2 0.50 * ldrsh x19, [x12], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrsw xzr, [x9], #255 +# CHECK-NEXT: 2 2 0.50 * ldrsw x2, [x3], #1 +# CHECK-NEXT: 2 2 0.50 * ldrsw x19, [x12], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrsb wzr, [x9], #255 +# CHECK-NEXT: 2 2 0.50 * ldrsb w2, [x3], #1 +# CHECK-NEXT: 2 2 0.50 * ldrsb w19, [x12], #-256 +# CHECK-NEXT: 2 2 0.50 * ldrsh wzr, [x9], #255 +# CHECK-NEXT: 2 2 0.50 * ldrsh w2, [x3], #1 +# CHECK-NEXT: 2 2 0.50 * ldrsh w19, [x12], #-256 +# CHECK-NEXT: 2 1 1.00 * str b0, [x0], #255 +# CHECK-NEXT: 2 1 1.00 * str b3, [x3], #1 +# CHECK-NEXT: 2 1 1.00 * str b5, [sp], #-256 +# CHECK-NEXT: 2 1 1.00 * str h10, [x10], #255 +# CHECK-NEXT: 2 1 1.00 * str h13, [x23], #1 +# CHECK-NEXT: 2 1 1.00 * str h15, [sp], #-256 +# CHECK-NEXT: 2 1 1.00 * str s20, [x20], #255 +# CHECK-NEXT: 2 1 1.00 * str s23, [x23], #1 +# CHECK-NEXT: 2 1 1.00 * str s25, [x0], #-256 +# CHECK-NEXT: 2 1 1.00 * str d20, [x20], #255 +# CHECK-NEXT: 2 1 1.00 * str d23, [x23], #1 +# CHECK-NEXT: 2 1 1.00 * str d25, [x0], #-256 +# CHECK-NEXT: 3 3 1.00 * ldr b0, [x0], #255 +# CHECK-NEXT: 3 3 1.00 * ldr b3, [x3], #1 +# CHECK-NEXT: 3 3 1.00 * ldr b5, [sp], #-256 +# CHECK-NEXT: 3 3 1.00 * ldr h10, [x10], #255 +# CHECK-NEXT: 3 3 1.00 * ldr h13, [x23], #1 +# CHECK-NEXT: 3 3 1.00 * ldr h15, [sp], #-256 +# CHECK-NEXT: 3 3 1.00 * ldr s20, [x20], #255 +# CHECK-NEXT: 3 3 1.00 * ldr s23, [x23], #1 +# CHECK-NEXT: 3 3 1.00 * ldr s25, [x0], #-256 +# CHECK-NEXT: 3 3 1.00 * ldr d20, [x20], #255 +# CHECK-NEXT: 3 3 1.00 * ldr d23, [x23], #1 +# CHECK-NEXT: 3 3 1.00 * ldr d25, [x0], #-256 +# CHECK-NEXT: 3 3 1.00 * ldr q20, [x1], #255 +# CHECK-NEXT: 3 3 1.00 * ldr q23, [x9], #1 +# CHECK-NEXT: 3 3 1.00 * ldr q25, [x20], #-256 +# CHECK-NEXT: 2 1 1.00 * str q10, [x1], #255 +# CHECK-NEXT: 2 1 1.00 * str q22, [sp], #1 +# CHECK-NEXT: 2 1 1.00 * str q21, [x20], #-256 +# CHECK-NEXT: 3 
2 1.00 * ldr x3, [x4, #0]! +# CHECK-NEXT: 2 1 1.00 * strb w9, [x2, #255]! +# CHECK-NEXT: 2 1 1.00 * strb w10, [x3, #1]! +# CHECK-NEXT: 2 1 1.00 * strb w10, [x3, #-256]! +# CHECK-NEXT: 2 1 1.00 * strh w9, [x2, #255]! +# CHECK-NEXT: 2 1 1.00 * strh w9, [x2, #1]! +# CHECK-NEXT: 2 1 1.00 * strh w10, [x3, #-256]! +# CHECK-NEXT: 2 1 1.00 * str w19, [sp, #255]! +# CHECK-NEXT: 2 1 1.00 * str w20, [x30, #1]! +# CHECK-NEXT: 2 1 1.00 * str w21, [x12, #-256]! +# CHECK-NEXT: 2 1 1.00 * str xzr, [x9, #255]! +# CHECK-NEXT: 2 1 1.00 * str x2, [x3, #1]! +# CHECK-NEXT: 2 1 1.00 * str x19, [x12, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrb w9, [x2, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrb w10, [x3, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrb w10, [x3, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrh w9, [x2, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrh w9, [x2, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrh w10, [x3, #-256]! +# CHECK-NEXT: 3 2 1.00 * ldr w19, [sp, #255]! +# CHECK-NEXT: 3 2 1.00 * ldr w20, [x30, #1]! +# CHECK-NEXT: 3 2 1.00 * ldr w21, [x12, #-256]! +# CHECK-NEXT: 3 2 1.00 * ldr xzr, [x9, #255]! +# CHECK-NEXT: 3 2 1.00 * ldr x2, [x3, #1]! +# CHECK-NEXT: 3 2 1.00 * ldr x19, [x12, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrsb xzr, [x9, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrsb x2, [x3, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrsb x19, [x12, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrsh xzr, [x9, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrsh x2, [x3, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrsh x19, [x12, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrsw xzr, [x9, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrsw x2, [x3, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrsw x19, [x12, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrsb wzr, [x9, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrsb w2, [x3, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrsb w19, [x12, #-256]! +# CHECK-NEXT: 2 2 0.50 * ldrsh wzr, [x9, #255]! +# CHECK-NEXT: 2 2 0.50 * ldrsh w2, [x3, #1]! +# CHECK-NEXT: 2 2 0.50 * ldrsh w19, [x12, #-256]! +# CHECK-NEXT: 2 1 1.00 * str b0, [x0, #255]! +# CHECK-NEXT: 2 1 1.00 * str b3, [x3, #1]! 
+# CHECK-NEXT: 2 1 1.00 * str b5, [sp, #-256]! +# CHECK-NEXT: 2 1 1.00 * str h10, [x10, #255]! +# CHECK-NEXT: 2 1 1.00 * str h13, [x23, #1]! +# CHECK-NEXT: 2 1 1.00 * str h15, [sp, #-256]! +# CHECK-NEXT: 2 1 1.00 * str s20, [x20, #255]! +# CHECK-NEXT: 2 1 1.00 * str s23, [x23, #1]! +# CHECK-NEXT: 2 1 1.00 * str s25, [x0, #-256]! +# CHECK-NEXT: 2 1 1.00 * str d20, [x20, #255]! +# CHECK-NEXT: 2 1 1.00 * str d23, [x23, #1]! +# CHECK-NEXT: 2 1 1.00 * str d25, [x0, #-256]! +# CHECK-NEXT: 3 3 1.00 * ldr b0, [x0, #255]! +# CHECK-NEXT: 3 3 1.00 * ldr b3, [x3, #1]! +# CHECK-NEXT: 3 3 1.00 * ldr b5, [sp, #-256]! +# CHECK-NEXT: 3 3 1.00 * ldr h10, [x10, #255]! +# CHECK-NEXT: 3 3 1.00 * ldr h13, [x23, #1]! +# CHECK-NEXT: 3 3 1.00 * ldr h15, [sp, #-256]! +# CHECK-NEXT: 3 3 1.00 * ldr s20, [x20, #255]! +# CHECK-NEXT: 3 3 1.00 * ldr s23, [x23, #1]! +# CHECK-NEXT: 3 3 1.00 * ldr s25, [x0, #-256]! +# CHECK-NEXT: 3 3 1.00 * ldr d20, [x20, #255]! +# CHECK-NEXT: 3 3 1.00 * ldr d23, [x23, #1]! +# CHECK-NEXT: 3 3 1.00 * ldr d25, [x0, #-256]! +# CHECK-NEXT: 3 3 1.00 * ldr q20, [x1, #255]! +# CHECK-NEXT: 3 3 1.00 * ldr q23, [x9, #1]! +# CHECK-NEXT: 3 3 1.00 * ldr q25, [x20, #-256]! +# CHECK-NEXT: 2 1 1.00 * str q10, [x1, #255]! +# CHECK-NEXT: 2 1 1.00 * str q22, [sp, #1]! +# CHECK-NEXT: 2 1 1.00 * str q21, [x20, #-256]! 
+# CHECK-NEXT: 1 1 1.00 * sttrb w9, [sp] +# CHECK-NEXT: 1 1 1.00 * sttrh wzr, [x12, #255] +# CHECK-NEXT: 1 1 1.00 * sttr w16, [x0, #-256] +# CHECK-NEXT: 1 1 1.00 * sttr x28, [x14, #1] +# CHECK-NEXT: 1 2 0.50 * ldtrb w1, [x20, #255] +# CHECK-NEXT: 1 2 0.50 * ldtrh w20, [x1, #255] +# CHECK-NEXT: 1 2 0.50 * ldtr w12, [sp, #255] +# CHECK-NEXT: 1 2 0.50 * ldtr xzr, [x12, #255] +# CHECK-NEXT: 1 2 0.50 * ldtrsb x9, [x7, #-256] +# CHECK-NEXT: 1 2 0.50 * ldtrsh x17, [x19, #-256] +# CHECK-NEXT: 1 2 0.50 * ldtrsw x20, [x15, #-256] +# CHECK-NEXT: 1 2 0.50 * ldtrsb w19, [x1, #-256] +# CHECK-NEXT: 1 2 0.50 * ldtrsh w15, [x21, #-256] +# CHECK-NEXT: 1 2 0.50 * ldr x4, [x29] +# CHECK-NEXT: 1 2 0.50 * ldr x30, [x12, #32760] +# CHECK-NEXT: 1 2 0.50 * ldr x20, [sp, #8] +# CHECK-NEXT: 1 2 0.50 * ldr xzr, [sp] +# CHECK-NEXT: 1 2 0.50 * ldr w2, [sp] +# CHECK-NEXT: 1 2 0.50 * ldr w17, [sp, #16380] +# CHECK-NEXT: 1 2 0.50 * ldr w13, [x2, #4] +# CHECK-NEXT: 1 2 0.50 * ldrsw x2, [x5, #4] +# CHECK-NEXT: 1 2 0.50 * ldrsw x23, [sp, #16380] +# CHECK-NEXT: 1 2 0.50 * ldrsw x21, [x25, x7] +# CHECK-NEXT: 1 2 0.50 * ldrh w2, [x4] +# CHECK-NEXT: 1 2 0.50 * ldrsh w23, [x6, #8190] +# CHECK-NEXT: 1 2 0.50 * ldrsh wzr, [sp, #2] +# CHECK-NEXT: 1 2 0.50 * ldrsh x29, [x2, #2] +# CHECK-NEXT: 1 2 0.50 * ldrsh x25, [x8, w13, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldrb w26, [x3, #121] +# CHECK-NEXT: 1 2 0.50 * ldrb w12, [x2] +# CHECK-NEXT: 1 2 0.50 * ldrsb w27, [sp, #4095] +# CHECK-NEXT: 1 2 0.50 * ldrsb xzr, [x15] +# CHECK-NEXT: 1 2 0.50 * ldrsb x12, [x28, x27] +# CHECK-NEXT: 1 1 1.00 * str x30, [sp] +# CHECK-NEXT: 1 1 1.00 * str w20, [x4, #16380] +# CHECK-NEXT: 1 1 1.00 * str b5, [x11] +# CHECK-NEXT: 1 1 1.00 * str h23, [x15] +# CHECK-NEXT: 1 1 1.00 * str s25, [x19] +# CHECK-NEXT: 1 1 1.00 * str d15, [x2] +# CHECK-NEXT: 1 1 1.00 * strh w17, [sp, #8190] +# CHECK-NEXT: 1 1 1.00 * strb w23, [x3, #4095] +# CHECK-NEXT: 1 1 1.00 * strb wzr, [x2] +# CHECK-NEXT: 1 3 0.50 * ldr b31, [sp, #4095] +# CHECK-NEXT: 1 3 0.50 * ldr 
h20, [x2, #8190] +# CHECK-NEXT: 1 3 0.50 * ldr s10, [x19, #16380] +# CHECK-NEXT: 1 3 0.50 * ldr d3, [x10, #32760] +# CHECK-NEXT: 1 1 1.00 * str q12, [sp, #65520] +# CHECK-NEXT: 1 3 0.50 * ldr q14, [x6, #4624] +# CHECK-NEXT: 1 2 0.50 * ldrb w3, [sp, x5] +# CHECK-NEXT: 1 2 0.50 * ldrb w9, [x27, x6] +# CHECK-NEXT: 1 2 0.50 * ldrsb w10, [x30, x7] +# CHECK-NEXT: 1 2 0.50 * ldrb w11, [x29, x3, sxtx] +# CHECK-NEXT: 1 1 1.00 * strb w12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 1 1.00 * strb w5, [x26, w7, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldrb w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldrsb w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldrb w17, [x23, w9, sxtw] +# CHECK-NEXT: 1 2 0.50 * ldrsb x18, [x22, w10, sxtw] +# CHECK-NEXT: 1 2 0.50 * ldrsh w3, [sp, x5] +# CHECK-NEXT: 1 2 0.50 * ldrsh w9, [x27, x6] +# CHECK-NEXT: 1 2 0.50 * ldrh w10, [x30, x7, lsl #1] +# CHECK-NEXT: 1 1 1.00 * strh w11, [x29, x3, sxtx] +# CHECK-NEXT: 1 2 0.50 * ldrh w12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 2 0.50 * ldrsh x13, [x27, x5, sxtx #1] +# CHECK-NEXT: 1 2 0.50 * ldrh w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldrh w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldrsh w16, [x24, w8, uxtw #1] +# CHECK-NEXT: 1 2 0.50 * ldrh w17, [x23, w9, sxtw] +# CHECK-NEXT: 1 2 0.50 * ldrh w18, [x22, w10, sxtw] +# CHECK-NEXT: 1 1 1.00 * strh w19, [x21, wzr, sxtw #1] +# CHECK-NEXT: 1 3 0.50 * ldr b25, [x21, w8, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr b8, [x30, x10] +# CHECK-NEXT: 1 1 1.00 * str b14, [x13, x25] +# CHECK-NEXT: 1 1 1.00 * str b30, [x16, w26, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr h3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldr h9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldr h10, [x30, x7, lsl #1] +# CHECK-NEXT: 1 1 1.00 * str h11, [x29, x3, sxtx] +# CHECK-NEXT: 1 1 1.00 * str h12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 1 1.00 * str h13, [x27, x5, sxtx #1] +# CHECK-NEXT: 1 3 0.50 * ldr h14, [x26, w6, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr h15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr h16, [x24, w8, uxtw #1] +# 
CHECK-NEXT: 1 3 0.50 * ldr h17, [x23, w9, sxtw] +# CHECK-NEXT: 1 1 1.00 * str h18, [x22, w10, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldr h19, [x21, wzr, sxtw #1] +# CHECK-NEXT: 1 3 0.50 * ldr s12, [x30, w5, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr d24, [x26, w7, uxtw] +# CHECK-NEXT: 1 1 1.00 * str s20, [x24, w10, uxtw] +# CHECK-NEXT: 1 1 1.00 * str d5, [x26, x6] +# CHECK-NEXT: 1 2 0.50 * ldr w3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldr s9, [x27, x6] +# CHECK-NEXT: 1 2 0.50 * ldr w10, [x30, x7, lsl #2] +# CHECK-NEXT: 1 2 0.50 * ldr w11, [x29, x3, sxtx] +# CHECK-NEXT: 1 1 1.00 * str s12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 1 1.00 * str w13, [x27, x5, sxtx #2] +# CHECK-NEXT: 1 1 1.00 * str w14, [x26, w6, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldr w15, [x25, w7, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldr w16, [x24, w8, uxtw #2] +# CHECK-NEXT: 1 2 0.50 * ldrsw x17, [x23, w9, sxtw] +# CHECK-NEXT: 1 2 0.50 * ldr w18, [x22, w10, sxtw] +# CHECK-NEXT: 1 2 0.50 * ldrsw x19, [x21, wzr, sxtw #2] +# CHECK-NEXT: 1 2 0.50 * ldr x3, [sp, x5] +# CHECK-NEXT: 1 1 1.00 * str x9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldr d10, [x30, x7, lsl #3] +# CHECK-NEXT: 1 1 1.00 * str x11, [x29, x3, sxtx] +# CHECK-NEXT: 1 2 0.50 * ldr x12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 2 0.50 * ldr x13, [x27, x5, sxtx #3] +# CHECK-NEXT: 1 2 0.50 U prfm pldl1keep, [x26, w6, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldr x15, [x25, w7, uxtw] +# CHECK-NEXT: 1 1 1.00 * str x27, [x26, w24, uxtw] +# CHECK-NEXT: 1 2 0.50 * ldr x16, [x24, w8, uxtw #3] +# CHECK-NEXT: 1 2 0.50 * ldr x17, [x23, w9, sxtw] +# CHECK-NEXT: 1 2 0.50 * ldr x18, [x22, w10, sxtw] +# CHECK-NEXT: 1 1 1.00 * str d19, [x21, wzr, sxtw #3] +# CHECK-NEXT: 1 3 0.50 * ldr q3, [sp, x5] +# CHECK-NEXT: 1 3 0.50 * ldr q9, [x27, x6] +# CHECK-NEXT: 1 3 0.50 * ldr q10, [x30, x7, lsl #4] +# CHECK-NEXT: 1 1 1.00 * str q11, [x29, x3, sxtx] +# CHECK-NEXT: 1 1 1.00 * str q12, [x28, xzr, sxtx] +# CHECK-NEXT: 1 1 1.00 * str q13, [x27, x5, sxtx #4] +# CHECK-NEXT: 1 3 0.50 * ldr q14, [x26, w6, uxtw] +# 
CHECK-NEXT: 1 3 0.50 * ldr q15, [x25, w7, uxtw] +# CHECK-NEXT: 1 3 0.50 * ldr q16, [x24, w8, uxtw #4] +# CHECK-NEXT: 1 3 0.50 * ldr q17, [x23, w9, sxtw] +# CHECK-NEXT: 1 1 1.00 * str q18, [x22, w10, sxtw] +# CHECK-NEXT: 1 3 0.50 * ldr q19, [x21, wzr, sxtw #4] +# CHECK-NEXT: 1 2 0.50 * ldp w3, w5, [sp] +# CHECK-NEXT: 1 1 1.00 * stp wzr, w9, [sp, #252] +# CHECK-NEXT: 1 2 0.50 * ldp w2, wzr, [sp, #-256] +# CHECK-NEXT: 1 2 0.50 * ldp w9, w10, [sp, #4] +# CHECK-NEXT: 1 2 0.50 * ldpsw x9, x10, [sp, #4] +# CHECK-NEXT: 1 2 0.50 * ldpsw x9, x10, [x2, #-256] +# CHECK-NEXT: 1 2 0.50 * ldpsw x20, x30, [sp, #252] +# CHECK-NEXT: 1 2 0.50 * ldp x21, x29, [x2, #504] +# CHECK-NEXT: 1 2 0.50 * ldp x22, x23, [x3, #-512] +# CHECK-NEXT: 1 2 0.50 * ldp x24, x25, [x4, #8] +# CHECK-NEXT: 2 3 1.00 * ldp s29, s28, [sp, #252] +# CHECK-NEXT: 1 1 1.00 * stp s27, s26, [sp, #-256] +# CHECK-NEXT: 2 3 1.00 * ldp s1, s2, [x3, #44] +# CHECK-NEXT: 1 1 1.00 * stp d3, d5, [x9, #504] +# CHECK-NEXT: 1 1 1.00 * stp d7, d11, [x10, #-512] +# CHECK-NEXT: 1 1 1.00 * stnp x20, x16, [x8] +# CHECK-NEXT: 1 1 1.00 * stp x3, x6, [x16] +# CHECK-NEXT: 2 3 1.00 * ldp d2, d3, [x30, #-8] +# CHECK-NEXT: 1 1 2.00 * stp q3, q5, [sp] +# CHECK-NEXT: 1 1 2.00 * stp q17, q19, [sp, #1008] +# CHECK-NEXT: 2 3 1.00 * ldp q23, q29, [x1, #-1024] +# CHECK-NEXT: 3 2 1.00 * ldp w3, w5, [sp], #0 +# CHECK-NEXT: 2 1 1.00 * stp wzr, w9, [sp], #252 +# CHECK-NEXT: 3 2 1.00 * ldp w2, wzr, [sp], #-256 +# CHECK-NEXT: 3 2 1.00 * ldp w9, w10, [sp], #4 +# CHECK-NEXT: 3 2 1.00 * ldpsw x9, x10, [sp], #4 +# CHECK-NEXT: 3 2 1.00 * ldpsw x9, x10, [x2], #-256 +# CHECK-NEXT: 3 2 1.00 * ldpsw x20, x30, [sp], #252 +# CHECK-NEXT: 3 2 1.00 * ldp x21, x29, [x2], #504 +# CHECK-NEXT: 3 2 1.00 * ldp x22, x23, [x3], #-512 +# CHECK-NEXT: 3 2 1.00 * ldp x24, x25, [x4], #8 +# CHECK-NEXT: 3 3 1.00 * ldp s29, s28, [sp], #252 +# CHECK-NEXT: 2 1 1.00 * stp s27, s26, [sp], #-256 +# CHECK-NEXT: 3 3 1.00 * ldp s1, s2, [x3], #44 +# CHECK-NEXT: 2 1 1.00 * stp d3, d5, [x9], 
#504 +# CHECK-NEXT: 2 1 1.00 * stp d7, d11, [x10], #-512 +# CHECK-NEXT: 3 3 1.00 * ldp d2, d3, [x30], #-8 +# CHECK-NEXT: 1 1 2.00 * stp q3, q5, [sp], #0 +# CHECK-NEXT: 1 1 2.00 * stp q17, q19, [sp], #1008 +# CHECK-NEXT: 3 3 1.00 * ldp q23, q29, [x1], #-1024 +# CHECK-NEXT: 3 2 1.00 * ldp w3, w5, [sp, #0]! +# CHECK-NEXT: 2 1 1.00 * stp wzr, w9, [sp, #252]! +# CHECK-NEXT: 3 2 1.00 * ldp w2, wzr, [sp, #-256]! +# CHECK-NEXT: 3 2 1.00 * ldp w9, w10, [sp, #4]! +# CHECK-NEXT: 3 2 1.00 * ldpsw x9, x10, [sp, #4]! +# CHECK-NEXT: 3 2 1.00 * ldpsw x9, x10, [x2, #-256]! +# CHECK-NEXT: 3 2 1.00 * ldpsw x20, x30, [sp, #252]! +# CHECK-NEXT: 3 2 1.00 * ldp x21, x29, [x2, #504]! +# CHECK-NEXT: 3 2 1.00 * ldp x22, x23, [x3, #-512]! +# CHECK-NEXT: 3 2 1.00 * ldp x24, x25, [x4, #8]! +# CHECK-NEXT: 3 3 1.00 * ldp s29, s28, [sp, #252]! +# CHECK-NEXT: 2 1 1.00 * stp s27, s26, [sp, #-256]! +# CHECK-NEXT: 3 3 1.00 * ldp s1, s2, [x3, #44]! +# CHECK-NEXT: 2 1 1.00 * stp d3, d5, [x9, #504]! +# CHECK-NEXT: 2 1 1.00 * stp d7, d11, [x10, #-512]! +# CHECK-NEXT: 3 3 1.00 * ldp d2, d3, [x30, #-8]! +# CHECK-NEXT: 1 1 2.00 * stp q3, q5, [sp, #0]! +# CHECK-NEXT: 1 1 2.00 * stp q17, q19, [sp, #1008]! +# CHECK-NEXT: 3 3 1.00 * ldp q23, q29, [x1, #-1024]! 
+# CHECK-NEXT: 1 2 0.50 * ldnp w3, w5, [sp] +# CHECK-NEXT: 1 1 1.00 * stnp wzr, w9, [sp, #252] +# CHECK-NEXT: 1 2 0.50 * ldnp w2, wzr, [sp, #-256] +# CHECK-NEXT: 1 2 0.50 * ldnp w9, w10, [sp, #4] +# CHECK-NEXT: 1 2 0.50 * ldnp x21, x29, [x2, #504] +# CHECK-NEXT: 1 2 0.50 * ldnp x22, x23, [x3, #-512] +# CHECK-NEXT: 1 2 0.50 * ldnp x24, x25, [x4, #8] +# CHECK-NEXT: 2 3 1.00 * ldnp s29, s28, [sp, #252] +# CHECK-NEXT: 1 1 1.00 * stnp s27, s26, [sp, #-256] +# CHECK-NEXT: 2 3 1.00 * ldnp s1, s2, [x3, #44] +# CHECK-NEXT: 1 1 1.00 * stnp d3, d5, [x9, #504] +# CHECK-NEXT: 1 1 1.00 * stnp d7, d11, [x10, #-512] +# CHECK-NEXT: 2 3 1.00 * ldnp d2, d3, [x30, #-8] +# CHECK-NEXT: 1 1 2.00 * stnp q3, q5, [sp] +# CHECK-NEXT: 1 1 2.00 * stnp q17, q19, [sp, #1008] +# CHECK-NEXT: 2 3 1.00 * ldnp q23, q29, [x1, #-1024] +# CHECK-NEXT: 1 1 0.50 and wsp, w16, #0xe00 +# CHECK-NEXT: 1 1 0.50 and x2, x22, #0x1e00 +# CHECK-NEXT: 1 1 0.50 ands w14, w8, #0x70 +# CHECK-NEXT: 1 1 0.50 ands x4, x10, #0x60 +# CHECK-NEXT: 1 1 0.50 eor wsp, w4, #0xe00 +# CHECK-NEXT: 1 1 0.50 eor x27, x25, #0x1e00 +# CHECK-NEXT: 1 1 0.50 mov w3, #983055 +# CHECK-NEXT: 1 1 0.50 mov x10, #-6148914691236517206 +# CHECK-NEXT: 1 1 0.50 and w12, w23, w21 +# CHECK-NEXT: 1 1 0.50 and w16, w15, w1, lsl #1 +# CHECK-NEXT: 1 1 0.50 and w9, w4, w10, lsl #31 +# CHECK-NEXT: 1 1 0.50 and w3, w30, w11 +# CHECK-NEXT: 1 1 0.50 and x3, x5, x7, lsl #63 +# CHECK-NEXT: 1 1 0.50 and x5, x14, x19, asr #4 +# CHECK-NEXT: 1 1 0.50 and w3, w17, w19, ror #31 +# CHECK-NEXT: 1 1 0.50 and w0, w2, wzr, lsr #17 +# CHECK-NEXT: 1 1 0.50 and w3, w30, w11, asr #2 +# CHECK-NEXT: 1 1 0.50 and xzr, x4, x26 +# CHECK-NEXT: 1 1 0.50 and w3, wzr, w20, ror #2 +# CHECK-NEXT: 1 1 0.50 and x7, x20, xzr, asr #63 +# CHECK-NEXT: 1 1 0.50 bic x13, x20, x14, lsl #47 +# CHECK-NEXT: 1 1 0.50 bic w2, w7, w9 +# CHECK-NEXT: 1 1 0.50 eon w29, w4, w19 +# CHECK-NEXT: 1 1 0.50 eon x19, x12, x2 +# CHECK-NEXT: 1 1 0.50 eor w8, w27, w2 +# CHECK-NEXT: 1 1 0.50 eor x22, x16, x6 +# 
CHECK-NEXT: 1 1 0.50 orr w2, w7, w0, asr #31 +# CHECK-NEXT: 1 1 0.50 orr x8, x9, x10, lsl #12 +# CHECK-NEXT: 1 1 0.50 orn x3, x5, x7, asr #2 +# CHECK-NEXT: 1 1 0.50 orn w2, w5, w29 +# CHECK-NEXT: 1 1 0.50 ands w7, wzr, w9, lsl #1 +# CHECK-NEXT: 1 1 0.50 ands x3, x5, x20, ror #63 +# CHECK-NEXT: 1 1 0.50 bics w3, w5, w7 +# CHECK-NEXT: 1 1 0.50 bics x3, xzr, x3, lsl #1 +# CHECK-NEXT: 1 1 0.50 tst w3, w7, lsl #31 +# CHECK-NEXT: 1 1 0.50 tst x2, x20, asr #2 +# CHECK-NEXT: 1 1 0.50 mov x3, x6 +# CHECK-NEXT: 1 1 0.50 mov x3, xzr +# CHECK-NEXT: 1 1 0.50 mov wzr, w2 +# CHECK-NEXT: 1 1 0.50 mov w3, w5 +# CHECK-NEXT: 1 1 0.50 movz w2, #0, lsl #16 +# CHECK-NEXT: 1 1 0.50 mov w2, #-1235 +# CHECK-NEXT: 1 1 0.50 mov x2, #5299989643264 +# CHECK-NEXT: 1 1 0.50 mov x2, #0 +# CHECK-NEXT: 1 1 0.50 movk w3, #0 +# CHECK-NEXT: 1 1 0.50 movz x4, #0, lsl #16 +# CHECK-NEXT: 1 1 0.50 movk w5, #0, lsl #16 +# CHECK-NEXT: 1 1 0.50 movz x6, #0, lsl #32 +# CHECK-NEXT: 1 1 0.50 movk x7, #0, lsl #32 +# CHECK-NEXT: 1 1 0.50 movz x8, #0, lsl #48 +# CHECK-NEXT: 1 1 0.50 movk x9, #0, lsl #48 +# CHECK-NEXT: 1 1 1.00 U msr DAIFSet, #0 +# CHECK-NEXT: 1 1 0.50 adr x2, #1600 +# CHECK-NEXT: 1 1 0.50 adrp x21, #6553600 +# CHECK-NEXT: 1 1 0.50 adr x0, #262144 +# CHECK-NEXT: 1 1 1.00 tbz x12, #62, #0 +# CHECK-NEXT: 1 1 1.00 tbz x12, #62, #4 +# CHECK-NEXT: 1 1 1.00 tbz x12, #62, #-32768 +# CHECK-NEXT: 1 1 1.00 tbz w17, #16, test +# CHECK-NEXT: 1 1 1.00 tbnz x12, #60, #32764 +# CHECK-NEXT: 1 1 1.00 tbnz w3, #28, test +# CHECK-NEXT: 1 1 1.00 b #4 +# CHECK-NEXT: 1 1 1.00 b #-4 +# CHECK-NEXT: 1 1 1.00 b #134217724 +# CHECK-NEXT: 1 1 1.00 bl test +# CHECK-NEXT: 1 1 1.00 br x20 +# CHECK-NEXT: 1 1 1.00 blr xzr +# CHECK-NEXT: 1 1 1.00 U ret x10 +# CHECK-NEXT: 1 1 1.00 U ret +# CHECK-NEXT: 1 1 1.00 U eret +# CHECK-NEXT: 1 1 1.00 U drps + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - 
C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: 260.50 260.50 27.00 64.00 178.50 328.50 89.00 - 136.50 136.50 8.50 8.50 67.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w2, w3, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w30, w29, #1, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w13, w5, #4095, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x5, x7, #1638 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w20, wsp, #801 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add wsp, wsp, #1104 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add wsp, w30, #4084 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x0, x24, #291 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x3, x24, #4095, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x8, sp, #1074 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add sp, x29, #3816 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w0, wsp, #4077 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w4, w20, #546, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub sp, sp, #288 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub wsp, w19, #16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w13, w23, #291, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w2, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w20, wsp, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x3, #1, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp wsp, #2342 +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp sp, #20, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x30, #4095 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x4, sp, #3822 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w3, #291, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn wsp, #1365 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn sp, #1092, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov x10, #-63432 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add wsp, wsp, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x25, x9, w25, uxtb +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w3, w5, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add wzr, w3, w5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w20, wzr, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w4, w6, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w11, w13, w15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w9, w3, wzr, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w2, w3, w4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w5, w6, w7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add w8, w9, w10, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x3, x5, x7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add xzr, x3, x5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x20, xzr, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x4, x6, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x11, x13, x15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x17, x29, x20, lsl #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - 
add x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x2, x3, x4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x5, x6, x7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - add x8, x9, x10, asr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w3, w5, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w17, wsp, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x13, x23, w8, uxtb +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w3, w5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w20, wzr, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w4, w6, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w11, w13, w15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w9, w3, wzr, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w2, w3, w4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w5, w6, w7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds w8, w9, w10, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x3, x5, x7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x3, x5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x20, xzr, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x4, x6, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x11, x13, x15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x17, x29, x20, lsl #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - 
- - - - - - adds x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x2, x3, x4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x5, x6, x7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adds x8, x9, x10, asr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w3, w5, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub wzr, w3, w5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w4, w6, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w11, w13, w15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w9, w3, wzr, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w2, w3, w4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w5, w6, w7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w8, w9, w10, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x3, x5, x7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub xzr, x3, x5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x4, x6, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x11, x13, x15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x17, x29, x20, lsl #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x2, x3, x4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x5, x6, x7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x8, x9, x10, asr #63 +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub w13, wsp, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sub x16, x2, w19, uxtb +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x13, x15, x14, sxtx #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w3, w5, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w3, w5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w4, w6, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w11, w13, w15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w9, w3, wzr, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w17, w29, w20, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w21, w22, w23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w24, w25, w26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w27, w28, w29, lsr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w2, w3, w4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w5, w6, w7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs w8, w9, w10, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x3, x5, x7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x3, x5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x4, x6, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x11, x13, x15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x9, x3, xzr, lsl #10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x17, x29, x20, lsl #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x21, x22, x23, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x24, x25, x26, lsr #18 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x27, x28, x29, lsr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x2, x3, x4, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x5, x6, x7, asr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subs x8, x9, x10, asr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn wzr, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w5, 
wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w6, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w8, w9, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w10, w11, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w12, w13, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w14, w15, lsr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w16, w17, lsr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w18, w19, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w20, w21, asr #22 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn w22, w23, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x0, x3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn xzr, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x5, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x6, x7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x8, x9, lsl #15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x10, x11, lsl #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x12, x13, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x14, x15, lsr #41 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x16, x17, lsr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x18, x19, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x20, x21, asr #55 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmn x22, x23, asr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w0, w3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp wzr, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w5, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w6, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w8, w9, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w10, w11, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w12, w13, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w14, w15, lsr #21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w18, w19, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - 
- - - - - - cmp w20, w21, asr #22 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp w22, w23, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp wsp, w26 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x16, w27, uxtb +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x0, x3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp xzr, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x5, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x6, x7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x8, x9, lsl #15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x10, x11, lsl #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x12, x13, lsr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x14, x15, lsr #41 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x16, x17, lsr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x18, x19, asr #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x20, x21, asr #55 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp x22, x23, asr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp wzr, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cmp xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov sp, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov wsp, w20 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov x11, sp +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov w24, wsp +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc w9, wzr, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc x9, xzr, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adc x20, x0, xzr +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs w29, w27, w25 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs 
wzr, w3, w4 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs w9, wzr, w10 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs w20, w0, wzr +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs x29, x27, x25 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs xzr, x3, x4 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs x9, xzr, x10 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - adcs x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbc w29, w27, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbc wzr, w3, w4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbc w20, w0, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbc x29, x27, x25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbc xzr, x3, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc x9, x10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbc x20, x0, xzr +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - sbcs w29, w27, w25 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - sbcs wzr, w3, w4 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs w9, w10 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - sbcs w20, w0, wzr +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - sbcs x29, x27, x25 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - sbcs xzr, x3, x4 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs x9, x10 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - sbcs x20, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc w3, w12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc wzr, w9 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc w23, wzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc x29, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc xzr, x0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ngc x0, xzr +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs w3, w12 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs wzr, w9 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs w23, wzr +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - 
ngcs x29, x30 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs xzr, x0 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ngcs x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfx x1, x2, #3, #2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x3, x4, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr wzr, wzr, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfx w12, w9, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfiz x4, x5, #52, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfx xzr, x4, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfiz x4, xzr, #1, #6 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x5, x6, #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfi x4, x5, #52, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil xzr, x4, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfc x4, #1, #6 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil x5, x6, #12, #52 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sxtb w1, w2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sxtb xzr, w3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sxth w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sxth x0, w1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sxtw x3, w30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - uxtb w1, w2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - uxth w9, w10 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfx x3, x30, #0, #32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w3, w2, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w9, w10, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x20, x21, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w1, wzr, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w3, w2, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w9, w10, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x20, x21, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr wzr, wzr, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl w9, w10, #31 +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl x20, x21, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl w1, wzr, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfiz x2, x3, #63, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfiz x9, x10, #5, #59 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfiz w11, w12, #31, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfiz w13, w14, #29, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfx w9, w10, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x2, x3, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x19, x20, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x9, x10, #5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w9, w10, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w11, w12, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w13, w14, #29 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - sbfx xzr, xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfi x2, x3, #63, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfi x9, x10, #5, #59 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfi w11, w12, #31, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfi w13, w14, #29, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfc xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil w9, w10, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil x2, x3, #63, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil x19, x20, #0, #64 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil x9, x10, #5, #59 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil w9, w10, #0, #32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil w11, w12, #31, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil w13, w14, #29, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bfxil xzr, xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl x2, x3, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - 
- - - - - - lsl x9, x10, #5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl w11, w12, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl w13, w14, #29 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfiz xzr, xzr, #10, #11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfx w9, w10, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x2, x3, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x19, x20, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x9, x10, #5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w9, w10, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w11, w12, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w13, w14, #29 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ubfx xzr, xzr, #10, #11 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbz w5, #4 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbz x5, #0 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbnz x2, #-4 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbnz x26, #1048572 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbz wzr, #0 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbnz xzr, #0 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - cbnz w21, test +# CHECK-NEXT: - - 1.00 - - - - - - - - - - b.ne #4 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - b.ge #1048572 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - b.ge #-4 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp w1, #31, #0, eq +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp w3, #0, #15, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp wzr, #15, #13, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp x9, #31, #0, le +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp x3, #0, #15, gt +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp xzr, #5, #7, ne +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn w1, #31, #0, eq +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn w3, #0, #15, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn wzr, #15, #13, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn x9, #31, #0, 
le +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn x3, #0, #15, gt +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn xzr, #5, #7, ne +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp w1, wzr, #0, eq +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp w3, w0, #15, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp wzr, w15, #13, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp x9, xzr, #0, le +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp x3, x0, #15, gt +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmp xzr, x5, #7, ne +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn w1, wzr, #0, eq +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn w3, w0, #15, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn wzr, w15, #13, hs +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn x9, xzr, #0, le +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn x3, x0, #15, gt +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - ccmn xzr, x5, #7, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csel x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc x5, xzr, x6, hs +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg w1, w0, w19, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg wzr, w5, w9, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg w9, wzr, w30, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg w1, w28, wzr, mi +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg x19, x23, x29, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg xzr, x3, x4, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg x5, xzr, x6, hs +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg x7, x8, xzr, lo +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cset w3, eq +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cset x9, pl +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csetm w20, ne +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csetm x30, ge +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc w2, wzr, wzr, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv x3, xzr, xzr, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinc w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinc wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cset w9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinc x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinc xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cset x9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc w5, w6, w6, nv 
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinc x1, x2, x2, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinv w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinv wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csetm w9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinv x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cinv xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csetm x9, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv x1, x0, x0, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csinv w9, w8, w8, nv +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cneg w3, w5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cneg wzr, w4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cneg w9, wzr, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cneg x3, x5, gt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cneg xzr, x4, le +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cneg x9, xzr, lt +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - csneg x4, x8, x8, al +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rbit w0, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rbit x18, x3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rev16 w17, w1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rev16 x5, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rev w18, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rev32 x20, x1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rev x22, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - clz w24, w3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - clz x26, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cls w3, w5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - cls x20, x5 +# CHECK-NEXT: - - - 12.00 - - - - - - - - - udiv w0, w7, w10 +# CHECK-NEXT: - - - 20.00 - - - - - - - - - udiv x9, x22, x4 +# CHECK-NEXT: - - - 12.00 - - - - - - - - - sdiv w12, w21, w0 +# CHECK-NEXT: - - - 20.00 - - - - - - - - - sdiv x13, x2, x1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl w11, w12, w13 +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl x14, x15, x16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w17, w18, w19 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x20, x21, x22 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w23, w24, w25 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x26, x27, x28 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror w0, w1, w2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror x3, x4, x5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl w6, w7, w8 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsl x9, x10, x11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr w12, w13, w14 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - lsr x15, x16, x17 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr w18, w19, w20 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - asr x21, x22, x23 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror w24, w25, w26 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror x27, x28, x29 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32cb w30, w23, w15 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32cb wzr, w12, w14 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32cb w28, w10, w11 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32b w27, w12, w15 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32h w3, w15, w21 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32w w9, w18, w24 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32x w19, w6, x25 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32ch w25, w26, w16 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32cw w27, w12, w23 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - crc32cx w21, w28, x5 +# CHECK-NEXT: - - - - - - 4.00 - - - - - - smulh x30, x29, x28 +# CHECK-NEXT: - - - - - - 4.00 - - - - - - smulh xzr, x27, x26 +# CHECK-NEXT: - - - - - - 4.00 - - - - - - umulh x30, x29, x28 +# CHECK-NEXT: - - - - - - 4.00 - - - - - - umulh x23, x30, xzr +# CHECK-NEXT: - - - - - - 1.00 - - - - - - madd w1, w3, w7, w4 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - madd wzr, w0, w9, w11 
+# CHECK-NEXT: - - - - - - 1.00 - - - - - - madd w13, wzr, w4, w4 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - madd w19, w30, wzr, w29 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - mul w4, w5, w6 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - madd x1, x3, x7, x4 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - madd xzr, x0, x9, x11 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - madd x13, xzr, x4, x4 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - madd x19, x30, xzr, x29 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - mul x4, x5, x6 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - msub w1, w3, w7, w4 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - msub wzr, w0, w9, w11 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - msub w13, wzr, w4, w4 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - msub w19, w30, wzr, w29 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - mneg w4, w5, w6 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - msub x1, x3, x7, x4 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - msub xzr, x0, x9, x11 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - msub x13, xzr, x4, x4 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - msub x19, x30, xzr, x29 +# CHECK-NEXT: - - - - - - 2.00 - - - - - - mneg x4, x5, x6 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smaddl x3, w5, w2, x9 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smaddl xzr, w10, w11, x12 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smaddl x13, wzr, w14, x15 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smaddl x16, w17, wzr, x18 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smull x19, w20, w21 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smsubl x3, w5, w2, x9 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smsubl xzr, w10, w11, x12 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smsubl x13, wzr, w14, x15 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smsubl x16, w17, wzr, x18 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smnegl x19, w20, w21 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umaddl x3, w5, w2, x9 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umaddl xzr, w10, w11, x12 +# CHECK-NEXT: - - 
- - - - 1.00 - - - - - - umaddl x13, wzr, w14, x15 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umaddl x16, w17, wzr, x18 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umull x19, w20, w21 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umsubl x3, w5, w2, x9 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umsubl x16, w17, wzr, x18 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umnegl x19, w20, w21 +# CHECK-NEXT: - - - - - - 4.00 - - - - - - smulh x23, x22, xzr +# CHECK-NEXT: - - - - - - 4.00 - - - - - - umulh x23, x22, xzr +# CHECK-NEXT: - - - - - - 2.00 - - - - - - mul x19, x20, xzr +# CHECK-NEXT: - - - - - - 1.00 - - - - - - mneg w21, w22, w23 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smull x11, w13, w17 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umull x11, w13, w17 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - smnegl x11, w13, w17 +# CHECK-NEXT: - - - - - - 1.00 - - - - - - umnegl x11, w13, w17 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - extr w3, w5, w7, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - extr w11, w13, w17, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - extr x3, x5, x7, #15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - extr x11, x13, x17, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror x19, x23, #24 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror x29, xzr, #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ror w9, w13, #31 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmp h5, h21 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmp h5, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmpe h22, h21 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmpe h13, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmp s3, s5 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmp s31, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmpe s29, s30 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmpe s15, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmp d4, d12 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmp d23, #0.0 +# CHECK-NEXT: - 
- - - - - - - 1.00 1.00 - - - fcmpe d26, d22 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmpe d29, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp s1, s31, #0, eq +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp s3, s0, #15, hs +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp s31, s15, #13, hs +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp d9, d31, #0, le +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp d3, d0, #15, gt +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp d31, d5, #7, ne +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmp h31, h3, #11, hs +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe h6, h1, #12, ne +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe s1, s31, #0, eq +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe s3, s0, #15, hs +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe s31, s15, #13, hs +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe d9, d31, #0, le +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe d3, d0, #15, gt +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fccmpe d31, d5, #7, ne +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcsel s3, s20, s9, pl +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcsel d9, d10, d11, mi +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcsel h26, h2, h11, hs +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov h18, h28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov s0, s1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs s2, s3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg h2, h9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg s4, s5 +# CHECK-NEXT: - - - - - - - - - - - - 9.00 fsqrt s6, s7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt d8, s9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt h10, s11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn h12, h3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp h17, h31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - 
- - frintp s14, s15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm h0, h21 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm s16, s17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz h10, h29 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz s18, s19 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta h22, h10 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta s20, s21 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx h4, h5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx s22, s23 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti s24, s25 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti h31, h14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov d0, d1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs d2, d3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg d4, d5 +# CHECK-NEXT: - - - - - - - - - - - - 5.00 fsqrt h13, h24 +# CHECK-NEXT: - - - - - - - - - - - - 19.00 fsqrt d6, d7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt s8, d9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt h10, d11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn d12, d13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp d14, d15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm d16, d17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz d18, d19 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta d20, d21 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx d22, d23 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti d24, d25 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt s26, h27 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt d28, h29 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul s20, s19, s17 +# CHECK-NEXT: - - - - - - - - - - - - 5.00 fdiv h1, h26, h23 +# CHECK-NEXT: - - - - - - - - - - - - 10.00 fdiv s1, s2, s3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd h23, h27, h22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd s4, s5, s6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub h20, 
h11, h18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub s7, s8, s9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax s10, s11, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax h8, h7, h11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin s13, s14, s15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm h29, h13, h14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm s16, s17, s18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm s19, s20, s21 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmul h3, h15, h7 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmul s22, s23, s2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul d20, d19, d17 +# CHECK-NEXT: - - - - - - - - - - - - 19.00 fdiv d1, d2, d3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd d4, d5, d6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub d7, d8, d9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax d10, d11, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin d13, d14, d15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin h4, h13, h17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm d16, d17, d18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm d19, d20, d21 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm h29, h23, h17 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmul d22, d23, d24 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmadd h27, h0, h6, h28 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmadd s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmadd d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmsub h25, h28, h12, h24 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmsub s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmsub d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmadd h3, h18, h31, h24 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmadd s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmadd d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - - 0.50 
0.50 - fnmsub s3, s5, s6, s31 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmsub d3, d13, d0, d23 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmsub h3, h29, h24, h17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w3, h5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs wzr, h20, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w19, h0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x3, h5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x12, h30, #45 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x19, h0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w3, s5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs wzr, s20, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w19, s0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x3, s5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x12, s30, #45 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x19, s0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w3, d5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs wzr, d20, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w19, d0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x3, d5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x12, d30, #45 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x19, d0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w3, h5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu wzr, h20, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w19, h0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x3, h5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x12, h30, #45 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x19, h0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w3, s5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu wzr, s20, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w19, s0, #32 +# CHECK-NEXT: - - - - - - 
- - 0.50 0.50 - - - fcvtzu x3, s5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x12, s30, #45 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x19, s0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w3, d5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu wzr, d20, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w19, d0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x3, d5, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x12, d30, #45 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x19, d0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h23, w19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h31, wzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h14, w0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h23, x19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h31, xzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h14, x0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s23, w19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s31, wzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s14, w0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s23, x19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s31, xzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s14, x0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d23, w19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d31, wzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d14, w0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d23, x19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d31, xzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d14, x0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h23, w19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h31, wzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h14, w0, #32 +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - ucvtf h23, x19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h31, xzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h14, x0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s23, w19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s31, wzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s14, w0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s23, x19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s31, xzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s14, x0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d23, w19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d31, wzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d14, w0, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d23, x19, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d31, xzr, #20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d14, x0, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns w3, h31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns xzr, h12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu wzr, h12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu x0, h0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps wzr, h9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps x12, h20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu w30, h23 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu x29, h3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms w2, h3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms x4, h5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu w6, h7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu x8, h9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w10, h11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w14, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x15, h16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 
- - - scvtf h17, w18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h19, x20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h21, w22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h23, x24 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas w25, h26 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas x27, h28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau w29, h30 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau xzr, h0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns w3, s31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns xzr, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu wzr, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu x0, s0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps wzr, s9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps x12, s20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu w30, s23 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu x29, s3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms w2, s3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms x4, s5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu w6, s7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu x8, s9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w10, s11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w14, s15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x15, s16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s17, w18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s19, x20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s21, w22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s23, x24 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas w25, s26 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas x27, s28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau w29, s30 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau xzr, s0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
fcvtns w3, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns xzr, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu wzr, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu x0, d0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps wzr, d9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps x12, d20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu w30, d23 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu x29, d3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms w2, d3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms x4, d5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu w6, d7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu x8, d9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs w10, d11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs x12, d13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu w14, d15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu x15, d16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d17, w18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d19, x20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d21, w22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d23, x24 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas w25, d26 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas x27, d28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau w29, d30 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau xzr, d0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov h6, w5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov h16, x27 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov w15, h31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov w3, s9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov s9, w3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov x21, h14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov x20, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov d1, x15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov x3, v12.d[1] +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov v1.d[1], x19 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov h29, #0.50000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov s2, #0.12500000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov s3, #1.00000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov d30, #16.00000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov s4, #1.06250000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov d10, #1.93750000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov s12, #-1.00000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov d16, #8.50000000 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w3, #0 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x29, #4 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw xzr, #-4 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr s0, #8 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr d0, #1048572 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q0, #-1048576 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - prfm pldl1strm, #0 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - prfm #25, #0 +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stxrb w18, w8, [sp] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stxrh w24, w15, [x16] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stxr w5, w6, [x17] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stxr w1, x10, [x21] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldxrb w30, [x0] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldxrh w17, [x4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldxr w22, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldxr x11, [x29] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stxp w12, w11, w10, [sp] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stxp wzr, x27, x9, [x12] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldxp w0, wzr, [sp] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldxp x17, x0, [x18] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stlxrb w12, w22, [x0] +# CHECK-NEXT: - - - - 0.50 
1.50 - - - - - - - stlxrh w10, w1, [x1] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stlxr w9, w2, [x2] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stlxr w9, x3, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldaxrb w8, [x4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldaxrh w7, [x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldaxr w6, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldaxr x5, [x6] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stlxp w4, w5, w6, [sp] +# CHECK-NEXT: - - - - 0.50 1.50 - - - - - - - stlxp wzr, x6, x7, [x1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldaxp w5, w18, [sp] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldaxp x6, x19, [x22] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlrb w24, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlrh w25, [x30] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlr w26, [x29] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlr x27, [x28] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldarb w16, [x21] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldarb w23, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldarh w22, [x30] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldar wzr, [x29] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldar x21, [x28] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - sturb w9, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - sturh wzr, [x12, #255] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur w16, [x0, #-256] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur x28, [x14, #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldurb w1, [x20, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldurh w20, [x1, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur w12, [sp, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur xzr, [x12, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldursb x9, [x7, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldursh x17, [x19, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldursw x20, [x15, 
#-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - prfum pldl2keep, [sp, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldursb w19, [x1, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldursh w15, [x21, #-256] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur b0, [sp, #1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur h12, [x12, #-1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur s15, [x0, #255] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur d31, [x5, #25] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stur q9, [x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur b3, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur h5, [x4, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur s7, [x12, #-1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur d11, [x19, #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldur q13, [x1, #2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w9, [x2], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w10, [x3], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w10, [x3], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w9, [x2], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w9, [x2], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w10, [x3], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w19, [sp], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w20, [x30], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w21, [x12], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str xzr, [x9], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x2, [x3], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x19, [x12], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w9, [x2], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w10, [x3], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w10, [x3], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w9, [x2], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w9, [x2], #1 +# CHECK-NEXT: - - - - 0.50 
0.50 - - - - - - - ldrh w10, [x3], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr w19, [sp], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr w20, [x30], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr w21, [x12], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr xzr, [x9], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr x2, [x3], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr x19, [x12], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb xzr, [x9], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb x2, [x3], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb x19, [x12], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh xzr, [x9], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x2, [x3], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x19, [x12], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw xzr, [x9], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x2, [x3], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x19, [x12], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb wzr, [x9], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w2, [x3], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w19, [x12], #-256 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh wzr, [x9], #255 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w2, [x3], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w19, [x12], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b0, [x0], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b3, [x3], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b5, [sp], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h10, [x10], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h13, [x23], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h15, [sp], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s20, [x20], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s23, [x23], #1 +# CHECK-NEXT: - 
- - - - 1.00 - - - - - - - str s25, [x0], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d20, [x20], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d23, [x23], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d25, [x0], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr b0, [x0], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr b3, [x3], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr b5, [sp], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr h10, [x10], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr h13, [x23], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr h15, [sp], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr s20, [x20], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr s23, [x23], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr s25, [x0], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr d20, [x20], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr d23, [x23], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr d25, [x0], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr q20, [x1], #255 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr q23, [x9], #1 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr q25, [x20], #-256 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q10, [x1], #255 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q22, [sp], #1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q21, [x20], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr x3, [x4, #0]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w9, [x2, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w10, [x3, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w10, [x3, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w9, [x2, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w9, [x2, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w10, [x3, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w19, [sp, #255]! 
+# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w20, [x30, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w21, [x12, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str xzr, [x9, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x2, [x3, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x19, [x12, #-256]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w9, [x2, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w10, [x3, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w10, [x3, #-256]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w9, [x2, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w9, [x2, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w10, [x3, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr w19, [sp, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr w20, [x30, #1]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr w21, [x12, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr xzr, [x9, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr x2, [x3, #1]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr x19, [x12, #-256]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb xzr, [x9, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb x2, [x3, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb x19, [x12, #-256]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh xzr, [x9, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x2, [x3, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x19, [x12, #-256]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw xzr, [x9, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x2, [x3, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x19, [x12, #-256]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb wzr, [x9, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w2, [x3, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w19, [x12, #-256]! 
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh wzr, [x9, #255]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w2, [x3, #1]! +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w19, [x12, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b0, [x0, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b3, [x3, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b5, [sp, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h10, [x10, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h13, [x23, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h15, [sp, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s20, [x20, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s23, [x23, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s25, [x0, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d20, [x20, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d23, [x23, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d25, [x0, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr b0, [x0, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr b3, [x3, #1]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr b5, [sp, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr h10, [x10, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr h13, [x23, #1]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr h15, [sp, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr s20, [x20, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr s23, [x23, #1]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr s25, [x0, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr d20, [x20, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr d23, [x23, #1]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr d25, [x0, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr q20, [x1, #255]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr q23, [x9, #1]! 
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr q25, [x20, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q10, [x1, #255]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q22, [sp, #1]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q21, [x20, #-256]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - sttrb w9, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - sttrh wzr, [x12, #255] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - sttr w16, [x0, #-256] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - sttr x28, [x14, #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrb w1, [x20, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrh w20, [x1, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtr w12, [sp, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtr xzr, [x12, #255] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrsb x9, [x7, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrsh x17, [x19, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrsw x20, [x15, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrsb w19, [x1, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldtrsh w15, [x21, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x4, [x29] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x30, [x12, #32760] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x20, [sp, #8] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr xzr, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w2, [sp] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w17, [sp, #16380] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w13, [x2, #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x2, [x5, #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x23, [sp, #16380] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x21, [x25, x7] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w2, [x4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w23, [x6, #8190] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh wzr, 
[sp, #2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x29, [x2, #2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x25, [x8, w13, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w26, [x3, #121] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w12, [x2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w27, [sp, #4095] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb xzr, [x15] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb x12, [x28, x27] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x30, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w20, [x4, #16380] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b5, [x11] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h23, [x15] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s25, [x19] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d15, [x2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w17, [sp, #8190] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w23, [x3, #4095] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb wzr, [x2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr b31, [sp, #4095] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h20, [x2, #8190] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr s10, [x19, #16380] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr d3, [x10, #32760] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q12, [sp, #65520] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q14, [x6, #4624] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w3, [sp, x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w9, [x27, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w10, [x30, x7] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strb w5, [x26, w7, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb w15, [x25, w7, uxtw] 
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrb w17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsb x18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w3, [sp, x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w9, [x27, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w10, [x30, x7, lsl #1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh x13, [x27, x5, sxtx #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsh w16, [x24, w8, uxtw #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrh w18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - strh w19, [x21, wzr, sxtw #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr b25, [x21, w8, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr b8, [x30, x10] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b14, [x13, x25] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str b30, [x16, w26, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h3, [sp, x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h9, [x27, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h10, [x30, x7, lsl #1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h13, [x27, x5, sxtx #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h16, [x24, w8, uxtw #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h17, [x23, w9, sxtw] 
+# CHECK-NEXT: - - - - - 1.00 - - - - - - - str h18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr h19, [x21, wzr, sxtw #1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr s12, [x30, w5, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr d24, [x26, w7, uxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s20, [x24, w10, uxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d5, [x26, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w3, [sp, x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr s9, [x27, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w10, [x30, x7, lsl #2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str s12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w13, [x27, x5, sxtx #2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str w14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w16, [x24, w8, uxtw #2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr w18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldrsw x19, [x21, wzr, sxtw #2] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x3, [sp, x5] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x9, [x27, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr d10, [x30, x7, lsl #3] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x13, [x27, x5, sxtx #3] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - prfm pldl1keep, [x26, w6, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str x27, [x26, w24, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x16, [x24, w8, uxtw #3] +# 
CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr x18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str d19, [x21, wzr, sxtw #3] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q3, [sp, x5] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q9, [x27, x6] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q10, [x30, x7, lsl #4] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q11, [x29, x3, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q12, [x28, xzr, sxtx] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q13, [x27, x5, sxtx #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q14, [x26, w6, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q15, [x25, w7, uxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q16, [x24, w8, uxtw #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q17, [x23, w9, sxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str q18, [x22, w10, sxtw] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldr q19, [x21, wzr, sxtw #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldp w3, w5, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp wzr, w9, [sp, #252] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldp w2, wzr, [sp, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldp w9, w10, [sp, #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldpsw x9, x10, [sp, #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldpsw x9, x10, [x2, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldpsw x20, x30, [sp, #252] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldp x21, x29, [x2, #504] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldp x22, x23, [x3, #-512] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldp x24, x25, [x4, #8] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp s29, s28, [sp, #252] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp s27, s26, [sp, #-256] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp s1, s2, [x3, #44] +# CHECK-NEXT: 
- - - - - 1.00 - - - - - - - stp d3, d5, [x9, #504] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp d7, d11, [x10, #-512] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnp x20, x16, [x8] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp x3, x6, [x16] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp d2, d3, [x30, #-8] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stp q3, q5, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stp q17, q19, [sp, #1008] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp q23, q29, [x1, #-1024] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp w3, w5, [sp], #0 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp wzr, w9, [sp], #252 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp w2, wzr, [sp], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp w9, w10, [sp], #4 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldpsw x9, x10, [sp], #4 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldpsw x9, x10, [x2], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldpsw x20, x30, [sp], #252 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp x21, x29, [x2], #504 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp x22, x23, [x3], #-512 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp x24, x25, [x4], #8 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp s29, s28, [sp], #252 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp s27, s26, [sp], #-256 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp s1, s2, [x3], #44 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp d3, d5, [x9], #504 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp d7, d11, [x10], #-512 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp d2, d3, [x30], #-8 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stp q3, q5, [sp], #0 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stp q17, q19, [sp], #1008 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp q23, q29, [x1], #-1024 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp w3, w5, [sp, #0]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp wzr, w9, [sp, #252]! 
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp w2, wzr, [sp, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp w9, w10, [sp, #4]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldpsw x9, x10, [sp, #4]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldpsw x9, x10, [x2, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldpsw x20, x30, [sp, #252]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp x21, x29, [x2, #504]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp x22, x23, [x3, #-512]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp x24, x25, [x4, #8]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp s29, s28, [sp, #252]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp s27, s26, [sp, #-256]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp s1, s2, [x3, #44]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp d3, d5, [x9, #504]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stp d7, d11, [x10, #-512]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp d2, d3, [x30, #-8]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stp q3, q5, [sp, #0]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stp q17, q19, [sp, #1008]! +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldp q23, q29, [x1, #-1024]! 
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldnp w3, w5, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnp wzr, w9, [sp, #252] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldnp w2, wzr, [sp, #-256] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldnp w9, w10, [sp, #4] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldnp x21, x29, [x2, #504] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldnp x22, x23, [x3, #-512] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldnp x24, x25, [x4, #8] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnp s29, s28, [sp, #252] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnp s27, s26, [sp, #-256] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnp s1, s2, [x3, #44] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnp d3, d5, [x9, #504] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnp d7, d11, [x10, #-512] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnp d2, d3, [x30, #-8] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stnp q3, q5, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stnp q17, q19, [sp, #1008] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnp q23, q29, [x1, #-1024] +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and wsp, w16, #0xe00 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and x2, x22, #0x1e00 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ands w14, w8, #0x70 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ands x4, x10, #0x60 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - eor wsp, w4, #0xe00 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - eor x27, x25, #0x1e00 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov w3, #983055 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov x10, #-6148914691236517206 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w12, w23, w21 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w16, w15, w1, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w9, w4, w10, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w3, w30, w11 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and 
x3, x5, x7, lsl #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and x5, x14, x19, asr #4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w3, w17, w19, ror #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w0, w2, wzr, lsr #17 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w3, w30, w11, asr #2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and xzr, x4, x26 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and w3, wzr, w20, ror #2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - and x7, x20, xzr, asr #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bic x13, x20, x14, lsl #47 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bic w2, w7, w9 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - eon w29, w4, w19 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - eon x19, x12, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - eor w8, w27, w2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - eor x22, x16, x6 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - orr w2, w7, w0, asr #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - orr x8, x9, x10, lsl #12 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - orn x3, x5, x7, asr #2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - orn w2, w5, w29 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ands w7, wzr, w9, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - ands x3, x5, x20, ror #63 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bics w3, w5, w7 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - bics x3, xzr, x3, lsl #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - tst w3, w7, lsl #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - tst x2, x20, asr #2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov x3, x6 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov x3, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov wzr, w2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov w3, w5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movz w2, #0, lsl #16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov w2, #-1235 +# CHECK-NEXT: 0.50 0.50 - - - - - - 
- - - - - mov x2, #5299989643264 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - mov x2, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movk w3, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movz x4, #0, lsl #16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movk w5, #0, lsl #16 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movz x6, #0, lsl #32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movk x7, #0, lsl #32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movz x8, #0, lsl #48 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - movk x9, #0, lsl #48 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - msr DAIFSet, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adr x2, #1600 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adrp x21, #6553600 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - adr x0, #262144 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - tbz x12, #62, #0 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - tbz x12, #62, #4 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - tbz x12, #62, #-32768 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - tbz w17, #16, test +# CHECK-NEXT: - - 1.00 - - - - - - - - - - tbnz x12, #60, #32764 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - tbnz w3, #28, test +# CHECK-NEXT: - - 1.00 - - - - - - - - - - b #4 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - b #-4 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - b #134217724 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - bl test +# CHECK-NEXT: - - 1.00 - - - - - - - - - - br x20 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - blr xzr +# CHECK-NEXT: - - 1.00 - - - - - - - - - - ret x10 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - ret +# CHECK-NEXT: - - 1.00 - - - - - - - - - - eret +# CHECK-NEXT: - - 1.00 - - - - - - - - - - drps diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-bf16-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-bf16-instructions.s new file mode 100644 index 0000000000000..97c5b902dc519 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-bf16-instructions.s @@ -0,0 
+1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/bf16-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 0.50 bfcvt h0, s0 +# CHECK-NEXT: 1 4 0.50 bfcvtn v0.4h, v0.4s +# CHECK-NEXT: 1 4 0.50 bfcvtn2 v0.8h, v0.4s +# CHECK-NEXT: 2 10 0.50 bfdot v0.2s, v24.4h, v14.2h[2] +# CHECK-NEXT: 2 10 0.50 bfdot v0.2s, v0.4h, v0.4h +# CHECK-NEXT: 2 10 0.50 bfdot v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 bfmlalb v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 bfmlalb v0.4s, v0.8h, v0.h[3] +# CHECK-NEXT: 1 4 0.50 bfmlalt v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 bfmlalt v0.4s, v0.8h, v0.h[3] +# CHECK-NEXT: 2 14 1.00 bfmmla v0.4s, v0.8h, v0.8h + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - 4.00 4.00 4.50 4.50 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bfcvt h0, s0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bfcvtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bfcvtn2 v0.8h, v0.4s 
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 0.50 0.50 - bfdot v0.2s, v24.4h, v14.2h[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 0.50 0.50 - bfdot v0.2s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 0.50 0.50 - bfdot v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb v0.4s, v0.8h, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt v0.4s, v0.8h, v0.h[3] +# CHECK-NEXT: - - - - - - - - 1.00 1.00 1.00 1.00 - bfmmla v0.4s, v0.8h, v0.8h diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-complxnum-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-complxnum-instructions.s new file mode 100644 index 0000000000000..d05ec5c240c74 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-complxnum-instructions.s @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/complxnum-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 0.50 fcadd v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: 1 4 0.50 fcadd v0.4s, v0.4s, v0.4s, #270 +# CHECK-NEXT: 1 4 0.50 fcmla v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: 1 4 0.50 fcmla v0.4s, v0.4s, v0.s[1], #0 + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - 
C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - 1.00 1.00 1.00 1.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd v0.4s, v0.4s, v0.4s, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla v0.4s, v0.4s, v0.s[1], #0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-crypto-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-crypto-instructions.s new file mode 100644 index 0000000000000..0749c4f016a58 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-crypto-instructions.s @@ -0,0 +1,96 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -mattr=+sve-aes,+sha3,+sm4 -instruction-tables < %p/../Inputs/crypto-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 0.50 aesd z0.b, z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 aese z0.b, z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 aesimc z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 aesmc z0.b, z0.b +# CHECK-NEXT: 1 3 1.00 sha1h s0, s1 +# CHECK-NEXT: 1 3 0.50 sha1su1 v0.4s, v1.4s +# CHECK-NEXT: 1 4 0.50 sha256su0 v0.4s, v1.4s +# CHECK-NEXT: 1 4 0.50 sha1c q0, s1, v2.4s +# CHECK-NEXT: 1 4 0.50 sha1p q0, s1, v2.4s +# CHECK-NEXT: 1 4 0.50 sha1m q0, s1, v2.4s +# CHECK-NEXT: 1 3 
0.50 sha1su0 v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1 4 0.50 sha256h q0, q1, v2.4s +# CHECK-NEXT: 1 4 0.50 sha256h2 q0, q1, v2.4s +# CHECK-NEXT: 1 4 0.50 sha256su1 v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1 9 7.00 sha512h q0, q1, v2.2d +# CHECK-NEXT: 1 9 7.00 sha512h2 q0, q1, v2.2d +# CHECK-NEXT: 1 9 7.00 sha512su0 v11.2d, v12.2d +# CHECK-NEXT: 1 9 7.00 sha512su1 v11.2d, v13.2d, v14.2d +# CHECK-NEXT: 1 3 0.50 eor3 v25.16b, v12.16b, v7.16b, v2.16b +# CHECK-NEXT: 1 3 0.50 rax1 v30.2d, v29.2d, v26.2d +# CHECK-NEXT: 1 4 0.50 xar v26.2d, v21.2d, v27.2d, #63 +# CHECK-NEXT: 1 3 0.50 bcax v31.16b, v26.16b, v2.16b, v1.16b +# CHECK-NEXT: 1 9 7.00 sm3ss1 v20.4s, v23.4s, v21.4s, v22.4s +# CHECK-NEXT: 1 9 7.00 sm3tt1a v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: 1 9 7.00 sm3tt1b v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: 1 9 7.00 sm3tt2a v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: 1 9 7.00 sm3tt2b v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: 1 9 7.00 sm3partw1 v30.4s, v29.4s, v26.4s +# CHECK-NEXT: 1 9 7.00 sm3partw2 v30.4s, v29.4s, v26.4s +# CHECK-NEXT: 1 9 7.00 sm4ekey v11.4s, v11.4s, v19.4s +# CHECK-NEXT: 1 9 7.00 sm4e v2.4s, v15.4s + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - 9.50 9.50 - - 91.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesd z0.b, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - aese z0.b, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesimc z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesmc z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sha1h s0, s1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha1su1 v0.4s, v1.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha256su0 v0.4s, v1.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha1c q0, s1, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha1p q0, s1, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha1m q0, s1, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha1su0 v0.4s, v1.4s, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha256h q0, q1, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha256h2 q0, q1, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sha256su1 v0.4s, v1.4s, v2.4s +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sha512h q0, q1, v2.2d +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sha512h2 q0, q1, v2.2d +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sha512su0 v11.2d, v12.2d +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sha512su1 v11.2d, v13.2d, v14.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor3 v25.16b, v12.16b, v7.16b, v2.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rax1 v30.2d, v29.2d, v26.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar v26.2d, v21.2d, v27.2d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bcax v31.16b, v26.16b, v2.16b, v1.16b +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3ss1 v20.4s, v23.4s, v21.4s, v22.4s +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3tt1a v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3tt1b v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3tt2a v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3tt2b v20.4s, v23.4s, v21.s[3] +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3partw1 v30.4s, v29.4s, v26.4s +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm3partw2 v30.4s, v29.4s, 
v26.4s +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm4ekey v11.4s, v11.4s, v19.4s +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm4e v2.4s, v15.4s diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-flag-manipulation-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-flag-manipulation-instructions.s new file mode 100644 index 0000000000000..050338a9277b9 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-flag-manipulation-instructions.s @@ -0,0 +1,53 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/flag-manipulation-instructions.s | FileCheck %s + +setf8 w1 +setf16 w1 +rmif x0, #0, #0 +cfinv +axflag +xaflag + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 2.00 U setf8 w1 +# CHECK-NEXT: 1 2 2.00 U setf16 w1 +# CHECK-NEXT: 1 1 1.00 U rmif x0, #0, #0 +# CHECK-NEXT: 1 1 2.00 U cfinv +# CHECK-NEXT: 1 1 2.00 U axflag +# CHECK-NEXT: 1 1 2.00 U xaflag + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: 11.00 11.00 - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] 
[11] [12] Instructions: +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - setf8 w1 +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - setf16 w1 +# CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - rmif x0, #0, #0 +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - cfinv +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - axflag +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - xaflag diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-forwarding.s new file mode 100644 index 0000000000000..a6efdf5030e11 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-forwarding.s @@ -0,0 +1,285 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s + +# LLVM-MCA-BEGIN adrp +adrp x0, #6553600 +ldr x0, [x0, #4096] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN simd_mac_mla +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v3.4s, v4.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN simd_mac_dot +udot v0.4s, v1.16b, v2.16b +udot v0.4s, v3.16b, v4.16b +sdot v0.4s, v5.16b, v6.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN simd_mac_long +umlal v0.4s, v1.4h, v2.4h +smlal v0.4s, v3.4h, v4.4h +umlsl v0.4s, v5.4h, v6.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN simd_mac_mixed +mla v0.4s, v1.4s, v2.4s +mls v0.4s, v3.4s, v4.4s +udot v0.4s, v5.16b, v6.16b +umlal v0.4s, v7.4h, v8.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN simd_mac_mul_to_accum +mul v0.4s, v1.4s, v2.4s +mla v0.4s, v3.4s, v4.4s +smull v1.4s, v5.4h, v6.4h +smlal v1.4s, v7.4h, v8.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN simd_mac_size_mismatch +mul v0.4s, v1.4s, v2.4s +mla v0.2s, v3.2s, v4.2s +mla v0.4h, v5.4h, v6.4h +# LLVM-MCA-END + +# CHECK: [0] Code Region - adrp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 200 +# CHECK-NEXT: Total Cycles: 102 +# CHECK-NEXT: Total uOps: 200 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 
1.96 +# CHECK-NEXT: IPC: 1.96 +# CHECK-NEXT: Block RThroughput: 0.7 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123 + +# CHECK: [0,0] DE . adrp x0, #6553600 +# CHECK-NEXT: [0,1] DeE. ldr x0, [x0, #4096] +# CHECK-NEXT: [1,0] .DE. adrp x0, #6553600 +# CHECK-NEXT: [1,1] .DeE ldr x0, [x0, #4096] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 adrp x0, #6553600 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 ldr x0, [x0, #4096] +# CHECK-NEXT: 2 0.0 0.0 0.0 + +# CHECK: [1] Code Region - simd_mac_mla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 200 +# CHECK-NEXT: Total Cycles: 204 +# CHECK-NEXT: Total uOps: 200 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 0.98 +# CHECK-NEXT: IPC: 0.98 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 + +# CHECK: [0,0] DeeeE. . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] .DeeeE . mla v0.4s, v3.4s, v4.4s +# CHECK-NEXT: [1,0] . DeeeE. mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,1] . DeeeE mla v0.4s, v3.4s, v4.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1. 
2 0.0 0.0 0.0 mla v0.4s, v3.4s, v4.4s +# CHECK-NEXT: 2 0.0 0.0 0.0 + +# CHECK: [2] Code Region - simd_mac_dot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 304 +# CHECK-NEXT: Total uOps: 300 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeE. . udot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,1] .DeeeE . udot v0.4s, v3.16b, v4.16b +# CHECK-NEXT: [0,2] . DeeeE . sdot v0.4s, v5.16b, v6.16b +# CHECK-NEXT: [1,0] . DeeeE . udot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,1] . DeeeE. udot v0.4s, v3.16b, v4.16b +# CHECK-NEXT: [1,2] . DeeeE sdot v0.4s, v5.16b, v6.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 udot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 udot v0.4s, v3.16b, v4.16b +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 sdot v0.4s, v5.16b, v6.16b +# CHECK-NEXT: 2 0.0 0.0 0.0 + +# CHECK: [3] Code Region - simd_mac_long + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 304 +# CHECK-NEXT: Total uOps: 300 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeE. . umlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,1] .DeeeE . smlal v0.4s, v3.4h, v4.4h +# CHECK-NEXT: [0,2] . DeeeE . umlsl v0.4s, v5.4h, v6.4h +# CHECK-NEXT: [1,0] . DeeeE . umlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,1] . DeeeE. smlal v0.4s, v3.4h, v4.4h +# CHECK-NEXT: [1,2] . 
DeeeE umlsl v0.4s, v5.4h, v6.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 umlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 smlal v0.4s, v3.4h, v4.4h +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 umlsl v0.4s, v5.4h, v6.4h +# CHECK-NEXT: 2 0.0 0.0 0.0 + +# CHECK: [4] Code Region - simd_mac_mixed + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 404 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeE. .. mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] .DeeeE .. mls v0.4s, v3.4s, v4.4s +# CHECK-NEXT: [0,2] . DeeeE .. udot v0.4s, v5.16b, v6.16b +# CHECK-NEXT: [0,3] . DeeeE .. umlal v0.4s, v7.4h, v8.4h +# CHECK-NEXT: [1,0] . DeeeE .. mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,1] . DeeeE.. mls v0.4s, v3.4s, v4.4s +# CHECK-NEXT: [1,2] . .DeeeE. udot v0.4s, v5.16b, v6.16b +# CHECK-NEXT: [1,3] . . DeeeE umlal v0.4s, v7.4h, v8.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 mls v0.4s, v3.4s, v4.4s +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 udot v0.4s, v5.16b, v6.16b +# CHECK-NEXT: 3. 
2 0.0 0.0 0.0 umlal v0.4s, v7.4h, v8.4h +# CHECK-NEXT: 2 0.0 0.0 0.0 + +# CHECK: [5] Code Region - simd_mac_mul_to_accum + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 601 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.67 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeE. . . mul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] .DeeeE . . mla v0.4s, v3.4s, v4.4s +# CHECK-NEXT: [0,2] .DeeeE . . smull v1.4s, v5.4h, v6.4h +# CHECK-NEXT: [0,3] . DeeeE . . smlal v1.4s, v7.4h, v8.4h +# CHECK-NEXT: [1,0] . .DeeeE . mul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,1] . . DeeeE. mla v0.4s, v3.4s, v4.4s +# CHECK-NEXT: [1,2] . . DeeeE. smull v1.4s, v5.4h, v6.4h +# CHECK-NEXT: [1,3] . . DeeeE smlal v1.4s, v7.4h, v8.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 mul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 mla v0.4s, v3.4s, v4.4s +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 smull v1.4s, v5.4h, v6.4h +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 smlal v1.4s, v7.4h, v8.4h +# CHECK-NEXT: 2 0.0 0.0 0.0 + +# CHECK: [6] Code Region - simd_mac_size_mismatch + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 805 +# CHECK-NEXT: Total uOps: 300 + +# CHECK: Dispatch Width: 3 +# CHECK-NEXT: uOps Per Cycle: 0.37 +# CHECK-NEXT: IPC: 0.37 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeeE. . . . mul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] . DeeeE . . . 
mla v0.2s, v3.2s, v4.2s +# CHECK-NEXT: [0,2] . . DeeeE . . mla v0.4h, v5.4h, v6.4h +# CHECK-NEXT: [1,0] . . DeeeE . . mul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,1] . . . DeeeE . mla v0.2s, v3.2s, v4.2s +# CHECK-NEXT: [1,2] . . . .DeeeE mla v0.4h, v5.4h, v6.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 mul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 mla v0.2s, v3.2s, v4.2s +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 mla v0.4h, v5.4h, v6.4h +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fp16fml-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fp16fml-instructions.s new file mode 100644 index 0000000000000..6e7b51c5f0607 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fp16fml-instructions.s @@ -0,0 +1,67 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/fp16fml-instructions.s | FileCheck %s +-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 0.50 fmlal v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: 1 4 0.50 fmlal v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: 1 4 0.50 fmlal v0.2s, v0.2h, v0.2h +# CHECK-NEXT: 1 4 0.50 fmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fmlal2 v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: 1 4 0.50 fmlal2 v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: 1 4 0.50 fmlal2 v0.2s, v0.2h, v0.2h +# CHECK-NEXT: 1 4 0.50 fmlal2 
v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fmlsl v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: 1 4 0.50 fmlsl v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: 1 4 0.50 fmlsl v0.2s, v0.2h, v0.2h +# CHECK-NEXT: 1 4 0.50 fmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fmlsl2 v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: 1 4 0.50 fmlsl2 v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: 1 4 0.50 fmlsl2 v0.2s, v0.2h, v0.2h +# CHECK-NEXT: 1 4 0.50 fmlsl2 v0.4s, v0.4h, v0.4h + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - - - 8.00 8.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal v0.2s, v0.2h, v0.2h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal2 v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal2 v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal2 v0.2s, v0.2h, v0.2h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlal2 v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 
0.50 - fmlsl v0.2s, v0.2h, v0.2h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl2 v0.2s, v0.2h, v0.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl2 v0.4s, v0.4h, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl2 v0.2s, v0.2h, v0.2h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlsl2 v0.4s, v0.4h, v0.4h diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fptoint-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fptoint-instructions.s new file mode 100644 index 0000000000000..79601ef0fca1d --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-fptoint-instructions.s @@ -0,0 +1,59 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/fptoint-instructions.s | FileCheck %s +-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 0.50 frint32x v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frint32x v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frint32x v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frint32z v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frint32z v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frint32z v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frint64x v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frint64x v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frint64x v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frint64z v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frint64z v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frint64z v0.4s, v0.4s + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - 
C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - 6.00 6.00 - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint32x v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint32x v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint32x v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint32z v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint32z v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint32z v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint64x v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint64x v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint64x v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint64z v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint64z v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frint64z v0.4s, v0.4s diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-i8mm-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-i8mm-instructions.s new file mode 100644 index 0000000000000..b4102a2297e22 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-i8mm-instructions.s @@ -0,0 +1,53 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/i8mm-instructions.s | FileCheck %s +-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: 
[2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 0.50 smmla v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 sudot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 4 0.50 sudot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 4 0.50 ummla v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 usdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 4 0.50 usdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 usdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 usdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 4 0.50 usmmla v0.4s, v0.16b, v0.16b + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - 1.50 1.50 3.00 3.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smmla v0.4s, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sudot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sudot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ummla v0.4s, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - usdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - usdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - usdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: - - - 
- - - - - - - 0.50 0.50 - usdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usmmla v0.4s, v0.16b, v0.16b diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-js-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-js-instructions.s new file mode 100644 index 0000000000000..f6d1b255bd6c3 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-js-instructions.s @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/js-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 fjcvtzs w25, d26 + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fjcvtzs w25, d26 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mops-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mops-instructions.s new file mode 100644 index 0000000000000..f688ca4d57a29 --- /dev/null 
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mops-instructions.s @@ -0,0 +1,323 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/mops-instructions.s | FileCheck %s + + cpyfp [x0]!, [x1]!, x2! + cpyfm [x0]!, [x1]!, x2! + cpyfe [x0]!, [x1]!, x2! + cpyp [x0]!, [x1]!, x2! + cpym [x0]!, [x1]!, x2! + cpye [x0]!, [x1]!, x2! + setp [x0]!, x1!, x2 + setm [x0]!, x1!, x2 + sete [x0]!, x1!, x2 + setgp [x0]!, x1!, x2 + setgm [x0]!, x1!, x2 + setge [x0]!, x1!, x2 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 2.00 * * U cpyfp [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfprn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfprt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfprtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfprtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfprtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfpt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfptwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfptrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * * U cpyfptn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfm [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 1 1 1.00 * * U cpyfmwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmrt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmrtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmrtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmrtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfmtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfe [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfewn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfern [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfen [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfewt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfewtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfewtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfewtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfert [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfertwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfertrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfertn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfet [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfetwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfetrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyfetn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyp [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpypwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyprn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpypn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpypwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpypwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpypwtrn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 1 3 3.00 * * U cpypwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyprt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyprtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyprtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyprtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpypt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyptwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyptrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 3 3.00 * * U cpyptn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpym [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymrt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymrtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymrtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymrtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpymtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpye [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyewn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyern [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyen [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyewt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyewtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyewtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyewtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyert [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyertwn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 1 1 1.00 * * U cpyertrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyertn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyet [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyetwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyetrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 1 1.00 * * U cpyetn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1 2 2.00 * U setp [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setpt [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setpn [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setptn [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setm [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setmt [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setmn [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setmtn [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U sete [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setet [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U seten [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setetn [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setgp [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setgpt [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setgpn [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * U setgptn [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgm [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgmt [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgmn [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgmtn [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setge [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setget [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgen [x0]!, x1!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgetn [x0]!, x1!, x2 +# CHECK-NEXT: 1 2 2.00 * * U cpyfp [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1 1 1.00 * * U cpyfm [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1 1 1.00 * * U cpyfe [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1 3 3.00 * * U cpyp [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1 1 1.00 * * U cpym [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1 1 1.00 * * U cpye [x0]!, [x1]!, xzr! 
+# CHECK-NEXT: 1 2 2.00 * U setp [x0]!, xzr!, x2 +# CHECK-NEXT: 1 2 2.00 * U setp [x0]!, x1!, xzr +# CHECK-NEXT: 1 1 1.00 * U setm [x0]!, xzr!, x2 +# CHECK-NEXT: 1 1 1.00 * U setm [x0]!, x1!, xzr +# CHECK-NEXT: 1 1 1.00 * U sete [x0]!, xzr!, x2 +# CHECK-NEXT: 1 1 1.00 * U sete [x0]!, x1!, xzr +# CHECK-NEXT: 1 2 2.00 * U setgp [x0]!, xzr!, x2 +# CHECK-NEXT: 1 2 2.00 * U setgp [x0]!, x1!, xzr +# CHECK-NEXT: 1 1 1.00 * U setgm [x0]!, xzr!, x2 +# CHECK-NEXT: 1 1 1.00 * U setgm [x0]!, x1!, xzr +# CHECK-NEXT: 1 1 1.00 * U setge [x0]!, xzr!, x2 +# CHECK-NEXT: 1 1 1.00 * U setge [x0]!, x1!, xzr + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: 100.50 100.50 - - - 201.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfp [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfprn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpwtn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfprt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfprtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfprtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfprtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfpt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfptwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfptrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfptn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfm [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmrt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmrtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmrtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmrtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfmtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfe [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfewn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfern [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfen [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfewt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfewtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfewtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfewtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfert [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfertwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfertrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfertn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfet [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfetwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfetrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfetn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyp [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyprn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyprt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyprtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyprtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyprtn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpypt [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyptwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyptrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyptn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpym [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymwt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymwtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymwtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymwtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymrt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymrtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymrtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymrtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymtwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpymtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpye [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyewn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyern [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyen [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyewt [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyewtwn [x0]!, [x1]!, x2! 
+# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyewtrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyewtn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyert [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyertwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyertrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyertn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyet [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyetwn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyetrn [x0]!, [x1]!, x2! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyetn [x0]!, [x1]!, x2! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setp [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setpt [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setpn [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setptn [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setm [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setmt [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setmn [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setmtn [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - sete [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setet [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - seten [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setetn [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setgp [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setgpt [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setgpn [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setgptn [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgm [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - 
- 1.00 - - - - - - - setgmt [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgmn [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgmtn [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setge [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setget [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgen [x0]!, x1!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgetn [x0]!, x1!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - cpyfp [x0]!, [x1]!, xzr! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfm [x0]!, [x1]!, xzr! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpyfe [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1.50 1.50 - - - 3.00 - - - - - - - cpyp [x0]!, [x1]!, xzr! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpym [x0]!, [x1]!, xzr! +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - cpye [x0]!, [x1]!, xzr! +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setp [x0]!, xzr!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setp [x0]!, x1!, xzr +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setm [x0]!, xzr!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setm [x0]!, x1!, xzr +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - sete [x0]!, xzr!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - sete [x0]!, x1!, xzr +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setgp [x0]!, xzr!, x2 +# CHECK-NEXT: 1.00 1.00 - - - 2.00 - - - - - - - setgp [x0]!, x1!, xzr +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgm [x0]!, xzr!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setgm [x0]!, x1!, xzr +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setge [x0]!, xzr!, x2 +# CHECK-NEXT: 0.50 0.50 - - - 1.00 - - - - - - - setge [x0]!, x1!, xzr diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mte-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mte-instructions.s new file mode 100644 index 0000000000000..31b3ca323215b --- /dev/null 
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-mte-instructions.s @@ -0,0 +1,244 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/mte-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 3.00 U irg x0, x1 +# CHECK-NEXT: 1 4 3.00 U irg sp, x1 +# CHECK-NEXT: 1 4 3.00 U irg x0, sp +# CHECK-NEXT: 1 4 3.00 U irg x0, x1, x2 +# CHECK-NEXT: 1 4 3.00 U irg sp, x1, x2 +# CHECK-NEXT: 1 2 0.50 addg x0, x1, #0, #1 +# CHECK-NEXT: 1 2 0.50 addg sp, x2, #32, #3 +# CHECK-NEXT: 1 2 0.50 addg x0, sp, #64, #5 +# CHECK-NEXT: 1 2 0.50 addg x3, x4, #1008, #6 +# CHECK-NEXT: 1 2 0.50 addg x5, x6, #112, #15 +# CHECK-NEXT: 1 2 0.50 U subg x0, x1, #0, #1 +# CHECK-NEXT: 1 2 0.50 U subg sp, x2, #32, #3 +# CHECK-NEXT: 1 2 0.50 U subg x0, sp, #64, #5 +# CHECK-NEXT: 1 2 0.50 U subg x3, x4, #1008, #6 +# CHECK-NEXT: 1 2 0.50 U subg x5, x6, #112, #15 +# CHECK-NEXT: 1 2 0.50 gmi x0, x1, x2 +# CHECK-NEXT: 1 2 0.50 gmi x3, sp, x4 +# CHECK-NEXT: 1 2 0.50 gmi xzr, x0, x30 +# CHECK-NEXT: 1 2 0.50 gmi x30, x0, xzr +# CHECK-NEXT: 1 2 0.50 subp x0, x1, x2 +# CHECK-NEXT: 1 2 0.50 U subps x0, x1, x2 +# CHECK-NEXT: 1 2 0.50 subp x0, sp, sp +# CHECK-NEXT: 1 2 0.50 U subps x0, sp, sp +# CHECK-NEXT: 1 2 0.50 U subps xzr, x0, x1 +# CHECK-NEXT: 1 2 0.50 U subps xzr, sp, sp +# CHECK-NEXT: 1 1 1.00 * stg x0, [x1, #-4096] +# CHECK-NEXT: 1 1 1.00 * stg x1, [x2, #4080] +# CHECK-NEXT: 1 1 1.00 * stg x2, [sp, #16] +# CHECK-NEXT: 1 1 1.00 * stg x3, [x1] +# CHECK-NEXT: 1 1 1.00 * stg sp, [x1] +# CHECK-NEXT: 1 1 1.00 * stzg x0, [x1, #-4096] +# CHECK-NEXT: 1 1 1.00 * stzg x1, [x2, #4080] +# CHECK-NEXT: 1 1 1.00 * stzg x2, [sp, #16] +# CHECK-NEXT: 1 1 1.00 
* stzg x3, [x1] +# CHECK-NEXT: 1 1 1.00 * stzg sp, [x1] +# CHECK-NEXT: 1 1 1.00 * U stg x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 1.00 * U stg x1, [x2, #4080]! +# CHECK-NEXT: 1 1 1.00 * U stg x2, [sp, #16]! +# CHECK-NEXT: 1 1 1.00 * U stg sp, [sp, #16]! +# CHECK-NEXT: 1 1 1.00 * U stzg x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 1.00 * U stzg x1, [x2, #4080]! +# CHECK-NEXT: 1 1 1.00 * U stzg x2, [sp, #16]! +# CHECK-NEXT: 1 1 1.00 * U stzg sp, [sp, #16]! +# CHECK-NEXT: 1 1 1.00 * U stg x0, [x1], #-4096 +# CHECK-NEXT: 1 1 1.00 * U stg x1, [x2], #4080 +# CHECK-NEXT: 1 1 1.00 * U stg x2, [sp], #16 +# CHECK-NEXT: 1 1 1.00 * U stg sp, [sp], #16 +# CHECK-NEXT: 1 1 1.00 * U stzg x0, [x1], #-4096 +# CHECK-NEXT: 1 1 1.00 * U stzg x1, [x2], #4080 +# CHECK-NEXT: 1 1 1.00 * U stzg x2, [sp], #16 +# CHECK-NEXT: 1 1 1.00 * U stzg sp, [sp], #16 +# CHECK-NEXT: 1 1 2.00 * st2g x0, [x1, #-4096] +# CHECK-NEXT: 1 1 2.00 * st2g x1, [x2, #4080] +# CHECK-NEXT: 1 1 2.00 * st2g x2, [sp, #16] +# CHECK-NEXT: 1 1 2.00 * st2g x3, [x1] +# CHECK-NEXT: 1 1 2.00 * st2g sp, [x1] +# CHECK-NEXT: 1 1 2.00 * stz2g x0, [x1, #-4096] +# CHECK-NEXT: 1 1 2.00 * stz2g x1, [x2, #4080] +# CHECK-NEXT: 1 1 2.00 * stz2g x2, [sp, #16] +# CHECK-NEXT: 1 1 2.00 * stz2g x3, [x1] +# CHECK-NEXT: 1 1 2.00 * stz2g sp, [x1] +# CHECK-NEXT: 1 1 2.00 * U st2g x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 2.00 * U st2g x1, [x2, #4080]! +# CHECK-NEXT: 1 1 2.00 * U st2g x2, [sp, #16]! +# CHECK-NEXT: 1 1 2.00 * U st2g sp, [sp, #16]! +# CHECK-NEXT: 1 1 2.00 * U stz2g x0, [x1, #-4096]! +# CHECK-NEXT: 1 1 2.00 * U stz2g x1, [x2, #4080]! +# CHECK-NEXT: 1 1 2.00 * U stz2g x2, [sp, #16]! +# CHECK-NEXT: 1 1 2.00 * U stz2g sp, [sp, #16]! 
+# CHECK-NEXT: 1 1 2.00 * U st2g x0, [x1], #-4096 +# CHECK-NEXT: 1 1 2.00 * U st2g x1, [x2], #4080 +# CHECK-NEXT: 1 1 2.00 * U st2g x2, [sp], #16 +# CHECK-NEXT: 1 1 2.00 * U st2g sp, [sp], #16 +# CHECK-NEXT: 1 1 2.00 * U stz2g x0, [x1], #-4096 +# CHECK-NEXT: 1 1 2.00 * U stz2g x1, [x2], #4080 +# CHECK-NEXT: 1 1 2.00 * U stz2g x2, [sp], #16 +# CHECK-NEXT: 1 1 2.00 * U stz2g sp, [sp], #16 +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [x2, #-1024] +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [x2, #1008] +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [sp, #16] +# CHECK-NEXT: 1 1 1.00 * stgp xzr, x1, [x2, #16] +# CHECK-NEXT: 1 1 1.00 * stgp x0, xzr, [x2, #16] +# CHECK-NEXT: 1 1 1.00 * stgp x0, xzr, [x2] +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [x2, #-1024]! +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [x2, #1008]! +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [sp, #16]! +# CHECK-NEXT: 1 1 1.00 * stgp xzr, x1, [x2, #16]! +# CHECK-NEXT: 1 1 1.00 * stgp x0, xzr, [x2, #16]! +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [x2], #-1024 +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [x2], #1008 +# CHECK-NEXT: 1 1 1.00 * stgp x0, x1, [sp], #16 +# CHECK-NEXT: 1 1 1.00 * stgp xzr, x1, [x2], #16 +# CHECK-NEXT: 1 1 1.00 * stgp x0, xzr, [x2], #16 +# CHECK-NEXT: 1 2 0.50 * ldg x0, [x1] +# CHECK-NEXT: 1 2 0.50 * ldg x2, [sp, #-4096] +# CHECK-NEXT: 1 2 0.50 * ldg x3, [x4, #4080] +# CHECK-NEXT: 1 2 4.00 * U ldgm x0, [x1] +# CHECK-NEXT: 1 2 4.00 * U ldgm x1, [sp] +# CHECK-NEXT: 1 2 4.00 * U ldgm xzr, [x2] +# CHECK-NEXT: 1 1 1.00 * U stgm x0, [x1] +# CHECK-NEXT: 1 1 1.00 * U stgm x1, [sp] +# CHECK-NEXT: 1 1 1.00 * U stgm xzr, [x2] +# CHECK-NEXT: 1 1 1.00 * U stzgm x0, [x1] +# CHECK-NEXT: 1 1 1.00 * U stzgm x1, [sp] +# CHECK-NEXT: 1 1 1.00 * U stzgm xzr, [x2] + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] 
- C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: 25.00 25.00 - - 13.50 113.50 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: 3.00 3.00 - - - - - - - - - - - irg x0, x1 +# CHECK-NEXT: 3.00 3.00 - - - - - - - - - - - irg sp, x1 +# CHECK-NEXT: 3.00 3.00 - - - - - - - - - - - irg x0, sp +# CHECK-NEXT: 3.00 3.00 - - - - - - - - - - - irg x0, x1, x2 +# CHECK-NEXT: 3.00 3.00 - - - - - - - - - - - irg sp, x1, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addg x0, x1, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addg sp, x2, #32, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addg x0, sp, #64, #5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addg x3, x4, #1008, #6 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addg x5, x6, #112, #15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subg x0, x1, #0, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subg sp, x2, #32, #3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subg x0, sp, #64, #5 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subg x3, x4, #1008, #6 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subg x5, x6, #112, #15 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - gmi x0, x1, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - gmi x3, sp, x4 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - gmi xzr, x0, x30 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - gmi x30, x0, xzr +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subp x0, x1, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subps x0, x1, x2 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subp x0, sp, sp +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subps x0, sp, sp +# 
CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subps xzr, x0, x1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - subps xzr, sp, sp +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x0, [x1, #-4096] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x1, [x2, #4080] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x2, [sp, #16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x3, [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg sp, [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x0, [x1, #-4096] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x1, [x2, #4080] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x2, [sp, #16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x3, [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg sp, [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x1, [x2, #4080]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x2, [sp, #16]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg sp, [sp, #16]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x1, [x2, #4080]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x2, [sp, #16]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg sp, [sp, #16]! 
+# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x0, [x1], #-4096 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x1, [x2], #4080 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg x2, [sp], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stg sp, [sp], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x0, [x1], #-4096 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x1, [x2], #4080 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg x2, [sp], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzg sp, [sp], #16 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x0, [x1, #-4096] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x1, [x2, #4080] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x2, [sp, #16] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x3, [x1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g sp, [x1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x0, [x1, #-4096] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x1, [x2, #4080] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x2, [sp, #16] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x3, [x1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g sp, [x1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x1, [x2, #4080]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x2, [sp, #16]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g sp, [sp, #16]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x1, [x2, #4080]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x2, [sp, #16]! +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g sp, [sp, #16]! 
+# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x0, [x1], #-4096 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x1, [x2], #4080 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g x2, [sp], #16 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2g sp, [sp], #16 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x0, [x1], #-4096 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x1, [x2], #4080 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g x2, [sp], #16 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - stz2g sp, [sp], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [x2, #-1024] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [x2, #1008] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [sp, #16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp xzr, x1, [x2, #16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, xzr, [x2, #16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, xzr, [x2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [x2, #-1024]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [x2, #1008]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [sp, #16]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp xzr, x1, [x2, #16]! +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, xzr, [x2, #16]! 
+# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [x2], #-1024 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [x2], #1008 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, x1, [sp], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp xzr, x1, [x2], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgp x0, xzr, [x2], #16 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldg x0, [x1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldg x2, [sp, #-4096] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldg x3, [x4, #4080] +# CHECK-NEXT: - - - - 4.00 4.00 - - - - - - - ldgm x0, [x1] +# CHECK-NEXT: - - - - 4.00 4.00 - - - - - - - ldgm x1, [sp] +# CHECK-NEXT: - - - - 4.00 4.00 - - - - - - - ldgm xzr, [x2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgm x0, [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgm x1, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stgm xzr, [x2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzgm x0, [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzgm x1, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stzgm xzr, [x2] diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-neon-instructions.s new file mode 100644 index 0000000000000..454fbed5f0aae --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-neon-instructions.s @@ -0,0 +1,3152 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -mattr=+aes -instruction-tables < %p/../Inputs/neon-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 0.50 abs d29, d24 +# CHECK-NEXT: 1 3 0.50 abs v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 abs v0.2d, v0.2d +# CHECK-NEXT: 1 3 
0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 abs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 abs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 add d17, d31, d29 +# CHECK-NEXT: 1 3 0.50 add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 addp v7.2s, v1.2s, v2.2s +# CHECK-NEXT: 1 3 0.50 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 addp d1, v14.2d +# CHECK-NEXT: 1 4 1.00 addv s0, v0.4s +# CHECK-NEXT: 1 4 1.00 addv h0, v0.4h +# CHECK-NEXT: 1 3 1.00 addv h0, v0.8h +# CHECK-NEXT: 1 3 1.00 addv b0, v0.8b +# CHECK-NEXT: 1 3 1.00 addv b0, v0.16b +# CHECK-NEXT: 1 3 0.50 aesd v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 aese v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 aesimc v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 aesmc v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 bic v0.4h, #15, lsl #8 +# CHECK-NEXT: 1 3 0.50 bic v23.8h, #101 +# CHECK-NEXT: 1 3 0.50 bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 bic v25.16b, v10.16b, v9.16b +# CHECK-NEXT: 1 3 0.50 bic v24.2s, #70 +# CHECK-NEXT: 1 3 0.50 bit v5.8b, v12.8b, v22.8b +# CHECK-NEXT: 1 3 0.50 bif v0.8b, v25.8b, v4.8b +# CHECK-NEXT: 1 3 0.50 bif v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 bit v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 bsl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 bsl v27.16b, v13.16b, v21.16b +# CHECK-NEXT: 1 3 0.50 cls v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 cls v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 cls v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 cls v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 cls v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 cls v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 clz 
v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 clz v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 clz v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 clz v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 clz v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 clz v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 cmeq v9.8h, v16.8h, v24.8h +# CHECK-NEXT: 1 3 0.50 cmeq v14.4h, v18.4h, #0 +# CHECK-NEXT: 1 3 0.50 cmeq d20, d21, #0 +# CHECK-NEXT: 1 3 0.50 cmeq d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: 1 3 0.50 cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 cmge v22.8h, v16.8h, v3.8h +# CHECK-NEXT: 1 3 0.50 cmge v22.16b, v30.16b, #0 +# CHECK-NEXT: 1 3 0.50 cmge d20, d21, #0 +# CHECK-NEXT: 1 3 0.50 cmge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: 1 3 0.50 cmgt v3.2d, v29.2d, v11.2d +# CHECK-NEXT: 1 3 0.50 cmgt d20, d21, #0 +# CHECK-NEXT: 1 3 0.50 cmgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmgt v0.2s, v0.2s, #0 +# CHECK-NEXT: 1 3 0.50 cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 cmhi v28.4h, v25.4h, v21.4h +# CHECK-NEXT: 1 3 0.50 cmhi d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmhi v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 cmhs d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 cmle v21.2s, v19.2s, #0 +# CHECK-NEXT: 1 3 0.50 cmle d20, d21, #0 +# CHECK-NEXT: 1 3 0.50 cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: 1 3 0.50 cmlt v26.4h, v12.4h, #0 +# CHECK-NEXT: 1 3 0.50 cmlt d20, d21, #0 +# CHECK-NEXT: 1 3 0.50 cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: 1 3 0.50 cmtst d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 cmtst v13.2d, v13.2d, v13.2d +# CHECK-NEXT: 1 3 0.50 cnt v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 cnt v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 dup v0.16b, w28 +# CHECK-NEXT: 1 3 0.50 dup v0.2d, x28 +# CHECK-NEXT: 1 3 0.50 dup v0.2s, w28 +# CHECK-NEXT: 1 3 0.50 dup v0.4h, w28 +# CHECK-NEXT: 1 3 0.50 dup v0.4s, w28 +# CHECK-NEXT: 1 3 0.50 dup v0.8b, w28 +# 
CHECK-NEXT: 1 3 0.50 dup v0.8h, w28 +# CHECK-NEXT: 1 3 0.50 mov b0, v0.b[1] +# CHECK-NEXT: 1 3 0.50 mov d0, v0.d[1] +# CHECK-NEXT: 1 3 0.50 mov h0, v0.h[1] +# CHECK-NEXT: 1 3 0.50 mov s0, v0.s[1] +# CHECK-NEXT: 1 3 0.50 dup v0.16b, v0.b[1] +# CHECK-NEXT: 1 3 0.50 dup v0.2d, v0.d[1] +# CHECK-NEXT: 1 3 0.50 dup v0.2s, v0.s[1] +# CHECK-NEXT: 1 3 0.50 dup v0.4h, v0.h[1] +# CHECK-NEXT: 1 3 0.50 dup v0.4s, v0.s[1] +# CHECK-NEXT: 1 3 0.50 dup v0.8b, v0.b[1] +# CHECK-NEXT: 1 3 0.50 dup v0.8h, v0.h[1] +# CHECK-NEXT: 1 3 0.50 eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 ext v0.16b, v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ext v0.8b, v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 4 0.50 fabd d29, d24, d20 +# CHECK-NEXT: 1 4 0.50 fabd s29, s24, s20 +# CHECK-NEXT: 1 4 0.50 fabd h27, h20, h17 +# CHECK-NEXT: 1 4 0.50 fabd v13.8h, v28.8h, v12.8h +# CHECK-NEXT: 1 4 0.50 fabd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fabs h25, h7 +# CHECK-NEXT: 1 4 0.50 fabs v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fabs v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fabs v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fabs v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fabs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 facge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 facge s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 facge h24, h26, h29 +# CHECK-NEXT: 1 3 0.50 facge v25.4h, v16.4h, v11.4h +# CHECK-NEXT: 1 3 0.50 facge v19.2s, v24.2s, v5.2s +# CHECK-NEXT: 1 3 0.50 facge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 facgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 facgt s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 facgt h0, h4, h10 +# CHECK-NEXT: 1 3 0.50 facgt v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 facgt v22.8h, v14.8h, v31.8h +# CHECK-NEXT: 1 3 0.50 facgt v22.4s, v8.4s, v2.4s +# CHECK-NEXT: 1 4 0.50 fadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 faddp h10, v19.2h +# CHECK-NEXT: 1 4 0.50 faddp d11, v28.2d +# CHECK-NEXT: 1 4 0.50 faddp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 faddp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 faddp v16.2d, v11.2d, v5.2d +# 
CHECK-NEXT: 1 3 0.50 fcmeq h30, h6, h1 +# CHECK-NEXT: 1 3 0.50 fcmeq h19, h23, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmeq s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmeq v0.2s, v0.2s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmeq v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 fcmeq v12.4s, v11.4s, v26.4s +# CHECK-NEXT: 1 3 0.50 fcmeq v18.2d, v17.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge h10, h23, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge h1, h16, h12 +# CHECK-NEXT: 1 3 0.50 fcmge d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmge s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmge v0.2d, v0.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge v17.2d, v11.2d, v13.2d +# CHECK-NEXT: 1 3 0.50 fcmge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmge v18.4h, v27.4h, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmge v20.8h, v19.8h, v22.8h +# CHECK-NEXT: 1 3 0.50 fcmge v17.2s, v11.2s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt h4, h5, h0 +# CHECK-NEXT: 1 3 0.50 fcmgt h0, h18, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 fcmgt s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt s10, s11, s12 +# CHECK-NEXT: 1 3 0.50 fcmgt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 fcmgt v24.8h, v24.8h, v28.8h +# CHECK-NEXT: 1 3 0.50 fcmgt v0.8h, v11.8h, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmgt v19.2d, v31.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle v16.8h, v11.8h, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle v22.4s, v30.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle v0.2d, v0.2d, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmle h18, h28, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt h23, h7, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt d20, d21, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt s10, s11, #0.0 +# CHECK-NEXT: 1 3 0.50 
fcmlt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt v8.4h, v2.4h, #0.0 +# CHECK-NEXT: 1 3 0.50 fcmlt v7.2d, v16.2d, #0.0 +# CHECK-NEXT: 1 4 0.50 fcvtas d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtas s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtas h12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtas v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtas v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtas v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtas v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtas v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtau d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtau s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtau h12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtau v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtau v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtau v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtau v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtau v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtl v0.2d, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtl v0.4s, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtl2 v0.2d, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtl2 v0.4s, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtms d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtms s22, s13 +# CHECK-NEXT: 1 4 0.50 fcvtms h22, h13 +# CHECK-NEXT: 1 4 0.50 fcvtms v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtms v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtms v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtms v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtms v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtmu d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtmu s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtmu h12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtmu v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtmu v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtmu v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtmu v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtmu v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtn v0.2s, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtn v0.4h, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtns d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtns s22, s13 +# CHECK-NEXT: 1 4 0.50 fcvtns h22, h13 +# CHECK-NEXT: 1 4 0.50 fcvtns v0.2d, v0.2d +# 
CHECK-NEXT: 1 4 0.50 fcvtns v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtns v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtns v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtns v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtnu d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtnu s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtnu h12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtnu v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtnu v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtnu v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtnu v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtnu v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtps d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtps s22, s13 +# CHECK-NEXT: 1 4 0.50 fcvtps h22, h13 +# CHECK-NEXT: 1 4 0.50 fcvtps v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtps v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtps v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtps v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtps v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtpu d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtpu s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtpu h12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtpu v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtpu v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtpu v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtpu v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtpu v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtxn s22, d13 +# CHECK-NEXT: 1 4 0.50 fcvtxn v0.2s, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtxn2 v0.4s, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtzs d21, d12, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtzs s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtzs s21, s12, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs h21, h14 +# CHECK-NEXT: 1 4 0.50 fcvtzs h21, h12, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtzs v20.4h, v24.4h, #11 +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 fcvtzs v0.8h, v0.8h +# 
CHECK-NEXT: 1 4 0.50 fcvtzs v18.8h, v10.8h, #7 +# CHECK-NEXT: 1 4 0.50 fcvtzu d21, d12, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu d21, d14 +# CHECK-NEXT: 1 4 0.50 fcvtzu s12, s13 +# CHECK-NEXT: 1 4 0.50 fcvtzu s21, s12, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu h12, h13 +# CHECK-NEXT: 1 4 0.50 fcvtzu h21, h12, #1 +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fcvtzu v19.4h, v26.4h, #9 +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 fcvtzu v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fcvtzu v27.8h, v6.8h, #11 +# CHECK-NEXT: 1 22 19.00 fdiv v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 13 5.00 fdiv v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 8 2.50 fdiv v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 13 10.00 fdiv v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 8 5.00 fdiv v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fmax v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fmaxnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmaxnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmaxnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fmaxnmp h25, v19.2h +# CHECK-NEXT: 1 4 0.50 fmaxnmp d17, v29.2d +# CHECK-NEXT: 1 4 0.50 fmaxnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmaxnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmaxnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 1.00 fmaxnmv h0, v13.4h +# CHECK-NEXT: 1 4 1.00 fmaxnmv h12, v11.8h +# CHECK-NEXT: 1 4 1.00 fmaxnmv s28, v31.4s +# CHECK-NEXT: 1 4 0.50 fmaxp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fmaxp h15, v25.2h +# CHECK-NEXT: 1 4 0.50 fmaxp s6, v2.2s +# CHECK-NEXT: 1 4 1.00 fmaxv h0, v0.4h +# CHECK-NEXT: 1 4 1.00 fmaxv h0, v0.8h 
+# CHECK-NEXT: 1 4 1.00 fmaxv s0, v0.4s +# CHECK-NEXT: 1 4 0.50 fmin v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fminnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fminnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fminnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fminnmp h20, v14.2h +# CHECK-NEXT: 1 4 0.50 fminnmp d15, v8.2d +# CHECK-NEXT: 1 4 0.50 fminnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fminnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fminnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 1.00 fminnmv h19, v25.4h +# CHECK-NEXT: 1 4 1.00 fminnmv h23, v17.8h +# CHECK-NEXT: 1 4 1.00 fminnmv s29, v17.4s +# CHECK-NEXT: 1 4 0.50 fminp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fminp h7, v10.2h +# CHECK-NEXT: 1 4 0.50 fminp s17, v7.2s +# CHECK-NEXT: 1 4 1.00 fminv h3, v30.4h +# CHECK-NEXT: 1 4 1.00 fminv h29, v12.8h +# CHECK-NEXT: 1 4 1.00 fminv s16, v19.4s +# CHECK-NEXT: 1 4 0.50 fmla d0, d1, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmla h23, h24, v15.h[4] +# CHECK-NEXT: 1 4 0.50 fmla s0, s1, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmla v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmla v29.8h, v15.8h, v10.h[4] +# CHECK-NEXT: 1 4 0.50 fmla v2.2s, v16.2s, v28.s[0] +# CHECK-NEXT: 1 4 0.50 fmla v14.4s, v14.4s, v5.s[3] +# CHECK-NEXT: 1 4 0.50 fmla v1.4s, v24.4s, v12.4s +# CHECK-NEXT: 1 4 0.50 fmla v10.2d, v14.2d, v21.d[1] +# CHECK-NEXT: 1 4 0.50 fmls d0, d4, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmls h8, h14, v7.h[4] +# CHECK-NEXT: 1 4 0.50 fmls s3, s5, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmls v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmls v30.8h, v18.8h, v4.h[6] +# CHECK-NEXT: 1 4 0.50 fmls v10.2s, v27.2s, v0.s[0] +# CHECK-NEXT: 1 4 0.50 fmls v27.4s, v7.4s, v24.s[0] +# CHECK-NEXT: 1 4 0.50 fmls v10.2d, v22.2d, v29.d[0] +# CHECK-NEXT: 1 4 0.50 fmls v6.8h, v15.8h, v23.8h +# CHECK-NEXT: 1 3 0.50 
fmov v0.2d, #-1.25000000 +# CHECK-NEXT: 1 3 0.50 fmov v0.2s, #13.00000000 +# CHECK-NEXT: 1 3 0.50 fmov v0.4s, #1.00000000 +# CHECK-NEXT: 1 4 0.50 fmul h18, h4, v7.h[3] +# CHECK-NEXT: 1 4 0.50 fmul v10.4h, v2.4h, v7.h[5] +# CHECK-NEXT: 1 4 0.50 fmul v5.2s, v12.2s, v9.s[0] +# CHECK-NEXT: 1 4 0.50 fmul v15.4s, v30.4s, v2.s[3] +# CHECK-NEXT: 1 4 0.50 fmul v11.2d, v31.2d, v24.d[1] +# CHECK-NEXT: 1 4 0.50 fmul h28, h14, h3 +# CHECK-NEXT: 1 4 0.50 fmul d0, d1, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmul s0, s1, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmul v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmulx d0, d4, v0.d[1] +# CHECK-NEXT: 1 4 0.50 fmulx d23, d11, d1 +# CHECK-NEXT: 1 4 0.50 fmulx s20, s22, s15 +# CHECK-NEXT: 1 4 0.50 fmulx h18, h17, v7.h[1] +# CHECK-NEXT: 1 4 0.50 fmulx h20, h25, h0 +# CHECK-NEXT: 1 4 0.50 fmulx s3, s5, v0.s[3] +# CHECK-NEXT: 1 4 0.50 fmulx v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fmulx v28.4h, v25.4h, v15.h[1] +# CHECK-NEXT: 1 4 0.50 fmulx v3.2s, v22.2s, v23.s[3] +# CHECK-NEXT: 1 4 0.50 fmulx v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fmulx v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fmulx v5.4s, v28.4s, v15.s[3] +# CHECK-NEXT: 1 4 0.50 fmulx v22.2d, v18.2d, v25.d[1] +# CHECK-NEXT: 1 4 0.50 fneg v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 fneg v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 fneg v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 fneg v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 fneg v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frecpe h20, h8 +# CHECK-NEXT: 1 4 0.50 frecpe d13, d13 +# CHECK-NEXT: 1 4 0.50 frecpe s19, s14 +# CHECK-NEXT: 1 4 0.50 frecpe v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frecpe v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frecpe v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frecpe v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frecpe v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frecps h29, h19, h8 +# CHECK-NEXT: 1 4 0.50 frecpx h18, h11 +# CHECK-NEXT: 1 4 0.50 frecps v12.8h, v25.8h, v4.8h +# CHECK-NEXT: 1 4 0.50 frecps v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frecps d22, d30, d21 +# CHECK-NEXT: 1 4 
0.50 frecps s21, s16, s13 +# CHECK-NEXT: 1 4 0.50 frecps v7.2d, v29.2d, v18.2d +# CHECK-NEXT: 1 4 0.50 frecpx d16, d19 +# CHECK-NEXT: 1 4 0.50 frecpx s18, s10 +# CHECK-NEXT: 1 4 0.50 frinta v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frinta v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frinta v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frinta v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frinta v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frinti v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frinti v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frinti v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frinti v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frinti v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frintm v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frintm v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frintm v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frintm v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frintm v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frintn v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frintn v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frintn v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frintn v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frintn v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frintp v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frintp v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frintp v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frintp v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frintp v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frintx v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frintx v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frintx v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frintx v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frintx v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frintz v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frintz v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frintz v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frintz v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 frintz v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frsqrte h23, h26 +# CHECK-NEXT: 1 4 0.50 frsqrte d21, d12 +# CHECK-NEXT: 1 4 0.50 frsqrte s22, s13 +# CHECK-NEXT: 1 4 0.50 frsqrte v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 frsqrte v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 frsqrte v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 frsqrte v0.4s, 
v0.4s +# CHECK-NEXT: 1 4 0.50 frsqrts v20.4s, v26.4s, v27.4s +# CHECK-NEXT: 1 4 0.50 frsqrts v8.4h, v9.4h, v30.4h +# CHECK-NEXT: 1 4 0.50 frsqrte v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 frsqrts h28, h26, h1 +# CHECK-NEXT: 1 4 0.50 frsqrts d8, d22, d18 +# CHECK-NEXT: 1 4 0.50 frsqrts s21, s5, s12 +# CHECK-NEXT: 1 4 0.50 frsqrts v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 22 19.00 fsqrt v0.2d, v0.2d +# CHECK-NEXT: 1 12 4.50 fsqrt v0.2s, v0.2s +# CHECK-NEXT: 1 8 2.50 fsqrt v0.4h, v0.4h +# CHECK-NEXT: 1 12 9.00 fsqrt v0.4s, v0.4s +# CHECK-NEXT: 1 8 5.00 fsqrt v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 fsub v13.8h, v15.8h, v17.8h +# CHECK-NEXT: 1 4 0.50 fsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 * ld1 { v0.16b }, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1 { v0.16b, v1.16b }, [x14] +# CHECK-NEXT: 1 4 2.00 * ld1 { v19.16b, v20.16b, v21.16b }, [x10] +# CHECK-NEXT: 1 4 2.00 * ld1 { v13.16b, v14.16b, v15.16b, v16.16b }, [x9] +# CHECK-NEXT: 1 3 0.50 * ld1 { v24.8h }, [x27] +# CHECK-NEXT: 1 3 1.00 * ld1 { v1.8h, v2.8h }, [x27] +# CHECK-NEXT: 2 3 1.00 * ld1 { v0.8h, v1.8h }, [sp], #32 +# CHECK-NEXT: 1 4 2.00 * ld1 { v21.8h, v22.8h, v23.8h }, [x22] +# CHECK-NEXT: 1 4 2.00 * ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x21] +# CHECK-NEXT: 1 3 0.50 * ld1 { v3.4s }, [x4] +# CHECK-NEXT: 1 3 1.00 * ld1 { v11.4s, v12.4s }, [x30] +# CHECK-NEXT: 1 4 2.00 * ld1 { v0.4s, v1.4s, v2.4s }, [x24] +# CHECK-NEXT: 1 4 2.00 * ld1 { v15.4s, v16.4s, v17.4s, v18.4s }, [x28] +# CHECK-NEXT: 2 4 2.00 * ld1 { v0.4s, v1.4s, v2.4s }, [x0], #48 +# CHECK-NEXT: 1 3 0.50 * ld1 { v3.2d }, [x28] +# CHECK-NEXT: 1 3 1.00 * ld1 { v13.2d, v14.2d }, [x13] +# CHECK-NEXT: 1 4 2.00 * ld1 { v12.2d, v13.2d, v14.2d }, [x15] +# CHECK-NEXT: 2 4 2.00 * ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: 1 4 2.00 * ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: 2 3 0.50 * ld1 { v0.1d }, [x15], x2 +# CHECK-NEXT: 1 3 1.00 * ld1 { v27.1d, v28.1d }, [x7] +# CHECK-NEXT: 1 4 2.00 * ld1 { v14.1d, v15.1d, v16.1d }, [x3] +# CHECK-NEXT: 1 4 2.00 * 
ld1 { v22.1d, v23.1d, v24.1d, v25.1d }, [x4] +# CHECK-NEXT: 1 3 1.00 * ld1 { v0.2s, v1.2s }, [x15] +# CHECK-NEXT: 1 4 2.00 * ld1 { v16.2s, v17.2s, v18.2s }, [x27] +# CHECK-NEXT: 1 4 2.00 * ld1 { v21.2s, v22.2s, v23.2s, v24.2s }, [x21] +# CHECK-NEXT: 1 3 1.00 * ld1 { v25.4h, v26.4h }, [x3] +# CHECK-NEXT: 1 4 2.00 * ld1 { v20.4h, v21.4h, v22.4h, v23.4h }, [x15] +# CHECK-NEXT: 1 4 2.00 * ld1 { v0.4h, v1.4h, v2.4h }, [sp] +# CHECK-NEXT: 1 3 1.00 * ld1 { v24.8b, v25.8b }, [x6] +# CHECK-NEXT: 1 4 2.00 * ld1 { v7.8b, v8.8b, v9.8b }, [x12] +# CHECK-NEXT: 1 4 2.00 * ld1 { v4.8b, v5.8b, v6.8b, v7.8b }, [x13] +# CHECK-NEXT: 2 3 1.00 * ld1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 1 4 2.00 * ld1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: 2 4 2.00 * ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: 1 3 0.50 * ld1 { v0.b }[7], [x0] +# CHECK-NEXT: 2 3 0.50 * ld1 { v0.h }[3], [x0], #2 +# CHECK-NEXT: 1 3 0.50 * ld1 { v18.h }[3], [x1] +# CHECK-NEXT: 1 3 0.50 * ld1 { v0.s }[1], [x15] +# CHECK-NEXT: 2 3 0.50 * ld1 { v0.d }[0], [x15], #8 +# CHECK-NEXT: 1 3 0.50 * ld1 { v11.d }[0], [x13] +# CHECK-NEXT: 2 3 0.50 * ld1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 1 3 1.00 * ld1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 1 3 0.50 * ld1 { v0.b }[9], [x0] +# CHECK-NEXT: 2 3 0.50 * ld1 { v0.b }[9], [x0], #1 +# CHECK-NEXT: 1 3 0.50 * ld1r { v0.16b }, [x0] +# CHECK-NEXT: 2 3 0.50 * ld1r { v0.8h }, [x0], #2 +# CHECK-NEXT: 1 3 0.50 * ld1r { v0.4s }, [x15] +# CHECK-NEXT: 1 3 0.50 * ld1r { v3.1d }, [x15] +# CHECK-NEXT: 2 3 0.50 * ld1r { v0.2d }, [x15], x16 +# CHECK-NEXT: 1 3 0.50 * ld1r { v18.2d }, [x0] +# CHECK-NEXT: 1 3 0.50 * ld1r { v8.8b }, [x23] +# CHECK-NEXT: 1 3 0.50 * ld1r { v28.4h }, [x9] +# CHECK-NEXT: 1 3 0.50 * ld1r { v3.8h }, [x16] +# CHECK-NEXT: 1 3 0.50 * ld1r { v10.2s }, [x20] +# CHECK-NEXT: 1 4 1.00 * ld2 { v0.4h, v1.4h }, [x21] +# CHECK-NEXT: 1 4 1.00 * ld2 { v8.8h, v9.8h }, [x28] +# CHECK-NEXT: 1 4 1.00 * ld2 { v2.2s, v3.2s }, [x16] +# CHECK-NEXT: 1 4 1.00 * ld2 { v22.4s, v23.4s 
}, [x4] +# CHECK-NEXT: 1 4 1.00 * ld2 { v22.2d, v23.2d }, [x17] +# CHECK-NEXT: 1 4 4.00 * ld2 { v29.b, v30.b }[3], [x1] +# CHECK-NEXT: 1 4 4.00 * ld2 { v26.s, v27.s }[1], [x17] +# CHECK-NEXT: 1 4 4.00 * ld2 { v1.d, v2.d }[0], [x10] +# CHECK-NEXT: 1 4 1.00 * ld2 { v0.16b, v1.16b }, [x0] +# CHECK-NEXT: 1 4 1.00 * ld2 { v13.8b, v14.8b }, [x4] +# CHECK-NEXT: 2 4 1.00 * ld2 { v0.8b, v1.8b }, [x0], #16 +# CHECK-NEXT: 2 3 0.50 * ld1r { v0.16b }, [x0], #1 +# CHECK-NEXT: 1 3 0.50 * ld1r { v0.8h }, [x15] +# CHECK-NEXT: 2 3 0.50 * ld1r { v0.8h }, [x15], #2 +# CHECK-NEXT: 2 4 1.00 * ld2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: 1 4 1.00 * ld2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 1 4 4.00 * ld2 { v0.h, v1.h }[7], [x15] +# CHECK-NEXT: 2 4 4.00 * ld2 { v0.h, v1.h }[7], [x15], x8 +# CHECK-NEXT: 2 4 4.00 * ld2 { v0.h, v1.h }[7], [x15], #4 +# CHECK-NEXT: 1 3 1.00 * ld2r { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 1 3 1.00 * ld2r { v10.16b, v11.16b }, [x23] +# CHECK-NEXT: 2 3 1.00 * ld2r { v0.4h, v1.4h }, [x0], #4 +# CHECK-NEXT: 1 3 1.00 * ld2r { v25.4h, v26.4h }, [x11] +# CHECK-NEXT: 1 3 1.00 * ld2r { v23.8h, v24.8h }, [x10] +# CHECK-NEXT: 1 3 1.00 * ld2r { v0.2s, v1.2s }, [sp] +# CHECK-NEXT: 1 3 1.00 * ld2r { v8.4s, v9.4s }, [x17] +# CHECK-NEXT: 2 3 1.00 * ld2r { v0.1d, v1.1d }, [sp], x8 +# CHECK-NEXT: 1 3 1.00 * ld2r { v9.1d, v10.1d }, [x25] +# CHECK-NEXT: 1 3 1.00 * ld2r { v26.2d, v27.2d }, [x8] +# CHECK-NEXT: 1 5 3.00 * ld3 { v8.8b, v9.8b, v10.8b }, [x0] +# CHECK-NEXT: 1 5 3.00 * ld3 { v15.16b, v16.16b, v17.16b }, [x5] +# CHECK-NEXT: 1 3 1.00 * ld2r { v0.2d, v1.2d }, [x0] +# CHECK-NEXT: 2 3 1.00 * ld2r { v0.2d, v1.2d }, [x0], #16 +# CHECK-NEXT: 1 3 1.00 * ld2r { v0.4s, v1.4s }, [sp] +# CHECK-NEXT: 2 3 1.00 * ld2r { v0.4s, v1.4s }, [sp], #8 +# CHECK-NEXT: 1 5 3.00 * ld3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 2 5 3.00 * ld3 { v0.8h, v1.8h, v2.8h }, [x15], #48 +# CHECK-NEXT: 1 5 3.00 * ld3 { v7.8h, v8.8h, v9.8h }, [x21] +# CHECK-NEXT: 1 5 3.00 * ld3 { v16.2s, v17.2s, v18.2s }, 
[x0] +# CHECK-NEXT: 1 5 3.00 * ld3 { v12.4s, v13.4s, v14.4s }, [x25] +# CHECK-NEXT: 1 5 5.00 * ld3 { v17.b, v18.b, v19.b }[2], [x27] +# CHECK-NEXT: 1 5 5.00 * ld3 { v18.h, v19.h, v20.h }[5], [x16] +# CHECK-NEXT: 1 5 3.00 * ld3 { v10.2d, v11.2d, v12.2d }, [x18] +# CHECK-NEXT: 2 5 3.00 * ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: 1 5 5.00 * ld3 { v0.s, v1.s, v2.s }[3], [sp] +# CHECK-NEXT: 2 5 5.00 * ld3 { v0.s, v1.s, v2.s }[3], [sp], x3 +# CHECK-NEXT: 1 5 5.00 * ld3 { v5.d, v6.d, v7.d }[1], [x14] +# CHECK-NEXT: 1 4 2.00 * ld3r { v0.8b, v1.8b, v2.8b }, [x15] +# CHECK-NEXT: 1 4 2.00 * ld3r { v17.16b, v18.16b, v19.16b }, [x3] +# CHECK-NEXT: 1 4 2.00 * ld3r { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 2 4 2.00 * ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +# CHECK-NEXT: 1 4 2.00 * ld3r { v3.4h, v4.4h, v5.4h }, [x1] +# CHECK-NEXT: 1 4 2.00 * ld3r { v6.8h, v7.8h, v8.8h }, [x28] +# CHECK-NEXT: 1 4 2.00 * ld3r { v0.2s, v1.2s, v2.2s }, [x0] +# CHECK-NEXT: 1 4 2.00 * ld3r { v28.4s, v29.4s, v30.4s }, [x2] +# CHECK-NEXT: 2 4 2.00 * ld3r { v0.1d, v1.1d, v2.1d }, [x0], x0 +# CHECK-NEXT: 1 4 2.00 * ld3r { v1.1d, v2.1d, v3.1d }, [x28] +# CHECK-NEXT: 1 4 2.00 * ld3r { v8.2d, v9.2d, v10.2d }, [x3] +# CHECK-NEXT: 1 5 3.00 * ld4 { v6.8b, v7.8b, v8.8b, v9.8b }, [x27] +# CHECK-NEXT: 1 5 3.00 * ld4 { v11.16b, v12.16b, v13.16b, v14.16b }, [x5] +# CHECK-NEXT: 1 5 3.00 * ld4 { v21.4h, v22.4h, v23.4h, v24.4h }, [x14] +# CHECK-NEXT: 1 5 3.00 * ld4 { v9.8h, v10.8h, v11.8h, v12.8h }, [x1] +# CHECK-NEXT: 1 5 3.00 * ld4 { v17.4s, v18.4s, v19.4s, v20.4s }, [x4] +# CHECK-NEXT: 1 4 2.00 * ld3r { v0.8b, v1.8b, v2.8b }, [x0] +# CHECK-NEXT: 2 4 2.00 * ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +# CHECK-NEXT: 1 5 3.00 * ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 2 5 3.00 * ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: 1 6 5.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +# CHECK-NEXT: 1 5 3.00 * ld4 { v2.2d, v3.2d, v4.2d, v5.2d }, [x24] +# CHECK-NEXT: 1 6 5.00 * ld4 { v4.b, 
v5.b, v6.b, v7.b }[12], [x27] +# CHECK-NEXT: 1 6 5.00 * ld4 { v5.h, v6.h, v7.h, v8.h }[0], [x4] +# CHECK-NEXT: 2 6 5.00 * ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32 +# CHECK-NEXT: 2 6 5.00 * ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +# CHECK-NEXT: 1 6 5.00 * ld4 { v0.s, v1.s, v2.s, v3.s }[0], [x26] +# CHECK-NEXT: 1 4 2.00 * ld4r { v20.8b, v21.8b, v22.8b, v23.8b }, [x23] +# CHECK-NEXT: 1 4 2.00 * ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x25] +# CHECK-NEXT: 1 4 2.00 * ld4r { v16.4h, v17.4h, v18.4h, v19.4h }, [x6] +# CHECK-NEXT: 1 4 2.00 * ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +# CHECK-NEXT: 1 4 2.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# CHECK-NEXT: 1 4 2.00 * ld4r { v4.8h, v5.8h, v6.8h, v7.8h }, [x23] +# CHECK-NEXT: 1 4 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [x30] +# CHECK-NEXT: 2 4 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 +# CHECK-NEXT: 1 4 2.00 * ld4r { v7.4s, v8.4s, v9.4s, v10.4s }, [x23] +# CHECK-NEXT: 2 4 2.00 * ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 +# CHECK-NEXT: 2 4 2.00 * ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +# CHECK-NEXT: 1 4 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 2 4 2.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +# CHECK-NEXT: 1 4 0.50 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mla v15.8h, v22.8h, v4.h[3] +# CHECK-NEXT: 1 4 0.50 mla v28.2s, v10.2s, v2.s[0] +# CHECK-NEXT: 1 4 0.50 mls v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 mls v25.8h, v29.8h, v0.h[4] +# CHECK-NEXT: 1 4 0.50 mls v22.2s, v29.2s, v0.s[3] +# CHECK-NEXT: 1 4 0.50 mls v26.4s, v5.4s, v28.4s +# CHECK-NEXT: 1 3 0.50 mov b0, v0.b[15] +# CHECK-NEXT: 1 3 0.50 mov d6, v0.d[1] +# CHECK-NEXT: 1 3 0.50 mov h2, v0.h[5] +# CHECK-NEXT: 1 3 0.50 mov s17, v0.s[2] +# CHECK-NEXT: 1 3 0.50 mov w8, v8.s[0] +# CHECK-NEXT: 1 3 0.50 mov x30, v18.d[0] +# CHECK-NEXT: 1 3 0.50 mov v2.b[0], v0.b[0] +# CHECK-NEXT: 1 3 0.50 mov v2.h[1], v0.h[1] +# CHECK-NEXT: 1 3 0.50 mov v2.s[2], v0.s[2] +# CHECK-NEXT: 1 3 0.50 mov v2.d[1], 
v0.d[1] +# CHECK-NEXT: 1 3 0.50 mov v0.b[0], w8 +# CHECK-NEXT: 1 3 0.50 mov v0.h[1], w8 +# CHECK-NEXT: 1 3 0.50 mov v0.s[2], w8 +# CHECK-NEXT: 1 3 0.50 mov v0.d[1], x8 +# CHECK-NEXT: 1 3 0.50 mov v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 mov v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 movi d15, #0xff00ff00ff00ff +# CHECK-NEXT: 1 3 0.50 movi v0.16b, #31 +# CHECK-NEXT: 1 3 0.50 movi v14.8h, #174 +# CHECK-NEXT: 1 3 0.50 movi v13.4h, #74, lsl #8 +# CHECK-NEXT: 1 3 0.50 movi v0.2d, #0xff0000ff0000ffff +# CHECK-NEXT: 1 3 0.50 movi v0.2s, #8, msl #8 +# CHECK-NEXT: 1 3 0.50 movi v19.2s, #226 +# CHECK-NEXT: 1 3 0.50 movi v1.4s, #122, msl #8 +# CHECK-NEXT: 1 3 0.50 movi v0.4s, #255, lsl #24 +# CHECK-NEXT: 1 3 0.50 movi v0.8b, #255 +# CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mul v26.4h, v20.4h, v14.h[5] +# CHECK-NEXT: 1 4 0.50 mul v5.8h, v21.8h, v3.h[7] +# CHECK-NEXT: 1 4 0.50 mul v29.2s, v10.2s, v3.s[1] +# CHECK-NEXT: 1 4 0.50 mul v30.4s, v11.4s, v4.s[0] +# CHECK-NEXT: 1 4 0.50 mul v30.4s, v11.4s, v4.4s +# CHECK-NEXT: 1 4 0.50 mul v3.8h, v9.8h, v8.8h +# CHECK-NEXT: 1 3 0.50 mvni v9.4h, #237 +# CHECK-NEXT: 1 3 0.50 mvni v8.8h, #171, lsl #8 +# CHECK-NEXT: 1 3 0.50 mvni v22.4s, #15, lsl #8 +# CHECK-NEXT: 1 3 0.50 mvni v0.2s, #0 +# CHECK-NEXT: 1 3 0.50 mvni v0.4s, #16, msl #16 +# CHECK-NEXT: 1 3 0.50 neg d29, d24 +# CHECK-NEXT: 1 3 0.50 neg v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 neg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 neg v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 neg v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 neg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 neg v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 neg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 mvn v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 mvn v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 orn v29.8b, v19.8b, v16.8b +# CHECK-NEXT: 1 3 0.50 mov v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 orr v9.4h, #18 +# CHECK-NEXT: 1 3 0.50 orr v0.8h, #31 +# CHECK-NEXT: 1 3 0.50 orr v4.4s, #0 +# CHECK-NEXT: 1 3 0.50 
pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 pmul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 6 3.00 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 6 3.00 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 6 3.00 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 6 3.00 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 6 3.00 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 6 3.00 raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 rbit v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 rbit v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 rev16 v21.8b, v1.8b +# CHECK-NEXT: 1 3 0.50 rev16 v30.16b, v31.16b +# CHECK-NEXT: 1 3 0.50 rev32 v0.4h, v9.4h +# CHECK-NEXT: 1 3 0.50 rev32 v21.8b, v1.8b +# CHECK-NEXT: 1 3 0.50 rev32 v30.16b, v31.16b +# CHECK-NEXT: 1 3 0.50 rev32 v4.8h, v7.8h +# CHECK-NEXT: 1 3 0.50 rev64 v0.16b, v31.16b +# CHECK-NEXT: 1 3 0.50 rev64 v1.8b, v9.8b +# CHECK-NEXT: 1 3 0.50 rev64 v13.4h, v21.4h +# CHECK-NEXT: 1 3 0.50 rev64 v2.8h, v4.8h +# CHECK-NEXT: 1 3 0.50 rev64 v4.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 rev64 v6.4s, v8.4s +# CHECK-NEXT: 1 4 0.50 rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 rshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 6 3.00 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 6 3.00 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 6 3.00 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 6 3.00 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 6 3.00 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 6 3.00 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 5 3.00 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 5 3.00 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 5 3.00 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 5 3.00 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 5 3.00 sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 5 3.00 sabal2 v0.4s, 
v0.8h, v0.8h +# CHECK-NEXT: 1 5 3.00 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sabd v12.2s, v11.2s, v27.2s +# CHECK-NEXT: 1 3 0.50 sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 5 3.00 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 5 3.00 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 5 3.00 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 5 3.00 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 5 3.00 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 5 3.00 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 4 1.00 saddlv d0, v0.4s +# CHECK-NEXT: 1 4 1.00 saddlv s0, v0.4h +# CHECK-NEXT: 1 4 1.00 saddlv s0, v0.8h +# CHECK-NEXT: 1 4 1.00 saddlv h0, v0.8b +# CHECK-NEXT: 1 4 1.00 saddlv h0, v0.16b +# CHECK-NEXT: 1 3 0.50 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 4 0.50 scvtf h4, h8, #9 +# CHECK-NEXT: 1 4 0.50 scvtf h5, h14 +# CHECK-NEXT: 1 4 0.50 scvtf d21, d12 +# CHECK-NEXT: 1 4 0.50 scvtf d21, d12, #64 +# CHECK-NEXT: 1 4 
0.50 scvtf s22, s13 +# CHECK-NEXT: 1 4 0.50 scvtf s22, s13, #32 +# CHECK-NEXT: 1 4 0.50 scvtf v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 scvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 scvtf v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 scvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 0.50 scvtf v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 scvtf v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 scvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 scvtf v25.4h, v13.4h, #8 +# CHECK-NEXT: 1 4 0.50 scvtf v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 scvtf v4.8h, v8.8h, #10 +# CHECK-NEXT: 1 4 0.50 sdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 4 0.50 sdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 sdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 sdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.50 shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 shadd v25.16b, v1.16b, v10.16b +# CHECK-NEXT: 1 3 0.50 shl d7, d10, #12 +# CHECK-NEXT: 1 3 0.50 shl v23.8b, v18.8b, #6 +# CHECK-NEXT: 1 3 0.50 shl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 shl v0.8h, v23.8h, #10 +# CHECK-NEXT: 1 3 0.50 shl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 3 0.50 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 3 0.50 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 3 0.50 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 3 0.50 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 3 0.50 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 1 3 0.50 shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 shsub v15.8h, v5.8h, v27.8h +# CHECK-NEXT: 1 3 0.50 sli d10, d14, #12 +# CHECK-NEXT: 1 3 0.50 sli v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.2d, v0.2d, #3 +# 
CHECK-NEXT: 1 3 0.50 sli v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sli v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smax v30.16b, v3.16b, v30.16b +# CHECK-NEXT: 1 3 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smaxp v21.8h, v16.8h, v7.8h +# CHECK-NEXT: 1 3 0.50 smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 1.00 smaxv b0, v0.8b +# CHECK-NEXT: 1 4 1.00 smaxv b0, v0.16b +# CHECK-NEXT: 1 4 1.00 smaxv h0, v0.4h +# CHECK-NEXT: 1 4 1.00 smaxv h0, v0.8h +# CHECK-NEXT: 1 4 1.00 smaxv s0, v0.4s +# CHECK-NEXT: 1 3 0.50 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 sminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 1.00 sminv b0, v0.8b +# CHECK-NEXT: 1 4 1.00 sminv b0, v0.16b +# CHECK-NEXT: 1 4 1.00 sminv h0, v0.4h +# CHECK-NEXT: 1 4 1.00 sminv h0, v0.8h +# CHECK-NEXT: 1 4 1.00 sminv s0, v0.4s +# CHECK-NEXT: 1 4 0.50 smlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 smlal v0.2d, v25.2s, v1.s[1] +# CHECK-NEXT: 1 4 0.50 smlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 smlal v16.4s, v9.4h, v11.h[4] +# CHECK-NEXT: 1 4 0.50 smlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 smlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 smlal2 v30.2d, v22.4s, v7.s[2] +# CHECK-NEXT: 1 4 0.50 smlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 smlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 smlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 smlsl v25.2d, v27.2s, v1.s[1] +# CHECK-NEXT: 1 4 0.50 smlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 smlsl v14.4s, v23.4h, 
v12.h[7] +# CHECK-NEXT: 1 4 0.50 smlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 smlal2 v1.4s, v9.8h, v0.h[6] +# CHECK-NEXT: 1 4 0.50 smlsl2 v12.4s, v11.8h, v12.h[0] +# CHECK-NEXT: 1 4 0.50 smlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 smlsl2 v11.2d, v28.4s, v7.s[2] +# CHECK-NEXT: 1 4 0.50 smlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 smlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 smull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 smull v31.2d, v23.2s, v6.s[2] +# CHECK-NEXT: 1 4 0.50 smull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 smull v3.4s, v26.4h, v1.h[5] +# CHECK-NEXT: 1 4 0.50 smull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 smull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 smull2 v11.2d, v1.4s, v7.s[0] +# CHECK-NEXT: 1 4 0.50 smull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 smull2 v13.4s, v18.8h, v0.h[3] +# CHECK-NEXT: 1 4 0.50 smull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 sqabs b19, b14 +# CHECK-NEXT: 1 4 0.50 sqabs d18, d12 +# CHECK-NEXT: 1 4 0.50 sqabs h21, h15 +# CHECK-NEXT: 1 4 0.50 sqabs s20, s12 +# CHECK-NEXT: 1 4 0.50 sqabs v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 sqabs v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 sqabs v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqabs v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqabs v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqabs v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 sqabs v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 sqadd b20, b11, b15 +# CHECK-NEXT: 1 4 0.50 sqadd h12, h18, h10 +# CHECK-NEXT: 1 4 0.50 sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 4 0.50 sqdmlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqdmlal v11.2d, v24.2s, v0.s[3] +# CHECK-NEXT: 1 4 0.50 sqdmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqdmlal v20.4s, v30.4h, v12.h[3] +# CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.2d, 
v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqdmlal2 v23.2d, v30.4s, v6.s[0] +# CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 sqdmlal2 v2.4s, v17.8h, v5.h[6] +# CHECK-NEXT: 1 4 0.50 sqdmulh v8.4h, v16.4h, v5.h[4] +# CHECK-NEXT: 1 4 0.50 sqdmulh v16.2s, v24.2s, v7.s[2] +# CHECK-NEXT: 1 4 0.50 sqdmull v8.4s, v19.4h, v1.h[2] +# CHECK-NEXT: 1 4 0.50 sqdmull v20.2d, v10.2s, v6.s[2] +# CHECK-NEXT: 1 4 0.50 sqdmull2 v10.4s, v25.8h, v0.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmull2 v4.2d, v29.4s, v2.s[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh v0.8h, v15.8h, v0.h[5] +# CHECK-NEXT: 1 4 0.50 sqrdmulh v6.2s, v29.2s, v4.s[2] +# CHECK-NEXT: 1 4 0.50 sqrdmulh v31.2s, v17.2s, v4.2s +# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 4 0.50 sqdmlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqdmlsl v26.2d, v7.2s, v3.s[0] +# CHECK-NEXT: 1 4 0.50 sqdmlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqdmlsl v4.4s, v22.4h, v13.h[2] +# CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqdmlsl2 v4.2d, v3.4s, v3.s[2] +# CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 sqdmlsl2 v2.4s, v28.8h, v4.h[6] +# CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqdmulh v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmull d15, s22, s12 +# CHECK-NEXT: 1 4 0.50 sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqdmull s12, h22, h12 +# CHECK-NEXT: 1 4 0.50 sqdmull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqdmull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqdmull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqdmull2 v0.4s, v0.8h, v0.8h +# 
CHECK-NEXT: 1 4 0.50 sqneg b19, b14 +# CHECK-NEXT: 1 4 0.50 sqneg d18, d12 +# CHECK-NEXT: 1 4 0.50 sqneg h21, h15 +# CHECK-NEXT: 1 4 0.50 sqneg s20, s12 +# CHECK-NEXT: 1 4 0.50 sqneg v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 sqneg v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 sqneg v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqneg v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqneg v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqneg v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 sqneg v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, h2 +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.4h +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.2s +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, h2 +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.4h +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.8h +# CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.2s +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqrdmulh v0.8h, v0.8h, v0.8h +# 
CHECK-NEXT: 1 4 0.50 sqrshl d31, d31, d31 +# CHECK-NEXT: 1 4 0.50 sqrshl h3, h4, h15 +# CHECK-NEXT: 1 4 0.50 sqrshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqrshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqrshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 sqshl s17, s4, s23 +# CHECK-NEXT: 1 4 0.50 sqsub b3, b13, b12 +# CHECK-NEXT: 1 4 0.50 sqsub v20.8h, v18.8h, v12.8h +# CHECK-NEXT: 1 4 0.50 sqrshrn b10, h13, #2 +# CHECK-NEXT: 1 4 0.50 sqrshrn h15, s10, #6 +# CHECK-NEXT: 1 4 0.50 sqrshrn s15, d12, #9 +# CHECK-NEXT: 1 4 0.50 sqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrun b17, h10, #6 +# CHECK-NEXT: 1 4 0.50 sqrshrun h10, s13, #15 +# CHECK-NEXT: 1 4 0.50 sqrshrun s22, d16, #31 +# CHECK-NEXT: 1 4 0.50 sqrshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqrshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqshl b11, b19, #7 +# CHECK-NEXT: 1 4 0.50 sqshl d15, d16, #51 +# CHECK-NEXT: 1 4 0.50 sqshl d31, d31, d31 +# CHECK-NEXT: 1 4 0.50 sqshl h13, h18, #11 +# CHECK-NEXT: 1 4 0.50 sqshl h3, h4, h15 +# CHECK-NEXT: 1 4 0.50 sqshl s14, s17, #22 +# CHECK-NEXT: 1 4 0.50 sqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 4 0.50 sqshl v23.16b, v23.16b, v23.16b +# CHECK-NEXT: 1 4 0.50 sqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 0.50 sqshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 sqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 4 0.50 sqshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 sqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 4 0.50 sqshl v0.8b, v0.8b, 
v0.8b +# CHECK-NEXT: 1 4 0.50 sqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu b15, b18, #6 +# CHECK-NEXT: 1 4 0.50 sqshlu d11, d13, #32 +# CHECK-NEXT: 1 4 0.50 sqshlu h19, h17, #6 +# CHECK-NEXT: 1 4 0.50 sqshlu s16, s14, #25 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 4 0.50 sqshlu v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqshrn b10, h15, #5 +# CHECK-NEXT: 1 4 0.50 sqshrn h17, s10, #4 +# CHECK-NEXT: 1 4 0.50 sqshrn s18, d10, #31 +# CHECK-NEXT: 1 4 0.50 sqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun b15, h10, #7 +# CHECK-NEXT: 1 4 0.50 sqshrun h20, s14, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun s10, d15, #15 +# CHECK-NEXT: 1 4 0.50 sqshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 sqshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 sqsub s20, s10, s7 +# CHECK-NEXT: 1 4 0.50 sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 sqxtn b18, h18 +# CHECK-NEXT: 1 4 0.50 sqxtn h20, s17 +# CHECK-NEXT: 1 4 0.50 sqxtn s19, d14 +# CHECK-NEXT: 1 4 0.50 sqxtn v0.2s, v0.2d +# CHECK-NEXT: 1 4 0.50 sqxtn v0.4h, v0.4s +# CHECK-NEXT: 1 4 0.50 sqxtn v0.8b, v0.8h +# CHECK-NEXT: 1 4 0.50 sqxtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 4 0.50 sqxtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 4 0.50 sqxtn2 v0.8h, v0.4s 
+# CHECK-NEXT: 1 4 0.50 sqxtun b19, h14 +# CHECK-NEXT: 1 4 0.50 sqxtun h21, s15 +# CHECK-NEXT: 1 4 0.50 sqxtun s20, d12 +# CHECK-NEXT: 1 4 0.50 sqxtun v0.2s, v0.2d +# CHECK-NEXT: 1 4 0.50 sqxtun v0.4h, v0.4s +# CHECK-NEXT: 1 4 0.50 sqxtun v0.8b, v0.8h +# CHECK-NEXT: 1 4 0.50 sqxtun2 v0.16b, v0.8h +# CHECK-NEXT: 1 4 0.50 sqxtun2 v0.4s, v0.2d +# CHECK-NEXT: 1 4 0.50 sqxtun2 v0.8h, v0.4s +# CHECK-NEXT: 1 2 0.50 srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sri d10, d12, #14 +# CHECK-NEXT: 1 3 0.50 sri v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sri v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 srshl d16, d16, d16 +# CHECK-NEXT: 1 3 0.50 srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 srshr d19, d18, #7 +# CHECK-NEXT: 1 3 0.50 srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 5 3.00 srsra d15, d11, #19 +# CHECK-NEXT: 1 5 3.00 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 5 3.00 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 5 3.00 srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 5 3.00 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 5 3.00 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 5 3.00 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 5 3.00 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sshl d31, d31, d31 +# CHECK-NEXT: 1 3 0.50 sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 
1 3 0.50 sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 sshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sshll v9.8h, v2.8b, #0 +# CHECK-NEXT: 1 3 0.50 sshll v12.4s, v3.4h, #4 +# CHECK-NEXT: 1 3 0.50 sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sshll2 v28.8h, v12.16b, #7 +# CHECK-NEXT: 1 3 0.50 sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 sshll2 v17.2d, v13.4s, #22 +# CHECK-NEXT: 1 3 0.50 sshr d15, d16, #12 +# CHECK-NEXT: 1 3 0.50 sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 3 0.50 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 4 1.00 * st1 { v18.8b }, [x15] +# CHECK-NEXT: 1 4 1.00 * st1 { v8.8b, v9.8b }, [x18] +# CHECK-NEXT: 1 4 2.00 * st1 { v15.8b, v16.8b, v17.8b }, [x0] +# CHECK-NEXT: 1 4 2.00 * st1 { v21.8b, v22.8b, v23.8b, v24.8b }, [x14] +# CHECK-NEXT: 1 4 1.00 * st1 { v0.16b }, 
[x0] +# CHECK-NEXT: 1 4 2.00 * st1 { v1.16b, v2.16b }, [x4] +# CHECK-NEXT: 1 4 3.00 * st1 { v27.16b, v28.16b, v29.16b }, [x18] +# CHECK-NEXT: 1 4 4.00 * st1 { v18.16b, v19.16b, v20.16b, v21.16b }, [x29] +# CHECK-NEXT: 1 4 1.00 * st1 { v19.4h }, [x7] +# CHECK-NEXT: 1 4 1.00 * st1 { v22.4h, v23.4h }, [x22] +# CHECK-NEXT: 1 4 2.00 * st1 { v13.4h, v14.4h, v15.4h }, [x7] +# CHECK-NEXT: 1 4 2.00 * st1 { v23.4h, v24.4h, v25.4h, v26.4h }, [x24] +# CHECK-NEXT: 1 4 1.00 * st1 { v27.8h }, [x17] +# CHECK-NEXT: 1 4 3.00 * st1 { v8.8h, v9.8h, v10.8h }, [x16] +# CHECK-NEXT: 1 4 4.00 * st1 { v7.8h, v8.8h, v9.8h, v10.8h }, [x19] +# CHECK-NEXT: 1 4 1.00 * st1 { v25.2s }, [x6] +# CHECK-NEXT: 1 4 1.00 * st1 { v13.2s, v14.2s }, [x9] +# CHECK-NEXT: 1 4 2.00 * st1 { v12.2s, v13.2s, v14.2s }, [x3] +# CHECK-NEXT: 1 4 2.00 * st1 { v6.2s, v7.2s, v8.2s, v9.2s }, [x13] +# CHECK-NEXT: 2 4 2.00 * st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 1 4 1.00 * st1 { v22.4s }, [x19] +# CHECK-NEXT: 1 4 2.00 * st1 { v15.4s, v16.4s }, [x12] +# CHECK-NEXT: 1 4 4.00 * st1 { v26.4s, v27.4s, v28.4s, v29.4s }, [x12] +# CHECK-NEXT: 1 4 1.00 * st1 { v20.1d }, [x10] +# CHECK-NEXT: 1 4 1.00 * st1 { v21.1d, v22.1d }, [x29] +# CHECK-NEXT: 1 4 2.00 * st1 { v5.1d, v6.1d, v7.1d }, [x3] +# CHECK-NEXT: 1 4 2.00 * st1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x10] +# CHECK-NEXT: 1 4 2.00 * st1 { v26.2d, v27.2d }, [x28] +# CHECK-NEXT: 2 4 3.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: 1 4 3.00 * st1 { v13.2d, v14.2d, v15.2d }, [x27] +# CHECK-NEXT: 1 4 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: 1 4 1.00 * st1 { v8.2d }, [x15] +# CHECK-NEXT: 2 4 1.00 * st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 1 4 2.00 * st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 2 4 2.00 * st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: 1 4 3.00 * st1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: 2 4 2.00 * st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: 1 4 1.00 * st1 { v1.b }[5], [x1] +# CHECK-NEXT: 1 4 1.00 * st1 { v0.h 
}[2], [x1] +# CHECK-NEXT: 1 4 1.00 * st1 { v31.s }[1], [x16] +# CHECK-NEXT: 2 4 1.00 * st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: 1 4 2.00 * st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: 1 4 1.00 * st1 { v0.d }[1], [x0] +# CHECK-NEXT: 2 4 1.00 * st1 { v0.d }[1], [x0], #8 +# CHECK-NEXT: 2 5 2.00 * st2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: 1 5 1.00 * st2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: 1 5 2.00 * st2 { v6.16b, v7.16b }, [x23] +# CHECK-NEXT: 1 5 1.00 * st2 { v10.4h, v11.4h }, [x18] +# CHECK-NEXT: 1 5 2.00 * st2 { v10.8h, v11.8h }, [x18] +# CHECK-NEXT: 1 5 1.00 * st2 { v25.2s, v26.2s }, [x29] +# CHECK-NEXT: 1 5 2.00 * st2 { v26.4s, v27.4s }, [x14] +# CHECK-NEXT: 1 5 2.00 * st2 { v10.2d, v11.2d }, [x1] +# CHECK-NEXT: 1 5 2.00 * st2 { v21.b, v22.b }[15], [x15] +# CHECK-NEXT: 1 5 2.00 * st2 { v28.h, v29.h }[2], [x6] +# CHECK-NEXT: 1 5 2.00 * st2 { v0.s, v1.s }[3], [sp] +# CHECK-NEXT: 2 5 2.00 * st2 { v0.s, v1.s }[3], [sp], #8 +# CHECK-NEXT: 1 5 2.00 * st2 { v17.d, v18.d }[1], [x1] +# CHECK-NEXT: 1 5 4.00 * st3 { v10.8b, v11.8b, v12.8b }, [x18] +# CHECK-NEXT: 1 5 6.00 * st3 { v26.16b, v27.16b, v28.16b }, [x4] +# CHECK-NEXT: 1 5 4.00 * st3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: 2 5 6.00 * st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: 1 5 6.00 * st3 { v0.8h, v1.8h, v2.8h }, [x0] +# CHECK-NEXT: 1 5 4.00 * st3 { v19.2s, v20.2s, v21.2s }, [x30] +# CHECK-NEXT: 1 5 6.00 * st3 { v24.4s, v25.4s, v26.4s }, [x8] +# CHECK-NEXT: 1 5 6.00 * st3 { v24.2d, v25.2d, v26.2d }, [x25] +# CHECK-NEXT: 1 5 4.00 * st3 { v8.b, v9.b, v10.b }[4], [x18] +# CHECK-NEXT: 1 5 4.00 * st3 { v0.h, v1.h, v2.h }[7], [x15] +# CHECK-NEXT: 2 5 4.00 * st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +# CHECK-NEXT: 1 5 4.00 * st3 { v9.s, v10.s, v11.s }[2], [x20] +# CHECK-NEXT: 1 5 4.00 * st3 { v16.d, v17.d, v18.d }[0], [x13] +# CHECK-NEXT: 1 5 4.00 * st4 { v17.8b, v18.8b, v19.8b, v20.8b }, [x8] +# CHECK-NEXT: 1 5 8.00 * st4 { v7.16b, v8.16b, v9.16b, v10.16b }, [x15] +# CHECK-NEXT: 1 5 4.00 * st4 { v5.4h, 
v6.4h, v7.4h, v8.4h }, [x13] +# CHECK-NEXT: 1 5 8.00 * st4 { v11.8h, v12.8h, v13.8h, v14.8h }, [x1] +# CHECK-NEXT: 1 5 4.00 * st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: 2 5 8.00 * st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: 1 5 8.00 * st4 { v21.4s, v22.4s, v23.4s, v24.4s }, [x6] +# CHECK-NEXT: 1 5 8.00 * st4 { v25.2d, v26.2d, v27.2d, v28.2d }, [x16] +# CHECK-NEXT: 1 5 8.00 * st4 { v0.b, v1.b, v2.b, v3.b }[15], [x0] +# CHECK-NEXT: 1 5 8.00 * st4 { v5.h, v6.h, v7.h, v8.h }[4], [x13] +# CHECK-NEXT: 1 5 8.00 * st4 { v22.s, v23.s, v24.s, v25.s }[0], [x7] +# CHECK-NEXT: 1 5 8.00 * st4 { v23.d, v24.d, v25.d, v26.d }[1], [x5] +# CHECK-NEXT: 1 5 8.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +# CHECK-NEXT: 2 5 8.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +# CHECK-NEXT: 2 5 8.00 * st4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], x5 +# CHECK-NEXT: 1 3 0.50 sub d15, d5, d16 +# CHECK-NEXT: 1 3 0.50 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sub v15.2s, v14.2s, v11.2s +# CHECK-NEXT: 1 4 0.50 subhn v7.4h, v10.4s, v13.4s +# CHECK-NEXT: 1 4 0.50 subhn2 v24.4s, v24.2d, v8.2d +# CHECK-NEXT: 1 4 0.50 suqadd b19, b14 +# CHECK-NEXT: 1 4 0.50 suqadd d18, d22 +# CHECK-NEXT: 1 4 0.50 suqadd h20, h15 +# CHECK-NEXT: 1 4 0.50 suqadd s21, s12 +# CHECK-NEXT: 1 4 0.50 suqadd v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 suqadd v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 suqadd v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 suqadd v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 suqadd v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 suqadd v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 suqadd v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 tbl v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: 1 5 2.00 tbl v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: 1 6 3.00 tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: 1 7 4.00 tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: 1 4 0.50 tbl v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: 1 5 2.00 tbl v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: 1 6 3.00 tbl v0.8b, { 
v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: 1 7 4.00 tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: 1 5 2.00 tbx v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: 1 6 3.00 tbx v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: 1 7 4.00 tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: 1 8 5.00 tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: 1 5 2.00 tbx v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: 1 6 3.00 tbx v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: 1 7 4.00 tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: 1 8 5.00 tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: 1 3 0.50 trn1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 trn1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 trn1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 trn1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 trn1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 trn1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 trn1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 trn2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 trn2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 trn2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 trn2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 trn2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 trn2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 trn2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 5 3.00 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 5 3.00 uaba v13.16b, v14.16b, v19.16b +# CHECK-NEXT: 1 5 3.00 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 5 3.00 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 5 3.00 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 5 3.00 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 5 3.00 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 5 3.00 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uabd v23.4s, v4.4s, v30.4s +# CHECK-NEXT: 1 3 0.50 uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uabdl v0.8h, v0.8b, v0.8b +# 
CHECK-NEXT: 1 3 0.50 uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 5 3.00 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 5 3.00 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 5 3.00 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 5 3.00 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 5 3.00 uadalp v0.4s, v0.8h +# CHECK-NEXT: 1 5 3.00 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 4 1.00 uaddlv d0, v0.4s +# CHECK-NEXT: 1 4 1.00 uaddlv s0, v0.4h +# CHECK-NEXT: 1 4 1.00 uaddlv s0, v0.8h +# CHECK-NEXT: 1 4 1.00 uaddlv h0, v0.8b +# CHECK-NEXT: 1 4 1.00 uaddlv h0, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 4 0.50 ucvtf h17, x12 +# CHECK-NEXT: 1 4 0.50 ucvtf h22, h16, #11 +# CHECK-NEXT: 1 4 0.50 ucvtf h7, h21 +# CHECK-NEXT: 1 4 0.50 ucvtf d21, d14 +# CHECK-NEXT: 1 4 0.50 ucvtf d21, d14, #64 +# CHECK-NEXT: 1 4 0.50 ucvtf s8, x0 +# CHECK-NEXT: 1 4 0.50 ucvtf s22, s13 +# CHECK-NEXT: 1 4 0.50 ucvtf s22, s13, #32 +# CHECK-NEXT: 1 4 0.50 ucvtf v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 ucvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 ucvtf v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 ucvtf v0.2s, v0.2s, 
#3 +# CHECK-NEXT: 1 4 0.50 ucvtf v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 ucvtf v18.4h, v11.4h, #7 +# CHECK-NEXT: 1 4 0.50 ucvtf v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 ucvtf v22.8h, v20.8h, #10 +# CHECK-NEXT: 1 4 0.50 udot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 4 0.50 udot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 udot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 udot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.50 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uhsub v12.4h, v16.4h, v28.4h +# CHECK-NEXT: 1 3 0.50 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 1.00 umaxv b0, v0.8b +# CHECK-NEXT: 1 4 1.00 umaxv b0, v0.16b +# CHECK-NEXT: 1 4 1.00 umaxv h0, v0.4h +# CHECK-NEXT: 1 4 1.00 umaxv h0, v0.8h +# CHECK-NEXT: 1 4 1.00 umaxv s0, v0.4s +# CHECK-NEXT: 1 3 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umin v0.16b, v26.16b, v2.16b +# CHECK-NEXT: 1 3 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uminp v28.4s, v16.4s, v15.4s +# CHECK-NEXT: 1 3 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 1.00 uminv b0, v0.8b +# CHECK-NEXT: 1 4 1.00 uminv b0, v0.16b +# CHECK-NEXT: 1 4 1.00 uminv h0, v0.4h +# CHECK-NEXT: 1 4 1.00 uminv h0, v0.8h +# CHECK-NEXT: 1 4 1.00 uminv s0, v0.4s +# CHECK-NEXT: 1 4 0.50 umlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 umlal v28.2d, v31.2s, v0.s[1] +# CHECK-NEXT: 1 4 0.50 umlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 umlal v22.4s, v14.4h, 
v0.h[6] +# CHECK-NEXT: 1 4 0.50 umlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 umlal2 v10.2d, v4.4s, v3.s[2] +# CHECK-NEXT: 1 4 0.50 umlal2 v31.4s, v7.8h, v15.h[5] +# CHECK-NEXT: 1 4 0.50 umlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 umlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 umlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 umlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 umlsl v20.2d, v20.2s, v2.s[0] +# CHECK-NEXT: 1 4 0.50 umlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 umlsl v21.4s, v12.4h, v7.h[5] +# CHECK-NEXT: 1 4 0.50 umlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 umlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 umlsl2 v30.2d, v23.4s, v1.s[2] +# CHECK-NEXT: 1 4 0.50 umlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 umlsl2 v27.4s, v28.8h, v6.h[4] +# CHECK-NEXT: 1 4 0.50 umlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umov w6, v22.b[0] +# CHECK-NEXT: 1 3 0.50 umov w0, v0.b[1] +# CHECK-NEXT: 1 3 0.50 umov w10, v25.h[0] +# CHECK-NEXT: 1 3 0.50 umov w0, v0.h[1] +# CHECK-NEXT: 1 3 0.50 mov w0, v0.s[1] +# CHECK-NEXT: 1 3 0.50 mov x0, v0.d[1] +# CHECK-NEXT: 1 4 0.50 umull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 umull v22.2d, v28.2s, v6.s[1] +# CHECK-NEXT: 1 4 0.50 umull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 umull v27.4s, v1.4h, v8.h[6] +# CHECK-NEXT: 1 4 0.50 umull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 umull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 umull2 v28.2d, v21.4s, v1.s[0] +# CHECK-NEXT: 1 4 0.50 umull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 umull2 v18.4s, v26.8h, v10.h[1] +# CHECK-NEXT: 1 4 0.50 umull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 uqadd h0, h1, h5 +# CHECK-NEXT: 1 4 0.50 uqadd s0, s24, s30 +# CHECK-NEXT: 1 4 0.50 uqadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 uqadd v14.2d, v22.2d, v20.2d +# CHECK-NEXT: 1 4 0.50 uqrshl b11, b20, b30 +# CHECK-NEXT: 1 4 0.50 uqrshl s23, s20, s16 +# CHECK-NEXT: 1 4 0.50 uqrshl v25.8b, v13.8b, v23.8b +# CHECK-NEXT: 1 4 0.50 uqrshl v0.16b, v0.16b, v0.16b 
+# CHECK-NEXT: 1 4 0.50 uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 uqrshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 uqrshrn b10, h12, #5 +# CHECK-NEXT: 1 4 0.50 uqrshrn h12, s10, #14 +# CHECK-NEXT: 1 4 0.50 uqrshrn s10, d10, #25 +# CHECK-NEXT: 1 4 0.50 uqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 uqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 uqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 uqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 uqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 uqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 uqshl b11, b20, b30 +# CHECK-NEXT: 1 4 0.50 uqshl b18, b15, #6 +# CHECK-NEXT: 1 4 0.50 uqshl d15, d12, #19 +# CHECK-NEXT: 1 4 0.50 uqshl h11, h18, #7 +# CHECK-NEXT: 1 4 0.50 uqshl s14, s19, #18 +# CHECK-NEXT: 1 4 0.50 uqshl s23, s20, s16 +# CHECK-NEXT: 1 4 0.50 uqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 uqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 uqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v8.4h, v17.4h, v13.4h +# CHECK-NEXT: 1 4 0.50 uqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 uqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 uqshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 uqshrn b12, h10, #7 +# CHECK-NEXT: 1 4 0.50 uqshrn h10, s14, #5 +# CHECK-NEXT: 1 4 0.50 uqshrn s10, d12, #13 +# CHECK-NEXT: 1 4 0.50 uqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 uqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 uqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 uqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 uqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 4 0.50 uqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 0.50 uqsub s16, s21, s6 +# CHECK-NEXT: 1 4 0.50 uqsub d16, d16, d16 +# CHECK-NEXT: 1 4 0.50 uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 uqsub v19.4s, v0.4s, v5.4s 
+# CHECK-NEXT: 1 4 0.50 uqxtn b18, h18 +# CHECK-NEXT: 1 4 0.50 uqxtn h20, s17 +# CHECK-NEXT: 1 4 0.50 uqxtn s19, d14 +# CHECK-NEXT: 1 4 0.50 uqxtn v0.2s, v0.2d +# CHECK-NEXT: 1 4 0.50 uqxtn v0.4h, v0.4s +# CHECK-NEXT: 1 4 0.50 uqxtn v0.8b, v0.8h +# CHECK-NEXT: 1 4 0.50 uqxtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 4 0.50 uqxtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 4 0.50 uqxtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 4 0.50 urecpe v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 urecpe v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 urhadd v16.2s, v19.2s, v2.2s +# CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 +# CHECK-NEXT: 1 3 0.50 urshl v31.8b, v5.8b, v3.8b +# CHECK-NEXT: 1 3 0.50 urshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshr d20, d23, #31 +# CHECK-NEXT: 1 3 0.50 urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 4 0.50 ursqrte v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 ursqrte v0.4s, v0.4s +# CHECK-NEXT: 1 5 3.00 ursra d18, d10, #13 +# CHECK-NEXT: 1 5 3.00 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 5 3.00 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 5 3.00 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 5 3.00 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 5 3.00 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 5 3.00 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 5 3.00 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ushl d0, d0, d0 +# CHECK-NEXT: 1 3 0.50 ushl v6.8b, v26.8b, v6.8b +# CHECK-NEXT: 1 3 0.50 ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 ushl v0.4s, v0.4s, 
v0.4s +# CHECK-NEXT: 1 3 0.50 ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ushll v18.8h, v24.8b, #4 +# CHECK-NEXT: 1 3 0.50 ushll v16.2d, v16.2s, #31 +# CHECK-NEXT: 1 3 0.50 ushll2 v31.2d, v12.4s, #11 +# CHECK-NEXT: 1 3 0.50 ushll2 v18.4s, v22.8h, #13 +# CHECK-NEXT: 1 3 0.50 ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ushr d10, d17, #18 +# CHECK-NEXT: 1 3 0.50 ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 U smov w15, v22.b[0] +# CHECK-NEXT: 1 3 0.50 U smov w26, v27.h[0] +# CHECK-NEXT: 1 3 0.50 U smov x21, v0.b[0] +# CHECK-NEXT: 1 3 0.50 U smov x9, v27.h[0] +# CHECK-NEXT: 1 3 0.50 U smov x15, v3.s[0] +# CHECK-NEXT: 1 3 0.50 smov w0, v0.b[1] +# CHECK-NEXT: 1 3 0.50 smov w0, v0.h[1] +# CHECK-NEXT: 1 3 0.50 smov x0, v0.b[1] +# CHECK-NEXT: 1 3 0.50 smov x0, v0.h[1] +# CHECK-NEXT: 1 3 0.50 smov x0, v0.s[1] +# CHECK-NEXT: 1 4 0.50 usqadd b19, b14 +# CHECK-NEXT: 1 4 0.50 usqadd d18, d22 +# CHECK-NEXT: 1 4 0.50 usqadd h20, h15 +# CHECK-NEXT: 1 4 0.50 usqadd s21, s12 +# CHECK-NEXT: 1 4 0.50 usqadd v0.16b, v0.16b +# CHECK-NEXT: 1 4 0.50 usqadd v0.2d, v0.2d +# CHECK-NEXT: 1 4 0.50 usqadd v0.2s, v0.2s +# CHECK-NEXT: 1 4 0.50 usqadd v0.4h, v0.4h +# CHECK-NEXT: 1 4 0.50 usqadd v0.4s, v0.4s +# CHECK-NEXT: 1 4 0.50 usqadd v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 usqadd v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 3 0.50 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 
3 0.50 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uzp1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uzp1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 uzp1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uzp1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uzp1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uzp1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uzp1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uzp2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uzp2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 uzp2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uzp2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uzp2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uzp2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uzp2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 4 0.50 xtn v0.2s, v0.2d +# CHECK-NEXT: 1 4 0.50 xtn v0.4h, v0.4s +# CHECK-NEXT: 1 4 0.50 xtn v0.8b, v0.8h +# CHECK-NEXT: 1 4 0.50 xtn2 v0.16b, v0.8h +# CHECK-NEXT: 1 4 0.50 xtn2 v0.4s, v0.2d +# CHECK-NEXT: 1 4 0.50 xtn2 v0.8h, v0.4s +# CHECK-NEXT: 1 3 0.50 zip1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 zip1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 zip1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 zip1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 zip1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 zip1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 zip1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 zip2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 zip2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 
0.50 zip2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 zip2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 zip2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 zip2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 zip2 v0.8h, v0.8h, v0.8h + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - 13.50 545.50 - - 744.00 744.00 117.00 117.00 163.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs d29, d24 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add d17, d31, d29 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - 
- - - - 0.50 0.50 - - - addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp v7.2s, v1.2s, v2.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp d1, v14.2d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - addv s0, v0.4s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - addv h0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - addv h0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - addv b0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - addv b0, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aese v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesimc v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesmc v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic v0.4h, #15, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic v23.8h, #101 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic v25.16b, v10.16b, v9.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic v24.2s, #70 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bit v5.8b, v12.8b, v22.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bif v0.8b, v25.8b, v4.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bif v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bit v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bsl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bsl v27.16b, v13.16b, v21.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls v0.2s, v0.2s +# CHECK-NEXT: - - - - - - 
- - 0.50 0.50 - - - cls v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmeq v9.8h, v16.8h, v24.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmeq v14.4h, v18.4h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmeq d20, d21, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmeq d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmge v22.8h, v16.8h, v3.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmge v22.16b, v30.16b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmge d20, d21, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmgt v3.2d, v29.2d, v11.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmgt d20, d21, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmgt v0.2s, v0.2s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmhi v28.4h, v25.4h, v21.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmhi d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmhi 
v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmhs d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmle v21.2s, v19.2s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmle d20, d21, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmlt v26.4h, v12.4h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmlt d20, d21, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmtst d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cmtst v13.2d, v13.2d, v13.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnt v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnt v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.16b, w28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.2d, x28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.2s, w28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.4h, w28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.4s, w28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.8b, w28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.8h, w28 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov b0, v0.b[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov d0, v0.d[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov h0, v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov s0, v0.s[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.16b, v0.b[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.2d, v0.d[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.2s, v0.s[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.4h, v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.4s, v0.s[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.8b, v0.b[1] 
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dup v0.8h, v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ext v0.16b, v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ext v0.8b, v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd d29, d24, d20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd s29, s24, s20 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd h27, h20, h17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd v13.8h, v28.8h, v12.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs h25, h7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facge s10, s11, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facge h24, h26, h29 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facge v25.4h, v16.4h, v11.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facge v19.2s, v24.2s, v5.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facgt s10, s11, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facgt h0, h4, h10 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facgt v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facgt v22.8h, v14.8h, v31.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - facgt v22.4s, v8.4s, v2.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp h10, 
v19.2h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp d11, v28.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp v16.2d, v11.2d, v5.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq h30, h6, h1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq h19, h23, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq s10, s11, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq v0.2s, v0.2s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq v12.4s, v11.4s, v26.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmeq v18.2d, v17.2d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge h10, h23, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge h1, h16, h12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge s10, s11, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge v0.2d, v0.2d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge v17.2d, v11.2d, v13.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge v18.4h, v27.4h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge v20.8h, v19.8h, v22.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmge v17.2s, v11.2s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt h4, h5, h0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt h0, h18, #0.0 +# CHECK-NEXT: - - - - - - - 
- 0.50 0.50 - - - fcmgt d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt d20, d21, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt s10, s11, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt v24.8h, v24.8h, v28.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt v0.8h, v11.8h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmgt v19.2d, v31.2d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmle v16.8h, v11.8h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmle v22.4s, v30.4s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmle d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmle s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmle v0.2d, v0.2d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmle h18, h28, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmlt h23, h7, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmlt d20, d21, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmlt s10, s11, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmlt v0.4s, v0.4s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmlt v8.4h, v2.4h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcmlt v7.2d, v16.2d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas h12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtas v0.8h, v0.8h +# CHECK-NEXT: - - - - 
- - - - 0.50 0.50 - - - fcvtau d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau h12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtau v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtl v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtl v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtl2 v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtl2 v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms s22, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms h22, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtms v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu h12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtmu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
fcvtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns s22, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns h22, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtns v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu h12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps s22, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps h22, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtps v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu h12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu v0.2s, v0.2s +# CHECK-NEXT: - 
- - - - - - - 0.50 0.50 - - - fcvtpu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtpu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtxn s22, d13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtxn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtxn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs d21, d12, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs s21, s12, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs h21, h14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs h21, h12, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v20.4h, v24.4h, #11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs v18.8h, v10.8h, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu d21, d12, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu s12, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu s21, s12, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu h12, h13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu h21, h12, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.2d, v0.2d, #3 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v19.4h, v26.4h, #9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu v27.8h, v6.8h, #11 +# CHECK-NEXT: - - - - - - - - - - - - 38.00 fdiv v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - - - - 10.00 fdiv v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - - - 5.00 fdiv v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - - - 20.00 fdiv v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - - - 10.00 fdiv v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp h25, v19.2h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp d17, v29.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fmaxnmv h0, v13.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fmaxnmv h12, v11.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fmaxnmv s28, v31.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - 
- - 0.50 0.50 - - - fmaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp h15, v25.2h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp s6, v2.2s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fmaxv h0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fmaxv h0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fmaxv s0, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp h20, v14.2h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp d15, v8.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fminnmv h19, v25.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fminnmv h23, v17.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fminnmv s29, v17.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp h7, v10.2h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp s17, v7.2s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fminv h3, v30.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fminv h29, v12.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fminv s16, v19.4s +# CHECK-NEXT: - - - - - 
- - - - - 0.50 0.50 - fmla d0, d1, v0.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla h23, h24, v15.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla s0, s1, v0.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla v29.8h, v15.8h, v10.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla v2.2s, v16.2s, v28.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla v14.4s, v14.4s, v5.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla v1.4s, v24.4s, v12.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla v10.2d, v14.2d, v21.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls d0, d4, v0.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls h8, h14, v7.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls s3, s5, v0.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls v30.8h, v18.8h, v4.h[6] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls v10.2s, v27.2s, v0.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls v27.4s, v7.4s, v24.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls v10.2d, v22.2d, v29.d[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls v6.8h, v15.8h, v23.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov v0.2d, #-1.25000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov v0.2s, #13.00000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov v0.4s, #1.00000000 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul h18, h4, v7.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul v10.4h, v2.4h, v7.h[5] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul v5.2s, v12.2s, v9.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul v15.4s, v30.4s, v2.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul v11.2d, v31.2d, v24.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul h28, h14, h3 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul d0, 
d1, v0.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul s0, s1, v0.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx d0, d4, v0.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx d23, d11, d1 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx s20, s22, s15 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx h18, h17, v7.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx h20, h25, h0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx s3, s5, v0.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v28.4h, v25.4h, v15.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v3.2s, v22.2s, v23.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v5.4s, v28.4s, v15.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx v22.2d, v18.2d, v25.d[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe h20, h8 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe d13, d13 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe s19, s14 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps h29, 
h19, h8 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpx h18, h11 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps v12.8h, v25.8h, v4.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps d22, d30, d21 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps s21, s16, s13 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps v7.2d, v29.2d, v18.2d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpx d16, d19 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpx s18, s10 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - frintp v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte h23, h26 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte d21, d12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte s22, s13 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts v20.4s, v26.4s, v27.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts v8.4h, v9.4h, v30.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts h28, h26, h1 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts d8, d22, d18 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts s21, s5, s12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - - - - 38.00 fsqrt v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - - - - - 9.00 fsqrt v0.2s, v0.2s 
+# CHECK-NEXT: - - - - - - - - - - - - 5.00 fsqrt v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - - - 18.00 fsqrt v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - - - 10.00 fsqrt v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub v13.8h, v15.8h, v17.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.16b }, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v0.16b, v1.16b }, [x14] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v19.16b, v20.16b, v21.16b }, [x10] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v13.16b, v14.16b, v15.16b, v16.16b }, [x9] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v24.8h }, [x27] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v1.8h, v2.8h }, [x27] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v0.8h, v1.8h }, [sp], #32 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v21.8h, v22.8h, v23.8h }, [x22] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x21] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v3.4s }, [x4] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v11.4s, v12.4s }, [x30] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.4s, v1.4s, v2.4s }, [x24] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v15.4s, v16.4s, v17.4s, v18.4s }, [x28] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.4s, v1.4s, v2.4s }, [x0], #48 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v3.2d }, [x28] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v13.2d, v14.2d }, [x13] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v12.2d, v13.2d, v14.2d }, [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.1d }, [x15], x2 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v27.1d, v28.1d }, [x7] +# CHECK-NEXT: - - - - - 
2.00 - - - - - - - ld1 { v14.1d, v15.1d, v16.1d }, [x3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v22.1d, v23.1d, v24.1d, v25.1d }, [x4] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v0.2s, v1.2s }, [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v16.2s, v17.2s, v18.2s }, [x27] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v21.2s, v22.2s, v23.2s, v24.2s }, [x21] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v25.4h, v26.4h }, [x3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v20.4h, v21.4h, v22.4h, v23.4h }, [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.4h, v1.4h, v2.4h }, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v24.8b, v25.8b }, [x6] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v7.8b, v8.8b, v9.8b }, [x12] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v4.8b, v5.8b, v6.8b, v7.8b }, [x13] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.b }[7], [x0] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.h }[3], [x0], #2 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v18.h }[3], [x1] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.s }[1], [x15] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.d }[0], [x15], #8 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v11.d }[0], [x13] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.b }[9], [x0] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1 { v0.b }[9], [x0], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v0.16b }, [x0] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v0.8h }, [x0], #2 +# CHECK-NEXT: - - - - 0.50 0.50 - 
- - - - - - ld1r { v0.4s }, [x15] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v3.1d }, [x15] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v0.2d }, [x15], x16 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v18.2d }, [x0] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v8.8b }, [x23] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v28.4h }, [x9] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v3.8h }, [x16] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v10.2s }, [x20] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v0.4h, v1.4h }, [x21] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v8.8h, v9.8h }, [x28] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v2.2s, v3.2s }, [x16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v22.4s, v23.4s }, [x4] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v22.2d, v23.2d }, [x17] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld2 { v29.b, v30.b }[3], [x1] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld2 { v26.s, v27.s }[1], [x17] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld2 { v1.d, v2.d }[0], [x10] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v0.16b, v1.16b }, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v13.8b, v14.8b }, [x4] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v0.8b, v1.8b }, [x0], #16 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v0.16b }, [x0], #1 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v0.8h }, [x15] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ld1r { v0.8h }, [x15], #2 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld2 { v0.h, v1.h }[7], [x15] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld2 { v0.h, v1.h }[7], [x15], x8 +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld2 { v0.h, v1.h }[7], [x15], #4 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.8b, v1.8b }, [x0] +# 
CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v10.16b, v11.16b }, [x23] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.4h, v1.4h }, [x0], #4 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v25.4h, v26.4h }, [x11] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v23.8h, v24.8h }, [x10] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.2s, v1.2s }, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v8.4s, v9.4s }, [x17] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.1d, v1.1d }, [sp], x8 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v9.1d, v10.1d }, [x25] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v26.2d, v27.2d }, [x8] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v8.8b, v9.8b, v10.8b }, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v15.16b, v16.16b, v17.16b }, [x5] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.2d, v1.2d }, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.2d, v1.2d }, [x0], #16 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.4s, v1.4s }, [sp] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2r { v0.4s, v1.4s }, [sp], #8 +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v0.8h, v1.8h, v2.8h }, [x15], #48 +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v7.8h, v8.8h, v9.8h }, [x21] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v16.2s, v17.2s, v18.2s }, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v12.4s, v13.4s, v14.4s }, [x25] +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld3 { v17.b, v18.b, v19.b }[2], [x27] +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld3 { v18.h, v19.h, v20.h }[5], [x16] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v10.2d, v11.2d, v12.2d }, [x18] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld3 { v0.s, v1.s, v2.s }[3], [sp] +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld3 { v0.s, 
v1.s, v2.s }[3], [sp], x3 +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld3 { v5.d, v6.d, v7.d }[1], [x14] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.8b, v1.8b, v2.8b }, [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v17.16b, v18.16b, v19.16b }, [x3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v3.4h, v4.4h, v5.4h }, [x1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v6.8h, v7.8h, v8.8h }, [x28] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.2s, v1.2s, v2.2s }, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v28.4s, v29.4s, v30.4s }, [x2] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.1d, v1.1d, v2.1d }, [x0], x0 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v1.1d, v2.1d, v3.1d }, [x28] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v8.2d, v9.2d, v10.2d }, [x3] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v6.8b, v7.8b, v8.8b, v9.8b }, [x27] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v11.16b, v12.16b, v13.16b, v14.16b }, [x5] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v21.4h, v22.4h, v23.4h, v24.4h }, [x14] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v9.8h, v10.8h, v11.8h, v12.8h }, [x1] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v17.4s, v18.4s, v19.4s, v20.4s }, [x4] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.8b, v1.8b, v2.8b }, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3 +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4 { v2.2d, v3.2d, v4.2d, v5.2d }, [x24] +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld4 { 
v4.b, v5.b, v6.b, v7.b }[12], [x27] +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld4 { v5.h, v6.h, v7.h, v8.h }[0], [x4] +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32 +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0 +# CHECK-NEXT: - - - - - 5.00 - - - - - - - ld4 { v0.s, v1.s, v2.s, v3.s }[0], [x26] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v20.8b, v21.8b, v22.8b, v23.8b }, [x23] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x25] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v16.4h, v17.4h, v18.4h, v19.4h }, [x6] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v4.8h, v5.8h, v6.8h, v7.8h }, [x23] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [x30] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v7.4s, v8.4s, v9.4s, v10.4s }, [x23] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla v15.8h, v22.8h, v4.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla v28.2s, v10.2s, v2.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls v25.8h, v29.8h, v0.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls v22.2s, v29.2s, v0.s[3] +# CHECK-NEXT: - - - 
- - - - - - - 0.50 0.50 - mls v26.4s, v5.4s, v28.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov b0, v0.b[15] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov d6, v0.d[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov h2, v0.h[5] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov s17, v0.s[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov w8, v8.s[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov x30, v18.d[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v2.b[0], v0.b[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v2.h[1], v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v2.s[2], v0.s[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v2.d[1], v0.d[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.b[0], w8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.h[1], w8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.s[2], w8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.d[1], x8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi d15, #0xff00ff00ff00ff +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v0.16b, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v14.8h, #174 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v13.4h, #74, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v0.2d, #0xff0000ff0000ffff +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v0.2s, #8, msl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v19.2s, #226 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v1.4s, #122, msl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v0.4s, #255, lsl #24 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movi v0.8b, #255 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v26.4h, v20.4h, v14.h[5] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v5.8h, v21.8h, 
v3.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v29.2s, v10.2s, v3.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v30.4s, v11.4s, v4.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v30.4s, v11.4s, v4.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul v3.8h, v9.8h, v8.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvni v9.4h, #237 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvni v8.8h, #171, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvni v22.4s, #15, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvni v0.2s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvni v0.4s, #16, msl #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg d29, d24 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvn v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mvn v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orn v29.8b, v19.8b, v16.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr v9.4h, #18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr v0.8h, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr v4.4s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - 
- - - - - 3.00 3.00 - - - raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rbit v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rbit v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev16 v21.8b, v1.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev16 v30.16b, v31.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev32 v0.4h, v9.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev32 v21.8b, v1.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev32 v30.16b, v31.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev32 v4.8h, v7.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev64 v0.16b, v31.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev64 v1.8b, v9.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev64 v13.4h, v21.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev64 v2.8h, v4.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev64 v4.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev64 v6.4s, v8.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhn v0.8b, 
v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabd v12.2s, v11.2s, v27.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - saddlv d0, v0.4s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - saddlv s0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - saddlv s0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - saddlv h0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - saddlv h0, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h4, h8, #9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf h5, h14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d21, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf d21, d12, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s22, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf s22, s13, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf 
v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v25.4h, v13.4h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf v4.8h, v8.8h, #10 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shadd v25.16b, v1.16b, v10.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl d7, d10, #12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl v23.8b, v18.8b, #6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl v0.8h, v23.8h, #10 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shll v0.4s, v0.4h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shll v0.8h, v0.8b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shll v0.2d, v0.2s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - 
- - - - - - - 0.50 0.50 - - - shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub v15.8h, v5.8h, v27.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli d10, d14, #12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax v30.16b, v3.16b, v30.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp v21.8h, v16.8h, v7.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - smaxv b0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - smaxv b0, v0.16b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - smaxv h0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - smaxv h0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - smaxv s0, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin v0.4s, 
v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sminv b0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sminv b0, v0.16b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sminv h0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sminv h0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sminv s0, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal v0.2d, v25.2s, v1.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal v16.4s, v9.4h, v11.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal2 v30.2d, v22.4s, v7.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl v25.2d, v27.2s, v1.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl v14.4s, v23.4h, v12.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlal2 v1.4s, v9.8h, v0.h[6] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl2 v12.4s, v11.8h, v12.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl2 v11.2d, v28.4s, v7.s[2] +# CHECK-NEXT: - - - 
- - - - - - - 0.50 0.50 - smlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull v31.2d, v23.2s, v6.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull v3.4s, v26.4h, v1.h[5] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull2 v11.2d, v1.4s, v7.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull2 v13.4s, v18.8h, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs b19, b14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs d18, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs h21, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs s20, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd b20, b11, b15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd h12, h18, h10 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal 
d8, s9, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal s0, h0, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal v11.2d, v24.2s, v0.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal v20.4s, v30.4h, v12.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal2 v23.2d, v30.4s, v6.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlal2 v2.4s, v17.8h, v5.h[6] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh v8.4h, v16.4h, v5.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh v16.2s, v24.2s, v7.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull v8.4s, v19.4h, v1.h[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull v20.2d, v10.2s, v6.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull2 v10.4s, v25.8h, v0.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull2 v4.2d, v29.4s, v2.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh v0.8h, v15.8h, v0.h[5] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh v6.2s, v29.2s, v4.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh v31.2s, v17.2s, v4.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl v26.2d, v7.2s, v3.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h +# 
CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl v4.4s, v22.4h, v13.h[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl2 v4.2d, v3.4s, v3.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlsl2 v2.4s, v28.8h, v4.h[6] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull d15, s22, s12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull s12, h22, h12 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg b19, b14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg d18, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg h21, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg s20, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg v0.4s, v0.4s +# CHECK-NEXT: - - - 
- - - - - 0.50 0.50 - - - sqneg v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah h0, h1, h2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.4h, v1.4h, v2.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah s0, s1, s2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.2s, v1.2s, v2.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah v0.4s, v1.4s, v2.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh h0, h1, h2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.4h, v1.4h, v2.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.8h, v1.8h, v2.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh s0, s1, s2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.2s, v1.2s, v2.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh v0.4s, v1.4s, v2.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - 
- - 0.50 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl h3, h4, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl s17, s4, s23 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub b3, b13, b12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub v20.8h, v18.8h, v12.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn b10, h13, #2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn h15, s10, #6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn s15, d12, #9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun b17, h10, #6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun h10, s13, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun s22, d16, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - sqrshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl b11, b19, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl d15, d16, #51 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl h13, h18, #11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl h3, h4, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl s14, s17, #22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v23.16b, v23.16b, v23.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu b15, b18, #6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu d11, d13, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu h19, h17, #6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu s16, s14, #25 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.4s, 
v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn b10, h15, #5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn h17, s10, #4 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn s18, d10, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun b15, h10, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun h20, s14, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun s10, d15, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrun2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub s20, s10, s7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn b18, h18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn h20, s17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn s19, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 
0.50 - - - sqxtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun b19, h14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun h21, s15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun s20, d12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtun2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri d10, d12, #14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl d16, d16, d16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr 
d19, d18, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra d15, d11, #19 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshl d31, d31, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshll v9.8h, v2.8b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshll v12.4s, v3.4h, #4 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshll2 v28.8h, v12.16b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshll2 v17.2d, v13.4s, #22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr d15, d16, #12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 
- - - - - - - - 0.50 0.50 - - - sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra d18, d12, #21 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v18.8b }, [x15] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v8.8b, v9.8b }, [x18] +# CHECK-NEXT: - - - - - 
2.00 - - - - - - - st1 { v15.8b, v16.8b, v17.8b }, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v21.8b, v22.8b, v23.8b, v24.8b }, [x14] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v0.16b }, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v1.16b, v2.16b }, [x4] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st1 { v27.16b, v28.16b, v29.16b }, [x18] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st1 { v18.16b, v19.16b, v20.16b, v21.16b }, [x29] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v19.4h }, [x7] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v22.4h, v23.4h }, [x22] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v13.4h, v14.4h, v15.4h }, [x7] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v23.4h, v24.4h, v25.4h, v26.4h }, [x24] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v27.8h }, [x17] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st1 { v8.8h, v9.8h, v10.8h }, [x16] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st1 { v7.8h, v8.8h, v9.8h, v10.8h }, [x19] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v25.2s }, [x6] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v13.2s, v14.2s }, [x9] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v12.2s, v13.2s, v14.2s }, [x3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v6.2s, v7.2s, v8.2s, v9.2s }, [x13] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v22.4s }, [x19] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v15.4s, v16.4s }, [x12] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st1 { v26.4s, v27.4s, v28.4s, v29.4s }, [x12] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v20.1d }, [x10] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v21.1d, v22.1d }, [x29] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v5.1d, v6.1d, v7.1d }, [x3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x10] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v26.2d, v27.2d }, [x28] +# 
CHECK-NEXT: - - - - - 3.00 - - - - - - - st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st1 { v13.2d, v14.2d, v15.2d }, [x27] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v8.2d }, [x15] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v0.4s, v1.4s }, [sp], #32 +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st1 { v0.4s, v1.4s, v2.4s }, [sp] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v1.b }[5], [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v0.h }[2], [x1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v31.s }[1], [x16] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v0.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st1 { v0.8h, v1.8h }, [x15] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v0.d }[1], [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1 { v0.d }[1], [x0], #8 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v0.16b, v1.16b }, [x0], x1 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st2 { v0.8b, v1.8b }, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v6.16b, v7.16b }, [x23] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st2 { v10.4h, v11.4h }, [x18] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v10.8h, v11.8h }, [x18] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st2 { v25.2s, v26.2s }, [x29] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v26.4s, v27.4s }, [x14] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v10.2d, v11.2d }, [x1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v21.b, v22.b }[15], [x15] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v28.h, v29.h }[2], [x6] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v0.s, v1.s }[3], [sp] +# 
CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v0.s, v1.s }[3], [sp], #8 +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2 { v17.d, v18.d }[1], [x1] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v10.8b, v11.8b, v12.8b }, [x18] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3 { v26.16b, v27.16b, v28.16b }, [x4] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v0.4h, v1.4h, v2.4h }, [x15] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3 { v0.8h, v1.8h, v2.8h }, [x15], x2 +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3 { v0.8h, v1.8h, v2.8h }, [x0] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v19.2s, v20.2s, v21.2s }, [x30] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3 { v24.4s, v25.4s, v26.4s }, [x8] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3 { v24.2d, v25.2d, v26.2d }, [x25] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v8.b, v9.b, v10.b }[4], [x18] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v0.h, v1.h, v2.h }[7], [x15] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v0.h, v1.h, v2.h }[7], [x15], #6 +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v9.s, v10.s, v11.s }[2], [x20] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st3 { v16.d, v17.d, v18.d }[0], [x13] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4 { v17.8b, v18.8b, v19.8b, v20.8b }, [x8] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v7.16b, v8.16b, v9.16b, v10.16b }, [x15] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4 { v5.4h, v6.4h, v7.4h, v8.4h }, [x13] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v11.8h, v12.8h, v13.8h, v14.8h }, [x1] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v21.4s, v22.4s, v23.4s, v24.4s }, [x6] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v25.2d, v26.2d, v27.2d, v28.2d }, [x16] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v0.b, v1.b, v2.b, v3.b }[15], [x0] +# 
CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v5.h, v6.h, v7.h, v8.h }[4], [x13] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v22.s, v23.s, v24.s, v25.s }[0], [x7] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v23.d, v24.d, v25.d, v26.d }[1], [x5] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], x5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub d15, d5, d16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub v15.2s, v14.2s, v11.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhn v7.4h, v10.4s, v13.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhn2 v24.4s, v24.2d, v8.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd b19, b14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd d18, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd h20, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd s21, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbl v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - tbl v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 
0.50 - - - tbl v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - tbl v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - tbx v0.16b, { v0.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - tbx v0.16b, { v0.16b, v1.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 5.00 5.00 - - - tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b +# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - tbx v0.8b, { v0.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - tbx v0.8b, { v0.16b, v1.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 5.00 5.00 - - - tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - 
- 0.50 0.50 - - - trn2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uaba v13.16b, v14.16b, v19.16b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabd v23.4s, v4.4s, v30.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddl2 v0.4s, v0.8h, v0.8h +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlp v0.1d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlp v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlp v0.2s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlp v0.4h, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlp v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlp v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uaddlv d0, v0.4s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uaddlv s0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uaddlv s0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uaddlv h0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uaddlv h0, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h17, x12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h22, h16, #11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf h7, h21 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d21, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf d21, d14, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s8, x0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s22, s13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf s22, s13, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
ucvtf v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v18.4h, v11.4h, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf v22.8h, v20.8h, #10 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsub v12.4h, v16.4h, v28.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - umaxv b0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - umaxv b0, v0.16b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - umaxv h0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - umaxv h0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - umaxv s0, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin v0.8b, v0.8b, 
v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin v0.16b, v26.16b, v2.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp v28.4s, v16.4s, v15.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uminv b0, v0.8b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uminv b0, v0.16b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uminv h0, v0.4h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uminv h0, v0.8h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uminv s0, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal v28.2d, v31.2s, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal v22.4s, v14.4h, v0.h[6] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal2 v10.2d, v4.4s, v3.s[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal2 v31.4s, v7.8h, v15.h[5] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl v20.2d, v20.2s, v2.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl v21.4s, v12.4h, v7.h[5] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl2 v30.2d, v23.4s, v1.s[2] +# CHECK-NEXT: - - - - - - - 
- - - 0.50 0.50 - umlsl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl2 v27.4s, v28.8h, v6.h[4] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlsl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umov w6, v22.b[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umov w0, v0.b[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umov w10, v25.h[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umov w0, v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov w0, v0.s[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov x0, v0.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull v22.2d, v28.2s, v6.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull v27.4s, v1.4h, v8.h[6] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull2 v28.2d, v21.4s, v1.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull2 v18.4s, v26.8h, v10.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd h0, h1, h5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd s0, s24, s30 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd v14.2d, v22.2d, v20.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl b11, b20, b30 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl s23, s20, s16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl v25.8b, v13.8b, v23.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - 
- - - - - - 0.50 0.50 - - - uqrshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn b10, h12, #5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn h12, s10, #14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn s10, d10, #25 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl b11, b20, b30 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl b18, b15, #6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl d15, d12, #19 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl h11, h18, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl s14, s19, #18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl s23, s20, s16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v8.4h, v17.4h, v13.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 
- - - uqshrn b12, h10, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn h10, s14, #5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn s10, d12, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn v0.4h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub s16, s21, s6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub d16, d16, d16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub v19.4s, v0.4s, v5.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn b18, h18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn h20, s17 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn s19, d14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - urecpe v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - urecpe v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd v16.2s, v19.2s, v2.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl d8, d7, d4 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - 
- - urshl v31.8b, v5.8b, v3.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr d20, d23, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ursqrte v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ursqrte v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra d18, d10, #13 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushl d0, d0, d0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushl v6.8b, v26.8b, v6.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushll v0.4s, 
v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushll v18.8h, v24.8b, #4 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushll v16.2d, v16.2s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushll2 v31.2d, v12.4s, #11 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushll2 v18.4s, v22.8h, #13 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr d10, d17, #18 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov w15, v22.b[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov w26, v27.h[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov x21, v0.b[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov x9, v27.h[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov x15, v3.s[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov w0, v0.b[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov w0, v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov x0, v0.b[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov x0, v0.h[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smov x0, v0.s[1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd b19, b14 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd d18, d22 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd h20, h15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd s21, s12 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 
0.50 - - - usqadd v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra d20, d13, #61 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 
- uzp1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xtn v0.2s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xtn v0.4h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xtn v0.8b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xtn2 v0.16b, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xtn2 v0.4s, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xtn2 v0.8h, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.4s, v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 v0.8h, v0.8h, v0.8h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.2d, v0.2d, v0.2d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.2s, v0.2s, v0.2s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.4h, v0.4h, v0.4h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.4s, 
v0.4s, v0.4s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 v0.8h, v0.8h, v0.8h diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-ptraut-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-ptraut-instructions.s new file mode 100644 index 0000000000000..7ea528acd365c --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-ptraut-instructions.s @@ -0,0 +1,146 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/ptraut-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 U pacia1716 +# CHECK-NEXT: 1 4 1.00 U pacib1716 +# CHECK-NEXT: 1 4 1.00 U autia1716 +# CHECK-NEXT: 1 4 1.00 U autib1716 +# CHECK-NEXT: 1 4 1.00 U paciaz +# CHECK-NEXT: 1 4 1.00 U paciasp +# CHECK-NEXT: 1 4 1.00 U pacibz +# CHECK-NEXT: 1 4 1.00 U pacibsp +# CHECK-NEXT: 1 4 1.00 U autiaz +# CHECK-NEXT: 1 4 1.00 U autiasp +# CHECK-NEXT: 1 4 1.00 U autibz +# CHECK-NEXT: 1 4 1.00 U autibsp +# CHECK-NEXT: 1 4 1.00 U pacia x0, x1 +# CHECK-NEXT: 1 4 1.00 U autia x0, x1 +# CHECK-NEXT: 1 4 1.00 U pacda x0, x1 +# CHECK-NEXT: 1 4 1.00 U autda x0, x1 +# CHECK-NEXT: 1 4 1.00 U pacib x0, x1 +# CHECK-NEXT: 1 4 1.00 U autib x0, x1 +# CHECK-NEXT: 1 4 1.00 U pacdb x0, x1 +# CHECK-NEXT: 1 4 1.00 U autdb x0, x1 +# CHECK-NEXT: 1 5 1.00 pacga x0, x1, x2 +# CHECK-NEXT: 1 4 1.00 U paciza x0 +# CHECK-NEXT: 1 4 1.00 U autiza x0 +# CHECK-NEXT: 1 4 1.00 U pacdza x0 +# CHECK-NEXT: 1 4 1.00 U autdza x0 +# CHECK-NEXT: 1 4 1.00 U pacizb x0 +# CHECK-NEXT: 1 4 1.00 U autizb x0 +# CHECK-NEXT: 1 4 1.00 U pacdzb x0 +# CHECK-NEXT: 1 4 1.00 U autdzb x0 +# 
CHECK-NEXT: 1 4 1.00 xpaci x0 +# CHECK-NEXT: 1 4 1.00 xpacd x0 +# CHECK-NEXT: 1 4 1.00 U xpaclri +# CHECK-NEXT: 2 1 1.00 U braa x0, x1 +# CHECK-NEXT: 2 1 1.00 U brab x0, x1 +# CHECK-NEXT: 2 1 1.00 U blraa x0, x1 +# CHECK-NEXT: 2 1 1.00 U blrab x0, x1 +# CHECK-NEXT: 2 1 1.00 U braaz x0 +# CHECK-NEXT: 2 1 1.00 U brabz x0 +# CHECK-NEXT: 2 1 1.00 U blraaz x0 +# CHECK-NEXT: 2 1 1.00 U blrabz x0 +# CHECK-NEXT: 2 1 1.00 U retaa +# CHECK-NEXT: 2 1 1.00 U retab +# CHECK-NEXT: 1 2 1.00 * U ldraa x0, [x1, #4088] +# CHECK-NEXT: 1 2 1.00 * U ldraa x0, [x1, #-4096] +# CHECK-NEXT: 1 2 1.00 * U ldrab x0, [x1, #4088] +# CHECK-NEXT: 1 2 1.00 * U ldrab x0, [x1, #-4096] +# CHECK-NEXT: 1 2 1.00 * U ldraa x0, [x1, #4088]! +# CHECK-NEXT: 1 2 1.00 * U ldraa x0, [x1, #-4096]! +# CHECK-NEXT: 1 2 1.00 * U ldrab x0, [x1, #4088]! +# CHECK-NEXT: 1 2 1.00 * U ldrab x0, [x1, #-4096]! +# CHECK-NEXT: 1 2 1.00 * U ldraa x0, [x1] +# CHECK-NEXT: 1 2 1.00 * U ldrab x0, [x1] +# CHECK-NEXT: 1 2 1.00 * U ldraa x0, [x1, #0]! +# CHECK-NEXT: 1 2 1.00 * U ldrab x0, [x1, #0]! +# CHECK-NEXT: 1 2 1.00 * U ldraa xzr, [sp, #-4096]! +# CHECK-NEXT: 1 2 1.00 * U ldrab xzr, [sp, #-4096]! 
+ +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - 10.00 - - - - 56.00 - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacia1716 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacib1716 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autia1716 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autib1716 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - paciaz +# CHECK-NEXT: - - - - - - - 1.00 - - - - - paciasp +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacibz +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacibsp +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autiaz +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autiasp +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autibz +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autibsp +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacia x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autia x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacda x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autda x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacib x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autib x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacdb x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autdb x0, x1 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacga x0, x1, x2 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - paciza x0 +# CHECK-NEXT: - 
- - - - - - 1.00 - - - - - autiza x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacdza x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autdza x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacizb x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autizb x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - pacdzb x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - autdzb x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - xpaci x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - xpacd x0 +# CHECK-NEXT: - - - - - - - 1.00 - - - - - xpaclri +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - braa x0, x1 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - brab x0, x1 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - blraa x0, x1 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - blrab x0, x1 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - braaz x0 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - brabz x0 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - blraaz x0 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - blrabz x0 +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - retaa +# CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - - retab +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa x0, [x1, #4088] +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab x0, [x1, #4088] +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab x0, [x1, #-4096] +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa x0, [x1, #4088]! +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab x0, [x1, #4088]! +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab x0, [x1, #-4096]! +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa x0, [x1] +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab x0, [x1] +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa x0, [x1, #0]! +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab x0, [x1, #0]! +# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldraa xzr, [sp, #-4096]! 
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - ldrab xzr, [sp, #-4096]! diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-rcpc-immo-instructions.s new file mode 100644 index 0000000000000..e9044762700b9 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-rcpc-immo-instructions.s @@ -0,0 +1,61 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -instruction-tables < %p/../Inputs/rcpc-immo-instructions.s | FileCheck %s +-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 0.50 * ldapur w7, [x24] +# CHECK-NEXT: 1 2 0.50 * ldapur x20, [x13] +# CHECK-NEXT: 1 2 0.50 * ldapurb w13, [x17] +# CHECK-NEXT: 1 2 0.50 * ldapurh w3, [x22] +# CHECK-NEXT: 1 2 0.50 * U ldapursb w7, [x8] +# CHECK-NEXT: 1 2 0.50 * U ldapursb x29, [x7] +# CHECK-NEXT: 1 2 0.50 * U ldapursh w17, [x19] +# CHECK-NEXT: 1 2 0.50 * U ldapursh x3, [x3] +# CHECK-NEXT: 1 2 0.50 * U ldapursw x3, [x18] +# CHECK-NEXT: 1 1 1.00 * stlur w3, [x27] +# CHECK-NEXT: 1 1 1.00 * stlur x23, [x25] +# CHECK-NEXT: 1 1 1.00 * stlurb w30, [x17] +# CHECK-NEXT: 1 1 1.00 * stlurh w9, [x29] + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per 
iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: - - - - 4.50 8.50 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlur w3, [x27] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlur x23, [x25] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlurb w30, [x17] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stlurh w9, [x29] diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-sve-instructions.s new file mode 100644 index 0000000000000..f3bb840fca403 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/C1Nano-sve-instructions.s @@ -0,0 +1,6862 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=c1-nano -mattr=+sve2-aes,+sve2-sha3,+sve2-sm4 -instruction-tables < %p/../Inputs/sve-instructions.s | FileCheck %s + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 0.50 abs z0.b, p0/m, z0.b +# CHECK-NEXT: 1 3 0.50 abs z0.d, p0/m, z0.d +# CHECK-NEXT: 
1 3 0.50 abs z0.h, p0/m, z0.h +# CHECK-NEXT: 1 3 0.50 abs z0.s, p0/m, z0.s +# CHECK-NEXT: 1 3 0.50 abs z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 abs z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 abs z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 abs z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 adclb z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 adclb z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 adclt z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 adclt z0.s, z1.s, z31.s +# CHECK-NEXT: 1 3 0.50 add z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 add z0.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 add z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 add z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 add z0.d, z0.d, #0 +# CHECK-NEXT: 1 3 0.50 add z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 3 0.50 add z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 add z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 add z0.h, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 add z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 3 0.50 add z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 add z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 add z0.s, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 add z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 3 0.50 add z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 add z0.s, z1.s, z2.s +# CHECK-NEXT: 1 3 0.50 add z21.b, p5/m, z21.b, z10.b +# CHECK-NEXT: 1 3 0.50 add z21.b, z10.b, z21.b +# CHECK-NEXT: 1 3 0.50 add z21.d, p5/m, z21.d, z10.d +# CHECK-NEXT: 1 3 0.50 add z21.d, z10.d, z21.d +# CHECK-NEXT: 1 3 0.50 add z21.h, p5/m, z21.h, z10.h +# CHECK-NEXT: 1 3 0.50 add z21.h, z10.h, z21.h +# CHECK-NEXT: 1 3 0.50 add z21.s, p5/m, z21.s, z10.s +# CHECK-NEXT: 1 3 0.50 add z21.s, z10.s, z21.s +# CHECK-NEXT: 1 3 0.50 add z23.b, p3/m, z23.b, z13.b +# CHECK-NEXT: 1 3 0.50 add z23.b, z13.b, z8.b +# CHECK-NEXT: 1 3 0.50 add z23.d, p3/m, z23.d, z13.d +# CHECK-NEXT: 1 3 0.50 add z23.d, z13.d, z8.d +# CHECK-NEXT: 1 3 0.50 add z23.h, p3/m, z23.h, z13.h +# CHECK-NEXT: 1 3 0.50 add z23.h, z13.h, z8.h +# CHECK-NEXT: 1 3 0.50 add z23.s, p3/m, z23.s, z13.s +# CHECK-NEXT: 1 3 0.50 
add z23.s, z13.s, z8.s +# CHECK-NEXT: 1 3 0.50 add z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 add z31.b, z31.b, #255 +# CHECK-NEXT: 1 3 0.50 add z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 add z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 add z31.d, z31.d, #65280 +# CHECK-NEXT: 1 3 0.50 add z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 add z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 add z31.h, z31.h, #65280 +# CHECK-NEXT: 1 3 0.50 add z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 add z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, #65280 +# CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 addhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 addhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 addhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 addhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 addhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 addhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 addp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 addp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 addp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 addp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 1 0.50 addpl sp, sp, #31 +# CHECK-NEXT: 1 1 0.50 addpl x0, x0, #-32 +# CHECK-NEXT: 1 1 0.50 addpl x21, x21, #0 +# CHECK-NEXT: 1 1 0.50 addpl x23, x8, #-1 +# CHECK-NEXT: 1 1 0.50 addvl sp, sp, #31 +# CHECK-NEXT: 1 1 0.50 addvl x0, x0, #-32 +# CHECK-NEXT: 1 1 0.50 addvl x21, x21, #0 +# CHECK-NEXT: 1 1 0.50 addvl x23, x8, #-1 +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, lsl #1] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, lsl #2] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, lsl #3] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, sxtw #1] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, sxtw #2] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, sxtw #3] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, sxtw] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, uxtw #1] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, uxtw #2] +# CHECK-NEXT: 1 3 0.50 
adr z0.d, [z0.d, z0.d, uxtw #3] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d, uxtw] +# CHECK-NEXT: 1 3 0.50 adr z0.d, [z0.d, z0.d] +# CHECK-NEXT: 1 3 0.50 adr z0.s, [z0.s, z0.s, lsl #1] +# CHECK-NEXT: 1 3 0.50 adr z0.s, [z0.s, z0.s, lsl #2] +# CHECK-NEXT: 1 3 0.50 adr z0.s, [z0.s, z0.s, lsl #3] +# CHECK-NEXT: 1 3 0.50 adr z0.s, [z0.s, z0.s] +# CHECK-NEXT: 1 3 0.50 aesd z0.b, z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 aese z0.b, z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 aesimc z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 aesimc z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 aesmc z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 aesmc z31.b, z31.b +# CHECK-NEXT: 1 2 1.00 and p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0x6 +# CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 and z0.s, z0.s, #0x6 +# CHECK-NEXT: 1 3 0.50 and z0.s, z0.s, #0xfffffff9 +# CHECK-NEXT: 1 3 0.50 and z23.d, z13.d, z8.d +# CHECK-NEXT: 1 3 0.50 and z23.h, z23.h, #0x6 +# CHECK-NEXT: 1 3 0.50 and z23.h, z23.h, #0xfff9 +# CHECK-NEXT: 1 3 0.50 and z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 and z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 and z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 and z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0x6 +# CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0xf9 +# CHECK-NEXT: 1 2 1.00 ands p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 4 1.00 andv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 andv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 andv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 andv s0, p7, z31.s +# CHECK-NEXT: 1 3 0.50 asr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 asr z0.b, p0/m, z0.b, z1.d +# CHECK-NEXT: 1 3 0.50 asr z0.b, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.b, z1.b, z2.d +# CHECK-NEXT: 1 3 0.50 asr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 asr z0.d, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.h, 
p0/m, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 asr z0.h, p0/m, z0.h, z1.d +# CHECK-NEXT: 1 3 0.50 asr z0.h, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.h, z1.h, z2.d +# CHECK-NEXT: 1 3 0.50 asr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 asr z0.s, p0/m, z0.s, z1.d +# CHECK-NEXT: 1 3 0.50 asr z0.s, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 asr z0.s, z1.s, z2.d +# CHECK-NEXT: 1 3 0.50 asr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 asr z31.b, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 asr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 asr z31.d, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 asr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 asr z31.h, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 asr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: 1 3 0.50 asr z31.s, z31.s, #32 +# CHECK-NEXT: 1 4 0.50 asrd z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: 1 4 0.50 asrd z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 asrd z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 asrd z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 asrd z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: 1 4 0.50 asrd z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: 1 4 0.50 asrd z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: 1 4 0.50 asrd z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: 1 3 0.50 asrr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 asrr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 asrr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 asrr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 bcax z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 13 11.00 bdep z0.b, z1.b, z31.b +# CHECK-NEXT: 1 68 66.00 bdep z0.d, z1.d, z31.d +# CHECK-NEXT: 1 21 19.00 bdep z0.h, z1.h, z31.h +# CHECK-NEXT: 1 37 35.00 bdep z0.s, z1.s, z31.s +# CHECK-NEXT: 1 13 11.00 bext z0.b, z1.b, z31.b +# CHECK-NEXT: 1 68 66.00 bext z0.d, z1.d, z31.d +# CHECK-NEXT: 1 21 19.00 bext z0.h, z1.h, z31.h +# CHECK-NEXT: 1 37 35.00 bext z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 bfcvt z0.h, p0/m, z1.s +# CHECK-NEXT: 1 4 0.50 bfcvtnt z0.h, p0/m, z1.s +# 
CHECK-NEXT: 2 10 0.50 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 2 10 0.50 bfdot z0.s, z1.h, z2.h[0] +# CHECK-NEXT: 2 10 0.50 bfdot z0.s, z1.h, z2.h[3] +# CHECK-NEXT: 1 4 0.50 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 bfmlalb z0.s, z1.h, z2.h[0] +# CHECK-NEXT: 1 4 0.50 bfmlalb z0.s, z1.h, z2.h[7] +# CHECK-NEXT: 1 4 0.50 bfmlalb z10.s, z21.h, z14.h +# CHECK-NEXT: 1 4 0.50 bfmlalb z21.s, z14.h, z3.h[2] +# CHECK-NEXT: 1 4 0.50 bfmlalt z0.s, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 bfmlalt z0.s, z1.h, z2.h[0] +# CHECK-NEXT: 1 4 0.50 bfmlalt z0.s, z1.h, z2.h[7] +# CHECK-NEXT: 1 4 0.50 bfmlalt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 bfmlalt z14.s, z10.h, z21.h +# CHECK-NEXT: 2 14 1.00 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 1 13 11.00 bgrp z0.b, z1.b, z31.b +# CHECK-NEXT: 1 68 66.00 bgrp z0.d, z1.d, z31.d +# CHECK-NEXT: 1 21 19.00 bgrp z0.h, z1.h, z31.h +# CHECK-NEXT: 1 37 35.00 bgrp z0.s, z1.s, z31.s +# CHECK-NEXT: 1 2 1.00 bic p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bic p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 3 0.50 bic z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 bic z23.d, z13.d, z8.d +# CHECK-NEXT: 1 3 0.50 bic z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 bic z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 bic z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 bic z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 bics p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bics p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkas p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkbs p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkn p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkns p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpa p0.b, p15/z, p1.b, p2.b +# 
CHECK-NEXT: 1 2 1.00 brkpa p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpas p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpas p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpb p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpb p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpbs p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpbs p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 3 0.50 bsl z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: 1 3 0.50 bsl1n z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: 1 3 0.50 bsl2n z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: 1 3 0.50 cadd z0.b, z0.b, z0.b, #90 +# CHECK-NEXT: 1 3 0.50 cadd z0.d, z0.d, z0.d, #90 +# CHECK-NEXT: 1 3 0.50 cadd z0.h, z0.h, z0.h, #90 +# CHECK-NEXT: 1 3 0.50 cadd z0.s, z0.s, z0.s, #90 +# CHECK-NEXT: 1 3 0.50 cadd z31.b, z31.b, z31.b, #270 +# CHECK-NEXT: 1 3 0.50 cadd z31.d, z31.d, z31.d, #270 +# CHECK-NEXT: 1 3 0.50 cadd z31.h, z31.h, z31.h, #270 +# CHECK-NEXT: 1 3 0.50 cadd z31.s, z31.s, z31.s, #270 +# CHECK-NEXT: 1 4 0.50 cdot z0.d, z1.h, z15.h[1], #0 +# CHECK-NEXT: 1 4 0.50 cdot z0.d, z1.h, z31.h, #0 +# CHECK-NEXT: 1 4 0.50 cdot z0.d, z1.h, z31.h, #180 +# CHECK-NEXT: 1 4 0.50 cdot z0.d, z1.h, z31.h, #270 +# CHECK-NEXT: 1 4 0.50 cdot z0.d, z1.h, z31.h, #90 +# CHECK-NEXT: 1 4 0.50 cdot z0.s, z1.b, z31.b, #0 +# CHECK-NEXT: 1 4 0.50 cdot z0.s, z1.b, z7.b[3], #0 +# CHECK-NEXT: 1 4 0.50 cdot z29.d, z30.h, z0.h[0], #180 +# CHECK-NEXT: 1 4 0.50 cdot z31.d, z30.h, z7.h[1], #270 +# CHECK-NEXT: 1 4 0.50 cdot z5.d, z6.h, z3.h[0], #90 +# CHECK-NEXT: 1 4 0.50 clasta b0, p7, b0, z31.b +# CHECK-NEXT: 1 4 0.50 clasta d0, p7, d0, z31.d +# CHECK-NEXT: 1 4 0.50 clasta h0, p7, h0, z31.h +# CHECK-NEXT: 1 4 0.50 clasta s0, p7, s0, z31.s +# CHECK-NEXT: 1 4 4.00 clasta w0, p7, w0, z31.b +# CHECK-NEXT: 1 4 4.00 clasta w0, p7, w0, z31.h +# CHECK-NEXT: 1 4 4.00 clasta w0, p7, w0, z31.s +# CHECK-NEXT: 1 4 4.00 clasta x0, p7, x0, z31.d +# CHECK-NEXT: 1 4 0.50 clasta z0.b, p7, z0.b, z31.b +# CHECK-NEXT: 1 4 0.50 clasta z0.d, p7, z0.d, 
z31.d +# CHECK-NEXT: 1 4 0.50 clasta z0.h, p7, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 clasta z0.s, p7, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 clastb b0, p7, b0, z31.b +# CHECK-NEXT: 1 4 0.50 clastb d0, p7, d0, z31.d +# CHECK-NEXT: 1 4 0.50 clastb h0, p7, h0, z31.h +# CHECK-NEXT: 1 4 0.50 clastb s0, p7, s0, z31.s +# CHECK-NEXT: 1 4 4.00 clastb w0, p7, w0, z31.b +# CHECK-NEXT: 1 4 4.00 clastb w0, p7, w0, z31.h +# CHECK-NEXT: 1 4 4.00 clastb w0, p7, w0, z31.s +# CHECK-NEXT: 1 4 4.00 clastb x0, p7, x0, z31.d +# CHECK-NEXT: 1 4 0.50 clastb z0.b, p7, z0.b, z31.b +# CHECK-NEXT: 1 4 0.50 clastb z0.d, p7, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 clastb z0.h, p7, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 clastb z0.s, p7, z0.s, z31.s +# CHECK-NEXT: 1 3 0.50 cls z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 cls z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 cls z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 cls z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 clz z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 clz z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 clz z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 clz z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 cmla z0.b, z1.b, z2.b, #0 +# CHECK-NEXT: 1 4 0.50 cmla z0.d, z1.d, z2.d, #0 +# CHECK-NEXT: 1 4 0.50 cmla z0.h, z1.h, z2.h, #0 +# CHECK-NEXT: 1 4 0.50 cmla z0.h, z1.h, z2.h[0], #0 +# CHECK-NEXT: 1 4 0.50 cmla z0.s, z1.s, z2.s, #0 +# CHECK-NEXT: 1 4 0.50 cmla z0.s, z1.s, z2.s[0], #0 +# CHECK-NEXT: 1 4 0.50 cmla z15.b, z16.b, z17.b, #270 +# CHECK-NEXT: 1 4 0.50 cmla z15.d, z16.d, z17.d, #270 +# CHECK-NEXT: 1 4 0.50 cmla z15.h, z16.h, z17.h, #270 +# CHECK-NEXT: 1 4 0.50 cmla z15.s, z16.s, z17.s, #270 +# CHECK-NEXT: 1 4 0.50 cmla z29.b, z30.b, z31.b, #90 +# CHECK-NEXT: 1 4 0.50 cmla z29.d, z30.d, z31.d, #90 +# CHECK-NEXT: 1 4 0.50 cmla z29.h, z30.h, z31.h, #90 +# CHECK-NEXT: 1 4 0.50 cmla z29.s, z30.s, z31.s, #90 +# CHECK-NEXT: 1 4 0.50 cmla z31.b, z31.b, z31.b, #180 +# CHECK-NEXT: 1 4 0.50 cmla z31.d, z31.d, z31.d, #180 +# CHECK-NEXT: 1 4 0.50 cmla z31.h, z30.h, z7.h[0], 
#180 +# CHECK-NEXT: 1 4 0.50 cmla z31.h, z31.h, z31.h, #180 +# CHECK-NEXT: 1 4 0.50 cmla z31.s, z30.s, z7.s[0], #180 +# CHECK-NEXT: 1 4 0.50 cmla z31.s, z31.s, z31.s, #180 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmpeq p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmpeq p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmpeq p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmpeq p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmpeq p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 5 1.00 cmpeq p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmpeq p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmpge p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 5 1.00 cmpge p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 5 1.00 cmpge p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmpge p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmpge p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmpge p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 5 1.00 cmpge p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 5 1.00 cmpge p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmpge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmpge p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 5 1.00 cmpge p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 5 1.00 cmpge p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmpge p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmpge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmpge p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 5 1.00 cmpge p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 5 1.00 cmpge p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmpge p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmpge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmpgt 
p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmpgt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmpgt p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmpgt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmpgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmpgt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmpgt p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmpgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmpgt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 5 1.00 cmpgt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmpgt p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmpgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmphi p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 5 1.00 cmphi p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 5 1.00 cmphi p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmphi p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmphi p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmphi p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 5 1.00 cmphi p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 5 1.00 cmphi p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmphi p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmphi p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 5 1.00 cmphi p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 5 1.00 cmphi p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmphi p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmphi p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmphi p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 5 1.00 cmphi p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 5 1.00 cmphi p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmphi p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmphi p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 5 1.00 
cmphs p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 5 1.00 cmphs p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 5 1.00 cmphs p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmphs p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmphs p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmphs p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 5 1.00 cmphs p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 5 1.00 cmphs p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmphs p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmphs p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 5 1.00 cmphs p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 5 1.00 cmphs p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmphs p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmphs p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmphs p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 5 1.00 cmphs p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 5 1.00 cmphs p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmphs p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmphs p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 5 1.00 cmple p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 5 1.00 cmple p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 5 1.00 cmple p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmple p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 5 1.00 cmple p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 5 1.00 cmple p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 5 1.00 cmple p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 5 1.00 cmple p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmple p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 5 1.00 cmple p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 5 1.00 cmple p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmplo p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 5 1.00 cmplo p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 5 1.00 cmplo p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmplo p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 5 1.00 cmplo p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 5 1.00 cmplo p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 5 1.00 cmplo p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 5 1.00 cmplo p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 
cmplo p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 5 1.00 cmplo p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 5 1.00 cmplo p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmpls p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 5 1.00 cmpls p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 5 1.00 cmpls p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmpls p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 5 1.00 cmpls p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 5 1.00 cmpls p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 5 1.00 cmpls p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 5 1.00 cmpls p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmpls p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 5 1.00 cmpls p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 5 1.00 cmpls p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmplt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 5 1.00 cmplt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 5 1.00 cmplt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmplt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 5 1.00 cmplt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 5 1.00 cmplt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 5 1.00 cmplt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 5 1.00 cmplt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmplt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 5 1.00 cmplt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 5 1.00 cmplt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmpne p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 5 1.00 cmpne p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 5 1.00 cmpne p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 5 1.00 cmpne p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 5 1.00 cmpne p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 5 1.00 cmpne p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 5 1.00 cmpne p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 5 1.00 cmpne p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 5 1.00 cmpne p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 5 1.00 cmpne p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 5 1.00 cmpne p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 5 1.00 cmpne p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 5 1.00 cmpne p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 5 1.00 
cmpne p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 5 1.00 cmpne p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 cnot z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 cnot z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 cnot z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 cnot z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 cnt z31.b, p7/m, z31.b +# CHECK-NEXT: 1 9 7.00 cnt z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 cnt z31.h, p7/m, z31.h +# CHECK-NEXT: 1 6 4.00 cnt z31.s, p7/m, z31.s +# CHECK-NEXT: 1 1 1.00 cntb x0 +# CHECK-NEXT: 1 1 1.00 cntb x0, #28 +# CHECK-NEXT: 1 1 1.00 cntb x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 cntb x0, pow2 +# CHECK-NEXT: 1 1 1.00 cntd x0 +# CHECK-NEXT: 1 1 1.00 cntd x0, #28 +# CHECK-NEXT: 1 1 1.00 cntd x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 cntd x0, pow2 +# CHECK-NEXT: 1 1 1.00 cnth x0 +# CHECK-NEXT: 1 1 1.00 cnth x0, #28 +# CHECK-NEXT: 1 1 1.00 cnth x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 cnth x0, pow2 +# CHECK-NEXT: 1 1 1.00 cntp x0, p15, p0.b +# CHECK-NEXT: 1 1 1.00 cntp x0, p15, p0.d +# CHECK-NEXT: 1 1 1.00 cntp x0, p15, p0.h +# CHECK-NEXT: 1 1 1.00 cntp x0, p15, p0.s +# CHECK-NEXT: 1 1 1.00 cntw x0 +# CHECK-NEXT: 1 1 1.00 cntw x0, #28 +# CHECK-NEXT: 1 1 1.00 cntw x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 cntw x0, pow2 +# CHECK-NEXT: 1 4 0.50 compact z31.d, p7, z31.d +# CHECK-NEXT: 1 4 0.50 compact z31.s, p7, z31.s +# CHECK-NEXT: 1 1 1.00 ctermeq w30, wzr +# CHECK-NEXT: 1 1 1.00 ctermeq wzr, w30 +# CHECK-NEXT: 1 1 1.00 ctermeq x30, xzr +# CHECK-NEXT: 1 1 1.00 ctermeq xzr, x30 +# CHECK-NEXT: 1 1 1.00 ctermne w30, wzr +# CHECK-NEXT: 1 1 1.00 ctermne wzr, w30 +# CHECK-NEXT: 1 1 1.00 ctermne x30, xzr +# CHECK-NEXT: 1 1 1.00 ctermne xzr, x30 +# CHECK-NEXT: 1 1 1.00 decb x0 +# CHECK-NEXT: 1 1 1.00 decb x0, #14 +# CHECK-NEXT: 1 1 1.00 decb x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 decb x0, pow2 +# CHECK-NEXT: 1 1 1.00 decb x0, vl1 +# CHECK-NEXT: 1 1 1.00 decd x0 +# CHECK-NEXT: 1 1 1.00 decd x0, #14 +# CHECK-NEXT: 1 1 1.00 decd x0, all, mul #16 +# CHECK-NEXT: 1 1 
1.00 decd x0, pow2 +# CHECK-NEXT: 1 1 1.00 decd x0, vl1 +# CHECK-NEXT: 1 1 1.00 dech x0 +# CHECK-NEXT: 1 1 1.00 dech x0, #14 +# CHECK-NEXT: 1 1 1.00 dech x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 dech x0, pow2 +# CHECK-NEXT: 1 1 1.00 dech x0, vl1 +# CHECK-NEXT: 1 1 1.00 decp x0, p0.b +# CHECK-NEXT: 1 1 1.00 decp x0, p0.d +# CHECK-NEXT: 1 1 1.00 decp x0, p0.h +# CHECK-NEXT: 1 1 1.00 decp x0, p0.s +# CHECK-NEXT: 1 1 1.00 decp xzr, p15.b +# CHECK-NEXT: 1 1 1.00 decp xzr, p15.d +# CHECK-NEXT: 1 1 1.00 decp xzr, p15.h +# CHECK-NEXT: 1 1 1.00 decp xzr, p15.s +# CHECK-NEXT: 1 3 1.00 decp z31.d, p15.d +# CHECK-NEXT: 1 3 1.00 decp z31.h, p15.h +# CHECK-NEXT: 1 3 1.00 decp z31.s, p15.s +# CHECK-NEXT: 1 1 1.00 decw x0 +# CHECK-NEXT: 1 1 1.00 decw x0, #14 +# CHECK-NEXT: 1 1 1.00 decw x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 decw x0, pow2 +# CHECK-NEXT: 1 1 1.00 decw x0, vl1 +# CHECK-NEXT: 1 4 0.50 dupm z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: 1 4 0.50 dupm z0.s, #0xfffffff9 +# CHECK-NEXT: 1 4 0.50 dupm z23.h, #0xfff9 +# CHECK-NEXT: 1 4 0.50 dupm z5.b, #0xf9 +# CHECK-NEXT: 1 2 1.00 eor p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0x6 +# CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 eor z0.s, z0.s, #0x6 +# CHECK-NEXT: 1 3 0.50 eor z0.s, z0.s, #0xfffffff9 +# CHECK-NEXT: 1 3 0.50 eor z23.d, z13.d, z8.d +# CHECK-NEXT: 1 3 0.50 eor z23.h, z23.h, #0x6 +# CHECK-NEXT: 1 3 0.50 eor z23.h, z23.h, #0xfff9 +# CHECK-NEXT: 1 3 0.50 eor z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 eor z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 eor z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 eor z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0x6 +# CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0xf9 +# CHECK-NEXT: 1 3 0.50 eor3 z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 eorbt z0.b, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 eorbt z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 eorbt z0.h, 
z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 eorbt z0.s, z1.s, z31.s +# CHECK-NEXT: 1 2 1.00 eors p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 4 0.50 eortb z0.b, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 eortb z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 eortb z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 eortb z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 1.00 eorv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 eorv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 eorv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 eorv s0, p7, z31.s +# CHECK-NEXT: 1 3 0.50 ext z0.b, { z1.b, z2.b }, #0 +# CHECK-NEXT: 1 3 0.50 ext z31.b, z31.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 ext z31.b, z31.b, z0.b, #255 +# CHECK-NEXT: 1 3 0.50 ext z31.b, { z30.b, z31.b }, #255 +# CHECK-NEXT: 1 4 0.50 fabd z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fabd z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fabd z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fabs z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 fabs z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 fabs z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 1.00 facge p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 facge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 1.00 facge p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 facge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 1.00 facge p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 1.00 facge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 1.00 facgt p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 facgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 1.00 facgt p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 facgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 1.00 facgt p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 1.00 facgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 fadd z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: 1 4 0.50 fadd z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fadd z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fadd z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: 1 4 0.50 fadd z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fadd z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fadd z0.s, p0/m, z0.s, #0.5 +# 
CHECK-NEXT: 1 4 0.50 fadd z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fadd z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fadd z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fadd z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fadd z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 8 2.50 fadda d0, p7, d0, z31.d +# CHECK-NEXT: 1 32 25.00 fadda h0, p7, h0, z31.h +# CHECK-NEXT: 1 16 9.00 fadda s0, p7, s0, z31.s +# CHECK-NEXT: 1 4 0.50 faddp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 faddp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 faddp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 faddv d0, p7, z31.d +# CHECK-NEXT: 1 12 5.00 faddv h0, p7, z31.h +# CHECK-NEXT: 1 8 2.50 faddv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 fcadd z0.d, p0/m, z0.d, z0.d, #90 +# CHECK-NEXT: 1 4 0.50 fcadd z0.h, p0/m, z0.h, z0.h, #90 +# CHECK-NEXT: 1 4 0.50 fcadd z0.s, p0/m, z0.s, z0.s, #90 +# CHECK-NEXT: 1 4 0.50 fcadd z31.d, p7/m, z31.d, z31.d, #270 +# CHECK-NEXT: 1 4 0.50 fcadd z31.h, p7/m, z31.h, z31.h, #270 +# CHECK-NEXT: 1 4 0.50 fcadd z31.s, p7/m, z31.s, z31.s, #270 +# CHECK-NEXT: 1 4 1.00 fcmeq p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmeq p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 fcmeq p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmeq p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 fcmeq p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmeq p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 1.00 fcmge p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmge p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 fcmge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 1.00 fcmge p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmge p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 fcmge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 1.00 fcmge p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmge p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 1.00 fcmge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 1.00 fcmgt p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmgt p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 fcmgt 
p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 1.00 fcmgt p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmgt p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 fcmgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 1.00 fcmgt p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmgt p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 1.00 fcmgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 fcmla z0.d, p0/m, z0.d, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 1 4 0.50 fcmla z0.h, p0/m, z0.h, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 fcmla z0.h, p0/m, z1.h, z2.h, #90 +# CHECK-NEXT: 1 4 0.50 fcmla z0.h, z0.h, z0.h[0], #0 +# CHECK-NEXT: 1 4 0.50 fcmla z0.s, p0/m, z0.s, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 fcmla z0.s, p0/m, z1.s, z2.s, #90 +# CHECK-NEXT: 1 4 0.50 fcmla z21.s, z10.s, z5.s[1], #90 +# CHECK-NEXT: 1 4 0.50 fcmla z23.s, z13.s, z8.s[0], #270 +# CHECK-NEXT: 1 4 0.50 fcmla z29.d, p7/m, z30.d, z31.d, #180 +# CHECK-NEXT: 1 4 0.50 fcmla z29.h, p7/m, z30.h, z31.h, #180 +# CHECK-NEXT: 1 4 0.50 fcmla z29.s, p7/m, z30.s, z31.s, #180 +# CHECK-NEXT: 1 4 0.50 fcmla z31.d, p7/m, z31.d, z31.d, #270 +# CHECK-NEXT: 1 4 0.50 fcmla z31.h, p7/m, z31.h, z31.h, #270 +# CHECK-NEXT: 1 4 0.50 fcmla z31.h, z31.h, z7.h[3], #270 +# CHECK-NEXT: 1 4 0.50 fcmla z31.s, p7/m, z31.s, z31.s, #270 +# CHECK-NEXT: 1 4 1.00 fcmle p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmle p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmle p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmlt p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmlt p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmlt p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmne p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmne p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 fcmne p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmne p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 fcmne p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: 1 4 1.00 fcmne p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 1.00 fcmuo p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: 1 4 1.00 
fcmuo p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: 1 4 1.00 fcmuo p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: 1 4 0.50 fcvt z0.d, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvt z0.d, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 fcvt z0.h, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvt z0.h, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 fcvt z0.s, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvt z0.s, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtlt z0.s, p0/m, z1.h +# CHECK-NEXT: 1 4 0.50 fcvtlt z30.d, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 fcvtnt z0.h, p0/m, z1.s +# CHECK-NEXT: 1 4 0.50 fcvtnt z30.s, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 fcvtx z0.s, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvtx z30.s, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 fcvtxnt z0.s, p0/m, z1.d +# CHECK-NEXT: 1 4 0.50 fcvtxnt z30.s, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.d, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.d, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.d, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.h, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.s, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.s, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtzs z0.s, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.d, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.d, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.d, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.h, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.s, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.s, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 fcvtzu z0.s, p0/m, z0.s +# CHECK-NEXT: 1 22 19.00 fdiv z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 8 5.00 fdiv z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 13 10.00 fdiv z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 22 19.00 fdivr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 8 5.00 fdivr z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 13 10.00 fdivr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fexpa z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fexpa z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fexpa z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 flogb z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 flogb z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 
flogb z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 fmad z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmad z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmad z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmax z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: 1 4 0.50 fmax z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmax z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: 1 4 0.50 fmax z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmax z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: 1 4 0.50 fmax z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmax z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fmax z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fmax z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 4 0.50 fmaxnm z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: 1 4 0.50 fmaxnm z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmaxnm z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: 1 4 0.50 fmaxnm z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmaxnm z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: 1 4 0.50 fmaxnm z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmaxnm z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fmaxnm z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fmaxnm z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 4 0.50 fmaxnmp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 fmaxnmp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 fmaxnmp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 fmaxnmv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 fmaxnmv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 fmaxnmv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 fmaxp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 fmaxp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 fmaxp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 fmaxv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 fmaxv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 fmaxv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 fmin z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: 1 4 0.50 fmin z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmin z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: 1 4 0.50 fmin z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 
fmin z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: 1 4 0.50 fmin z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmin z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fmin z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fmin z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 4 0.50 fminnm z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: 1 4 0.50 fminnm z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fminnm z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: 1 4 0.50 fminnm z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fminnm z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: 1 4 0.50 fminnm z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fminnm z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fminnm z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fminnm z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 4 0.50 fminnmp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 fminnmp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 fminnmp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 fminnmv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 fminnmv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 fminnmv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 fminp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 fminp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 fminp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 fminv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 fminv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 fminv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 fmla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmla z0.d, z1.d, z7.d[1] +# CHECK-NEXT: 1 4 0.50 fmla z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmla z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 fmla z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmla z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 fmlalb z0.s, z1.h, z7.h[0] +# CHECK-NEXT: 1 4 0.50 fmlalb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmlalb z30.s, z31.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 fmlalt z0.s, z1.h, z7.h[0] +# CHECK-NEXT: 1 4 0.50 fmlalt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmlalt z30.s, z31.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 fmls z0.d, 
p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmls z0.d, z1.d, z7.d[1] +# CHECK-NEXT: 1 4 0.50 fmls z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmls z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 fmls z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmls z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 fmlslb z0.s, z1.h, z7.h[0] +# CHECK-NEXT: 1 4 0.50 fmlslb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmlslb z30.s, z31.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 fmlslt z0.s, z1.h, z7.h[0] +# CHECK-NEXT: 1 4 0.50 fmlslt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmlslt z30.s, z31.h, z7.h[7] +# CHECK-NEXT: 1 3 0.50 fmov z0.d, #-10.00000000 +# CHECK-NEXT: 1 3 0.50 fmov z0.d, #0.12500000 +# CHECK-NEXT: 1 3 0.50 fmov z0.d, p0/m, #-10.00000000 +# CHECK-NEXT: 1 3 0.50 fmov z0.d, p0/m, #0.12500000 +# CHECK-NEXT: 1 3 0.50 fmov z0.h, #-0.12500000 +# CHECK-NEXT: 1 3 0.50 fmov z0.h, p0/m, #-0.12500000 +# CHECK-NEXT: 1 3 0.50 fmov z0.s, #-0.12500000 +# CHECK-NEXT: 1 3 0.50 fmov z0.s, p0/m, #-0.12500000 +# CHECK-NEXT: 1 4 0.50 fmsb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmsb z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmsb z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmul z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: 1 4 0.50 fmul z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmul z0.d, z0.d, z0.d[0] +# CHECK-NEXT: 1 4 0.50 fmul z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmul z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: 1 4 0.50 fmul z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmul z0.h, z0.h, z0.h[0] +# CHECK-NEXT: 1 4 0.50 fmul z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmul z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: 1 4 0.50 fmul z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmul z0.s, z0.s, z0.s[0] +# CHECK-NEXT: 1 4 0.50 fmul z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fmul z31.d, p7/m, z31.d, #2.0 +# CHECK-NEXT: 1 4 0.50 fmul z31.d, z31.d, z15.d[1] +# CHECK-NEXT: 1 4 0.50 fmul z31.h, p7/m, z31.h, #2.0 +# CHECK-NEXT: 1 4 0.50 fmul z31.h, z31.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 fmul z31.s, 
p7/m, z31.s, #2.0 +# CHECK-NEXT: 1 4 0.50 fmul z31.s, z31.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 fmulx z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fmulx z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fmulx z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fneg z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 fneg z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 fneg z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 fnmad z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fnmad z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fnmad z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fnmla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fnmla z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fnmla z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fnmls z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fnmls z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fnmls z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fnmsb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fnmsb z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fnmsb z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 frecpe z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 frecpe z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 frecpe z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 frecps z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 frecps z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 frecps z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 frecpx z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frecpx z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frecpx z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frinta z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frinta z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frinta z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frinti z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frinti z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frinti z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frintm z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frintm z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frintm z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frintn z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frintn z31.h, p7/m, z31.h 
+# CHECK-NEXT: 1 4 0.50 frintn z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frintp z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frintp z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frintp z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frintx z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frintx z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frintx z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frintz z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 frintz z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 frintz z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 frsqrte z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 frsqrte z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 frsqrte z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 frsqrts z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 frsqrts z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 frsqrts z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fscale z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fscale z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fscale z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 22 19.00 fsqrt z31.d, p7/m, z31.d +# CHECK-NEXT: 1 8 5.00 fsqrt z31.h, p7/m, z31.h +# CHECK-NEXT: 1 12 9.00 fsqrt z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 fsub z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: 1 4 0.50 fsub z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fsub z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 fsub z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: 1 4 0.50 fsub z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fsub z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 fsub z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: 1 4 0.50 fsub z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fsub z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 fsub z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fsub z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fsub z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 4 0.50 fsubr z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: 1 4 0.50 fsubr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 fsubr z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: 1 4 0.50 fsubr z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 fsubr z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: 1 
4 0.50 fsubr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 fsubr z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: 1 4 0.50 fsubr z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: 1 4 0.50 fsubr z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: 1 4 0.50 ftmad z0.d, z0.d, z31.d, #7 +# CHECK-NEXT: 1 4 0.50 ftmad z0.h, z0.h, z31.h, #7 +# CHECK-NEXT: 1 4 0.50 ftmad z0.s, z0.s, z31.s, #7 +# CHECK-NEXT: 1 4 0.50 ftsmul z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 ftsmul z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ftsmul z0.s, z1.s, z31.s +# CHECK-NEXT: 1 3 0.50 ftssel z0.d, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 ftssel z0.h, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 ftssel z0.s, z1.s, z31.s +# CHECK-NEXT: 1 6 4.00 histcnt z0.s, p0/z, z1.s, z2.s +# CHECK-NEXT: 1 6 4.00 histcnt z29.d, p7/z, z30.d, z31.d +# CHECK-NEXT: 1 6 4.00 histseg z0.b, z1.b, z31.b +# CHECK-NEXT: 1 1 1.00 incb x0 +# CHECK-NEXT: 1 1 1.00 incb x0, #14 +# CHECK-NEXT: 1 1 1.00 incb x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 incb x0, pow2 +# CHECK-NEXT: 1 1 1.00 incb x0, vl1 +# CHECK-NEXT: 1 1 1.00 incd x0 +# CHECK-NEXT: 1 1 1.00 incd x0, #14 +# CHECK-NEXT: 1 1 1.00 incd x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 incd x0, pow2 +# CHECK-NEXT: 1 1 1.00 incd x0, vl1 +# CHECK-NEXT: 1 4 0.50 incd z0.d +# CHECK-NEXT: 1 4 0.50 incd z0.d, all, mul #16 +# CHECK-NEXT: 1 1 1.00 inch x0 +# CHECK-NEXT: 1 1 1.00 inch x0, #14 +# CHECK-NEXT: 1 1 1.00 inch x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 inch x0, pow2 +# CHECK-NEXT: 1 1 1.00 inch x0, vl1 +# CHECK-NEXT: 1 4 0.50 inch z0.h +# CHECK-NEXT: 1 4 0.50 inch z0.h, all, mul #16 +# CHECK-NEXT: 1 1 1.00 incp x0, p0.b +# CHECK-NEXT: 1 1 1.00 incp x0, p0.d +# CHECK-NEXT: 1 1 1.00 incp x0, p0.h +# CHECK-NEXT: 1 1 1.00 incp x0, p0.s +# CHECK-NEXT: 1 1 1.00 incp xzr, p15.b +# CHECK-NEXT: 1 1 1.00 incp xzr, p15.d +# CHECK-NEXT: 1 1 1.00 incp xzr, p15.h +# CHECK-NEXT: 1 1 1.00 incp xzr, p15.s +# CHECK-NEXT: 1 3 1.00 incp z31.d, p15.d +# CHECK-NEXT: 1 3 1.00 incp z31.h, p15.h +# CHECK-NEXT: 1 3 1.00 incp z31.s, p15.s +# 
CHECK-NEXT: 1 1 1.00 incw x0 +# CHECK-NEXT: 1 1 1.00 incw x0, #14 +# CHECK-NEXT: 1 1 1.00 incw x0, all, mul #16 +# CHECK-NEXT: 1 1 1.00 incw x0, pow2 +# CHECK-NEXT: 1 1 1.00 incw x0, vl1 +# CHECK-NEXT: 1 4 0.50 incw z0.s +# CHECK-NEXT: 1 4 0.50 incw z0.s, all, mul #16 +# CHECK-NEXT: 1 4 0.50 index z0.b, #0, #0 +# CHECK-NEXT: 1 4 0.50 index z0.d, #0, #0 +# CHECK-NEXT: 1 4 0.50 index z0.h, #0, #0 +# CHECK-NEXT: 1 4 1.00 index z0.h, w0, w0 +# CHECK-NEXT: 1 4 0.50 index z0.s, #0, #0 +# CHECK-NEXT: 1 4 1.00 index z21.b, w10, w21 +# CHECK-NEXT: 1 4 1.00 index z21.d, x10, x21 +# CHECK-NEXT: 1 4 1.00 index z21.s, w10, w21 +# CHECK-NEXT: 1 4 1.00 index z23.b, #13, w8 +# CHECK-NEXT: 1 4 1.00 index z23.b, w13, #8 +# CHECK-NEXT: 1 4 1.00 index z23.d, #13, x8 +# CHECK-NEXT: 1 4 1.00 index z23.d, x13, #8 +# CHECK-NEXT: 1 4 1.00 index z23.h, #13, w8 +# CHECK-NEXT: 1 4 1.00 index z23.h, w13, #8 +# CHECK-NEXT: 1 4 1.00 index z23.s, #13, w8 +# CHECK-NEXT: 1 4 1.00 index z23.s, w13, #8 +# CHECK-NEXT: 1 4 0.50 index z31.b, #-1, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.b, #-1, wzr +# CHECK-NEXT: 1 4 1.00 index z31.b, wzr, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.b, wzr, wzr +# CHECK-NEXT: 1 4 0.50 index z31.d, #-1, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.d, #-1, xzr +# CHECK-NEXT: 1 4 1.00 index z31.d, xzr, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.d, xzr, xzr +# CHECK-NEXT: 1 4 0.50 index z31.h, #-1, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.h, #-1, wzr +# CHECK-NEXT: 1 4 1.00 index z31.h, wzr, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.h, wzr, wzr +# CHECK-NEXT: 1 4 0.50 index z31.s, #-1, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.s, #-1, wzr +# CHECK-NEXT: 1 4 1.00 index z31.s, wzr, #-1 +# CHECK-NEXT: 1 4 1.00 index z31.s, wzr, wzr +# CHECK-NEXT: 1 4 0.50 insr z0.b, w0 +# CHECK-NEXT: 1 4 0.50 insr z0.d, x0 +# CHECK-NEXT: 1 4 0.50 insr z0.h, w0 +# CHECK-NEXT: 1 4 0.50 insr z0.s, w0 +# CHECK-NEXT: 1 4 0.50 insr z31.b, b31 +# CHECK-NEXT: 1 4 0.50 insr z31.b, wzr +# CHECK-NEXT: 1 4 0.50 insr z31.d, d31 +# 
CHECK-NEXT: 1 4 0.50 insr z31.d, xzr +# CHECK-NEXT: 1 4 0.50 insr z31.h, h31 +# CHECK-NEXT: 1 4 0.50 insr z31.h, wzr +# CHECK-NEXT: 1 4 0.50 insr z31.s, s31 +# CHECK-NEXT: 1 4 0.50 insr z31.s, wzr +# CHECK-NEXT: 1 4 0.50 lasta b0, p7, z31.b +# CHECK-NEXT: 1 4 0.50 lasta d0, p7, z31.d +# CHECK-NEXT: 1 4 0.50 lasta h0, p7, z31.h +# CHECK-NEXT: 1 4 0.50 lasta s0, p7, z31.s +# CHECK-NEXT: 1 8 4.00 lasta w0, p7, z31.b +# CHECK-NEXT: 1 8 4.00 lasta w0, p7, z31.h +# CHECK-NEXT: 1 8 4.00 lasta w0, p7, z31.s +# CHECK-NEXT: 1 8 4.00 lasta x0, p7, z31.d +# CHECK-NEXT: 1 4 0.50 lastb b0, p7, z31.b +# CHECK-NEXT: 1 4 0.50 lastb d0, p7, z31.d +# CHECK-NEXT: 1 4 0.50 lastb h0, p7, z31.h +# CHECK-NEXT: 1 4 0.50 lastb s0, p7, z31.s +# CHECK-NEXT: 1 8 4.00 lastb w0, p7, z31.b +# CHECK-NEXT: 1 8 4.00 lastb w0, p7, z31.h +# CHECK-NEXT: 1 8 4.00 lastb w0, p7, z31.s +# CHECK-NEXT: 1 8 4.00 lastb x0, p7, z31.d +# CHECK-NEXT: 1 3 1.00 * ld1b { z0.b }, p0/z, [sp, x0] +# CHECK-NEXT: 1 3 1.00 * ld1b { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * ld1b { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1b { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1b { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * ld1b { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1b { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ld1b { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 3 1.00 * ld1b { z21.b }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1b { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1b { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1b { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1b { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1b { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1b { z21.s }, p5/z, [x10, x21] +# CHECK-NEXT: 1 3 1.00 * ld1b { z23.d }, p3/z, [x13, x8] +# CHECK-NEXT: 1 3 1.00 * 
ld1b { z31.b }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1b { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1b { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1b { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: 1 3 1.00 * ld1b { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1b { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 9 7.00 * ld1b { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: 1 3 1.00 * ld1b { z5.h }, p3/z, [x17, x16] +# CHECK-NEXT: 1 7 6.00 * ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +# CHECK-NEXT: 1 7 6.00 * ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +# CHECK-NEXT: 1 3 1.00 * ld1d { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1d { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * ld1d { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1d { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1d { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1d { z23.d }, p3/z, [sp, x8, lsl #3] +# CHECK-NEXT: 1 3 1.00 * ld1d { z23.d }, p3/z, [x13, x8, lsl #3] +# CHECK-NEXT: 1 7 6.00 * ld1d { z23.d }, p3/z, [x13, z8.d, lsl #3] +# CHECK-NEXT: 1 3 1.00 * ld1d { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1d { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1d { z31.d }, p7/z, [z31.d, #248] +# CHECK-NEXT: 1 7 6.00 * ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: 1 7 6.00 * ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: 1 3 1.00 * ld1h { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1h { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * ld1h { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1h { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ld1h { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 3 1.00 * ld1h { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1h { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1h { 
z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1h { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1h { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1h { z21.s }, p5/z, [x10, x21, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1h { z23.d }, p3/z, [x13, x8, lsl #1] +# CHECK-NEXT: 1 7 6.00 * ld1h { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1h { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1h { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1h { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: 1 3 1.00 * ld1h { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1h { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 7.00 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 7.00 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 9 7.00 * ld1h { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: 1 3 1.00 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: 1 3 1.00 * ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 3 1.00 * ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: 1 3 1.00 * ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 3 1.00 * ld1rqb { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * 
ld1rqb { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rqb { z21.b }, p5/z, [x10, #112] +# CHECK-NEXT: 1 3 1.00 * ld1rqb { z23.b }, p3/z, [x13, #-128] +# CHECK-NEXT: 1 3 1.00 * ld1rqb { z31.b }, p7/z, [sp, #-16] +# CHECK-NEXT: 1 3 1.00 * ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 3 1.00 * ld1rqd { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rqd { z23.d }, p3/z, [x13, #-128] +# CHECK-NEXT: 1 3 1.00 * ld1rqd { z23.d }, p3/z, [x13, #112] +# CHECK-NEXT: 1 3 1.00 * ld1rqd { z31.d }, p7/z, [sp, #-16] +# CHECK-NEXT: 1 3 1.00 * ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1rqh { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rqh { z23.h }, p3/z, [x13, #-128] +# CHECK-NEXT: 1 3 1.00 * ld1rqh { z23.h }, p3/z, [x13, #112] +# CHECK-NEXT: 1 3 1.00 * ld1rqh { z31.h }, p7/z, [sp, #-16] +# CHECK-NEXT: 1 3 1.00 * ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld1rqw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rqw { z23.s }, p3/z, [x13, #-128] +# CHECK-NEXT: 1 3 1.00 * ld1rqw { z23.s }, p3/z, [x13, #112] +# CHECK-NEXT: 1 3 1.00 * ld1rqw { z31.s }, p7/z, [sp, #-16] +# CHECK-NEXT: 1 3 1.00 * ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: 1 3 1.00 * ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: 1 3 1.00 * ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: 1 3 1.00 * ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 1 3 1.00 * ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: 
1 3 1.00 * ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1sb { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z0.h }, p0/z, [sp, x0] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z0.h }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ld1sb { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1sb { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1sb { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z21.s }, p5/z, [x10, x21] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z23.d }, p3/z, [x13, x8] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1sb { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1sb { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1sb { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 9 7.00 * ld1sb { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ld1sh { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z21.d }, 
p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z21.s }, p5/z, [sp, x21, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z21.s }, p5/z, [x10, x21, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z23.d }, p3/z, [x13, x8, lsl #1] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1sh { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: 1 3 1.00 * ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 7.00 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 7.00 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 9 7.00 * ld1sh { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: 1 3 1.00 * ld1sw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * ld1sw { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1sw { z23.d }, p3/z, [sp, x8, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld1sw { z23.d }, p3/z, [x13, x8, lsl #2] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld1sw { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1sw { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: 1 7 6.00 * ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: 1 7 6.00 * ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: 1 3 1.00 * ld1w { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ld1w { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 7 6.00 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1w { 
z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1w { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ld1w { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 3 1.00 * ld1w { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1w { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * ld1w { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * ld1w { z21.s }, p5/z, [sp, x21, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld1w { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld1w { z21.s }, p5/z, [x10, x21, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld1w { z23.d }, p3/z, [x13, x8, lsl #2] +# CHECK-NEXT: 1 7 6.00 * ld1w { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld1w { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 6.00 * ld1w { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 7 6.00 * ld1w { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: 1 3 1.00 * ld1w { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 7 7.00 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: 1 7 7.00 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: 1 9 7.00 * ld1w { z31.s }, p7/z, [z31.s, #124] +# CHECK-NEXT: 1 3 2.00 * ld2b { z0.b, z1.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * ld2b { z0.b, z1.b }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld2b { z21.b, z22.b }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld2b { z23.b, z24.b }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: 1 3 2.00 * ld2b { z5.b, z6.b }, p3/z, [x17, x16] +# CHECK-NEXT: 1 3 2.00 * ld2d { z0.d, z1.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 3 1.00 * ld2d { z0.d, z1.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld2d { z21.d, z22.d }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld2d { z23.d, z24.d }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: 1 3 2.00 * ld2d { z5.d, z6.d }, p3/z, [x17, x16, lsl #3] +# CHECK-NEXT: 1 3 2.00 * ld2h { z0.h, z1.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ld2h { z0.h, z1.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld2h { z21.h, z22.h }, p5/z, [x10, #10, 
mul vl] +# CHECK-NEXT: 1 3 1.00 * ld2h { z23.h, z24.h }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: 1 3 2.00 * ld2h { z5.h, z6.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: 1 3 2.00 * ld2w { z0.s, z1.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ld2w { z0.s, z1.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * ld2w { z21.s, z22.s }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: 1 3 1.00 * ld2w { z23.s, z24.s }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: 1 3 2.00 * ld2w { z5.s, z6.s }, p3/z, [x17, x16, lsl #2] +# CHECK-NEXT: 1 5 4.00 * ld3b { z0.b - z2.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 5 3.00 * ld3b { z0.b - z2.b }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld3b { z21.b - z23.b }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld3b { z23.b - z25.b }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld3b { z5.b - z7.b }, p3/z, [x17, x16] +# CHECK-NEXT: 1 5 4.00 * ld3d { z0.d - z2.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 5 3.00 * ld3d { z0.d - z2.d }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld3d { z21.d - z23.d }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld3d { z23.d - z25.d }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld3d { z5.d - z7.d }, p3/z, [x17, x16, lsl #3] +# CHECK-NEXT: 1 5 4.00 * ld3h { z0.h - z2.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 5 3.00 * ld3h { z0.h - z2.h }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld3h { z21.h - z23.h }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld3h { z23.h - z25.h }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld3h { z5.h - z7.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: 1 5 4.00 * ld3w { z0.s - z2.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 5 3.00 * ld3w { z0.s - z2.s }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld3w { z21.s - z23.s }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld3w { z23.s - z25.s }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld3w { z5.s - z7.s }, p3/z, [x17, x16, lsl #2] +# CHECK-NEXT: 1 5 4.00 * ld4b { z0.b - z3.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 5 3.00 * ld4b { 
z0.b - z3.b }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld4b { z21.b - z24.b }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld4b { z23.b - z26.b }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld4b { z5.b - z8.b }, p3/z, [x17, x16] +# CHECK-NEXT: 1 5 4.00 * ld4d { z0.d - z3.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 5 3.00 * ld4d { z0.d - z3.d }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld4d { z21.d - z24.d }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld4d { z23.d - z26.d }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld4d { z5.d - z8.d }, p3/z, [x17, x16, lsl #3] +# CHECK-NEXT: 1 5 4.00 * ld4h { z0.h - z3.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 5 3.00 * ld4h { z0.h - z3.h }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld4h { z21.h - z24.h }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld4h { z23.h - z26.h }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld4h { z5.h - z8.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: 1 5 4.00 * ld4w { z0.s - z3.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 5 3.00 * ld4w { z0.s - z3.s }, p0/z, [x0] +# CHECK-NEXT: 1 5 3.00 * ld4w { z21.s - z24.s }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: 1 5 3.00 * ld4w { z23.s - z26.s }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: 1 5 4.00 * ld4w { z5.s - z8.s }, p3/z, [x17, x16, lsl #2] +# CHECK-NEXT: 1 3 1.00 * U ldff1b { z0.d }, p0/z, [x0, x0] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1b { z0.h }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * U ldff1b { z0.s }, p0/z, [x0, x0] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 9 7.00 * U ldff1b { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 3 1.00 * U ldff1b { z31.b }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z31.d }, p7/z, [sp, z31.d] 
+# CHECK-NEXT: 1 3 1.00 * U ldff1b { z31.d }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1b { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: 1 3 1.00 * U ldff1b { z31.h }, p7/z, [sp] +# CHECK-NEXT: 1 3 1.00 * U ldff1b { z31.s }, p7/z, [sp] +# CHECK-NEXT: 1 9 7.00 * U ldff1b { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: 1 3 1.00 * U ldff1d { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z23.d }, p3/z, [x13, z8.d, lsl #3] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1d { z31.d }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1d { z31.d }, p7/z, [z31.d, #248] +# CHECK-NEXT: 1 3 1.00 * U ldff1h { z0.d }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1h { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 3 1.00 * U ldff1h { z0.s }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 9 7.00 * U ldff1h { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1h { z31.d }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1h { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: 1 3 1.00 * U 
ldff1h { z31.h }, p7/z, [sp] +# CHECK-NEXT: 1 7 7.00 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 7.00 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 3 1.00 * U ldff1h { z31.s }, p7/z, [sp] +# CHECK-NEXT: 1 9 7.00 * U ldff1h { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: 1 3 1.00 * U ldff1sb { z0.d }, p0/z, [x0, x0] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1sb { z0.h }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * U ldff1sb { z0.s }, p0/z, [x0, x0] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 9 7.00 * U ldff1sb { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1sb { z31.d }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1sb { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: 1 3 1.00 * U ldff1sb { z31.h }, p7/z, [sp] +# CHECK-NEXT: 1 3 1.00 * U ldff1sb { z31.s }, p7/z, [sp] +# CHECK-NEXT: 1 9 7.00 * U ldff1sb { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: 1 3 1.00 * U ldff1sh { z0.d }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 9 7.00 * U ldff1sh { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z23.d }, p3/z, [x13, z8.d, 
lsl #1] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1sh { z31.d }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1sh { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 3 1.00 * U ldff1sh { z31.s }, p7/z, [sp] +# CHECK-NEXT: 1 9 7.00 * U ldff1sh { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: 1 3 1.00 * U ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1sw { z31.d }, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1sw { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: 1 3 1.00 * U ldff1w { z0.d }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1w { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 9 7.00 * U ldff1w { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: 1 3 1.00 * U ldff1w { z31.d 
}, p7/z, [sp] +# CHECK-NEXT: 1 7 6.00 * U ldff1w { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: 1 3 1.00 * U ldff1w { z31.s }, p7/z, [sp] +# CHECK-NEXT: 1 9 7.00 * U ldff1w { z31.s }, p7/z, [z31.s, #124] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z21.b }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z31.b }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1b { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1d { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1d { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1d { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1h { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * 
U ldnf1sb { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sb { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sh { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sh { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sh { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sh { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sh { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sh { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sw { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sw { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1sw { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1w { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1w { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 3 1.00 * U ldnf1w { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1w { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1w { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * U ldnf1w { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldnt1b { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: 1 3 1.00 * ldnt1b { z0.b }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1b { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1b { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: 1 3 1.00 * ldnt1b { z21.b }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldnt1b { z23.b }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: 1 7 6.00 * ldnt1b { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1b { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 
1 9 7.00 * ldnt1b { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1b { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: 1 3 1.00 * ldnt1d { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: 1 3 1.00 * ldnt1d { z0.d }, p0/z, [x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1d { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 3 1.00 * ldnt1d { z21.d }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldnt1d { z23.d }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: 1 7 6.00 * ldnt1d { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1d { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 1 7 6.00 * ldnt1h { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 3 1.00 * ldnt1h { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: 1 3 1.00 * ldnt1h { z0.h }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1h { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: 1 3 1.00 * ldnt1h { z21.h }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldnt1h { z23.h }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: 1 7 6.00 * ldnt1h { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1h { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1h { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1h { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: 1 7 6.00 * ldnt1sb { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1sb { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: 1 7 6.00 * ldnt1sb { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1sb { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1sb { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1sb { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: 1 7 6.00 * ldnt1sh { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1sh { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: 1 7 6.00 * ldnt1sh { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1sh { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1sh { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1sh { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: 1 7 6.00 * ldnt1sw { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 7 6.00 * ldnt1sw { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 
7 6.00 * ldnt1sw { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 1 7 6.00 * ldnt1w { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: 1 3 1.00 * ldnt1w { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: 1 3 1.00 * ldnt1w { z0.s }, p0/z, [x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1w { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: 1 3 1.00 * ldnt1w { z21.s }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldnt1w { z23.s }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: 1 7 6.00 * ldnt1w { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: 1 7 6.00 * ldnt1w { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: 1 9 7.00 * ldnt1w { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: 1 9 7.00 * ldnt1w { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: 1 3 1.00 * ldr p0, [x0] +# CHECK-NEXT: 1 3 1.00 * ldr p5, [x10, #255, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldr p7, [x13, #-256, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldr z0, [x0] +# CHECK-NEXT: 1 3 1.00 * ldr z23, [x13, #255, mul vl] +# CHECK-NEXT: 1 3 1.00 * ldr z31, [sp, #-256, mul vl] +# CHECK-NEXT: 1 3 0.50 lsl z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 lsl z0.b, p0/m, z0.b, z1.d +# CHECK-NEXT: 1 3 0.50 lsl z0.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.b, z1.b, z2.d +# CHECK-NEXT: 1 3 0.50 lsl z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 lsl z0.d, z0.d, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.h, p0/m, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 lsl z0.h, p0/m, z0.h, z1.d +# CHECK-NEXT: 1 3 0.50 lsl z0.h, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.h, z1.h, z2.d +# CHECK-NEXT: 1 3 0.50 lsl z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 lsl z0.s, p0/m, z0.s, z1.d +# CHECK-NEXT: 1 3 0.50 lsl z0.s, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 lsl z0.s, z1.s, z2.d +# CHECK-NEXT: 1 3 0.50 lsl z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 lsl z31.b, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 lsl z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: 1 3 0.50 lsl z31.d, z31.d, 
#63 +# CHECK-NEXT: 1 3 0.50 lsl z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: 1 3 0.50 lsl z31.h, z31.h, #15 +# CHECK-NEXT: 1 3 0.50 lsl z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: 1 3 0.50 lsl z31.s, z31.s, #31 +# CHECK-NEXT: 1 3 0.50 lslr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 lslr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 lslr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 lslr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 lsr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 lsr z0.b, p0/m, z0.b, z1.d +# CHECK-NEXT: 1 3 0.50 lsr z0.b, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.b, z1.b, z2.d +# CHECK-NEXT: 1 3 0.50 lsr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 lsr z0.d, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 lsr z0.h, p0/m, z0.h, z1.d +# CHECK-NEXT: 1 3 0.50 lsr z0.h, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.h, z1.h, z2.d +# CHECK-NEXT: 1 3 0.50 lsr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 lsr z0.s, p0/m, z0.s, z1.d +# CHECK-NEXT: 1 3 0.50 lsr z0.s, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 lsr z0.s, z1.s, z2.d +# CHECK-NEXT: 1 3 0.50 lsr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 lsr z31.b, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 lsr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 lsr z31.d, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 lsr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 lsr z31.h, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 lsr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: 1 3 0.50 lsr z31.s, z31.s, #32 +# CHECK-NEXT: 1 3 0.50 lsrr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 lsrr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 lsrr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 lsrr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 mad z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 mad z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 mad 
z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 mad z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 8 4.00 match p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 8 4.00 match p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 8 4.00 match p15.b, p7/z, z30.b, z31.b +# CHECK-NEXT: 1 8 4.00 match p15.h, p7/z, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 mla z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 mla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 mla z0.d, z1.d, z7.d[1] +# CHECK-NEXT: 1 4 0.50 mla z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 mla z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 mla z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 mla z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 mls z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 mls z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 mls z0.d, z1.d, z7.d[1] +# CHECK-NEXT: 1 4 0.50 mls z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 mls z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 mls z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 mls z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/m, p0.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/z, p15.b +# CHECK-NEXT: 1 3 0.50 mov z0.b, #127 +# CHECK-NEXT: 1 3 0.50 mov z0.b, b0 +# CHECK-NEXT: 1 3 0.50 mov z0.b, p0/m, b0 +# CHECK-NEXT: 1 3 0.50 mov z0.b, p0/m, w0 +# CHECK-NEXT: 1 3 0.50 mov z0.b, p0/z, #127 +# CHECK-NEXT: 1 3 0.50 mov z0.b, w0 +# CHECK-NEXT: 1 3 0.50 mov z0.d, #0 +# CHECK-NEXT: 1 4 0.50 mov z0.d, #0xe0000000000003ff +# CHECK-NEXT: 1 4 0.50 mov z0.d, #0xffffffffffff7fff +# CHECK-NEXT: 1 4 0.50 mov z0.d, #32768 +# CHECK-NEXT: 1 3 0.50 mov z0.d, d0 +# CHECK-NEXT: 1 3 0.50 mov z0.d, p0/m, d0 +# CHECK-NEXT: 1 3 0.50 mov z0.d, p0/m, x0 +# CHECK-NEXT: 1 3 0.50 mov z0.d, x0 +# CHECK-NEXT: 1 3 0.50 mov z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 mov z0.h, #-256 +# CHECK-NEXT: 1 3 0.50 mov z0.h, #-32768 +# CHECK-NEXT: 1 3 0.50 
mov z0.h, #0 +# CHECK-NEXT: 1 3 0.50 mov z0.h, #32512 +# CHECK-NEXT: 1 4 0.50 mov z0.h, #32767 +# CHECK-NEXT: 1 3 0.50 mov z0.h, h0 +# CHECK-NEXT: 1 3 0.50 mov z0.h, p0/m, h0 +# CHECK-NEXT: 1 3 0.50 mov z0.h, p0/m, w0 +# CHECK-NEXT: 1 3 0.50 mov z0.h, p0/z, #32512 +# CHECK-NEXT: 1 3 0.50 mov z0.h, w0 +# CHECK-NEXT: 1 3 0.50 mov z0.q, q0 +# CHECK-NEXT: 1 3 0.50 mov z0.s, #0 +# CHECK-NEXT: 1 4 0.50 mov z0.s, #0xffff7fff +# CHECK-NEXT: 1 4 0.50 mov z0.s, #32768 +# CHECK-NEXT: 1 3 0.50 mov z0.s, p0/m, s0 +# CHECK-NEXT: 1 3 0.50 mov z0.s, p0/m, w0 +# CHECK-NEXT: 1 3 0.50 mov z0.s, s0 +# CHECK-NEXT: 1 3 0.50 mov z0.s, w0 +# CHECK-NEXT: 1 3 0.50 mov z21.d, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.d, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.d, #127 +# CHECK-NEXT: 1 3 0.50 mov z21.d, #32512 +# CHECK-NEXT: 1 3 0.50 mov z21.d, p0/z, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.d, p0/z, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.d, p0/z, #127 +# CHECK-NEXT: 1 3 0.50 mov z21.d, p0/z, #32512 +# CHECK-NEXT: 1 3 0.50 mov z21.d, p15/m, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.d, p15/m, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.h, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.h, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.h, #127 +# CHECK-NEXT: 1 3 0.50 mov z21.h, #32512 +# CHECK-NEXT: 1 3 0.50 mov z21.h, p0/z, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.h, p0/z, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.h, p0/z, #127 +# CHECK-NEXT: 1 3 0.50 mov z21.h, p0/z, #32512 +# CHECK-NEXT: 1 3 0.50 mov z21.h, p15/m, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.h, p15/m, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.s, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.s, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.s, #127 +# CHECK-NEXT: 1 3 0.50 mov z21.s, #32512 +# CHECK-NEXT: 1 3 0.50 mov z21.s, p0/z, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.s, p0/z, #-32768 +# CHECK-NEXT: 1 3 0.50 mov z21.s, p0/z, #127 +# CHECK-NEXT: 1 3 0.50 mov z21.s, p0/z, #32512 +# CHECK-NEXT: 1 3 0.50 mov z21.s, p15/m, #-128 +# CHECK-NEXT: 1 3 0.50 mov z21.s, p15/m, #-32768 +# CHECK-NEXT: 
1 2 0.50 mov z31.b, p15/m, z31.b +# CHECK-NEXT: 1 3 0.50 mov z31.b, p7/m, b31 +# CHECK-NEXT: 1 3 0.50 movprfx z31, z6 +# CHECK-NEXT: 1 3 0.50 mov z31.b, p7/m, wsp +# CHECK-NEXT: 1 3 0.50 mov z31.b, wsp +# CHECK-NEXT: 1 3 0.50 mov z31.b, z31.b[63] +# CHECK-NEXT: 1 2 0.50 mov z31.d, p15/m, z31.d +# CHECK-NEXT: 1 3 0.50 mov z31.d, p7/m, d31 +# CHECK-NEXT: 1 3 0.50 movprfx z31.d, p7/z, z6.d +# CHECK-NEXT: 1 3 0.50 mov z31.d, p7/m, sp +# CHECK-NEXT: 1 3 0.50 mov z31.d, sp +# CHECK-NEXT: 1 3 0.50 mov z31.d, z0.d +# CHECK-NEXT: 1 3 0.50 mov z31.d, z31.d[7] +# CHECK-NEXT: 1 2 0.50 mov z31.h, p15/m, z31.h +# CHECK-NEXT: 1 3 0.50 mov z31.h, p7/m, h31 +# CHECK-NEXT: 1 3 0.50 mov z31.h, p7/m, wsp +# CHECK-NEXT: 1 3 0.50 mov z31.h, wsp +# CHECK-NEXT: 1 3 0.50 mov z31.h, z31.h[31] +# CHECK-NEXT: 1 2 0.50 mov z31.s, p15/m, z31.s +# CHECK-NEXT: 1 3 0.50 mov z31.s, p7/m, s31 +# CHECK-NEXT: 1 3 0.50 mov z31.s, p7/m, wsp +# CHECK-NEXT: 1 3 0.50 mov z31.s, wsp +# CHECK-NEXT: 1 3 0.50 mov z31.s, z31.s[15] +# CHECK-NEXT: 1 3 0.50 mov z5.b, #-1 +# CHECK-NEXT: 1 3 0.50 mov z5.b, #-128 +# CHECK-NEXT: 1 3 0.50 mov z5.b, #127 +# CHECK-NEXT: 1 3 0.50 mov z5.b, p0/z, #-1 +# CHECK-NEXT: 1 3 0.50 mov z5.b, p0/z, #-128 +# CHECK-NEXT: 1 3 0.50 mov z5.b, p0/z, #127 +# CHECK-NEXT: 1 3 0.50 mov z5.b, p15/m, #-128 +# CHECK-NEXT: 1 3 0.50 mov z5.d, #-6 +# CHECK-NEXT: 1 3 0.50 mov z5.h, #-6 +# CHECK-NEXT: 1 3 0.50 mov z5.q, z17.q[3] +# CHECK-NEXT: 1 3 0.50 mov z5.s, #-6 +# CHECK-NEXT: 1 2 1.00 movs p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 movs p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 movs p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 movs p15.b, p15/z, p15.b +# CHECK-NEXT: 1 1 1.00 U mrs x3, ID_AA64ZFR0_EL1 +# CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL1 +# CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL12 +# CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL2 +# CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL3 +# CHECK-NEXT: 1 4 0.50 msb z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 msb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 msb z0.h, p7/m, 
z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 msb z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: 1 1 1.00 U msr ZCR_EL1, x3 +# CHECK-NEXT: 1 1 1.00 U msr ZCR_EL12, x3 +# CHECK-NEXT: 1 1 1.00 U msr ZCR_EL2, x3 +# CHECK-NEXT: 1 1 1.00 U msr ZCR_EL3, x3 +# CHECK-NEXT: 1 4 0.50 mul z0.b, p7/m, z0.b, z31.b +# CHECK-NEXT: 1 4 0.50 mul z0.b, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 mul z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 mul z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 4 0.50 mul z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 mul z0.h, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 mul z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 mul z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 mul z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 mul z29.s, z30.s, z31.s +# CHECK-NEXT: 1 4 0.50 mul z31.b, z31.b, #-128 +# CHECK-NEXT: 1 4 0.50 mul z31.b, z31.b, #127 +# CHECK-NEXT: 1 4 0.50 mul z31.d, z31.d, #-128 +# CHECK-NEXT: 1 4 0.50 mul z31.d, z31.d, #127 +# CHECK-NEXT: 1 4 0.50 mul z31.d, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 mul z31.h, z31.h, #-128 +# CHECK-NEXT: 1 4 0.50 mul z31.h, z31.h, #127 +# CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #-128 +# CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #127 +# CHECK-NEXT: 1 2 1.00 nand p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nand p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nands p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nands p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 3 0.50 nbsl z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: 1 3 0.50 neg z0.b, p0/m, z0.b +# CHECK-NEXT: 1 3 0.50 neg z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 neg z0.h, p0/m, z0.h +# CHECK-NEXT: 1 3 0.50 neg z0.s, p0/m, z0.s +# CHECK-NEXT: 1 3 0.50 neg z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 neg z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 neg z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 neg z31.s, p7/m, z31.s +# CHECK-NEXT: 1 8 4.00 nmatch p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 8 4.00 nmatch p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 8 4.00 nmatch p15.b, p7/z, z30.b, z31.b +# CHECK-NEXT: 1 8 4.00 nmatch p15.h, p7/z, 
z30.h, z31.h +# CHECK-NEXT: 1 2 1.00 nor p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nor p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nors p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nors p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 not p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 not p15.b, p15/z, p15.b +# CHECK-NEXT: 1 3 0.50 not z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 not z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 not z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 not z31.s, p7/m, z31.s +# CHECK-NEXT: 1 2 1.00 nots p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 nots p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 orn p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 orn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 orns p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 orns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 orr p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0x6 +# CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: 1 3 0.50 orr z0.s, z0.s, #0x6 +# CHECK-NEXT: 1 3 0.50 orr z0.s, z0.s, #0xfffffff9 +# CHECK-NEXT: 1 3 0.50 orr z23.d, z13.d, z8.d +# CHECK-NEXT: 1 3 0.50 orr z23.h, z23.h, #0x6 +# CHECK-NEXT: 1 3 0.50 orr z23.h, z23.h, #0xfff9 +# CHECK-NEXT: 1 3 0.50 orr z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 orr z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 orr z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 orr z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0x6 +# CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0xf9 +# CHECK-NEXT: 1 2 1.00 orrs p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 4 1.00 orv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 orv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 orv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 orv s0, p7, z31.s +# CHECK-NEXT: 1 1 1.00 pfalse p15.b +# CHECK-NEXT: 1 2 1.00 pfirst p0.b, p15, p0.b +# CHECK-NEXT: 1 2 1.00 pfirst p15.b, p15, p15.b +# CHECK-NEXT: 1 3 0.50 pmul z0.b, z1.b, z2.b +# CHECK-NEXT: 1 3 0.50 pmul z29.b, z30.b, z31.b +# CHECK-NEXT: 1 3 0.50 pmullb 
z0.h, z1.b, z2.b +# CHECK-NEXT: 1 9 7.00 pmullb z29.q, z30.d, z31.d +# CHECK-NEXT: 1 9 7.00 pmullb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 pmullt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 9 7.00 pmullt z29.q, z30.d, z31.d +# CHECK-NEXT: 1 9 7.00 pmullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 pnext p0.b, p15, p0.b +# CHECK-NEXT: 1 2 1.00 pnext p0.d, p15, p0.d +# CHECK-NEXT: 1 2 1.00 pnext p0.h, p15, p0.h +# CHECK-NEXT: 1 2 1.00 pnext p0.s, p15, p0.s +# CHECK-NEXT: 1 2 1.00 pnext p15.b, p15, p15.b +# CHECK-NEXT: 1 0 0.50 * * U prfb #14, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb #15, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb #6, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb #7, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb #7, p3, [z13.s, #31] +# CHECK-NEXT: 1 0 0.50 * * U prfb #7, p3, [z13.s] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1keep, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1keep, p0, [x0, z0.d] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1keep, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl3strm, p5, [x10, z21.d, sxtw] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl3strm, p5, [x10, z21.s, uxtw] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl3strm, p5, [z10.d, #31] +# CHECK-NEXT: 1 0 0.50 * * U prfb pldl3strm, p5, [z10.d] +# CHECK-NEXT: 1 0 0.50 * * U prfb pstl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pstl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pstl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pstl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfb pstl3keep, p0, [x0] 
+# CHECK-NEXT: 1 0 0.50 * * U prfb pstl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd #14, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd #15, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd #15, p7, [z31.d, #248] +# CHECK-NEXT: 1 0 0.50 * * U prfd #15, p7, [z31.d] +# CHECK-NEXT: 1 0 0.50 * * U prfd #15, p7, [z31.s, #248] +# CHECK-NEXT: 1 0 0.50 * * U prfd #15, p7, [z31.s] +# CHECK-NEXT: 1 0 0.50 * * U prfd #6, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd #7, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1keep, p0, [x0, z0.d, lsl #3] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1keep, p0, [x0, z0.d, sxtw #3] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1keep, p0, [x0, z0.d, uxtw #3] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1keep, p0, [x0, z0.s, sxtw #3] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1keep, p0, [x0, z0.s, uxtw #3] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pldl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pstl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pstl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pstl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pstl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pstl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfd pstl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh #14, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh #15, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh #15, p7, [z31.d, #62] +# CHECK-NEXT: 1 0 0.50 * * U prfh #15, p7, [z31.d] +# CHECK-NEXT: 1 0 0.50 * * U prfh #15, p7, [z31.s, #62] +# CHECK-NEXT: 1 0 0.50 * * U prfh #15, p7, [z31.s] +# CHECK-NEXT: 1 0 0.50 * * U prfh #6, p0, [x0] +# CHECK-NEXT: 1 0 
0.50 * * U prfh #7, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl1keep, p0, [x0, z0.d, lsl #1] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl3strm, p5, [x10, z21.d, sxtw #1] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl3strm, p5, [x10, z21.d, uxtw #1] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl3strm, p5, [x10, z21.s, sxtw #1] +# CHECK-NEXT: 1 0 0.50 * * U prfh pldl3strm, p5, [x10, z21.s, uxtw #1] +# CHECK-NEXT: 1 0 0.50 * * U prfh pstl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pstl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pstl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pstl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pstl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfh pstl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw #14, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw #15, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw #15, p7, [z31.d, #124] +# CHECK-NEXT: 1 0 0.50 * * U prfw #15, p7, [z31.d] +# CHECK-NEXT: 1 0 0.50 * * U prfw #15, p7, [z31.s, #124] +# CHECK-NEXT: 1 0 0.50 * * U prfw #15, p7, [z31.s] +# CHECK-NEXT: 1 0 0.50 * * U prfw #6, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw #7, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw #7, p3, [x13, z8.d, uxtw #2] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl1keep, p0, [x0, z0.d, sxtw #2] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl1keep, p0, [x0, z0.s, uxtw #2] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl1strm, p0, [x0, #31, mul vl] +# 
CHECK-NEXT: 1 0 0.50 * * U prfw pldl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl3strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl3strm, p5, [x10, z21.d, lsl #2] +# CHECK-NEXT: 1 0 0.50 * * U prfw pldl3strm, p5, [x10, z21.s, sxtw #2] +# CHECK-NEXT: 1 0 0.50 * * U prfw pstl1keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pstl1strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pstl2keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pstl2strm, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pstl3keep, p0, [x0] +# CHECK-NEXT: 1 0 0.50 * * U prfw pstl3strm, p0, [x0] +# CHECK-NEXT: 1 1 1.00 ptest p15, p0.b +# CHECK-NEXT: 1 1 1.00 ptest p15, p15.b +# CHECK-NEXT: 1 1 1.00 ptrue p0.b, pow2 +# CHECK-NEXT: 1 1 1.00 ptrue p0.d, pow2 +# CHECK-NEXT: 1 1 1.00 ptrue p0.h, pow2 +# CHECK-NEXT: 1 1 1.00 ptrue p0.s, pow2 +# CHECK-NEXT: 1 1 1.00 ptrue p15.b +# CHECK-NEXT: 1 1 1.00 ptrue p15.d +# CHECK-NEXT: 1 1 1.00 ptrue p15.h +# CHECK-NEXT: 1 1 1.00 ptrue p15.s +# CHECK-NEXT: 1 1 1.00 ptrue p7.s +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #14 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #15 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #16 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #17 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #18 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #19 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #20 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #21 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #22 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #23 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #24 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #25 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #26 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #27 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, #28 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, mul3 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, mul4 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl1 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl128 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl16 +# CHECK-NEXT: 1 1 1.00 
ptrue p7.s, vl2 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl256 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl3 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl32 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl4 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl5 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl6 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl64 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl7 +# CHECK-NEXT: 1 1 1.00 ptrue p7.s, vl8 +# CHECK-NEXT: 1 2 1.00 ptrues p0.b, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.d, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.h, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.s, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p15.b +# CHECK-NEXT: 1 2 1.00 ptrues p15.d +# CHECK-NEXT: 1 2 1.00 ptrues p15.h +# CHECK-NEXT: 1 2 1.00 ptrues p15.s +# CHECK-NEXT: 1 2 1.00 ptrues p7.s +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #14 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #15 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #16 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #17 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #18 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #19 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #20 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #21 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #22 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #23 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #24 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #25 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #26 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #27 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #28 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul3 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul4 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl1 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl128 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl16 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl2 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl256 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl3 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl32 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl4 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl5 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl6 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl64 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl7 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl8 +# CHECK-NEXT: 1 1 1.00 
punpkhi p0.h, p0.b +# CHECK-NEXT: 1 1 1.00 punpkhi p15.h, p15.b +# CHECK-NEXT: 1 1 1.00 punpklo p0.h, p0.b +# CHECK-NEXT: 1 1 1.00 punpklo p15.h, p15.b +# CHECK-NEXT: 1 6 3.00 raddhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 6 3.00 raddhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 6 3.00 raddhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 6 3.00 raddhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 6 3.00 raddhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 6 3.00 raddhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 rax1 z0.d, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 rbit z0.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 rbit z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 rbit z0.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 rbit z0.s, p7/m, z31.s +# CHECK-NEXT: 1 1 1.00 * U rdffr p0.b +# CHECK-NEXT: 1 3 1.00 * U rdffr p0.b, p0/z +# CHECK-NEXT: 1 1 1.00 * U rdffr p15.b +# CHECK-NEXT: 1 3 1.00 * U rdffr p15.b, p15/z +# CHECK-NEXT: 1 3 1.00 U rdffrs p0.b, p0/z +# CHECK-NEXT: 1 3 1.00 U rdffrs p15.b, p15/z +# CHECK-NEXT: 1 1 0.50 rdvl x0, #0 +# CHECK-NEXT: 1 1 0.50 rdvl x21, #-32 +# CHECK-NEXT: 1 1 0.50 rdvl x23, #31 +# CHECK-NEXT: 1 1 0.50 rdvl xzr, #-1 +# CHECK-NEXT: 1 1 1.00 rev p0.b, p1.b +# CHECK-NEXT: 1 1 1.00 rev p0.d, p1.d +# CHECK-NEXT: 1 1 1.00 rev p0.h, p1.h +# CHECK-NEXT: 1 1 1.00 rev p0.s, p1.s +# CHECK-NEXT: 1 3 0.50 rev z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 rev z0.d, z31.d +# CHECK-NEXT: 1 3 0.50 rev z0.h, z31.h +# CHECK-NEXT: 1 3 0.50 rev z0.s, z31.s +# CHECK-NEXT: 1 3 0.50 revb z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 revb z0.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 revb z0.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 revh z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 revh z0.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 revw z0.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 rshrnb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 rshrnb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 rshrnb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 rshrnb z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 rshrnb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 rshrnb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 
0.50 rshrnt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 rshrnt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 rshrnt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 rshrnt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 rshrnt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 rshrnt z31.s, z31.d, #32 +# CHECK-NEXT: 1 6 3.00 rsubhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 6 3.00 rsubhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 6 3.00 rsubhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 6 3.00 rsubhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 6 3.00 rsubhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 6 3.00 rsubhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 5 3.00 saba z0.b, z1.b, z31.b +# CHECK-NEXT: 1 5 3.00 saba z0.d, z1.d, z31.d +# CHECK-NEXT: 1 5 3.00 saba z0.h, z1.h, z31.h +# CHECK-NEXT: 1 5 3.00 saba z0.s, z1.s, z31.s +# CHECK-NEXT: 1 5 3.00 sabalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 5 3.00 sabalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 5 3.00 sabalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 5 3.00 sabalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 5 3.00 sabalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 5 3.00 sabalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 sabd z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 sabd z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 sabd z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 sabd z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 sabdlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 3 0.50 sabdlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 3 0.50 sabdlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 sabdlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 3 0.50 sabdlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 3 0.50 sabdlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 6 4.00 sadalp z0.h, p0/m, z1.b +# CHECK-NEXT: 1 6 4.00 sadalp z29.s, p0/m, z30.h +# CHECK-NEXT: 1 6 4.00 sadalp z30.d, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 saddlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 saddlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 saddlbt 
z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 saddlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 saddwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 saddwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 saddwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 saddwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 saddwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 saddwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 sbclb z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 sbclb z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sbclt z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 sbclt z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 scvtf z0.d, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 scvtf z0.d, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 scvtf z0.h, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 scvtf z0.h, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 scvtf z0.h, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 scvtf z0.s, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 scvtf z0.s, p0/m, z0.s +# CHECK-NEXT: 1 26 23.00 sdiv z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 15 12.00 sdiv z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 26 23.00 sdivr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 15 12.00 sdivr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 sdot z0.d, z1.h, z15.h[1] +# CHECK-NEXT: 1 4 0.50 sdot z0.d, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sdot z0.s, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sdot z0.s, z1.b, z7.b[3] +# CHECK-NEXT: 1 2 1.00 sel p0.b, p1, p2.b, p3.b +# CHECK-NEXT: 1 2 0.50 sel z23.b, p11, z13.b, z8.b +# CHECK-NEXT: 1 2 0.50 sel z23.d, p11, z13.d, z8.d +# CHECK-NEXT: 1 2 0.50 sel z23.h, p11, z13.h, z8.h +# CHECK-NEXT: 1 2 0.50 sel z23.s, p11, z13.s, z8.s +# CHECK-NEXT: 1 1 1.00 * U setffr +# CHECK-NEXT: 1 3 0.50 shadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 shadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 shadd z29.s, p7/m, z29.s, z30.s +# 
CHECK-NEXT: 1 3 0.50 shadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 shrnb z0.b, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 shrnb z0.h, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 shrnb z0.s, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 shrnb z31.b, z31.h, #8 +# CHECK-NEXT: 1 3 0.50 shrnb z31.h, z31.s, #16 +# CHECK-NEXT: 1 3 0.50 shrnb z31.s, z31.d, #32 +# CHECK-NEXT: 1 3 0.50 shrnt z0.b, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 shrnt z0.h, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 shrnt z0.s, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 shrnt z31.b, z31.h, #8 +# CHECK-NEXT: 1 3 0.50 shrnt z31.h, z31.s, #16 +# CHECK-NEXT: 1 3 0.50 shrnt z31.s, z31.d, #32 +# CHECK-NEXT: 1 3 0.50 shsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 shsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 shsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 shsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 shsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 shsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 shsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 shsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 sli z0.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 sli z0.d, z0.d, #0 +# CHECK-NEXT: 1 3 0.50 sli z0.h, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 sli z0.s, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 sli z31.b, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 sli z31.d, z31.d, #63 +# CHECK-NEXT: 1 3 0.50 sli z31.h, z31.h, #15 +# CHECK-NEXT: 1 3 0.50 sli z31.s, z31.s, #31 +# CHECK-NEXT: 1 9 7.00 sm4e z0.s, z0.s, z31.s +# CHECK-NEXT: 1 9 7.00 sm4ekey z0.s, z1.s, z31.s +# CHECK-NEXT: 1 3 0.50 smax z0.b, z0.b, #-128 +# CHECK-NEXT: 1 3 0.50 smax z0.d, z0.d, #-128 +# CHECK-NEXT: 1 3 0.50 smax z0.h, z0.h, #-128 +# CHECK-NEXT: 1 3 0.50 smax z0.s, z0.s, #-128 +# CHECK-NEXT: 1 3 0.50 smax z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 smax z31.b, z31.b, #127 +# CHECK-NEXT: 1 3 0.50 smax z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 smax z31.d, z31.d, #127 +# CHECK-NEXT: 1 3 0.50 smax z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 smax z31.h, z31.h, #127 
+# CHECK-NEXT: 1 3 0.50 smax z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 smax z31.s, z31.s, #127 +# CHECK-NEXT: 1 3 0.50 smaxp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 smaxp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 smaxp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 smaxp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 smaxv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 smaxv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 smaxv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 smaxv s0, p7, z31.s +# CHECK-NEXT: 1 3 0.50 smin z0.b, z0.b, #-128 +# CHECK-NEXT: 1 3 0.50 smin z0.d, z0.d, #-128 +# CHECK-NEXT: 1 3 0.50 smin z0.h, z0.h, #-128 +# CHECK-NEXT: 1 3 0.50 smin z0.s, z0.s, #-128 +# CHECK-NEXT: 1 3 0.50 smin z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 smin z31.b, z31.b, #127 +# CHECK-NEXT: 1 3 0.50 smin z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 smin z31.d, z31.d, #127 +# CHECK-NEXT: 1 3 0.50 smin z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 smin z31.h, z31.h, #127 +# CHECK-NEXT: 1 3 0.50 smin z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 smin z31.s, z31.s, #127 +# CHECK-NEXT: 1 3 0.50 sminp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 sminp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 sminp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 sminp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 sminv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 sminv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 sminv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 sminv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 smlalb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 smlalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 smlalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 smlalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 smlalb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 smlalt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 smlalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 smlalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 smlalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 smlalt z0.s, z1.h, z7.h[7] +# 
CHECK-NEXT: 1 4 0.50 smlslb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 smlslb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 smlslb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 smlslb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 smlslb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 smlslt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 smlslt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 smlslt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 smlslt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 smlslt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 smulh z0.b, p7/m, z0.b, z31.b +# CHECK-NEXT: 1 4 0.50 smulh z0.b, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 smulh z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 4 0.50 smulh z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 smulh z0.h, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 smulh z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 smulh z29.s, z30.s, z31.s +# CHECK-NEXT: 1 4 0.50 smulh z31.d, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 smullb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 smullb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 smullb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 smullb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 smullb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 smullt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 smullt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 smullt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 smullt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 smullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 splice z29.b, p7, { z30.b, z31.b } +# CHECK-NEXT: 1 4 0.50 splice z29.d, p7, { z30.d, z31.d } +# CHECK-NEXT: 1 4 0.50 splice z29.h, p7, { z30.h, z31.h } +# CHECK-NEXT: 1 4 0.50 splice z29.s, p7, { z30.s, z31.s } +# CHECK-NEXT: 1 4 0.50 splice z31.b, p7, z31.b, z31.b +# CHECK-NEXT: 1 4 0.50 splice z31.d, p7, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 splice z31.h, p7, z31.h, z31.h +# CHECK-NEXT: 1 4 0.50 splice z31.s, p7, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqabs z31.b, p7/m, z31.b +# CHECK-NEXT: 1 4 0.50 sqabs 
z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 sqabs z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 sqabs z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 sqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqadd z0.b, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 sqadd z0.b, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 sqadd z0.d, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 sqadd z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 sqadd z0.d, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 sqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqadd z0.h, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 sqadd z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 sqadd z0.h, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 sqadd z0.s, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 sqadd z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 sqadd z0.s, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 sqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqadd z31.b, z31.b, #255 +# CHECK-NEXT: 1 4 0.50 sqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqadd z31.d, z31.d, #65280 +# CHECK-NEXT: 1 4 0.50 sqadd z31.h, z31.h, #65280 +# CHECK-NEXT: 1 4 0.50 sqadd z31.s, z31.s, #65280 +# CHECK-NEXT: 1 4 0.50 sqcadd z0.b, z0.b, z0.b, #90 +# CHECK-NEXT: 1 4 0.50 sqcadd z0.d, z0.d, z0.d, #90 +# CHECK-NEXT: 1 4 0.50 sqcadd z0.h, z0.h, z0.h, #90 +# CHECK-NEXT: 1 4 0.50 sqcadd z0.s, z0.s, z0.s, #90 +# CHECK-NEXT: 1 4 0.50 sqcadd z31.b, z31.b, z31.b, #270 +# CHECK-NEXT: 1 4 0.50 sqcadd z31.d, z31.d, z31.d, #270 +# CHECK-NEXT: 1 4 0.50 sqcadd z31.h, z31.h, z31.h, #270 +# CHECK-NEXT: 1 4 0.50 sqcadd z31.s, z31.s, z31.s, #270 +# CHECK-NEXT: 1 5 1.00 sqdecb x0 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, #14 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, w0 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdecb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecd x0 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, #14 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, 
all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, w0 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdecd x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqdecd z0.d +# CHECK-NEXT: 1 4 0.50 sqdecd z0.d, all, mul #16 +# CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2 +# CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdech x0 +# CHECK-NEXT: 1 5 1.00 sqdech x0, #14 +# CHECK-NEXT: 1 5 1.00 sqdech x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdech x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdech x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqdech x0, w0 +# CHECK-NEXT: 1 5 1.00 sqdech x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdech x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdech x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqdech z0.h +# CHECK-NEXT: 1 4 0.50 sqdech z0.h, all, mul #16 +# CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2 +# CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2, mul #16 +# CHECK-NEXT: 1 2 1.00 sqdecp x0, p0.b +# CHECK-NEXT: 1 2 1.00 sqdecp x0, p0.d +# CHECK-NEXT: 1 2 1.00 sqdecp x0, p0.h +# CHECK-NEXT: 1 2 1.00 sqdecp x0, p0.s +# CHECK-NEXT: 1 1 1.00 sqdecp xzr, p15.b, wzr +# CHECK-NEXT: 1 1 1.00 sqdecp xzr, p15.d, wzr +# CHECK-NEXT: 1 1 1.00 sqdecp xzr, p15.h, wzr +# CHECK-NEXT: 1 1 1.00 sqdecp xzr, p15.s, wzr +# CHECK-NEXT: 1 4 0.50 sqdecp z0.d, p0.d +# CHECK-NEXT: 1 4 0.50 sqdecp z0.h, p0.h +# CHECK-NEXT: 1 4 0.50 sqdecp z0.s, p0.s +# CHECK-NEXT: 1 5 1.00 sqdecw x0 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, #14 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, w0 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqdecw x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqdecw z0.s +# CHECK-NEXT: 1 4 0.50 sqdecw z0.s, all, mul #16 +# CHECK-NEXT: 1 4 0.50 
sqdecw z0.s, pow2 +# CHECK-NEXT: 1 4 0.50 sqdecw z0.s, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqdmlalb z0.d, z1.s, z15.s[3] +# CHECK-NEXT: 1 4 0.50 sqdmlalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmlalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqdmlalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmlalb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmlalbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmlalbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqdmlalbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmlalt z0.d, z1.s, z15.s[3] +# CHECK-NEXT: 1 4 0.50 sqdmlalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmlalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqdmlalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmlalt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmlslb z0.d, z1.s, z15.s[3] +# CHECK-NEXT: 1 4 0.50 sqdmlslb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmlslb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqdmlslb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmlslb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmlslbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmlslbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqdmlslbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmlslt z0.d, z1.s, z15.s[3] +# CHECK-NEXT: 1 4 0.50 sqdmlslt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmlslt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqdmlslt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmlslt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmulh z0.b, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 sqdmulh z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 4 0.50 sqdmulh z0.h, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 sqdmulh z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmulh z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 sqdmulh z29.s, z30.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmulh z31.d, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 sqdmullb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmullb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 sqdmullb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmullb z29.s, z30.h, z31.h +# 
CHECK-NEXT: 1 4 0.50 sqdmullb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqdmullt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmullt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 sqdmullt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqdmullt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqdmullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 5 1.00 sqincb x0 +# CHECK-NEXT: 1 5 1.00 sqincb x0, #14 +# CHECK-NEXT: 1 5 1.00 sqincb x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincb x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqincb x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqincb x0, w0 +# CHECK-NEXT: 1 5 1.00 sqincb x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincb x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqincb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincd x0 +# CHECK-NEXT: 1 5 1.00 sqincd x0, #14 +# CHECK-NEXT: 1 5 1.00 sqincd x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincd x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqincd x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqincd x0, w0 +# CHECK-NEXT: 1 5 1.00 sqincd x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincd x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqincd x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqincd z0.d +# CHECK-NEXT: 1 4 0.50 sqincd z0.d, all, mul #16 +# CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2 +# CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 sqinch x0 +# CHECK-NEXT: 1 5 1.00 sqinch x0, #14 +# CHECK-NEXT: 1 5 1.00 sqinch x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqinch x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqinch x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqinch x0, w0 +# CHECK-NEXT: 1 5 1.00 sqinch x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqinch x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqinch x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqinch z0.h +# CHECK-NEXT: 1 4 0.50 sqinch z0.h, all, mul #16 +# CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2 +# CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2, mul #16 +# CHECK-NEXT: 1 2 1.00 sqincp x0, p0.b +# CHECK-NEXT: 1 2 1.00 sqincp x0, p0.d +# CHECK-NEXT: 1 2 1.00 sqincp x0, p0.h +# CHECK-NEXT: 1 2 1.00 sqincp x0, p0.s +# 
CHECK-NEXT: 1 1 1.00 sqincp xzr, p15.b, wzr +# CHECK-NEXT: 1 1 1.00 sqincp xzr, p15.d, wzr +# CHECK-NEXT: 1 1 1.00 sqincp xzr, p15.h, wzr +# CHECK-NEXT: 1 1 1.00 sqincp xzr, p15.s, wzr +# CHECK-NEXT: 1 4 0.50 sqincp z0.d, p0.d +# CHECK-NEXT: 1 4 0.50 sqincp z0.h, p0.h +# CHECK-NEXT: 1 4 0.50 sqincp z0.s, p0.s +# CHECK-NEXT: 1 5 1.00 sqincw x0 +# CHECK-NEXT: 1 5 1.00 sqincw x0, #14 +# CHECK-NEXT: 1 5 1.00 sqincw x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincw x0, pow2 +# CHECK-NEXT: 1 5 1.00 sqincw x0, vl1 +# CHECK-NEXT: 1 5 1.00 sqincw x0, w0 +# CHECK-NEXT: 1 5 1.00 sqincw x0, w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 sqincw x0, w0, pow2 +# CHECK-NEXT: 1 5 1.00 sqincw x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqincw z0.s +# CHECK-NEXT: 1 4 0.50 sqincw z0.s, all, mul #16 +# CHECK-NEXT: 1 4 0.50 sqincw z0.s, pow2 +# CHECK-NEXT: 1 4 0.50 sqincw z0.s, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 sqneg z31.b, p7/m, z31.b +# CHECK-NEXT: 1 4 0.50 sqneg z31.d, p7/m, z31.d +# CHECK-NEXT: 1 4 0.50 sqneg z31.h, p7/m, z31.h +# CHECK-NEXT: 1 4 0.50 sqneg z31.s, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z0.b, z1.b, z2.b, #0 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z0.d, z1.d, z2.d, #0 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z0.h, z1.h, z2.h, #0 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z0.h, z1.h, z2.h[0], #0 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z0.s, z1.s, z2.s, #0 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z0.s, z1.s, z2.s[0], #0 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z15.b, z16.b, z17.b, #270 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z15.d, z16.d, z17.d, #270 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z15.h, z16.h, z17.h, #270 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z15.s, z16.s, z17.s, #270 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z29.b, z30.b, z31.b, #90 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z29.d, z30.d, z31.d, #90 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z29.h, z30.h, z31.h, #90 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z29.s, z30.s, z31.s, #90 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z31.b, z31.b, z31.b, #180 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z31.d, z31.d, z31.d, 
#180 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z31.h, z30.h, z7.h[0], #180 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z31.h, z31.h, z31.h, #180 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z31.s, z30.s, z7.s[0], #180 +# CHECK-NEXT: 1 4 0.50 sqrdcmlah z31.s, z31.s, z31.s, #180 +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.b, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqrdmlah z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.b, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.d, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.h, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.s, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqrdmlsh z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh z0.b, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 sqrdmulh z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 4 0.50 sqrdmulh z0.h, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 sqrdmulh z0.h, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 sqrdmulh z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh z29.s, z30.s, z31.s +# CHECK-NEXT: 1 4 0.50 sqrdmulh z31.d, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 sqrshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqrshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqrshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqrshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqrshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqrshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqrshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqrshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqrshrnb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrnb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrnb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrnb z31.b, z31.h, #8 +# CHECK-NEXT: 
1 4 0.50 sqrshrnb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqrshrnb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqrshrnt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrnt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrnt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrnt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqrshrnt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqrshrnt z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqrshrunb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrunb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrunb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrunb z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqrshrunb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqrshrunb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqrshrunt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrunt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrunt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqrshrunt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqrshrunt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqrshrunt z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqshl z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 sqshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqshl z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 sqshl z0.h, p0/m, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 sqshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqshl z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 sqshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqshl z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: 1 4 0.50 sqshl z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: 1 4 0.50 sqshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqshl z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: 1 4 0.50 sqshl z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: 1 4 0.50 sqshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqshlu z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 sqshlu z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 sqshlu z0.h, p0/m, z0.h, 
#0 +# CHECK-NEXT: 1 4 0.50 sqshlu z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 sqshlu z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: 1 4 0.50 sqshlu z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: 1 4 0.50 sqshlu z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: 1 4 0.50 sqshlu z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: 1 4 0.50 sqshrnb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqshrnb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqshrnb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqshrnb z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqshrnb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqshrnb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqshrnt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqshrnt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqshrnt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqshrnt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqshrnt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqshrnt z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqshrunb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqshrunb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqshrunb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqshrunb z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqshrunb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqshrunb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqshrunt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 sqshrunt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 sqshrunt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 sqshrunt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 sqshrunt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 sqshrunt z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 sqsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqsub z0.b, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 sqsub z0.b, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 sqsub z0.d, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 sqsub z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 sqsub z0.d, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 sqsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqsub z0.h, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 sqsub z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 sqsub z0.h, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 sqsub z0.s, z0.s, #0 +# CHECK-NEXT: 1 4 
0.50 sqsub z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 sqsub z0.s, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 sqsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqsub z31.b, z31.b, #255 +# CHECK-NEXT: 1 4 0.50 sqsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqsub z31.d, z31.d, #65280 +# CHECK-NEXT: 1 4 0.50 sqsub z31.h, z31.h, #65280 +# CHECK-NEXT: 1 4 0.50 sqsub z31.s, z31.s, #65280 +# CHECK-NEXT: 1 4 0.50 sqsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 sqsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 sqsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 sqsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 sqxtnb z0.b, z31.h +# CHECK-NEXT: 1 4 0.50 sqxtnb z0.h, z31.s +# CHECK-NEXT: 1 4 0.50 sqxtnb z0.s, z31.d +# CHECK-NEXT: 1 4 0.50 sqxtnt z0.b, z31.h +# CHECK-NEXT: 1 4 0.50 sqxtnt z0.h, z31.s +# CHECK-NEXT: 1 4 0.50 sqxtnt z0.s, z31.d +# CHECK-NEXT: 1 4 0.50 sqxtunb z0.b, z31.h +# CHECK-NEXT: 1 4 0.50 sqxtunb z0.h, z31.s +# CHECK-NEXT: 1 4 0.50 sqxtunb z0.s, z31.d +# CHECK-NEXT: 1 4 0.50 sqxtunt z0.b, z31.h +# CHECK-NEXT: 1 4 0.50 sqxtunt z0.h, z31.s +# CHECK-NEXT: 1 4 0.50 sqxtunt z0.s, z31.d +# CHECK-NEXT: 1 3 0.50 srhadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 srhadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 srhadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 srhadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 sri z0.b, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 sri z0.d, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 sri z0.h, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 sri z0.s, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 sri z31.b, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 sri z31.d, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 sri z31.h, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 sri z31.s, z31.s, #32 +# CHECK-NEXT: 1 4 0.50 srshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 srshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 srshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 srshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 srshlr z0.b, p0/m, z0.b, z1.b +# 
CHECK-NEXT: 1 4 0.50 srshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 srshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 srshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 srshr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: 1 4 0.50 srshr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 srshr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 srshr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 srshr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: 1 4 0.50 srshr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: 1 4 0.50 srshr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: 1 4 0.50 srshr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: 1 5 3.00 srsra z0.b, z0.b, #1 +# CHECK-NEXT: 1 5 3.00 srsra z0.d, z0.d, #1 +# CHECK-NEXT: 1 5 3.00 srsra z0.h, z0.h, #1 +# CHECK-NEXT: 1 5 3.00 srsra z0.s, z0.s, #1 +# CHECK-NEXT: 1 5 3.00 srsra z31.b, z31.b, #8 +# CHECK-NEXT: 1 5 3.00 srsra z31.d, z31.d, #64 +# CHECK-NEXT: 1 5 3.00 srsra z31.h, z31.h, #16 +# CHECK-NEXT: 1 5 3.00 srsra z31.s, z31.s, #32 +# CHECK-NEXT: 1 3 0.50 sshllb z0.d, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 sshllb z0.h, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 sshllb z0.s, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 sshllb z31.d, z31.s, #31 +# CHECK-NEXT: 1 3 0.50 sshllb z31.h, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 sshllb z31.s, z31.h, #15 +# CHECK-NEXT: 1 3 0.50 sshllt z0.d, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 sshllt z0.h, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 sshllt z0.s, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 sshllt z31.d, z31.s, #31 +# CHECK-NEXT: 1 3 0.50 sshllt z31.h, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 sshllt z31.s, z31.h, #15 +# CHECK-NEXT: 1 3 0.50 ssra z0.b, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 ssra z0.d, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 ssra z0.h, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 ssra z0.s, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 ssra z31.b, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 ssra z31.d, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 ssra z31.h, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 ssra z31.s, z31.s, #32 +# CHECK-NEXT: 1 4 0.50 ssublb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 ssublb z29.s, z30.h, z31.h +# CHECK-NEXT: 
1 4 0.50 ssublb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssublbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssublbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 ssublbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 ssublt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssubltb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssubltb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 ssubltb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 ssubwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 ssubwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 ssubwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0, x0] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.d }, p0, [x0, x0] +# CHECK-NEXT: 1 1 7.00 * st1b { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: 1 1 7.00 * st1b { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: 1 1 7.00 * st1b { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.d }, p0, [x0] +# CHECK-NEXT: 1 1 7.00 * st1b { z0.d }, p7, [z0.d] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.h }, p0, [x0, x0] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.h }, p0, [x0] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.s }, p0, [x0, x0] +# CHECK-NEXT: 1 1 9.00 * st1b { z0.s }, p0, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 1 9.00 * st1b { z0.s }, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 1 1.00 * st1b { z0.s }, p0, [x0] +# CHECK-NEXT: 1 1 9.00 * st1b { z0.s }, p7, [z0.s] +# CHECK-NEXT: 1 1 1.00 * st1b { z21.b }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1b { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1b { z21.h }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1b { z21.s }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1b { z31.b }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1b { z31.d }, p7, [sp, #-1, mul vl] +# 
CHECK-NEXT: 1 1 7.00 * st1b { z31.d }, p7, [z31.d, #31] +# CHECK-NEXT: 1 1 1.00 * st1b { z31.h }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1b { z31.s }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 9.00 * st1b { z31.s }, p7, [z31.s, #31] +# CHECK-NEXT: 1 1 1.00 * st1d { z0.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p0, [x0, z0.d, lsl #3] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p0, [x0, z0.d, sxtw #3] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p0, [x0, z0.d, uxtw #3] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: 1 1 1.00 * st1d { z0.d }, p0, [x0] +# CHECK-NEXT: 1 1 7.00 * st1d { z0.d }, p7, [z0.d] +# CHECK-NEXT: 1 1 1.00 * st1d { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1d { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 7.00 * st1d { z31.d }, p7, [z31.d, #248] +# CHECK-NEXT: 1 1 1.00 * st1h { z0.d }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p0, [x0, z0.d, lsl #1] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p0, [x0, z0.d, sxtw #1] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p0, [x0, z0.d, uxtw #1] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: 1 1 1.00 * st1h { z0.d }, p0, [x0] +# CHECK-NEXT: 1 1 7.00 * st1h { z0.d }, p7, [z0.d] +# CHECK-NEXT: 1 1 1.00 * st1h { z0.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 1.00 * st1h { z0.h }, p0, [x0] +# CHECK-NEXT: 1 1 1.00 * st1h { z0.s }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 9.00 * st1h { z0.s }, p0, [x0, z0.s, sxtw #1] +# CHECK-NEXT: 1 1 9.00 * st1h { z0.s }, p0, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 1 9.00 * st1h { z0.s }, p0, [x0, z0.s, uxtw #1] +# CHECK-NEXT: 1 1 9.00 * st1h { z0.s }, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 1 1.00 * st1h { z0.s }, p0, [x0] +# CHECK-NEXT: 
1 1 9.00 * st1h { z0.s }, p7, [z0.s] +# CHECK-NEXT: 1 1 1.00 * st1h { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1h { z21.h }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1h { z21.s }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1h { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 7.00 * st1h { z31.d }, p7, [z31.d, #62] +# CHECK-NEXT: 1 1 1.00 * st1h { z31.h }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1h { z31.s }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 9.00 * st1h { z31.s }, p7, [z31.s, #62] +# CHECK-NEXT: 1 1 1.00 * st1w { z0.d }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p0, [x0, z0.d, lsl #2] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p0, [x0, z0.d, sxtw #2] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p0, [x0, z0.d, uxtw #2] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: 1 1 1.00 * st1w { z0.d }, p0, [x0] +# CHECK-NEXT: 1 1 7.00 * st1w { z0.d }, p7, [z0.d] +# CHECK-NEXT: 1 1 1.00 * st1w { z0.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: 1 1 9.00 * st1w { z0.s }, p0, [x0, z0.s, sxtw #2] +# CHECK-NEXT: 1 1 9.00 * st1w { z0.s }, p0, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 1 9.00 * st1w { z0.s }, p0, [x0, z0.s, uxtw #2] +# CHECK-NEXT: 1 1 9.00 * st1w { z0.s }, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 1 1.00 * st1w { z0.s }, p0, [x0] +# CHECK-NEXT: 1 1 9.00 * st1w { z0.s }, p7, [z0.s] +# CHECK-NEXT: 1 1 1.00 * st1w { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1w { z21.s }, p5, [x10, #5, mul vl] +# CHECK-NEXT: 1 1 1.00 * st1w { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 7.00 * st1w { z31.d }, p7, [z31.d, #124] +# CHECK-NEXT: 1 1 1.00 * st1w { z31.s }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: 1 1 9.00 * st1w { z31.s }, p7, [z31.s, #124] +# CHECK-NEXT: 1 1 2.00 * st2b { z0.b, z1.b }, p0, [x0, x0] +# CHECK-NEXT: 1 1 2.00 * st2b { z0.b, z1.b }, p0, [x0] +# 
CHECK-NEXT: 1 1 2.00 * st2b { z21.b, z22.b }, p5, [x10, #10, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2b { z23.b, z24.b }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2b { z5.b, z6.b }, p3, [x17, x16] +# CHECK-NEXT: 1 1 2.00 * st2d { z0.d, z1.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: 1 1 2.00 * st2d { z0.d, z1.d }, p0, [x0] +# CHECK-NEXT: 1 1 2.00 * st2d { z21.d, z22.d }, p5, [x10, #10, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2d { z23.d, z24.d }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2d { z5.d, z6.d }, p3, [x17, x16, lsl #3] +# CHECK-NEXT: 1 1 2.00 * st2h { z0.h, z1.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 2.00 * st2h { z0.h, z1.h }, p0, [x0] +# CHECK-NEXT: 1 1 2.00 * st2h { z21.h, z22.h }, p5, [x10, #10, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2h { z23.h, z24.h }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2h { z5.h, z6.h }, p3, [x17, x16, lsl #1] +# CHECK-NEXT: 1 1 2.00 * st2w { z0.s, z1.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: 1 1 2.00 * st2w { z0.s, z1.s }, p0, [x0] +# CHECK-NEXT: 1 1 2.00 * st2w { z21.s, z22.s }, p5, [x10, #10, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2w { z23.s, z24.s }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: 1 1 2.00 * st2w { z5.s, z6.s }, p3, [x17, x16, lsl #2] +# CHECK-NEXT: 1 1 6.00 * st3b { z0.b - z2.b }, p0, [x0, x0] +# CHECK-NEXT: 1 1 6.00 * st3b { z0.b - z2.b }, p0, [x0] +# CHECK-NEXT: 1 1 6.00 * st3b { z21.b - z23.b }, p5, [x10, #15, mul vl] +# CHECK-NEXT: 1 1 6.00 * st3b { z23.b - z25.b }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: 1 1 6.00 * st3b { z5.b - z7.b }, p3, [x17, x16] +# CHECK-NEXT: 1 1 3.00 * st3d { z0.d - z2.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: 1 1 3.00 * st3d { z0.d - z2.d }, p0, [x0] +# CHECK-NEXT: 1 1 3.00 * st3d { z21.d - z23.d }, p5, [x10, #15, mul vl] +# CHECK-NEXT: 1 1 3.00 * st3d { z23.d - z25.d }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: 1 1 3.00 * st3d { z5.d - z7.d }, p3, [x17, x16, lsl #3] +# CHECK-NEXT: 1 1 6.00 * st3h { z0.h - z2.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 6.00 * st3h { z0.h 
- z2.h }, p0, [x0] +# CHECK-NEXT: 1 1 6.00 * st3h { z21.h - z23.h }, p5, [x10, #15, mul vl] +# CHECK-NEXT: 1 1 6.00 * st3h { z23.h - z25.h }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: 1 1 6.00 * st3h { z5.h - z7.h }, p3, [x17, x16, lsl #1] +# CHECK-NEXT: 1 1 6.00 * st3w { z0.s - z2.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: 1 1 6.00 * st3w { z0.s - z2.s }, p0, [x0] +# CHECK-NEXT: 1 1 6.00 * st3w { z21.s - z23.s }, p5, [x10, #15, mul vl] +# CHECK-NEXT: 1 1 6.00 * st3w { z23.s - z25.s }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: 1 1 6.00 * st3w { z5.s - z7.s }, p3, [x17, x16, lsl #2] +# CHECK-NEXT: 1 1 8.00 * st4b { z0.b - z3.b }, p0, [x0, x0] +# CHECK-NEXT: 1 1 8.00 * st4b { z0.b - z3.b }, p0, [x0] +# CHECK-NEXT: 1 1 8.00 * st4b { z21.b - z24.b }, p5, [x10, #20, mul vl] +# CHECK-NEXT: 1 1 8.00 * st4b { z23.b - z26.b }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: 1 1 8.00 * st4b { z5.b - z8.b }, p3, [x17, x16] +# CHECK-NEXT: 1 1 4.00 * st4d { z0.d - z3.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: 1 1 4.00 * st4d { z0.d - z3.d }, p0, [x0] +# CHECK-NEXT: 1 1 4.00 * st4d { z21.d - z24.d }, p5, [x10, #20, mul vl] +# CHECK-NEXT: 1 1 4.00 * st4d { z23.d - z26.d }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: 1 1 4.00 * st4d { z5.d - z8.d }, p3, [x17, x16, lsl #3] +# CHECK-NEXT: 1 1 8.00 * st4h { z0.h - z3.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 8.00 * st4h { z0.h - z3.h }, p0, [x0] +# CHECK-NEXT: 1 1 8.00 * st4h { z21.h - z24.h }, p5, [x10, #20, mul vl] +# CHECK-NEXT: 1 1 8.00 * st4h { z23.h - z26.h }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: 1 1 8.00 * st4h { z5.h - z8.h }, p3, [x17, x16, lsl #1] +# CHECK-NEXT: 1 1 8.00 * st4w { z0.s - z3.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: 1 1 8.00 * st4w { z0.s - z3.s }, p0, [x0] +# CHECK-NEXT: 1 1 8.00 * st4w { z21.s - z24.s }, p5, [x10, #20, mul vl] +# CHECK-NEXT: 1 1 8.00 * st4w { z23.s - z26.s }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: 1 1 8.00 * st4w { z5.s - z8.s }, p3, [x17, x16, lsl #2] +# CHECK-NEXT: 1 1 1.00 * stnt1b { z0.b }, p0, [x0, x0] +# 
CHECK-NEXT: 1 1 1.00 * stnt1b { z0.b }, p0, [x0] +# CHECK-NEXT: 1 1 7.00 * stnt1b { z0.d }, p0, [z1.d] +# CHECK-NEXT: 1 1 9.00 * stnt1b { z0.s }, p0, [z1.s] +# CHECK-NEXT: 1 1 1.00 * stnt1b { z21.b }, p5, [x10, #7, mul vl] +# CHECK-NEXT: 1 1 1.00 * stnt1b { z23.b }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: 1 1 7.00 * stnt1b { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: 1 1 7.00 * stnt1b { z31.d }, p7, [z31.d] +# CHECK-NEXT: 1 1 9.00 * stnt1b { z31.s }, p7, [z31.s, x0] +# CHECK-NEXT: 1 1 9.00 * stnt1b { z31.s }, p7, [z31.s] +# CHECK-NEXT: 1 1 1.00 * stnt1d { z0.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: 1 1 1.00 * stnt1d { z0.d }, p0, [x0] +# CHECK-NEXT: 1 1 7.00 * stnt1d { z0.d }, p0, [z1.d] +# CHECK-NEXT: 1 1 1.00 * stnt1d { z21.d }, p5, [x10, #7, mul vl] +# CHECK-NEXT: 1 1 1.00 * stnt1d { z23.d }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: 1 1 7.00 * stnt1d { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: 1 1 7.00 * stnt1d { z31.d }, p7, [z31.d] +# CHECK-NEXT: 1 1 7.00 * stnt1h { z0.d }, p0, [z1.d] +# CHECK-NEXT: 1 1 1.00 * stnt1h { z0.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: 1 1 1.00 * stnt1h { z0.h }, p0, [x0] +# CHECK-NEXT: 1 1 9.00 * stnt1h { z0.s }, p0, [z1.s] +# CHECK-NEXT: 1 1 1.00 * stnt1h { z21.h }, p5, [x10, #7, mul vl] +# CHECK-NEXT: 1 1 1.00 * stnt1h { z23.h }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: 1 1 7.00 * stnt1h { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: 1 1 7.00 * stnt1h { z31.d }, p7, [z31.d] +# CHECK-NEXT: 1 1 9.00 * stnt1h { z31.s }, p7, [z31.s, x0] +# CHECK-NEXT: 1 1 9.00 * stnt1h { z31.s }, p7, [z31.s] +# CHECK-NEXT: 1 1 7.00 * stnt1w { z0.d }, p0, [z1.d] +# CHECK-NEXT: 1 1 1.00 * stnt1w { z0.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: 1 1 1.00 * stnt1w { z0.s }, p0, [x0] +# CHECK-NEXT: 1 1 9.00 * stnt1w { z0.s }, p0, [z1.s] +# CHECK-NEXT: 1 1 1.00 * stnt1w { z21.s }, p5, [x10, #7, mul vl] +# CHECK-NEXT: 1 1 1.00 * stnt1w { z23.s }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: 1 1 7.00 * stnt1w { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: 1 1 7.00 * stnt1w { z31.d }, p7, 
[z31.d] +# CHECK-NEXT: 1 1 9.00 * stnt1w { z31.s }, p7, [z31.s, x0] +# CHECK-NEXT: 1 1 9.00 * stnt1w { z31.s }, p7, [z31.s] +# CHECK-NEXT: 1 1 1.00 * str p0, [x0] +# CHECK-NEXT: 1 1 1.00 * str p15, [sp, #-256, mul vl] +# CHECK-NEXT: 1 1 1.00 * str p5, [x10, #255, mul vl] +# CHECK-NEXT: 1 1 1.00 * str z0, [x0] +# CHECK-NEXT: 1 1 1.00 * str z21, [x10, #-256, mul vl] +# CHECK-NEXT: 1 1 1.00 * str z31, [sp, #255, mul vl] +# CHECK-NEXT: 1 3 0.50 sub z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 sub z0.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 sub z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 sub z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 sub z0.d, z0.d, #0 +# CHECK-NEXT: 1 3 0.50 sub z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 3 0.50 sub z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 sub z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 sub z0.h, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 sub z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 3 0.50 sub z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 sub z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 sub z0.s, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 sub z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 3 0.50 sub z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 sub z21.b, p5/m, z21.b, z10.b +# CHECK-NEXT: 1 3 0.50 sub z21.b, z10.b, z21.b +# CHECK-NEXT: 1 3 0.50 sub z21.d, p5/m, z21.d, z10.d +# CHECK-NEXT: 1 3 0.50 sub z21.d, z10.d, z21.d +# CHECK-NEXT: 1 3 0.50 sub z21.h, p5/m, z21.h, z10.h +# CHECK-NEXT: 1 3 0.50 sub z21.h, z10.h, z21.h +# CHECK-NEXT: 1 3 0.50 sub z21.s, p5/m, z21.s, z10.s +# CHECK-NEXT: 1 3 0.50 sub z21.s, z10.s, z21.s +# CHECK-NEXT: 1 3 0.50 sub z23.b, p3/m, z23.b, z13.b +# CHECK-NEXT: 1 3 0.50 sub z23.b, z13.b, z8.b +# CHECK-NEXT: 1 3 0.50 sub z23.d, p3/m, z23.d, z13.d +# CHECK-NEXT: 1 3 0.50 sub z23.d, z13.d, z8.d +# CHECK-NEXT: 1 3 0.50 sub z23.h, p3/m, z23.h, z13.h +# CHECK-NEXT: 1 3 0.50 sub z23.h, z13.h, z8.h +# CHECK-NEXT: 1 3 0.50 sub z23.s, p3/m, z23.s, z13.s +# CHECK-NEXT: 1 3 0.50 sub z23.s, z13.s, z8.s +# CHECK-NEXT: 1 3 0.50 sub z31.b, p7/m, z31.b, z31.b +# 
CHECK-NEXT: 1 3 0.50 sub z31.b, z31.b, #255 +# CHECK-NEXT: 1 3 0.50 sub z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 sub z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 sub z31.d, z31.d, #65280 +# CHECK-NEXT: 1 3 0.50 sub z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 sub z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 sub z31.h, z31.h, #65280 +# CHECK-NEXT: 1 3 0.50 sub z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 sub z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, #65280 +# CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 subhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 subhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 subhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 subhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 subhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 subhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 4 0.50 subr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 subr z0.b, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 subr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 subr z0.d, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 subr z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 subr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 subr z0.h, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 subr z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 subr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 subr z0.s, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 subr z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 subr z31.b, z31.b, #255 +# CHECK-NEXT: 1 4 0.50 subr z31.d, z31.d, #65280 +# CHECK-NEXT: 1 4 0.50 subr z31.h, z31.h, #65280 +# CHECK-NEXT: 1 4 0.50 subr z31.s, z31.s, #65280 +# CHECK-NEXT: 1 4 0.50 sunpkhi z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 sunpkhi z31.h, z31.b +# CHECK-NEXT: 1 4 0.50 sunpkhi z31.s, z31.h +# CHECK-NEXT: 1 4 0.50 sunpklo z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 sunpklo z31.h, z31.b +# CHECK-NEXT: 1 4 0.50 sunpklo z31.s, z31.h +# CHECK-NEXT: 1 4 0.50 suqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 suqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 suqadd z29.s, p7/m, 
z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 suqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 sxtb z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 sxtb z0.h, p0/m, z0.h +# CHECK-NEXT: 1 3 0.50 sxtb z0.s, p0/m, z0.s +# CHECK-NEXT: 1 3 0.50 sxtb z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 sxtb z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 sxtb z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 sxth z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 sxth z0.s, p0/m, z0.s +# CHECK-NEXT: 1 3 0.50 sxth z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 sxth z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 sxtw z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 sxtw z31.d, p7/m, z31.d +# CHECK-NEXT: 1 8 5.00 tbl z28.b, { z29.b, z30.b }, z31.b +# CHECK-NEXT: 1 8 5.00 tbl z28.d, { z29.d, z30.d }, z31.d +# CHECK-NEXT: 1 8 5.00 tbl z28.h, { z29.h, z30.h }, z31.h +# CHECK-NEXT: 1 8 5.00 tbl z28.s, { z29.s, z30.s }, z31.s +# CHECK-NEXT: 1 4 0.50 tbl z31.b, { z31.b }, z31.b +# CHECK-NEXT: 1 4 0.50 tbl z31.d, { z31.d }, z31.d +# CHECK-NEXT: 1 4 0.50 tbl z31.h, { z31.h }, z31.h +# CHECK-NEXT: 1 4 0.50 tbl z31.s, { z31.s }, z31.s +# CHECK-NEXT: 1 4 0.50 tbx z31.b, z31.b, z31.b +# CHECK-NEXT: 1 4 0.50 tbx z31.d, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 tbx z31.h, z31.h, z31.h +# CHECK-NEXT: 1 4 0.50 tbx z31.s, z31.s, z31.s +# CHECK-NEXT: 1 1 1.00 trn1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 1 1.00 trn1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 1 1.00 trn1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 1 1.00 trn1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 trn1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 trn1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 trn1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 trn1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 1 1.00 trn2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 1 1.00 trn2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 1 1.00 trn2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 1 1.00 trn2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 trn2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 trn2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 trn2 z31.h, z31.h, 
z31.h +# CHECK-NEXT: 1 3 0.50 trn2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 5 3.00 uaba z0.b, z1.b, z31.b +# CHECK-NEXT: 1 5 3.00 uaba z0.d, z1.d, z31.d +# CHECK-NEXT: 1 5 3.00 uaba z0.h, z1.h, z31.h +# CHECK-NEXT: 1 5 3.00 uaba z0.s, z1.s, z31.s +# CHECK-NEXT: 1 5 3.00 uabalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 5 3.00 uabalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 5 3.00 uabalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 5 3.00 uabalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 5 3.00 uabalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 5 3.00 uabalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 uabd z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 uabd z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 uabd z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 uabd z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 uabdlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 3 0.50 uabdlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 3 0.50 uabdlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 uabdlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 3 0.50 uabdlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 3 0.50 uabdlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 6 4.00 uadalp z0.h, p0/m, z1.b +# CHECK-NEXT: 1 6 4.00 uadalp z29.s, p0/m, z30.h +# CHECK-NEXT: 1 6 4.00 uadalp z30.d, p7/m, z31.s +# CHECK-NEXT: 1 4 0.50 uaddlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 uaddlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 uaddlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 uaddlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 uaddlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 uaddlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.s +# CHECK-NEXT: 1 3 0.50 uaddwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 3 0.50 uaddwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 3 0.50 uaddwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 3 0.50 uaddwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 3 0.50 uaddwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 3 0.50 uaddwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 
0.50 ucvtf z0.d, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 ucvtf z0.h, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 ucvtf z0.h, p0/m, z0.h +# CHECK-NEXT: 1 4 0.50 ucvtf z0.h, p0/m, z0.s +# CHECK-NEXT: 1 4 0.50 ucvtf z0.s, p0/m, z0.d +# CHECK-NEXT: 1 4 0.50 ucvtf z0.s, p0/m, z0.s +# CHECK-NEXT: 1 26 23.00 udiv z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 15 12.00 udiv z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 26 23.00 udivr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: 1 15 12.00 udivr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 udot z0.d, z1.h, z15.h[1] +# CHECK-NEXT: 1 4 0.50 udot z0.d, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 udot z0.s, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 udot z0.s, z1.b, z7.b[3] +# CHECK-NEXT: 1 3 0.50 uhadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 uhadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 uhadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 uhadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 uhsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 uhsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 uhsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 uhsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 uhsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 uhsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 uhsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 uhsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 umax z0.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 umax z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 umax z31.b, z31.b, #255 +# CHECK-NEXT: 1 3 0.50 umax z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 umax z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 umax z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 umaxp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 umaxp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 umaxp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 umaxp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 umaxv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 umaxv d0, p7, 
z31.d +# CHECK-NEXT: 1 4 1.00 umaxv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 umaxv s0, p7, z31.s +# CHECK-NEXT: 1 3 0.50 umin z0.b, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 umin z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 umin z31.b, z31.b, #255 +# CHECK-NEXT: 1 3 0.50 umin z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 umin z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 umin z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1 3 0.50 uminp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 uminp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 uminp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 uminp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 1.00 uminv b0, p7, z31.b +# CHECK-NEXT: 1 4 1.00 uminv d0, p7, z31.d +# CHECK-NEXT: 1 4 1.00 uminv h0, p7, z31.h +# CHECK-NEXT: 1 4 1.00 uminv s0, p7, z31.s +# CHECK-NEXT: 1 4 0.50 umlalb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 umlalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 umlalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 umlalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 umlalb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 umlalt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 umlalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 umlalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 umlalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 umlalt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 umlslb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 umlslb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 umlslb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 umlslb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 umlslb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 umlslt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 umlslt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 umlslt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 umlslt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 umlslt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 ummla z0.s, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 umulh z0.b, p7/m, z0.b, z31.b +# CHECK-NEXT: 1 4 0.50 umulh z0.b, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 umulh z0.d, p7/m, z0.d, 
z31.d +# CHECK-NEXT: 1 4 0.50 umulh z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: 1 4 0.50 umulh z0.h, z1.h, z2.h +# CHECK-NEXT: 1 4 0.50 umulh z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: 1 4 0.50 umulh z29.s, z30.s, z31.s +# CHECK-NEXT: 1 4 0.50 umulh z31.d, z31.d, z31.d +# CHECK-NEXT: 1 4 0.50 umullb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 umullb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 umullb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 umullb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 umullb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 umullt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: 1 4 0.50 umullt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 umullt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: 1 4 0.50 umullt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 umullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 uqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqadd z0.b, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 uqadd z0.b, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 uqadd z0.d, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 uqadd z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 uqadd z0.d, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 uqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 uqadd z0.h, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 uqadd z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 uqadd z0.h, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 uqadd z0.s, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 uqadd z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 uqadd z0.s, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 uqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqadd z31.b, z31.b, #255 +# CHECK-NEXT: 1 4 0.50 uqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqadd z31.d, z31.d, #65280 +# CHECK-NEXT: 1 4 0.50 uqadd z31.h, z31.h, #65280 +# CHECK-NEXT: 1 4 0.50 uqadd z31.s, z31.s, #65280 +# CHECK-NEXT: 1 5 1.00 uqdecb w0 +# CHECK-NEXT: 1 5 1.00 uqdecb w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecb w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdecb w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecb x0 +# CHECK-NEXT: 1 5 1.00 uqdecb x0, #14 +# CHECK-NEXT: 1 5 1.00 uqdecb x0, all, 
mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecb x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdecb x0, vl1 +# CHECK-NEXT: 1 5 1.00 uqdecd w0 +# CHECK-NEXT: 1 5 1.00 uqdecd w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecd w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdecd w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecd x0 +# CHECK-NEXT: 1 5 1.00 uqdecd x0, #14 +# CHECK-NEXT: 1 5 1.00 uqdecd x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecd x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdecd x0, vl1 +# CHECK-NEXT: 1 4 0.50 uqdecd z0.d +# CHECK-NEXT: 1 4 0.50 uqdecd z0.d, all, mul #16 +# CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2 +# CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdech w0 +# CHECK-NEXT: 1 5 1.00 uqdech w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdech w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdech w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdech x0 +# CHECK-NEXT: 1 5 1.00 uqdech x0, #14 +# CHECK-NEXT: 1 5 1.00 uqdech x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdech x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdech x0, vl1 +# CHECK-NEXT: 1 4 0.50 uqdech z0.h +# CHECK-NEXT: 1 4 0.50 uqdech z0.h, all, mul #16 +# CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2 +# CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2, mul #16 +# CHECK-NEXT: 1 2 1.00 uqdecp wzr, p15.b +# CHECK-NEXT: 1 2 1.00 uqdecp wzr, p15.d +# CHECK-NEXT: 1 2 1.00 uqdecp wzr, p15.h +# CHECK-NEXT: 1 2 1.00 uqdecp wzr, p15.s +# CHECK-NEXT: 1 2 1.00 uqdecp x0, p0.b +# CHECK-NEXT: 1 2 1.00 uqdecp x0, p0.d +# CHECK-NEXT: 1 2 1.00 uqdecp x0, p0.h +# CHECK-NEXT: 1 2 1.00 uqdecp x0, p0.s +# CHECK-NEXT: 1 4 0.50 uqdecp z0.d, p0.d +# CHECK-NEXT: 1 4 0.50 uqdecp z0.h, p0.h +# CHECK-NEXT: 1 4 0.50 uqdecp z0.s, p0.s +# CHECK-NEXT: 1 5 1.00 uqdecw w0 +# CHECK-NEXT: 1 5 1.00 uqdecw w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecw w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqdecw w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecw x0 +# CHECK-NEXT: 1 5 1.00 uqdecw x0, #14 +# CHECK-NEXT: 1 5 1.00 uqdecw x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqdecw x0, pow2 +# CHECK-NEXT: 1 5 1.00 
uqdecw x0, vl1 +# CHECK-NEXT: 1 4 0.50 uqdecw z0.s +# CHECK-NEXT: 1 4 0.50 uqdecw z0.s, all, mul #16 +# CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2 +# CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincb w0 +# CHECK-NEXT: 1 5 1.00 uqincb w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincb w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqincb w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincb x0 +# CHECK-NEXT: 1 5 1.00 uqincb x0, #14 +# CHECK-NEXT: 1 5 1.00 uqincb x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincb x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqincb x0, vl1 +# CHECK-NEXT: 1 5 1.00 uqincd w0 +# CHECK-NEXT: 1 5 1.00 uqincd w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincd w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqincd w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincd x0 +# CHECK-NEXT: 1 5 1.00 uqincd x0, #14 +# CHECK-NEXT: 1 5 1.00 uqincd x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincd x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqincd x0, vl1 +# CHECK-NEXT: 1 4 0.50 uqincd z0.d +# CHECK-NEXT: 1 4 0.50 uqincd z0.d, all, mul #16 +# CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2 +# CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqinch w0 +# CHECK-NEXT: 1 5 1.00 uqinch w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqinch w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqinch w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqinch x0 +# CHECK-NEXT: 1 5 1.00 uqinch x0, #14 +# CHECK-NEXT: 1 5 1.00 uqinch x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqinch x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqinch x0, vl1 +# CHECK-NEXT: 1 4 0.50 uqinch z0.h +# CHECK-NEXT: 1 4 0.50 uqinch z0.h, all, mul #16 +# CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2 +# CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2, mul #16 +# CHECK-NEXT: 1 2 1.00 uqincp wzr, p15.b +# CHECK-NEXT: 1 2 1.00 uqincp wzr, p15.d +# CHECK-NEXT: 1 2 1.00 uqincp wzr, p15.h +# CHECK-NEXT: 1 2 1.00 uqincp wzr, p15.s +# CHECK-NEXT: 1 2 1.00 uqincp x0, p0.b +# CHECK-NEXT: 1 2 1.00 uqincp x0, p0.d +# CHECK-NEXT: 1 2 1.00 uqincp x0, p0.h +# CHECK-NEXT: 1 2 1.00 uqincp x0, p0.s +# 
CHECK-NEXT: 1 4 0.50 uqincp z0.d, p0.d +# CHECK-NEXT: 1 4 0.50 uqincp z0.h, p0.h +# CHECK-NEXT: 1 4 0.50 uqincp z0.s, p0.s +# CHECK-NEXT: 1 5 1.00 uqincw w0 +# CHECK-NEXT: 1 5 1.00 uqincw w0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincw w0, pow2 +# CHECK-NEXT: 1 5 1.00 uqincw w0, pow2, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincw x0 +# CHECK-NEXT: 1 5 1.00 uqincw x0, #14 +# CHECK-NEXT: 1 5 1.00 uqincw x0, all, mul #16 +# CHECK-NEXT: 1 5 1.00 uqincw x0, pow2 +# CHECK-NEXT: 1 5 1.00 uqincw x0, vl1 +# CHECK-NEXT: 1 4 0.50 uqincw z0.s +# CHECK-NEXT: 1 4 0.50 uqincw z0.s, all, mul #16 +# CHECK-NEXT: 1 4 0.50 uqincw z0.s, pow2 +# CHECK-NEXT: 1 4 0.50 uqincw z0.s, pow2, mul #16 +# CHECK-NEXT: 1 4 0.50 uqrshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqrshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 uqrshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqrshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqrshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqrshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 uqrshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqrshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqrshrnb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 uqrshrnb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 uqrshrnb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 uqrshrnb z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 uqrshrnb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 uqrshrnb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 uqrshrnt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 uqrshrnt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 uqrshrnt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 uqrshrnt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 uqrshrnt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 uqrshrnt z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 uqshl z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 uqshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqshl z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 uqshl z0.h, p0/m, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 uqshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 
uqshl z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 uqshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqshl z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: 1 4 0.50 uqshl z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: 1 4 0.50 uqshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqshl z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: 1 4 0.50 uqshl z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: 1 4 0.50 uqshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 uqshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqshrnb z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 uqshrnb z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 uqshrnb z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 uqshrnb z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 uqshrnb z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 uqshrnb z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 uqshrnt z0.b, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 uqshrnt z0.h, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 uqshrnt z0.s, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 uqshrnt z31.b, z31.h, #8 +# CHECK-NEXT: 1 4 0.50 uqshrnt z31.h, z31.s, #16 +# CHECK-NEXT: 1 4 0.50 uqshrnt z31.s, z31.d, #32 +# CHECK-NEXT: 1 4 0.50 uqsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqsub z0.b, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 uqsub z0.b, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 uqsub z0.d, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 uqsub z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 uqsub z0.d, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 uqsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 uqsub z0.h, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 uqsub z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 uqsub z0.h, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 uqsub z0.s, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 uqsub z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: 1 4 0.50 uqsub z0.s, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 uqsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqsub z31.b, z31.b, #255 +# CHECK-NEXT: 1 4 0.50 uqsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqsub z31.d, z31.d, 
#65280 +# CHECK-NEXT: 1 4 0.50 uqsub z31.h, z31.h, #65280 +# CHECK-NEXT: 1 4 0.50 uqsub z31.s, z31.s, #65280 +# CHECK-NEXT: 1 4 0.50 uqsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 uqsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 uqsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 uqsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 uqxtnb z0.b, z31.h +# CHECK-NEXT: 1 4 0.50 uqxtnb z0.h, z31.s +# CHECK-NEXT: 1 4 0.50 uqxtnb z0.s, z31.d +# CHECK-NEXT: 1 4 0.50 uqxtnt z0.b, z31.h +# CHECK-NEXT: 1 4 0.50 uqxtnt z0.h, z31.s +# CHECK-NEXT: 1 4 0.50 uqxtnt z0.s, z31.d +# CHECK-NEXT: 1 4 0.50 urecpe z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 urhadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 3 0.50 urhadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 3 0.50 urhadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 3 0.50 urhadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 urshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 urshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 urshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 urshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 urshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 urshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 urshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 urshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 4 0.50 urshr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: 1 4 0.50 urshr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: 1 4 0.50 urshr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: 1 4 0.50 urshr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: 1 4 0.50 urshr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: 1 4 0.50 urshr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: 1 4 0.50 urshr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: 1 4 0.50 urshr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: 1 4 0.50 ursqrte z31.s, p7/m, z31.s +# CHECK-NEXT: 1 5 3.00 ursra z0.b, z0.b, #1 +# CHECK-NEXT: 1 5 3.00 ursra z0.d, z0.d, #1 +# CHECK-NEXT: 1 5 3.00 ursra z0.h, z0.h, #1 +# CHECK-NEXT: 1 5 3.00 ursra z0.s, z0.s, #1 +# CHECK-NEXT: 1 5 3.00 ursra z31.b, z31.b, #8 
+# CHECK-NEXT: 1 5 3.00 ursra z31.d, z31.d, #64 +# CHECK-NEXT: 1 5 3.00 ursra z31.h, z31.h, #16 +# CHECK-NEXT: 1 5 3.00 ursra z31.s, z31.s, #32 +# CHECK-NEXT: 1 3 0.50 ushllb z0.d, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 ushllb z0.h, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 ushllb z0.s, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 ushllb z31.d, z31.s, #31 +# CHECK-NEXT: 1 3 0.50 ushllb z31.h, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 ushllb z31.s, z31.h, #15 +# CHECK-NEXT: 1 3 0.50 ushllt z0.d, z0.s, #0 +# CHECK-NEXT: 1 3 0.50 ushllt z0.h, z0.b, #0 +# CHECK-NEXT: 1 3 0.50 ushllt z0.s, z0.h, #0 +# CHECK-NEXT: 1 3 0.50 ushllt z31.d, z31.s, #31 +# CHECK-NEXT: 1 3 0.50 ushllt z31.h, z31.b, #7 +# CHECK-NEXT: 1 3 0.50 ushllt z31.s, z31.h, #15 +# CHECK-NEXT: 1 4 0.50 usmmla z0.s, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 usqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 1 4 0.50 usqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: 1 4 0.50 usqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 1 4 0.50 usqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: 1 3 0.50 usra z0.b, z0.b, #1 +# CHECK-NEXT: 1 3 0.50 usra z0.d, z0.d, #1 +# CHECK-NEXT: 1 3 0.50 usra z0.h, z0.h, #1 +# CHECK-NEXT: 1 3 0.50 usra z0.s, z0.s, #1 +# CHECK-NEXT: 1 3 0.50 usra z31.b, z31.b, #8 +# CHECK-NEXT: 1 3 0.50 usra z31.d, z31.d, #64 +# CHECK-NEXT: 1 3 0.50 usra z31.h, z31.h, #16 +# CHECK-NEXT: 1 3 0.50 usra z31.s, z31.s, #32 +# CHECK-NEXT: 1 4 0.50 usublb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 usublb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 usublb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 usublt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 usublt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 usublt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 usubwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 usubwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 usubwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 usubwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 usubwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 usubwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 uunpkhi z31.d, z31.s +# 
CHECK-NEXT: 1 4 0.50 uunpkhi z31.h, z31.b +# CHECK-NEXT: 1 4 0.50 uunpkhi z31.s, z31.h +# CHECK-NEXT: 1 4 0.50 uunpklo z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 uunpklo z31.h, z31.b +# CHECK-NEXT: 1 4 0.50 uunpklo z31.s, z31.h +# CHECK-NEXT: 1 3 0.50 uxtb z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 uxtb z0.h, p0/m, z0.h +# CHECK-NEXT: 1 3 0.50 uxtb z0.s, p0/m, z0.s +# CHECK-NEXT: 1 3 0.50 uxtb z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 uxtb z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 uxtb z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 uxth z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 uxth z0.s, p0/m, z0.s +# CHECK-NEXT: 1 3 0.50 uxth z31.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 uxth z31.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 uxtw z0.d, p0/m, z0.d +# CHECK-NEXT: 1 3 0.50 uxtw z31.d, p7/m, z31.d +# CHECK-NEXT: 1 1 1.00 uzp1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 1 1.00 uzp1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 1 1.00 uzp1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 1 1.00 uzp1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 uzp1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 uzp1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 uzp1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 uzp1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 1 1.00 uzp2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 1 1.00 uzp2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 1 1.00 uzp2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 1 1.00 uzp2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 uzp2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 uzp2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 uzp2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 uzp2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 whilege p15.b, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.b, wzr, w0 +# CHECK-NEXT: 1 2 1.00 whilege p15.b, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.b, xzr, x0 +# CHECK-NEXT: 1 2 1.00 whilege p15.d, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.d, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.h, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.h, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.s, w0, wzr +# 
CHECK-NEXT: 1 2 1.00 whilege p15.s, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilerw p15.b, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.d, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.h, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.s, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.b, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.d, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.h, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.s, x30, x30 +# CHECK-NEXT: 1 1 1.00 * U wrffr p0.b +# CHECK-NEXT: 1 1 1.00 * U wrffr p15.b +# CHECK-NEXT: 1 4 0.50 xar z0.b, z0.b, z1.b, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.d, z0.d, z1.d, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.h, z0.h, z1.h, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.s, z0.s, z1.s, #1 +# CHECK-NEXT: 1 4 0.50 xar z31.b, z31.b, z30.b, #8 +# CHECK-NEXT: 1 4 0.50 xar z31.d, z31.d, z30.d, #64 +# CHECK-NEXT: 1 4 0.50 xar z31.h, z31.h, z30.h, #16 +# CHECK-NEXT: 1 4 0.50 xar z31.s, z31.s, z30.s, #32 +# CHECK-NEXT: 1 1 1.00 zip1 p0.b, p0.b, p0.b +# CHECK-NEXT: 1 1 1.00 zip1 p0.d, p0.d, p0.d +# CHECK-NEXT: 1 1 1.00 zip1 p0.h, p0.h, p0.h +# CHECK-NEXT: 1 1 1.00 zip1 p0.s, p0.s, p0.s +# CHECK-NEXT: 1 1 1.00 zip1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 1 1.00 zip1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 1 1.00 zip1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 1 1.00 zip1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 zip1 z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 zip1 z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 zip1 z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 zip1 z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 zip1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 zip1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 zip1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 zip1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 1 1.00 zip2 p0.b, p0.b, p0.b +# CHECK-NEXT: 1 1 1.00 zip2 p0.d, p0.d, p0.d +# CHECK-NEXT: 1 1 1.00 zip2 p0.h, p0.h, p0.h +# CHECK-NEXT: 1 1 1.00 zip2 p0.s, p0.s, p0.s +# CHECK-NEXT: 1 1 1.00 zip2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 1 1.00 zip2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 1 1.00 zip2 p15.h, p15.h, 
p15.h +# CHECK-NEXT: 1 1 1.00 zip2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 zip2 z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 zip2 z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 zip2 z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 zip2 z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 zip2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 zip2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 zip2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 zip2 z31.s, z31.s, z31.s + +# CHECK: Resources: +# CHECK-NEXT: [0] - C1NanoUnitALU0 +# CHECK-NEXT: [1] - C1NanoUnitALU1 +# CHECK-NEXT: [2] - C1NanoUnitB +# CHECK-NEXT: [3] - C1NanoUnitDiv +# CHECK-NEXT: [4] - C1NanoUnitLd1 +# CHECK-NEXT: [5] - C1NanoUnitLdSt +# CHECK-NEXT: [6] - C1NanoUnitMAC +# CHECK-NEXT: [7] - C1NanoUnitPAC +# CHECK-NEXT: [8] - C1NanoUnitVALU0 +# CHECK-NEXT: [9] - C1NanoUnitVALU1 +# CHECK-NEXT: [10] - C1NanoUnitVMAC0 +# CHECK-NEXT: [11] - C1NanoUnitVMAC1 +# CHECK-NEXT: [12] - C1NanoUnitVMC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] +# CHECK-NEXT: 393.00 54.00 9.00 - 221.00 2425.00 - - 1378.50 1308.50 212.00 212.00 676.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z0.b, p0/m, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - abs z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adclb z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adclb z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - 
- adclt z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adclt z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z0.s, z1.s, z2.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.b, p5/m, z21.b, z10.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.b, z10.b, z21.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.d, p5/m, z21.d, z10.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.d, z10.d, z21.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.h, p5/m, z21.h, z10.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.h, z10.h, z21.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.s, p5/m, z21.s, z10.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z21.s, z10.s, z21.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.b, p3/m, z23.b, z13.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.b, z13.b, z8.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 
- - - add z23.d, p3/m, z23.d, z13.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.d, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.h, p3/m, z23.h, z13.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.h, z13.h, z8.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.s, p3/m, z23.s, z13.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z23.s, z13.s, z8.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.s, z31.s, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - add z31.s, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhnb z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhnb z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhnb z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhnt z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhnt z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addhnt z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - addp z31.d, 
p7/m, z31.d, z30.d +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addpl sp, sp, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addpl x0, x0, #-32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addpl x21, x21, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addpl x23, x8, #-1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addvl sp, sp, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addvl x0, x0, #-32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addvl x21, x21, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - addvl x23, x8, #-1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, lsl #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, lsl #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, lsl #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, sxtw #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, sxtw #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, sxtw] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, uxtw #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, uxtw #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, uxtw #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d, uxtw] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.d, [z0.d, z0.d] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.s, [z0.s, z0.s, lsl #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.s, [z0.s, z0.s, lsl #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.s, [z0.s, z0.s, lsl #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - adr z0.s, [z0.s, z0.s] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesd z0.b, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aese z0.b, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesimc z0.b, z0.b +# CHECK-NEXT: 
- - - - - - - - 0.50 0.50 - - - aesimc z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesmc z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - aesmc z31.b, z31.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - and p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z0.d, z0.d, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z0.d, z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z0.s, z0.s, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z0.s, z0.s, #0xfffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z23.d, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z23.h, z23.h, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z23.h, z23.h, #0xfff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z5.b, z5.b, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - and z5.b, z5.b, #0xf9 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ands p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - andv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - andv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - andv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - andv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.b, p0/m, z0.b, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.b, z1.b, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
asr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.h, p0/m, z0.h, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.h, z1.h, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.s, p0/m, z0.s, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z0.s, z1.s, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asr z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrd z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - asrd z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - asrr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bcax z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 11.00 bdep z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - - - 66.00 bdep z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 19.00 bdep z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - - - 35.00 bdep z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - - - 11.00 bext z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - - - 66.00 bext z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 19.00 bext z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - - - 35.00 bext z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bfcvt z0.h, p0/m, z1.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bfcvtnt z0.h, p0/m, z1.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 0.50 0.50 - bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 0.50 0.50 - bfdot z0.s, z1.h, z2.h[0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 0.50 0.50 - bfdot z0.s, z1.h, z2.h[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb z0.s, z1.h, z2.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb z0.s, z1.h, z2.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb z10.s, z21.h, z14.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalb z21.s, z14.h, z3.h[2] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt z0.s, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt z0.s, z1.h, z2.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt z0.s, z1.h, z2.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt 
z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - bfmlalt z14.s, z10.h, z21.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 1.00 1.00 - bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - - - 11.00 bgrp z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - - - 66.00 bgrp z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 19.00 bgrp z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - - - 35.00 bgrp z0.s, z1.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - bic p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - bic p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic z23.d, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bic z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - bics p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - bics p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brka p0.b, p15/m, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brka p0.b, p15/z, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkas p0.b, p15/z, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkb p0.b, p15/m, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkb p0.b, p15/z, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkbs p0.b, p15/z, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkn p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkns p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpa p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1.00 - - - - - - 
- - - - - - brkpa p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpas p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpas p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpb p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpb p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpbs p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - brkpbs p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bsl z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bsl1n z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - bsl2n z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z0.b, z0.b, z0.b, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z0.d, z0.d, z0.d, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z0.h, z0.h, z0.h, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z0.s, z0.s, z0.s, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z31.b, z31.b, z31.b, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z31.d, z31.d, z31.d, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z31.h, z31.h, z31.h, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cadd z31.s, z31.s, z31.s, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.d, z1.h, z15.h[1], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.d, z1.h, z31.h, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.d, z1.h, z31.h, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.d, z1.h, z31.h, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.d, z1.h, z31.h, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.s, z1.b, z31.b, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z0.s, z1.b, z7.b[3], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z29.d, z30.h, z0.h[0], #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot 
z31.d, z30.h, z7.h[1], #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cdot z5.d, z6.h, z3.h[0], #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta b0, p7, b0, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta d0, p7, d0, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta h0, p7, h0, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta s0, p7, s0, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clasta w0, p7, w0, z31.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clasta w0, p7, w0, z31.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clasta w0, p7, w0, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clasta x0, p7, x0, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta z0.b, p7, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta z0.d, p7, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta z0.h, p7, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clasta z0.s, p7, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb b0, p7, b0, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb d0, p7, d0, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb h0, p7, h0, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb s0, p7, s0, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clastb w0, p7, w0, z31.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clastb w0, p7, w0, z31.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clastb w0, p7, w0, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - clastb x0, p7, x0, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb z0.b, p7, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb z0.d, p7, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb z0.h, p7, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clastb z0.s, p7, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls z31.d, p7/m, 
z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cls z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - clz z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z0.b, z1.b, z2.b, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z0.d, z1.d, z2.d, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z0.h, z1.h, z2.h, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z0.h, z1.h, z2.h[0], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z0.s, z1.s, z2.s, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z0.s, z1.s, z2.s[0], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z15.b, z16.b, z17.b, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z15.d, z16.d, z17.d, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z15.h, z16.h, z17.h, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z15.s, z16.s, z17.s, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z29.b, z30.b, z31.b, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z29.d, z30.d, z31.d, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z29.h, z30.h, z31.h, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z29.s, z30.s, z31.s, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z31.b, z31.b, z31.b, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z31.d, z31.d, z31.d, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z31.h, z30.h, z7.h[0], #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z31.h, z31.h, z31.h, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z31.s, z30.s, z7.s[0], #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - cmla z31.s, z31.s, z31.s, #180 +# CHECK-NEXT: - - - - - - - - 1.00 
1.00 - - - cmpeq p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpeq p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.h, p0/z, 
z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - 
- - - - - - - 1.00 1.00 - - - cmphi p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphi p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs 
p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmphs p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmple p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.d, p0/z, z0.d, #0 +# 
CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplo p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpls p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 
- - - cmplt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmplt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - cmpne p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnot z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnot z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnot z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnot z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnt z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 7.00 7.00 - - - cnt z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - cnt z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - cnt z31.s, p7/m, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntb x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntb x0, #28 +# 
CHECK-NEXT: 1.00 - - - - - - - - - - - - cntb x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntb x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntd x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntd x0, #28 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntd x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntd x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cnth x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cnth x0, #28 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cnth x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cnth x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntp x0, p15, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntp x0, p15, p0.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntp x0, p15, p0.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntp x0, p15, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntw x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntw x0, #28 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntw x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - cntw x0, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - compact z31.d, p7, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - compact z31.s, p7, z31.s +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermeq w30, wzr +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermeq wzr, w30 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermeq x30, xzr +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermeq xzr, x30 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermne w30, wzr +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermne wzr, w30 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermne x30, xzr +# CHECK-NEXT: - 1.00 - - - - - - - - - - - ctermne xzr, x30 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decb x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decb x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decb x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decb x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decb x0, vl1 +# 
CHECK-NEXT: - 1.00 - - - - - - - - - - - decd x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decd x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decd x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decd x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decd x0, vl1 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - dech x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - dech x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - dech x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - dech x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - dech x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp x0, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp x0, p0.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp x0, p0.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp x0, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp xzr, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp xzr, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp xzr, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp xzr, p15.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp z31.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp z31.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - decp z31.s, p15.s +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decw x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decw x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decw x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decw x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - decw x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dupm z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dupm z0.s, #0xfffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dupm z23.h, #0xfff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - dupm z5.b, #0xf9 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - eor p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z0.d, z0.d, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z0.d, 
z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z0.s, z0.s, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z0.s, z0.s, #0xfffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z23.d, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z23.h, z23.h, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z23.h, z23.h, #0xfff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z5.b, z5.b, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor z5.b, z5.b, #0xf9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eor3 z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eorbt z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eorbt z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eorbt z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eorbt z0.s, z1.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - eors p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eortb z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eortb z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eortb z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - eortb z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 1.00 - - - - eorv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - eorv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - eorv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - eorv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ext z0.b, { z1.b, z2.b }, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ext z31.b, 
z31.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ext z31.b, z31.b, z0.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ext z31.b, { z30.b, z31.b }, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabd z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fabs z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facge p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facge p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facge p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facgt p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facgt p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facgt p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - facgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - fadd z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fadd z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - 2.50 2.50 - - - fadda d0, p7, d0, z31.d +# CHECK-NEXT: - - - - - - - - 25.00 25.00 - - - fadda h0, p7, h0, z31.h +# CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - fadda s0, p7, s0, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - faddp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - faddv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 5.00 - - - - faddv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 5.00 - - - - faddv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd z0.d, p0/m, z0.d, z0.d, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd z0.h, p0/m, z0.h, z0.h, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd z0.s, p0/m, z0.s, z0.s, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd z31.d, p7/m, z31.d, z31.d, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd z31.h, p7/m, z31.h, z31.h, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcadd z31.s, p7/m, z31.s, z31.s, #270 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmeq p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmeq p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmeq p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmeq p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmeq p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmeq p0.s, 
p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.d, p0/m, z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.h, p0/m, z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.h, p0/m, z1.h, z2.h, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.h, z0.h, z0.h[0], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.s, p0/m, z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z0.s, p0/m, z1.s, z2.s, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla 
z21.s, z10.s, z5.s[1], #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z23.s, z13.s, z8.s[0], #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z29.d, p7/m, z30.d, z31.d, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z29.h, p7/m, z30.h, z31.h, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z29.s, p7/m, z30.s, z31.s, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z31.d, p7/m, z31.d, z31.d, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z31.h, p7/m, z31.h, z31.h, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z31.h, z31.h, z7.h[3], #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fcmla z31.s, p7/m, z31.s, z31.s, #270 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmle p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmle p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmle p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmlt p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmlt p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmlt p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmne p0.d, p0/z, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmne p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmne p0.h, p0/z, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmne p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmne p0.s, p0/z, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmne p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmuo p0.d, p0/z, z0.d, z1.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmuo p0.h, p0/z, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - fcmuo p0.s, p0/z, z0.s, z1.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt z0.d, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt z0.d, p0/m, z0.s +# CHECK-NEXT: - - - - 
- - - - 0.50 0.50 - - - fcvt z0.h, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt z0.h, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt z0.s, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvt z0.s, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtlt z0.s, p0/m, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtlt z30.d, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnt z0.h, p0/m, z1.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtnt z30.s, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtx z0.s, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtx z30.s, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtxnt z0.s, p0/m, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtxnt z30.s, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.d, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.d, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.s, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.s, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzs z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.d, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.d, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.s, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.s, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fcvtzu z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - - - - - 19.00 fdiv z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 5.00 fdiv z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - 
- - - - - 10.00 fdiv z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - - - 19.00 fdivr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 5.00 fdivr z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - - - 10.00 fdivr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fexpa z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fexpa z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fexpa z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - flogb z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - flogb z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - flogb z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmad z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmad z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmad z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmax z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: - - - 
- - - - - 0.50 0.50 - - - fmaxnm z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnm z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxnmp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fmaxnmv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fmaxnmv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fmaxnmv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmaxp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fmaxv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fmaxv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fmaxv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmin z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z0.d, p0/m, z0.d, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z0.d, p7/m, z0.d, z31.d +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z0.h, p0/m, z0.h, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z0.s, p0/m, z0.s, #0.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnm z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminnmp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fminnmv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fminnmv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fminnmv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp z29.s, p3/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fminp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fminv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fminv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - fminv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla z0.d, z1.d, z7.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmla z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlalb z0.s, z1.h, z7.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlalb z29.s, z30.h, z31.h 
+# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlalb z30.s, z31.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlalt z0.s, z1.h, z7.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlalt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlalt z30.s, z31.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls z0.d, z1.d, z7.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmls z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlslb z0.s, z1.h, z7.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlslb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlslb z30.s, z31.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlslt z0.s, z1.h, z7.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlslt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmlslt z30.s, z31.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.d, #-10.00000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.d, #0.12500000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.d, p0/m, #-10.00000000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.d, p0/m, #0.12500000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.h, #-0.12500000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.h, p0/m, #-0.12500000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.s, #-0.12500000 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fmov z0.s, p0/m, #-0.12500000 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmsb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmsb z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmsb z0.s, p7/m, 
z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.d, z0.d, z0.d[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.h, z0.h, z0.h[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.s, z0.s, z0.s[0] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z31.d, p7/m, z31.d, #2.0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z31.d, z31.d, z15.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z31.h, p7/m, z31.h, #2.0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z31.h, z31.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z31.s, p7/m, z31.s, #2.0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmul z31.s, z31.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fmulx z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fneg z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmad z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmad z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmad 
z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmla z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmla z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmls z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmls z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmls z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmsb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmsb z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fnmsb z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpe z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecps z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpx z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpx z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frecpx z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinta z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frinti z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintm z31.s, 
p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintn z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintp z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintx z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - frintz z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrte z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - frsqrts z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fscale z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fscale z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - fscale z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - - - 19.00 fsqrt z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 5.00 fsqrt z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - - - - - 9.00 fsqrt z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - 
- 0.50 0.50 - - - fsub z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsub z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z0.d, p0/m, z0.d, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z0.h, p0/m, z0.h, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z0.s, p0/m, z0.s, #0.5 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z31.d, p7/m, z31.d, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z31.h, p7/m, z31.h, #1.0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - fsubr z31.s, p7/m, z31.s, #1.0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ftmad z0.d, z0.d, z31.d, #7 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ftmad z0.h, z0.h, z31.h, #7 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ftmad z0.s, z0.s, z31.s, #7 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ftsmul z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ftsmul z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ftsmul z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ftssel z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ftssel z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ftssel z0.s, z1.s, z31.s +# 
CHECK-NEXT: - - - - - - - - 4.00 - - - - histcnt z0.s, p0/z, z1.s, z2.s +# CHECK-NEXT: - - - - - - - - 4.00 - - - - histcnt z29.d, p7/z, z30.d, z31.d +# CHECK-NEXT: - - - - - - - - 4.00 - - - - histseg z0.b, z1.b, z31.b +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incb x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incb x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incb x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incb x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incb x0, vl1 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incd x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incd x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incd x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incd x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incd x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - incd z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - incd z0.d, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - inch x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - inch x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - inch x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - inch x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - inch x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - inch z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - inch z0.h, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp x0, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp x0, p0.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp x0, p0.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp x0, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp xzr, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp xzr, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp xzr, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp xzr, p15.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp z31.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp z31.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - incp z31.s, 
p15.s +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incw x0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incw x0, #14 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incw x0, all, mul #16 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incw x0, pow2 +# CHECK-NEXT: - 1.00 - - - - - - - - - - - incw x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - incw z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - incw z0.s, all, mul #16 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z0.b, #0, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z0.d, #0, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z0.h, #0, #0 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z0.h, w0, w0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z0.s, #0, #0 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z21.b, w10, w21 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z21.d, x10, x21 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z21.s, w10, w21 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.b, #13, w8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.b, w13, #8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.d, #13, x8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.d, x13, #8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.h, #13, w8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.h, w13, #8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.s, #13, w8 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z23.s, w13, #8 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z31.b, #-1, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.b, #-1, wzr +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.b, wzr, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.b, wzr, wzr +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z31.d, #-1, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.d, #-1, xzr +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.d, xzr, #-1 +# 
CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.d, xzr, xzr +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z31.h, #-1, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.h, #-1, wzr +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.h, wzr, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.h, wzr, wzr +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - index z31.s, #-1, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.s, #-1, wzr +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.s, wzr, #-1 +# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 - index z31.s, wzr, wzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z0.b, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z0.d, x0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z0.h, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z0.s, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.b, b31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.b, wzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.d, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.d, xzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.h, h31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.h, wzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.s, s31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - insr z31.s, wzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lasta b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lasta d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lasta h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lasta s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lasta w0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lasta w0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lasta w0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lasta x0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lastb b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - lastb d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lastb h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lastb s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lastb w0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lastb w0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lastb w0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - lastb x0, p7, z31.d +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z0.b }, p0/z, [sp, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z0.b }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1b { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z21.b }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z21.s }, p5/z, [x10, x21] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z23.d }, p3/z, [x13, x8] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z31.b }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - 
- - ld1b { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1b { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1b { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1b { z5.h }, p3/z, [x17, x16] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1d { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1d { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1d { z23.d }, p3/z, [sp, x8, lsl #3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1d { z23.d }, p3/z, [x13, x8, lsl #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z23.d }, p3/z, [x13, z8.d, lsl #3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1d { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1d { z31.d }, p7/z, [z31.d, #248] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z0.h }, p0/z, [x0] +# 
CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1h { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z21.s }, p5/z, [x10, x21, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z23.d }, p3/z, [x13, x8, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1h { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1h { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z5.h }, p3/z, [sp, x16, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1h { z5.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z0.b }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z0.d }, p0/z, [x0] +# 
CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z31.b }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rd { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rd { z31.d }, p7/z, [sp, #504] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rh { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rh { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rh { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rh { z31.h }, p7/z, [sp, #126] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqb { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqb { z0.b }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqb { z21.b }, p5/z, [x10, #112] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqb { z23.b }, p3/z, [x13, #-128] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqb { z31.b }, p7/z, [sp, #-16] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqd { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqd { z23.d }, p3/z, [x13, #-128] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqd { z23.d }, p3/z, [x13, #112] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqd { z31.d }, p7/z, [sp, #-16] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] +# 
CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqh { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqh { z23.h }, p3/z, [x13, #-128] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqh { z23.h }, p3/z, [x13, #112] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqh { z31.h }, p7/z, [sp, #-16] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqw { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqw { z23.s }, p3/z, [x13, #-128] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqw { z23.s }, p3/z, [x13, #112] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rqw { z31.s }, p7/z, [sp, #-16] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsb { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsb { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsb { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsb { z31.d }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsb { z31.h }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsb { z31.s }, p7/z, [sp, #63] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsh { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsh { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsh { z31.d }, p7/z, [sp, #126] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsh { z31.s }, p7/z, [sp, #126] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsw { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rsw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rw { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rw { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rw { z31.d }, p7/z, [sp, #252] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1rw { z31.s }, p7/z, [sp, #252] +# CHECK-NEXT: - - 
- - 1.00 1.00 - - - - - - - ld1sb { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sb { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z0.h }, p0/z, [sp, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z0.h }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1sb { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sb { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sb { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z21.s }, p5/z, [x10, x21] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z23.d }, p3/z, [x13, x8] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sb { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sb { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sb { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1sb { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z0.d }, p0/z, 
[z0.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1sh { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z21.s }, p5/z, [sp, x21, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z21.s }, p5/z, [x10, x21, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z23.d }, p3/z, [x13, x8, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sh { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1sh { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sw { z21.d }, p5/z, 
[x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sw { z23.d }, p3/z, [sp, x8, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sw { z23.d }, p3/z, [x13, x8, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1sw { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1sw { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1w { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z21.s }, p5/z, [sp, x21, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z21.s }, p5/z, [x10, x21, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z23.d }, p3/z, [x13, x8, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z23.d }, p3/z, 
[x13, z8.d, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ld1w { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ld1w { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ld1w { z31.s }, p7/z, [z31.s, #124] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2b { z21.b, z22.b }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2b { z23.b, z24.b }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2b { z5.b, z6.b }, p3/z, [x17, x16] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2d { z0.d, z1.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2d { z0.d, z1.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2d { z21.d, z22.d }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2d { z23.d, z24.d }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2d { z5.d, z6.d }, p3/z, [x17, x16, lsl #3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2h { z0.h, z1.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2h { z0.h, z1.h }, p0/z, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2h { z21.h, z22.h }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2h { z23.h, z24.h }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2h { z5.h, z6.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2w { z0.s, z1.s }, p0/z, [x0, x0, lsl #2] +# 
CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2w { z0.s, z1.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2w { z21.s, z22.s }, p5/z, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ld2w { z23.s, z24.s }, p3/z, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - ld2w { z5.s, z6.s }, p3/z, [x17, x16, lsl #2] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3b { z0.b - z2.b }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3b { z0.b - z2.b }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3b { z21.b - z23.b }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3b { z23.b - z25.b }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3b { z5.b - z7.b }, p3/z, [x17, x16] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3d { z0.d - z2.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3d { z0.d - z2.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3d { z21.d - z23.d }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3d { z23.d - z25.d }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3d { z5.d - z7.d }, p3/z, [x17, x16, lsl #3] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3h { z0.h - z2.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3h { z0.h - z2.h }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3h { z21.h - z23.h }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3h { z23.h - z25.h }, p3/z, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3h { z5.h - z7.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3w { z0.s - z2.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3w { z0.s - z2.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3w { z21.s - z23.s }, p5/z, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld3w { z23.s - z25.s }, p3/z, 
[x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld3w { z5.s - z7.s }, p3/z, [x17, x16, lsl #2] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4b { z0.b - z3.b }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4b { z0.b - z3.b }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4b { z21.b - z24.b }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4b { z23.b - z26.b }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4b { z5.b - z8.b }, p3/z, [x17, x16] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4d { z0.d - z3.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4d { z0.d - z3.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4d { z21.d - z24.d }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4d { z23.d - z26.d }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4d { z5.d - z8.d }, p3/z, [x17, x16, lsl #3] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4h { z0.h - z3.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4h { z0.h - z3.h }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4h { z21.h - z24.h }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4h { z23.h - z26.h }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4h { z5.h - z8.h }, p3/z, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4w { z0.s - z3.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4w { z0.s - z3.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4w { z21.s - z24.s }, p5/z, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - ld4w { z23.s - z26.s }, p3/z, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - ld4w { z5.s - z8.s }, p3/z, [x17, x16, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z0.d }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - 
ldff1b { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z0.h }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z0.s }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1b { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z31.b }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1b { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1b { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z31.h }, p7/z, [sp] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1b { z31.s }, p7/z, [sp] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1b { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1d { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z23.d }, p3/z, [x13, z8.d, lsl #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1d { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1d { z31.d }, p7/z, [z31.d, #248] +# CHECK-NEXT: 
- - - - 1.00 1.00 - - - - - - - ldff1h { z0.d }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1h { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1h { z0.s }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1h { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1h { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1h { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1h { z31.h }, p7/z, [sp] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1h { z31.s }, p7/z, [sp] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1h { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sb { z0.d }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sb { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sb { z0.h }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - 
- - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1sb { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sb { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sb { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sb { z31.d }, p7/z, [z31.d, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sb { z31.h }, p7/z, [sp] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sb { z31.s }, p7/z, [sp] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1sb { z31.s }, p7/z, [z31.s, #31] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sh { z0.d }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1sh { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z23.d }, p3/z, [x13, z8.d, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sh { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sh { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - 
ldff1sh { z31.d }, p7/z, [z31.d, #62] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sh { z31.s }, p7/z, [sp] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1sh { z31.s }, p7/z, [z31.s, #62] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1sw { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1sw { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1w { z0.d }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z0.d }, p0/z, [z0.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1w { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1w { z0.s }, p0/z, [z0.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - 
6.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z23.d }, p3/z, [x13, z8.d, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z31.d }, p7/z, [sp, z31.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1w { z31.d }, p7/z, [sp] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldff1w { z31.d }, p7/z, [z31.d, #124] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldff1w { z31.s }, p7/z, [sp] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldff1w { z31.s }, p7/z, [z31.s, #124] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z0.b }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z21.b }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z31.b }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1b { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1d { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1d { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1d { z31.d }, p7/z, [sp, #-1, mul vl] +# 
CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1h { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z21.h }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z31.h }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sb { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sh { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sh { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sh { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sh { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sh { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - 
ldnf1sh { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sw { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sw { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1sw { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1w { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1w { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1w { z21.d }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1w { z21.s }, p5/z, [x10, #5, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1w { z31.d }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnf1w { z31.s }, p7/z, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1b { z0.b }, p0/z, [x0, x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1b { z0.b }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1b { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1b { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1b { z21.b }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1b { z23.b }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1b { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1b { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1b { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1b { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1d { z0.d }, p0/z, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1d { z0.d }, p0/z, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1d { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1d { z21.d }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1d { z23.d }, p3/z, [x13, 
#-8, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1d { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1d { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1h { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1h { z0.h }, p0/z, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1h { z0.h }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1h { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1h { z21.h }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1h { z23.h }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1h { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1h { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1h { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1h { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sb { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1sb { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sb { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sb { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1sb { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1sb { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sh { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1sh { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sh { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sh { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1sh { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1sh { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sw { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - 
- 6.00 - - - - - - - ldnt1sw { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1sw { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1w { z0.d }, p0/z, [z1.d] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1w { z0.s }, p0/z, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1w { z0.s }, p0/z, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1w { z0.s }, p0/z, [z1.s] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1w { z21.s }, p5/z, [x10, #7, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldnt1w { z23.s }, p3/z, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1w { z31.d }, p7/z, [z31.d, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - ldnt1w { z31.d }, p7/z, [z31.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1w { z31.s }, p7/z, [z31.s, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - ldnt1w { z31.s }, p7/z, [z31.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ldr p0, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ldr p5, [x10, #255, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - ldr p7, [x13, #-256, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr z0, [x0] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr z23, [x13, #255, mul vl] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - - ldr z31, [sp, #-256, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.b, p0/m, z0.b, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.b, z1.b, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.h, p0/m, z0.h, #0 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.h, p0/m, z0.h, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.h, z1.h, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.s, p0/m, z0.s, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z0.s, z1.s, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.b, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.d, z31.d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.h, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsl z31.s, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lslr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lslr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lslr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lslr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.b, p0/m, z0.b, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.b, z1.b, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.d, p0/m, 
z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.h, p0/m, z0.h, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.h, z1.h, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.s, p0/m, z0.s, z1.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z0.s, z1.s, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsr z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsrr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsrr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsrr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - lsrr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mad z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mad z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mad z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mad z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 
- - - match p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - match p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - match p15.b, p7/z, z30.b, z31.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - match p15.h, p7/z, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.d, z1.d, z7.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mla z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.d, z1.d, z7.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mls z0.s, z1.s, z7.s[3] +# CHECK-NEXT: 1.00 - - - - - - - - - - - - mov p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - mov p0.b, p0/m, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - mov p0.b, p0/z, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - mov p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - mov p15.b, p15/m, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - mov p15.b, p15/z, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.b, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.b, b0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.b, p0/m, b0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.b, p0/m, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.b, p0/z, 
#127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.b, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, #0xe0000000000003ff +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, #0xffffffffffff7fff +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, #32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, d0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, p0/m, d0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, p0/m, x0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, x0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, #-256 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, #32767 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, h0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, p0/m, h0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, p0/m, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, p0/z, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.h, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.q, q0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, #0xffff7fff +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, #32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, p0/m, s0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, p0/m, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, s0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z0.s, w0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, #127 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, p0/z, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, p0/z, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, p0/z, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, p0/z, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, p15/m, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.d, p15/m, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, p0/z, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, p0/z, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, p0/z, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, p0/z, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, p15/m, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.h, p15/m, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, p0/z, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, p0/z, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, p0/z, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, p0/z, #32512 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, p15/m, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z21.s, p15/m, #-32768 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.b, p15/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.b, p7/m, b31 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movprfx z31, z6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.b, p7/m, wsp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.b, wsp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.b, z31.b[63] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.d, p15/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.d, p7/m, d31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - movprfx z31.d, p7/z, z6.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.d, p7/m, sp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.d, sp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.d, z31.d[7] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.h, p15/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.h, p7/m, h31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.h, p7/m, wsp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.h, wsp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.h, z31.h[31] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.s, p15/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.s, p7/m, s31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.s, p7/m, wsp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.s, wsp +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z31.s, z31.s[15] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, #-1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, p0/z, #-1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, p0/z, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, p0/z, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.b, p15/m, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.d, #-6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.h, #-6 +# CHECK-NEXT: 
- - - - - - - - 0.50 0.50 - - - mov z5.q, z17.q[3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - mov z5.s, #-6 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - movs p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - movs p0.b, p0/z, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - movs p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - movs p15.b, p15/z, p15.b +# CHECK-NEXT: - - 1.00 - - - - - - - - - - mrs x3, ID_AA64ZFR0_EL1 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - mrs x3, ZCR_EL1 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - mrs x3, ZCR_EL12 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - mrs x3, ZCR_EL2 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - mrs x3, ZCR_EL3 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - msb z0.b, p7/m, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - msb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - msb z0.h, p7/m, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - msb z0.s, p7/m, z1.s, z31.s +# CHECK-NEXT: - - 1.00 - - - - - - - - - - msr ZCR_EL1, x3 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - msr ZCR_EL12, x3 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - msr ZCR_EL2, x3 +# CHECK-NEXT: - - 1.00 - - - - - - - - - - msr ZCR_EL3, x3 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.b, p7/m, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.b, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.d, z1.d, z15.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.h, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z29.s, z30.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.b, 
z31.b, #-128 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.b, z31.b, #127 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.d, z31.d, #-128 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.d, z31.d, #127 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.h, z31.h, #-128 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.h, z31.h, #127 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.s, z31.s, #-128 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - mul z31.s, z31.s, #127 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nand p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nand p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nands p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nands p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - nbsl z0.d, z0.d, z1.d, z2.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z0.b, p0/m, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - neg z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - nmatch p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - nmatch p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - nmatch p15.b, p7/z, z30.b, z31.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - nmatch p15.h, p7/z, z30.h, z31.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nor p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nor p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - 
- - - nors p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nors p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - not p0.b, p0/z, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - not p15.b, p15/z, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - not z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - not z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - not z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - not z31.s, p7/m, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nots p0.b, p0/z, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - nots p15.b, p15/z, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - orn p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - orn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - orns p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - orns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - orr p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z0.d, z0.d, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z0.d, z0.d, #0xfffffffffffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z0.s, z0.s, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z0.s, z0.s, #0xfffffff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z23.d, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z23.h, z23.h, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z23.h, z23.h, #0xfff9 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z5.b, z5.b, #0x6 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - orr z5.b, z5.b, #0xf9 +# CHECK-NEXT: 1.00 - - 
- - - - - - - - - - orrs p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - orv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - orv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - orv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - orv s0, p7, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pfalse p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pfirst p0.b, p15, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pfirst p15.b, p15, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmul z0.b, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmul z29.b, z30.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmullb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - - - 7.00 pmullb z29.q, z30.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 7.00 pmullb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - pmullt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - - - 7.00 pmullt z29.q, z30.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 7.00 pmullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pnext p0.b, p15, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pnext p0.d, p15, p0.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pnext p0.h, p15, p0.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pnext p0.s, p15, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - pnext p15.b, p15, p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb #14, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb #15, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb #6, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb #7, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb #7, p3, [z13.s, #31] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb #7, p3, [z13.s] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl1keep, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl1keep, p0, [x0, z0.d] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
prfb pldl1keep, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl3strm, p5, [x10, z21.d, sxtw] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl3strm, p5, [x10, z21.s, uxtw] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl3strm, p5, [z10.d, #31] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pldl3strm, p5, [z10.d] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pstl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pstl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pstl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pstl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pstl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfb pstl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #14, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #15, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #15, p7, [z31.d, #248] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #15, p7, [z31.d] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #15, p7, [z31.s, #248] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #15, p7, [z31.s] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #6, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd #7, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
prfd pldl1keep, p0, [x0, z0.d, lsl #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1keep, p0, [x0, z0.d, sxtw #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1keep, p0, [x0, z0.d, uxtw #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1keep, p0, [x0, z0.s, sxtw #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1keep, p0, [x0, z0.s, uxtw #3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pldl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pstl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pstl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pstl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pstl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pstl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfd pstl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #14, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #15, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #15, p7, [z31.d, #62] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #15, p7, [z31.d] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #15, p7, [z31.s, #62] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #15, p7, [z31.s] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #6, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh #7, p0, [x0] +# CHECK-NEXT: - - - - - - - 
- 0.50 0.50 - - - prfh pldl1keep, p0, [x0, z0.d, lsl #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl3strm, p5, [x10, z21.d, sxtw #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl3strm, p5, [x10, z21.d, uxtw #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl3strm, p5, [x10, z21.s, sxtw #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pldl3strm, p5, [x10, z21.s, uxtw #1] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pstl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pstl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pstl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pstl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pstl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfh pstl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #14, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #15, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #15, p7, [z31.d, #124] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #15, p7, [z31.d] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #15, p7, [z31.s, #124] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #15, p7, [z31.s] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #6, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #7, p0, [x0] 
+# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw #7, p3, [x13, z8.d, uxtw #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl1keep, p0, [x0, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl1keep, p0, [x0, z0.s, uxtw #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl1strm, p0, [x0, #-32, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl1strm, p0, [x0, #31, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl3strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl3strm, p5, [x10, z21.d, lsl #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pldl3strm, p5, [x10, z21.s, sxtw #2] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pstl1keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pstl1strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pstl2keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pstl2strm, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pstl3keep, p0, [x0] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - prfw pstl3strm, p0, [x0] +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptest p15, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptest p15, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p0.b, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p0.d, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p0.h, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p0.s, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p15.h 
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p15.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #15 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #17 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #18 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #19 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #20 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #21 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #22 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #23 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #24 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #25 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #26 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #27 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, #28 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, mul3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, mul4 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl128 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl256 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl32 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl4 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl5 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl6 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl64 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl7 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrue p7.s, vl8 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p0.b, pow2 +# CHECK-NEXT: 
1.00 - - - - - - - - - - - - ptrues p0.d, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p0.h, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p0.s, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p15.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #15 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #17 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #18 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #19 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #20 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #21 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #22 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #23 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #24 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #25 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #26 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #27 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, #28 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, mul3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, mul4 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl128 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl256 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl32 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl4 +# 
CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl5 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl6 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl64 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl7 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - ptrues p7.s, vl8 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - punpkhi p0.h, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - punpkhi p15.h, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - punpklo p0.h, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - punpklo p15.h, p15.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhnb z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhnb z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhnb z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhnt z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhnt z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - raddhnt z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rax1 z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rbit z0.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rbit z0.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rbit z0.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rbit z0.s, p7/m, z31.s +# CHECK-NEXT: - - - - - 1.00 - - - - - - - rdffr p0.b +# CHECK-NEXT: - - - - - 1.00 - - - - - - - rdffr p0.b, p0/z +# CHECK-NEXT: - - - - - 1.00 - - - - - - - rdffr p15.b +# CHECK-NEXT: - - - - - 1.00 - - - - - - - rdffr p15.b, p15/z +# CHECK-NEXT: - - - - - 1.00 - - - - - - - rdffrs p0.b, p0/z +# CHECK-NEXT: - - - - - 1.00 - - - - - - - rdffrs p15.b, p15/z +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rdvl x0, #0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rdvl x21, #-32 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rdvl x23, #31 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - rdvl xzr, #-1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - rev 
p0.b, p1.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - rev p0.d, p1.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - rev p0.h, p1.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - rev p0.s, p1.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev z0.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev z0.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev z0.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rev z0.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - revb z0.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - revb z0.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - revb z0.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - revh z0.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - revh z0.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - revw z0.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - rshrnt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhnb z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhnb z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhnb z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhnt z0.b, z1.h, z31.h +# 
CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhnt z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - rsubhnt z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - saba z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - saba z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - saba z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - saba z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabalb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabalb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabalb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabalt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabalt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - sabalt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabd z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabd z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabd z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabd z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdlb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdlb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdlb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdlt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdlt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sabdlt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - sadalp z0.h, p0/m, z1.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - sadalp z29.s, p0/m, z30.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - sadalp z30.d, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - saddlb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlbt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlbt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlbt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddlt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 1.00 - - - - saddv d0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - saddv d0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - saddv d0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddwb z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddwb z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddwb z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddwt z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddwt z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - saddwt z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sbclb z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sbclb z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sbclt z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sbclt z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.d, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.h, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.h, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.s, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - scvtf z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - - - - - 23.00 sdiv z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 
12.00 sdiv z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - - - 23.00 sdivr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 12.00 sdivr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot z0.d, z1.h, z15.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot z0.d, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot z0.s, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sdot z0.s, z1.b, z7.b[3] +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sel p0.b, p1, p2.b, p3.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sel z23.b, p11, z13.b, z8.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sel z23.d, p11, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sel z23.h, p11, z13.h, z8.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sel z23.s, p11, z13.s, z8.s +# CHECK-NEXT: - - - - - 1.00 - - - - - - - setffr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shrnt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 
0.50 - - - shrnt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - shsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z31.b, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z31.d, z31.d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z31.h, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sli z31.s, z31.s, #31 +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm4e z0.s, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - - - 7.00 sm4ekey z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z0.b, z0.b, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z0.d, z0.d, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z0.h, z0.h, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z0.s, z0.s, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.b, z31.b, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.d, z31.d, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.h, p7/m, z31.h, z31.h +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.h, z31.h, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smax z31.s, z31.s, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smaxp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - smaxv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - smaxv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - smaxv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - smaxv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z0.b, z0.b, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z0.d, z0.d, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z0.h, z0.h, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z0.s, z0.s, #-128 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.b, z31.b, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.d, z31.d, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.h, z31.h, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - smin z31.s, z31.s, #127 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sminp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 
1.00 - - - - sminv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - sminv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - sminv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - sminv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlalt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smlslt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smmla z0.s, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z0.b, p7/m, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z0.b, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - 
- - - - - - - 0.50 0.50 - smulh z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z0.h, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z29.s, z30.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smulh z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - smullt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z29.b, p7, { z30.b, z31.b } +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z29.d, p7, { z30.d, z31.d } +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z29.h, p7, { z30.h, z31.h } +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z29.s, p7, { z30.s, z31.s } +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z31.b, p7, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z31.d, p7, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z31.h, p7, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - splice z31.s, p7, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqabs 
z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqadd z31.s, z31.s, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z0.b, z0.b, z0.b, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z0.d, z0.d, z0.d, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z0.h, z0.h, z0.h, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z0.s, z0.s, z0.s, #90 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z31.b, z31.b, z31.b, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z31.d, z31.d, z31.d, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqcadd z31.h, z31.h, z31.h, #270 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
sqcadd z31.s, z31.s, z31.s, #270 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecd x0, w0, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecd z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecd z0.d, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecd z0.d, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecd z0.d, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdech x0, w0, pow2, 
mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdech z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdech z0.h, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdech z0.h, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdech z0.h, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp x0, p0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp x0, p0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp x0, p0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp x0, p0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp xzr, p15.b, wzr +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp xzr, p15.d, wzr +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp xzr, p15.h, wzr +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqdecp xzr, p15.s, wzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecp z0.d, p0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecp z0.h, p0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecp z0.s, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqdecw x0, w0, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecw z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecw z0.s, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecw z0.s, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqdecw z0.s, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalb z0.d, z1.s, z15.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalb z0.d, z1.s, z31.s +# 
CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalbt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalbt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalbt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalt z0.d, z1.s, z15.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlalt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslb z0.d, z1.s, z15.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslbt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslbt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslbt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslt z0.d, z1.s, z15.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmlslt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh z0.b, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh z0.d, z1.d, z15.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh 
z0.h, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh z29.s, z30.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmulh z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqdmullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, w0 
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincd x0, w0, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincd z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincd z0.d, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincd z0.d, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincd z0.d, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqinch x0, w0, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqinch z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqinch z0.h, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqinch z0.h, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqinch z0.h, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp x0, p0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp x0, p0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp x0, p0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp x0, p0.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp xzr, p15.b, wzr +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp xzr, p15.d, wzr +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp xzr, p15.h, wzr +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - sqincp xzr, p15.s, wzr +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincp z0.d, p0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincp z0.h, p0.h +# CHECK-NEXT: - - 
- - - - - - 0.50 0.50 - - - sqincp z0.s, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - sqincw x0, w0, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincw z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincw z0.s, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincw z0.s, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqincw z0.s, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg z31.b, p7/m, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqneg z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z0.b, z1.b, z2.b, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z0.d, z1.d, z2.d, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z0.h, z1.h, z2.h, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z0.h, z1.h, z2.h[0], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z0.s, z1.s, z2.s, #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z0.s, z1.s, z2.s[0], #0 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z15.b, z16.b, z17.b, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z15.d, z16.d, z17.d, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z15.h, z16.h, z17.h, #270 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z15.s, z16.s, z17.s, #270 +# CHECK-NEXT: - - - 
- - - - - - - 0.50 0.50 - sqrdcmlah z29.b, z30.b, z31.b, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z29.d, z30.d, z31.d, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z29.h, z30.h, z31.h, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z29.s, z30.s, z31.s, #90 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z31.b, z31.b, z31.b, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z31.d, z31.d, z31.d, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z31.h, z30.h, z7.h[0], #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z31.h, z31.h, z31.h, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z31.s, z30.s, z7.s[0], #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdcmlah z31.s, z31.s, z31.s, #180 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.d, z1.d, z15.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlah z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.d, z1.d, z15.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmlsh z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z0.b, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z0.d, z1.d, 
z15.d[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z0.h, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z0.h, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z0.s, z1.s, z7.s[3] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z29.s, z30.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - sqrdmulh z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrnt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 
- - - sqrshrunb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqrshrunt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z0.h, p0/m, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshl z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlr z31.d, p7/m, z31.d, z30.d +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z0.h, p0/m, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshlu z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrnt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - sqshrunt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqshrunt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsub z31.s, z31.s, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - 
sqsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtnb z0.b, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtnb z0.h, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtnb z0.s, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtnt z0.b, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtnt z0.h, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtnt z0.s, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtunb z0.b, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtunb z0.h, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtunb z0.s, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtunt z0.b, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtunt z0.h, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sqxtunt z0.s, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srhadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sri z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshl z31.d, p7/m, z31.d, z30.d +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - srshr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - srsra z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllb z0.d, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllb z0.h, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllb z0.s, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllb z31.d, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllb z31.h, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllb z31.s, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllt z0.d, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 
0.50 - - - sshllt z0.h, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllt z0.s, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllt z31.d, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllt z31.h, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sshllt z31.s, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssra z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublbt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublbt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublbt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssublt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubltb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubltb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubltb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubwb z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubwb z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubwb z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 
- - - ssubwt z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubwt z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ssubwt z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.b }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.b }, p0, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.d }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1b { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1b { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1b { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.d }, p0, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1b { z0.d }, p7, [z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.h }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.h }, p0, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.s }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1b { z0.s }, p0, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1b { z0.s }, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z0.s }, p0, [x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1b { z0.s }, p7, [z0.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z21.b }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z21.h }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z21.s }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z31.b }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1b { z31.d }, p7, [z31.d, #31] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z31.h }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1b { z31.s }, p7, [sp, #-1, 
mul vl] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1b { z31.s }, p7, [z31.s, #31] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1d { z0.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p0, [x0, z0.d, lsl #3] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p0, [x0, z0.d, sxtw #3] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p0, [x0, z0.d, uxtw #3] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1d { z0.d }, p0, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z0.d }, p7, [z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1d { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1d { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1d { z31.d }, p7, [z31.d, #248] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z0.d }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p0, [x0, z0.d, lsl #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p0, [x0, z0.d, sxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p0, [x0, z0.d, uxtw #1] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z0.d }, p0, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z0.d }, p7, [z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z0.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z0.h }, p0, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z0.s }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - 
st1h { z0.s }, p0, [x0, z0.s, sxtw #1] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1h { z0.s }, p0, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1h { z0.s }, p0, [x0, z0.s, uxtw #1] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1h { z0.s }, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z0.s }, p0, [x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1h { z0.s }, p7, [z0.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z21.h }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z21.s }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1h { z31.d }, p7, [z31.d, #62] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z31.h }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1h { z31.s }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1h { z31.s }, p7, [z31.s, #62] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z0.d }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p0, [x0, z0.d, lsl #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p0, [x0, z0.d, sxtw #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p0, [x0, z0.d, sxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p0, [x0, z0.d, uxtw #2] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p0, [x0, z0.d, uxtw] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p0, [x0, z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z0.d }, p0, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z0.d }, p7, [z0.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z0.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1w { z0.s }, p0, [x0, z0.s, sxtw #2] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1w { z0.s }, p0, [x0, 
z0.s, sxtw] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1w { z0.s }, p0, [x0, z0.s, uxtw #2] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1w { z0.s }, p0, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z0.s }, p0, [x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1w { z0.s }, p7, [z0.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z21.d }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z21.s }, p5, [x10, #5, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z31.d }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - st1w { z31.d }, p7, [z31.d, #124] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - st1w { z31.s }, p7, [sp, #-1, mul vl] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - st1w { z31.s }, p7, [z31.s, #124] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2b { z0.b, z1.b }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2b { z0.b, z1.b }, p0, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2b { z21.b, z22.b }, p5, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2b { z23.b, z24.b }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2b { z5.b, z6.b }, p3, [x17, x16] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2d { z0.d, z1.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2d { z0.d, z1.d }, p0, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2d { z21.d, z22.d }, p5, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2d { z23.d, z24.d }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2d { z5.d, z6.d }, p3, [x17, x16, lsl #3] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2h { z0.h, z1.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2h { z0.h, z1.h }, p0, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2h { z21.h, z22.h }, p5, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2h { z23.h, z24.h }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: - - 
- - - 2.00 - - - - - - - st2h { z5.h, z6.h }, p3, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2w { z0.s, z1.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2w { z0.s, z1.s }, p0, [x0] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2w { z21.s, z22.s }, p5, [x10, #10, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2w { z23.s, z24.s }, p3, [x13, #-16, mul vl] +# CHECK-NEXT: - - - - - 2.00 - - - - - - - st2w { z5.s, z6.s }, p3, [x17, x16, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3b { z0.b - z2.b }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3b { z0.b - z2.b }, p0, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3b { z21.b - z23.b }, p5, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3b { z23.b - z25.b }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3b { z5.b - z7.b }, p3, [x17, x16] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st3d { z0.d - z2.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st3d { z0.d - z2.d }, p0, [x0] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st3d { z21.d - z23.d }, p5, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st3d { z23.d - z25.d }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 3.00 - - - - - - - st3d { z5.d - z7.d }, p3, [x17, x16, lsl #3] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3h { z0.h - z2.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3h { z0.h - z2.h }, p0, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3h { z21.h - z23.h }, p5, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3h { z23.h - z25.h }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3h { z5.h - z7.h }, p3, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3w { z0.s - z2.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3w { z0.s - z2.s }, p0, [x0] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3w { 
z21.s - z23.s }, p5, [x10, #15, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3w { z23.s - z25.s }, p3, [x13, #-24, mul vl] +# CHECK-NEXT: - - - - - 6.00 - - - - - - - st3w { z5.s - z7.s }, p3, [x17, x16, lsl #2] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4b { z0.b - z3.b }, p0, [x0, x0] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4b { z0.b - z3.b }, p0, [x0] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4b { z21.b - z24.b }, p5, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4b { z23.b - z26.b }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4b { z5.b - z8.b }, p3, [x17, x16] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4d { z0.d - z3.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4d { z0.d - z3.d }, p0, [x0] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4d { z21.d - z24.d }, p5, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4d { z23.d - z26.d }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 4.00 - - - - - - - st4d { z5.d - z8.d }, p3, [x17, x16, lsl #3] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4h { z0.h - z3.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4h { z0.h - z3.h }, p0, [x0] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4h { z21.h - z24.h }, p5, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4h { z23.h - z26.h }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4h { z5.h - z8.h }, p3, [x17, x16, lsl #1] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4w { z0.s - z3.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4w { z0.s - z3.s }, p0, [x0] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4w { z21.s - z24.s }, p5, [x10, #20, mul vl] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4w { z23.s - z26.s }, p3, [x13, #-32, mul vl] +# CHECK-NEXT: - - - - - 8.00 - - - - - - - st4w { z5.s - z8.s }, p3, [x17, x16, lsl #2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1b { z0.b 
}, p0, [x0, x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1b { z0.b }, p0, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1b { z0.d }, p0, [z1.d] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1b { z0.s }, p0, [z1.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1b { z21.b }, p5, [x10, #7, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1b { z23.b }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1b { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1b { z31.d }, p7, [z31.d] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1b { z31.s }, p7, [z31.s, x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1b { z31.s }, p7, [z31.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1d { z0.d }, p0, [x0, x0, lsl #3] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1d { z0.d }, p0, [x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1d { z0.d }, p0, [z1.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1d { z21.d }, p5, [x10, #7, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1d { z23.d }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1d { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1d { z31.d }, p7, [z31.d] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1h { z0.d }, p0, [z1.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1h { z0.h }, p0, [x0, x0, lsl #1] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1h { z0.h }, p0, [x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1h { z0.s }, p0, [z1.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1h { z21.h }, p5, [x10, #7, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1h { z23.h }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1h { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1h { z31.d }, p7, [z31.d] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1h { z31.s }, p7, [z31.s, x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1h { 
z31.s }, p7, [z31.s] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1w { z0.d }, p0, [z1.d] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1w { z0.s }, p0, [x0, x0, lsl #2] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1w { z0.s }, p0, [x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1w { z0.s }, p0, [z1.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1w { z21.s }, p5, [x10, #7, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - stnt1w { z23.s }, p3, [x13, #-8, mul vl] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1w { z31.d }, p7, [z31.d, x0] +# CHECK-NEXT: - - - - - 7.00 - - - - - - - stnt1w { z31.d }, p7, [z31.d] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1w { z31.s }, p7, [z31.s, x0] +# CHECK-NEXT: - - - - - 9.00 - - - - - - - stnt1w { z31.s }, p7, [z31.s] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str p0, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str p15, [sp, #-256, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str p5, [x10, #255, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str z0, [x0] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str z21, [x10, #-256, mul vl] +# CHECK-NEXT: - - - - - 1.00 - - - - - - - str z31, [sp, #255, mul vl] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 
0.50 0.50 - - - sub z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.b, p5/m, z21.b, z10.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.b, z10.b, z21.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.d, p5/m, z21.d, z10.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.d, z10.d, z21.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.h, p5/m, z21.h, z10.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.h, z10.h, z21.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.s, p5/m, z21.s, z10.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z21.s, z10.s, z21.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.b, p3/m, z23.b, z13.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.b, z13.b, z8.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.d, p3/m, z23.d, z13.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.d, z13.d, z8.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.h, p3/m, z23.h, z13.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.h, z13.h, z8.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.s, p3/m, z23.s, z13.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z23.s, z13.s, z8.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub 
z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.s, z31.s, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sub z31.s, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhnb z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhnb z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhnb z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhnt z0.b, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhnt z0.h, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subhnt z0.s, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.b, p0/m, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.h, p0/m, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.s, p0/m, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - subr z31.s, z31.s, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sunpkhi z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sunpkhi z31.h, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 
0.50 - - - sunpkhi z31.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sunpklo z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sunpklo z31.h, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sunpklo z31.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - suqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtb z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtb z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtb z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtb z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtb z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtb z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxth z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxth z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxth z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxth z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtw z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - sxtw z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 5.00 5.00 - - - tbl z28.b, { z29.b, z30.b }, z31.b +# CHECK-NEXT: - - - - - - - - 5.00 5.00 - - - tbl z28.d, { z29.d, z30.d }, z31.d +# CHECK-NEXT: - - - - - - - - 5.00 5.00 - - - tbl z28.h, { z29.h, z30.h }, z31.h +# CHECK-NEXT: - - - - - - - - 5.00 5.00 - - - tbl z28.s, { z29.s, z30.s }, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbl z31.b, { z31.b }, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbl z31.d, { z31.d }, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbl z31.h, { z31.h }, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbl 
z31.s, { z31.s }, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbx z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbx z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbx z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - tbx z31.s, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn1 p15.s, p15.s, p15.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - trn2 p15.s, p15.s, p15.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - trn2 z31.s, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uaba z0.b, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uaba z0.d, z1.d, z31.d +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uaba z0.h, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uaba z0.s, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabalb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabalb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabalb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabalt z0.d, z1.s, 
z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabalt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - uabalt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabd z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabd z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabd z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabd z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdlb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdlb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdlb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdlt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdlt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uabdlt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - uadalp z0.h, p0/m, z1.b +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - uadalp z29.s, p0/m, z30.h +# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - - uadalp z30.d, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddlt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uaddv d0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uaddv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uaddv d0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uaddv d0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddwb z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddwb z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - 
- - uaddwb z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddwt z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddwt z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uaddwt z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.d, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.h, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.h, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.s, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ucvtf z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - - - - - 23.00 udiv z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 12.00 udiv z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - - - 23.00 udivr z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - - - 12.00 udivr z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot z0.d, z1.h, z15.h[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot z0.d, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot z0.s, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - udot z0.s, z1.b, z7.b[3] +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsubr 
z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uhsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umax z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umaxp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - umaxv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - umaxv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - umaxv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - umaxv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin z31.b, p7/m, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin z31.d, p7/m, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin z31.h, p7/m, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - umin z31.s, p7/m, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uminp z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - 
- - uminp z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uminv b0, p7, z31.b +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uminv d0, p7, z31.d +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uminv h0, p7, z31.h +# CHECK-NEXT: - - - - - - - - 1.00 - - - - uminv s0, p7, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlalt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslb z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslb z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslb z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslt z0.d, z1.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslt z0.h, z1.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslt z0.s, z1.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umlslt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ummla z0.s, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z0.b, p7/m, z0.b, z31.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z0.b, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 
- - 0.50 0.50 - umulh z0.d, p7/m, z0.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z0.h, p7/m, z0.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z0.h, z1.h, z2.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z0.s, p7/m, z0.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z29.s, z30.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umulh z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullb z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullb z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullt z0.d, z1.s, z15.s[1] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullt z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullt z0.s, z1.h, z7.h[7] +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - umullt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.s, 
z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqadd z31.s, z31.s, #65280 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecb x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecd x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecd z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecd z0.d, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecd z0.d, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecd z0.d, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech w0 +# CHECK-NEXT: 1.00 - - - - - - - 
- - - - - uqdech w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdech x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdech z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdech z0.h, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdech z0.h, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdech z0.h, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp wzr, p15.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp wzr, p15.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp wzr, p15.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp wzr, p15.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp x0, p0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp x0, p0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp x0, p0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqdecp x0, p0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecp z0.d, p0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecp z0.h, p0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecp z0.s, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqdecw x0, vl1 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecw z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecw z0.s, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecw z0.s, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqdecw z0.s, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincb x0, vl1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincd x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincd z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincd z0.d, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincd z0.d, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincd z0.d, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch 
x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqinch x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqinch z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqinch z0.h, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqinch z0.h, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqinch z0.h, pow2, mul #16 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp wzr, p15.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp wzr, p15.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp wzr, p15.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp wzr, p15.s +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp x0, p0.b +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp x0, p0.d +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp x0, p0.h +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - uqincp x0, p0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincp z0.d, p0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincp z0.h, p0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincp z0.s, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw w0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw w0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw w0, pow2, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw x0, #14 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw x0, all, mul #16 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw x0, pow2 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uqincw x0, vl1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincw z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincw z0.s, all, mul #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincw z0.s, pow2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqincw z0.s, pow2, mul #16 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqrshrnt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z0.b, p0/m, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z0.d, p0/m, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z0.h, p0/m, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z0.s, p0/m, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl 
z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z31.b, p0/m, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z31.d, p0/m, z31.d, #63 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z31.h, p0/m, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshl z31.s, p0/m, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnb z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnb z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnb z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnb z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnb z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnb z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnt z0.b, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnt z0.h, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnt z0.s, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnt z31.b, z31.h, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnt z31.h, z31.s, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqshrnt z31.s, z31.d, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.b, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.d, z0.d, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.d, z0.d, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.d, 
z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.h, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.h, z0.h, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.s, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.s, z0.s, #0, lsl #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z31.b, z31.b, #255 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z31.d, z31.d, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z31.h, z31.h, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsub z31.s, z31.s, #65280 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsubr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsubr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsubr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqsubr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtnb z0.b, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtnb z0.h, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtnb z0.s, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtnt z0.b, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtnt z0.h, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uqxtnt z0.s, z31.d +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - urecpe z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urhadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: 
- - - - - - - - 0.50 0.50 - - - urhadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshl z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshlr z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshlr z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshlr z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshlr z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z0.b, p0/m, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z0.d, p0/m, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z0.h, p0/m, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z0.s, p0/m, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z31.b, p0/m, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z31.d, p0/m, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z31.h, p0/m, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - urshr z31.s, p0/m, z31.s, #32 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - ursqrte z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - - ursra z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllb z0.d, z0.s, #0 +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllb z0.h, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllb z0.s, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllb z31.d, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllb z31.h, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllb z31.s, z31.h, #15 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllt z0.d, z0.s, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllt z0.h, z0.b, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllt z0.s, z0.h, #0 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllt z31.d, z31.s, #31 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllt z31.h, z31.b, #7 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - ushllt z31.s, z31.h, #15 +# CHECK-NEXT: - - - - - - - - - - 0.50 0.50 - usmmla z0.s, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd z0.h, p0/m, z0.h, z1.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd z29.s, p7/m, z29.s, z30.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usqadd z31.d, p7/m, z31.d, z30.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z0.b, z0.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z0.d, z0.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z0.h, z0.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z0.s, z0.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z31.b, z31.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z31.d, z31.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z31.h, z31.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usra z31.s, z31.s, #32 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usublb z0.h, z1.b, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usublb z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usublb z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usublt z0.h, z1.b, z2.b +# 
CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usublt z29.s, z30.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usublt z31.d, z31.s, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubwb z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubwb z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubwb z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubwt z0.h, z1.h, z2.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubwt z29.s, z30.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - usubwt z31.d, z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uunpkhi z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uunpkhi z31.h, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uunpkhi z31.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uunpklo z31.d, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uunpklo z31.h, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uunpklo z31.s, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtb z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtb z0.h, p0/m, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtb z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtb z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtb z31.h, p7/m, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtb z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxth z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxth z0.s, p0/m, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxth z31.d, p7/m, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxth z31.s, p7/m, z31.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtw z0.d, p0/m, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uxtw z31.d, p7/m, z31.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1.00 - - - - - 
- - - - - - - uzp1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp1 p15.s, p15.s, p15.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - uzp2 p15.s, p15.s, p15.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - uzp2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.b, w0, wzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.b, wzr, w0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.b, x0, xzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.b, xzr, x0 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.d, w0, wzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.d, x0, xzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.h, w0, wzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.h, x0, xzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.s, w0, wzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilege p15.s, x0, xzr +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilerw p15.b, x30, x30 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilerw p15.d, x30, x30 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilerw p15.h, x30, x30 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilerw p15.s, x30, x30 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilewr p15.b, x30, x30 +# CHECK-NEXT: 1.00 - - - - - 
- - - - - - - whilewr p15.d, x30, x30 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilewr p15.h, x30, x30 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - whilewr p15.s, x30, x30 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - wrffr p0.b +# CHECK-NEXT: - - - - - 1.00 - - - - - - - wrffr p15.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z0.b, z0.b, z1.b, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z0.d, z0.d, z1.d, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z0.h, z0.h, z1.h, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z0.s, z0.s, z1.s, #1 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z31.b, z31.b, z30.b, #8 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z31.d, z31.d, z30.d, #64 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z31.h, z31.h, z30.h, #16 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - xar z31.s, z31.s, z30.s, #32 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p0.b, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p0.d, p0.d, p0.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p0.h, p0.h, p0.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p0.s, p0.s, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip1 p15.s, p15.s, p15.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1.00 
- - - - - - - - - - - - zip2 p0.b, p0.b, p0.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p0.d, p0.d, p0.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p0.h, p0.h, p0.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p0.s, p0.s, p0.s +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1.00 - - - - - - - - - - - - zip2 p15.s, p15.s, p15.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z0.b, z0.b, z0.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z0.d, z0.d, z0.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z0.h, z0.h, z0.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z0.s, z0.s, z0.s +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z31.b, z31.b, z31.b +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z31.d, z31.d, z31.d +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z31.h, z31.h, z31.h +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - zip2 z31.s, z31.s, z31.s diff --git a/llvm/test/tools/llvm-mca/AArch64/Inputs/crypto-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Inputs/crypto-instructions.s new file mode 100644 index 0000000000000..ced20b87241d7 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Inputs/crypto-instructions.s @@ -0,0 +1,34 @@ +aesd z0.b, z0.b, z31.b +aese z0.b, z0.b, z31.b +aesimc z0.b, z0.b +aesmc z0.b, z0.b + +sha1h s0, s1 +sha1su1 v0.4s, v1.4s +sha256su0 v0.4s, v1.4s +sha1c q0, s1, v2.4s +sha1p q0, s1, v2.4s +sha1m q0, s1, v2.4s +sha1su0 v0.4s, v1.4s, v2.4s +sha256h q0, q1, v2.4s +sha256h2 q0, q1, v2.4s +sha256su1 v0.4s, v1.4s, v2.4s + +// armv8.2a +sha512h q0, q1, v2.2d +sha512h2 q0, q1, v2.2d +sha512su0 v11.2d, v12.2d +sha512su1 v11.2d, v13.2d, v14.2d +eor3 v25.16b, v12.16b, v7.16b, v2.16b +rax1 v30.2d, v29.2d, v26.2d +xar v26.2d, v21.2d, v27.2d, #63 +bcax v31.16b, v26.16b, v2.16b, v1.16b +sm3ss1 v20.4s, v23.4s, v21.4s, v22.4s 
+sm3tt1a v20.4s, v23.4s, v21.s[3] +sm3tt1b v20.4s, v23.4s, v21.s[3] +sm3tt2a v20.4s, v23.4s, v21.s[3] +sm3tt2b v20.4s, v23.4s, v21.s[3] +sm3partw1 v30.4s, v29.4s, v26.4s +sm3partw2 v30.4s, v29.4s, v26.4s +sm4ekey v11.4s, v11.4s, v19.4s +sm4e v2.4s, v15.4s diff --git a/llvm/test/tools/llvm-mca/AArch64/Inputs/js-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Inputs/js-instructions.s new file mode 100644 index 0000000000000..2349b3d20911e --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Inputs/js-instructions.s @@ -0,0 +1 @@ +fjcvtzs w25, d26 diff --git a/llvm/test/tools/llvm-mca/AArch64/Inputs/mops-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Inputs/mops-instructions.s new file mode 100644 index 0000000000000..89f271f55ab7d --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Inputs/mops-instructions.s @@ -0,0 +1,138 @@ +cpyfp [x0]!, [x1]!, x2! +cpyfpwn [x0]!, [x1]!, x2! +cpyfprn [x0]!, [x1]!, x2! +cpyfpn [x0]!, [x1]!, x2! +cpyfpwt [x0]!, [x1]!, x2! +cpyfpwtwn [x0]!, [x1]!, x2! +cpyfpwtrn [x0]!, [x1]!, x2! +cpyfpwtn [x0]!, [x1]!, x2! +cpyfprt [x0]!, [x1]!, x2! +cpyfprtwn [x0]!, [x1]!, x2! +cpyfprtrn [x0]!, [x1]!, x2! +cpyfprtn [x0]!, [x1]!, x2! +cpyfpt [x0]!, [x1]!, x2! +cpyfptwn [x0]!, [x1]!, x2! +cpyfptrn [x0]!, [x1]!, x2! +cpyfptn [x0]!, [x1]!, x2! +cpyfm [x0]!, [x1]!, x2! +cpyfmwn [x0]!, [x1]!, x2! +cpyfmrn [x0]!, [x1]!, x2! +cpyfmn [x0]!, [x1]!, x2! +cpyfmwt [x0]!, [x1]!, x2! +cpyfmwtwn [x0]!, [x1]!, x2! +cpyfmwtrn [x0]!, [x1]!, x2! +cpyfmwtn [x0]!, [x1]!, x2! +cpyfmrt [x0]!, [x1]!, x2! +cpyfmrtwn [x0]!, [x1]!, x2! +cpyfmrtrn [x0]!, [x1]!, x2! +cpyfmrtn [x0]!, [x1]!, x2! +cpyfmt [x0]!, [x1]!, x2! +cpyfmtwn [x0]!, [x1]!, x2! +cpyfmtrn [x0]!, [x1]!, x2! +cpyfmtn [x0]!, [x1]!, x2! +cpyfe [x0]!, [x1]!, x2! +cpyfewn [x0]!, [x1]!, x2! +cpyfern [x0]!, [x1]!, x2! +cpyfen [x0]!, [x1]!, x2! +cpyfewt [x0]!, [x1]!, x2! +cpyfewtwn [x0]!, [x1]!, x2! +cpyfewtrn [x0]!, [x1]!, x2! +cpyfewtn [x0]!, [x1]!, x2! +cpyfert [x0]!, [x1]!, x2! +cpyfertwn [x0]!, [x1]!, x2! 
+cpyfertrn [x0]!, [x1]!, x2! +cpyfertn [x0]!, [x1]!, x2! +cpyfet [x0]!, [x1]!, x2! +cpyfetwn [x0]!, [x1]!, x2! +cpyfetrn [x0]!, [x1]!, x2! +cpyfetn [x0]!, [x1]!, x2! +cpyp [x0]!, [x1]!, x2! +cpypwn [x0]!, [x1]!, x2! +cpyprn [x0]!, [x1]!, x2! +cpypn [x0]!, [x1]!, x2! +cpypwt [x0]!, [x1]!, x2! +cpypwtwn [x0]!, [x1]!, x2! +cpypwtrn [x0]!, [x1]!, x2! +cpypwtn [x0]!, [x1]!, x2! +cpyprt [x0]!, [x1]!, x2! +cpyprtwn [x0]!, [x1]!, x2! +cpyprtrn [x0]!, [x1]!, x2! +cpyprtn [x0]!, [x1]!, x2! +cpypt [x0]!, [x1]!, x2! +cpyptwn [x0]!, [x1]!, x2! +cpyptrn [x0]!, [x1]!, x2! +cpyptn [x0]!, [x1]!, x2! +cpym [x0]!, [x1]!, x2! +cpymwn [x0]!, [x1]!, x2! +cpymrn [x0]!, [x1]!, x2! +cpymn [x0]!, [x1]!, x2! +cpymwt [x0]!, [x1]!, x2! +cpymwtwn [x0]!, [x1]!, x2! +cpymwtrn [x0]!, [x1]!, x2! +cpymwtn [x0]!, [x1]!, x2! +cpymrt [x0]!, [x1]!, x2! +cpymrtwn [x0]!, [x1]!, x2! +cpymrtrn [x0]!, [x1]!, x2! +cpymrtn [x0]!, [x1]!, x2! +cpymt [x0]!, [x1]!, x2! +cpymtwn [x0]!, [x1]!, x2! +cpymtrn [x0]!, [x1]!, x2! +cpymtn [x0]!, [x1]!, x2! +cpye [x0]!, [x1]!, x2! +cpyewn [x0]!, [x1]!, x2! +cpyern [x0]!, [x1]!, x2! +cpyen [x0]!, [x1]!, x2! +cpyewt [x0]!, [x1]!, x2! +cpyewtwn [x0]!, [x1]!, x2! +cpyewtrn [x0]!, [x1]!, x2! +cpyewtn [x0]!, [x1]!, x2! +cpyert [x0]!, [x1]!, x2! +cpyertwn [x0]!, [x1]!, x2! +cpyertrn [x0]!, [x1]!, x2! +cpyertn [x0]!, [x1]!, x2! +cpyet [x0]!, [x1]!, x2! +cpyetwn [x0]!, [x1]!, x2! +cpyetrn [x0]!, [x1]!, x2! +cpyetn [x0]!, [x1]!, x2! 
+setp [x0]!, x1!, x2 +setpt [x0]!, x1!, x2 +setpn [x0]!, x1!, x2 +setptn [x0]!, x1!, x2 +setm [x0]!, x1!, x2 +setmt [x0]!, x1!, x2 +setmn [x0]!, x1!, x2 +setmtn [x0]!, x1!, x2 +sete [x0]!, x1!, x2 +setet [x0]!, x1!, x2 +seten [x0]!, x1!, x2 +setetn [x0]!, x1!, x2 +setgp [x0]!, x1!, x2 +setgpt [x0]!, x1!, x2 +setgpn [x0]!, x1!, x2 +setgptn [x0]!, x1!, x2 +setgm [x0]!, x1!, x2 +setgmt [x0]!, x1!, x2 +setgmn [x0]!, x1!, x2 +setgmtn [x0]!, x1!, x2 +setge [x0]!, x1!, x2 +setget [x0]!, x1!, x2 +setgen [x0]!, x1!, x2 +setgetn [x0]!, x1!, x2 +cpyfp [x0]!, [x1]!, xzr! +cpyfm [x0]!, [x1]!, xzr! +cpyfe [x0]!, [x1]!, xzr! +cpyp [x0]!, [x1]!, xzr! +cpym [x0]!, [x1]!, xzr! +cpye [x0]!, [x1]!, xzr! +setp [x0]!, xzr!, x2 +setp [x0]!, x1!, xzr +setm [x0]!, xzr!, x2 +setm [x0]!, x1!, xzr +sete [x0]!, xzr!, x2 +sete [x0]!, x1!, xzr +setgp [x0]!, xzr!, x2 +setgp [x0]!, x1!, xzr +setgm [x0]!, xzr!, x2 +setgm [x0]!, x1!, xzr +setge [x0]!, xzr!, x2 +setge [x0]!, x1!, xzr diff --git a/llvm/test/tools/llvm-mca/AArch64/Inputs/ptraut-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Inputs/ptraut-instructions.s new file mode 100644 index 0000000000000..d1a8ee01c6aa6 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Inputs/ptraut-instructions.s @@ -0,0 +1,56 @@ +pacia1716 +pacib1716 +autia1716 +autib1716 +paciaz +paciasp +pacibz +pacibsp +autiaz +autiasp +autibz +autibsp +pacia x0, x1 +autia x0, x1 +pacda x0, x1 +autda x0, x1 +pacib x0, x1 +autib x0, x1 +pacdb x0, x1 +autdb x0, x1 +pacga x0, x1, x2 +paciza x0 +autiza x0 +pacdza x0 +autdza x0 +pacizb x0 +autizb x0 +pacdzb x0 +autdzb x0 +xpaci x0 +xpacd x0 +xpaclri +braa x0, x1 +brab x0, x1 +blraa x0, x1 +blrab x0, x1 +braaz x0 +brabz x0 +blraaz x0 +blrabz x0 +retaa +retab +ldraa x0, [x1, 4088] +ldraa x0, [x1, -4096] +ldrab x0, [x1, 4088] +ldrab x0, [x1, -4096] +ldraa x0, [x1, 4088]! +ldraa x0, [x1, -4096]! +ldrab x0, [x1, 4088]! +ldrab x0, [x1, -4096]! +ldraa x0, [x1] +ldrab x0, [x1] +ldraa x0, [x1]! +ldrab x0, [x1]! 
+ldraa xzr, [sp, -4096]! +ldrab xzr, [sp, -4096]! \ No newline at end of file diff --git a/llvm/test/tools/llvm-mca/AArch64/Inputs/sve-while-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Inputs/sve-while-instructions.s new file mode 100644 index 0000000000000..714cf82bc00df --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Inputs/sve-while-instructions.s @@ -0,0 +1,6 @@ +whilege p15.b, xzr, x0 +whilegt p15.b, xzr, x0 +whilehi p15.b, xzr, x0 +whilehs p15.b, xzr, x0 +whilerw p15.b, x30, x30 +whilewr p15.b, x30, x30 diff --git a/mops-instructions.s b/mops-instructions.s new file mode 100644 index 0000000000000..89f271f55ab7d --- /dev/null +++ b/mops-instructions.s @@ -0,0 +1,138 @@ +cpyfp [x0]!, [x1]!, x2! +cpyfpwn [x0]!, [x1]!, x2! +cpyfprn [x0]!, [x1]!, x2! +cpyfpn [x0]!, [x1]!, x2! +cpyfpwt [x0]!, [x1]!, x2! +cpyfpwtwn [x0]!, [x1]!, x2! +cpyfpwtrn [x0]!, [x1]!, x2! +cpyfpwtn [x0]!, [x1]!, x2! +cpyfprt [x0]!, [x1]!, x2! +cpyfprtwn [x0]!, [x1]!, x2! +cpyfprtrn [x0]!, [x1]!, x2! +cpyfprtn [x0]!, [x1]!, x2! +cpyfpt [x0]!, [x1]!, x2! +cpyfptwn [x0]!, [x1]!, x2! +cpyfptrn [x0]!, [x1]!, x2! +cpyfptn [x0]!, [x1]!, x2! +cpyfm [x0]!, [x1]!, x2! +cpyfmwn [x0]!, [x1]!, x2! +cpyfmrn [x0]!, [x1]!, x2! +cpyfmn [x0]!, [x1]!, x2! +cpyfmwt [x0]!, [x1]!, x2! +cpyfmwtwn [x0]!, [x1]!, x2! +cpyfmwtrn [x0]!, [x1]!, x2! +cpyfmwtn [x0]!, [x1]!, x2! +cpyfmrt [x0]!, [x1]!, x2! +cpyfmrtwn [x0]!, [x1]!, x2! +cpyfmrtrn [x0]!, [x1]!, x2! +cpyfmrtn [x0]!, [x1]!, x2! +cpyfmt [x0]!, [x1]!, x2! +cpyfmtwn [x0]!, [x1]!, x2! +cpyfmtrn [x0]!, [x1]!, x2! +cpyfmtn [x0]!, [x1]!, x2! +cpyfe [x0]!, [x1]!, x2! +cpyfewn [x0]!, [x1]!, x2! +cpyfern [x0]!, [x1]!, x2! +cpyfen [x0]!, [x1]!, x2! +cpyfewt [x0]!, [x1]!, x2! +cpyfewtwn [x0]!, [x1]!, x2! +cpyfewtrn [x0]!, [x1]!, x2! +cpyfewtn [x0]!, [x1]!, x2! +cpyfert [x0]!, [x1]!, x2! +cpyfertwn [x0]!, [x1]!, x2! +cpyfertrn [x0]!, [x1]!, x2! +cpyfertn [x0]!, [x1]!, x2! +cpyfet [x0]!, [x1]!, x2! +cpyfetwn [x0]!, [x1]!, x2! +cpyfetrn [x0]!, [x1]!, x2! 
+cpyfetn [x0]!, [x1]!, x2! +cpyp [x0]!, [x1]!, x2! +cpypwn [x0]!, [x1]!, x2! +cpyprn [x0]!, [x1]!, x2! +cpypn [x0]!, [x1]!, x2! +cpypwt [x0]!, [x1]!, x2! +cpypwtwn [x0]!, [x1]!, x2! +cpypwtrn [x0]!, [x1]!, x2! +cpypwtn [x0]!, [x1]!, x2! +cpyprt [x0]!, [x1]!, x2! +cpyprtwn [x0]!, [x1]!, x2! +cpyprtrn [x0]!, [x1]!, x2! +cpyprtn [x0]!, [x1]!, x2! +cpypt [x0]!, [x1]!, x2! +cpyptwn [x0]!, [x1]!, x2! +cpyptrn [x0]!, [x1]!, x2! +cpyptn [x0]!, [x1]!, x2! +cpym [x0]!, [x1]!, x2! +cpymwn [x0]!, [x1]!, x2! +cpymrn [x0]!, [x1]!, x2! +cpymn [x0]!, [x1]!, x2! +cpymwt [x0]!, [x1]!, x2! +cpymwtwn [x0]!, [x1]!, x2! +cpymwtrn [x0]!, [x1]!, x2! +cpymwtn [x0]!, [x1]!, x2! +cpymrt [x0]!, [x1]!, x2! +cpymrtwn [x0]!, [x1]!, x2! +cpymrtrn [x0]!, [x1]!, x2! +cpymrtn [x0]!, [x1]!, x2! +cpymt [x0]!, [x1]!, x2! +cpymtwn [x0]!, [x1]!, x2! +cpymtrn [x0]!, [x1]!, x2! +cpymtn [x0]!, [x1]!, x2! +cpye [x0]!, [x1]!, x2! +cpyewn [x0]!, [x1]!, x2! +cpyern [x0]!, [x1]!, x2! +cpyen [x0]!, [x1]!, x2! +cpyewt [x0]!, [x1]!, x2! +cpyewtwn [x0]!, [x1]!, x2! +cpyewtrn [x0]!, [x1]!, x2! +cpyewtn [x0]!, [x1]!, x2! +cpyert [x0]!, [x1]!, x2! +cpyertwn [x0]!, [x1]!, x2! +cpyertrn [x0]!, [x1]!, x2! +cpyertn [x0]!, [x1]!, x2! +cpyet [x0]!, [x1]!, x2! +cpyetwn [x0]!, [x1]!, x2! +cpyetrn [x0]!, [x1]!, x2! +cpyetn [x0]!, [x1]!, x2! +setp [x0]!, x1!, x2 +setpt [x0]!, x1!, x2 +setpn [x0]!, x1!, x2 +setptn [x0]!, x1!, x2 +setm [x0]!, x1!, x2 +setmt [x0]!, x1!, x2 +setmn [x0]!, x1!, x2 +setmtn [x0]!, x1!, x2 +sete [x0]!, x1!, x2 +setet [x0]!, x1!, x2 +seten [x0]!, x1!, x2 +setetn [x0]!, x1!, x2 +setgp [x0]!, x1!, x2 +setgpt [x0]!, x1!, x2 +setgpn [x0]!, x1!, x2 +setgptn [x0]!, x1!, x2 +setgm [x0]!, x1!, x2 +setgmt [x0]!, x1!, x2 +setgmn [x0]!, x1!, x2 +setgmtn [x0]!, x1!, x2 +setge [x0]!, x1!, x2 +setget [x0]!, x1!, x2 +setgen [x0]!, x1!, x2 +setgetn [x0]!, x1!, x2 +cpyfp [x0]!, [x1]!, xzr! +cpyfm [x0]!, [x1]!, xzr! +cpyfe [x0]!, [x1]!, xzr! +cpyp [x0]!, [x1]!, xzr! +cpym [x0]!, [x1]!, xzr! +cpye [x0]!, [x1]!, xzr! 
+setp [x0]!, xzr!, x2 +setp [x0]!, x1!, xzr +setm [x0]!, xzr!, x2 +setm [x0]!, x1!, xzr +sete [x0]!, xzr!, x2 +sete [x0]!, x1!, xzr +setgp [x0]!, xzr!, x2 +setgp [x0]!, x1!, xzr +setgm [x0]!, xzr!, x2 +setgm [x0]!, x1!, xzr +setge [x0]!, xzr!, x2 +setge [x0]!, x1!, xzr From de696eeb7051c0d9e4729a6f4a84fc99bb38e904 Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Tue, 12 May 2026 05:59:00 -0700 Subject: [PATCH 434/538] [clang-tidy] Run analysis even with no checks (as long as `--allow-no-checks` is passed) (#194006) Fixes #192713. Currently, clang-tidy exits immediately if the only enabled checks are `clang-diagnostic-*` ones. This prevents the reasonable use case where a user isn't interested in any "native" clang-tidy checks and just wants to use clang-tidy as a frontend for builtin clang warnings. --- clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp | 6 +----- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../test/clang-tidy/infrastructure/allow-no-checks.cpp | 4 +--- .../infrastructure/clang-diagnostic-checks-only.cpp | 6 ++++++ .../infrastructure/custom-query-check-not-enable.cpp | 5 ++--- .../test/clang-tidy/infrastructure/custom-query-check.cpp | 5 ++--- .../header-filter-from-config-file/inheritance/foo.cpp | 2 +- .../header-filter-from-config-file/simple/foo.cpp | 2 +- .../test/clang-tidy/infrastructure/nolintbeginend-LIFO.cpp | 2 +- .../infrastructure/nolintbeginend-begin-all-end-glob.cpp | 2 +- .../nolintbeginend-begin-all-end-specific.cpp | 2 +- .../infrastructure/nolintbeginend-begin-at-eof.cpp | 2 +- .../infrastructure/nolintbeginend-begin-glob-end-all.cpp | 2 +- .../nolintbeginend-begin-glob-end-specific.cpp | 2 +- .../nolintbeginend-begin-multiple-end-single.cpp | 2 +- .../nolintbeginend-begin-single-end-multiple.cpp | 2 +- .../nolintbeginend-begin-specific-end-all.cpp | 2 +- .../nolintbeginend-begin-specific-end-glob.cpp | 2 +- .../infrastructure/nolintbeginend-begin-without-end.cpp | 2 +- 
.../clang-tidy/infrastructure/nolintbeginend-end-at-sof.cpp | 2 +- .../infrastructure/nolintbeginend-end-without-begin.cpp | 2 +- .../nolintbeginend-mismatched-check-names.cpp | 2 +- .../infrastructure/nolintbeginend-mismatched-delims.cpp | 2 +- .../infrastructure/nolintbeginend-typo-in-check-name.cpp | 2 +- 24 files changed, 34 insertions(+), 32 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/clang-diagnostic-checks-only.cpp diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index f61e2f40ed03b..949a88f0fd50d 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -718,11 +718,7 @@ int clangTidyMain(int argc, const char **argv) { return 0; } - if (EnabledChecks.empty()) { - if (AllowNoChecks) { - llvm::outs() << "No checks enabled.\n"; - return 0; - } + if (EnabledChecks.empty() && !AllowNoChecks) { llvm::errs() << "Error: no checks enabled.\n"; llvm::cl::PrintHelpMessage(/*Hidden=*/false, /*Categorized=*/true); return 1; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 34d9d90c70a47..75ad9050787a5 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -186,6 +186,10 @@ Improvements to clang-tidy compiler. (E.g. tidy suppressed many ``clang-diagnostic-invalid-offsetof`` reports because they usually occur in expansion of the macro ``offsetof``.) +- :program:`clang-tidy` will no longer exit immediately if the only enabled + checks are `clang-diagnostic-*` ones. This allows using + :program:`clang-tidy` purely as a frontend to Clang's builtin warnings. 
+ New checks ^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp index a1f059b92384d..677ceee75b672 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/allow-no-checks.cpp @@ -1,4 +1,2 @@ // RUN: not clang-tidy %s -checks='-*' -// RUN: clang-tidy %s -checks='-*' --allow-no-checks | FileCheck --match-full-lines %s - -// CHECK: No checks enabled. +// RUN: clang-tidy %s -checks='-*' --allow-no-checks -- | count 0 diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/clang-diagnostic-checks-only.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/clang-diagnostic-checks-only.cpp new file mode 100644 index 0000000000000..d3f2a11db45bd --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/clang-diagnostic-checks-only.cpp @@ -0,0 +1,6 @@ +// RUN: clang-tidy %s -checks='-*,clang-diagnostic-literal-conversion' --allow-no-checks -- -Wliteral-conversion | FileCheck %s + +void f() { + int i = 1.5; + // CHECK: :[[@LINE-1]]:11: warning: implicit conversion from 'double' to 'int' changes value from 1.5 to 1 [clang-diagnostic-literal-conversion] +} diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check-not-enable.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check-not-enable.cpp index 3f284fc68f9f3..564cec561f27c 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check-not-enable.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check-not-enable.cpp @@ -1,12 +1,11 @@ // sed command does not work as-is on Windows. 
// UNSUPPORTED: system-windows // RUN: sed -e "s:INPUT_DIR:%S/Inputs/custom-query-check:g" -e "s:OUT_DIR:%t:g" -e "s:MAIN_FILE:%s:g" %S/Inputs/custom-query-check/vfsoverlay.yaml > %t.yaml -// RUN: clang-tidy --allow-no-checks %t/main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml | FileCheck %s --check-prefix=CHECK -// RUN: clang-tidy --allow-no-checks %t/main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml --list-checks | FileCheck %s --check-prefix=LIST-CHECK +// RUN: clang-tidy --allow-no-checks %t/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml -- | count 0 +// RUN: clang-tidy --allow-no-checks %t/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml --list-checks | FileCheck %s --check-prefix=LIST-CHECK long V; -// CHECK: No checks enabled. void f(); // CHECK-SUB-DIR-APPEND: [[@LINE-1]]:1: warning: find function decl [custom-function-decl] diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check.cpp index ad225e4d14f41..10cea7458110e 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/custom-query-check.cpp @@ -10,7 +10,7 @@ // RUN: clang-tidy --experimental-custom-checks %t/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml | FileCheck %s --check-prefix=CHECK-SAME-DIR // RUN: clang-tidy --experimental-custom-checks %t/subdir/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml | FileCheck %s --check-prefix=CHECK-SUB-DIR-BASE // RUN: clang-tidy --experimental-custom-checks %t/subdir-override/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml | FileCheck %s --check-prefix=CHECK-SUB-DIR-OVERRIDE -// RUN: clang-tidy --experimental-custom-checks %t/subdir-empty/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml --allow-no-checks | FileCheck %s --check-prefix=CHECK-SUB-DIR-EMPTY +// RUN: clang-tidy --experimental-custom-checks %t/subdir-empty/cqc-main.cpp 
-checks='-*,custom-*' -vfsoverlay %t.yaml --allow-no-checks -- | count 0 // RUN: clang-tidy --experimental-custom-checks %t/subdir-append/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml | FileCheck %s --check-prefix=CHECK-SUB-DIR-APPEND // RUN: clang-tidy --experimental-custom-checks %t/subdir-append/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml --list-checks | FileCheck %s --check-prefix=LIST-CHECK // RUN: clang-tidy --experimental-custom-checks %t/subdir-append/cqc-main.cpp -checks='-*,custom-*' -vfsoverlay %t.yaml --dump-config | FileCheck %s --check-prefix=DUMP-CONFIG @@ -20,8 +20,7 @@ long V; // CHECK-SAME-DIR: [[@LINE-1]]:1: warning: use 'int' instead of 'long' [custom-avoid-long-type] // CHECK-SUB-DIR-BASE: [[@LINE-2]]:1: warning: use 'int' instead of 'long' [custom-avoid-long-type] // CHECK-SUB-DIR-OVERRIDE: [[@LINE-3]]:1: warning: use 'int' instead of 'long' override [custom-avoid-long-type] -// CHECK-SUB-DIR-EMPTY: No checks enabled. -// CHECK-SUB-DIR-APPEND: [[@LINE-5]]:1: warning: use 'int' instead of 'long' [custom-avoid-long-type] +// CHECK-SUB-DIR-APPEND: [[@LINE-4]]:1: warning: use 'int' instead of 'long' [custom-avoid-long-type] void f(); // CHECK-SUB-DIR-APPEND: [[@LINE-1]]:1: warning: find function decl [custom-function-decl] diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp index 5828c2cafaf7d..79622418d6dff 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp @@ -1,3 +1,3 @@ -// RUN: clang-tidy -checks=-*,google-explicit-constructor %s 2>&1 | FileCheck %s +// RUN: clang-tidy -checks=-*,google-explicit-constructor %s -- 2>&1 | FileCheck %s #include "foo.h" // CHECK: foo.h:1:12: warning: single-argument 
constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp index 5828c2cafaf7d..79622418d6dff 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp @@ -1,3 +1,3 @@ -// RUN: clang-tidy -checks=-*,google-explicit-constructor %s 2>&1 | FileCheck %s +// RUN: clang-tidy -checks=-*,google-explicit-constructor %s -- 2>&1 | FileCheck %s #include "foo.h" // CHECK: foo.h:1:12: warning: single-argument constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-LIFO.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-LIFO.cpp index e86b3df34fcfc..95e2f698e867e 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-LIFO.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-LIFO.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor) // NOLINTBEGIN(modernize-avoid-c-style-cast) diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-glob.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-glob.cpp index 90b9fa9883024..4e17e809690fc 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-glob.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-glob.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s 
+// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // NOLINTBEGIN class B { B(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-specific.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-specific.cpp index 6ffa914e4ef0b..07e430f4b5dab 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-specific.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-all-end-specific.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // NOLINTBEGIN class A { A(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-at-eof.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-at-eof.cpp index 0d3dcf381eaba..860a4f9dd0ab7 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-at-eof.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-at-eof.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // CHECK: :[[@LINE+8]]:11: warning: single-argument constructors must be marked explicit // Note: the expected output has been split over several lines so that clang-tidy diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-all.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-all.cpp index 3697d5c11e2e2..b2722071135ed 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-all.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-all.cpp @@ -1,4 +1,4 @@ -// 
RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" -- 2>&1 | FileCheck %s // NOLINTBEGIN(*) class B { B(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-specific.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-specific.cpp index 5bdb117f20242..dc282aa01424f 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-specific.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-glob-end-specific.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" -- 2>&1 | FileCheck %s // NOLINTBEGIN(*) class B { B(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-multiple-end-single.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-multiple-end-single.cpp index 156a5cb345dbd..640c25d245e9b 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-multiple-end-single.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-multiple-end-single.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor,modernize-avoid-c-style-cast) class B { B(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-single-end-multiple.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-single-end-multiple.cpp index 837213227dc2d..7de57dda9ea1d 100644 --- 
a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-single-end-multiple.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-single-end-multiple.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor) // NOLINTBEGIN(modernize-avoid-c-style-cast) diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-all.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-all.cpp index decfe2dd5a4c1..34c431013f0fc 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-all.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-all.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor) class A { A(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-glob.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-glob.cpp index a9f904ccce138..8e9c9c6df0a79 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-glob.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-specific-end-glob.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks="-*,google-explicit-constructor" -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor) class A { A(int i); }; diff --git 
a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-without-end.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-without-end.cpp index 2cb84ae59775d..1d3a0646cf3e2 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-without-end.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-begin-without-end.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // NOLINTBEGIN class A { A(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-at-sof.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-at-sof.cpp index 72b8ac9256866..72d1596cac961 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-at-sof.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-at-sof.cpp @@ -9,4 +9,4 @@ class A { A(int i); }; // CHECK: TBEGIN' comment [clang-tidy-nolint] // CHECK: :[[@LINE-8]]:11: warning: single-argument constructors must be marked explicit -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-without-begin.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-without-begin.cpp index cea16610823dd..653926f8c2c1c 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-without-begin.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-end-without-begin.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // NOLINTEND 
class A { A(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-check-names.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-check-names.cpp index ddd399dfc764f..e54f34646f02c 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-check-names.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-check-names.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor,modernize-avoid-c-style-cast' -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor) class A { A(int i); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-delims.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-delims.cpp index 4b8947e369f92..b913ef03ebb28 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-delims.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-mismatched-delims.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // NOLINTBEGIN // NOLINTBEGIN diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-typo-in-check-name.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-typo-in-check-name.cpp index 57e1ff331c8ba..13580cfbea8e5 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-typo-in-check-name.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nolintbeginend-typo-in-check-name.cpp @@ -1,4 +1,4 @@ -// RUN: not clang-tidy %s --checks='-*,google-explicit-constructor' 2>&1 | FileCheck %s +// RUN: not clang-tidy %s 
--checks='-*,google-explicit-constructor' -- 2>&1 | FileCheck %s // NOLINTBEGIN(google-explicit-constructor) class A { A(int i); }; From a37c5f07c47c7bb45d2ad0562c8dd08d772a8315 Mon Sep 17 00:00:00 2001 From: Mahesh-Attarde Date: Tue, 12 May 2026 18:31:23 +0530 Subject: [PATCH 435/538] [X86][Codegen] Fix frame-destroy for win-eh case (#196904) From Discussion on https://github.com/llvm/llvm-project/pull/177248/changes#r3217065230 Fixing test which had missing `frame-destroy` --- llvm/lib/Target/X86/X86FrameLowering.cpp | 3 ++- llvm/test/CodeGen/X86/cfi-xmm.ll | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 79ed59fcc1d34..55863d3c3f1b9 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -3166,7 +3166,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register(), 0, + MachineInstr::FrameDestroy); } // Clear the stack slot for spill base pointer register. 
diff --git a/llvm/test/CodeGen/X86/cfi-xmm.ll b/llvm/test/CodeGen/X86/cfi-xmm.ll index c4616b6273883..dde731c5cbdfb 100644 --- a/llvm/test/CodeGen/X86/cfi-xmm.ll +++ b/llvm/test/CodeGen/X86/cfi-xmm.ll @@ -13,8 +13,8 @@ define void @_Z1fv() { ; PEI-NEXT: CFI_INSTRUCTION offset $xmm10, -48 ; PEI-NEXT: CFI_INSTRUCTION offset $xmm15, -32 ; PEI-NEXT: INLINEASM &"", sideeffect attdialect, clobber, implicit-def dead early-clobber $xmm10, clobber, implicit-def dead early-clobber $xmm15, clobber, implicit-def dead early-clobber $df, clobber, implicit-def early-clobber $fpsw, clobber, implicit-def dead early-clobber $eflags - ; PEI-NEXT: $xmm10 = MOVAPSrm $rsp, 1, $noreg, 0, $noreg :: (load (s128) from %fixed-stack.0) - ; PEI-NEXT: $xmm15 = MOVAPSrm $rsp, 1, $noreg, 16, $noreg :: (load (s128) from %fixed-stack.1) + ; PEI-NEXT: $xmm10 = frame-destroy MOVAPSrm $rsp, 1, $noreg, 0, $noreg :: (load (s128) from %fixed-stack.0) + ; PEI-NEXT: $xmm15 = frame-destroy MOVAPSrm $rsp, 1, $noreg, 16, $noreg :: (load (s128) from %fixed-stack.1) ; PEI-NEXT: $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags ; PEI-NEXT: RET 0 entry: From 96f60adb34b0a6caf727d395835641fd5147cb48 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Tue, 12 May 2026 14:02:19 +0100 Subject: [PATCH 436/538] [AArch64] Clean-up constrain register check in selectCopy (NFC) (#197168) This addresses post-commit review from #188781. 
--- .../Target/AArch64/GISel/AArch64InstructionSelector.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 5d2b4bc7ac6d3..cf650fd5c4e72 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1056,11 +1056,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, unsigned SrcSubReg = I.getOperand(1).getSubReg(); unsigned SubReg; - if (SrcSubReg) { - if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) - return false; - return true; - } + if (SrcSubReg) + return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); // If the source bank doesn't support a subregister copy small enough, // then we first need to copy to the destination bank. From ccb01f878971239ef141a53cbb66a8b411293580 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 12 May 2026 09:02:38 -0400 Subject: [PATCH 437/538] [libc] Remove some global printf_core declarations in float_dec_converter.h (#196860) fixed_converter.h and float_hex_converter.h have local declarations with the same name shadowing these, causing -Wshadow warnings. For now, just don't have global declarations for these. 
--- .../stdio/printf_core/float_dec_converter.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index 509b4e24ada32..8b99688cb993c 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -29,11 +29,6 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { -#ifdef LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE -using StorageType = UInt128; -#else -using StorageType = fputil::FPBits::StorageType; -#endif // LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE using DecimalString = IntegerToString; using ExponentString = IntegerToString::WithSign>; @@ -51,7 +46,6 @@ constexpr uint32_t MAX_BLOCK = 999999999; // constexpr size_t BLOCK_SIZE = 18; // constexpr uint32_t MAX_BLOCK = 999999999999999999; -constexpr char DECIMAL_POINT = '.'; LIBC_INLINE RoundDirection get_round_direction(int last_digit, bool truncated, Sign sign) { @@ -176,6 +170,7 @@ template class FloatWriter { LIBC_INLINE int flush_buffer(bool round_up_max_blocks = false) { const char MAX_BLOCK_DIGIT = (round_up_max_blocks ? '0' : '9'); + constexpr char DECIMAL_POINT = '.'; // Write the most recent buffered block, and mark has_written if (!has_written) { @@ -368,6 +363,8 @@ template class FloatWriter { // has_carry should only be true here if every previous digit is 9, which // implies that the number has never been written. if (has_carry /* && !has_written */) { + constexpr char DECIMAL_POINT = '.'; + if (has_exp) { // This is in %e style // Since this is exponential notation, we don't write any more digits // but we do increment the exponent. 
@@ -604,6 +601,12 @@ template *writer, const FormatSection &to_conv, fputil::FPBits float_bits) { +#ifdef LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE + using StorageType = UInt128; +#else + using StorageType = fputil::FPBits::StorageType; +#endif // LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE + // signed because later we use -FRACTION_LEN constexpr int32_t FRACTION_LEN = fputil::FPBits::FRACTION_LEN; int exponent = float_bits.get_explicit_exponent(); @@ -766,6 +769,12 @@ template *writer, const FormatSection &to_conv, fputil::FPBits float_bits) { +#ifdef LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE + using StorageType = UInt128; +#else + using StorageType = fputil::FPBits::StorageType; +#endif // LIBC_TYPES_LONG_DOUBLE_IS_DOUBLE_DOUBLE + // signed because later we use -FRACTION_LEN constexpr int32_t FRACTION_LEN = fputil::FPBits::FRACTION_LEN; int exponent = float_bits.get_explicit_exponent(); From 285669ee677b783c62f132a696a7110a078bff97 Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Tue, 12 May 2026 15:03:08 +0200 Subject: [PATCH 438/538] [llvm][tools] Add support to llvm-offload-binary to unbundle images inside OffloadBinary images (#184774) Enhance the llvm-offload-binary tool to be able to unbudle with logic to handle different cases related to spirv64-intel offload binary images. It also allows to extract all images without requiring the use --image options to simplify its use. Assisted by Claude. --- .../docs/CommandGuide/llvm-offload-binary.rst | 107 ++++++++++++- .../llvm-offload-binary.ll | 140 ++++++++++++++---- .../llvm-offload-binary.cpp | 58 +++++++- 3 files changed, 267 insertions(+), 38 deletions(-) diff --git a/llvm/docs/CommandGuide/llvm-offload-binary.rst b/llvm/docs/CommandGuide/llvm-offload-binary.rst index 960b12d8af286..87005967775db 100644 --- a/llvm/docs/CommandGuide/llvm-offload-binary.rst +++ b/llvm/docs/CommandGuide/llvm-offload-binary.rst @@ -15,7 +15,16 @@ DESCRIPTION files into a single binary container. 
The resulting binary can then be embedded into the host section table to form a fat binary containing offloading code for different targets. Conversely, it can also extract previously bundled device -images. +images from offload binaries. + +When extracting images, if no :option:`--image` filters are specified, all +offload images are automatically extracted with descriptive filenames. When +:option:`--image` filters are provided, only matching images are extracted. + +The tool supports nested OffloadBinary format, where device images can be wrapped +in an inner OffloadBinary container. When extracting, the tool automatically +detects and unwraps nested OffloadBinary images, making the format transparent +to users. The binary format begins with the magic bytes ``0x10FF10AD``, followed by a version and size. Each binary contains its own header, allowing tools to locate @@ -32,12 +41,22 @@ EXAMPLE $ llvm-offload-binary -o out.bin \ --image=file=input.o,triple=nvptx64,arch=sm_70 - # Extract a matching image from a fat binary: - $ llvm-offload-binary in.bin \ - --image=file=output.o,triple=nvptx64,arch=sm_70 + # Extract all offload images from an executable (no filters): + $ llvm-offload-binary in.bin + # Output: + # Extracted: in-nvptx64-nvidia-cuda-sm_70.0.bc + # Extracted: in-spirv64-intel-unknown.0.spv + + # Extract only SPIR-V images using filters: + $ llvm-offload-binary in.bin --image=triple=spirv64-intel + # Output: + # Extracted: in-spirv64-intel-unknown.0.spv - # Extract and archive images into a static library: - $ llvm-offload-binary in.bin --archive -o libdevice.a + # Extract filtered images to a specific file: + $ llvm-offload-binary in.bin --image=file=output.bc,arch=sm_70 + + # Extract filtered images to an archive: + $ llvm-offload-binary in.bin --image=file=output.a,triple=nvptx64 --archive OPTIONS ------- @@ -53,6 +72,12 @@ OPTIONS Commonly used optional keys include ``arch`` (e.g. ``sm_70`` for CUDA) and ``triple`` (e.g. nvptx64-nvidia-cuda). 
+ When bundling, this option specifies images to include in the output binary. + When extracting, this option acts as a filter: only images matching the + specified keys are extracted. If no :option:`--image` options are provided + during extraction, all images are automatically extracted with descriptive + filenames. + .. option:: -o Write output to . When bundling, this specifies the fat binary filename. @@ -179,7 +204,75 @@ The enumerated values for ``image kind`` and ``offload kind`` are: | OFK_SYCL | 0x04 | The producer was SYCL | +------------+-------+---------------------------------------+ +COMMON WORKFLOWS +---------------- + +**Workflow 1: Explore Executable Contents** + +Extract all embedded offload images to see what's inside: + +.. code-block:: console + + $ clang++ -fopenmp -fopenmp-targets=nvptx64,spirv64-intel app.cpp -o myapp + $ llvm-offload-binary myapp + # Output: + # Extracted: myapp-nvptx64-nvidia-cuda-sm_70.0.bc + # Extracted: myapp-spirv64-intel-unknown.1.spv + +**Workflow 2: Extract Specific Target** + +Extract only images for a specific target: + +.. code-block:: console + + $ llvm-offload-binary myapp --image=triple=spirv64-intel + # Output: + # Extracted: myapp-spirv64-intel-unknown.0.spv + +**Workflow 3: Create Device Image Archive** + +Extract filtered images into a static archive: + +.. code-block:: console + + $ llvm-offload-binary myapp --image=file=nvptx.a,triple=nvptx64 --archive + $ ar t nvptx.a + # Shows extracted CUDA images + +**Workflow 4: Validate SPIR-V** + +Extract and validate SPIR-V binaries: + +.. code-block:: console + + $ llvm-offload-binary myapp --image=triple=spirv64-intel + $ spirv-val myapp-spirv64-intel-unknown.0.spv + $ spirv-dis myapp-spirv64-intel-unknown.0.spv -o kernel.spvasm + +**Workflow 5: Bundle Multiple Targets** + +Create a fat binary from multiple device images: + +.. 
code-block:: console + + $ clang++ -fopenmp -fopenmp-targets=nvptx64 --offload-device-only kernel.cpp -o kernel_nvptx.bc + $ clang++ -fopenmp -fopenmp-targets=spirv64-intel --offload-device-only kernel.cpp -o kernel_spirv.bc + $ llvm-offload-binary -o bundle.bin \ + --image=file=kernel_nvptx.bc,triple=nvptx64,arch=sm_70 \ + --image=file=kernel_spirv.bc,triple=spirv64-intel + +**Workflow 6: Extract and Rebundle** + +Extract images from one binary and rebundle with modifications: + +.. code-block:: console + + $ llvm-offload-binary old_app + $ llvm-offload-binary -o new_bundle.bin \ + --image=file=old_app-nvptx64-nvidia-cuda-sm_70.0.bc,triple=nvptx64,arch=sm_70 \ + --image=file=new_kernel.bc,triple=nvptx64,arch=sm_80 + SEE ALSO -------- -:manpage:`clang(1)`, :manpage:`llvm-objdump(1)` +:manpage:`clang(1)`, :manpage:`llvm-objdump(1)`, :manpage:`spirv-val(1)`, :manpage:`spirv-dis(1)` diff --git a/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll b/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll index 31ee5e286717f..023194dfab60c 100644 --- a/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll +++ b/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll @@ -16,40 +16,128 @@ ; RUN: llvm-offload-binary %t3 --image=file=%t4 ; RUN: diff %s %t4 +; Test extracting all images without specifying --image filters. +; RUN: llvm-offload-binary %t | FileCheck --check-prefix=EXTRACT %s + +; EXTRACT: Extracted: llvm-offload-binary.{{.*}}-x-y-z-abc.0. + ; Test nested OffloadBinary construction with multiple inner images. 
; RUN: llvm-offload-binary -o %t5 --image=file=%s,arch=abc,triple=x-y-z --image=file=%s,arch=def,triple=x-y-z ; RUN: llvm-offload-binary -o %t6 --image=file=%t5,arch=nested,triple=x-y-z ; RUN: llvm-objdump --offloading %t6 | FileCheck %s --check-prefix=NESTED -; NESTED: OFFLOADING IMAGE [0]: -; NESTED: arch nested -; NESTED: nested images 2 -; NESTED: OFFLOADING IMAGE [0.0]: -; NESTED: arch abc -; NESTED: OFFLOADING IMAGE [0.1]: -; NESTED: arch def +; NESTED: OFFLOADING IMAGE [0]: +; NESTED-NEXT: kind +; NESTED-NEXT: arch nested +; NESTED-NEXT: triple x-y-z +; NESTED-NEXT: producer none +; NESTED-NEXT: image size {{.*}} bytes +; NESTED-NEXT: nested images 2 +; NESTED-EMPTY: +; NESTED-NEXT: OFFLOADING IMAGE [0.0]: +; NESTED-NEXT: kind +; NESTED-NEXT: arch abc +; NESTED-NEXT: triple x-y-z +; NESTED-NEXT: producer none +; NESTED-NEXT: image size {{.*}} bytes +; NESTED-EMPTY: +; NESTED-NEXT: OFFLOADING IMAGE [0.1]: +; NESTED-NEXT: kind +; NESTED-NEXT: arch def +; NESTED-NEXT: triple x-y-z +; NESTED-NEXT: producer none +; NESTED-NEXT: image size {{.*}} bytes ; Test complex nested OffloadBinary construction with multiple levels. 
; RUN: llvm-offload-binary -o %t7 --image=file=%s,arch=abc,triple=x-y-z --image=file=%t5,arch=nested,triple=x-y-z ; RUN: llvm-offload-binary -o %t8 --image=file=%t7,arch=nested,triple=x-y-z --image=file=%t5,arch=nested2,triple=x-y-z ; RUN: llvm-objdump --offloading %t8 | FileCheck %s --check-prefix=NESTED2 -; NESTED2: OFFLOADING IMAGE [0]: -; NESTED2: arch nested -; NESTED2: nested images 2 -; NESTED2: OFFLOADING IMAGE [0.0]: -; NESTED2: arch abc -; NESTED2: OFFLOADING IMAGE [0.1]: -; NESTED2: arch nested -; NESTED2: nested images 2 -; NESTED2: OFFLOADING IMAGE [0.1.0]: -; NESTED2: arch abc -; NESTED2: OFFLOADING IMAGE [0.1.1]: -; NESTED2: arch def -; NESTED2: OFFLOADING IMAGE [1]: -; NESTED2: arch nested2 -; NESTED2: nested images 2 -; NESTED2: OFFLOADING IMAGE [1.0]: -; NESTED2: arch abc -; NESTED2: OFFLOADING IMAGE [1.1]: -; NESTED2: arch def +; NESTED2: OFFLOADING IMAGE [0]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch nested +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-NEXT: nested images 2 +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING IMAGE [0.0]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch abc +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING IMAGE [0.1]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch nested +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-NEXT: nested images 2 +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING IMAGE [0.1.0]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch abc +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING IMAGE [0.1.1]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch def +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING 
IMAGE [1]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch nested2 +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-NEXT: nested images 2 +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING IMAGE [1.0]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch abc +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes +; NESTED2-EMPTY: +; NESTED2-NEXT: OFFLOADING IMAGE [1.1]: +; NESTED2-NEXT: kind +; NESTED2-NEXT: arch def +; NESTED2-NEXT: triple x-y-z +; NESTED2-NEXT: producer none +; NESTED2-NEXT: image size {{.*}} bytes + +; Test extracting nested images. +; RUN: llvm-offload-binary %t6 | FileCheck --check-prefix=EXTRACT-NESTED %s + +; EXTRACT-NESTED: Extracted: llvm-offload-binary.{{.*}}-x-y-z-abc.0. +; EXTRACT-NESTED-NEXT: Extracted: llvm-offload-binary.{{.*}}-x-y-z-def.1. + +; Test mixed nested and non-nested images. +; RUN: llvm-offload-binary -o %t7 --image=file=%t5,arch=nested,triple=x-y-z --image=file=%s,arch=ghi,triple=x-y-z +; RUN: llvm-offload-binary %t7 | FileCheck --check-prefix=EXTRACT-MIXED %s + +; EXTRACT-MIXED: Extracted: llvm-offload-binary.{{.*}}-x-y-z-abc.0. +; EXTRACT-MIXED-NEXT: Extracted: llvm-offload-binary.{{.*}}-x-y-z-def.1. +; EXTRACT-MIXED-NEXT: Extracted: llvm-offload-binary.{{.*}}-x-y-z-ghi.2. + +; Test extracting inner OffloadBinary with --image filter. +; RUN: llvm-offload-binary %t7 --image=file=%t8,arch=nested,triple=x-y-z +; RUN: diff %t5 %t8 + +; Test malformed outer OffloadBinary is handled gracefully. +; RUN: printf "\020\377\020\255\012" > %t9 +; RUN: not llvm-offload-binary %t9 2>&1 | FileCheck --check-prefix=MALFORMED-OUTER %s + +; MALFORMED-OUTER: llvm-offload-binary: error: Invalid data was encountered while parsing the file + +; Test malformed inner OffloadBinary is handled gracefully. 
+; RUN: llvm-offload-binary -o %t10 --image=file=%t9,arch=nested,triple=x-y-z +; RUN: not llvm-offload-binary %t10 2>&1 | FileCheck --check-prefix=MALFORMED-INNER %s + +; MALFORMED-INNER: llvm-offload-binary: error: Invalid data was encountered while parsing the file diff --git a/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp b/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp index e22d13b946651..1b8ed02c8e6d0 100644 --- a/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp +++ b/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp @@ -16,6 +16,8 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileOutputBuffer.h" @@ -135,6 +137,44 @@ static Error bundleImages() { return Error::success(); } +// Extract a single OffloadBinary, recursively handling nested OffloadBinaries. +static Error extractBinary(const OffloadBinary *Binary, StringRef InputFile, + uint64_t &Idx, StringSaver &Saver) { + StringRef ImageData = Binary->getImage(); + + // Check if the image contains a nested OffloadBinary. + if (identify_magic(ImageData) == file_magic::offload_binary) { + // Parse nested OffloadBinary. + MemoryBufferRef InnerBuffer(ImageData, "nested-offload-binary"); + SmallVector InnerBinaries; + if (Error Err = extractOffloadBinaries(InnerBuffer, InnerBinaries)) + return Err; + + // Recursively extract each nested binary. + for (const auto &InnerBinary : InnerBinaries) { + if (Error E = + extractBinary(InnerBinary.getBinary(), InputFile, Idx, Saver)) + return E; + } + return Error::success(); + } + + // Base case: extract the actual device image. 
+ std::string Filename; + raw_string_ostream SS(Filename); + SS << sys::path::stem(InputFile) << "-" << Binary->getTriple(); + StringRef Arch = Binary->getArch(); + if (!Arch.empty()) + SS << "-" << Arch; + SS << "." << Idx++ << "." << getImageKindName(Binary->getImageKind()); + + if (Error E = writeFile(Saver.save(Filename), ImageData)) + return E; + + outs() << "Extracted: " << Filename << "\n"; + return Error::success(); +} + static Error unbundleImages() { ErrorOr> BufferOrErr = MemoryBuffer::getFileOrSTDIN(InputFile); @@ -152,6 +192,18 @@ static Error unbundleImages() { if (Error Err = extractOffloadBinaries(*Buffer, Binaries)) return Err; + // If no filters specified, extract all images. + if (DeviceImages.empty()) { + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + uint64_t Idx = 0; + for (const OffloadFile &File : Binaries) { + if (Error E = extractBinary(File.getBinary(), InputFile, Idx, Saver)) + return E; + } + return Error::success(); + } + // Try to extract each device image specified by the user from the input file. for (StringRef Image : DeviceImages) { BumpPtrAllocator Alloc; @@ -202,11 +254,7 @@ static Error unbundleImages() { } else { uint64_t Idx = 0; for (const OffloadBinary *Binary : Extracted) { - StringRef Filename = - Saver.save(sys::path::stem(InputFile) + "-" + Binary->getTriple() + - "-" + Binary->getArch() + "." + std::to_string(Idx++) + - "." 
+ getImageKindName(Binary->getImageKind())); - if (Error E = writeFile(Filename, Binary->getImage())) + if (Error E = extractBinary(Binary, InputFile, Idx, Saver)) return E; } } From 0dda05ec66d342c0703665aac8562e049ce5b446 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 12 May 2026 09:03:35 -0400 Subject: [PATCH 439/538] [gn build] Port 9f3d3048715a (#197182) --- .../gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn index 7b48e0d459665..d6acccb232eac 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn @@ -53,6 +53,7 @@ static_library("misc") { "PredictableRandCheck.cpp", "RedundantExpressionCheck.cpp", "StaticAssertCheck.cpp", + "StaticInitializationCycleCheck.cpp", "ThrowByValueCatchByReferenceCheck.cpp", "UnconventionalAssignOperatorCheck.cpp", "UniqueptrResetReleaseCheck.cpp", From dc713c371eb137a2f9c86abe4377e87fcbc4d180 Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi Date: Tue, 12 May 2026 18:38:05 +0530 Subject: [PATCH 440/538] [UniformityAnalysis] Rename public api's in UA (NFC) (#196251) --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 7 +- llvm/include/llvm/ADT/GenericUniformityInfo.h | 21 ++-- .../SelectionDAG/FunctionLoweringInfo.cpp | 2 +- .../AMDGPU/AMDGPUAnnotateUniformValues.cpp | 4 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 8 +- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 4 +- .../AMDGPUGlobalISelDivergenceLowering.cpp | 4 +- .../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 2 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 115 ++++++++++-------- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 2 +- .../AMDGPU/AMDGPURewriteUndefForPHI.cpp | 4 +- .../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 2 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 2 +- 
.../Target/AMDGPU/SIAnnotateControlFlow.cpp | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 2 +- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 2 +- .../Target/AMDGPU/UniformityAnalysisTest.cpp | 8 +- 18 files changed, 103 insertions(+), 90 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index a363b2a3c1702..a9a1211947492 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -1292,17 +1292,18 @@ GenericUniformityInfo::getFunction() const { /// A default-constructed instance (no analysis computed) reports everything /// as uniform, which is conservatively correct for non-divergent targets. template -bool GenericUniformityInfo::isDivergent(ConstValueRefT V) const { +bool GenericUniformityInfo::isDivergentAtDef(ConstValueRefT V) const { return DA && DA->isDivergent(V); } template -bool GenericUniformityInfo::isDivergent(const InstructionT *I) const { +bool GenericUniformityInfo::isDivergentAtDef( + const InstructionT *I) const { return DA && DA->isDivergent(*I); } template -bool GenericUniformityInfo::isDivergentUse(const UseT &U) const { +bool GenericUniformityInfo::isDivergentAtUse(const UseT &U) const { return DA && DA->isDivergentUse(U); } diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h index a504335ec078e..c4eb5485424b9 100644 --- a/llvm/include/llvm/ADT/GenericUniformityInfo.h +++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h @@ -58,21 +58,26 @@ template class GenericUniformityInfo { const FunctionT &getFunction() const; /// Whether \p V is divergent at its definition. - bool isDivergent(ConstValueRefT V) const; + bool isDivergentAtDef(ConstValueRefT V) const; - /// Whether \p V is uniform/non-divergent. 
- bool isUniform(ConstValueRefT V) const { return !isDivergent(V); } + /// Whether \p V is uniform/non-divergent at its definition. + bool isUniformAtDef(ConstValueRefT V) const { return !isDivergentAtDef(V); } // Similar queries for InstructionT. These accept a pointer argument so that // in LLVM IR, they overload the equivalent queries for Value*. For example, // if querying whether a CondBrInst is divergent, it should not be treated as // a Value in LLVM IR. - bool isUniform(const InstructionT *I) const { return !isDivergent(I); }; - bool isDivergent(const InstructionT *I) const; + bool isUniformAtDef(const InstructionT *I) const { + return !isDivergentAtDef(I); + }; + bool isDivergentAtDef(const InstructionT *I) const; - /// \brief Whether \p U is divergent. Uses of a uniform value can be - /// divergent. - bool isDivergentUse(const UseT &U) const; + /// \brief Whether \p U is divergent at its use. Uses of a uniform value can + /// be divergent. + bool isDivergentAtUse(const UseT &U) const; + + /// \brief Whether \p U is uniform/non-divergent at its use. 
+ bool isUniformAtUse(const UseT &U) const { return !isDivergentAtUse(U); } bool hasDivergentTerminator(const BlockT &B); diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 5edb992e68f4c..6ca0592644a08 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -385,7 +385,7 @@ Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) { } Register FunctionLoweringInfo::CreateRegs(const Value *V) { - return CreateRegs(V->getType(), UA && UA->isDivergent(V) && + return CreateRegs(V->getType(), UA && UA->isDivergentAtDef(V) && !TLI->requiresUniformRegister(*MF, V)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 77b042c8e7076..f6122f9856e4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -60,13 +60,13 @@ class AMDGPUAnnotateUniformValues } // End anonymous namespace void AMDGPUAnnotateUniformValues::visitCondBrInst(CondBrInst &I) { - if (UA->isUniform(&I)) + if (UA->isUniformAtDef(&I)) setUniformMetadata(&I); } void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); - if (!UA->isUniform(Ptr)) + if (!UA->isUniformAtDef(Ptr)) return; Instruction *PtrI = dyn_cast(Ptr); if (PtrI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index b4d51522e28af..a4029fb79b49a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -223,11 +223,11 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // If the pointer operand is divergent, then each lane is doing an atomic // operation on a different address, and we cannot optimize that. 
- if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) { + if (UA.isDivergentAtUse(I.getOperandUse(PtrIdx))) { return; } - bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx)); + bool ValDivergent = UA.isDivergentAtUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if @@ -311,7 +311,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { const unsigned ValIdx = 0; - const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx)); + const bool ValDivergent = UA.isDivergentAtUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if @@ -328,7 +328,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { // If any of the other arguments to the intrinsic are divergent, we can't // optimize the operation. 
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { - if (UA.isDivergentUse(I.getOperandUse(Idx))) + if (UA.isDivergentAtUse(I.getOperandUse(Idx))) return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 915d2116bd268..5388d1402e493 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -318,7 +318,7 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { int TySize = DL.getTypeSizeInBits(Ty); Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty); - return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); + return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniformAtDef(&I); } unsigned @@ -370,7 +370,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { return false; // Prefer scalar if this could be s_mul_i32 - if (UA.isUniform(&I)) + if (UA.isUniformAtDef(&I)) return false; Value *LHS = I.getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index 6d4487935e260..cbb4269e17260 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -111,7 +111,7 @@ void DivergenceLoweringHelper::getCandidatesForLowering( if (MI.getOpcode() != TargetOpcode::G_PHI) continue; Register Dst = MI.getOperand(0).getReg(); - if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst)) + if (MRI->getType(Dst) == S1 && MUI->isDivergentAtDef(Dst)) Vreg1Phis.push_back(&MI); } } @@ -207,7 +207,7 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { DenseMap TDCache; for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) { - if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || + if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergentAtDef(Reg) || 
ILMA.isS32S64LaneMask(Reg)) continue; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 63e265612cbf7..3844e68be8e8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -493,7 +493,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const { if (LI.getAlign() < DL.getABITypeAlign(Ty)) return false; // It should be uniform, i.e. a scalar load. - return UA.isUniform(&LI); + return UA.isUniformAtDef(&LI); } bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 061b8dc070ead..1e5c5e851fb6d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -99,57 +99,59 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, case B512: return MRI.getType(Reg).getSizeInBits() == 512; case DivAnyTy: - return MUI.isDivergent(Reg); + return MUI.isDivergentAtDef(Reg); case UniS1: - return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniformAtDef(Reg); case UniS16: - return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniformAtDef(Reg); case UniS32: - return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniformAtDef(Reg); case UniS64: - return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniformAtDef(Reg); case UniS128: - return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniformAtDef(Reg); case UniP0: - return MRI.getType(Reg) == LLT::pointer(0, 64) && 
MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniformAtDef(Reg); case UniP1: - return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniformAtDef(Reg); case UniP2: - return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniformAtDef(Reg); case UniP3: - return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniformAtDef(Reg); case UniP4: - return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniformAtDef(Reg); case UniP5: - return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniformAtDef(Reg); case UniP8: - return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniformAtDef(Reg); case UniPtr32: - return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); + return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniformAtDef(Reg); case UniPtr64: - return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniform(Reg); + return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniformAtDef(Reg); case UniPtr128: - return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg); + return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniformAtDef(Reg); case UniV2S16: - return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && + MUI.isUniformAtDef(Reg); case UniV2S32: - return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && + MUI.isUniformAtDef(Reg); case UniB32: - return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 32 && 
MUI.isUniformAtDef(Reg); case UniB64: - return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniformAtDef(Reg); case UniB96: - return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniformAtDef(Reg); case UniB128: - return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniformAtDef(Reg); case UniB160: - return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniformAtDef(Reg); case UniB256: - return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniformAtDef(Reg); case UniB512: - return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg); + return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniformAtDef(Reg); case UniBRC: { - if (!MUI.isUniform(Reg)) + if (!MUI.isUniformAtDef(Reg)) return false; // Check if there is SGPR register class of same size as the LLT. 
const SIRegisterInfo *TRI = @@ -160,59 +162,64 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize); } case DivS1: - return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergentAtDef(Reg); case DivS16: - return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergentAtDef(Reg); case DivS32: - return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergentAtDef(Reg); case DivS64: - return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergentAtDef(Reg); case DivS128: - return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergentAtDef(Reg); case DivP0: - return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergentAtDef(Reg); case DivP1: - return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergentAtDef(Reg); case DivP2: - return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergentAtDef(Reg); case DivP3: - return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergentAtDef(Reg); case DivP4: - return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergentAtDef(Reg); case DivP5: - return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergentAtDef(Reg); case DivPtr32: - return 
isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergent(Reg); + return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergentAtDef(Reg); case DivPtr64: - return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergent(Reg); + return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergentAtDef(Reg); case DivPtr128: - return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg); + return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergentAtDef(Reg); case DivV2S16: - return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && + MUI.isDivergentAtDef(Reg); case DivV2S32: - return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && + MUI.isDivergentAtDef(Reg); case DivV3S32: - return MRI.getType(Reg) == LLT::fixed_vector(3, 32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(3, 32) && + MUI.isDivergentAtDef(Reg); case DivV4S16: - return MRI.getType(Reg) == LLT::fixed_vector(4, 16) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(4, 16) && + MUI.isDivergentAtDef(Reg); case DivV6S32: - return MRI.getType(Reg) == LLT::fixed_vector(6, 32) && MUI.isDivergent(Reg); + return MRI.getType(Reg) == LLT::fixed_vector(6, 32) && + MUI.isDivergentAtDef(Reg); case DivB32: - return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergentAtDef(Reg); case DivB64: - return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergentAtDef(Reg); case DivB96: - return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergentAtDef(Reg); case DivB128: - return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergentAtDef(Reg); case 
DivB160: - return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergentAtDef(Reg); case DivB256: - return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergentAtDef(Reg); case DivB512: - return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg); + return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergentAtDef(Reg); case DivBRC: { - if (!MUI.isDivergent(Reg)) + if (!MUI.isDivergentAtDef(Reg)) return false; // Check if there is VGPR register class of same size as the LLT. const SIRegisterInfo *TRI = @@ -317,7 +324,7 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); if (Slot != -1) - return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot]; + return MUI.isUniformAtDef(Reg) ? &Uni[Slot] : &Div[Slot]; } // Slow search for more complex rules. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp index 493b7541cdd81..247522a0dd43b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -111,7 +111,7 @@ class RegBankSelectHelper { const RegisterBank *getRegBankToAssign(Register Reg) { if (!isTemporalDivergenceCopy(Reg) && - (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))) + (MUI.isUniformAtDef(Reg) || ILMA.isS32S64LaneMask(Reg))) return SgprRB; if (MRI.getType(Reg) == LLT::scalar(1)) return VccRB; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp index 1c135f09080e1..c2aa3ddd99a69 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp @@ -102,7 +102,7 @@ bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) { SmallVector ToBeDeleted; for (auto &BB : F) { for (auto 
&PHI : BB.phis()) { - if (UA.isDivergent(&PHI)) + if (UA.isDivergentAtDef(&PHI)) continue; // The unique incoming value except undef/poison for the PHI node. @@ -144,7 +144,7 @@ bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) { // TODO: We should still be able to replace undef value if the unique // value is a Constant. if (!UniqueDefinedIncoming || Undefs.empty() || - !UA.isDivergent(DominateBB->getTerminator())) + !UA.isDivergentAtDef(DominateBB->getTerminator())) continue; // We only replace the undef when DominateBB truly dominates all the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 864d877fe9ac0..6b68c25825d7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -49,7 +49,7 @@ isDivergentUseWithNew(const Use &U, const UniformityInfo &UI, Value *V = U.get(); if (auto It = Tracker.find(V); It != Tracker.end()) return !It->second; // divergent if marked false - return UI.isDivergentUse(U); + return UI.isDivergentAtUse(U); } /// Optimizes uniform intrinsics calls if their operand can be proven uniform. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 36e51421c5ae4..6502b76fdee4f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -123,7 +123,7 @@ static bool isUniformlyReached(const UniformityInfo &UA, BasicBlock &BB) { while (!Stack.empty()) { BasicBlock *Top = Stack.pop_back_val(); - if (!UA.isUniform(Top->getTerminator())) + if (!UA.isUniformAtDef(Top->getTerminator())) return false; for (BasicBlock *Pred : predecessors(Top)) { diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 54ec4a51a4ab3..6d251acf08566 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -128,7 +128,7 @@ void SIAnnotateControlFlow::initialize(const GCNSubtarget &ST) { /// Is the branch condition uniform or did the StructurizeCFG pass /// consider it as such? bool SIAnnotateControlFlow::isUniform(CondBrInst *T) { - return UA->isUniform(T) || T->hasMetadata("structurizecfg.uniform"); + return UA->isUniformAtDef(T) || T->hasMetadata("structurizecfg.uniform"); } /// Is BB the last block saved on the stack ? 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8b89366f89c5a..4bd0a0bee00ff 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -19810,7 +19810,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, return !TRI->isSGPRReg(MRI, Reg); if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) - return UA->isDivergent(V); + return UA->isDivergentAtDef(V); assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); return !TRI->isSGPRReg(MRI, Reg); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 0c5e3b5039309..ab32a7602a4c0 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -780,7 +780,7 @@ static bool isSCEVUniform(const SCEV *S, UniformityInfo &UI) { if (isa(S)) return true; if (auto *U = dyn_cast(S)) - return UI.isUniform(U->getValue()); + return UI.isUniformAtDef(U->getValue()); for (const SCEV *Op : S->operands()) { if (!isSCEVUniform(Op, UI)) return false; diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 04ece92b74375..903a0f2f6bab3 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -1302,7 +1302,7 @@ static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID, if (!Br) continue; - if (!UA.isUniform(Br)) + if (!UA.isUniformAtDef(Br)) return false; // One of our direct children is conditional. 
diff --git a/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp b/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp index da480e02e8c12..e61ef1d56c0a3 100644 --- a/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp +++ b/llvm/unittests/Target/AMDGPU/UniformityAnalysisTest.cpp @@ -64,9 +64,9 @@ TEST_F(AMDGPUTestBase, NewValueIsConservativelyDivergent) { // Existing values from the analysis are uniform (kernel args are inreg). Instruction *AddInst = &*F->getEntryBlock().begin(); ASSERT_TRUE(isa(AddInst)); - EXPECT_FALSE(UI.isDivergent(AddInst)) << "%add should be uniform"; - EXPECT_FALSE(UI.isDivergent(F->getArg(0))) << "%a should be uniform"; - EXPECT_FALSE(UI.isDivergent(F->getArg(1))) << "%b should be uniform"; + EXPECT_FALSE(UI.isDivergentAtDef(AddInst)) << "%add should be uniform"; + EXPECT_FALSE(UI.isDivergentAtDef(F->getArg(0))) << "%a should be uniform"; + EXPECT_FALSE(UI.isDivergentAtDef(F->getArg(1))) << "%b should be uniform"; // Create a new instruction after analysis. It was not present during // analysis, so it is not in UniformValues and must be conservatively @@ -74,6 +74,6 @@ TEST_F(AMDGPUTestBase, NewValueIsConservativelyDivergent) { IRBuilder<> Builder(AddInst->getNextNode()); Value *NewInst = Builder.CreateMul(F->getArg(0), F->getArg(1), "new_mul"); - EXPECT_TRUE(UI.isDivergent(NewInst)) + EXPECT_TRUE(UI.isDivergentAtDef(NewInst)) << "New instruction created after analysis must be reported divergent"; } From 635314c6c848e5c0c5f9dcc92c5679bf0b508a50 Mon Sep 17 00:00:00 2001 From: Ritanya-B-Bharadwaj Date: Tue, 12 May 2026 18:46:42 +0530 Subject: [PATCH 441/538] [flang][OpenMP] Fix data race with LINEAR clause on PARALLEL DO SIMD (#195634) Fixes https://github.com/llvm/llvm-project/issues/180093 LINEAR variables on composite DO SIMD were being lowered onto omp.simd, which writes back unconditionally causing a race inside PARALLEL. Move them to omp.wsloop instead, which already has correct last-iter write-back with a barrier. 
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 24 +++++++++++ .../Lower/OpenMP/composite_simd_linear.f90 | 41 +++++++++++-------- flang/test/Lower/OpenMP/linear_modifier.f90 | 11 ++--- 3 files changed, 53 insertions(+), 23 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index fb5014f3394be..da44717c8b6ed 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3527,6 +3527,17 @@ static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms, &reductionVarCache); + // Same as genCompositeDoSimd. + if (!simdClauseOps.linearVars.empty()) { + wsloopClauseOps.linearVars = std::move(simdClauseOps.linearVars); + wsloopClauseOps.linearStepVars = std::move(simdClauseOps.linearStepVars); + wsloopClauseOps.linearVarTypes = simdClauseOps.linearVarTypes; + wsloopClauseOps.linearModifiers = simdClauseOps.linearModifiers; + simdClauseOps.linearVars.clear(); + simdClauseOps.linearStepVars.clear(); + simdClauseOps.linearVarTypes = nullptr; + simdClauseOps.linearModifiers = nullptr; + } DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, /*useDelayedPrivatization=*/true, symTable); @@ -3662,6 +3673,19 @@ static mlir::omp::WsloopOp genCompositeDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms, &reductionVarCache); + // omp.simd writes back linear vars unconditionally, causing a race when + // inside a parallel region. Move them to wsloop which has proper last-iter + // write-back guarded by a barrier. 
+ if (!simdClauseOps.linearVars.empty()) { + wsloopClauseOps.linearVars = std::move(simdClauseOps.linearVars); + wsloopClauseOps.linearStepVars = std::move(simdClauseOps.linearStepVars); + wsloopClauseOps.linearVarTypes = simdClauseOps.linearVarTypes; + wsloopClauseOps.linearModifiers = simdClauseOps.linearModifiers; + simdClauseOps.linearVars.clear(); + simdClauseOps.linearStepVars.clear(); + simdClauseOps.linearVarTypes = nullptr; + simdClauseOps.linearModifiers = nullptr; + } DataSharingProcessor wsloopItemDSP( converter, semaCtx, doItem->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/false, diff --git a/flang/test/Lower/OpenMP/composite_simd_linear.f90 b/flang/test/Lower/OpenMP/composite_simd_linear.f90 index 38ef80292326d..dbe9961cafbdc 100644 --- a/flang/test/Lower/OpenMP/composite_simd_linear.f90 +++ b/flang/test/Lower/OpenMP/composite_simd_linear.f90 @@ -7,13 +7,13 @@ subroutine do_simd !CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFdo_simdEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 !CHECK: %{{.*}} = arith.constant 1 : i32 -!CHECK: %[[IV_STEP:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop { -!DEFAULT: omp.simd linear(%[[X]]#0 : !fir.ref = %[[CONST]] : i32, %[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32) private(@_QFdo_simdEi_private_i32 {{.*}} -> %arg0 : !fir.ref) { -!OPENMP52: omp.simd linear(val(%[[X]]#0 : !fir.ref = %[[CONST]] : i32), val(%[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32)) private(@_QFdo_simdEi_private_i32 {{.*}} -> %arg0 : !fir.ref) { +!CHECK: %{{.*}} = arith.constant 1 : i32 +!DEFAULT: omp.wsloop linear(%[[X]]#0 : !fir.ref = {{.*}}) { +!OPENMP52: omp.wsloop linear(val(%[[X]]#0 : !fir.ref = {{.*}})) { +!CHECK: omp.simd linear({{.*}}) private(@_QFdo_simdEi_private_i32 {{.*}} -> %arg0 : !fir.ref) { !CHECK: } -!CHECK: } {linear_var_types = [i32, i32], omp.composite} -!CHECK: } {omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} +!CHECK: } {linear_var_types = [i32], 
omp.composite} integer :: x !$omp do simd linear(x:1) do i = 1, N @@ -45,15 +45,18 @@ subroutine distribute_parallel_do !CHECK: omp.teams { !CHECK: omp.parallel { !CHECK: %[[CONST]] = arith.constant 1 : i32 +!CHECK: %{{.*}} = arith.constant 1 : i32 +!CHECK: %[[IV_STEP:.*]] = arith.constant 1 : i32 !CHECK: omp.distribute { -!CHECK: omp.wsloop { -!DEFAULT: omp.simd linear(%[[I]]#0 : !fir.ref = %[[CONST]] : i32) private(@_QFdistribute_parallel_doEi_private_i32 %[[I]]#0 -> %arg0 : !fir.ref) { -!OPENMP52: omp.simd linear(val(%[[I]]#0 : !fir.ref = %[[CONST]] : i32)) private(@_QFdistribute_parallel_doEi_private_i32 %[[I]]#0 -> %arg0 : !fir.ref) { +!DEFAULT: omp.wsloop linear(%[[I]]#0 : !fir.ref = {{.*}}) { +!OPENMP52: omp.wsloop linear(val(%[[I]]#0 : !fir.ref = {{.*}})) { +!CHECK: omp.simd linear({{.*}}) private(@_QFdistribute_parallel_doEi_private_i32 {{.*}}) { !$omp teams !$omp distribute parallel do simd linear(i:1) do i = 1, N end do !$omp end distribute parallel do simd +!CHECK: } {linear_var_types = [i32], omp.composite} !CHECK: } {linear_var_types = [i32], omp.composite} !$omp end teams end subroutine distribute_parallel_do @@ -64,16 +67,17 @@ subroutine parallel_do !CHECK: omp.parallel { !CHECK: %[[LINEAR_STEP:.*]] = arith.constant 2 : i32 !CHECK: %{{.*}} = arith.constant 1 : i32 -!CHECK: %[[IV_STEP:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop { -!DEFAULT: omp.simd linear(%[[X]]#0 : !fir.ref = %[[LINEAR_STEP]] : i32, %[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32) private(@_QFparallel_doEi_private_i32 %[[I]]#0 -> %arg0 : !fir.ref) { -!OPENMP52: omp.simd linear(val(%[[X]]#0 : !fir.ref = %[[LINEAR_STEP]] : i32), val(%[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32)) private(@_QFparallel_doEi_private_i32 %[[I]]#0 -> %arg0 : !fir.ref) { +!CHECK: %{{.*}} = arith.constant 1 : i32 +!DEFAULT: omp.wsloop linear(%[[X]]#0 : !fir.ref = {{.*}}) { +!OPENMP52: omp.wsloop linear(val(%[[X]]#0 : !fir.ref = {{.*}})) { +!CHECK: omp.simd linear({{.*}}) private(@_QFparallel_doEi_private_i32 
{{.*}}) { integer :: x !$omp parallel do simd linear(x:2) do i = 1, N end do !$omp end parallel do simd -!CHECK: } {linear_var_types = [i32, i32], omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} end subroutine parallel_do subroutine teams_distribute @@ -103,13 +107,14 @@ subroutine teams_distribute_parallel_do !CHECK: %{{.*}} = arith.constant 1 : i32 !CHECK: %[[IV_STEP:.*]] = arith.constant 1 : i32 !CHECK: omp.distribute { -!CHECK: omp.wsloop { -!DEFAULT: omp.simd linear(%[[X]]#0 : !fir.ref = %c1_i32 : i32, %[[I]]#0 : !fir.ref = %c1_i32_1 : i32) private(@_QFteams_distribute_parallel_doEi_private_i32 %[[I]]#0 -> %arg0 : !fir.ref) { -!OPENMP52: omp.simd linear(val(%[[X]]#0 : !fir.ref = %c1_i32 : i32), val(%[[I]]#0 : !fir.ref = %c1_i32_1 : i32)) private(@_QFteams_distribute_parallel_doEi_private_i32 %[[I]]#0 -> %arg0 : !fir.ref) { +!DEFAULT: omp.wsloop linear(%[[X]]#0 : !fir.ref = {{.*}}) { +!OPENMP52: omp.wsloop linear(val(%[[X]]#0 : !fir.ref = {{.*}})) { +!CHECK: omp.simd linear({{.*}}) private(@_QFteams_distribute_parallel_doEi_private_i32 {{.*}}) { integer :: x !$omp teams distribute parallel do simd linear(x) do i = 1, N end do !$omp end teams distribute parallel do simd -!CHECK: } {linear_var_types = [i32, i32], omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} end subroutine teams_distribute_parallel_do diff --git a/flang/test/Lower/OpenMP/linear_modifier.f90 b/flang/test/Lower/OpenMP/linear_modifier.f90 index 8364e5d698f06..cce565bec86d3 100644 --- a/flang/test/Lower/OpenMP/linear_modifier.f90 +++ b/flang/test/Lower/OpenMP/linear_modifier.f90 @@ -41,14 +41,15 @@ subroutine do_simd_linear !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 !CHECK: %{{.*}} = arith.constant 1 : i32 !CHECK: %[[IV_STEP:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop { -!OPENMP52: omp.simd linear(val(%[[X]]#0 : !fir.ref = %[[CONST]] : i32), 
val(%[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32)) private({{.*}}) { -!OPENMP45: omp.simd linear(%[[X]]#0 : !fir.ref = %[[CONST]] : i32, %[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32) private({{.*}}) { +!OPENMP52: omp.wsloop linear(val(%[[X]]#0 : !fir.ref = %[[CONST]] : i32)) { +!OPENMP45: omp.wsloop linear(%[[X]]#0 : !fir.ref = %[[CONST]] : i32) { +!OPENMP52: omp.simd linear(val(%[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32)) private({{.*}}) { +!OPENMP45: omp.simd linear(%[[I]]#0 : !fir.ref = %[[IV_STEP]] : i32) private({{.*}}) { integer :: x !$omp do simd linear(x:1) do i = 1, 10 end do !$omp end do simd -!CHECK: } {linear_var_types = [i32, i32], omp.composite} -!CHECK: } {omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} +!CHECK: } {linear_var_types = [i32], omp.composite} end subroutine do_simd_linear From b7c62f76798eb62306b9dfe1321fa776a4f40c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=AD=90=E6=98=82?= <2802328816@qq.com> Date: Tue, 12 May 2026 21:22:03 +0800 Subject: [PATCH 442/538] [CodeGen][NFC] Extract foldLoadInto helper in PeepholeOptimizer (#197110) Pull the load-folding bookkeeping out of `PeepholeOptimizer::run` into a new `PeepholeOptimizer::foldLoadInto` helper. No functional change intended. This is a preliminary NFC split out from #194662 per @RKSimon's review suggestion: > Still think this is worth pulling out as its own NFC PR The follow-up patch (#194662) adds a second call site that folds a load into an EFLAGS producer after `optimizeCmpInstr` erases the compare, and will reuse this helper instead of duplicating the bookkeeping. 
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 57 +++++++++++++++----------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 1cd2ec5c1cded..d10cf18b1c5e2 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -494,6 +494,13 @@ class PeepholeOptimizer : private MachineFunction::Delegate { bool isLoadFoldable(MachineInstr &MI, SmallSet &FoldAsLoadDefCandidates); + /// Try to fold the load defined by \p FoldReg into \p MI using + /// TII->optimizeLoadInstr. On success, updates \p LocalMIs, erases the old + /// instructions, and returns the replacement; returns nullptr otherwise. + MachineInstr *foldLoadInto(MachineFunction &MF, MachineInstr &MI, + Register FoldReg, + SmallPtrSet &LocalMIs); + /// Check whether \p MI is understood by the register coalescer /// but may require some rewriting. static bool isCoalescableCopy(const MachineInstr &MI) { @@ -1392,6 +1399,31 @@ bool PeepholeOptimizer::isLoadFoldable( return false; } +MachineInstr * +PeepholeOptimizer::foldLoadInto(MachineFunction &MF, MachineInstr &MI, + Register FoldReg, + SmallPtrSet &LocalMIs) { + Register Reg = FoldReg; + MachineInstr *DefMI = nullptr; + MachineInstr *CopyMI = nullptr; + MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI, Reg, DefMI, CopyMI); + if (!FoldMI) + return nullptr; + LLVM_DEBUG(dbgs() << "Replacing: " << MI << " With: " << *FoldMI); + LocalMIs.erase(&MI); + LocalMIs.erase(DefMI); + LocalMIs.insert(FoldMI); + if (CopyMI) + LocalMIs.insert(CopyMI); + if (MI.shouldUpdateAdditionalCallInfo()) + MF.moveAdditionalCallInfo(&MI, FoldMI); + MI.eraseFromParent(); + DefMI->eraseFromParent(); + MRI->markUsesInDebugValueAsUndef(FoldReg); + ++NumLoadFold; + return FoldMI; +} + bool PeepholeOptimizer::isMoveImmediate( MachineInstr &MI, SmallSet &ImmDefRegs, DenseMap &ImmDefMIs) { @@ -1862,31 +1894,10 @@ bool PeepholeOptimizer::run(MachineFunction &MF) 
{ if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) { // We need to fold load after optimizeCmpInstr, since // optimizeCmpInstr can enable folding by converting SUB to CMP. - // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and - // we need it for markUsesInDebugValueAsUndef(). Register FoldedReg = FoldAsLoadDefReg; - MachineInstr *DefMI = nullptr; - MachineInstr *CopyMI = nullptr; - if (MachineInstr *FoldMI = TII->optimizeLoadInstr( - *MI, MRI, FoldAsLoadDefReg, DefMI, CopyMI)) { - // Update LocalMIs since we replaced MI with FoldMI and deleted - // DefMI. - LLVM_DEBUG(dbgs() << "Replacing: " << *MI); - LLVM_DEBUG(dbgs() << " With: " << *FoldMI); - LocalMIs.erase(MI); - LocalMIs.erase(DefMI); - LocalMIs.insert(FoldMI); - if (CopyMI) - LocalMIs.insert(CopyMI); - // Update the call info. - if (MI->shouldUpdateAdditionalCallInfo()) - MI->getMF()->moveAdditionalCallInfo(MI, FoldMI); - MI->eraseFromParent(); - DefMI->eraseFromParent(); - MRI->markUsesInDebugValueAsUndef(FoldedReg); + if (MachineInstr *FoldMI = + foldLoadInto(MF, *MI, FoldAsLoadDefReg, LocalMIs)) { FoldAsLoadDefCandidates.erase(FoldedReg); - ++NumLoadFold; - // MI is replaced with FoldMI so we can continue trying to fold Changed = true; MI = FoldMI; From c5fc073cbd35f0e9482006323e39e9b47efdf209 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 12 May 2026 06:23:15 -0700 Subject: [PATCH 443/538] [libc] Use correct include path for in_port_t.h (#197187) llvm-libc-types/stdint-macros.h does not exist. Not sure why this was passing the CMake build, but this causes the bazel build to fail. 
--- libc/include/llvm-libc-types/in_port_t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/include/llvm-libc-types/in_port_t.h b/libc/include/llvm-libc-types/in_port_t.h index daf33f91d95af..f86c39cb34e8d 100644 --- a/libc/include/llvm-libc-types/in_port_t.h +++ b/libc/include/llvm-libc-types/in_port_t.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_TYPES_IN_PORT_T_H #define LLVM_LIBC_TYPES_IN_PORT_T_H -#include "../llvm-libc-types/stdint-macros.h" +#include "../llvm-libc-macros/stdint-macros.h" typedef uint16_t in_port_t; From 6d22e10845e43897eadab44631eb785170b41d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Endre=20F=C3=BCl=C3=B6p?= Date: Tue, 12 May 2026 15:24:25 +0200 Subject: [PATCH 444/538] [NFC][clang-tidy] Unify diagnostic emission in bugprone-unsafe-functions (#194709) This patch extracts the three diagnostic forms currently duplicated across the Custom and non-Custom branches of `check()` into a single `emitDiag()` helper. --- .../bugprone/UnsafeFunctionsCheck.cpp | 101 ++++++++++-------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index d87511b6bdd73..936fb991c5fff 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -249,6 +249,29 @@ void UnsafeFunctionsCheck::registerMatchers(MatchFinder *Finder) { } } +/// A ``Reason`` prefixed with ``>`` produces a fully-custom message and +/// suppresses the ``Replacement`` suffix; an empty ``Replacement`` yields +/// the "it should not be used" form; otherwise the standard suggestion +/// form is used. 
+static void emitDiag(ClangTidyCheck &Check, const Expr *SourceExpr, + const FunctionDecl *FuncDecl, StringRef Replacement, + StringRef Reason) { + if (Reason.consume_front(">")) { + Check.diag(SourceExpr->getExprLoc(), "function %0 %1") + << FuncDecl << Reason.trim() << SourceExpr->getSourceRange(); + return; + } + if (Replacement.empty()) { + Check.diag(SourceExpr->getExprLoc(), + "function %0 %1; it should not be used") + << FuncDecl << Reason << SourceExpr->getSourceRange(); + return; + } + Check.diag(SourceExpr->getExprLoc(), + "function %0 %1; '%2' should be used instead") + << FuncDecl << Reason << Replacement << SourceExpr->getSourceRange(); +} + void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { const Expr *SourceExpr = nullptr; const FunctionDecl *FuncDecl = nullptr; @@ -282,60 +305,50 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { isAnnexKAvailable(IsAnnexKAvailable, PP, getLangOpts()); StringRef FunctionName = FuncDecl->getName(); + std::string Replacement; + std::string Reason; + if (Custom) { + const CheckedFunction *MatchedEntry = nullptr; for (const auto &Entry : CustomFunctions) { if (Entry.Pattern.match(*FuncDecl)) { - StringRef Reason = - Entry.Reason.empty() ? "is marked as unsafe" : Entry.Reason.c_str(); - - // Omit the replacement, when a fully-custom reason is given. - if (Reason.consume_front(">")) { - diag(SourceExpr->getExprLoc(), "function %0 %1") - << FuncDecl << Reason.trim() << SourceExpr->getSourceRange(); - // Do not recommend a replacement when it is not present. - } else if (Entry.Replacement.empty()) { - diag(SourceExpr->getExprLoc(), - "function %0 %1; it should not be used") - << FuncDecl << Reason << Entry.Replacement - << SourceExpr->getSourceRange(); - // Otherwise, emit the replacement. 
- } else { - diag(SourceExpr->getExprLoc(), - "function %0 %1; '%2' should be used instead") - << FuncDecl << Reason << Entry.Replacement - << SourceExpr->getSourceRange(); - } - - return; + MatchedEntry = &Entry; + break; } } - - llvm_unreachable("No custom function was matched."); - return; - } - - const std::optional ReplacementFunctionName = - [&]() -> std::optional { - if (AnnexK) { - if (AnnexKIsAvailable) - return getAnnexKReplacementFor(FunctionName); - return std::nullopt; + if (!MatchedEntry) { + llvm_unreachable("No custom function was matched."); + return; } + Replacement = MatchedEntry->Replacement; + Reason = MatchedEntry->Reason.empty() ? "is marked as unsafe" + : MatchedEntry->Reason; + } else { + const std::optional ReplacementFunctionName = + [&]() -> std::optional { + if (AnnexK) { + if (AnnexKIsAvailable) + return getAnnexKReplacementFor(FunctionName); + return std::nullopt; + } - if (Normal) - return getReplacementFor(FunctionName, AnnexKIsAvailable).str(); + if (Normal) + return getReplacementFor(FunctionName, AnnexKIsAvailable).str(); - if (Additional) - return getReplacementForAdditional(FunctionName, AnnexKIsAvailable).str(); + if (Additional) + return getReplacementForAdditional(FunctionName, AnnexKIsAvailable) + .str(); - llvm_unreachable("Unhandled match category"); - }(); - if (!ReplacementFunctionName) - return; + llvm_unreachable("Unhandled match category"); + }(); + if (!ReplacementFunctionName) + return; + + Replacement = *ReplacementFunctionName; + Reason = getRationaleFor(FunctionName).str(); + } - diag(SourceExpr->getExprLoc(), "function %0 %1; '%2' should be used instead") - << FuncDecl << getRationaleFor(FunctionName) - << ReplacementFunctionName.value() << SourceExpr->getSourceRange(); + emitDiag(*this, SourceExpr, FuncDecl, Replacement, Reason); } void UnsafeFunctionsCheck::registerPPCallbacks( From 3473ceaadc2c596281ad2f215d0240a200689e63 Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Tue, 12 May 2026 06:28:13 -0700 
Subject: [PATCH 445/538] [clang][NFC] Mark CWG743 and CWG950 as implemented and add tests (#197015) [CWG743](https://wg21.link/cwg743) allows using `decltype` in a *nested-name-specifier*, i.e.: `decltype(foo)::type`. [CWG950](https://wg21.link/cwg950) allows using it as a *base-specifier*, i.e.: `struct B : decltype(foo)`. Both these DRs were resolved by [N3049](https://wg21.link/n3049). Clang supports both of these since 3.1: https://godbolt.org/z/aohPs5zaa --- clang/test/CXX/drs/cwg7xx.cpp | 13 +++++++++++++ clang/test/CXX/drs/cwg9xx.cpp | 10 ++++++++++ clang/www/cxx_dr_status.html | 4 ++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/clang/test/CXX/drs/cwg7xx.cpp b/clang/test/CXX/drs/cwg7xx.cpp index 39fcc53cae2ea..09869128f6054 100644 --- a/clang/test/CXX/drs/cwg7xx.cpp +++ b/clang/test/CXX/drs/cwg7xx.cpp @@ -335,6 +335,19 @@ namespace cwg727 { // cwg727: partial Collision c; // #cwg727-Collision-int-int } // namespace cwg727 +namespace cwg743 { // cwg743: 3.1 +#if __cplusplus >= 201103L +struct S { + using T = int; +}; + +decltype(S())::T i; + +template +using foo = typename decltype(T())::I; +#endif +} // namespace cwg743 + namespace cwg777 { // cwg777: 3.7 #if __cplusplus >= 201103L template diff --git a/clang/test/CXX/drs/cwg9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp index d7385ab6db859..b5bcffc840725 100644 --- a/clang/test/CXX/drs/cwg9xx.cpp +++ b/clang/test/CXX/drs/cwg9xx.cpp @@ -51,6 +51,16 @@ namespace cwg948 { // cwg948: 3.7 #endif } // namespace cwg948 +namespace cwg950 { // cwg950: 3.1 +#if __cplusplus >= 201103L +struct A {}; +struct B : decltype(A()) {}; + +template +struct C : decltype(T()) {}; +#endif +} // namespace cwg950 + namespace cwg952 { // cwg952: 2.8 namespace example1 { struct A { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 5fb86ecb85393..842c91a346c3c 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -5230,7 +5230,7 @@

C++ defect report implementation status

[expr.prim.general] CD2 Use of decltype in a nested-name-specifier - Unknown + Clang 3.1 744 @@ -6427,7 +6427,7 @@

C++ defect report implementation status

[dcl.type.simple] CD2 Use of decltype as a class-name - Unknown + Clang 3.1 951 From 5f86a378a3a010897e96a9b390418cacd45dca83 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Tue, 12 May 2026 14:42:35 +0100 Subject: [PATCH 446/538] [AMDGPU] Handle high element extraction with G_UNMERGE_VALUES (#188287) This allows to detect when G_UNMERGE_VALUES extracts a hi16 element and select `s_cvt_hi_f32_f16` removing need for a shift. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 12 + llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll | 228 ++++++++---------- llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll | 204 +++++++--------- 3 files changed, 204 insertions(+), 240 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c157a88694c39..8568bc94361b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2837,6 +2837,18 @@ static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) { static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out) { + // When unmerging a register that is composed of 2 x 16-bit values allow to + // use an extract hi instruction for the upper 16 bits. We only need to check + // the size of `In` as all defs are guaranteed to be the same type for + // GUnmerge. 
+ if (auto *Unmerge = dyn_cast(MRI.getVRegDef(In))) { + if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In && + MRI.getType(In).getSizeInBits() == 16) { + Out = Unmerge->getSourceReg(); + return true; + } + } + Register Trunc; if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc)))) return false; diff --git a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll index 8674f748a3f73..d4b5825913503 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll @@ -3346,31 +3346,28 @@ define <4 x i1> @test_s_signed_v4f16_v4i1(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 -; GFX12-GI-NEXT: s_min_i32 s0, s0, 0 +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_i32 s2, s2, 0 -; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 +; GFX12-GI-NEXT: s_min_i32 s0, s0, 0 ; GFX12-GI-NEXT: s_min_i32 s3, s3, 0 -; GFX12-GI-NEXT: s_max_i32 s0, s0, -1 +; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_max_i32 s2, s2, -1 -; GFX12-GI-NEXT: s_max_i32 s1, s1, -1 +; GFX12-GI-NEXT: s_max_i32 s0, s0, -1 ; GFX12-GI-NEXT: s_max_i32 s3, s3, -1 +; 
GFX12-GI-NEXT: s_max_i32 s1, s1, -1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s1 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -3606,31 +3603,28 @@ define <4 x i8> @test_s_signed_v4f16_v4i8(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 -; GFX12-GI-NEXT: s_min_i32 s0, s0, 0x7f +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_i32 s2, s2, 0x7f -; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f +; GFX12-GI-NEXT: s_min_i32 s0, s0, 0x7f ; GFX12-GI-NEXT: s_min_i32 s3, s3, 0x7f -; GFX12-GI-NEXT: s_max_i32 s0, s0, 0xffffff80 +; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_max_i32 s2, s2, 0xffffff80 -; GFX12-GI-NEXT: s_max_i32 s1, s1, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s0, s0, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s3, s3, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s1, s1, 0xffffff80 ; 
GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s1 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i8> @llvm.fptosi.sat.v4f16.v4i8(<4 x half> %f) ret <4 x i8> %x @@ -3901,23 +3895,20 @@ define <4 x i64> @test_s_signed_v4f16_v4i64(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i64> @llvm.fptosi.sat.v4f16.v4i64(<4 x half> 
%f) ret <4 x i64> %x @@ -5490,51 +5481,46 @@ define <8 x i1> @test_s_signed_v8f16_v8i1(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 -; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 -; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s2 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s3 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_i32_f32 s4, s4 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_i32_f32 s5, s5 -; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_cvt_i32_f32 s6, s6 -; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 +; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_i32_f32 s7, s7 -; GFX12-GI-NEXT: s_min_i32 s0, s0, 0 +; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_i32 s4, s4, 0 -; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 +; GFX12-GI-NEXT: s_min_i32 s0, s0, 0 ; GFX12-GI-NEXT: s_min_i32 s5, s5, 0 -; GFX12-GI-NEXT: s_min_i32 s2, s2, 0 +; GFX12-GI-NEXT: s_min_i32 s1, s1, 0 ; GFX12-GI-NEXT: s_min_i32 s6, s6, 0 -; GFX12-GI-NEXT: s_min_i32 s3, s3, 0 +; GFX12-GI-NEXT: s_min_i32 s2, s2, 0 ; 
GFX12-GI-NEXT: s_min_i32 s7, s7, 0 -; GFX12-GI-NEXT: s_max_i32 s0, s0, -1 +; GFX12-GI-NEXT: s_min_i32 s3, s3, 0 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_max_i32 s4, s4, -1 -; GFX12-GI-NEXT: s_max_i32 s1, s1, -1 +; GFX12-GI-NEXT: s_max_i32 s0, s0, -1 ; GFX12-GI-NEXT: s_max_i32 s5, s5, -1 -; GFX12-GI-NEXT: s_max_i32 s2, s2, -1 +; GFX12-GI-NEXT: s_max_i32 s1, s1, -1 ; GFX12-GI-NEXT: s_max_i32 s6, s6, -1 -; GFX12-GI-NEXT: s_max_i32 s3, s3, -1 +; GFX12-GI-NEXT: s_max_i32 s2, s2, -1 ; GFX12-GI-NEXT: s_max_i32 s7, s7, -1 +; GFX12-GI-NEXT: s_max_i32 s3, s3, -1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f) ret <8 x i1> %x @@ -5939,51 +5925,46 @@ define <8 x i8> @test_s_signed_v8f16_v8i8(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 -; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 -; GFX12-GI-NEXT: 
s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s2 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s3 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_i32_f32 s4, s4 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_i32_f32 s5, s5 -; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_cvt_i32_f32 s6, s6 -; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 +; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_i32_f32 s7, s7 -; GFX12-GI-NEXT: s_min_i32 s0, s0, 0x7f +; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_i32 s4, s4, 0x7f -; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f +; GFX12-GI-NEXT: s_min_i32 s0, s0, 0x7f ; GFX12-GI-NEXT: s_min_i32 s5, s5, 0x7f -; GFX12-GI-NEXT: s_min_i32 s2, s2, 0x7f +; GFX12-GI-NEXT: s_min_i32 s1, s1, 0x7f ; GFX12-GI-NEXT: s_min_i32 s6, s6, 0x7f -; GFX12-GI-NEXT: s_min_i32 s3, s3, 0x7f +; GFX12-GI-NEXT: s_min_i32 s2, s2, 0x7f ; GFX12-GI-NEXT: s_min_i32 s7, s7, 0x7f -; GFX12-GI-NEXT: s_max_i32 s0, s0, 0xffffff80 +; GFX12-GI-NEXT: s_min_i32 s3, s3, 0x7f ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_max_i32 s4, s4, 0xffffff80 -; GFX12-GI-NEXT: s_max_i32 s1, s1, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s0, s0, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s5, s5, 0xffffff80 -; GFX12-GI-NEXT: s_max_i32 s2, s2, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s1, s1, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s6, s6, 0xffffff80 -; GFX12-GI-NEXT: s_max_i32 s3, s3, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s2, s2, 0xffffff80 ; GFX12-GI-NEXT: s_max_i32 s7, s7, 0xffffff80 +; GFX12-GI-NEXT: s_max_i32 s3, s3, 0xffffff80 ; GFX12-GI-NEXT: s_wait_alu 
depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i8> @llvm.fptosi.sat.v8f16.v8i8(<8 x half> %f) ret <8 x i8> %x @@ -6419,37 +6400,32 @@ define <8 x i64> @test_s_signed_v8f16_v8i64(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 -; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 -; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s2 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s3 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_i32_f32 s4, s4 -; GFX12-GI-NEXT: s_cvt_i32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_i32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_i32_f32 s5, s5 -; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 +; GFX12-GI-NEXT: 
s_cvt_i32_f32 s1, s1 ; GFX12-GI-NEXT: s_cvt_i32_f32 s6, s6 -; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 +; GFX12-GI-NEXT: s_cvt_i32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_i32_f32 s7, s7 -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: s_cvt_i32_f32 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v12, s7 :: v_dual_mov_b32 v13, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v15, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f) ret <8 x i64> %x diff --git a/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll index 613871827584a..cadd39097b55e 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll @@ -3117,26 +3117,23 @@ define <4 x i1> @test_s_unsigned_v4f16_v4i1(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; 
GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 -; GFX12-GI-NEXT: s_min_u32 s0, s0, 1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_u32 s2, s2, 1 -; GFX12-GI-NEXT: s_min_u32 s1, s1, 1 +; GFX12-GI-NEXT: s_min_u32 s0, s0, 1 ; GFX12-GI-NEXT: s_min_u32 s3, s3, 1 +; GFX12-GI-NEXT: s_min_u32 s1, s1, 1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s1 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i1> @llvm.fptoui.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -3340,26 +3337,23 @@ define <4 x i8> @test_s_unsigned_v4f16_v4i8(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 ; GFX12-GI-NEXT: 
s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 -; GFX12-GI-NEXT: s_min_u32 s0, s0, 0xff +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_u32 s2, s2, 0xff -; GFX12-GI-NEXT: s_min_u32 s1, s1, 0xff +; GFX12-GI-NEXT: s_min_u32 s0, s0, 0xff ; GFX12-GI-NEXT: s_min_u32 s3, s3, 0xff +; GFX12-GI-NEXT: s_min_u32 s1, s1, 0xff ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s2 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s3 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s1 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i8> @llvm.fptoui.sat.v4f16.v4i8(<4 x half> %f) ret <4 x i8> %x @@ -3615,23 +3609,20 @@ define <4 x i64> @test_s_unsigned_v4f16_v4i64(<4 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s2, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s3, s1, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 
v2, s2 :: v_dual_mov_b32 v3, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i64> @llvm.fptoui.sat.v4f16.v4i64(<4 x half> %f) ret <4 x i64> %x @@ -5135,42 +5126,37 @@ define <8 x i1> @test_s_unsigned_v8f16_v8i1(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 -; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 -; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s2 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s3 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_u32_f32 s4, s4 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_u32_f32 s5, s5 -; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_cvt_u32_f32 s6, s6 -; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 +; GFX12-GI-NEXT: 
s_cvt_u32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_u32_f32 s7, s7 -; GFX12-GI-NEXT: s_min_u32 s0, s0, 1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_u32 s4, s4, 1 -; GFX12-GI-NEXT: s_min_u32 s1, s1, 1 +; GFX12-GI-NEXT: s_min_u32 s0, s0, 1 ; GFX12-GI-NEXT: s_min_u32 s5, s5, 1 -; GFX12-GI-NEXT: s_min_u32 s2, s2, 1 +; GFX12-GI-NEXT: s_min_u32 s1, s1, 1 ; GFX12-GI-NEXT: s_min_u32 s6, s6, 1 -; GFX12-GI-NEXT: s_min_u32 s3, s3, 1 +; GFX12-GI-NEXT: s_min_u32 s2, s2, 1 ; GFX12-GI-NEXT: s_min_u32 s7, s7, 1 +; GFX12-GI-NEXT: s_min_u32 s3, s3, 1 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i1> @llvm.fptoui.sat.v8f16.v8i1(<8 x half> %f) ret <8 x i1> %x @@ -5528,42 +5514,37 @@ define <8 x i8> @test_s_unsigned_v8f16_v8i8(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 -; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 -; 
GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s2 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s3 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_u32_f32 s4, s4 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_u32_f32 s5, s5 -; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_cvt_u32_f32 s6, s6 -; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 +; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_u32_f32 s7, s7 -; GFX12-GI-NEXT: s_min_u32 s0, s0, 0xff +; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_min_u32 s4, s4, 0xff -; GFX12-GI-NEXT: s_min_u32 s1, s1, 0xff +; GFX12-GI-NEXT: s_min_u32 s0, s0, 0xff ; GFX12-GI-NEXT: s_min_u32 s5, s5, 0xff -; GFX12-GI-NEXT: s_min_u32 s2, s2, 0xff +; GFX12-GI-NEXT: s_min_u32 s1, s1, 0xff ; GFX12-GI-NEXT: s_min_u32 s6, s6, 0xff -; GFX12-GI-NEXT: s_min_u32 s3, s3, 0xff +; GFX12-GI-NEXT: s_min_u32 s2, s2, 0xff ; GFX12-GI-NEXT: s_min_u32 s7, s7, 0xff +; GFX12-GI-NEXT: s_min_u32 s3, s3, 0xff ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s5 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s6 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s7 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s2 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i8> 
@llvm.fptoui.sat.v8f16.v8i8(<8 x half> %f) ret <8 x i8> %x @@ -5974,37 +5955,32 @@ define <8 x i64> @test_s_unsigned_v8f16_v8i64(<8 x half> inreg %f) { ; GFX12-GI-NEXT: s_wait_samplecnt 0x0 ; GFX12-GI-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GI-NEXT: s_wait_kmcnt 0x0 -; GFX12-GI-NEXT: s_lshr_b32 s4, s0, 16 -; GFX12-GI-NEXT: s_lshr_b32 s5, s1, 16 -; GFX12-GI-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-GI-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-GI-NEXT: s_cvt_f32_f16 s0, s0 -; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s4 -; GFX12-GI-NEXT: s_cvt_f32_f16 s1, s1 -; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s5 -; GFX12-GI-NEXT: s_cvt_f32_f16 s2, s2 -; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s6 -; GFX12-GI-NEXT: s_cvt_f32_f16 s3, s3 -; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s7 -; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s4, s0 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s0, s0 +; GFX12-GI-NEXT: s_cvt_f32_f16 s5, s1 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s1, s1 +; GFX12-GI-NEXT: s_cvt_f32_f16 s6, s2 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s2, s2 +; GFX12-GI-NEXT: s_cvt_f32_f16 s7, s3 +; GFX12-GI-NEXT: s_cvt_hi_f32_f16 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GI-NEXT: s_cvt_u32_f32 s4, s4 -; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 +; GFX12-GI-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-GI-NEXT: s_cvt_u32_f32 s5, s5 -; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 +; GFX12-GI-NEXT: s_cvt_u32_f32 s1, s1 ; GFX12-GI-NEXT: s_cvt_u32_f32 s6, s6 -; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 +; GFX12-GI-NEXT: s_cvt_u32_f32 s2, s2 ; GFX12-GI-NEXT: s_cvt_u32_f32 s7, s7 -; GFX12-GI-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: s_cvt_u32_f32 s3, s3 ; GFX12-GI-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GI-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, 0 -; GFX12-GI-NEXT: 
v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, 0 -; GFX12-GI-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v12, s7 :: v_dual_mov_b32 v13, 0 +; GFX12-GI-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v15, 0 ; GFX12-GI-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i64> @llvm.fptoui.sat.v8f16.v8i64(<8 x half> %f) ret <8 x i64> %x From 1e44c0b7bf6ed2aecf09b50c8710a57c32006715 Mon Sep 17 00:00:00 2001 From: Yihan Wang Date: Tue, 12 May 2026 21:43:33 +0800 Subject: [PATCH 447/538] [NFC][clang] Mark P3868R1 as implemented (#197181) This feature was already implemented by https://github.com/llvm/llvm-project/issues/153641. --------- Signed-off-by: yronglin --- clang/www/cxx_status.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 8b7290a2a60cc..315fa54531a02 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -336,7 +336,7 @@

C++2c implementation status

Allow #line before module declarations P3868R1 - No + Clang 21 From 3bdbf49065ee3b97606d22ba69399a1548011cb0 Mon Sep 17 00:00:00 2001 From: Uyiosa Iyekekpolor <96444432+uyoyo0@users.noreply.github.com> Date: Tue, 12 May 2026 09:44:08 -0400 Subject: [PATCH 448/538] [z/OS] Add z/OS archive reading support (#187110) Add support for reading `z/OS` archives, which use EBCDIC-encoded header fields and an EBCDIC magic string. The `z/OS` archive format shares the same structural layout as traditional Unix archives but all text fields (member names, timestamps, permissions, and symbol names) are in EBCDIC. This patch adds: - `K_ZOS` archive kind - `ZOSArchiveMemberHeader`: converts EBCDIC header fields to ASCII on read - `ZOSArchive`: parses the __.SYMDEF symbol table, converting EBCDIC symbol names to ASCII - Updates to symbol table traversal for `K_ZOS`, which uses big-endian 4-byte offsets paired with 4-byte attribute words per symbol This is part 2 of a patch series adding `z/OS` archive support to LLVM. Part 1: #186854 --- llvm/include/llvm/Object/Archive.h | 63 ++- llvm/lib/Object/Archive.cpp | 222 +++++++++- llvm/lib/Object/ArchiveWriter.cpp | 3 + .../Object/Inputs/generate_zos_archive.py | 400 ++++++++++++++++++ llvm/test/Object/zos-archive-read.test | 69 +++ 5 files changed, 754 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Object/Inputs/generate_zos_archive.py create mode 100644 llvm/test/Object/zos-archive-read.test diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h index c97018d3231d5..cdb7d32311e4a 100644 --- a/llvm/include/llvm/Object/Archive.h +++ b/llvm/include/llvm/Object/Archive.h @@ -158,6 +158,46 @@ class LLVM_ABI BigArchiveMemberHeader Expected isThin() const override { return false; } }; +// Define file member header of z/OS archive. 
+// The fixed part of the member header (in EBCDIC) is: +// struct ar_hdr { +// char ar_name[16]; /* space-padded member name */ +// char ar_date[12]; /* date (decimal) */ +// char ar_uid[6]; /* user id (decimal) */ +// char ar_gid[6]; /* group id (decimal) */ +// char ar_mode[8]; /* access mode (octal) */ +// char ar_size[10]; /* length in bytes (decimal) */ +// char ar_fmag[2]; /* contains backtick (X'79'), followed by new line +// (X'15') */ +// }; +class ZOSArchiveMemberHeader : public ArchiveMemberHeader { +public: + ZOSArchiveMemberHeader(Archive const *Parent, const char *RawHeaderPtr, + uint64_t Size, Error *Err); + std::unique_ptr clone() const override { + return std::make_unique(*this); + } + + // Converted EBCDIC to ASCII header string fields. + std::string RawMemberName; + std::string MemberName; + std::string LastModified; + std::string UID; + std::string GID; + std::string AccessMode; + + void setMemberHeaderStrings(Error *Err, uint64_t Size); + + Expected getRawName() const override; + Expected getName(uint64_t Size) const override; + StringRef getRawAccessMode() const override; + StringRef getRawLastModified() const override; + StringRef getRawUID() const override; + StringRef getRawGID() const override; + Expected getSize() const override; + Expected isThin() const override { return false; } +}; + class LLVM_ABI Archive : public Binary { virtual void anchor(); @@ -343,7 +383,16 @@ class LLVM_ABI Archive : public Binary { /// Size field is 10 decimal digits long static const uint64_t MaxMemberSize = 9999999999; - enum Kind { K_GNU, K_GNU64, K_BSD, K_DARWIN, K_DARWIN64, K_COFF, K_AIXBIG }; + enum Kind { + K_GNU, + K_GNU64, + K_BSD, + K_DARWIN, + K_DARWIN64, + K_COFF, + K_AIXBIG, + K_ZOS + }; Kind kind() const { return (Kind)Format; } bool isThin() const { return IsThin; } @@ -434,6 +483,18 @@ class BigArchive : public Archive { bool has64BitGlobalSymtab() { return Has64BitGlobalSymtab; } }; +class ZOSArchive : public Archive { +public: + // 
Fixed-Length header. + struct FixLenHdr { + char Magic[sizeof(ZOSArchiveMagic) - 1]; ///< ZOS archive magic string. + }; + + ZOSArchive(MemoryBufferRef Source, Error &Err); + +private: + std::string SymbolTableBuf; // __.SYMDEF strings converted to ASCII. +}; } // end namespace object } // end namespace llvm diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index 17c926e621f36..d95f9103747aa 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -17,6 +17,7 @@ #include "llvm/Object/Binary.h" #include "llvm/Object/Error.h" #include "llvm/Support/Chrono.h" +#include "llvm/Support/ConvertEBCDIC.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/Error.h" @@ -104,7 +105,14 @@ ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent, *Err = createMemberHeaderParseError(this, RawHeaderPtr, Size); return; } - if (ArMemHdr->Terminator[0] != '`' || ArMemHdr->Terminator[1] != '\n') { + // '\x79\x15' is the EBCDIC equivalent of '`\n' for the z/OS archive + // terminator. + bool ValidTerminator = + Parent->kind() == Archive::K_ZOS + ? 
(ArMemHdr->Terminator[0] == '\x79' && + ArMemHdr->Terminator[1] == '\x15') + : (ArMemHdr->Terminator[0] == '`' && ArMemHdr->Terminator[1] == '\n'); + if (!ValidTerminator) { if (Err) { std::string Buf; raw_string_ostream OS(Buf); @@ -118,8 +126,9 @@ ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent, consumeError(NameOrErr.takeError()); uint64_t Offset = RawHeaderPtr - Parent->getData().data(); *Err = malformedError(Msg + "at offset " + Twine(Offset)); - } else + } else { *Err = malformedError(Msg + "for " + NameOrErr.get()); + } } return; } @@ -368,6 +377,114 @@ Expected BigArchiveMemberHeader::getSize() const { return *SizeOrErr + alignTo(*NameLenOrErr, 2); } +template +std::string ebcdicFieldToASCII(const char (&Field)[N]) { + SmallString<64> Dst; + StringRef Src = StringRef(Field, N); + ConverterEBCDIC::convertToUTF8(Src, Dst); + return Dst.str().rtrim(" ").str(); +} + +ZOSArchiveMemberHeader::ZOSArchiveMemberHeader(const Archive *Parent, + const char *RawHeaderPtr, + uint64_t Size, Error *Err) + : ArchiveMemberHeader(Parent, RawHeaderPtr, Size, Err) { + ErrorAsOutParameter ErrAsOutParam(Err); + setMemberHeaderStrings(Err, Size); +} + +Expected ZOSArchiveMemberHeader::getSize() const { + return getArchiveMemberDecField("size", ebcdicFieldToASCII(ArMemHdr->Size), + Parent, this); +} + +Expected ZOSArchiveMemberHeader::getRawName() const { + return StringRef(RawMemberName); +} + +Expected ZOSArchiveMemberHeader::getName(uint64_t /*Size*/) const { + return StringRef(MemberName); +} + +StringRef ZOSArchiveMemberHeader::getRawAccessMode() const { + return StringRef(AccessMode); +} + +StringRef ZOSArchiveMemberHeader::getRawLastModified() const { + return StringRef(LastModified); +} + +StringRef ZOSArchiveMemberHeader::getRawUID() const { return StringRef(UID); } + +StringRef ZOSArchiveMemberHeader::getRawGID() const { return StringRef(GID); } + +void ZOSArchiveMemberHeader::setMemberHeaderStrings(Error *Err, uint64_t Size) { + uint64_t Offset = + 
reinterpret_cast(ArMemHdr) - Parent->getData().data(); + + // Set RawMemberName + RawMemberName = ebcdicFieldToASCII(ArMemHdr->Name); + if (RawMemberName.empty() || RawMemberName[0] == ' ') { + *Err = malformedError("name contains a leading space for archive member " + "header at offset " + + Twine(Offset)); + return; + } + + // Set MemberName. + if (StringRef(RawMemberName).starts_with("#1/")) { + Expected NameOrErr = ArchiveMemberHeader::getName(Size); + if (!NameOrErr) { + *Err = NameOrErr.takeError(); + return; + } + StringRef Name = NameOrErr.get(); + SmallString<64> ConvertedName; + ConverterEBCDIC::convertToUTF8(Name, ConvertedName); + MemberName = std::string(ConvertedName); + } else { + MemberName = RawMemberName; + } + + // LastModified + LastModified = ebcdicFieldToASCII(ArMemHdr->LastModified); + if (LastModified.empty()) { + *Err = + malformedError("LastModified field is empty or contains only spaces in " + "archive member header at offset " + + Twine(Offset)); + return; + } + + // UID + UID = ebcdicFieldToASCII(ArMemHdr->UID); + if (UID.empty()) { + *Err = malformedError("UID field is empty or contains only spaces in " + "archive member header at offset " + + Twine(Offset)); + return; + } + + // GID + GID = ebcdicFieldToASCII(ArMemHdr->GID); + if (GID.empty()) { + *Err = malformedError("GID field is empty or contains only spaces in " + "archive member header at offset " + + Twine(Offset)); + return; + } + + // AccessMode + AccessMode = ebcdicFieldToASCII(ArMemHdr->AccessMode); + if (AccessMode.empty()) { + *Err = + malformedError("AccessMode field is empty or contains only spaces in " + "archive member header at offset " + + Twine(Offset)); + return; + } +} + Expected BigArchiveMemberHeader::getRawNameSize() const { return getArchiveMemberDecField( "NameLen", getFieldRawString(ArMemHdr->NameLen), Parent, this); @@ -668,6 +785,8 @@ Expected> Archive::create(MemoryBufferRef Source) { if (Buffer.starts_with(BigArchiveMagic)) Ret = 
std::make_unique(Source, Err); + else if (Buffer.starts_with(ZOSArchiveMagic)) + Ret = std::make_unique(Source, Err); else Ret = std::make_unique(Source, Err); @@ -680,6 +799,10 @@ std::unique_ptr Archive::createArchiveMemberHeader(const char *RawHeaderPtr, uint64_t Size, Error *Err) const { ErrorAsOutParameter ErrAsOutParam(Err); + + if (kind() == K_ZOS) + return std::make_unique(this, RawHeaderPtr, Size, + Err); if (kind() != K_AIXBIG) return std::make_unique(this, RawHeaderPtr, Size, Err); return std::make_unique(this, RawHeaderPtr, Size, @@ -714,6 +837,10 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) Format = K_AIXBIG; IsThin = false; return; + } else if (Buffer.starts_with(ZOSArchiveMagic)) { + Format = K_ZOS; + IsThin = false; + return; } else { Err = make_error("file too small to be an archive", object_error::invalid_file_type); @@ -971,6 +1098,8 @@ object::Archive::Kind Archive::getDefaultKindForTriple(const Triple &T) { return object::Archive::K_AIXBIG; if (T.isOSWindows()) return object::Archive::K_COFF; + if (T.isOSzOS()) + return object::Archive::K_ZOS; return object::Archive::K_GNU; } @@ -1042,6 +1171,12 @@ Expected Archive::Symbol::getMember() const { // the archive of the member that defines the symbol. Which is what // is needed here. Offset = read64le(Offsets + SymbolIndex * 16 + 8); + } else if (Parent->kind() == K_ZOS) { + // Each entry in the offset array is 8 bytes long: + // A 4-byte offset followed by 4 bytes of coded attributes. + // We multiply the SymbolIndex by 8 to reach the correct entry, + // and read the first 4 bytes (the offset). + Offset = read32be(Offsets + SymbolIndex * 8); } else { // Skip offsets. uint32_t MemberCount = read32le(Buf); @@ -1171,6 +1306,15 @@ Archive::symbol_iterator Archive::symbol_begin() const { buf += ran_strx; } else if (kind() == K_AIXBIG) { buf = getStringTable().begin(); + } else if (kind() == K_ZOS) { + // The contents of the z/OS symbol table member are: + // 1. 
The number of symbols, NS (4-byte integer). + // 2. NS pairs of 4-byte integers (offset and attributes). Length is NS*8 + // bytes. + // 3. NS null terminated strings of corresponding symbol names. + // Here we skip parts 1 and 2 to reach the start of the string table. + uint32_t SymbolCount = read32be(buf); + buf += sizeof(uint32_t) + (SymbolCount * (sizeof(uint64_t))); } else { uint32_t member_count = 0; uint32_t symbol_count = 0; @@ -1244,6 +1388,8 @@ uint32_t Archive::getNumberOfSymbols() const { return read32le(buf) / 8; if (kind() == K_DARWIN64) return read64le(buf) / 16; + if (kind() == K_ZOS) + return read32be(buf); uint32_t member_count = 0; member_count = read32le(buf); buf += 4 + (member_count * 4); // Skip offsets. @@ -1448,3 +1594,75 @@ BigArchive::BigArchive(MemoryBufferRef Source, Error &Err) setFirstRegular(*I); Err = Error::success(); } + +ZOSArchive::ZOSArchive(MemoryBufferRef Source, Error &Err) + : Archive(Source, Err) { + ErrorAsOutParameter ErrAsOutParam(&Err); + + // Get the special members. + child_iterator I = child_begin(Err, false); + if (Err) + return; + child_iterator E = child_end(); + + // See if this is a valid empty archive and if so return. + if (I == E) { + Err = Error::success(); + return; + } + const Child *C = &*I; + + Expected NameOrErr = C->getRawName(); + if (!NameOrErr) { + Err = NameOrErr.takeError(); + return; + } + StringRef Name = NameOrErr.get(); + + if (Name == "__.SYMDEF") { + // Copy symbol table converting embedded EBCDIC names to ASCII. + // getBuffer() cannot fail here because the Child constructor and + // getNext() already validate that the member's size fits within + // the archive. 
+ StringRef EbcdicSymbolTable = cantFail(C->getBuffer()); + if (EbcdicSymbolTable.size() < sizeof(uint32_t)) { + Err = malformedError( + "z/OS archive symbol table is too small to read the symbol count, " + "symbol table size is " + + Twine(EbcdicSymbolTable.size())); + return; + } + uint64_t EbcdicSymbolCount = read32be(EbcdicSymbolTable.data()); + uint64_t OffsetToEbcdicNames = + sizeof(uint32_t) + (EbcdicSymbolCount * (sizeof(uint64_t))); + if (OffsetToEbcdicNames > EbcdicSymbolTable.size()) { + Err = malformedError("z/OS archive symbol table names offset " + + Twine(OffsetToEbcdicNames) + + " exceeds symbol table size " + + Twine(EbcdicSymbolTable.size())); + return; + } + uint64_t EbcdicNamesSize = EbcdicSymbolTable.size() - OffsetToEbcdicNames; + const char *EbcdicNamesPtr = EbcdicSymbolTable.data() + OffsetToEbcdicNames; + StringRef EbcdicNames(EbcdicNamesPtr, EbcdicNamesSize); + + SmallString<64> Dst; + ConverterEBCDIC::convertToUTF8(EbcdicNames, Dst); + SymbolTableBuf.append(EbcdicSymbolTable.data(), OffsetToEbcdicNames); + SymbolTableBuf.append(Dst.str()); + SymbolTable = StringRef(SymbolTableBuf.data(), SymbolTableBuf.size()); + + ++I; + if (Err) + return; + C = &*I; + + setFirstRegular(*C); + Err = Error::success(); + return; + } + + setFirstRegular(*C); + Err = Error::success(); + return; +} diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 6d2bbca179836..4610fb4303274 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -192,6 +192,7 @@ static bool isBSDLike(object::Archive::Kind Kind) { case object::Archive::K_GNU64: case object::Archive::K_AIXBIG: case object::Archive::K_COFF: + case object::Archive::K_ZOS: return false; case object::Archive::K_BSD: case object::Archive::K_DARWIN: @@ -287,6 +288,7 @@ static bool is64BitKind(object::Archive::Kind Kind) { case object::Archive::K_BSD: case object::Archive::K_DARWIN: case object::Archive::K_COFF: + case object::Archive::K_ZOS: 
return false; case object::Archive::K_AIXBIG: case object::Archive::K_DARWIN64: @@ -517,6 +519,7 @@ getSymbolicFile(MemoryBufferRef Buf, LLVMContext &Context, case object::Archive::K_COFF: case object::Archive::K_DARWIN: case object::Archive::K_DARWIN64: + case object::Archive::K_ZOS: return ObjOrErr.takeError(); } } diff --git a/llvm/test/Object/Inputs/generate_zos_archive.py b/llvm/test/Object/Inputs/generate_zos_archive.py new file mode 100644 index 0000000000000..5489612f2633e --- /dev/null +++ b/llvm/test/Object/Inputs/generate_zos_archive.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python3 +"""Generate z/OS archive files + +z/OS archives use EBCDIC encoding for headers, magic bytes, and symbol names. +This script generates archives in place to avoid reliance on canned binaries. + +Usage examples: + # Valid archive with one member and symbol table: + %python %S/Inputs/generate_zos_archive.py --output %t.a \ + --symtab "foo:0" --member foo.o:%S/Inputs/foo.o + + # Empty archive: + %python %S/Inputs/generate_zos_archive.py --output %t.a --empty + + # Malformed member header: bad terminator + %python %S/Inputs/generate_zos_archive.py --output %t.a \ + --member foo.o --bad-terminator + + # Malformed __.SYMDEF header: bad terminator + %python %S/Inputs/generate_zos_archive.py --output %t.a \ + --member foo.o --symtab foo:0 --malform-symtab-hdr bad-terminator + + # Member with explicit hex content: + %python %S/Inputs/generate_zos_archive.py --output %t.a \ + --member foo.o:hex:deadbeef +""" + +import argparse +import struct +import sys +import os + +# EBCDIC / ASCII conversion table. 
+# fmt: off +ASCII_TO_EBCDIC_TABLE = ( + 0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F, + 0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F, + 0x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61, + 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F, + 0x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6, + 0xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D, + 0x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96, + 0x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07, +) +# fmt: on + + +def ascii_to_ebcdic(s): + """Convert an ASCII string/bytes to EBCDIC (IBM-1047).""" + if isinstance(s, str): + s = s.encode("ascii") + return bytes(ASCII_TO_EBCDIC_TABLE[b] for b in s) + + +def ebcdic_pad(s, width, pad_char=" "): + """Convert ASCII string to EBCDIC, right-padded with EBCDIC spaces.""" + ascii_padded = s.ljust(width, pad_char) + return ascii_to_ebcdic(ascii_padded) + + +# z/OS archive magic: "!\n" in EBCDIC. +ZOS_MAGIC = b"\x5a\x4c\x81\x99\x83\x88\x6e\x15" + +# Terminator: "`\n" in EBCDIC. +ZOS_TERMINATOR = b"\x79\x15" + +# EBCDIC newline for padding. +EBCDIC_NEWLINE = b"\x15" + + +def make_member_header( + name, + modtime, + uid, + gid, + mode, + size, + bad_terminator=False, + empty_name=False, + empty_uid=False, + empty_gid=False, + empty_modtime=False, + empty_mode=False, +): + """Build a 60-byte z/OS archive member header. + + Fields (all EBCDIC, space-padded): + ar_name: 16 bytes + ar_date: 12 bytes + ar_uid: 6 bytes + ar_gid: 6 bytes + ar_mode: 8 bytes + ar_size: 10 bytes + ar_fmag: 2 bytes (terminator) + Total: 60 bytes + """ + # Handle long names. 
+ long_name_ext = b"" + if len(name) > 16: + name_ebcdic = ascii_to_ebcdic(name) + ext_len = len(name_ebcdic) + display_name = "#1/%d" % ext_len + long_name_ext = name_ebcdic + # The size field includes the extended name length. + size = size + ext_len + else: + display_name = name + + if empty_name: + hdr = ebcdic_pad(" ", 16) + else: + hdr = ebcdic_pad(display_name, 16) + + if empty_modtime: + hdr += ebcdic_pad("", 12) + else: + hdr += ebcdic_pad(str(modtime), 12) + + if empty_uid: + hdr += ebcdic_pad("", 6) + else: + hdr += ebcdic_pad(str(uid), 6) + + if empty_gid: + hdr += ebcdic_pad("", 6) + else: + hdr += ebcdic_pad(str(gid), 6) + + if empty_mode: + hdr += ebcdic_pad("", 8) + else: + hdr += ebcdic_pad(str(mode), 8) + + hdr += ebcdic_pad(str(size), 10) + + if bad_terminator: + hdr += b"\x00\x00" + else: + hdr += ZOS_TERMINATOR + + assert len(hdr) == 60, f"Header is {len(hdr)} bytes, expected 60" + return hdr + long_name_ext + + +def make_symtab(symbols, member_offsets, truncated=False, bad_count=False): + """Build a __.SYMDEF symbol table body. + + symbols: list of (symbol_name_ascii, member_index, attributes) + member_offsets: list of offsets for each member (indexed by member_index) + + Format: + 4 bytes: number of symbols (big-endian) + For each symbol: 4 bytes offset + 4 bytes attributes (big-endian) + Null-terminated symbol names in EBCDIC + """ + num_syms = len(symbols) + if bad_count: + # Write a count that exceeds the buffer. + body = struct.pack(">I", 0xFFFFFFFF) + else: + body = struct.pack(">I", num_syms) + + if truncated: + # Return just the count, truncated before offset table. + return body[:2] + + for sym_name, mem_idx, attrs in symbols: + offset = member_offsets[mem_idx] + body += struct.pack(">II", offset, attrs) + + for sym_name, mem_idx, attrs in symbols: + body += ascii_to_ebcdic(sym_name) + b"\x00" + + return body + + +def parse_member_data(raw): + """Parse the data portion of a --member argument. 
+ + Supports three forms: + /path/to/file - read file contents + hex: - decode hex bytes + - encode as raw ASCII bytes + """ + if os.path.isfile(raw): + with open(raw, "rb") as f: + return f.read() + if raw.startswith("hex:"): + return bytes.fromhex(raw[4:]) + return raw.encode("ascii") + + +# Valid malformation names for --malform-symtab-hdr, mapped to +# make_member_header keyword arguments. +_SYMTAB_HDR_MALFORMATIONS = { + "bad-terminator": "bad_terminator", + "empty-name": "empty_name", + "empty-uid": "empty_uid", + "empty-gid": "empty_gid", + "empty-modtime": "empty_modtime", + "empty-mode": "empty_mode", +} + + +def build_archive(args): + """Build the complete archive bytes.""" + output = bytearray() + output += ZOS_MAGIC + + if args.empty: + return bytes(output) + + # Parse members. + members = [] + if args.member: + for m in args.member: + parts = m.split(":", 1) + name = parts[0] + if len(parts) > 1: + data = parse_member_data(parts[1]) + else: + data = b"\x00" * 16 # Dummy content. + members.append((name, data)) + + # Parse symbols. + symbols = [] + if args.symtab: + for s in args.symtab: + parts = s.split(":") + sym_name = parts[0] + mem_idx = int(parts[1]) if len(parts) > 1 else 0 + attrs = int(parts[2]) if len(parts) > 2 else 0 + symbols.append((sym_name, mem_idx, attrs)) + + # Parse symtab header malformation flags. + symtab_hdr_kwargs = {} + if args.malform_symtab_hdr: + key = args.malform_symtab_hdr + if key not in _SYMTAB_HDR_MALFORMATIONS: + sys.exit( + f"Unknown --malform-symtab-hdr value: {key}. " + f"Valid: {', '.join(_SYMTAB_HDR_MALFORMATIONS.keys())}" + ) + symtab_hdr_kwargs[_SYMTAB_HDR_MALFORMATIONS[key]] = True + + # Phase 1: Compute member offsets. + # Start after magic. + pos = len(ZOS_MAGIC) + + # If we have a symbol table, it comes first. 
+ symtab_body = None + has_symtab = ( + symbols + or args.symtab_no_symbols + or args.symtab_truncated + or args.symtab_bad_count + ) + if has_symtab: + # We need to compute the symtab size, but symtab contains member + # offsets, which depend on symtab size so we do two passes. + + # First pass: compute symtab body with placeholder offsets. + if args.symtab_truncated: + symtab_body = make_symtab([], [], truncated=True) + elif args.symtab_bad_count: + symtab_body = make_symtab([], [], bad_count=True) + elif args.symtab_no_symbols: + symtab_body = struct.pack(">I", 0) # 0 symbols. + else: + placeholder_offsets = [0] * (len(members) + 1) + symtab_body = make_symtab(symbols, placeholder_offsets) + + symtab_hdr_size = 60 # Fixed header for __.SYMDEF. + symtab_total = symtab_hdr_size + len(symtab_body) + # Padding to even boundary. + if symtab_total % 2 != 0: + symtab_total += 1 + pos += symtab_total + + # Compute member offsets. + member_offsets = [] + for name, data in members: + member_offsets.append(pos) + hdr_size = 60 + name_ext = 0 + if len(name) > 16: + name_ext = len(ascii_to_ebcdic(name)) + total = hdr_size + name_ext + len(data) + if total % 2 != 0: + total += 1 + pos += total + + # Second pass: recompute symtab with correct offsets. + if symbols and not args.symtab_truncated and not args.symtab_bad_count: + symtab_body = make_symtab(symbols, member_offsets) + + # Phase 2: Write output. + if symtab_body is not None: + symtab_hdr = make_member_header( + "__.SYMDEF", 0, 0, 0, 0, len(symtab_body), **symtab_hdr_kwargs + ) + output += symtab_hdr + output += symtab_body + # Pad to even boundary. 
+ if len(output) % 2 != 0: + output += EBCDIC_NEWLINE + + for i, (name, data) in enumerate(members): + hdr = make_member_header( + name, + 1234567890, + 0, + 0, + 100644, + len(data), + bad_terminator=args.bad_terminator, + empty_name=args.empty_name, + empty_uid=args.empty_uid, + empty_gid=args.empty_gid, + empty_modtime=args.empty_modtime, + empty_mode=args.empty_mode, + ) + output += hdr + output += data + if len(output) % 2 != 0: + output += EBCDIC_NEWLINE + + return bytes(output) + + +def main(): + parser = argparse.ArgumentParser( + description="Generate z/OS archive files for testing" + ) + parser.add_argument("--output", "-o", required=True, help="Output file path") + parser.add_argument( + "--empty", action="store_true", help="Create an empty archive (magic only)" + ) + parser.add_argument( + "--member", + action="append", + help="Add member as name[:data]. " + "Data can be a file path, hex:DEADBEEF, " + "or a raw ASCII string. If omitted, uses " + "16 zero bytes as dummy content.", + ) + parser.add_argument( + "--symtab", action="append", help="Add symbol: name[:member_index[:attributes]]" + ) + parser.add_argument( + "--symtab-no-symbols", + action="store_true", + help="Add empty symbol table (0 symbols)", + ) + parser.add_argument( + "--symtab-truncated", action="store_true", help="Create truncated symbol table" + ) + parser.add_argument( + "--symtab-bad-count", action="store_true", help="Symbol count exceeds buffer" + ) + parser.add_argument( + "--malform-symtab-hdr", + metavar="MALFORMATION", + help="Apply a malformation to the __.SYMDEF header. 
" + "Valid values: bad-terminator, empty-name, " + "empty-uid, empty-gid, empty-modtime, empty-mode", + ) + parser.add_argument( + "--bad-terminator", + action="store_true", + help="Use invalid terminator on member headers", + ) + parser.add_argument( + "--empty-name", + action="store_true", + help="Empty/space-leading name on member headers", + ) + parser.add_argument( + "--empty-uid", action="store_true", help="Empty UID on member headers" + ) + parser.add_argument( + "--empty-gid", action="store_true", help="Empty GID on member headers" + ) + parser.add_argument( + "--empty-modtime", + action="store_true", + help="Empty LastModified on member headers", + ) + parser.add_argument( + "--empty-mode", action="store_true", help="Empty AccessMode on member headers" + ) + args = parser.parse_args() + + data = build_archive(args) + with open(args.output, "wb") as f: + f.write(data) + + +if __name__ == "__main__": + main() diff --git a/llvm/test/Object/zos-archive-read.test b/llvm/test/Object/zos-archive-read.test new file mode 100644 index 0000000000000..8f01d13a09f36 --- /dev/null +++ b/llvm/test/Object/zos-archive-read.test @@ -0,0 +1,69 @@ +## Test reading a valid z/OS archive. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.valid.a \ +# RUN: --symtab "foo.txt:0" --symtab "bar:0" --member foo.txt:abcd +# RUN: llvm-ar t %t.valid.a | FileCheck %s --check-prefix=LIST +# RUN: llvm-nm --print-armap %t.valid.a | FileCheck %s --check-prefix=SYMS +# RUN: llvm-ar p %t.valid.a foo.txt | FileCheck %s --check-prefix=CONTENT + +# LIST: foo.txt +# SYMS: Archive map +# SYMS-NEXT: foo.txt in foo.txt +# SYMS-NEXT: bar in foo.txt +# CONTENT: abcd + +## Test malformed terminator on member header. 
+# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.bad_term.a \ +# RUN: --member foo.txt --bad-terminator +# RUN: not llvm-ar t %t.bad_term.a 2>&1 | FileCheck %s --check-prefix=ERR-TERM +# ERR-TERM: terminator characters in archive member + +## Test empty UID field on member header. +## We know that the member header starts at offset 8 because no +## symbol table name is generated. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.empty_uid.a \ +# RUN: --member foo.txt --empty-uid +# RUN: not llvm-ar t %t.empty_uid.a 2>&1 | FileCheck %s --check-prefix=ERR-UID +# ERR-UID: UID field is empty or contains only spaces in archive member header at offset 8 + +## Test leading space in member name. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.empty_name.a \ +# RUN: --member foo.txt --empty-name +# RUN: not llvm-ar t %t.empty_name.a 2>&1 | FileCheck %s --check-prefix=ERR-NAME +# ERR-NAME: name contains a leading space for archive member header + +## Test truncated z/OS symbol table. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.trunc_sym.a \ +# RUN: --symtab-truncated +# RUN: not llvm-ar t %t.trunc_sym.a 2>&1 | FileCheck %s --check-prefix=ERR-TRUNC +# ERR-TRUNC: z/OS archive symbol table is too small to read the symbol count, symbol table size is 2 +## Test z/OS symbol table where count exceeds buffer. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.bad_count.a \ +# RUN: --symtab-bad-count +# RUN: not llvm-ar t %t.bad_count.a 2>&1 | FileCheck %s --check-prefix=ERR-COUNT +# ERR-COUNT: z/OS archive symbol table names offset {{[0-9]+}} exceeds symbol table size 4 + +## Test malformed __.SYMDEF header. 
+# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.bad_sym_hdr.a \ +# RUN: --member foo.txt --symtab foo.txt:0 --malform-symtab-hdr empty-mode +# RUN: not llvm-ar t %t.bad_sym_hdr.a 2>&1 | FileCheck %s --check-prefix=ERR-SYM-HDR +# ERR-SYM-HDR: AccessMode field is empty or contains only spaces + +## Test empty LastModified field. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.empty_modtime.a \ +# RUN: --member foo.txt --empty-modtime +# RUN: not llvm-ar t %t.empty_modtime.a 2>&1 | FileCheck %s --check-prefix=ERR-MODTIME +# ERR-MODTIME: LastModified field is empty or contains only spaces in archive member header at offset 8 + +## Test empty GID field. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.empty_gid.a \ +# RUN: --member foo.txt --empty-gid +# RUN: not llvm-ar t %t.empty_gid.a 2>&1 | FileCheck %s --check-prefix=ERR-GID +# ERR-GID: GID field is empty or contains only spaces in archive member header at offset 8 + +## Test that a truncated archive with incomplete member header is rejected. +## The z/OS magic is 8 bytes and a member header requires 60 bytes. +## Writing only 3 bytes after the magic is not enough for a valid header. 
+# RUN: printf '\x5A\x4C\x81\x99\x83\x88\x6E\x15' > %t.badhdr.a +# RUN: printf '\x00\x00\x00' >> %t.badhdr.a +# RUN: not llvm-ar t %t.badhdr.a 2>&1 | FileCheck %s --check-prefix=ERR-CHILD +# ERR-CHILD: truncated or malformed archive From 7899d550d169680f13df98796373d82262549910 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Tue, 12 May 2026 14:48:38 +0100 Subject: [PATCH 449/538] [AArch64][ISel] Add custom lowering for clmul nxv8i16 (#195893) When sve2/sme are available, this sequence provides faster and smaller codegen than the current lowering: ``` clmul.i16(a, b) = xor(pmullb(a_lo, b_lo), lsl(xor(pmul(a_hi, b_lo), pmul(a_lo, b_hi)), 8)) ``` Assisted-by: codex with gpt-5.5 --- .../Target/AArch64/AArch64ISelLowering.cpp | 42 +- llvm/test/CodeGen/AArch64/clmul-scalable.ll | 396 +++--------------- 2 files changed, 95 insertions(+), 343 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 782b928f4b841..413b585503335 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2055,6 +2055,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); setOperationAction(ISD::CLMUL, {MVT::nxv16i8, MVT::nxv4i32}, Legal); + setOperationAction(ISD::CLMUL, {MVT::nxv8i16}, Custom); setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, MVT::nxv8f16, Legal); @@ -8098,10 +8099,45 @@ SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerCLMUL(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert( - (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) && - "Unexpected Type"); SDLoc DL(Op); + assert((VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || + VT == MVT::nxv8i16) && + "Unexpected Type"); + + if (VT == MVT::nxv8i16) { + // clmul.i16(a, b) = 
xor(pmullb(a_lo, b_lo), + // lsl(xor(pmul(a_hi, b_lo), + // pmul(a_lo, b_hi)), + // 8)) + SDValue OpA = Op.getOperand(0); + SDValue OpB = Op.getOperand(1); + // Bitcast to i8 for byte-wise PMUL and PMULLB. + OpA = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::nxv16i8, OpA); + OpB = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::nxv16i8, OpB); + + // Form adjacent byte pairs {a_hi, b_hi} and {b_lo, a_lo}. PMUL then + // computes {a_hi * b_lo, b_hi * a_lo}, and EORBT xors those pairs. + SDValue LoBytes = DAG.getNode(AArch64ISD::TRN1, DL, MVT::nxv16i8, OpB, OpA); + SDValue HiBytes = DAG.getNode(AArch64ISD::TRN2, DL, MVT::nxv16i8, OpA, OpB); + SDValue PMUL = DAG.getNode(ISD::CLMUL, DL, MVT::nxv16i8, HiBytes, LoBytes); + + SDValue EORBT = + DAG.getTargetConstant(Intrinsic::aarch64_sve_eorbt, DL, MVT::i64); + EORBT = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv16i8, EORBT, PMUL, + PMUL, PMUL); + + SDValue PMULLB = + DAG.getTargetConstant(Intrinsic::aarch64_sve_pmullb_pair, DL, MVT::i64); + PMULLB = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv16i8, PMULLB, OpA, + OpB); + + SDValue EORTB = + DAG.getTargetConstant(Intrinsic::aarch64_sve_eortb, DL, MVT::i64); + EORTB = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv16i8, EORTB, + PMULLB, PMULLB, EORBT); + return DAG.getNode(AArch64ISD::NVCAST, DL, MVT::nxv8i16, EORTB); + } + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 64 / VT.getSizeInBits()); EVT CLMULTy = VT == MVT::i8 ? MVT::v8i8 : MVT::v1i64; EVT ExtractTy = VT == MVT::i64 ? 
MVT::i64 : MVT::i32; diff --git a/llvm/test/CodeGen/AArch64/clmul-scalable.ll b/llvm/test/CodeGen/AArch64/clmul-scalable.ll index ef903902b9e65..6b26295336331 100644 --- a/llvm/test/CodeGen/AArch64/clmul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/clmul-scalable.ll @@ -237,242 +237,42 @@ define @clmul_nxv8i16( %x, @llvm.clmul.nxv8i16( %x, %y) ret %a @@ -1695,134 +1495,50 @@ define @clmul_nxv8i16_zext( %x, %x to %zexty = zext %y to From 6d4f4642d978debd4c4a4057361a0a0f42de5743 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 12 May 2026 09:48:43 -0400 Subject: [PATCH 450/538] [libc] Build with -Wshadow (#196519) This way, clients including libc/shared/math.h don't need to `#pragma GCC diagnostic ignored "-Wshadow"` around the include. This works locally after #196337 #196342 #196346. CI also needed #196529 #196810 #196850 #196851 #196852 #196853 #196855 #196857 #196858 #196859 #196860. --- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 2d3703eefa0ac..db541a4dfb985 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -301,6 +301,7 @@ function(_get_common_compile_options output_var flags) if(NOT CMAKE_COMPILER_IS_GNUCXX) list(APPEND compile_options "-Wnewline-eof") list(APPEND compile_options "-Wnonportable-system-include-path") + list(APPEND compile_options "-Wshadow") list(APPEND compile_options "-Wstrict-prototypes") list(APPEND compile_options "-Wthread-safety") list(APPEND compile_options "-Wglobal-constructors") From 5a13758207cc0f1e02a0fe0ae492bb98e2d3d070 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 12 May 2026 09:58:15 -0400 Subject: [PATCH 451/538] Revert "[libc] Build with -Wshadow" (#197201) Reverts llvm/llvm-project#196519 Passed CI on the PR, but apparently breaks several bots. 
--- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index db541a4dfb985..2d3703eefa0ac 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -301,7 +301,6 @@ function(_get_common_compile_options output_var flags) if(NOT CMAKE_COMPILER_IS_GNUCXX) list(APPEND compile_options "-Wnewline-eof") list(APPEND compile_options "-Wnonportable-system-include-path") - list(APPEND compile_options "-Wshadow") list(APPEND compile_options "-Wstrict-prototypes") list(APPEND compile_options "-Wthread-safety") list(APPEND compile_options "-Wglobal-constructors") From 68dfda06dfd20184d06e8ac82cb6eb96fd8f122e Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 12 May 2026 06:59:10 -0700 Subject: [PATCH 452/538] [AMDGPU] Optimize fneg and fsub with packed fp16 ops (#196659) The work optimize fneg and fsub when packed half math instructions are supported. On global isel path, for wider vectors of G_FSUB with element type of f16, we should split them to v2f16 for v_pk_add_f16 to be selected. On SelectionDAG path, we make FNEG legal, and also make sure to split wider vectors to v2f16. In this way, we can fold fneg into the source modifiers for packed half ops. 
--- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +- .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll | 112 +--- .../GlobalISel/combine-fma-sub-neg-mul.ll | 32 +- .../AMDGPU/GlobalISel/legalize-fsub.mir | 193 +++--- .../CodeGen/AMDGPU/packed-fneg-fsub-fp16.ll | 557 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 94 +-- ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 33 +- 8 files changed, 724 insertions(+), 306 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/packed-fneg-fsub-fp16.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 43db1ead84c80..2b08c09ea9563 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1138,6 +1138,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2); FSubActions + .clampMaxNumElements(0, S16, 2) .scalarize(0) .clampScalar(0, S32, S64); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4bd0a0bee00ff..6c88bc7145eda 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -848,8 +848,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT}, MVT::v2i16, Legal); - setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE, - ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG, ISD::FABS, + ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, + ISD::FCANONICALIZE}, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, @@ -871,7 +872,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) // Split vector operations. 
- setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG, ISD::FABS, + ISD::FCANONICALIZE}, VT, Custom); setOperationAction( diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll index d046b854fb0d8..2a84376012eec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -545,12 +545,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -565,12 +561,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; 
GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul: @@ -578,12 +570,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -598,12 +586,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -618,19 +602,9 @@ define <4 x half> 
@test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-DENORM-NEXT: v_sub_f16_e32 v0, v0, v4 -; GFX11-DENORM-NEXT: v_sub_f16_e32 v1, v1, v5 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-DENORM-NEXT: v_sub_f16_e32 v2, v6, v2 -; GFX11-DENORM-NEXT: v_sub_f16_e32 v3, v7, v3 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-DENORM-NEXT: v_pack_b32_f16 v0, v0, v2 -; GFX11-DENORM-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y @@ -642,14 +616,10 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX9-LABEL: test_v4f16_sub_mul_rhs: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] 
neg_hi:[0,1] +; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -662,27 +632,19 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -695,14 +657,10 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -715,21 +673,11 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX11-DENORM-LABEL: test_v4f16_sub_mul_rhs: ; GFX11-DENORM: ; %bb.0: ; %.entry ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-DENORM-NEXT: v_sub_f16_e32 v0, v4, v0 -; GFX11-DENORM-NEXT: v_sub_f16_e32 v1, v5, v1 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-DENORM-NEXT: v_sub_f16_e32 v2, v2, v6 -; GFX11-DENORM-NEXT: v_sub_f16_e32 v3, v3, v7 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-DENORM-NEXT: v_pack_b32_f16 v0, v0, v2 -; GFX11-DENORM-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll index c0a828ecacbae..129e17b0db2d6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -221,12 +221,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -241,12 +237,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 
neg_lo:[0,1] neg_hi:[0,1] ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul: @@ -254,12 +246,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -274,12 +262,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: 
v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir index b9702d2befab8..6c444f26c13d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir @@ -457,25 +457,24 @@ body: | ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) - ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; SI-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]] + ; SI-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV3]] + ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG]](<2 x s16>) + ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; SI-NEXT: 
[[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG]](s16) + ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) ; SI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FPEXT]], [[FPEXT1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) - ; SI-NEXT: [[FNEG1:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC4]] ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG1]](s16) + ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) ; SI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FPEXT2]], [[FPEXT3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) - ; SI-NEXT: [[FNEG2:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC5]] + ; SI-NEXT: [[FNEG1:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]] ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG2]](s16) + ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG1]](s16) ; SI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FPEXT4]], [[FPEXT5]] ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) @@ -507,21 +506,22 @@ body: | ; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) - ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; VI-NEXT: 
[[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; VI-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[TRUNC3]] - ; VI-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[TRUNC4]] - ; VI-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[TRUNC5]] - ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB]](s16) - ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB1]](s16) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV3]] + ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG]](<2 x s16>) + ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; VI-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC]], [[TRUNC4]] + ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[TRUNC1]], [[TRUNC5]] + ; VI-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[TRUNC3]] + ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) + ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB2]](s16) + ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB]](s16) ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] @@ -537,26 +537,22 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; 
GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) - ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[TRUNC3]] - ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[TRUNC4]] - ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[TRUNC5]] + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV3]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[UV]], [[FNEG]] + ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[TRUNC1]] + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FADD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB]](s16), [[FSUB1]](s16) - ; GFX9-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB]](s16), [[DEF]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -583,55 +579,53 @@ body: | ; SI-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; SI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; SI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; SI-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV2]] ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG]](<2 x s16>) ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; SI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST2]], [[C]](s32) - ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; SI-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC4]] ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG]](s16) + ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; SI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FPEXT]], [[FPEXT1]] ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) - ; SI-NEXT: [[FNEG1:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC5]] ; SI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG1]](s16) + ; SI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) ; SI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FPEXT2]], [[FPEXT3]] ; SI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) - ; SI-NEXT: [[FNEG2:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC6]] - ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG2]](s16) - ; SI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FPEXT4]], [[FPEXT5]] - ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) - ; SI-NEXT: [[FNEG3:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC7]] - ; SI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; SI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG3]](s16) - ; SI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FPEXT6]], [[FPEXT7]] - ; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = 
G_OR [[ZEXT]], [[SHL]] - ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; SI-NEXT: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV3]] + ; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG1]](<2 x s16>) + ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; SI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; SI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) + ; SI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FPEXT4]], [[FPEXT5]] + ; SI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) + ; SI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; SI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) + ; SI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FPEXT6]], [[FPEXT7]] + ; SI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC2]](s16) ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC3]](s16) ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; SI-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; VI-LABEL: name: test_fsub_v4s16 @@ -640,39 +634,41 @@ 
body: | ; VI-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; VI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV2]] ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG]](<2 x s16>) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; VI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; VI-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[TRUNC4]] - ; VI-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[TRUNC5]] - ; VI-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[TRUNC6]] - ; VI-NEXT: [[FSUB3:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC3]], 
[[TRUNC7]] - ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB]](s16) - ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB1]](s16) + ; VI-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC]], [[TRUNC2]] + ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[TRUNC1]], [[TRUNC3]] + ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) + ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB2]](s16) - ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[FSUB3]](s16) + ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV3]] + ; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG1]](<2 x s16>) + ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[TRUNC4]], [[TRUNC6]] + ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[TRUNC5]], [[TRUNC7]] + ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FADD2]](s16) + ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[FADD3]](s16) ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; VI-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX9-LABEL: name: test_fsub_v4s16 @@ -681,31 +677,12 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[TRUNC4]] - ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[TRUNC5]] - ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[TRUNC6]] - ; GFX9-NEXT: 
[[FSUB3:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC3]], [[TRUNC7]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB]](s16), [[FSUB1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB2]](s16), [[FSUB3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV2]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[UV]], [[FNEG]] + ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[UV3]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(<2 x s16>) = G_FADD [[UV1]], [[FNEG1]] + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FADD]](<2 x s16>), [[FADD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fneg-fsub-fp16.ll b/llvm/test/CodeGen/AMDGPU/packed-fneg-fsub-fp16.ll new file mode 100644 index 0000000000000..f788803a3c358 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/packed-fneg-fsub-fp16.ll @@ -0,0 +1,557 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +define <4 x half> @fadd_v4f16_neg(<4 x half> %first, <4 x half> %second) { +; GFX950-LABEL: fadd_v4f16_neg: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: 
v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fadd_v4f16_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <4 x half> %second + %add = fadd <4 x half> %first, %neg + ret <4 x half> %add +} + +define <8 x half> @fadd_v8f16_neg(<8 x half> %first, <8 x half> %second) { +; GFX950-LABEL: fadd_v8f16_neg: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fadd_v8f16_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <8 x half> %second + %add = fadd <8 x half> %first, %neg + ret <8 x half> %add +} + +define <4 x half> @fsub_v4f16(<4 x half> %first, <4 x half> %second) { +; GFX950-LABEL: fsub_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fsub_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %sub = fsub <4 x half> %first, %second + ret <4 x half> %sub +} + +define <8 x half> @fsub_v8f16(<8 x half> %first, <8 x half> %second) { +; GFX950-LABEL: fsub_v8f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_add_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fsub_v8f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_add_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %sub = fsub <8 x half> %first, %second + ret <8 x half> %sub +} + +define <2 x half> @fneg_v2f16(<2 x half> %first) { +; GFX950-LABEL: fneg_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fneg_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <2 x half> %first + ret <2 x half> %neg +} + +define <4 x half> @fneg_v4f16(<4 x half> %first) { +; GFX950-LABEL: fneg_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX950-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; 
GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fneg_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <4 x half> %first + ret <4 x half> %neg +} + +define <4 x half> @fmul_v4f16_neg(<4 x half> %first, <4 x half> %second) { +; GFX950-LABEL: fmul_v4f16_neg: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fmul_v4f16_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <4 x half> %second + %mul = fmul <4 x half> %first, %neg + ret <4 x half> %mul +} + +define <2 x half> @fabs_v2f16(<2 x half> %first) { +; GFX950-LABEL: fabs_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fabs_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %abs = tail call <2 x half> @llvm.fabs.v2f16(<2 x half> %first) + ret <2 x half> %abs +} + +define <4 x half> @fabs_v4f16(<4 x half> %first) { +; GFX950-LABEL: fabs_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: 
fabs_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %abs = tail call <4 x half> @llvm.fabs.v4f16(<4 x half> %first) + ret <4 x half> %abs +} + +define <2 x half> @fneg_fabs_v2f16(<2 x half> %first) { +; GFX950-LABEL: fneg_fabs_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fneg_fabs_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %abs = tail call <2 x half> @llvm.fabs.v2f16(<2 x half> %first) + %neg = fneg <2 x half> %abs + ret <2 x half> %neg +} + +define <4 x half> @fneg_fabs_v4f16(<4 x half> %first) { +; GFX950-LABEL: fneg_fabs_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GFX950-NEXT: v_or_b32_e32 v1, 0x80008000, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fneg_fabs_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x80008000, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %abs = tail call <4 x half> @llvm.fabs.v4f16(<4 x half> %first) + %neg = fneg <4 x half> %abs + ret <4 x half> %neg +} + +define <8 x half> @fmul_v8f16_neg(<8 x half> %first, <8 x half> %second) { +; GFX950-LABEL: fmul_v8f16_neg: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_mul_f16 v2, v2, v6 
neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fmul_v8f16_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <8 x half> %second + %mul = fmul <8 x half> %first, %neg + ret <8 x half> %mul +} + +define <4 x half> @fma_v4f16_neg(<4 x half> %first, <4 x half> %second, <4 x half> %third) { +; GFX950-LABEL: fma_v4f16_neg: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fma_v4f16_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <4 x half> %second + %fma = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %first, <4 x half> %neg, <4 x half> %third) + ret <4 x half> %fma +} + +define <8 x half> @fma_v8f16_neg(<8 x half> %first, <8 x half> %second, <8 x half> %third) { +; GFX950-LABEL: fma_v8f16_neg: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_fma_f16 v0, v0, v4, v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950-NEXT: v_pk_fma_f16 v1, v1, v5, v9 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950-NEXT: v_pk_fma_f16 v2, v2, v6, v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950-NEXT: v_pk_fma_f16 v3, v3, 
v7, v11 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: fma_v8f16_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_f16 v0, v0, v4, v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_pk_fma_f16 v1, v1, v5, v9 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_pk_fma_f16 v2, v2, v6, v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_pk_fma_f16 v3, v3, v7, v11 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <8 x half> %second + %fma = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %first, <8 x half> %neg, <8 x half> %third) + ret <8 x half> %fma +} + +define <4 x half> @fminnum_v4f16_neg(<4 x half> %first, <4 x half> %second) { +; GFX950-SDAG-LABEL: fminnum_v4f16_neg: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: fminnum_v4f16_neg: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-LABEL: fminnum_v4f16_neg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: fminnum_v4f16_neg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <4 x half> %second + %fmin = tail call <4 x half> @llvm.minnum.v4f16(<4 x half> %first, <4 x half> %neg) + ret <4 x half> %fmin +} + +define <8 x half> @fminnum_v8f16_neg(<8 x half> %first, <8 x half> %second) { +; GFX950-SDAG-LABEL: fminnum_v8f16_neg: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-SDAG-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 +; 
GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: fminnum_v8f16_neg: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-LABEL: fminnum_v8f16_neg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v5, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v6, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v7, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v0, v0, v4 +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v1, v1, v5 +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v2, v2, v6 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v3, 
v3, v7 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: fminnum_v8f16_neg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v5 +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v2, v2, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v3, v3, v7 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <8 x half> %second + %fmin = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> %first, <8 x half> %neg) + ret <8 x half> %fmin +} + +define <4 x half> @fmaxnum_v4f16_neg(<4 x half> %first, <4 x half> %second) { +; GFX950-SDAG-LABEL: fmaxnum_v4f16_neg: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: fmaxnum_v4f16_neg: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 
neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-LABEL: fmaxnum_v4f16_neg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: fmaxnum_v4f16_neg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <4 x half> %second + %fmax = tail call <4 x half> @llvm.maxnum.v4f16(<4 x half> %first, <4 x half> %neg) + ret <4 x half> %fmax +} + +define <8 x half> @fmaxnum_v8f16_neg(<8 x half> %first, <8 x half> %second) { +; GFX950-SDAG-LABEL: fmaxnum_v8f16_neg: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 
v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: fmaxnum_v8f16_neg: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-LABEL: fmaxnum_v8f16_neg: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v5, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v6, v6, 
v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v7, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v4 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v5 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v6 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v7 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: fmaxnum_v8f16_neg: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v5 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v7 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %neg = fneg <8 x half> %second + %fmax = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> %first, <8 x half> %neg) + ret <8 x half> %fmax +} diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 8d94bacf444b6..d05ea8951f23e 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -506,10 +506,8 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX9-GISEL-LABEL: 
v_constained_fsub_v3f16_fpexcept_strict: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX9-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v4 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -525,8 +523,9 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX8-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-GISEL-NEXT: v_add_f16_e32 v4, v0, v2 +; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -543,12 +542,9 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX10-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 ; GFX10-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, 
v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -574,22 +570,16 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h +; GFX11-GISEL-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX11-GISEL-FAKE16: ; %bb.0: ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v2, v4, v5 -; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 ; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -612,8 +602,7 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l -; GFX12-GISEL-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h +; GFX12-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-GISEL-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict: @@ -658,12 +647,8 @@ define <4 x 
half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX9-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3 -; GFX9-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v4 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-GISEL-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -680,12 +665,14 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX8-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3 -; GFX8-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-GISEL-NEXT: v_add_f16_e32 v4, v0, v2 +; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 +; GFX8-GISEL-NEXT: v_add_f16_e32 v3, v1, v2 +; GFX8-GISEL-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v3, v1 ; 
GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -702,14 +689,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-GISEL-NEXT: v_sub_f16_e32 v5, v1, v3 -; GFX10-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-GISEL-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -736,31 +717,12 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: -; GFX11-GISEL-TRUE16: ; %bb.0: -; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.h, v1.h, v3.h -; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: -; GFX11-GISEL-FAKE16: ; %bb.0: -; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: 
v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v2, v4, v6 -; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v3, v5, v7 -; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-GISEL-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX12-SDAG: ; %bb.0: @@ -782,10 +744,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l -; GFX12-GISEL-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h -; GFX12-GISEL-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l -; GFX12-GISEL-NEXT: v_sub_f16_e32 v1.h, v1.h, v3.h +; GFX12-GISEL-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-GISEL-NEXT: v_pk_add_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll index 27c6653d009b3..6c943da2f1dd7 
100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -409,34 +409,21 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x ; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX1170: ; %bb.0: ; %bb ; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16 -; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9] +; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16 ; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; GFX1170-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 -; GFX1170-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX1170-NEXT: v_mov_b16_e32 v14.h, v15.l ; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1170-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 -; GFX1170-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX1170-NEXT: v_mov_b16_e32 v16.h, v17.l +; GFX1170-NEXT: v_mov_b16_e32 v18.h, v19.l +; GFX1170-NEXT: v_mov_b16_e32 v12.h, v13.l +; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1170-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v16 +; GFX1170-NEXT: v_mov_b32_e32 v15, v18 ; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] ; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off ; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16 -; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 -; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 -; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100 -; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] -; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off -; GFX12-NEXT: s_endpgm bb: %C = load <16 x half>, ptr %Caddr %C_shuffle = shufflevector <16 x half> %C, <16 x half> poison, <8 x i32> @@ -459,3 +446,5 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x float>, i16) declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12: {{.*}} From ef6df9dcea99d7dad003539c82264cf8777374e7 Mon Sep 17 00:00:00 2001 From: TPPPP Date: Tue, 12 May 2026 21:59:13 +0800 Subject: [PATCH 453/538] [Clang] Fix assertion when __block is used on global variables in C mode (#194856) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a reland PR, related to #183988 I added an extra check in handleBlocksAttr to ensure that illegal Decl values are not passed to downstream functions. And remove unnecessary check in `CheckCompleteVariableDeclaration`. Also added an extra regression test.
Fixes #183974 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDecl.cpp | 6 ------ clang/lib/Sema/SemaObjC.cpp | 6 ++++++ clang/test/Sema/block-on-objc-ivars.m | 11 +++++++++++ clang/test/Sema/gh183974.c | 5 +++++ 5 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 clang/test/Sema/block-on-objc-ivars.m create mode 100644 clang/test/Sema/gh183974.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bd91b8723a5c6..7ef88461c11a7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -605,6 +605,7 @@ Bug Fixes to AST Handling parameter list. This also adds asserts to prevent this from happening again. - Fixed a crash when parsing Doxygen ``@param`` commands attached to invalid declarations or non-function entities. (#GH182737) - Fixed the SourceLocation and SourceRange of reversed rewritten CXXOperatorCallExpr. (#GH192467) +- Fixed an assertion when ``__block`` is used on global variables in C mode. (#GH183974) Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index a9a4cb89d115f..bec351528760a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9133,12 +9133,6 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) { } } - if (!NewVD->hasLocalStorage() && NewVD->hasAttr()) { - Diag(NewVD->getLocation(), diag::err_block_on_nonlocal); - NewVD->setInvalidDecl(); - return; - } - if (!NewVD->hasLocalStorage() && T->isSizelessType() && !T.isWebAssemblyReferenceType() && !T->isHLSLSpecificType()) { Diag(NewVD->getLocation(), diag::err_sizeless_nonlocal) << T; diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp index dae30b7e941d1..c351e1e2079c1 100644 --- a/clang/lib/Sema/SemaObjC.cpp +++ b/clang/lib/Sema/SemaObjC.cpp @@ -1711,6 +1711,12 @@ void SemaObjC::handleBlocksAttr(Decl *D, const ParsedAttr &AL) { return; } + VarDecl *VD = dyn_cast(D); + if (VD && !VD->hasLocalStorage()) {
Diag(AL.getLoc(), diag::err_block_on_nonlocal) << AL; + return; + } + D->addAttr(::new (getASTContext()) BlocksAttr(getASTContext(), AL, type)); } diff --git a/clang/test/Sema/block-on-objc-ivars.m b/clang/test/Sema/block-on-objc-ivars.m new file mode 100644 index 0000000000000..f37dc12fc109c --- /dev/null +++ b/clang/test/Sema/block-on-objc-ivars.m @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fblocks -fsyntax-only -verify %s + +@interface MyClass { + // expected-warning@-1 {{class 'MyClass' defined without specifying a base class}} + // expected-note@-2 {{add a super class to fix this problem}} + __block int _myIvar; +} +@end + +@implementation MyClass +@end diff --git a/clang/test/Sema/gh183974.c b/clang/test/Sema/gh183974.c new file mode 100644 index 0000000000000..642a622761f69 --- /dev/null +++ b/clang/test/Sema/gh183974.c @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -fblocks -fsyntax-only -verify %s + +__block int x; // expected-error {{__block attribute not allowed, only allowed on local variables}} + +int x; From 17dde875c1064ad5150a48572008a826b8869103 Mon Sep 17 00:00:00 2001 From: Benjamin Stott Date: Tue, 12 May 2026 14:59:35 +0100 Subject: [PATCH 454/538] [Support] Add a function to print the debug log (#197184) With `EnableDebugBuffering`, the debug log is stored in a circular buffer and printed, with a nice banner, on program termination - this is achieved via a signal handler. For in-process tool execution, such as for running the regression tests using daemon versions of the tools, we need to be able to trigger the printing/flushing of the debug log from the process itself. This PR just adds a small function `printDebugLog` which checks if debug output and debug log buffering are enabled and, if so, prints the debug log. 
The code for printing the debug log in the signal handler is moved to a new function `printDebugLogImpl` which is called by the signal handler and `printDebugLog` - the reason this is separate from `printDebugLog` is to avoid running the option check in the signal handler implementation, in case options were reset before the signal handler is called, as this would be an unintentional behavioral change. --- llvm/include/llvm/Support/Debug.h | 5 +++++ llvm/lib/Support/Debug.cpp | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Support/Debug.h b/llvm/include/llvm/Support/Debug.h index b73f2d7c8b852..54181e1255fdd 100644 --- a/llvm/include/llvm/Support/Debug.h +++ b/llvm/include/llvm/Support/Debug.h @@ -104,6 +104,11 @@ LLVM_ABI extern bool EnableDebugBuffering; /// like: dbgs() << "foo" << "bar"; LLVM_ABI raw_ostream &dbgs(); +/// If EnableDebugBuffering is true, this flushes the debug stream with +/// the banner displayed, the same way it is printed automatically on +/// program termination. +LLVM_ABI void printDebugLog(); + // DEBUG macro - This macro should be used by passes to emit debug information. // If the '-debug' option is specified on the commandline, and if this is a // debug build, then the code specified as the option to the macro will be diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp index b6f338f903a9d..10c0e103e5a47 100644 --- a/llvm/lib/Support/Debug.cpp +++ b/llvm/lib/Support/Debug.cpp @@ -192,8 +192,7 @@ void llvm::initDebugOptions() { *DebugOnly; } -// Signal handlers - dump debug output on termination. -static void debug_user_sig_handler(void *Cookie) { +static void printDebugLogImpl() { // This is a bit sneaky. Since this is under #ifndef NDEBUG, we // know that debug mode is enabled and dbgs() really is a // circular_raw_ostream. 
If NDEBUG is defined, then dbgs() == @@ -203,6 +202,9 @@ static void debug_user_sig_handler(void *Cookie) { dbgout.flushBufferWithBanner(); } +// Signal handlers - dump debug output on termination. +static void debug_user_sig_handler(void *Cookie) { printDebugLogImpl(); } + /// dbgs - Return a circular-buffered debug stream. raw_ostream &llvm::dbgs() { // Do one-time initialization in a thread-safe way. @@ -224,6 +226,11 @@ raw_ostream &llvm::dbgs() { return thestrm.strm; } +void llvm::printDebugLog() { + if (EnableDebugBuffering && DebugFlag && *DebugBufferSize != 0) + printDebugLogImpl(); +} + #else // Avoid "has no symbols" warning. namespace llvm { @@ -233,6 +240,8 @@ namespace llvm { } } void llvm::initDebugOptions() {} + +void llvm::printDebugLog() {} #endif /// EnableDebugBuffering - Turn on signal handler installation. From 13a20bd71278f5e8784da3037be3ee1d2e3a25b4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 12 May 2026 15:21:26 +0100 Subject: [PATCH 455/538] [X86] Add test coverage for #196493 (#197198) --- llvm/test/CodeGen/X86/select-big-integer.ll | 83 ++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/X86/select-big-integer.ll b/llvm/test/CodeGen/X86/select-big-integer.ll index b7f4a57b01338..688dca132cb88 100644 --- a/llvm/test/CodeGen/X86/select-big-integer.ll +++ b/llvm/test/CodeGen/X86/select-big-integer.ll @@ -456,4 +456,85 @@ define void @test_neg_i512(ptr %p0, ptr %p1, i1 zeroext %a2, ptr %p3) nounwind { ret void } - +define i128 @PR196493(i1 %c, i128 %a, ptr %m) { +; SSE2-LABEL: PR196493: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: andl $1, %edi +; SSE2-NEXT: negl %edi +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: retq +; +; SSE4-LABEL: PR196493: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE4-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE4-NEXT: andl $1, %edi +; SSE4-NEXT: negl %edi +; SSE4-NEXT: movd %edi, %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE4-NEXT: pand %xmm1, %xmm0 +; SSE4-NEXT: movq %xmm0, %rax +; SSE4-NEXT: pextrq $1, %xmm0, %rdx +; SSE4-NEXT: retq +; +; AVX1-LABEL: PR196493: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: negl %edi +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR196493: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: negl %edi +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: PR196493: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-NEXT: andb $1, %dil +; AVX512F-NEXT: negb %dil +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR196493: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: andb $1, %dil +; AVX512VL-NEXT: negb %dil +; AVX512VL-NEXT: kmovd %edi, %k1 +; AVX512VL-NEXT: vmovdqa32 (%rcx), %xmm0 {%k1} {z} +; AVX512VL-NEXT: vmovq %xmm0, %rax 
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: retq + %b = load i128, ptr %m + %sel = select i1 %c, i128 %b, i128 0 + ret i128 %sel +} From 422d023b40023d8c74e0331f08ed3c4ace409255 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 12 May 2026 10:26:40 -0400 Subject: [PATCH 456/538] [libc] Pass -c to compiler when detecting target (#197012) Follow-up to #176680 where I claimed having done this, but apparently didn't actually add it to the commit. Hopefully no observable behavior change; will tell the compiler to omit linker info in its output, which we don't need for this detection step. --- libc/cmake/modules/LLVMLibCArchitectures.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake index 8ada45dd07cec..095531c0f6bec 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -78,7 +78,7 @@ function(get_arch_and_system_from_triple triple arch_var sys_var) set(${sys_var} ${target_sys} PARENT_SCOPE) endfunction(get_arch_and_system_from_triple) -execute_process(COMMAND ${CMAKE_CXX_COMPILER} -v +execute_process(COMMAND ${CMAKE_CXX_COMPILER} -c -v RESULT_VARIABLE libc_compiler_info_result OUTPUT_VARIABLE libc_compiler_info ERROR_VARIABLE libc_compiler_info) From 16e8a3c8faa569dca6f4d10b31b6f16ae57b50c6 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 12 May 2026 16:28:29 +0200 Subject: [PATCH 457/538] [MemoryBuiltins] Remove isNewLikeFn() (#197209) This function is unused.
--- llvm/include/llvm/Analysis/MemoryBuiltins.h | 4 ---- llvm/lib/Analysis/MemoryBuiltins.cpp | 6 ------ 2 files changed, 10 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemoryBuiltins.h b/llvm/include/llvm/Analysis/MemoryBuiltins.h index 1b509fb66ce37..10a31973be7fa 100644 --- a/llvm/include/llvm/Analysis/MemoryBuiltins.h +++ b/llvm/include/llvm/Analysis/MemoryBuiltins.h @@ -58,10 +58,6 @@ LLVM_ABI bool isAllocationFn(const Value *V, function_ref GetTLI); -/// Tests if a value is a call or invoke to a library function that -/// allocates memory via new. -LLVM_ABI bool isNewLikeFn(const Value *V, const TargetLibraryInfo *TLI); - /// Tests if a value is a call or invoke to a library function that /// allocates memory similar to malloc or calloc. LLVM_ABI bool isMallocOrCallocLikeFn(const Value *V, diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index a02949548add8..a587f72fa9a18 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -291,12 +291,6 @@ bool llvm::isAllocationFn( checkFnAllocKind(V, AllocFnKind::Alloc | AllocFnKind::Realloc); } -/// Tests if a value is a call or invoke to a library function that -/// allocates memory via new. -bool llvm::isNewLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, OpNewLike, TLI).has_value(); -} - /// Tests if a value is a call or invoke to a library function that /// allocates memory similar to malloc or calloc. bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { From f612211116e1234a1ffc36c64aa662c0a462eac4 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 12 May 2026 15:30:26 +0100 Subject: [PATCH 458/538] [VPlan] Introduce reduction selects for tail folding in foldTailByMasking. 
NFCI (#192987) Currently addComputeReductionResult has to check the cost model to see if the loop is tail folded, and if so then manually fix up the backedge value so any tail elements are ignored. This PR moves this handling into foldTailByMasking itself so the plan doesn't require fixing up. We do this by setting the incoming value for the latch phi to the reduction phi instead of poison. A blend will be created for this automatically. The main benefits of this are that the reduction is correct when tail folding is applied, and we don't need to worry about tail folding in as many places. In order to preserve some of the optimizations that we get on VPInstruction::Select we need to convert the VPBlendRecipe to a select. --- .../Transforms/Vectorize/LoopVectorize.cpp | 48 ++++++++++++------- .../Vectorize/VPlanConstruction.cpp | 20 +++++--- .../LoopVectorize/VPlan/tail-folding.ll | 2 +- 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 61c2d3cd228ec..9fec3b74d9630 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6620,9 +6620,19 @@ bool VPRecipeBuilder::replaceWithFinalIfReductionStore( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. if (Legal->isInvariantStoreOfReduction(SI)) { + VPValue *Val = VPI->getOperand(0); + VPValue *Addr = VPI->getOperand(1); + // We need to store the exiting value of the reduction, so use the blend + // if tail folded.
+ if (auto *Blend = vputils::findUserOf(Val)) + Val = Blend; + assert( + vputils::findUserOf(Val)->getBackedgeValue() == + Val && + "Store isn't backedge value?"); auto *Recipe = new VPReplicateRecipe( - SI, VPI->operandsWithoutMask(), true /* IsUniform */, - nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc()); + SI, {Val, Addr}, true /* IsUniform */, nullptr /*Mask*/, *VPI, *VPI, + VPI->getDebugLoc()); FinalRedStoresBuilder.insert(Recipe); } VPI->eraseFromParent(); @@ -7087,6 +7097,7 @@ void LoopVectorizationPlanner::addReductionResultComputation( VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end()))); VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); + VPValue *HeaderMask = vputils::findHeaderMask(*Plan); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast(&R); @@ -7097,23 +7108,28 @@ void LoopVectorizationPlanner::addReductionResultComputation( const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor( cast(PhiR->getUnderlyingInstr())); Type *PhiTy = TypeInfo.inferScalarType(PhiR); - // If tail is folded by masking, introduce selects between the phi - // and the users outside the vector region of each reduction, at the - // beginning of the dedicated latch block. + + // Convert a VPBlendRecipe backedge to a select. 
+ if (auto *Blend = dyn_cast(PhiR->getBackedgeValue())) { + if (Blend->getNumIncomingValues() == 2 && + Blend->getMask(0) == HeaderMask) { + auto *Sel = VPBuilder(Blend).createSelect( + Blend->getMask(0), Blend->getIncomingValue(0), + Blend->getIncomingValue(1), {}, "", *Blend); + Blend->replaceAllUsesWith(Sel); + Blend->eraseFromParent(); + } + } + auto *OrigExitingVPV = PhiR->getBackedgeValue(); auto *NewExitingVPV = PhiR->getBackedgeValue(); - if (!PhiR->isInLoop() && CM.foldTailByMasking()) { - VPValue *Cond = vputils::findHeaderMask(*Plan); - NewExitingVPV = - Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", *PhiR); - OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { - return match(&U, - m_VPInstruction()); - }); - if (CM.usePredicatedReductionSelect(RecurrenceKind)) - PhiR->setOperand(1, NewExitingVPV); - } + // Remove the predicated select if the target doesn't want it. + VPValue *V; + if (!CM.usePredicatedReductionSelect(RecurrenceKind) && + match(PhiR->getBackedgeValue(), + m_Select(m_Specific(HeaderMask), m_VPValue(V), m_Specific(PhiR)))) + PhiR->setBackedgeValue(V); // We want code in the middle block to appear to execute on the location of // the scalar loop's latch terminator because: (a) it is all compiler diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index c2b05984db272..7c54a223f9793 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1359,18 +1359,26 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) { if (match(&R, m_ExtractLastPart(m_VPValue(V)))) NeedsPhi[V].push_back(&R); - // Insert phis with a poison incoming value for past the end of the tail. + // Insert phis for values coming past the end of the tail. 
Builder.setInsertPoint(Latch, Latch->begin()); VPTypeAnalysis TypeInfo(Plan); for (const auto &[V, Users] : NeedsPhi) { if (isa(V)) continue; - // TODO: For reduction phis, use phi value instead of poison so we can - // remove the special casing for tail folding in - // LoopVectorizationPlanner::addReductionResultComputation - VPValue *Poison = + VPValue *TailVal = Plan.getOrAddLiveIn(PoisonValue::get(TypeInfo.inferScalarType(V))); - VPInstruction *Phi = Builder.createScalarPhi({V, Poison}); + VPIRFlags Flags; + assert(llvm::count_if(Users, IsaPred) <= 1 && + "Value used by more than two reduction phis?"); + auto *RedIt = find_if(Users, IsaPred); + auto *RdxPhi = + RedIt != Users.end() ? cast(*RedIt) : nullptr; + if (RdxPhi && !RdxPhi->isInLoop()) { + TailVal = RdxPhi; + Flags = *RdxPhi; + } + + VPInstruction *Phi = Builder.createScalarPhi({V, TailVal}, {}, "", Flags); for (VPUser *U : Users) U->replaceUsesOfWith(V, Phi); } diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll index 062ee47a43444..04b7c612f4587 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll @@ -284,7 +284,7 @@ define i32 @reduction(ptr noalias %p, i32 %n) { ; CHECK-NEXT: Successor(s): vector.latch ; CHECK-EMPTY: ; CHECK-NEXT: vector.latch: -; CHECK-NEXT: EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir, vector.body ] +; CHECK-NEXT: EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir<%rdx>, vector.body ] ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]> ; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> ; CHECK-NEXT: No successors From 015bb78e1b1646910273c3eeed4a092a9342fef8 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 12 May 2026 15:34:46 +0100 Subject: [PATCH 459/538] =?UTF-8?q?[NFC][LLVM][VPlan]=20Fix=20"parameter?= 
=?UTF-8?q?=20=E2=80=98P=E2=80=99=20set=20but=20not=20used"=20warning.=20(?= =?UTF-8?q?#197194)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For Is... = {} the fold expression short-circuits to true and does not evaluate P. --- llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 3cafeae7c4aea..f8631549bc179 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -295,7 +295,8 @@ struct Recipe_match { /// Helper to check if predicate \p P holds on all tuple elements in Ops using /// the provided index sequence. template - bool all_of_tuple_elements(std::index_sequence, Fn P) const { + bool all_of_tuple_elements(std::index_sequence, + [[maybe_unused]] Fn P) const { return (P(std::get(Ops), Is) && ...); } }; From 9c4ff6e82edc00616d0324b4a437c0e15bf6cebf Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Tue, 12 May 2026 15:41:16 +0100 Subject: [PATCH 460/538] [ASan][Darwin] Make multiple_sigaltstack.cpp test use MINSIGSTKSZ (#197204) --- .../test/asan/TestCases/Posix/multiple_sigaltstack.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp b/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp index a7cf4b3a43b91..c135d20b4ab87 100644 --- a/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/multiple_sigaltstack.cpp @@ -5,7 +5,11 @@ #include #include +#if (__APPLE__) +char global_alt_stack[MINSIGSTKSZ]; +#else char global_alt_stack[4096 * 4]; +#endif int main() { stack_t altstack; From 785e3c0172a858c1dcae89abf4c59c8588b55e04 Mon Sep 17 00:00:00 2001 From: Daniel Donenfeld Date: Tue, 12 May 2026 10:46:39 -0400 Subject: [PATCH 461/538] [GVN][NVPTX] Rename 
PRE flag to ScalarPRE, disable option in NVPTX (#190386) Scalar PRE in GVN may cause performance issues in the NVPTX backend by increasing register pressure. This PR renames the enable-pre flag to enable-scalar-pre and updates its usage to cover an additional case of scalar PRE being performed. The newly renamed option is also used to disable scalar PRE for NVPTX. --- llvm/include/llvm/Transforms/Scalar/GVN.h | 11 ++- llvm/lib/Passes/PassBuilder.cpp | 4 +- llvm/lib/Passes/PassRegistry.def | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3 +- llvm/lib/Transforms/Scalar/GVN.cpp | 39 +++++--- .../NVPTX/gvn-scalar-pre-reg-pressure.ll | 96 +++++++++++++++++++ llvm/test/Other/new-pm-print-pipeline.ll | 4 +- llvm/test/Transforms/GVN/PRE/local-pre.ll | 4 +- llvm/test/Transforms/GVN/PRE/no-scalar-pre.ll | 61 ++++++++++++ .../Transforms/GVN/PRE/pre-aliasning-path.ll | 4 +- llvm/test/Transforms/GVN/PRE/pre-basic-add.ll | 6 +- llvm/test/Transforms/GVN/PRE/pre-jt-add.ll | 4 +- .../GVN/PRE/pre-loop-load-new-pm.ll | 4 +- llvm/test/Transforms/GVN/PRE/pre-loop-load.ll | 2 +- .../test/Transforms/GVN/PRE/pre-poison-add.ll | 4 +- 15 files changed, 208 insertions(+), 40 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/gvn-scalar-pre-reg-pressure.ll create mode 100644 llvm/test/Transforms/GVN/PRE/no-scalar-pre.ll diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index 56a93c9770c99..806e969211e7a 100644 --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -76,7 +76,7 @@ class GVNLegacyPass; /// Intended use is to create a default object, modify parameters with /// additional setters and then pass it to GVN. 
struct GVNOptions { - std::optional AllowPRE; + std::optional AllowScalarPRE; std::optional AllowLoadPRE; std::optional AllowLoadInLoopPRE; std::optional AllowLoadPRESplitBackedge; @@ -85,9 +85,9 @@ struct GVNOptions { GVNOptions() = default; - /// Enables or disables PRE in GVN. - GVNOptions &setPRE(bool PRE) { - AllowPRE = PRE; + /// Enables or disables PRE of scalars in GVN. + GVNOptions &setScalarPRE(bool ScalarPRE) { + AllowScalarPRE = ScalarPRE; return *this; } @@ -148,7 +148,7 @@ class GVNPass : public OptionalPassInfoMixin { AAResults *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceResults &getMemDep() const { return *MD; } - LLVM_ABI bool isPREEnabled() const; + LLVM_ABI bool isScalarPREEnabled() const; LLVM_ABI bool isLoadPREEnabled() const; LLVM_ABI bool isLoadInLoopPREEnabled() const; LLVM_ABI bool isLoadPRESplitBackedgeEnabled() const; @@ -424,6 +424,7 @@ class GVNPass : public OptionalPassInfoMixin { }; /// Create a legacy GVN pass. +LLVM_ABI FunctionPass *createGVNPass(bool ScalarPRE); LLVM_ABI FunctionPass *createGVNPass(); /// A simple and fast domtree-based GVN pass to hoist common expressions diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 603d7f2f5dea2..a238dfb61c2c2 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1353,8 +1353,8 @@ Expected parseGVNOptions(StringRef Params) { std::tie(ParamName, Params) = Params.split(';'); bool Enable = !ParamName.consume_front("no-"); - if (ParamName == "pre") { - Result.setPRE(Enable); + if (ParamName == "scalar-pre") { + Result.setScalarPRE(Enable); } else if (ParamName == "load-pre") { Result.setLoadPRE(Enable); } else if (ParamName == "split-backedge-load-pre") { diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 9edb30fedd867..3328bc0fe836f 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -605,7 +605,7 @@ 
FUNCTION_PASS_WITH_PARAMS( FUNCTION_PASS_WITH_PARAMS( "gvn", "GVNPass", [](GVNOptions Opts) { return GVNPass(Opts); }, parseGVNOptions, - "no-pre;pre;no-load-pre;load-pre;no-split-backedge-load-pre;" + "no-scalar-pre;scalar-pre;no-load-pre;load-pre;no-split-backedge-load-pre;" "split-backedge-load-pre;no-memdep;memdep;no-memoryssa;memoryssa") FUNCTION_PASS_WITH_PARAMS( "hardware-loops", "HardwareLoopsPass", diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 10e746c502c09..9351c8dde60d4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -291,7 +291,8 @@ NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const { void NVPTXPassConfig::addEarlyCSEOrGVNPass() { if (getOptLevel() == CodeGenOptLevel::Aggressive) - addPass(createGVNPass()); + // Disable scalar PRE due to Register Pressure increase + addPass(createGVNPass(/*ScalarPRE=*/false)); else addPass(createEarlyCSEPass()); } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 0965c2ab361c0..c069c1600a79b 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -105,7 +105,8 @@ STATISTIC(MaxBBSpeculationCutoffReachedTimes, "Number of times we we reached gvn-max-block-speculations cut-off " "preventing further exploration"); -static cl::opt GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden); +static cl::opt GVNEnableScalarPRE("enable-scalar-pre", cl::init(true), + cl::Hidden); static cl::opt GVNEnableLoadPRE("enable-load-pre", cl::init(true)); static cl::opt GVNEnableLoadInLoopPRE("enable-load-in-loop-pre", cl::init(true)); @@ -843,8 +844,8 @@ void GVNPass::LeaderMap::erase(uint32_t N, Instruction *I, // GVN Pass //===----------------------------------------------------------------------===// -bool GVNPass::isPREEnabled() const { - return Options.AllowPRE.value_or(GVNEnablePRE); +bool GVNPass::isScalarPREEnabled() 
const { + return Options.AllowScalarPRE.value_or(GVNEnableScalarPRE); } bool GVNPass::isLoadPREEnabled() const { @@ -906,8 +907,8 @@ void GVNPass::printPipeline( OS, MapClassName2PassName); OS << '<'; - if (Options.AllowPRE != std::nullopt) - OS << (*Options.AllowPRE ? "" : "no-") << "pre;"; + if (Options.AllowScalarPRE != std::nullopt) + OS << (*Options.AllowScalarPRE ? "" : "no-") << "scalar-pre;"; if (Options.AllowLoadPRE != std::nullopt) OS << (*Options.AllowLoadPRE ? "" : "no-") << "load-pre;"; if (Options.AllowLoadPRESplitBackedge != std::nullopt) @@ -2022,12 +2023,15 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) { } bool Changed = false; - // If this load follows a GEP, see if we can PRE the indices before analyzing. - if (GetElementPtrInst *GEP = - dyn_cast(Load->getOperand(0))) { - for (Use &U : GEP->indices()) - if (Instruction *I = dyn_cast(U.get())) - Changed |= performScalarPRE(I); + // This is a limited form of scalar PRE for load indices. If this load follows + // a GEP, see if we can PRE the indices before analyzing. + if (isScalarPREEnabled()) { + if (GetElementPtrInst *GEP = + dyn_cast(Load->getOperand(0))) { + for (Use &U : GEP->indices()) + if (Instruction *I = dyn_cast(U.get())) + Changed |= performScalarPRE(I); + } } // Step 2: Analyze the availability of the load. @@ -2071,7 +2075,7 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) { } // Step 4: Eliminate partial redundancy. - if (!isPREEnabled() || !isLoadPREEnabled()) + if (!isLoadPREEnabled()) return Changed; if (!isLoadInLoopPREEnabled() && LI->getLoopFor(Load->getParent())) return Changed; @@ -2838,7 +2842,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, ++Iteration; } - if (isPREEnabled()) { + if (isScalarPREEnabled()) { // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). 
assignValNumForDeadCode(); @@ -3334,10 +3338,12 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid. explicit GVNLegacyPass(bool MemDepAnalysis = GVNEnableMemDep, - bool MemSSAAnalysis = GVNEnableMemorySSA) + bool MemSSAAnalysis = GVNEnableMemorySSA, + bool ScalarPRE = true) : FunctionPass(ID), Impl(GVNOptions() .setMemDep(MemDepAnalysis) - .setMemorySSA(MemSSAAnalysis)) { + .setMemorySSA(MemSSAAnalysis) + .setScalarPRE(ScalarPRE)) { initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -3399,3 +3405,6 @@ INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false // The public interface to this file... FunctionPass *llvm::createGVNPass() { return new GVNLegacyPass(); } +FunctionPass *llvm::createGVNPass(bool ScalarPRE) { + return new GVNLegacyPass(GVNEnableMemDep, GVNEnableMemorySSA, ScalarPRE); +} diff --git a/llvm/test/CodeGen/NVPTX/gvn-scalar-pre-reg-pressure.ll b/llvm/test/CodeGen/NVPTX/gvn-scalar-pre-reg-pressure.ll new file mode 100644 index 0000000000000..5b7f893889744 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/gvn-scalar-pre-reg-pressure.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -O3 | FileCheck %s --check-prefix=PIPELINE +; RUN: opt < %s -passes='gvn' -S | llc -mtriple=nvptx64 -mcpu=sm_100 -O0 | FileCheck %s --check-prefix=NO-SCALAR-PRE +; RUN: opt < %s -passes='gvn' -S | llc -mtriple=nvptx64 -mcpu=sm_100 -O0 | FileCheck %s --check-prefix=SCALAR-PRE + +; Scalar PRE inserts a critical-edge computation and a PHI for the common add. +; That shape needs more NVPTX virtual registers than keeping the duplicated adds. 
+ +define void @test_scalar_pre_option(ptr %arr, i8 %cond) { +; PIPELINE-LABEL: test_scalar_pre_option( +; PIPELINE: { +; PIPELINE-NEXT: .reg .pred %p<2>; +; PIPELINE-NEXT: .reg .b16 %rs<2>; +; PIPELINE-NEXT: .reg .b32 %r<3>; +; PIPELINE-NEXT: .reg .b64 %rd<2>; +; PIPELINE-EMPTY: +; PIPELINE-NEXT: // %bb.0: // %entry +; PIPELINE-NEXT: ld.param.b64 %rd1, [test_scalar_pre_option_param_0]; +; PIPELINE-NEXT: ld.param.b8 %rs1, [test_scalar_pre_option_param_1]; +; PIPELINE-NEXT: setp.eq.b16 %p1, %rs1, 0; +; PIPELINE-NEXT: ld.b32 %r2, [%rd1]; +; PIPELINE-NEXT: add.s32 %r1, %r2, 2; +; PIPELINE-NEXT: @%p1 bra $L__BB0_2; +; PIPELINE-NEXT: // %bb.1: // %if.then +; PIPELINE-NEXT: st.b32 [%rd1+8], %r1; +; PIPELINE-NEXT: $L__BB0_2: // %if.end +; PIPELINE-NEXT: st.b32 [%rd1+12], %r1; +; PIPELINE-NEXT: ret; +; +; NO-SCALAR-PRE-LABEL: test_scalar_pre_option( +; NO-SCALAR-PRE: { +; NO-SCALAR-PRE-NEXT: .reg .pred %p<2>; +; NO-SCALAR-PRE-NEXT: .reg .b16 %rs<2>; +; NO-SCALAR-PRE-NEXT: .reg .b32 %r<4>; +; NO-SCALAR-PRE-NEXT: .reg .b64 %rd<2>; +; NO-SCALAR-PRE-EMPTY: +; NO-SCALAR-PRE-NEXT: // %bb.0: // %entry +; NO-SCALAR-PRE-NEXT: ld.param.b8 %rs1, [test_scalar_pre_option_param_1]; +; NO-SCALAR-PRE-NEXT: ld.param.b64 %rd1, [test_scalar_pre_option_param_0]; +; NO-SCALAR-PRE-NEXT: setp.eq.b16 %p1, %rs1, 0; +; NO-SCALAR-PRE-NEXT: ld.b32 %r1, [%rd1]; +; NO-SCALAR-PRE-NEXT: @%p1 bra $L__BB0_2; +; NO-SCALAR-PRE-NEXT: bra.uni $L__BB0_1; +; NO-SCALAR-PRE-NEXT: $L__BB0_1: // %if.then +; NO-SCALAR-PRE-NEXT: add.s32 %r2, %r1, 2; +; NO-SCALAR-PRE-NEXT: st.b32 [%rd1+8], %r2; +; NO-SCALAR-PRE-NEXT: bra.uni $L__BB0_2; +; NO-SCALAR-PRE-NEXT: $L__BB0_2: // %if.end +; NO-SCALAR-PRE-NEXT: add.s32 %r3, %r1, 2; +; NO-SCALAR-PRE-NEXT: st.b32 [%rd1+12], %r3; +; NO-SCALAR-PRE-NEXT: ret; +; +; SCALAR-PRE-LABEL: test_scalar_pre_option( +; SCALAR-PRE: { +; SCALAR-PRE-NEXT: .reg .pred %p<2>; +; SCALAR-PRE-NEXT: .reg .b16 %rs<2>; +; SCALAR-PRE-NEXT: .reg .b32 %r<6>; +; SCALAR-PRE-NEXT: .reg .b64 %rd<2>; +; 
SCALAR-PRE-EMPTY: +; SCALAR-PRE-NEXT: // %bb.0: // %entry +; SCALAR-PRE-NEXT: ld.param.b8 %rs1, [test_scalar_pre_option_param_1]; +; SCALAR-PRE-NEXT: ld.param.b64 %rd1, [test_scalar_pre_option_param_0]; +; SCALAR-PRE-NEXT: setp.ne.b16 %p1, %rs1, 0; +; SCALAR-PRE-NEXT: ld.b32 %r1, [%rd1]; +; SCALAR-PRE-NEXT: @%p1 bra $L__BB0_2; +; SCALAR-PRE-NEXT: bra.uni $L__BB0_1; +; SCALAR-PRE-NEXT: $L__BB0_1: // %entry.if.end_crit_edge +; SCALAR-PRE-NEXT: add.s32 %r2, %r1, 2; +; SCALAR-PRE-NEXT: mov.b32 %r5, %r2; +; SCALAR-PRE-NEXT: bra.uni $L__BB0_3; +; SCALAR-PRE-NEXT: $L__BB0_2: // %if.then +; SCALAR-PRE-NEXT: add.s32 %r3, %r1, 2; +; SCALAR-PRE-NEXT: st.b32 [%rd1+8], %r3; +; SCALAR-PRE-NEXT: mov.b32 %r5, %r3; +; SCALAR-PRE-NEXT: bra.uni $L__BB0_3; +; SCALAR-PRE-NEXT: $L__BB0_3: // %if.end +; SCALAR-PRE-NEXT: mov.b32 %r4, %r5; +; SCALAR-PRE-NEXT: st.b32 [%rd1+12], %r4; +; SCALAR-PRE-NEXT: ret; +entry: + %tobool.not = icmp eq i8 %cond, 0 + %tmp7.pre = load i32, ptr %arr, align 4 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %add = add nsw i32 %tmp7.pre, 2 + %getElem = getelementptr inbounds nuw i8, ptr %arr, i64 8 + store i32 %add, ptr %getElem, align 4 + br label %if.end + +if.end: + %add8 = add nsw i32 %tmp7.pre, 2 + %getElem1 = getelementptr inbounds nuw i8, ptr %arr, i64 12 + store i32 %add8, ptr %getElem1, align 4 + ret void +} diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index 53a17975c512b..21da3eb33dd6d 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -31,8 +31,8 @@ ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-unroll<>,loop-unroll,loop-unroll)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-10 ; CHECK-10: function(loop-unroll,loop-unroll,loop-unroll) -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn,gvn)' < %s | FileCheck %s --match-full-lines 
--check-prefixes=CHECK-11 -; CHECK-11: function(gvn<>,gvn,gvn) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(gvn<>,gvn,gvn)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-11 +; CHECK-11: function(gvn<>,gvn,gvn) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(early-cse<>,early-cse)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-12 ; CHECK-12: function(early-cse<>,early-cse) diff --git a/llvm/test/Transforms/GVN/PRE/local-pre.ll b/llvm/test/Transforms/GVN/PRE/local-pre.ll index 7f465927f48b6..c67a5f1549f80 100644 --- a/llvm/test/Transforms/GVN/PRE/local-pre.ll +++ b/llvm/test/Transforms/GVN/PRE/local-pre.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -passes=gvn -enable-pre -S | FileCheck %s -; RUN: opt < %s -passes="gvn
" -enable-pre=false -S | FileCheck %s
+; RUN: opt < %s -passes=gvn -enable-scalar-pre -S | FileCheck %s
+; RUN: opt < %s -passes="gvn<scalar-pre>" -enable-scalar-pre=false -S | FileCheck %s
 
 declare void @may_exit() nounwind
 
diff --git a/llvm/test/Transforms/GVN/PRE/no-scalar-pre.ll b/llvm/test/Transforms/GVN/PRE/no-scalar-pre.ll
new file mode 100644
index 0000000000000..32eaa9a0e8a27
--- /dev/null
+++ b/llvm/test/Transforms/GVN/PRE/no-scalar-pre.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -enable-scalar-pre=false -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -enable-scalar-pre=true -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK-ENABLED
+; RUN: opt -passes='gvn<no-scalar-pre>' -S < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -passes='gvn<scalar-pre>' -S < %s | FileCheck %s --check-prefixes=CHECK-ENABLED
+
+define void @test_scalar_pre_option(ptr %arr, i8 %cond) {
+; CHECK-LABEL: define void @test_scalar_pre_option(
+; CHECK-SAME: ptr [[ARR:%.*]], i8 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[COND]], 0
+; CHECK-NEXT:    [[TMP7_PRE:%.*]] = load i32, ptr [[ARR]], align 4
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7_PRE]], 2
+; CHECK-NEXT:    [[GETELEM:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 8
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GETELEM]], align 4
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP7_PRE]], 2
+; CHECK-NEXT:    [[GETELEM1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 12
+; CHECK-NEXT:    store i32 [[ADD8]], ptr [[GETELEM1]], align 4
+; CHECK-NEXT:    ret void
+;
+; CHECK-ENABLED-LABEL: define void @test_scalar_pre_option(
+; CHECK-ENABLED-SAME: ptr [[ARR:%.*]], i8 [[COND:%.*]]) {
+; CHECK-ENABLED-NEXT:  [[ENTRY:.*:]]
+; CHECK-ENABLED-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[COND]], 0
+; CHECK-ENABLED-NEXT:    [[TMP7_PRE:%.*]] = load i32, ptr [[ARR]], align 4
+; CHECK-ENABLED-NEXT:    br i1 [[TOBOOL_NOT]], label %[[ENTRY_IF_END_CRIT_EDGE:.*]], label %[[IF_THEN:.*]]
+; CHECK-ENABLED:       [[ENTRY_IF_END_CRIT_EDGE]]:
+; CHECK-ENABLED-NEXT:    [[DOTPRE:%.*]] = add nsw i32 [[TMP7_PRE]], 2
+; CHECK-ENABLED-NEXT:    br label %[[IF_END:.*]]
+; CHECK-ENABLED:       [[IF_THEN]]:
+; CHECK-ENABLED-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7_PRE]], 2
+; CHECK-ENABLED-NEXT:    [[GETELEM:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 8
+; CHECK-ENABLED-NEXT:    store i32 [[ADD]], ptr [[GETELEM]], align 4
+; CHECK-ENABLED-NEXT:    br label %[[IF_END]]
+; CHECK-ENABLED:       [[IF_END]]:
+; CHECK-ENABLED-NEXT:    [[ADD8_PRE_PHI:%.*]] = phi i32 [ [[DOTPRE]], %[[ENTRY_IF_END_CRIT_EDGE]] ], [ [[ADD]], %[[IF_THEN]] ]
+; CHECK-ENABLED-NEXT:    [[GETELEM1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 12
+; CHECK-ENABLED-NEXT:    store i32 [[ADD8_PRE_PHI]], ptr [[GETELEM1]], align 4
+; CHECK-ENABLED-NEXT:    ret void
+;
+entry:
+  %tobool.not = icmp eq i8 %cond, 0
+  %tmp7.pre = load i32, ptr %arr, align 4
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %add = add nsw i32 %tmp7.pre, 2
+  %getElem = getelementptr inbounds nuw i8, ptr %arr, i64 8
+  store i32 %add, ptr %getElem, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %add8 = add nsw i32 %tmp7.pre, 2
+  %getElem1 = getelementptr inbounds nuw i8, ptr %arr, i64 12
+  store i32 %add8, ptr %getElem1, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll b/llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll
index 60611a032ded5..45e6bd1bf8b6b 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -enable-load-pre -enable-pre -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt -enable-load-pre -enable-pre -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA
+; RUN: opt -enable-load-pre -enable-scalar-pre -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP
+; RUN: opt -enable-load-pre -enable-scalar-pre -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA
 
 declare void @side_effect_0() nofree
 
diff --git a/llvm/test/Transforms/GVN/PRE/pre-basic-add.ll b/llvm/test/Transforms/GVN/PRE/pre-basic-add.ll
index 9bf64962ecb1f..92306015378cb 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-basic-add.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-basic-add.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=gvn -enable-pre -S | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -enable-pre -S | FileCheck %s --check-prefixes=CHECK,MSSA
-; RUN: opt < %s -passes="gvn<pre>" -enable-pre=false -S | FileCheck %s
+; RUN: opt < %s -passes=gvn -enable-scalar-pre -S | FileCheck %s --check-prefixes=CHECK,MDEP
+; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -enable-scalar-pre -S | FileCheck %s --check-prefixes=CHECK,MSSA
+; RUN: opt < %s -passes="gvn<scalar-pre>" -enable-scalar-pre=false -S | FileCheck %s
 
 @H = common global i32 0		;  [#uses=2]
 @G = common global i32 0		;  [#uses=1]
diff --git a/llvm/test/Transforms/GVN/PRE/pre-jt-add.ll b/llvm/test/Transforms/GVN/PRE/pre-jt-add.ll
index f62d06dbf0f84..c6470c1158e39 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-jt-add.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-jt-add.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=gvn,jump-threading -enable-pre -S | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>',jump-threading -enable-pre -S | FileCheck %s --check-prefixes=CHECK,MSSA
+; RUN: opt < %s -passes=gvn,jump-threading -enable-scalar-pre -S | FileCheck %s --check-prefixes=CHECK,MDEP
+; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>',jump-threading -enable-scalar-pre -S | FileCheck %s --check-prefixes=CHECK,MSSA
 
 @H = common global i32 0
 @G = common global i32 0
diff --git a/llvm/test/Transforms/GVN/PRE/pre-loop-load-new-pm.ll b/llvm/test/Transforms/GVN/PRE/pre-loop-load-new-pm.ll
index 4cd2e47b3c31f..0472500282f42 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-loop-load-new-pm.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-loop-load-new-pm.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -aa-pipeline=basic-aa -enable-load-pre -enable-pre -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt -aa-pipeline=basic-aa -enable-load-pre -enable-pre -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA
+; RUN: opt -aa-pipeline=basic-aa -enable-load-pre -enable-scalar-pre -passes=gvn -S < %s | FileCheck %s --check-prefixes=CHECK,MDEP
+; RUN: opt -aa-pipeline=basic-aa -enable-load-pre -enable-scalar-pre -passes='gvn<memoryssa;no-memdep>' -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA
 
 declare void @side_effect()
 declare i1 @side_effect_cond()
diff --git a/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll b/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll
index a9b69a49005e3..9cd6274100c2a 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -enable-load-pre -enable-pre -passes=lcssa,gvn -S < %s | FileCheck %s
+; RUN: opt -enable-load-pre -enable-scalar-pre -passes=lcssa,gvn -S < %s | FileCheck %s
 
 declare void @side_effect() nofree
 declare i1 @side_effect_cond() nofree
diff --git a/llvm/test/Transforms/GVN/PRE/pre-poison-add.ll b/llvm/test/Transforms/GVN/PRE/pre-poison-add.ll
index 32f149b881d72..a4ee356628f11 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-poison-add.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-poison-add.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=gvn -enable-pre -S | FileCheck %s --check-prefixes=CHECK,MDEP
-; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -enable-pre -S | FileCheck %s --check-prefixes=CHECK,MSSA
+; RUN: opt < %s -passes=gvn -enable-scalar-pre -S | FileCheck %s --check-prefixes=CHECK,MDEP
+; RUN: opt < %s -passes='gvn<memoryssa;no-memdep>' -enable-scalar-pre -S | FileCheck %s --check-prefixes=CHECK,MSSA
 
 @H = common global i32 0
 @G = common global i32 0

From 4bce216e6b550c770f2e536422c3d95333f65ba3 Mon Sep 17 00:00:00 2001
From: Derek Schuff 
Date: Tue, 12 May 2026 07:55:21 -0700
Subject: [PATCH 462/538] SymbolizableObjectFile: Fix Wasm test to avoid
 layering violation (#193574)

Tests for LLVM libraries should not require wasm-ld. It's not necessary
in this case to generate the binary at test time, so instead check in a
YAMLized pre-linked binary.
---
 llvm/test/tools/llvm-objdump/lit.local.cfg    |  4 --
 .../wasm/Inputs/line-numbers.yaml             | 72 +++++++++++++++++++
 .../tools/llvm-objdump/wasm/line-numbers.s    | 39 +++++-----
 3 files changed, 93 insertions(+), 22 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-objdump/lit.local.cfg
 create mode 100644 llvm/test/tools/llvm-objdump/wasm/Inputs/line-numbers.yaml

diff --git a/llvm/test/tools/llvm-objdump/lit.local.cfg b/llvm/test/tools/llvm-objdump/lit.local.cfg
deleted file mode 100644
index 21771693b720e..0000000000000
--- a/llvm/test/tools/llvm-objdump/lit.local.cfg
+++ /dev/null
@@ -1,4 +0,0 @@
-from lit.llvm import llvm_config
-
-if llvm_config.use_lld(required=False):
-    config.available_features.add("lld")
diff --git a/llvm/test/tools/llvm-objdump/wasm/Inputs/line-numbers.yaml b/llvm/test/tools/llvm-objdump/wasm/Inputs/line-numbers.yaml
new file mode 100644
index 0000000000000..e73bee0a20eca
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/wasm/Inputs/line-numbers.yaml
@@ -0,0 +1,72 @@
+--- !WASM
+FileHeader:
+  Version:         0x1
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - Index:           0
+        ParamTypes:      []
+        ReturnTypes:     []
+      - Index:           1
+        ParamTypes:
+          - I32
+        ReturnTypes:
+          - I32
+  - Type:            FUNCTION
+    FunctionTypes:   [ 0, 1 ]
+  - Type:            MEMORY
+    Memories:
+      - Minimum:         0x1
+  - Type:            GLOBAL
+    Globals:
+      - Index:           0
+        Type:            I32
+        Mutable:         true
+        InitExpr:
+          Opcode:          I32_CONST
+          Value:           65536
+  - Type:            EXPORT
+    Exports:
+      - Name:            memory
+        Kind:            MEMORY
+        Index:           0
+      - Name:            foo
+        Kind:            FUNCTION
+        Index:           0
+      - Name:            bar
+        Kind:            FUNCTION
+        Index:           1
+  - Type:            CODE
+    Functions:
+      - Index:           0
+        Locals:          []
+        Body:            010F0B
+      - Index:           1
+        Locals:          []
+        Body:            2000010F0B
+  - Type:            CUSTOM
+    Name:            .debug_info
+    Payload:         BB000000040000000000040100000000000000006C6C766D2D70726F6A6563742F6C6C766D2F746573742F746F6F6C732F6C6C766D2D6F626A64756D702F7761736D2F6C696E652D6E756D626572732E73002F622F6F626A64756D702D6A6574736B692F656D736372697074656E2D72656C6561736573006C6C766D2D6D6320286261736564206F6E204C4C564D2032332E302E306769742900018002666F6F00010000000E00000002000000026261720001000000150000000700000000
+  - Type:            CUSTOM
+    Name:            .debug_abbrev
+    Payload:         0111011017551703081B08250813050000020A0003083A063B061101000000
+  - Type:            CUSTOM
+    Name:            .debug_aranges
+    Payload:         '24000000020000000000040000000000020000000400000007000000060000000000000000000000'
+  - Type:            CUSTOM
+    Name:            .debug_ranges
+    Payload:         FFFFFFFF020000000000000004000000FFFFFFFF0700000000000000060000000000000000000000
+  - Type:            CUSTOM
+    Name:            .debug_line
+    Payload:         7E000000040055000000010101FB0E0D0001010101000000010000016C6C766D2D70726F6A6563742F6C6C766D2F746573742F746F6F6C732F6C6C766D2D6F626A64756D702F7761736D00006C696E652D6E756D626572732E73000100000000050202000000030F012F210201000101000502070000000316013D21210201000101
+  - Type:            CUSTOM
+    Name:            name
+    FunctionNames:
+      - Index:           0
+        Name:            foo
+      - Index:           1
+        Name:            bar
+    GlobalNames:
+      - Index:           0
+        Name:            __stack_pointer
+...
diff --git a/llvm/test/tools/llvm-objdump/wasm/line-numbers.s b/llvm/test/tools/llvm-objdump/wasm/line-numbers.s
index 829c79908bb12..d5c4b7966eab1 100644
--- a/llvm/test/tools/llvm-objdump/wasm/line-numbers.s
+++ b/llvm/test/tools/llvm-objdump/wasm/line-numbers.s
@@ -1,7 +1,10 @@
-# REQUIRES: webassembly-registered-target, lld
+# REQUIRES: webassembly-registered-target
 # RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj %s -o %t.o -g
-# RUN: wasm-ld %t.o -o %t.wasm --no-entry --export=foo --export=bar
 # RUN: llvm-objdump -d --line-numbers %t.o | FileCheck --check-prefix=OBJ %s
+
+# line-numbers.yaml was created by linking this object and converting to YAML:
+#  wasm-ld %t.o -o %t.wasm --no-entry --export=foo --export=bar
+# RUN: yaml2obj %S/Inputs/line-numbers.yaml -o %t.wasm
 # RUN: llvm-objdump -d --line-numbers %t.wasm | FileCheck --check-prefix=LINKED %s
 
 # This test mirrors test/tools/llvm-symbolizer/wasm-basic.s and tests that line
@@ -25,23 +28,23 @@ bar:
# OBJ:      <foo>:
 # OBJ-EMPTY:
 # OBJ-NEXT: ; foo():
-# OBJ-NEXT: ; {{.*}}line-numbers.s:13
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-15]]
 # OBJ-NEXT:        3: 01            nop
-# OBJ-NEXT: ; {{.*}}line-numbers.s:14
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-16]]
 # OBJ-NEXT:        4: 0f            return
-# OBJ-NEXT: ; {{.*}}line-numbers.s:15
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-17]]
 # OBJ-NEXT:        5: 0b            end
 
# OBJ:      <bar>:
 # OBJ-EMPTY:
 # OBJ-NEXT: ; bar():
-# OBJ-NEXT: ; {{.*}}line-numbers.s:20
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-18]]
 # OBJ-NEXT:        8: 20 00         local.get 0
-# OBJ-NEXT: ; {{.*}}line-numbers.s:21
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-19]]
 # OBJ-NEXT:        a: 01            nop
-# OBJ-NEXT: ; {{.*}}line-numbers.s:22
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-20]]
 # OBJ-NEXT:        b: 0f            return
-# OBJ-NEXT: ; {{.*}}line-numbers.s:23
+# OBJ-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-21]]
 # OBJ-NEXT:        c: 0b            end
 
 
@@ -54,21 +57,21 @@ bar:
# LINKED:      <foo>:
 # LINKED-EMPTY:
 # LINKED-NEXT: ; foo():
-# LINKED-NEXT: ; {{.*}}line-numbers.s:13
-# LINKED-NEXT:        44: 01            nop
-# LINKED-NEXT: ; {{.*}}line-numbers.s:14
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-44]]
+# LINKED-NEXT:        5c: 01            nop
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-45]]
 # LINKED-NEXT:        {{.*}}: 0f            return
-# LINKED-NEXT: ; {{.*}}line-numbers.s:15
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-46]]
 # LINKED-NEXT:        {{.*}}: 0b            end
 
# LINKED:      <bar>:
 # LINKED-EMPTY:
 # LINKED-NEXT: ; bar():
-# LINKED-NEXT: ; {{.*}}line-numbers.s:20
-# LINKED-NEXT:        49: 20 00         local.get 0
-# LINKED-NEXT: ; {{.*}}line-numbers.s:21
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-47]]
+# LINKED-NEXT:        61: 20 00         local.get 0
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-48]]
 # LINKED-NEXT:        {{.*}}: 01            nop
-# LINKED-NEXT: ; {{.*}}line-numbers.s:22
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-49]]
 # LINKED-NEXT:        {{.*}}: 0f            return
-# LINKED-NEXT: ; {{.*}}line-numbers.s:23
+# LINKED-NEXT: ; {{.*}}line-numbers.s:[[#@LINE-50]]
 # LINKED-NEXT:        {{.*}}: 0b            end

From 297e3e94c90ea75d72bcd881a300c852d8a4d725 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 12 May 2026 15:56:12 +0100
Subject: [PATCH 463/538] [X86] avx512-intrinsics-fast-isel.ll - add nounwind
 to remove cfi noise (#197207)

---
 .../X86/avx512-intrinsics-fast-isel.ll        | 878 ++++++++----------
 1 file changed, 407 insertions(+), 471 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 25632378cf23c..1e14256db3584 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -5,14 +5,11 @@
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
 
 
-define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
+define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) nounwind {
 ; X86-LABEL: test_mm512_kunpackb:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
@@ -24,7 +21,6 @@ define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -56,14 +52,11 @@ entry:
   ret i16 %13
 }
 
-define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
+define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) nounwind {
 ; X86-LABEL: test_mm512_kortestc:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
@@ -76,7 +69,6 @@ define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C,
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -105,14 +97,11 @@ entry:
   ret i32 %9
 }
 
-define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
+define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) nounwind {
 ; X86-LABEL: test_mm512_kortestz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
@@ -125,7 +114,6 @@ define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C,
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -155,7 +143,7 @@ entry:
   ret i32 %9
 }
 
-define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_shuffle_f32x4:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
@@ -166,7 +154,7 @@ entry:
 }
 
 
-define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_shuffle_f32x4:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -186,7 +174,7 @@ entry:
   ret <16 x float> %1
 }
 
-define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -206,7 +194,7 @@ entry:
   ret <16 x float> %1
 }
 
-define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_shuffle_f64x2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
@@ -216,7 +204,7 @@ entry:
   ret <8 x double> %shuffle
 }
 
-define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_shuffle_f64x2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -236,7 +224,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -256,7 +244,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_shuffle_i32x4:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
@@ -266,7 +254,7 @@ entry:
   ret <8 x i64> %shuffle
 }
 
-define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_shuffle_i32x4:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -289,7 +277,7 @@ entry:
   ret <8 x i64> %4
 }
 
-define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -311,7 +299,7 @@ entry:
   ret <8 x i64> %3
 }
 
-define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_shuffle_i64x2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
@@ -321,7 +309,7 @@ entry:
   ret <8 x i64> %shuffle
 }
 
-define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_shuffle_i64x2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -341,7 +329,7 @@ entry:
   ret <8 x i64> %1
 }
 
-define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -362,7 +350,7 @@ entry:
 }
 
 
-define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
+define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_testn_epi32_mask:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
@@ -378,7 +366,7 @@ entry:
   ret i16 %2
 }
 
-define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_testn_epi32_mask:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -407,7 +395,7 @@ entry:
   ret i16 %4
 }
 
-define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
+define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_testn_epi64_mask:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
@@ -422,7 +410,7 @@ entry:
   ret i8 %1
 }
 
-define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_testn_epi64_mask:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -450,7 +438,7 @@ entry:
   ret i8 %3
 }
 
-define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_test_epi32_mask:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -479,7 +467,7 @@ entry:
   ret i16 %4
 }
 
-define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_test_epi64_mask:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -507,7 +495,7 @@ entry:
   ret i8 %3
 }
 
-define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
+define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_set1_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -531,7 +519,7 @@ entry:
   ret <8 x i64> %3
 }
 
-define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A)  {
+define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_set1_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -554,7 +542,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
+define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_set1_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -577,7 +565,7 @@ entry:
   ret <8 x i64> %1
 }
 
-define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
+define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_set1_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -601,7 +589,7 @@ entry:
 }
 
 
-define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
+define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_broadcastd_epi32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
@@ -612,7 +600,7 @@ define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_broadcastd_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -634,7 +622,7 @@ define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
+define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -655,7 +643,7 @@ define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
+define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_broadcastq_epi64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
@@ -664,7 +652,7 @@ define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_broadcastq_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -683,7 +671,7 @@ define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -702,7 +690,7 @@ define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
   ret <8 x i64> %res1
 }
 
-define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_broadcastsd_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
@@ -711,7 +699,7 @@ define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
+define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_broadcastsd_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -730,7 +718,7 @@ define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -749,7 +737,7 @@ define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
   ret <8 x double> %res1
 }
 
-define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
+define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_broadcastss_ps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
@@ -758,7 +746,7 @@ define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
   ret <16 x float> %res
 }
 
-define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
+define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_broadcastss_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -777,7 +765,7 @@ define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
+define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_broadcastss_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -796,7 +784,7 @@ define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
   ret <16 x float> %res1
 }
 
-define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
+define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_movedup_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
@@ -805,7 +793,7 @@ define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_movedup_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -824,7 +812,7 @@ define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x d
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
+define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_movedup_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -843,7 +831,7 @@ define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
   ret <8 x double> %res1
 }
 
-define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
+define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_movehdup_ps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
@@ -852,7 +840,7 @@ define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
   ret <16 x float> %res
 }
 
-define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_movehdup_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -871,7 +859,7 @@ define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
+define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_movehdup_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -890,7 +878,7 @@ define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
+define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_moveldup_ps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
@@ -899,7 +887,7 @@ define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
   ret <16 x float> %res
 }
 
-define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_moveldup_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -918,7 +906,7 @@ define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
+define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_moveldup_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -937,7 +925,7 @@ define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
   ret <16 x float> %res1
 }
 
-define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
+define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_permute_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
@@ -946,7 +934,7 @@ define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_permute_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -965,7 +953,7 @@ define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x d
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
+define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_permute_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -984,7 +972,7 @@ define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
   ret <8 x double> %res1
 }
 
-define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
+define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_permute_ps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
@@ -993,7 +981,7 @@ define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
   ret <16 x float> %res
 }
 
-define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_permute_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1012,7 +1000,7 @@ define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
+define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_permute_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1031,7 +1019,7 @@ define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
   ret <16 x float> %res1
 }
 
-define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
+define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_permutex_epi64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
@@ -1040,7 +1028,7 @@ define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
+define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_permutex_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1059,7 +1047,7 @@ define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
+define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_permutex_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1078,7 +1066,7 @@ define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
   ret <8 x i64> %res1
 }
 
-define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
+define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_permutex_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
@@ -1087,7 +1075,7 @@ define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_permutex_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1106,7 +1094,7 @@ define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
+define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_permutex_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1125,7 +1113,7 @@ define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
   ret <8 x double> %res1
 }
 
-define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
+define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) nounwind {
 ; CHECK-LABEL: test_mm512_shuffle_epi32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
@@ -1136,7 +1124,7 @@ define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
+define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_mask_shuffle_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1158,7 +1146,7 @@ define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
+define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) nounwind {
 ; X86-LABEL: test_mm512_maskz_shuffle_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1179,7 +1167,7 @@ define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
   ret <8 x i64> %res2
 }
 
-define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
+define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_shuffle_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -1188,7 +1176,7 @@ define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_shuffle_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1207,7 +1195,7 @@ define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x d
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_shuffle_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1226,7 +1214,7 @@ define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x
   ret <8 x double> %res1
 }
 
-define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpackhi_epi32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
@@ -1238,7 +1226,7 @@ define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpackhi_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1261,7 +1249,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i6
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1283,7 +1271,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpackhi_epi64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
@@ -1292,7 +1280,7 @@ define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpackhi_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1311,7 +1299,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1330,7 +1318,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i6
   ret <8 x i64> %res1
 }
 
-define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
+define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpackhi_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
@@ -1339,7 +1327,7 @@ define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1)
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpackhi_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1358,7 +1346,7 @@ define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpackhi_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1377,7 +1365,7 @@ define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x
   ret <8 x double> %res1
 }
 
-define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
+define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpackhi_ps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
@@ -1386,7 +1374,7 @@ define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1)
   ret <16 x float> %res
 }
 
-define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
+define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpackhi_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1405,7 +1393,7 @@ define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpackhi_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1424,7 +1412,7 @@ define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16
   ret <16 x float> %res1
 }
 
-define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpacklo_epi32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -1436,7 +1424,7 @@ define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpacklo_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1459,7 +1447,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i6
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1481,7 +1469,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i
   ret <8 x i64> %res2
 }
 
-define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -1490,7 +1478,7 @@ define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1509,7 +1497,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64
   ret <8 x i64> %res1
 }
 
-define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1528,7 +1516,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i6
   ret <8 x i64> %res1
 }
 
-define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
+define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpacklo_pd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -1537,7 +1525,7 @@ define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1)
   ret <8 x double> %res
 }
 
-define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpacklo_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1556,7 +1544,7 @@ define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x
   ret <8 x double> %res1
 }
 
-define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpacklo_pd:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -1575,7 +1563,7 @@ define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x
   ret <8 x double> %res1
 }
 
-define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
+define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) nounwind {
 ; CHECK-LABEL: test_mm512_unpacklo_ps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -1584,7 +1572,7 @@ define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1)
   ret <16 x float> %res
 }
 
-define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
+define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) nounwind {
 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1603,7 +1591,7 @@ define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16
   ret <16 x float> %res1
 }
 
-define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) nounwind {
 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1818,7 +1806,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
+define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) nounwind {
 ; X86-LABEL: test_mm_cvtu32_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -1834,7 +1822,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
+define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) nounwind {
 ; X86-LABEL: test_mm_cvtu64_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -1856,7 +1844,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
+define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) nounwind {
 ; X86-LABEL: test_mm_cvtu32_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -1872,14 +1860,11 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
+define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) nounwind {
 ; X86-LABEL: test_mm_cvtu64_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
@@ -1894,7 +1879,6 @@ define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cvtu64_ss:
@@ -1907,7 +1891,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) {
+define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_cvtph_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0
@@ -1919,7 +1903,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) {
+define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_cvtph_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1941,7 +1925,7 @@ entry:
   ret <16 x float> %4
 }
 
-define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) {
+define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_cvtph_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -1963,7 +1947,7 @@ entry:
   ret <16 x float> %4
 }
 
-define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
+define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_cvtps_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
@@ -1973,7 +1957,7 @@ entry:
   ret <8 x double> %conv.i
 }
 
-define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
+define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_cvtpslo_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
@@ -1984,7 +1968,7 @@ entry:
   ret <8 x double> %conv.i.i
 }
 
-define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
+define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_cvtps_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2004,7 +1988,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
+define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_cvtpslo_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2025,7 +2009,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
+define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_cvtps_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2045,7 +2029,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
+define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_cvtepi32_epi8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
@@ -2058,7 +2042,7 @@ entry:
   ret <2 x i64> %1
 }
 
-define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
+define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2081,7 +2065,7 @@ entry:
   ret <2 x i64> %3
 }
 
-define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
+define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2103,7 +2087,7 @@ entry:
   ret <2 x i64> %2
 }
 
-define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
+define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_cvtepi64_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
@@ -2114,7 +2098,7 @@ entry:
   ret <4 x i64> %0
 }
 
-define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
+define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2136,7 +2120,7 @@ entry:
   ret <4 x i64> %3
 }
 
-define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
+define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2157,7 +2141,7 @@ entry:
   ret <4 x i64> %2
 }
 
-define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
+define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_cvtepi64_epi16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
@@ -2169,7 +2153,7 @@ entry:
   ret <2 x i64> %0
 }
 
-define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
+define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2191,7 +2175,7 @@ entry:
   ret <2 x i64> %2
 }
 
-define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
+define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2215,7 +2199,7 @@ entry:
 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
 
-define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
+define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_ternarylogic_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 & ~(zmm0 | zmm2)
@@ -2231,7 +2215,7 @@ entry:
 
 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
 
-define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
+define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2255,7 +2239,7 @@ entry:
   ret <8 x i64> %6
 }
 
-define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
+define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2279,7 +2263,7 @@ entry:
   ret <8 x i64> %6
 }
 
-define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
+define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_ternarylogic_epi64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 & ~(zmm0 | zmm2)
@@ -2291,7 +2275,7 @@ entry:
 
 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1
 
-define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
+define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2311,7 +2295,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
+define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2333,7 +2317,7 @@ entry:
 
 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
 
-define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2361,7 +2345,7 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
 
-define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask2_permutex2var_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2386,7 +2370,7 @@ entry:
 
 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
 
-define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2412,7 +2396,7 @@ entry:
 
 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
 
-define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2434,7 +2418,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_permutex2var_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
@@ -2448,7 +2432,7 @@ entry:
   ret <8 x i64> %4
 }
 
-define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2472,7 +2456,7 @@ entry:
   ret <8 x i64> %6
 }
 
-define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2496,7 +2480,7 @@ entry:
   ret <8 x i64> %6
 }
 
-define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
+define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_permutex2var_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
@@ -2506,7 +2490,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_permutex2var_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2526,7 +2510,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
+define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_permutex2var_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2546,7 +2530,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
+define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_permutex2var_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
@@ -2557,7 +2541,7 @@ entry:
   ret <16 x float> %1
 }
 
-define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2578,7 +2562,7 @@ entry:
   ret <16 x float> %3
 }
 
-define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
+define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2599,7 +2583,7 @@ entry:
   ret <16 x float> %3
 }
 
-define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_permutex2var_epi64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
@@ -2609,7 +2593,7 @@ entry:
   ret <8 x i64> %0
 }
 
-define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_permutex2var_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2629,7 +2613,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2648,7 +2632,7 @@ entry:
   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   ret <8 x i64> %2
 }
-define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_add_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2673,7 +2657,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_add_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2697,7 +2681,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_add_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2722,7 +2706,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_add_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2746,7 +2730,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_sub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2771,7 +2755,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_sub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2795,7 +2779,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_sub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2820,7 +2804,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_sub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2844,7 +2828,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_mul_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2869,7 +2853,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_mul_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2893,7 +2877,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_mul_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2918,7 +2902,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_mul_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2942,7 +2926,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_div_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2967,7 +2951,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_div_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -2991,7 +2975,7 @@ entry:
   ret <4 x float> %6
 }
 
-define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_div_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3016,7 +3000,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_maskz_div_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3041,7 +3025,7 @@ entry:
 }
 
 
-define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmadd_round_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -3053,7 +3037,7 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
 
-define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3073,7 +3057,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3095,7 +3079,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3115,7 +3099,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_fmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
@@ -3133,7 +3117,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3154,7 +3138,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3175,7 +3159,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_fnmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
@@ -3193,7 +3177,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3216,7 +3200,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3237,7 +3221,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -3252,7 +3236,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3274,7 +3258,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmadd_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
@@ -3284,7 +3268,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3304,7 +3288,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3326,7 +3310,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3346,7 +3330,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_fmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
@@ -3364,7 +3348,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3385,7 +3369,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3406,7 +3390,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_fnmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
@@ -3424,7 +3408,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3447,7 +3431,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3468,7 +3452,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -3483,7 +3467,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -3505,7 +3489,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmadd_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -3517,7 +3501,7 @@ entry:
 
 declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
 
-define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3537,7 +3521,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3559,7 +3543,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3579,7 +3563,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_fmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
@@ -3597,7 +3581,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3618,7 +3602,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3639,7 +3623,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_fnmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
@@ -3657,7 +3641,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3680,7 +3664,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3701,7 +3685,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -3716,7 +3700,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3738,7 +3722,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmadd_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
@@ -3748,7 +3732,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3768,7 +3752,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3790,7 +3774,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3810,7 +3794,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_fmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
@@ -3828,7 +3812,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3849,7 +3833,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3870,7 +3854,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_fnmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
@@ -3888,7 +3872,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3911,7 +3895,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3932,7 +3916,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -3947,7 +3931,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fnmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -3969,7 +3953,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -3981,7 +3965,7 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
 
-define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4001,7 +3985,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4023,7 +4007,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4043,7 +4027,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_fmsubadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
@@ -4061,7 +4045,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4082,7 +4066,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4103,7 +4087,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmaddsub_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
@@ -4116,7 +4100,7 @@ entry:
   ret <8 x double> %3
 }
 
-define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4139,7 +4123,7 @@ entry:
   ret <8 x double> %5
 }
 
-define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4164,7 +4148,7 @@ entry:
   ret <8 x double> %5
 }
 
-define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4187,7 +4171,7 @@ entry:
   ret <8 x double> %5
 }
 
-define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmsubadd_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
@@ -4200,7 +4184,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsubadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4223,7 +4207,7 @@ entry:
   ret <8 x double> %4
 }
 
-define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4246,7 +4230,7 @@ entry:
   ret <8 x double> %4
 }
 
-define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -4258,7 +4242,7 @@ entry:
 
 declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
 
-define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4278,7 +4262,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4300,7 +4284,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4320,7 +4304,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_fmsubadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
@@ -4338,7 +4322,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4359,7 +4343,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4380,7 +4364,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmaddsub_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
@@ -4393,7 +4377,7 @@ entry:
   ret <16 x float> %3
 }
 
-define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4416,7 +4400,7 @@ entry:
   ret <16 x float> %5
 }
 
-define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4441,7 +4425,7 @@ entry:
   ret <16 x float> %5
 }
 
-define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4464,7 +4448,7 @@ entry:
   ret <16 x float> %5
 }
 
-define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; CHECK-LABEL: test_mm512_fmsubadd_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
@@ -4477,7 +4461,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4500,7 +4484,7 @@ entry:
   ret <16 x float> %4
 }
 
-define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4523,7 +4507,7 @@ entry:
   ret <16 x float> %4
 }
 
-define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4546,7 +4530,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4569,7 +4553,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4592,7 +4576,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4615,7 +4599,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4638,7 +4622,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4663,7 +4647,7 @@ entry:
   ret <8 x double> %4
 }
 
-define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4686,7 +4670,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4711,7 +4695,7 @@ entry:
   ret <16 x float> %4
 }
 
-define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4732,7 +4716,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmadd_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4753,7 +4737,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4774,7 +4758,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4795,7 +4779,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4817,7 +4801,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4841,7 +4825,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
+define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4863,7 +4847,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
+define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmsub_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -4887,7 +4871,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4909,7 +4893,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4933,7 +4917,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
+define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) nounwind {
 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4955,7 +4939,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
+define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -4979,7 +4963,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmadd_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5005,7 +4989,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmadd_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5032,7 +5016,7 @@ entry:
 
 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1
 
-define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmadd_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5057,7 +5041,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmadd_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5082,7 +5066,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmadd_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5110,7 +5094,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmadd_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5137,7 +5121,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmsub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5164,7 +5148,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5190,7 +5174,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmsub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5216,7 +5200,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5242,7 +5226,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmsub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5271,7 +5255,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5299,7 +5283,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmadd_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5326,7 +5310,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5352,7 +5336,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5378,7 +5362,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5404,7 +5388,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5433,7 +5417,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5461,7 +5445,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmsub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5489,7 +5473,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5516,7 +5500,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5543,7 +5527,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
+define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5570,7 +5554,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5600,7 +5584,7 @@ entry:
   ret <4 x float> %vecins.i
 }
 
-define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
+define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5629,7 +5613,7 @@ entry:
   ret <4 x float> %7
 }
 
-define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmadd_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5655,7 +5639,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5682,7 +5666,7 @@ entry:
 
 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1
 
-define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmadd_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5707,7 +5691,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5732,7 +5716,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmadd_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5760,7 +5744,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5787,7 +5771,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmsub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5814,7 +5798,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5840,7 +5824,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmsub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5866,7 +5850,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5892,7 +5876,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmsub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5921,7 +5905,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5949,7 +5933,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmadd_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -5976,7 +5960,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6002,7 +5986,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6028,7 +6012,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6054,7 +6038,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6083,7 +6067,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6111,7 +6095,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmsub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6139,7 +6123,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) nounwind {
 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6166,7 +6150,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6193,7 +6177,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
+define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) nounwind {
 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6220,7 +6204,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6250,7 +6234,7 @@ entry:
   ret <2 x double> %vecins.i
 }
 
-define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
+define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) nounwind {
 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6279,7 +6263,7 @@ entry:
   ret <2 x double> %7
 }
 
-define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) {
+define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6299,7 +6283,7 @@ entry:
   ret <8 x i64> %1
 }
 
-define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, ptr readonly %__P) {
+define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6319,7 +6303,7 @@ entry:
   ret <8 x i64> %1
 }
 
-define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, ptr readonly %__P) {
+define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6339,7 +6323,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, ptr readonly %__P) {
+define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6359,7 +6343,7 @@ entry:
   ret <8 x double> %1
 }
 
-define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, ptr readonly %__P) {
+define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6381,7 +6365,7 @@ entry:
   ret <8 x i64> %3
 }
 
-define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, ptr readonly %__P) {
+define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6402,7 +6386,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, ptr readonly %__P) {
+define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6422,7 +6406,7 @@ entry:
   ret <16 x float> %1
 }
 
-define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, ptr readonly %__P) {
+define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, ptr readonly %__P) nounwind {
 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6442,7 +6426,7 @@ entry:
   ret <16 x float> %1
 }
 
-define void @test_mm512_mask_compressstoreu_pd(ptr %__P, i8 zeroext %__U, <8 x double> %__A) {
+define void @test_mm512_mask_compressstoreu_pd(ptr %__P, i8 zeroext %__U, <8 x double> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6464,7 +6448,7 @@ entry:
   ret void
 }
 
-define void @test_mm512_mask_compressstoreu_epi64(ptr %__P, i8 zeroext %__U, <8 x i64> %__A) {
+define void @test_mm512_mask_compressstoreu_epi64(ptr %__P, i8 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6486,7 +6470,7 @@ entry:
   ret void
 }
 
-define void @test_mm512_mask_compressstoreu_ps(ptr %__P, i16 zeroext %__U, <16 x float> %__A) {
+define void @test_mm512_mask_compressstoreu_ps(ptr %__P, i16 zeroext %__U, <16 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -6508,7 +6492,7 @@ entry:
   ret void
 }
 
-define void @test_mm512_mask_compressstoreu_epi32(ptr %__P, i16 zeroext %__U, <8 x i64> %__A) {
+define void @test_mm512_mask_compressstoreu_epi32(ptr %__P, i16 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -6531,7 +6515,7 @@ entry:
   ret void
 }
 
-define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_add_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6561,7 +6545,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_mul_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6633,7 +6617,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_or_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6663,7 +6647,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_and_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6693,7 +6677,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_add_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6730,7 +6714,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6811,7 +6795,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_and_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6850,7 +6834,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_or_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -6887,7 +6871,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_add_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6907,7 +6891,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_mul_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6927,7 +6911,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_or_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6947,7 +6931,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_and_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -6967,7 +6951,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_add_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -7008,7 +6992,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -7051,7 +7035,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_and_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -7094,7 +7078,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_or_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -7135,14 +7119,11 @@ entry:
   ret i32 %vecext.i
 }
 
-define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
+define double @test_mm512_reduce_add_pd(<8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_add_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
@@ -7155,7 +7136,6 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7176,14 +7156,11 @@ entry:
   ret double %vecext.i
 }
 
-define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
+define double @test_mm512_reduce_mul_pd(<8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_mul_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
@@ -7196,7 +7173,6 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7217,11 +7193,10 @@ entry:
   ret double %vecext.i
 }
 
-define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
+define float @test_mm512_reduce_add_ps(<16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_add_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -7233,7 +7208,6 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7256,11 +7230,10 @@ entry:
   ret float %vecext.i
 }
 
-define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
+define float @test_mm512_reduce_mul_ps(<16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_mul_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; X86-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -7272,7 +7245,6 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7295,14 +7267,11 @@ entry:
   ret float %vecext.i
 }
 
-define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
+define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_add_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movzbl 8(%ebp), %eax
@@ -7318,7 +7287,6 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7343,14 +7311,11 @@ entry:
   ret double %vecext.i
 }
 
-define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
+define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_mul_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movzbl 8(%ebp), %eax
@@ -7367,7 +7332,6 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7393,11 +7357,10 @@ entry:
   ret double %vecext.i
 }
 
-define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
+define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_add_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
@@ -7412,7 +7375,6 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7439,11 +7401,10 @@ entry:
   ret float %vecext.i
 }
 
-define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
+define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_mul_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
@@ -7459,7 +7420,6 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7487,7 +7447,7 @@ entry:
   ret float %vecext.i
 }
 
-define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_max_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -7517,7 +7477,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_max_epu64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -7547,14 +7507,11 @@ entry:
   ret i64 %vecext.i
 }
 
-define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
+define double @test_mm512_reduce_max_pd(<8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_max_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
@@ -7567,7 +7524,6 @@ define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7586,7 +7542,7 @@ entry:
   ret double %vecext.i
 }
 
-define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_min_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -7616,7 +7572,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
+define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_min_epu64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -7646,14 +7602,11 @@ entry:
   ret i64 %vecext.i
 }
 
-define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
+define double @test_mm512_reduce_min_pd(<8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_min_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
@@ -7666,7 +7619,6 @@ define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7685,7 +7637,7 @@ entry:
   ret double %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_max_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -7724,7 +7676,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_max_epu64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -7761,14 +7713,11 @@ entry:
   ret i64 %vecext.i
 }
 
-define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
+define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_max_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movzbl 8(%ebp), %eax
@@ -7785,7 +7734,6 @@ define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7809,7 +7757,7 @@ entry:
   ret double %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_min_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -7848,7 +7796,7 @@ entry:
   ret i64 %vecext.i
 }
 
-define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
+define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_min_epu64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -7887,14 +7835,11 @@ entry:
   ret i64 %vecext.i
 }
 
-define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
+define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_min_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movzbl 8(%ebp), %eax
@@ -7911,7 +7856,6 @@ define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -7935,7 +7879,7 @@ entry:
   ret double %vecext.i
 }
 
-define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_max_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -7955,7 +7899,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_max_epu32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -7975,11 +7919,10 @@ entry:
   ret i32 %vecext.i
 }
 
-define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
+define float @test_mm512_reduce_max_ps(<16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_max_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; X86-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -7991,7 +7934,6 @@ define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -8012,7 +7954,7 @@ entry:
   ret float %vecext.i
 }
 
-define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_min_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -8032,7 +7974,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
+define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) nounwind {
 ; CHECK-LABEL: test_mm512_reduce_min_epu32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -8052,11 +7994,10 @@ entry:
   ret i32 %vecext.i
 }
 
-define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
+define float @test_mm512_reduce_min_ps(<16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_reduce_min_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; X86-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -8068,7 +8009,6 @@ define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -8089,7 +8029,7 @@ entry:
   ret float %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_max_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8132,7 +8072,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_max_epu32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8173,11 +8113,10 @@ entry:
   ret i32 %vecext.i
 }
 
-define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
+define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_max_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
@@ -8193,7 +8132,6 @@ define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -8219,7 +8157,7 @@ entry:
   ret float %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_min_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8262,7 +8200,7 @@ entry:
   ret i32 %vecext.i
 }
 
-define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
+define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_min_epu32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8305,11 +8243,10 @@ entry:
   ret i32 %vecext.i
 }
 
-define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
+define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) nounwind {
 ; X86-LABEL: test_mm512_mask_reduce_min_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
@@ -8325,7 +8262,6 @@ define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -8351,7 +8287,7 @@ entry:
   ret float %vecext.i
 }
 
-define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_max_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8371,7 +8307,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_max_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8391,7 +8327,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_max_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8411,7 +8347,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_max_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8433,7 +8369,7 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
 
-define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_max_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8453,7 +8389,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_max_round_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
@@ -8463,7 +8399,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_max_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8483,7 +8419,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_max_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8505,7 +8441,7 @@ entry:
 
 declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
 
-define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_max_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8525,7 +8461,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_max_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
@@ -8535,7 +8471,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_min_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8555,7 +8491,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_min_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8575,7 +8511,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_min_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8597,7 +8533,7 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
 
-define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_min_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8617,7 +8553,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
+define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_min_round_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
@@ -8627,7 +8563,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_min_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8647,7 +8583,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_min_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8667,7 +8603,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_min_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8689,7 +8625,7 @@ entry:
 
 declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
 
-define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_min_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8709,7 +8645,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
+define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_min_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
@@ -8719,7 +8655,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
+define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) nounwind {
 ; CHECK-LABEL: test_mm512_sqrt_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
@@ -8729,7 +8665,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
+define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_sqrt_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8749,7 +8685,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
+define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_sqrt_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8769,7 +8705,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
+define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_sqrt_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8791,7 +8727,7 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)
 
-define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
+define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -8811,7 +8747,7 @@ entry:
   ret <8 x double> %2
 }
 
-define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
+define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_sqrt_round_pd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0
@@ -8821,7 +8757,7 @@ entry:
   ret <8 x double> %0
 }
 
-define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
+define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) nounwind {
 ; CHECK-LABEL: test_mm512_sqrt_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
@@ -8831,7 +8767,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
+define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_sqrt_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8851,7 +8787,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
+define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_sqrt_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8871,7 +8807,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
+define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_sqrt_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8893,7 +8829,7 @@ entry:
 
 declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)
 
-define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
+define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8913,7 +8849,7 @@ entry:
   ret <16 x float> %2
 }
 
-define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
+define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_sqrt_round_ps:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0
@@ -8923,7 +8859,7 @@ entry:
   ret <16 x float> %0
 }
 
-define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
+define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_rol_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprold $5, %zmm0, %zmm0
@@ -8935,7 +8871,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_rol_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8958,7 +8894,7 @@ entry:
   ret <8 x i64> %5
 }
 
-define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_rol_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -8980,7 +8916,7 @@ entry:
   ret <8 x i64> %4
 }
 
-define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
+define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_rol_epi64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprolq $5, %zmm0, %zmm0
@@ -8990,7 +8926,7 @@ entry:
   ret <8 x i64> %0
 }
 
-define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_rol_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9010,7 +8946,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_rol_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9030,7 +8966,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_rolv_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
@@ -9043,7 +8979,7 @@ entry:
   ret <8 x i64> %3
 }
 
-define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_rolv_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -9067,7 +9003,7 @@ entry:
   ret <8 x i64> %6
 }
 
-define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_rolv_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -9090,7 +9026,7 @@ entry:
   ret <8 x i64> %5
 }
 
-define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_rolv_epi64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
@@ -9100,7 +9036,7 @@ entry:
   ret <8 x i64> %0
 }
 
-define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_rolv_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9120,7 +9056,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_rolv_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9140,7 +9076,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
+define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_ror_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprord $5, %zmm0, %zmm0
@@ -9153,7 +9089,7 @@ entry:
 }
 
 
-define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_ror_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -9176,7 +9112,7 @@ entry:
   ret <8 x i64> %5
 }
 
-define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_ror_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -9198,7 +9134,7 @@ entry:
   ret <8 x i64> %4
 }
 
-define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
+define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) nounwind {
 ; CHECK-LABEL: test_mm512_ror_epi64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprorq $5, %zmm0, %zmm0
@@ -9208,7 +9144,7 @@ entry:
   ret <8 x i64> %0
 }
 
-define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_mask_ror_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9228,7 +9164,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
+define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) nounwind {
 ; X86-LABEL: test_mm512_maskz_ror_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9248,7 +9184,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_rorv_epi32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
@@ -9261,7 +9197,7 @@ entry:
   ret <8 x i64> %3
 }
 
-define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_rorv_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -9285,7 +9221,7 @@ entry:
   ret <8 x i64> %6
 }
 
-define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_rorv_epi32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -9308,7 +9244,7 @@ entry:
   ret <8 x i64> %5
 }
 
-define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_rorv_epi64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
@@ -9318,7 +9254,7 @@ entry:
   ret <8 x i64> %0
 }
 
-define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_mask_rorv_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -9338,7 +9274,7 @@ entry:
   ret <8 x i64> %2
 }
 
-define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; X86-LABEL: test_mm512_maskz_rorv_epi64:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax

From 5977cb59bda303e1ca98e7e7edc961fdb8160ab7 Mon Sep 17 00:00:00 2001
From: nataliakokoromyti 
Date: Tue, 12 May 2026 07:57:38 -0700
Subject: [PATCH 464/538] [clangd] Avoid crash on pseudo-destructor selection
 (#195939)

clangd crashes during textDocument/codeAction on valid pseudo-destructor
expressions like y->~decltype(A())(). The bug is in
Selection.cpp::earlySourceRange(), which assumes destructor names always
have NamedTypeInfo. The fix is adding null checks before calling
getTypeLoc().

Fixes #195788.
---
 clang-tools-extra/clangd/Selection.cpp         | 14 ++++++++++----
 .../clangd/unittests/SelectionTests.cpp        | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp
index b79ffc7d5a6e9..21c9e71d3db65 100644
--- a/clang-tools-extra/clangd/Selection.cpp
+++ b/clang-tools-extra/clangd/Selection.cpp
@@ -895,13 +895,19 @@ class SelectionVisitor : public RecursiveASTVisitor {
     // rather than the TypeLoc nested inside it.
     // We still traverse the TypeLoc, because it may contain other targeted
     // things like the T in ~Foo().
-    if (const auto *CDD = N.get())
-      return CDD->getNameInfo().getNamedTypeInfo()->getTypeLoc().getBeginLoc();
+    // FIXME: Investigate if getNamedTypeInfo() can still return null for
+    // invalid cases, and drop these checks when it never returns null.
+    if (const auto *CDD = N.get()) {
+      if (auto *TypeInfo = CDD->getNameInfo().getNamedTypeInfo())
+        return TypeInfo->getTypeLoc().getBeginLoc();
+    }
     if (const auto *ME = N.get()) {
       auto NameInfo = ME->getMemberNameInfo();
       if (NameInfo.getName().getNameKind() ==
-          DeclarationName::CXXDestructorName)
-        return NameInfo.getNamedTypeInfo()->getTypeLoc().getBeginLoc();
+          DeclarationName::CXXDestructorName) {
+        if (auto *TypeInfo = NameInfo.getNamedTypeInfo())
+          return TypeInfo->getTypeLoc().getBeginLoc();
+      }
     }
 
     return SourceRange();
diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
index 5e897fae79df4..396990e6ea929 100644
--- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
@@ -665,6 +665,24 @@ TEST(SelectionTest, InjectedClassName) {
   EXPECT_FALSE(D->isInjectedClassName());
 }
 
+TEST(SelectionTest, PseudoDestructorMissingTypeInfo) {
+  llvm::StringLiteral Code = R"cpp(
+    struct A { ~A(); };
+    void b(const A *y) {
+      y->~decltype(A())();
+    }
+  )cpp";
+  auto AST = TestTU::withCode(Code).build();
+  bool Seen = false;
+  // No crash.
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(), 0,
+                            Code.size(), [&](SelectionTree) {
+                              Seen = true;
+                              return true;
+                            });
+  EXPECT_TRUE(Seen);
+}
+
 TEST(SelectionTree, Metrics) {
   const char *Code = R"cpp(
     // error-ok: testing behavior on recovery expression

From f48026b10efe2c22cafda6f8def385b577ef8dd7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 12 May 2026 16:02:13 +0100
Subject: [PATCH 465/538] [DAG] canCreateUndefOrPoison -
 fmaxnum/fminnum/fmaximum/fminimum/fmaximumnum/fminimumnum don't create poison
 (#197195)

Test coverage is proving tricky due to lack of folds that work with these - I'm open to suggestions if we don't want to just eyeball this.
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f0b03b89ec1d9..9af378ec28868 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5964,6 +5964,14 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::FMAD:
   case ISD::FMULADD:
   case ISD::FP_EXTEND:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT:
   case ISD::TRUNCATE_SSAT_S:

From 350536e1bea9cfa2cc8c252b6f0ccea2890af7bf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 12 May 2026 16:04:46 +0100
Subject: [PATCH 466/538] [X86] add llvm.vector.reduce.fminimum test coverage
 (#197210)

---
 .../CodeGen/X86/vector-reduce-fminimum.ll     | 1252 +++++++++++++++++
 1 file changed, 1252 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/vector-reduce-fminimum.ll

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fminimum.ll b/llvm/test/CodeGen/X86/vector-reduce-fminimum.ll
new file mode 100644
index 0000000000000..2c85839f3a266
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-reduce-fminimum.ll
@@ -0,0 +1,1252 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL
+
+;
+; vXf32
+;
+
+define float @test_v1f32(<1 x float> %a0) {
+; ALL-LABEL: test_v1f32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    retq
+  %1 = call float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a0)
+  ret float %1
+}
+
+define float @test_v2f32(<2 x float> %a0) {
+; SSE2-LABEL: test_v2f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    andps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    minss %xmm3, %xmm4
+; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm1
+; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movaps %xmm0, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    movaps %xmm1, %xmm3
+; SSE41-NEXT:    minss %xmm0, %xmm3
+; SSE41-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    andps %xmm1, %xmm2
+; SSE41-NEXT:    orps %xmm3, %xmm2
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: test_v2f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vminss %xmm2, %xmm0, %xmm2
+; AVX2-NEXT:    vorps %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    vandps %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512BW-NEXT:    vminss %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT:    vorps %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
+; AVX512VL-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+  %1 = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a0)
+  ret float %1
+}
+
+define float @test_v4f32(<4 x float> %a0) {
+; SSE2-LABEL: test_v4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    movaps %xmm0, %xmm6
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm6
+; SSE2-NEXT:    movaps %xmm6, %xmm7
+; SSE2-NEXT:    andps %xmm0, %xmm6
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
+; SSE2-NEXT:    minss %xmm3, %xmm5
+; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    orps %xmm4, %xmm5
+; SSE2-NEXT:    andnps %xmm5, %xmm7
+; SSE2-NEXT:    orps %xmm7, %xmm6
+; SSE2-NEXT:    movaps %xmm6, %xmm4
+; SSE2-NEXT:    cmpunordss %xmm6, %xmm4
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    andps %xmm6, %xmm0
+; SSE2-NEXT:    movaps %xmm6, %xmm5
+; SSE2-NEXT:    andps %xmm3, %xmm5
+; SSE2-NEXT:    minss %xmm1, %xmm6
+; SSE2-NEXT:    orps %xmm5, %xmm6
+; SSE2-NEXT:    andnps %xmm6, %xmm4
+; SSE2-NEXT:    orps %xmm0, %xmm4
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    andps %xmm4, %xmm1
+; SSE2-NEXT:    andps %xmm4, %xmm3
+; SSE2-NEXT:    minss %xmm2, %xmm4
+; SSE2-NEXT:    orps %xmm3, %xmm4
+; SSE2-NEXT:    andnps %xmm4, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movaps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm0, %xmm3
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    movaps %xmm1, %xmm4
+; SSE41-NEXT:    andps %xmm2, %xmm4
+; SSE41-NEXT:    movaps %xmm1, %xmm5
+; SSE41-NEXT:    minss %xmm0, %xmm5
+; SSE41-NEXT:    orps %xmm4, %xmm5
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movaps %xmm1, %xmm4
+; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3]
+; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE41-NEXT:    movaps %xmm5, %xmm0
+; SSE41-NEXT:    andps %xmm2, %xmm0
+; SSE41-NEXT:    movaps %xmm5, %xmm6
+; SSE41-NEXT:    minss %xmm3, %xmm6
+; SSE41-NEXT:    orps %xmm0, %xmm6
+; SSE41-NEXT:    movaps %xmm5, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm6
+; SSE41-NEXT:    andps %xmm6, %xmm2
+; SSE41-NEXT:    movaps %xmm6, %xmm1
+; SSE41-NEXT:    minss %xmm4, %xmm1
+; SSE41-NEXT:    orps %xmm2, %xmm1
+; SSE41-NEXT:    movaps %xmm6, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm6, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm6, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vandps %xmm4, %xmm0, %xmm5
+; AVX-NEXT:    vminss %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vorps %xmm3, %xmm5, %xmm3
+; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm5
+; AVX-NEXT:    vblendvps %xmm5, %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    vandps %xmm4, %xmm0, %xmm3
+; AVX-NEXT:    vminss %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vorps %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvps %xmm3, %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vandps %xmm4, %xmm0, %xmm2
+; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512BW-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    vandps %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT:    vminss %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorps %xmm3, %xmm5, %xmm3
+; AVX512BW-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT:    vandps %xmm4, %xmm3, %xmm0
+; AVX512BW-NEXT:    vminss %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT:    vorps %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT:    vandps %xmm4, %xmm2, %xmm0
+; AVX512BW-NEXT:    vminss %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vcmpunordss %xmm2, %xmm2, %k1
+; AVX512BW-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vminss %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm3 = xmm3 | (xmm0 & xmm4)
+; AVX512VL-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT:    vminss %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm2 | (xmm3 & xmm4)
+; AVX512VL-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
+; AVX512VL-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT:    vminss %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 | (xmm2 & xmm4)
+; AVX512VL-NEXT:    vcmpunordss %xmm2, %xmm2, %k1
+; AVX512VL-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a0)
+  ret float %1
+}
+
+define float @test_v8f32(<8 x float> %a0) {
+; SSE2-LABEL: test_v8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    minps %xmm1, %xmm2
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm3
+; SSE2-NEXT:    orps %xmm2, %xmm3
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordps %xmm0, %xmm2
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; SSE2-NEXT:    movaps %xmm2, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
+; SSE2-NEXT:    minss %xmm4, %xmm2
+; SSE2-NEXT:    andps %xmm1, %xmm5
+; SSE2-NEXT:    orps %xmm2, %xmm5
+; SSE2-NEXT:    andnps %xmm5, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm1, %xmm4
+; SSE2-NEXT:    minss %xmm6, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    minss %xmm7, %xmm2
+; SSE2-NEXT:    orps %xmm1, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movaps %xmm0, %xmm2
+; SSE41-NEXT:    minps %xmm1, %xmm0
+; SSE41-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    movaps %xmm2, %xmm4
+; SSE41-NEXT:    andps %xmm3, %xmm4
+; SSE41-NEXT:    orps %xmm0, %xmm4
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordps %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    movaps %xmm4, %xmm1
+; SSE41-NEXT:    minss %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm4, %xmm2
+; SSE41-NEXT:    andps %xmm3, %xmm2
+; SSE41-NEXT:    orps %xmm1, %xmm2
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm4, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    andps %xmm3, %xmm0
+; SSE41-NEXT:    movaps %xmm4, %xmm1
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE41-NEXT:    movaps %xmm2, %xmm5
+; SSE41-NEXT:    minss %xmm1, %xmm5
+; SSE41-NEXT:    orps %xmm0, %xmm5
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm5
+; SSE41-NEXT:    andps %xmm5, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    minss %xmm4, %xmm1
+; SSE41-NEXT:    orps %xmm3, %xmm1
+; SSE41-NEXT:    movaps %xmm5, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-NEXT:    vminss %xmm4, %xmm1, %xmm4
+; AVX-NEXT:    vorps %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm4
+; AVX-NEXT:    vblendvps %xmm4, %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT:    vminss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vorps %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vminps %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512BW-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512BW-NEXT:    vminss %xmm4, %xmm1, %xmm4
+; AVX512BW-NEXT:    vorps %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT:    vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT:    vandps %xmm2, %xmm3, %xmm1
+; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vminss %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT:    vorps %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vminps %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & xmm2)
+; AVX512VL-NEXT:    vcmpunordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512VL-NEXT:    vminss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 | (xmm1 & xmm2)
+; AVX512VL-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512VL-NEXT:    vminss %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm3 = xmm3 | (xmm0 & xmm2)
+; AVX512VL-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; AVX512VL-NEXT:    vminss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 | (xmm3 & xmm2)
+; AVX512VL-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
+; AVX512VL-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a0)
+  ret float %1
+}
+
+; Scalar fminimum reduction of <16 x float>. CHECK lines are autogenerated
+; (do not hand-edit; regenerate with update_llc_test_checks.py). The
+; cmpunord + blend/mask sequences keep a lane when it is NaN, and the -0.0
+; sign-bit and/or masks presumably implement fminimum's -0.0 < +0.0
+; ordering -- confirm against the LangRef semantics of the intrinsic.
+define float @test_v16f32(<16 x float> %a0) {
+; SSE2-LABEL: test_v16f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    minps %xmm2, %xmm4
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    andps %xmm2, %xmm5
+; SSE2-NEXT:    orps %xmm4, %xmm5
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    cmpunordps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm4, %xmm0
+; SSE2-NEXT:    andnps %xmm5, %xmm4
+; SSE2-NEXT:    orps %xmm0, %xmm4
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    cmpunordps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm4, %xmm5
+; SSE2-NEXT:    andps %xmm0, %xmm5
+; SSE2-NEXT:    movaps %xmm1, %xmm6
+; SSE2-NEXT:    minps %xmm3, %xmm6
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    orps %xmm6, %xmm3
+; SSE2-NEXT:    movaps %xmm1, %xmm6
+; SSE2-NEXT:    cmpunordps %xmm1, %xmm6
+; SSE2-NEXT:    andps %xmm6, %xmm1
+; SSE2-NEXT:    andnps %xmm3, %xmm6
+; SSE2-NEXT:    orps %xmm1, %xmm6
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    minps %xmm6, %xmm1
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    orps %xmm1, %xmm4
+; SSE2-NEXT:    andnps %xmm4, %xmm0
+; SSE2-NEXT:    orps %xmm5, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
+; SSE2-NEXT:    minss %xmm4, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm5
+; SSE2-NEXT:    orps %xmm0, %xmm5
+; SSE2-NEXT:    andnps %xmm5, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    cmpunordss %xmm1, %xmm3
+; SSE2-NEXT:    movaps %xmm3, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm1, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    minss %xmm6, %xmm1
+; SSE2-NEXT:    orps %xmm4, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm3
+; SSE2-NEXT:    orps %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm3, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    andps %xmm3, %xmm1
+; SSE2-NEXT:    andps %xmm3, %xmm2
+; SSE2-NEXT:    minss %xmm7, %xmm3
+; SSE2-NEXT:    orps %xmm2, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movaps %xmm0, %xmm4
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    minps %xmm3, %xmm0
+; SSE41-NEXT:    movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    movaps %xmm1, %xmm5
+; SSE41-NEXT:    andps %xmm3, %xmm5
+; SSE41-NEXT:    orps %xmm0, %xmm5
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordps %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    minps %xmm2, %xmm0
+; SSE41-NEXT:    movaps %xmm4, %xmm1
+; SSE41-NEXT:    andps %xmm3, %xmm1
+; SSE41-NEXT:    orps %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    cmpunordps %xmm4, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    minps %xmm5, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    andps %xmm3, %xmm2
+; SSE41-NEXT:    orps %xmm0, %xmm2
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordps %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    movaps %xmm2, %xmm1
+; SSE41-NEXT:    minss %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm4
+; SSE41-NEXT:    andps %xmm3, %xmm4
+; SSE41-NEXT:    orps %xmm1, %xmm4
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    andps %xmm3, %xmm0
+; SSE41-NEXT:    movaps %xmm2, %xmm1
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT:    movaps %xmm4, %xmm5
+; SSE41-NEXT:    minss %xmm1, %xmm5
+; SSE41-NEXT:    orps %xmm0, %xmm5
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm4, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm5
+; SSE41-NEXT:    andps %xmm5, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    minss %xmm2, %xmm1
+; SSE41-NEXT:    orps %xmm3, %xmm1
+; SSE41-NEXT:    movaps %xmm5, %xmm0
+; SSE41-NEXT:    cmpunordss %xmm5, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vminps %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm3
+; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm3
+; AVX-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-NEXT:    vminss %xmm4, %xmm1, %xmm4
+; AVX-NEXT:    vorps %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm4
+; AVX-NEXT:    vblendvps %xmm4, %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT:    vminss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vorps %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vminps %ymm1, %ymm0, %ymm1
+; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    vandps %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm3
+; AVX512BW-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vminps %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512BW-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512BW-NEXT:    vminss %xmm4, %xmm1, %xmm4
+; AVX512BW-NEXT:    vorps %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT:    vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT:    vandps %xmm2, %xmm3, %xmm1
+; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vminss %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT:    vorps %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v16f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vminps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & ymm2)
+; AVX512VL-NEXT:    vcmpunordps %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX512VL-NEXT:    vminps %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 | (xmm1 & xmm2)
+; AVX512VL-NEXT:    vcmpunordps %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & xmm2)
+; AVX512VL-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512VL-NEXT:    vminss %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm3 = xmm3 | (xmm1 & xmm2)
+; AVX512VL-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VL-NEXT:    vminss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 | (xmm3 & xmm2)
+; AVX512VL-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
+; AVX512VL-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call float @llvm.vector.reduce.fminimum.v16f32(<16 x float> %a0)
+  ret float %1
+}
+
+;
+; vXf64
+;
+
+; Smallest f64 case: reduce <2 x double> with fminimum. Autogenerated
+; CHECK lines -- regenerate with update_llc_test_checks.py instead of
+; editing by hand.
+define double @test_v2f64(<2 x double> %a0) {
+; SSE2-LABEL: test_v2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm3
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    movapd %xmm0, %xmm4
+; SSE2-NEXT:    minsd %xmm2, %xmm4
+; SSE2-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    orpd %xmm4, %xmm0
+; SSE2-NEXT:    andnpd %xmm0, %xmm1
+; SSE2-NEXT:    orpd %xmm3, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm0, %xmm1
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    movapd %xmm1, %xmm3
+; SSE41-NEXT:    minsd %xmm0, %xmm3
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    andpd %xmm1, %xmm2
+; SSE41-NEXT:    orpd %xmm3, %xmm2
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v2f64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT:    vminsd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT:    vminsd %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
+; AVX512VL-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+  %1 = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a0)
+  ret double %1
+}
+
+; fminimum reduction of <4 x double>: one vector minpd step, then a scalar
+; minsd for the final pair, with cmpunord blends keeping NaN lanes.
+; Autogenerated CHECK lines -- regenerate, do not hand-edit.
+define double @test_v4f64(<4 x double> %a0) {
+; SSE2-LABEL: test_v4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    minpd %xmm1, %xmm2
+; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    movapd %xmm0, %xmm3
+; SSE2-NEXT:    andpd %xmm1, %xmm3
+; SSE2-NEXT:    orpd %xmm2, %xmm3
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordpd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm3, %xmm2
+; SSE2-NEXT:    orpd %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordsd %xmm2, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm3
+; SSE2-NEXT:    andpd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movapd %xmm2, %xmm5
+; SSE2-NEXT:    minsd %xmm4, %xmm5
+; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm5, %xmm2
+; SSE2-NEXT:    andnpd %xmm2, %xmm0
+; SSE2-NEXT:    orpd %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v4f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    minpd %xmm1, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    andpd %xmm1, %xmm3
+; SSE41-NEXT:    orpd %xmm0, %xmm3
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; SSE41-NEXT:    movapd %xmm3, %xmm2
+; SSE41-NEXT:    minsd %xmm0, %xmm2
+; SSE41-NEXT:    andpd %xmm3, %xmm1
+; SSE41-NEXT:    orpd %xmm2, %xmm1
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    cmpunordsd %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; AVX-NEXT:    # xmm2 = mem[0,0]
+; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vorpd %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v4f64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    # xmm2 = mem[0,0]
+; AVX512BW-NEXT:    vandpd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorpd %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512BW-NEXT:    vminsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vandpd %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT:    vorpd %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & xmm2)
+; AVX512VL-NEXT:    vcmpunordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512VL-NEXT:    vminsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 | (xmm1 & xmm2)
+; AVX512VL-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %a0)
+  ret double %1
+}
+
+; fminimum reduction of <8 x double> (two 128-bit halves on SSE, a 512-bit
+; source split on AVX512). Autogenerated CHECK lines -- regenerate, do not
+; hand-edit.
+define double @test_v8f64(<8 x double> %a0) {
+; SSE2-LABEL: test_v8f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm4
+; SSE2-NEXT:    minpd %xmm2, %xmm4
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    movapd %xmm0, %xmm6
+; SSE2-NEXT:    andpd %xmm2, %xmm6
+; SSE2-NEXT:    orpd %xmm4, %xmm6
+; SSE2-NEXT:    movapd %xmm0, %xmm5
+; SSE2-NEXT:    cmpunordpd %xmm0, %xmm5
+; SSE2-NEXT:    andpd %xmm5, %xmm0
+; SSE2-NEXT:    andnpd %xmm6, %xmm5
+; SSE2-NEXT:    orpd %xmm0, %xmm5
+; SSE2-NEXT:    movapd %xmm5, %xmm4
+; SSE2-NEXT:    cmpunordpd %xmm5, %xmm4
+; SSE2-NEXT:    movapd %xmm5, %xmm0
+; SSE2-NEXT:    andpd %xmm4, %xmm0
+; SSE2-NEXT:    movapd %xmm1, %xmm6
+; SSE2-NEXT:    minpd %xmm3, %xmm6
+; SSE2-NEXT:    movapd %xmm1, %xmm3
+; SSE2-NEXT:    andpd %xmm2, %xmm3
+; SSE2-NEXT:    orpd %xmm6, %xmm3
+; SSE2-NEXT:    movapd %xmm1, %xmm6
+; SSE2-NEXT:    cmpunordpd %xmm1, %xmm6
+; SSE2-NEXT:    andpd %xmm6, %xmm1
+; SSE2-NEXT:    andnpd %xmm3, %xmm6
+; SSE2-NEXT:    orpd %xmm1, %xmm6
+; SSE2-NEXT:    movapd %xmm5, %xmm1
+; SSE2-NEXT:    minpd %xmm6, %xmm1
+; SSE2-NEXT:    andpd %xmm2, %xmm5
+; SSE2-NEXT:    orpd %xmm1, %xmm5
+; SSE2-NEXT:    andnpd %xmm5, %xmm4
+; SSE2-NEXT:    orpd %xmm0, %xmm4
+; SSE2-NEXT:    movapd %xmm4, %xmm0
+; SSE2-NEXT:    cmpunordsd %xmm4, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    andpd %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movapd %xmm4, %xmm5
+; SSE2-NEXT:    minsd %xmm3, %xmm5
+; SSE2-NEXT:    andpd %xmm2, %xmm4
+; SSE2-NEXT:    orpd %xmm5, %xmm4
+; SSE2-NEXT:    andnpd %xmm4, %xmm0
+; SSE2-NEXT:    orpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v8f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm0, %xmm4
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    minpd %xmm3, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    movapd %xmm1, %xmm5
+; SSE41-NEXT:    andpd %xmm3, %xmm5
+; SSE41-NEXT:    orpd %xmm0, %xmm5
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    minpd %xmm2, %xmm0
+; SSE41-NEXT:    movapd %xmm4, %xmm1
+; SSE41-NEXT:    andpd %xmm3, %xmm1
+; SSE41-NEXT:    orpd %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm4, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    minpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    andpd %xmm3, %xmm2
+; SSE41-NEXT:    orpd %xmm0, %xmm2
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSE41-NEXT:    movapd %xmm2, %xmm1
+; SSE41-NEXT:    minsd %xmm0, %xmm1
+; SSE41-NEXT:    andpd %xmm2, %xmm3
+; SSE41-NEXT:    orpd %xmm1, %xmm3
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordsd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v8f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vminpd %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vandpd %ymm2, %ymm0, %ymm3
+; AVX-NEXT:    vorpd %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm3
+; AVX-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vorpd %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8f64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT:    vminpd %ymm1, %ymm0, %ymm1
+; AVX512BW-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    vandpd %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT:    vorpd %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm3
+; AVX512BW-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vandpd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorpd %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512BW-NEXT:    vminsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vandpd %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT:    vorpd %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v8f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vminpd %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ymm2)
+; AVX512VL-NEXT:    vcmpunordpd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vmovapd %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX512VL-NEXT:    vminpd %xmm0, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 | (xmm1 & xmm2)
+; AVX512VL-NEXT:    vcmpunordpd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vmovapd %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm3[1,0]
+; AVX512VL-NEXT:    vminsd %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 | (xmm3 & xmm2)
+; AVX512VL-NEXT:    vcmpunordsd %xmm3, %xmm3, %k1
+; AVX512VL-NEXT:    vmovsd %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call double @llvm.vector.reduce.fminimum.v8f64(<8 x double> %a0)
+  ret double %1
+}
+
+; Largest f64 case: fminimum reduction of <16 x double> (8 xmm args on SSE,
+; two zmm args on AVX512). Autogenerated CHECK lines -- regenerate with
+; update_llc_test_checks.py, do not hand-edit.
+define double @test_v16f64(<16 x double> %a0) {
+; SSE2-LABEL: test_v16f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm8
+; SSE2-NEXT:    minpd %xmm4, %xmm8
+; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    movapd %xmm0, %xmm9
+; SSE2-NEXT:    andpd %xmm4, %xmm9
+; SSE2-NEXT:    orpd %xmm8, %xmm9
+; SSE2-NEXT:    movapd %xmm0, %xmm8
+; SSE2-NEXT:    cmpunordpd %xmm0, %xmm8
+; SSE2-NEXT:    andpd %xmm8, %xmm0
+; SSE2-NEXT:    andnpd %xmm9, %xmm8
+; SSE2-NEXT:    orpd %xmm0, %xmm8
+; SSE2-NEXT:    movapd %xmm8, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm8, %xmm0
+; SSE2-NEXT:    movapd %xmm8, %xmm9
+; SSE2-NEXT:    andpd %xmm0, %xmm9
+; SSE2-NEXT:    movapd %xmm2, %xmm10
+; SSE2-NEXT:    minpd %xmm6, %xmm10
+; SSE2-NEXT:    movapd %xmm2, %xmm6
+; SSE2-NEXT:    andpd %xmm4, %xmm6
+; SSE2-NEXT:    orpd %xmm10, %xmm6
+; SSE2-NEXT:    movapd %xmm2, %xmm10
+; SSE2-NEXT:    cmpunordpd %xmm2, %xmm10
+; SSE2-NEXT:    andpd %xmm10, %xmm2
+; SSE2-NEXT:    andnpd %xmm6, %xmm10
+; SSE2-NEXT:    orpd %xmm2, %xmm10
+; SSE2-NEXT:    movapd %xmm8, %xmm2
+; SSE2-NEXT:    minpd %xmm10, %xmm2
+; SSE2-NEXT:    andpd %xmm4, %xmm8
+; SSE2-NEXT:    orpd %xmm2, %xmm8
+; SSE2-NEXT:    andnpd %xmm8, %xmm0
+; SSE2-NEXT:    orpd %xmm9, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordpd %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm0, %xmm6
+; SSE2-NEXT:    andpd %xmm2, %xmm6
+; SSE2-NEXT:    movapd %xmm1, %xmm8
+; SSE2-NEXT:    minpd %xmm5, %xmm8
+; SSE2-NEXT:    movapd %xmm1, %xmm9
+; SSE2-NEXT:    andpd %xmm4, %xmm9
+; SSE2-NEXT:    orpd %xmm8, %xmm9
+; SSE2-NEXT:    movapd %xmm1, %xmm5
+; SSE2-NEXT:    cmpunordpd %xmm1, %xmm5
+; SSE2-NEXT:    andpd %xmm5, %xmm1
+; SSE2-NEXT:    andnpd %xmm9, %xmm5
+; SSE2-NEXT:    orpd %xmm1, %xmm5
+; SSE2-NEXT:    movapd %xmm5, %xmm1
+; SSE2-NEXT:    cmpunordpd %xmm5, %xmm1
+; SSE2-NEXT:    movapd %xmm5, %xmm8
+; SSE2-NEXT:    andpd %xmm1, %xmm8
+; SSE2-NEXT:    movapd %xmm3, %xmm9
+; SSE2-NEXT:    minpd %xmm7, %xmm9
+; SSE2-NEXT:    movapd %xmm3, %xmm7
+; SSE2-NEXT:    andpd %xmm4, %xmm7
+; SSE2-NEXT:    orpd %xmm9, %xmm7
+; SSE2-NEXT:    movapd %xmm3, %xmm9
+; SSE2-NEXT:    cmpunordpd %xmm3, %xmm9
+; SSE2-NEXT:    andpd %xmm9, %xmm3
+; SSE2-NEXT:    andnpd %xmm7, %xmm9
+; SSE2-NEXT:    orpd %xmm3, %xmm9
+; SSE2-NEXT:    movapd %xmm5, %xmm3
+; SSE2-NEXT:    minpd %xmm9, %xmm3
+; SSE2-NEXT:    andpd %xmm4, %xmm5
+; SSE2-NEXT:    orpd %xmm3, %xmm5
+; SSE2-NEXT:    andnpd %xmm5, %xmm1
+; SSE2-NEXT:    orpd %xmm8, %xmm1
+; SSE2-NEXT:    movapd %xmm0, %xmm3
+; SSE2-NEXT:    minpd %xmm1, %xmm3
+; SSE2-NEXT:    andpd %xmm4, %xmm0
+; SSE2-NEXT:    orpd %xmm3, %xmm0
+; SSE2-NEXT:    andnpd %xmm0, %xmm2
+; SSE2-NEXT:    orpd %xmm6, %xmm2
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordsd %xmm2, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    andpd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movapd %xmm2, %xmm5
+; SSE2-NEXT:    minsd %xmm3, %xmm5
+; SSE2-NEXT:    andpd %xmm4, %xmm2
+; SSE2-NEXT:    orpd %xmm5, %xmm2
+; SSE2-NEXT:    andnpd %xmm2, %xmm0
+; SSE2-NEXT:    orpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_v16f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm0, %xmm8
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    minpd %xmm7, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    movapd %xmm3, %xmm9
+; SSE41-NEXT:    andpd %xmm7, %xmm9
+; SSE41-NEXT:    orpd %xmm0, %xmm9
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm9
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    minpd %xmm5, %xmm0
+; SSE41-NEXT:    movapd %xmm1, %xmm3
+; SSE41-NEXT:    andpd %xmm7, %xmm3
+; SSE41-NEXT:    orpd %xmm0, %xmm3
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    minpd %xmm9, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm1
+; SSE41-NEXT:    andpd %xmm7, %xmm1
+; SSE41-NEXT:    orpd %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    minpd %xmm6, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    andpd %xmm7, %xmm3
+; SSE41-NEXT:    orpd %xmm0, %xmm3
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    movapd %xmm8, %xmm0
+; SSE41-NEXT:    minpd %xmm4, %xmm0
+; SSE41-NEXT:    movapd %xmm8, %xmm2
+; SSE41-NEXT:    andpd %xmm7, %xmm2
+; SSE41-NEXT:    orpd %xmm0, %xmm2
+; SSE41-NEXT:    movapd %xmm8, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm8, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    minpd %xmm3, %xmm0
+; SSE41-NEXT:    movapd %xmm2, %xmm3
+; SSE41-NEXT:    andpd %xmm7, %xmm3
+; SSE41-NEXT:    orpd %xmm0, %xmm3
+; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    minpd %xmm1, %xmm0
+; SSE41-NEXT:    movapd %xmm3, %xmm1
+; SSE41-NEXT:    andpd %xmm7, %xmm1
+; SSE41-NEXT:    orpd %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm3, %xmm0
+; SSE41-NEXT:    cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE41-NEXT:    movapd %xmm1, %xmm2
+; SSE41-NEXT:    minsd %xmm0, %xmm2
+; SSE41-NEXT:    andpd %xmm1, %xmm7
+; SSE41-NEXT:    orpd %xmm2, %xmm7
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT:    movapd %xmm7, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_v16f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vminpd %ymm3, %ymm1, %ymm3
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vandpd %ymm4, %ymm1, %ymm5
+; AVX-NEXT:    vorpd %ymm3, %ymm5, %ymm3
+; AVX-NEXT:    vcmpunordpd %ymm1, %ymm1, %ymm5
+; AVX-NEXT:    vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    vminpd %ymm2, %ymm0, %ymm2
+; AVX-NEXT:    vandpd %ymm4, %ymm0, %ymm3
+; AVX-NEXT:    vorpd %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm3
+; AVX-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vminpd %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vandpd %ymm4, %ymm0, %ymm2
+; AVX-NEXT:    vorpd %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm2
+; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandpd %xmm4, %xmm0, %xmm2
+; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vandpd %xmm4, %xmm0, %xmm2
+; AVX-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16f64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vminpd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
+; AVX512BW-NEXT:    vcmpunordpd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
+; AVX512BW-NEXT:    vminpd %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vandpd %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vorpd %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT:    vcmpunordpd %ymm1, %ymm1, %ymm3
+; AVX512BW-NEXT:    vblendvpd %ymm3, %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vminpd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT:    vandpd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vorpd %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512BW-NEXT:    vminsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vandpd %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT:    vorpd %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v16f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vminpd %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
+; AVX512VL-NEXT:    vcmpunordpd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
+; AVX512VL-NEXT:    vminpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm1) | ymm0
+; AVX512VL-NEXT:    vcmpunordpd %ymm1, %ymm1, %k1
+; AVX512VL-NEXT:    vmovapd %ymm1, %ymm3 {%k1}
+; AVX512VL-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX512VL-NEXT:    vminpd %xmm0, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm3 & xmm2)
+; AVX512VL-NEXT:    vcmpunordpd %xmm3, %xmm3, %k1
+; AVX512VL-NEXT:    vmovapd %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512VL-NEXT:    vminsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 | (xmm1 & xmm2)
+; AVX512VL-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+  %1 = call double @llvm.vector.reduce.fminimum.v16f64(<16 x double> %a0)
+  ret double %1
+}
+
+declare float @llvm.vector.reduce.fminimum.v1f32(<1 x float>)
+declare float @llvm.vector.reduce.fminimum.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fminimum.v3f32(<3 x float>)
+declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fminimum.v16f32(<16 x float>)
+
+declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fminimum.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fminimum.v16f64(<16 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE: {{.*}}

From a83cfda0a0ee8976b4e1e893da4c59593a16d884 Mon Sep 17 00:00:00 2001
From: Adrian Prantl 
Date: Tue, 12 May 2026 08:04:56 -0700
Subject: [PATCH 467/538] [LLDB] Simplify the API of
 ClangUserExpression::ScanContext [NFC] (#197037)

- this function is a virtual function, but it is only called by the leaf
class ClangUserExpression

- it also returns a Status only to then report any error as a warning

This patch devirtualizes the function, since there is no use case for
overriding it in other expression evaluator plugins, and it cleans up
the Status usage by passing in DiagnosticManager directly, like its
sibling functions do.
---
 .../lldb/Expression/LLVMUserExpression.h      |  3 -
 .../Clang/ClangUserExpression.cpp             | 62 +++++++++++--------
 .../Clang/ClangUserExpression.h               |  4 +-
 3 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/lldb/include/lldb/Expression/LLVMUserExpression.h b/lldb/include/lldb/Expression/LLVMUserExpression.h
index 40b463933c07e..568765d9b3d01 100644
--- a/lldb/include/lldb/Expression/LLVMUserExpression.h
+++ b/lldb/include/lldb/Expression/LLVMUserExpression.h
@@ -78,9 +78,6 @@ class LLVMUserExpression : public UserExpression {
             lldb::UserExpressionSP &shared_ptr_to_me,
             lldb::ExpressionVariableSP &result) override;
 
-  virtual void ScanContext(ExecutionContext &exe_ctx,
-                           lldb_private::Status &err) = 0;
-
   bool PrepareToExecuteJITExpression(DiagnosticManager &diagnostic_manager,
                                      ExecutionContext &exe_ctx,
                                      lldb::addr_t &struct_address);
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
index f830aad45ec98..ff158f4c99301 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
@@ -91,7 +91,8 @@ ClangUserExpression::ClangUserExpression(
 
 ClangUserExpression::~ClangUserExpression() = default;
 
-void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
+void ClangUserExpression::ScanContext(DiagnosticManager &diagnostic_manager,
+                                      ExecutionContext &exe_ctx) {
   Log *log = GetLog(LLDBLog::Expressions);
 
   LLDB_LOGF(log, "ClangUserExpression::ScanContext()");
@@ -159,12 +160,12 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
         lldb::VariableListSP variable_list_sp(
             function_block->GetBlockVariableList(true));
 
-        const char *thisErrorString = "Stopped in a C++ method, but 'this' "
-                                      "isn't available; pretending we are in a "
-                                      "generic context";
+        const char *msg = "Stopped in a C++ method, but 'this' isn't "
+                          "available; pretending we are in a generic context";
 
         if (!variable_list_sp) {
-          err = Status::FromErrorString(thisErrorString);
+          diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                           eDiagnosticOriginLLDB);
           return;
         }
 
@@ -173,7 +174,8 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
 
         if (!this_var_sp || !this_var_sp->IsInScope(frame) ||
             !this_var_sp->LocationIsValidForFrame(frame)) {
-          err = Status::FromErrorString(thisErrorString);
+          diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                           eDiagnosticOriginLLDB);
           return;
         }
       }
@@ -189,12 +191,12 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
         lldb::VariableListSP variable_list_sp(
             function_block->GetBlockVariableList(true));
 
-        const char *selfErrorString = "Stopped in an Objective-C method, but "
-                                      "'self' isn't available; pretending we "
-                                      "are in a generic context";
+        const char *msg = "Stopped in an Objective-C method, but 'self' isn't "
+                          "available; pretending we are in a generic context";
 
         if (!variable_list_sp) {
-          err = Status::FromErrorString(selfErrorString);
+          diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                           eDiagnosticOriginLLDB);
           return;
         }
 
@@ -203,7 +205,8 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
 
         if (!self_variable_sp || !self_variable_sp->IsInScope(frame) ||
             !self_variable_sp->LocationIsValidForFrame(frame)) {
-          err = Status::FromErrorString(selfErrorString);
+          diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                           eDiagnosticOriginLLDB);
           return;
         }
       }
@@ -232,13 +235,13 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
           lldb::VariableListSP variable_list_sp(
               function_block->GetBlockVariableList(true));
 
-          const char *thisErrorString = "Stopped in a context claiming to "
-                                        "capture a C++ object pointer, but "
-                                        "'this' isn't available; pretending we "
-                                        "are in a generic context";
+          const char *msg = "Stopped in a context claiming to capture a C++ "
+                            "object pointer, but 'this' isn't available; "
+                            "pretending we are in a generic context";
 
           if (!variable_list_sp) {
-            err = Status::FromErrorString(thisErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
 
@@ -247,7 +250,8 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
 
           if (!this_var_sp || !this_var_sp->IsInScope(frame) ||
               !this_var_sp->LocationIsValidForFrame(frame)) {
-            err = Status::FromErrorString(thisErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
         }
@@ -259,13 +263,13 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
           lldb::VariableListSP variable_list_sp(
               function_block->GetBlockVariableList(true));
 
-          const char *selfErrorString =
-              "Stopped in a context claiming to capture an Objective-C object "
-              "pointer, but 'self' isn't available; pretending we are in a "
-              "generic context";
+          const char *msg = "Stopped in a context claiming to capture an "
+                            "Objective-C object pointer, but 'self' isn't "
+                            "available; pretending we are in a generic context";
 
           if (!variable_list_sp) {
-            err = Status::FromErrorString(selfErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
 
@@ -274,21 +278,24 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
 
           if (!self_variable_sp || !self_variable_sp->IsInScope(frame) ||
               !self_variable_sp->LocationIsValidForFrame(frame)) {
-            err = Status::FromErrorString(selfErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
 
           Type *self_type = self_variable_sp->GetType();
 
           if (!self_type) {
-            err = Status::FromErrorString(selfErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
 
           CompilerType self_clang_type = self_type->GetForwardCompilerType();
 
           if (!self_clang_type) {
-            err = Status::FromErrorString(selfErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
 
@@ -299,7 +306,8 @@ void ClangUserExpression::ScanContext(ExecutionContext &exe_ctx, Status &err) {
             m_in_objectivec_method = true;
             m_needs_object_ptr = true;
           } else {
-            err = Status::FromErrorString(selfErrorString);
+            diagnostic_manager.AddDiagnostic(msg, lldb::eSeverityWarning,
+                                             eDiagnosticOriginLLDB);
             return;
           }
         } else {
@@ -525,7 +533,7 @@ bool ClangUserExpression::PrepareForParsing(
     return false;
 
   Status err;
-  ScanContext(exe_ctx, err);
+  ScanContext(diagnostic_manager, exe_ctx);
 
   if (!err.Success()) {
     diagnostic_manager.PutString(lldb::eSeverityWarning, err.AsCString());
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
index 74aceed1d637e..fa2bc04924c63 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
@@ -197,8 +197,8 @@ class ClangUserExpression : public LLVMUserExpression {
 
   void SetupCppModuleImports(ExecutionContext &exe_ctx);
 
-  void ScanContext(ExecutionContext &exe_ctx,
-                   lldb_private::Status &err) override;
+  void ScanContext(DiagnosticManager &diagnostic_manager,
+                   ExecutionContext &exe_ctx);
 
   bool AddArguments(ExecutionContext &exe_ctx, std::vector &args,
                     lldb::addr_t struct_address,

From 8bddd0f35f6b3d009a3e28160eb0e97a4ff5b5b5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Tue, 12 May 2026 16:05:50 +0100
Subject: [PATCH 468/538] [DAG] visitBITCAST - fold (conv
 (scalar_to_vector(load x))) -> (load (conv*)x) (#196978)

Legalization can leave superfluous scalar_to_vector nodes with the
scalar bitwidth matching the vector bitwidth - peek through these when
attempting bitcast folds

Only one match in trunk at the moment, but there are some additional
folds encountered in #149798
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++++
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4d441b844ebdc..707043736ed3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17633,6 +17633,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   // fold (conv (freeze (load x))) -> (freeze (load (conv*)x))
   // If the resultant load doesn't need a higher alignment than the original!
   auto CastLoad = [this, &VT](SDValue N0, const SDLoc &DL) {
+    // Peek through scalar_to_vector if the scalar is same size as VT - often a
+    // leftover from legalization.
+    if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && N0.hasOneUse() &&
+        N0.getOperand(0).getValueSizeInBits() == VT.getSizeInBits())
+      N0 = N0.getOperand(0);
     if (N0.getOpcode() == ISD::AssertNoFPClass)
       N0 = N0.getOperand(0);
     if (!ISD::isNormalLoad(N0.getNode()) || !N0.hasOneUse())
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 8854d8ab80798..ff9c75cfd0c5e 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -445,9 +445,8 @@ entry:
 define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-SD-LABEL: test_udot_v5i8_nomla:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr x8, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x0]
 ; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT:    fmov d0, x8
 ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    mov v1.s[0], v2.s[0]

From 25507b66bbb42a83b06d6561452e2d4ad5ed564d Mon Sep 17 00:00:00 2001
From: Alexey Karyakin 
Date: Tue, 12 May 2026 10:18:25 -0500
Subject: [PATCH 469/538] [Hexagon] Define Hexagon v93 ELF flags (#196643)

---
 llvm/include/llvm/BinaryFormat/ELF.h          |  2 ++
 llvm/lib/ObjectYAML/ELFYAML.cpp               |  2 ++
 .../tools/obj2yaml/ELF/hexagon-eflags.yaml    | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+)
 create mode 100644 llvm/test/tools/obj2yaml/ELF/hexagon-eflags.yaml

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 3fb5b51fb0a94..72cded68463a6 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -648,6 +648,7 @@ enum {
   EF_HEXAGON_ISA_V87 = 0x00000087,  // Hexagon V87 ISA
   EF_HEXAGON_ISA_V89 = 0x00000089,  // Hexagon V89 ISA
   EF_HEXAGON_ISA_V91 = 0x00000091,  // Hexagon V91 ISA
+  EF_HEXAGON_ISA_V93 = 0x00000093,  // Hexagon V93 ISA
   EF_HEXAGON_ISA = 0x000003ff,      // Hexagon V.. ISA
 
   // Tiny core flag, bit[15]
@@ -682,6 +683,7 @@ enum {
   EF_HEXAGON_MACH_V87 = EF_HEXAGON_ISA_V87,      // Hexagon V87
   EF_HEXAGON_MACH_V89 = EF_HEXAGON_ISA_V89,      // Hexagon V89
   EF_HEXAGON_MACH_V91 = EF_HEXAGON_ISA_V91,      // Hexagon V91
+  EF_HEXAGON_MACH_V93 = EF_HEXAGON_ISA_V93,      // Hexagon V93
 
   EF_HEXAGON_MACH = 0x0000ffff, // Hexagon V..
 };
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 58013f7a4be0c..0d742e34b4b22 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -507,6 +507,7 @@ void ScalarBitSetTraits::bitset(IO &IO,
     BCaseMask(EF_HEXAGON_MACH_V87, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V89, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_MACH_V91, EF_HEXAGON_MACH);
+    BCaseMask(EF_HEXAGON_MACH_V93, EF_HEXAGON_MACH);
     BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA);
@@ -531,6 +532,7 @@ void ScalarBitSetTraits::bitset(IO &IO,
     BCaseMask(EF_HEXAGON_ISA_V87, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V89, EF_HEXAGON_ISA);
     BCaseMask(EF_HEXAGON_ISA_V91, EF_HEXAGON_ISA);
+    BCaseMask(EF_HEXAGON_ISA_V93, EF_HEXAGON_ISA);
     break;
   case ELF::EM_AVR:
     BCaseMask(EF_AVR_ARCH_AVR1, EF_AVR_ARCH_MASK);
diff --git a/llvm/test/tools/obj2yaml/ELF/hexagon-eflags.yaml b/llvm/test/tools/obj2yaml/ELF/hexagon-eflags.yaml
new file mode 100644
index 0000000000000..14c704fac78fe
--- /dev/null
+++ b/llvm/test/tools/obj2yaml/ELF/hexagon-eflags.yaml
@@ -0,0 +1,19 @@
+# RUN: yaml2obj -DF=EF_HEXAGON_MACH_V68 %s | obj2yaml - | FileCheck %s --check-prefix=CHECK-V68
+# CHECK-V68: Flags: [ EF_HEXAGON_MACH_V68, EF_HEXAGON_ISA_V68 ]
+
+# RUN: yaml2obj -DF=EF_HEXAGON_MACH_V71T %s | obj2yaml - | FileCheck %s --check-prefix=CHECK-V71T
+# CHECK-V71T: Flags: [ EF_HEXAGON_MACH_V71T, EF_HEXAGON_ISA_V71 ]
+
+# RUN: yaml2obj -DF=EF_HEXAGON_MACH_V91 %s | obj2yaml - | FileCheck %s --check-prefix=CHECK-V91
+# CHECK-V91: Flags: [ EF_HEXAGON_MACH_V91, EF_HEXAGON_ISA_V91 ]
+
+# RUN: yaml2obj -DF=EF_HEXAGON_MACH_V93 %s | obj2yaml - | FileCheck %s --check-prefix=CHECK-V93
+# CHECK-V93: Flags: [ EF_HEXAGON_MACH_V93, EF_HEXAGON_ISA_V93 ]
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS32
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_HEXAGON
+  Flags:           [ [[F]] ]

From 76cdf6012b883529e3d608ed4e30d622ba43e10c Mon Sep 17 00:00:00 2001
From: Nick Terrell 
Date: Tue, 12 May 2026 11:22:07 -0400
Subject: [PATCH 470/538] [BasicAA] Don't look through llvm.ptrmask in GEP
 decomposition (#197082)

DecomposeGEPExpression() looked through llvm.ptrmask via
getArgumentAliasingToReturnedPointer(Call, MustPreserveNullness=false).
ptrmask preserves the underlying object but can change the byte address
by clearing low bits, so treating its result as having the same symbolic
offset as its argument produces stale offsets and bogus NoAlias answers.
The bug was introduced by 3f2850bc606c847075673554fe49d4a35f525b61.

Rename MustPreserveNullness to MustPreserveOffset, the property
DecomposeGEPExpression actually needs. Offset preservation is strictly
stronger than nullness preservation, so existing callers remain correct
and the accepted intrinsic set is unchanged (ptrmask stays excluded).
Switch DecomposeGEPExpression to pass MustPreserveOffset=true. Every
call site is now tagged with MustPreserveOffset=.
---
 llvm/include/llvm/Analysis/ValueTracking.h    | 20 ++++++++-----
 llvm/lib/Analysis/AliasAnalysis.cpp           |  3 +-
 llvm/lib/Analysis/BasicAliasAnalysis.cpp      |  6 +++-
 llvm/lib/Analysis/CaptureTracking.cpp         |  3 +-
 llvm/lib/Analysis/Loads.cpp                   |  3 +-
 llvm/lib/Analysis/ValueTracking.cpp           | 30 ++++++++++---------
 .../Transforms/IPO/AttributorAttributes.cpp   |  6 ++--
 llvm/lib/Transforms/IPO/FunctionAttrs.cpp     |  2 +-
 .../BasicAA/ptrmask-gep-decomposition.ll      | 25 ++++++++++++++++
 9 files changed, 68 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/Analysis/BasicAA/ptrmask-gep-decomposition.ll

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 00469421d55b4..b2f664a9c9c0d 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -423,25 +423,29 @@ LLVM_ABI uint64_t GetStringLength(const Value *V, unsigned CharSize = 8);
 
 /// This function returns call pointer argument that is considered the same by
 /// aliasing rules. You CAN'T use it to replace one value with another. If
-/// \p MustPreserveNullness is true, the call must preserve the nullness of
-/// the pointer.
+/// \p MustPreserveOffset is true, the call must preserve the byte offset of
+/// the pointer within its underlying object. Offset preservation implies
+/// nullness preservation; pass true when callers reason about either offset or
+/// null equality (e.g. GEP decomposition, dereferenceability, isKnownNonZero).
 LLVM_ABI const Value *
 getArgumentAliasingToReturnedPointer(const CallBase *Call,
-                                     bool MustPreserveNullness);
+                                     bool MustPreserveOffset);
 inline Value *getArgumentAliasingToReturnedPointer(CallBase *Call,
-                                                   bool MustPreserveNullness) {
+                                                   bool MustPreserveOffset) {
   return const_cast(getArgumentAliasingToReturnedPointer(
-      const_cast(Call), MustPreserveNullness));
+      const_cast(Call), MustPreserveOffset));
 }
 
 /// {launder,strip}.invariant.group returns pointer that aliases its argument,
 /// and it only captures pointer by returning it.
 /// These intrinsics are not marked as nocapture, because returning is
 /// considered as capture. The arguments are not marked as returned neither,
-/// because it would make it useless. If \p MustPreserveNullness is true,
-/// the intrinsic must preserve the nullness of the pointer.
+/// because it would make it useless. If \p MustPreserveOffset is true, the
+/// intrinsic must preserve the byte offset of the pointer within its
+/// underlying object (which excludes `llvm.ptrmask`, since masking off low
+/// bits changes the byte offset while still aliasing the same object).
 LLVM_ABI bool isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-    const CallBase *Call, bool MustPreserveNullness);
+    const CallBase *Call, bool MustPreserveOffset);
 
 /// This method strips off any GEP address adjustments, pointer casts
 /// or `llvm.threadlocal.address` from the specified value \p V, returning the
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 9d89a6d90f706..82f1b7c3a4fd3 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -928,7 +928,8 @@ bool llvm::isBaseOfObject(const Value *V) {
 
 bool llvm::isEscapeSource(const Value *V) {
   if (auto *CB = dyn_cast(V)) {
-    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, true))
+    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+            CB, /*MustPreserveOffset=*/true))
       return false;
 
     // The return value of a function with a captures(ret: address, provenance)
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 8b120f0ad1e11..832749f949aee 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -652,7 +652,11 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
         // because it should be in sync with CaptureTracking. Not using it may
         // cause weird miscompilations where 2 aliasing pointers are assumed to
         // noalias.
-        if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) {
+        // Pass MustPreserveOffset=true so we exclude llvm.ptrmask, which can
+        // change the byte offset by clearing low bits and would otherwise
+        // corrupt the symbolic offset we are accumulating in `Decomposed`.
+        if (auto *RP = getArgumentAliasingToReturnedPointer(
+                Call, /*MustPreserveOffset=*/true)) {
           V = RP;
           continue;
         }
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index c34fde8d11704..2bea8e2129b4f 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -281,7 +281,8 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) {
     // marked with nocapture do not capture. This means that places like
     // getUnderlyingObject in ValueTracking or DecomposeGEPExpression
     // in BasicAA also need to know about this property.
-    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true))
+    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+            Call, /*MustPreserveOffset=*/true))
       return UseCaptureInfo::passthrough();
 
     // Volatile operations effectively capture the memory location that they
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index ac4d37aca7673..797d18325c336 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -161,7 +161,8 @@ static bool isDereferenceableAndAlignedPointer(
 
 
   if (const auto *Call = dyn_cast(V)) {
-    if (auto *RP = getArgumentAliasingToReturnedPointer(Call, true))
+    if (auto *RP = getArgumentAliasingToReturnedPointer(
+            Call, /*MustPreserveOffset=*/true))
       return isDereferenceableAndAlignedPointer(RP, Alignment, Size, DL, CtxI,
                                                 AC, DT, TLI, Visited, MaxDepth);
 
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 91ad517f194a6..3be0db5ad9c7e 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3536,7 +3536,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     if (I->getType()->isPointerTy()) {
       if (Call->isReturnNonNull())
         return true;
-      if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true))
+      if (const auto *RP = getArgumentAliasingToReturnedPointer(
+              Call, /*MustPreserveOffset=*/true))
         return isKnownNonZero(RP, Q, Depth);
     } else {
       if (MDNode *Ranges = Q.IIQ.getMetadata(Call, LLVMContext::MD_range))
@@ -6868,38 +6869,38 @@ uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
 
 const Value *
 llvm::getArgumentAliasingToReturnedPointer(const CallBase *Call,
-                                           bool MustPreserveNullness) {
+                                           bool MustPreserveOffset) {
   assert(Call &&
          "getArgumentAliasingToReturnedPointer only works on nonnull calls");
   if (const Value *RV = Call->getReturnedArgOperand())
     return RV;
   // This can be used only as a aliasing property.
   if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-          Call, MustPreserveNullness))
+          Call, MustPreserveOffset))
     return Call->getArgOperand(0);
   return nullptr;
 }
 
 bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-    const CallBase *Call, bool MustPreserveNullness) {
+    const CallBase *Call, bool MustPreserveOffset) {
   switch (Call->getIntrinsicID()) {
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group:
   case Intrinsic::aarch64_irg:
   case Intrinsic::aarch64_tagp:
   // The amdgcn_make_buffer_rsrc function does not alter the address of the
-  // input pointer (and thus preserve null-ness for the purposes of escape
-  // analysis, which is where the MustPreserveNullness flag comes in to play).
-  // However, it will not necessarily map ptr addrspace(N) null to ptr
-  // addrspace(8) null, aka the "null descriptor", which has "all loads return
-  // 0, all stores are dropped" semantics. Given the context of this intrinsic
-  // list, no one should be relying on such a strict interpretation of
-  // MustPreserveNullness (and, at time of writing, they are not), but we
-  // document this fact out of an abundance of caution.
+  // input pointer (and thus preserves the byte offset, which is the property
+  // the MustPreserveOffset flag selects). However, it will not necessarily
+  // map ptr addrspace(N) null to ptr addrspace(8) null, aka the "null
+  // descriptor", which has "all loads return 0, all stores are dropped"
+  // semantics. Given the context of this intrinsic list, no one should be
+  // relying on such a strict bit-exact null mapping (and, at time of
+  // writing, they are not), but we document this fact out of an abundance
+  // of caution.
   case Intrinsic::amdgcn_make_buffer_rsrc:
     return true;
   case Intrinsic::ptrmask:
-    return !MustPreserveNullness;
+    return !MustPreserveOffset;
   case Intrinsic::threadlocal_address:
     // The underlying variable changes with thread ID. The Thread ID may change
     // at coroutine suspend points.
@@ -6970,7 +6971,8 @@ const Value *llvm::getUnderlyingObject(const Value *V, unsigned MaxLookup) {
         // because it should be in sync with CaptureTracking. Not using it may
         // cause weird miscompilations where 2 aliasing pointers are assumed to
         // noalias.
-        if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) {
+        if (auto *RP = getArgumentAliasingToReturnedPointer(
+                Call, /*MustPreserveOffset=*/false)) {
           V = RP;
           continue;
         }
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 7c50b9faf3c80..69b479fb41ea4 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12768,7 +12768,7 @@ struct AAInvariantLoadPointerImpl
     case IRP_CALL_SITE_RETURNED: {
       const auto &CB = cast(getAnchorValue());
       return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-          &CB, /*MustPreserveNullness=*/false);
+          &CB, /*MustPreserveOffset=*/false);
     }
     case IRP_ARGUMENT: {
       const Function *F = getAssociatedFunction();
@@ -12903,7 +12903,7 @@ struct AAInvariantLoadPointerImpl
 
     if (const auto *CB = dyn_cast(&getAnchorValue())) {
       if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-              CB, /*MustPreserveNullness=*/false)) {
+              CB, /*MustPreserveOffset=*/false)) {
         for (const Value *Arg : CB->args()) {
           if (!IsLocallyInvariantLoadIfPointer(*Arg))
             return indicatePessimisticFixpoint();
@@ -12949,7 +12949,7 @@ struct AAInvariantLoadPointerCallSiteReturned final
 
     const auto &CB = cast(getAnchorValue());
     if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-            &CB, /*MustPreserveNullness=*/false))
+            &CB, /*MustPreserveOffset=*/false))
       return AAInvariantLoadPointerImpl::initialize(A);
 
     if (F->onlyReadsMemory() && F->hasNoSync())
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index c6113fbb96b3f..1aea1ee301ad5 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -916,7 +916,7 @@ determinePointerAccessAttrs(Argument *A,
       // but return results thas alias their pointer argument, and thus should
       // be handled like GEP or addrspacecast above.
       if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-              &CB, /*MustPreserveNullness=*/false)) {
+              &CB, /*MustPreserveOffset=*/false)) {
         for (Use &UU : CB.uses())
           if (Visited.insert(&UU).second)
             Worklist.push_back(&UU);
diff --git a/llvm/test/Analysis/BasicAA/ptrmask-gep-decomposition.ll b/llvm/test/Analysis/BasicAA/ptrmask-gep-decomposition.ll
new file mode 100644
index 0000000000000..d746c29e90deb
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/ptrmask-gep-decomposition.ll
@@ -0,0 +1,25 @@
+; RUN: opt -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output %s 2>&1 | FileCheck %s
+
+; BasicAA must not look through llvm.ptrmask when decomposing a GEP into a
+; symbolic byte offset, because ptrmask preserves the underlying object but
+; can change the byte address. With %base 2-aligned:
+;   %p = %base + 1
+;   %q = ptrmask(%p, -2) == %base
+;   %r = %q + 1          == %p
+; so %p and %r alias.
+
+declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
+
+define i8 @ptrmask_gep_may_alias(ptr align 2 %base) {
+; CHECK-LABEL: Function: ptrmask_gep_may_alias
+; CHECK: MayAlias: i8* %p, i8* %r
+entry:
+  %p = getelementptr i8, ptr %base, i64 1
+  %q = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 -2)
+  %r = getelementptr i8, ptr %q, i64 1
+
+  store i8 7, ptr %p, align 1
+  store i8 42, ptr %r, align 1
+  %v = load i8, ptr %p, align 1
+  ret i8 %v
+}

From 4f5b905a2ce4ce8d638f4ef2b273b2fa74445945 Mon Sep 17 00:00:00 2001
From: James Molloy 
Date: Tue, 12 May 2026 16:52:27 +0100
Subject: [PATCH 471/538] [ScheduleDAG] Add a reachability cache to amortize
 DFS calls (#195079)

ScheduleDAGTopologicalSort::IsReachable falls back to a DFS on its
slow path. For some connectivity patterns this can result in roughly
quadratic behavior.

Add a cache of {A, B} -> Reachable(A, B). This is invalidated whenever
AddPred or InitDAGTopologicalSorting is called.

For an antagonistic test case, SelectionDAG time went from 1300s to 250s.

No test case is included, as there is no functional change; this is a performance-only improvement.

---------

Co-authored-by: James Molloy 
---
 llvm/include/llvm/CodeGen/ScheduleDAG.h |  3 +++
 llvm/lib/CodeGen/ScheduleDAG.cpp        | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h
index b84f8b99a06e2..2d1b0484a549f 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAG.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -747,6 +747,9 @@ class TargetRegisterInfo;
     std::vector Node2Index;
     /// a set of nodes visited during a DFS traversal.
     BitVector Visited;
+    /// Cache of reachability queries. {A, B} -> true if B is reachable from A.
+    /// The keys are SUnit NodeNums.
+    DenseMap, bool> Reachable;
 
     /// Makes a DFS traversal and mark all nodes affected by the edge insertion.
     /// These nodes will later get new topological indexes by means of the Shift
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index 7dd4aa2d63c7b..c621fef7c1e0c 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -467,6 +467,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
   // Cancel pending updates, mark as valid.
   Dirty = false;
   Updates.clear();
+  Reachable.clear();
 
   unsigned DAGSize = SUnits.size();
   std::vector WorkList;
@@ -562,6 +563,7 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   }
 
   NumNewPredsAdded++;
+  Reachable.clear();
 }
 
 void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
@@ -734,9 +736,18 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
   bool HasLoop = false;
   // Is Ord(TargetSU) < Ord(SU) ?
   if (LowerBound < UpperBound) {
+    if (auto It = Reachable.find({TargetSU->NodeNum, SU->NodeNum});
+        It != Reachable.end()) {
+      return It->second;
+    }
     Visited.reset();
     // There may be a path from TargetSU to SU. Check for it.
     DFS(TargetSU, UpperBound, HasLoop);
+    // If there's no loop, cache the result. We only cache negative results,
+    // as positive results are not safe to cache; users call SU.removePred()
+    // without notifying us.
+    if (!HasLoop)
+      Reachable[{TargetSU->NodeNum, SU->NodeNum}] = false;
   }
   return HasLoop;
 }

From b9fd84076ab4f786600b97280757d937e0671bb7 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Wed, 13 May 2026 00:02:25 +0800
Subject: [PATCH 472/538] [InstCombine] Fix one operator precedence (#197164)

---
 llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index dc1a90d486201..f385cc6a7cc83 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2326,10 +2326,10 @@ static Instruction *foldICmpUSubSatWithAndForMostSignificantBitCmp(
 
   Value *TrueVal = SI.getTrueValue();
   Value *FalseVal = SI.getFalseValue();
-  if (!(Pred == ICmpInst::ICMP_EQ &&
-        (match(TrueVal, m_Zero()) && match(FalseVal, m_SignMask()))) ||
-      (Pred == ICmpInst::ICMP_NE &&
-       (match(TrueVal, m_SignMask()) && match(FalseVal, m_Zero()))))
+  if (!((Pred == ICmpInst::ICMP_EQ && match(TrueVal, m_Zero()) &&
+         match(FalseVal, m_SignMask())) ||
+        (Pred == ICmpInst::ICMP_NE && match(TrueVal, m_SignMask()) &&
+         match(FalseVal, m_Zero()))))
     return nullptr;
 
   auto *Ty = A->getType();

From 80994b7697448d2cf7982dea8a96431141e343c2 Mon Sep 17 00:00:00 2001
From: Macsen Casaus <135416202+macsencasaus@users.noreply.github.com>
Date: Tue, 12 May 2026 11:02:40 -0500
Subject: [PATCH 473/538] Fix assertion failure of `APInt::sqrt` on U64 MAX
 input (#197161)

Closes #197145

In https://github.com/llvm/llvm-project/blob/65a206f2ec552cccf7c96c5306147f0437832ec7/llvm/lib/Support/APInt.cpp#L1305-L1312:

Instead of computing `nextSquare` completely (which overflows), we only
need to compute the difference between `x_old^2` and `(x_old + 1)^2`
which is simply `2 * x_old + 1` since `(x_old + 1)^2 = x_old^2 + 2 *
x_old + 1`. We can use this difference for the following computation of
the midpoint.
---
 llvm/lib/Support/APInt.cpp       | 6 +++---
 llvm/unittests/ADT/APIntTest.cpp | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index eb3762e396258..df3616abd7dcf 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -1302,12 +1302,12 @@ APInt APInt::sqrt() const {
   // floating point representation after 192 bits. There are no discrepancies
   // between this algorithm and pari/gp for bit widths < 192 bits.
   APInt square(x_old * x_old);
-  APInt nextSquare((x_old + 1) * (x_old +1));
   if (this->ult(square))
     return x_old;
-  assert(this->ule(nextSquare) && "Error in APInt::sqrt computation");
-  APInt midpoint((nextSquare - square).udiv(two));
+  APInt delta(2 * x_old + 1);
   APInt offset(*this - square);
+  assert(offset.ule(delta) && "Error in APInt::sqrt computation");
+  APInt midpoint(delta.udiv(two));
   if (offset.ult(midpoint))
     return x_old;
   return x_old + 1;
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index 7a45fd0f4558c..3c0446867b14b 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -4009,4 +4009,12 @@ TEST(APIntTest, clmulh) {
                 .getSExtValue(),
             21845);
 }
+
+TEST(APIntTest, sqrt) {
+  EXPECT_EQ(APInt::getMaxValue(64).sqrt(), 4294967296U);
+  EXPECT_EQ(APInt::getMaxValue(128).sqrt(),
+            APInt(128, "18446744073709551616", 10));
+  EXPECT_EQ(APInt::getMaxValue(256).sqrt(),
+            APInt(256, "340282366920938463463374607431768211456", 10));
+}
 } // end anonymous namespace

From 51770300e13e8b9b9144629009220ce6b5d68972 Mon Sep 17 00:00:00 2001
From: NagaChaitanya Vellanki 
Date: Tue, 12 May 2026 09:09:04 -0700
Subject: [PATCH 474/538] [clang] Add typed variants for C2y stdbit.h rotate
 builtins (#195299)

stdc_rotate_left_{uc,us,ui,ul,ull}
stdc_rotate_right_{uc,us,ui,ul,ull}

Lower the type-specific rotate functions to LLVM intrinsics
(fshl/fshr). Includes constant expression support and tests for Sema,
CodeGen, and constant evaluation.

Followup: #160259
---
 clang/docs/ReleaseNotes.rst              |  5 ++
 clang/include/clang/Basic/Builtins.h     |  1 +
 clang/include/clang/Basic/Builtins.td    | 11 +++
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 15 ++++
 clang/lib/AST/ExprConstant.cpp           | 15 ++++
 clang/lib/Basic/Builtins.cpp             |  3 +
 clang/lib/CodeGen/CGBuiltin.cpp          | 10 +++
 clang/test/CodeGen/Inputs/stdbit.h       | 12 ++++
 clang/test/CodeGen/builtin-rotate.c      | 88 ++++++++++++++++++++++++
 clang/test/Sema/Inputs/stdbit.h          | 12 ++++
 clang/test/Sema/builtin-stdc-rotate.c    | 77 +++++++++++++++++++++
 11 files changed, 249 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7ef88461c11a7..896a4b4867ed4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -177,6 +177,11 @@ C Language Changes
 C2y Feature Support
 ^^^^^^^^^^^^^^^^^^^
 
+- Implemented the type-specific C2y ```` rotate functions with constexpr
+  evaluation support:
+  ``stdc_rotate_left_{uc,us,ui,ul,ull}`` and
+  ``stdc_rotate_right_{uc,us,ui,ul,ull}``.
+
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^
 - Clang now allows C23 ``constexpr`` struct member access through the dot operator in constant expressions. (#GH178349)
diff --git a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h
index 51f0745d47015..9054f9415ce67 100644
--- a/clang/include/clang/Basic/Builtins.h
+++ b/clang/include/clang/Basic/Builtins.h
@@ -46,6 +46,7 @@ enum LanguageID : uint16_t {
   ALL_OCL_LANGUAGES = 0x800, // builtin for OCL languages.
   HLSL_LANG = 0x1000,        // builtin requires HLSL.
   C23_LANG = 0x2000,         // builtin requires C23 or later.
+  C2Y_LANG = 0x4000,         // builtin requires C2y or later.
   ALL_LANGUAGES = C_LANG | CXX_LANG | OBJC_LANG, // builtin for all languages.
   ALL_GNU_LANGUAGES = ALL_LANGUAGES | GNU_LANG,  // builtin requires GNU mode.
   ALL_MS_LANGUAGES = ALL_LANGUAGES | MS_LANG     // builtin requires MS mode.
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 5341a8c347f74..40ec94ab75046 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -974,6 +974,17 @@ def StdcBitCeilTyped : LibBuiltin<"stdbit.h", "C23_LANG">, IntBitUtilTemplate {
   let Prototype = "T(T)";
 }
 
+def StdcRotateLeftTyped : LibBuiltin<"stdbit.h", "C2Y_LANG">, IntBitUtilTemplate {
+  let Spellings = ["stdc_rotate_left"];
+  let Attributes = [NoThrow, Const, Constexpr];
+  let Prototype = "T(T, unsigned int)";
+}
+
+def StdcRotateRightTyped : LibBuiltin<"stdbit.h", "C2Y_LANG">, IntBitUtilTemplate {
+  let Spellings = ["stdc_rotate_right"];
+  let Attributes = [NoThrow, Const, Constexpr];
+  let Prototype = "T(T, unsigned int)";
+}
 
 
 // Random GCC builtins
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5ba15b7ad4f63..e05c9aed39b14 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -4613,6 +4613,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case Builtin::BI__builtin_rotateleft32:
   case Builtin::BI__builtin_rotateleft64:
   case Builtin::BI__builtin_stdc_rotate_left:
+  case Builtin::BIstdc_rotate_left_uc:
+  case Builtin::BIstdc_rotate_left_us:
+  case Builtin::BIstdc_rotate_left_ui:
+  case Builtin::BIstdc_rotate_left_ul:
+  case Builtin::BIstdc_rotate_left_ull:
   case Builtin::BI_rotl8: // Microsoft variants of rotate left
   case Builtin::BI_rotl16:
   case Builtin::BI_rotl:
@@ -4623,6 +4628,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case Builtin::BI__builtin_rotateright32:
   case Builtin::BI__builtin_rotateright64:
   case Builtin::BI__builtin_stdc_rotate_right:
+  case Builtin::BIstdc_rotate_right_uc:
+  case Builtin::BIstdc_rotate_right_us:
+  case Builtin::BIstdc_rotate_right_ui:
+  case Builtin::BIstdc_rotate_right_ul:
+  case Builtin::BIstdc_rotate_right_ull:
   case Builtin::BI_rotr8: // Microsoft variants of rotate right
   case Builtin::BI_rotr16:
   case Builtin::BI_rotr:
@@ -4636,6 +4646,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
     case Builtin::BI__builtin_rotateright32:
     case Builtin::BI__builtin_rotateright64:
     case Builtin::BI__builtin_stdc_rotate_right:
+    case Builtin::BIstdc_rotate_right_uc:
+    case Builtin::BIstdc_rotate_right_us:
+    case Builtin::BIstdc_rotate_right_ui:
+    case Builtin::BIstdc_rotate_right_ul:
+    case Builtin::BIstdc_rotate_right_ull:
     case Builtin::BI_rotr8:
     case Builtin::BI_rotr16:
     case Builtin::BI_rotr:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 81b42ef1467c7..5f09c9ea4a7b8 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -16874,6 +16874,16 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
   case Builtin::BI__builtin_rotateright64:
   case Builtin::BI__builtin_stdc_rotate_left:
   case Builtin::BI__builtin_stdc_rotate_right:
+  case Builtin::BIstdc_rotate_left_uc:
+  case Builtin::BIstdc_rotate_left_us:
+  case Builtin::BIstdc_rotate_left_ui:
+  case Builtin::BIstdc_rotate_left_ul:
+  case Builtin::BIstdc_rotate_left_ull:
+  case Builtin::BIstdc_rotate_right_uc:
+  case Builtin::BIstdc_rotate_right_us:
+  case Builtin::BIstdc_rotate_right_ui:
+  case Builtin::BIstdc_rotate_right_ul:
+  case Builtin::BIstdc_rotate_right_ull:
   case Builtin::BI_rotl8: // Microsoft variants of rotate left
   case Builtin::BI_rotl16:
   case Builtin::BI_rotl:
@@ -16897,6 +16907,11 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     case Builtin::BI__builtin_rotateright32:
     case Builtin::BI__builtin_rotateright64:
     case Builtin::BI__builtin_stdc_rotate_right:
+    case Builtin::BIstdc_rotate_right_uc:
+    case Builtin::BIstdc_rotate_right_us:
+    case Builtin::BIstdc_rotate_right_ui:
+    case Builtin::BIstdc_rotate_right_ul:
+    case Builtin::BIstdc_rotate_right_ull:
     case Builtin::BI_rotr8:
     case Builtin::BI_rotr16:
     case Builtin::BI_rotr:
diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp
index 72735b04a16ba..49517fc748112 100644
--- a/clang/lib/Basic/Builtins.cpp
+++ b/clang/lib/Basic/Builtins.cpp
@@ -194,6 +194,9 @@ static bool builtinIsSupported(const llvm::StringTable &Strings,
   /* C23 unsupported */
   if (!LangOpts.C23 && BuiltinInfo.Langs == C23_LANG)
     return false;
+  /* C2y unsupported */
+  if (!LangOpts.C2y && BuiltinInfo.Langs == C2Y_LANG)
+    return false;
   return true;
 }
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f29e27818d7ec..1f97c67f26c8e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3770,6 +3770,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_rotateleft32:
   case Builtin::BI__builtin_rotateleft64:
   case Builtin::BI__builtin_stdc_rotate_left:
+  case Builtin::BIstdc_rotate_left_uc:
+  case Builtin::BIstdc_rotate_left_us:
+  case Builtin::BIstdc_rotate_left_ui:
+  case Builtin::BIstdc_rotate_left_ul:
+  case Builtin::BIstdc_rotate_left_ull:
   case Builtin::BI_rotl8: // Microsoft variants of rotate left
   case Builtin::BI_rotl16:
   case Builtin::BI_rotl:
@@ -3782,6 +3787,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_rotateright32:
   case Builtin::BI__builtin_rotateright64:
   case Builtin::BI__builtin_stdc_rotate_right:
+  case Builtin::BIstdc_rotate_right_uc:
+  case Builtin::BIstdc_rotate_right_us:
+  case Builtin::BIstdc_rotate_right_ui:
+  case Builtin::BIstdc_rotate_right_ul:
+  case Builtin::BIstdc_rotate_right_ull:
   case Builtin::BI_rotr8: // Microsoft variants of rotate right
   case Builtin::BI_rotr16:
   case Builtin::BI_rotr:
diff --git a/clang/test/CodeGen/Inputs/stdbit.h b/clang/test/CodeGen/Inputs/stdbit.h
index 529f456151007..bdabb9ec8ee7d 100644
--- a/clang/test/CodeGen/Inputs/stdbit.h
+++ b/clang/test/CodeGen/Inputs/stdbit.h
@@ -100,4 +100,16 @@ unsigned int stdc_bit_ceil_ui(unsigned int);
 unsigned long stdc_bit_ceil_ul(unsigned long);
 unsigned long long stdc_bit_ceil_ull(unsigned long long);
 
+unsigned char stdc_rotate_left_uc(unsigned char, unsigned int);
+unsigned short stdc_rotate_left_us(unsigned short, unsigned int);
+unsigned int stdc_rotate_left_ui(unsigned int, unsigned int);
+unsigned long stdc_rotate_left_ul(unsigned long, unsigned int);
+unsigned long long stdc_rotate_left_ull(unsigned long long, unsigned int);
+
+unsigned char stdc_rotate_right_uc(unsigned char, unsigned int);
+unsigned short stdc_rotate_right_us(unsigned short, unsigned int);
+unsigned int stdc_rotate_right_ui(unsigned int, unsigned int);
+unsigned long stdc_rotate_right_ul(unsigned long, unsigned int);
+unsigned long long stdc_rotate_right_ull(unsigned long long, unsigned int);
+
 #endif
diff --git a/clang/test/CodeGen/builtin-rotate.c b/clang/test/CodeGen/builtin-rotate.c
index 6e2d3f3dd31b6..85ee9781f2f0c 100644
--- a/clang/test/CodeGen/builtin-rotate.c
+++ b/clang/test/CodeGen/builtin-rotate.c
@@ -1,6 +1,8 @@
 // RUN: %clang_cc1 -ffreestanding %s -emit-llvm -o - | FileCheck %s
 // RUN: %if clang-target-64-bits %{ %clang_cc1 -ffreestanding %s -emit-llvm -o - | FileCheck %s --check-prefix=INT128 %}
+// RUN: %clang_cc1 -std=c2y -isystem %S/Inputs -DTEST_C2Y_LIB_SPELLINGS %s -emit-llvm -o - | FileCheck %s --check-prefix=C2Y
 
+#ifndef TEST_C2Y_LIB_SPELLINGS
 #include
 
 unsigned char rotl8(unsigned char x, unsigned char y) {
@@ -313,3 +315,89 @@ void test_int128_rotate(unsigned __int128 u128) {
   result_u128 = __builtin_stdc_rotate_right(u128, 32);
 }
 #endif
+
+#endif // !TEST_C2Y_LIB_SPELLINGS
+
+#ifdef TEST_C2Y_LIB_SPELLINGS
+#include 
+
+// C2Y-LABEL: test_typed_rotate_left_uc
+// C2Y:  call i8 @llvm.fshl.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+unsigned char test_typed_rotate_left_uc(unsigned char x, unsigned int cnt) {
+  return stdc_rotate_left_uc(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_left_us
+// C2Y:  call i16 @llvm.fshl.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+unsigned short test_typed_rotate_left_us(unsigned short x, unsigned int cnt) {
+  return stdc_rotate_left_us(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_left_ui
+// C2Y:  call i32 @llvm.fshl.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+unsigned int test_typed_rotate_left_ui(unsigned int x, unsigned int cnt) {
+  return stdc_rotate_left_ui(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_left_ul
+// C2Y:  call {{i32|i64}} @llvm.fshl.{{i32|i64}}({{i32|i64}} %{{.*}}, {{i32|i64}} %{{.*}}, {{i32|i64}} %{{.*}})
+unsigned long test_typed_rotate_left_ul(unsigned long x, unsigned int cnt) {
+  return stdc_rotate_left_ul(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_left_ull
+// C2Y:  call i64 @llvm.fshl.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+unsigned long long test_typed_rotate_left_ull(unsigned long long x, unsigned int cnt) {
+  return stdc_rotate_left_ull(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_right_uc
+// C2Y:  call i8 @llvm.fshr.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+unsigned char test_typed_rotate_right_uc(unsigned char x, unsigned int cnt) {
+  return stdc_rotate_right_uc(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_right_us
+// C2Y:  call i16 @llvm.fshr.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+unsigned short test_typed_rotate_right_us(unsigned short x, unsigned int cnt) {
+  return stdc_rotate_right_us(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_right_ui
+// C2Y:  call i32 @llvm.fshr.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+unsigned int test_typed_rotate_right_ui(unsigned int x, unsigned int cnt) {
+  return stdc_rotate_right_ui(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_right_ul
+// C2Y:  call {{i32|i64}} @llvm.fshr.{{i32|i64}}({{i32|i64}} %{{.*}}, {{i32|i64}} %{{.*}}, {{i32|i64}} %{{.*}})
+unsigned long test_typed_rotate_right_ul(unsigned long x, unsigned int cnt) {
+  return stdc_rotate_right_ul(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_right_ull
+// C2Y:  call i64 @llvm.fshr.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+unsigned long long test_typed_rotate_right_ull(unsigned long long x, unsigned int cnt) {
+  return stdc_rotate_right_ull(x, cnt);
+}
+
+// C2Y-LABEL: test_typed_rotate_constant_count
+// C2Y:  call i8 @llvm.fshl.i8(i8 %{{.*}}, i8 %{{.*}}, i8 3)
+// C2Y:  call i8 @llvm.fshr.i8(i8 %{{.*}}, i8 %{{.*}}, i8 3)
+// C2Y:  call i16 @llvm.fshl.i16(i16 %{{.*}}, i16 %{{.*}}, i16 5)
+// C2Y:  call i32 @llvm.fshl.i32(i32 %{{.*}}, i32 %{{.*}}, i32 8)
+// C2Y:  call i64 @llvm.fshr.i64(i64 %{{.*}}, i64 %{{.*}}, i64 16)
+void test_typed_rotate_constant_count(unsigned char uc, unsigned short us,
+                                      unsigned int ui, unsigned long long ull) {
+  volatile unsigned char r_uc;
+  volatile unsigned short r_us;
+  volatile unsigned int r_ui;
+  volatile unsigned long long r_ull;
+  r_uc = stdc_rotate_left_uc(uc, 3);
+  r_uc = stdc_rotate_right_uc(uc, 3);
+  r_us = stdc_rotate_left_us(us, 5);
+  r_ui = stdc_rotate_left_ui(ui, 8);
+  r_ull = stdc_rotate_right_ull(ull, 16);
+}
+
+#endif // TEST_C2Y_LIB_SPELLINGS
diff --git a/clang/test/Sema/Inputs/stdbit.h b/clang/test/Sema/Inputs/stdbit.h
index 529f456151007..bdabb9ec8ee7d 100644
--- a/clang/test/Sema/Inputs/stdbit.h
+++ b/clang/test/Sema/Inputs/stdbit.h
@@ -100,4 +100,16 @@ unsigned int stdc_bit_ceil_ui(unsigned int);
 unsigned long stdc_bit_ceil_ul(unsigned long);
 unsigned long long stdc_bit_ceil_ull(unsigned long long);
 
+unsigned char stdc_rotate_left_uc(unsigned char, unsigned int);
+unsigned short stdc_rotate_left_us(unsigned short, unsigned int);
+unsigned int stdc_rotate_left_ui(unsigned int, unsigned int);
+unsigned long stdc_rotate_left_ul(unsigned long, unsigned int);
+unsigned long long stdc_rotate_left_ull(unsigned long long, unsigned int);
+
+unsigned char stdc_rotate_right_uc(unsigned char, unsigned int);
+unsigned short stdc_rotate_right_us(unsigned short, unsigned int);
+unsigned int stdc_rotate_right_ui(unsigned int, unsigned int);
+unsigned long stdc_rotate_right_ul(unsigned long, unsigned int);
+unsigned long long stdc_rotate_right_ull(unsigned long long, unsigned int);
+
 #endif
diff --git a/clang/test/Sema/builtin-stdc-rotate.c b/clang/test/Sema/builtin-stdc-rotate.c
index d8b506dbd6e46..b7ce7ff5adf25 100644
--- a/clang/test/Sema/builtin-stdc-rotate.c
+++ b/clang/test/Sema/builtin-stdc-rotate.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c2y -isystem %S/Inputs -fsyntax-only -verify %s
 
 _Static_assert(__builtin_stdc_rotate_left((unsigned char)0xB1, 3) == 0x8D, "");
 _Static_assert(__builtin_stdc_rotate_right((unsigned char)0xB1, 3) == 0x36, "");
@@ -130,3 +131,79 @@ void test_invalid_types(float f, int si) {
   (void)__builtin_stdc_rotate_left(-5, 1); // expected-error {{1st argument must be a scalar unsigned integer type (was 'int')}}
   (void)__builtin_stdc_rotate_right(3.0, 1); // expected-error {{1st argument must be a scalar unsigned integer type (was 'double')}}
 }
+
+#ifdef __has_include
+#if __has_include()
+#include 
+
+_Static_assert(stdc_rotate_left_uc(0xB1, 3) == 0x8D, "");
+_Static_assert(stdc_rotate_right_uc(0xB1, 3) == 0x36, "");
+_Static_assert(stdc_rotate_left_us(0x1234, 4) == 0x2341, "");
+_Static_assert(stdc_rotate_right_us(0x1234, 4) == 0x4123, "");
+_Static_assert(stdc_rotate_left_ui(0x12345678U, 8) == 0x34567812U, "");
+_Static_assert(stdc_rotate_right_ui(0x12345678U, 8) == 0x78123456U, "");
+_Static_assert(stdc_rotate_left_ull(0x123456789ABCDEF0ULL, 16) == 0x56789ABCDEF01234ULL, "");
+_Static_assert(stdc_rotate_right_ull(0x123456789ABCDEF0ULL, 16) == 0xDEF0123456789ABCULL, "");
+
+_Static_assert(stdc_rotate_left_uc(0xAB, 0) == 0xAB, "");
+_Static_assert(stdc_rotate_right_uc(0xAB, 0) == 0xAB, "");
+_Static_assert(stdc_rotate_left_us(0x1234, 0) == 0x1234, "");
+_Static_assert(stdc_rotate_right_us(0x1234, 0) == 0x1234, "");
+_Static_assert(stdc_rotate_left_ui(0x12345678U, 0) == 0x12345678U, "");
+_Static_assert(stdc_rotate_right_ui(0x12345678U, 0) == 0x12345678U, "");
+_Static_assert(stdc_rotate_left_ull(0x123456789ABCDEF0ULL, 0) == 0x123456789ABCDEF0ULL, "");
+_Static_assert(stdc_rotate_right_ull(0x123456789ABCDEF0ULL, 0) == 0x123456789ABCDEF0ULL, "");
+
+_Static_assert(stdc_rotate_left_uc(0xAB, 8) == 0xAB, "");
+_Static_assert(stdc_rotate_right_uc(0xAB, 8) == 0xAB, "");
+_Static_assert(stdc_rotate_left_us(0x1234, 16) == 0x1234, "");
+_Static_assert(stdc_rotate_right_us(0x1234, 16) == 0x1234, "");
+_Static_assert(stdc_rotate_left_ui(0x12345678U, 32) == 0x12345678U, "");
+_Static_assert(stdc_rotate_right_ui(0x12345678U, 32) == 0x12345678U, "");
+_Static_assert(stdc_rotate_left_ull(0x123456789ABCDEF0ULL, 64) == 0x123456789ABCDEF0ULL, "");
+_Static_assert(stdc_rotate_right_ull(0x123456789ABCDEF0ULL, 64) == 0x123456789ABCDEF0ULL, "");
+
+_Static_assert(stdc_rotate_left_uc(0xB1, 11) == stdc_rotate_left_uc(0xB1, 3), "");
+_Static_assert(stdc_rotate_right_uc(0xB1, 11) == stdc_rotate_right_uc(0xB1, 3), "");
+_Static_assert(stdc_rotate_left_us(0x1234, 20) == stdc_rotate_left_us(0x1234, 4), "");
+_Static_assert(stdc_rotate_right_us(0x1234, 20) == stdc_rotate_right_us(0x1234, 4), "");
+_Static_assert(stdc_rotate_left_ui(0x12345678U, 40) == stdc_rotate_left_ui(0x12345678U, 8), "");
+_Static_assert(stdc_rotate_right_ui(0x12345678U, 40) == stdc_rotate_right_ui(0x12345678U, 8), "");
+_Static_assert(stdc_rotate_left_ull(0x123456789ABCDEF0ULL, 80) == stdc_rotate_left_ull(0x123456789ABCDEF0ULL, 16), "");
+_Static_assert(stdc_rotate_right_ull(0x123456789ABCDEF0ULL, 80) == stdc_rotate_right_ull(0x123456789ABCDEF0ULL, 16), "");
+
+_Static_assert(stdc_rotate_left_uc(0, 3) == 0, "");
+_Static_assert(stdc_rotate_right_ui(0U, 7) == 0U, "");
+
+_Static_assert(stdc_rotate_left_uc(0xFF, 3) == 0xFF, "");
+_Static_assert(stdc_rotate_right_ull(0xFFFFFFFFFFFFFFFFULL, 7) == 0xFFFFFFFFFFFFFFFFULL, "");
+
+_Static_assert(stdc_rotate_left_uc(0x80, 1) == 0x01, "");
+_Static_assert(stdc_rotate_right_uc(0x01, 1) == 0x80, "");
+_Static_assert(stdc_rotate_left_us(0x8000, 1) == 0x0001, "");
+_Static_assert(stdc_rotate_right_us(0x0001, 1) == 0x8000, "");
+_Static_assert(stdc_rotate_left_ui(0x80000000U, 1) == 0x00000001U, "");
+_Static_assert(stdc_rotate_right_ui(0x00000001U, 1) == 0x80000000U, "");
+_Static_assert(stdc_rotate_left_ull(0x8000000000000000ULL, 1) == 0x0000000000000001ULL, "");
+_Static_assert(stdc_rotate_right_ull(0x0000000000000001ULL, 1) == 0x8000000000000000ULL, "");
+
+enum { ULONG_WIDTH = __SIZEOF_LONG__ * 8 };
+_Static_assert(stdc_rotate_left_ul(0UL, 3) == 0UL, "");
+_Static_assert(stdc_rotate_right_ul(0UL, 3) == 0UL, "");
+_Static_assert(stdc_rotate_left_ul(~0UL, 5) == ~0UL, "");
+_Static_assert(stdc_rotate_right_ul(~0UL, 5) == ~0UL, "");
+_Static_assert(stdc_rotate_left_ul(1UL, ULONG_WIDTH - 1) == (1UL << (ULONG_WIDTH - 1)), "");
+_Static_assert(stdc_rotate_right_ul(1UL << (ULONG_WIDTH - 1), ULONG_WIDTH - 1) == 1UL, "");
+_Static_assert(stdc_rotate_left_ul(1UL, ULONG_WIDTH) == 1UL, "");
+_Static_assert(stdc_rotate_right_ul(1UL, ULONG_WIDTH) == 1UL, "");
+
+void test_typed_variant_errors(void) {
+  stdc_rotate_left_uc(0xAB);               // expected-error {{too few arguments to function call}}
+  stdc_rotate_left_uc(0xAB, 1, 2);         // expected-error {{too many arguments to function call}}
+  stdc_rotate_right_ui(0x12345678U);       // expected-error {{too few arguments to function call}}
+  stdc_rotate_right_ui(0x12345678U, 1, 2); // expected-error {{too many arguments to function call}}
+  stdc_rotate_left_ull(0xFFULL);           // expected-error {{too few arguments to function call}}
+  stdc_rotate_right_ull(0xFFULL, 1, 2);    // expected-error {{too many arguments to function call}}
+}
+#endif
+#endif

From 8572106af8d8099e24167e07503c1d2fc0236396 Mon Sep 17 00:00:00 2001
From: Koakuma 
Date: Tue, 12 May 2026 23:17:33 +0700
Subject: [PATCH 475/538] [SPARC] Add BSWAP tests (#192838)

---
 llvm/test/CodeGen/SPARC/bswap.ll | 523 +++++++++++++++++++++++++++++++
 1 file changed, 523 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPARC/bswap.ll

diff --git a/llvm/test/CodeGen/SPARC/bswap.ll b/llvm/test/CodeGen/SPARC/bswap.ll
new file mode 100644
index 0000000000000..dd389f7902a72
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/bswap.ll
@@ -0,0 +1,523 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=sparc   -mcpu=v9 | FileCheck %s --check-prefix=SPARC32
+; RUN: llc < %s -mtriple=sparcel -mcpu=v9 | FileCheck %s --check-prefix=SPARCEL
+; RUN: llc < %s -mtriple=sparc64 -mcpu=v9 | FileCheck %s --check-prefix=SPARC64
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare i64 @llvm.bswap.i64(i64)
+
+define i16 @u16_bswap(i16 %0) #0 {
+; SPARC32-LABEL: u16_bswap:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    sethi 63, %o1
+; SPARC32-NEXT:    or %o1, 768, %o1
+; SPARC32-NEXT:    and %o0, %o1, %o1
+; SPARC32-NEXT:    srl %o1, 8, %o1
+; SPARC32-NEXT:    sll %o0, 8, %o0
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    or %o0, %o1, %o0
+;
+; SPARCEL-LABEL: u16_bswap:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    sethi 63, %o1
+; SPARCEL-NEXT:    or %o1, 768, %o1
+; SPARCEL-NEXT:    and %o0, %o1, %o1
+; SPARCEL-NEXT:    srl %o1, 8, %o1
+; SPARCEL-NEXT:    sll %o0, 8, %o0
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    or %o0, %o1, %o0
+;
+; SPARC64-LABEL: u16_bswap:
+; SPARC64:       ! %bb.0:
+; SPARC64-NEXT:    sethi 63, %o1
+; SPARC64-NEXT:    or %o1, 768, %o1
+; SPARC64-NEXT:    and %o0, %o1, %o1
+; SPARC64-NEXT:    srl %o1, 8, %o1
+; SPARC64-NEXT:    sll %o0, 8, %o0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    or %o0, %o1, %o0
+  %2 = tail call i16 @llvm.bswap.i16(i16 %0)
+  ret i16 %2
+}
+
+define i32 @u32_bswap(i32 %0) #0 {
+; SPARC32-LABEL: u32_bswap:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    srl %o0, 8, %o1
+; SPARC32-NEXT:    sethi 63, %o2
+; SPARC32-NEXT:    or %o2, 768, %o2
+; SPARC32-NEXT:    and %o1, %o2, %o1
+; SPARC32-NEXT:    srl %o0, 24, %o3
+; SPARC32-NEXT:    or %o1, %o3, %o1
+; SPARC32-NEXT:    and %o0, %o2, %o2
+; SPARC32-NEXT:    sll %o2, 8, %o2
+; SPARC32-NEXT:    sll %o0, 24, %o0
+; SPARC32-NEXT:    or %o0, %o2, %o0
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    or %o0, %o1, %o0
+;
+; SPARCEL-LABEL: u32_bswap:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    srl %o0, 8, %o1
+; SPARCEL-NEXT:    sethi 63, %o2
+; SPARCEL-NEXT:    or %o2, 768, %o2
+; SPARCEL-NEXT:    and %o1, %o2, %o1
+; SPARCEL-NEXT:    srl %o0, 24, %o3
+; SPARCEL-NEXT:    or %o1, %o3, %o1
+; SPARCEL-NEXT:    and %o0, %o2, %o2
+; SPARCEL-NEXT:    sll %o2, 8, %o2
+; SPARCEL-NEXT:    sll %o0, 24, %o0
+; SPARCEL-NEXT:    or %o0, %o2, %o0
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    or %o0, %o1, %o0
+;
+; SPARC64-LABEL: u32_bswap:
+; SPARC64:       ! %bb.0:
+; SPARC64-NEXT:    srl %o0, 8, %o1
+; SPARC64-NEXT:    sethi 63, %o2
+; SPARC64-NEXT:    or %o2, 768, %o2
+; SPARC64-NEXT:    and %o1, %o2, %o1
+; SPARC64-NEXT:    srl %o0, 24, %o3
+; SPARC64-NEXT:    or %o1, %o3, %o1
+; SPARC64-NEXT:    and %o0, %o2, %o2
+; SPARC64-NEXT:    sll %o2, 8, %o2
+; SPARC64-NEXT:    sll %o0, 24, %o0
+; SPARC64-NEXT:    or %o0, %o2, %o0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    or %o0, %o1, %o0
+  %2 = tail call i32 @llvm.bswap.i32(i32 %0)
+  ret i32 %2
+}
+
+define i64 @u64_bswap(i64 %0) #0 {
+; SPARC32-LABEL: u64_bswap:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    srl %o1, 8, %o2
+; SPARC32-NEXT:    sethi 63, %o3
+; SPARC32-NEXT:    or %o3, 768, %o3
+; SPARC32-NEXT:    and %o2, %o3, %o2
+; SPARC32-NEXT:    srl %o1, 24, %o4
+; SPARC32-NEXT:    or %o2, %o4, %o2
+; SPARC32-NEXT:    and %o1, %o3, %o4
+; SPARC32-NEXT:    sll %o4, 8, %o4
+; SPARC32-NEXT:    sll %o1, 24, %o1
+; SPARC32-NEXT:    or %o1, %o4, %o1
+; SPARC32-NEXT:    or %o1, %o2, %o2
+; SPARC32-NEXT:    srl %o0, 8, %o1
+; SPARC32-NEXT:    and %o1, %o3, %o1
+; SPARC32-NEXT:    srl %o0, 24, %o4
+; SPARC32-NEXT:    or %o1, %o4, %o1
+; SPARC32-NEXT:    and %o0, %o3, %o3
+; SPARC32-NEXT:    sll %o3, 8, %o3
+; SPARC32-NEXT:    sll %o0, 24, %o0
+; SPARC32-NEXT:    or %o0, %o3, %o0
+; SPARC32-NEXT:    or %o0, %o1, %o1
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    mov %o2, %o0
+;
+; SPARCEL-LABEL: u64_bswap:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    srl %o1, 8, %o2
+; SPARCEL-NEXT:    sethi 63, %o3
+; SPARCEL-NEXT:    or %o3, 768, %o3
+; SPARCEL-NEXT:    and %o2, %o3, %o2
+; SPARCEL-NEXT:    srl %o1, 24, %o4
+; SPARCEL-NEXT:    or %o2, %o4, %o2
+; SPARCEL-NEXT:    and %o1, %o3, %o4
+; SPARCEL-NEXT:    sll %o4, 8, %o4
+; SPARCEL-NEXT:    sll %o1, 24, %o1
+; SPARCEL-NEXT:    or %o1, %o4, %o1
+; SPARCEL-NEXT:    or %o1, %o2, %o2
+; SPARCEL-NEXT:    srl %o0, 8, %o1
+; SPARCEL-NEXT:    and %o1, %o3, %o1
+; SPARCEL-NEXT:    srl %o0, 24, %o4
+; SPARCEL-NEXT:    or %o1, %o4, %o1
+; SPARCEL-NEXT:    and %o0, %o3, %o3
+; SPARCEL-NEXT:    sll %o3, 8, %o3
+; SPARCEL-NEXT:    sll %o0, 24, %o0
+; SPARCEL-NEXT:    or %o0, %o3, %o0
+; SPARCEL-NEXT:    or %o0, %o1, %o1
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    mov %o2, %o0
+;
+; SPARC64-LABEL: u64_bswap:
+; SPARC64:         .register %g2, #scratch
+; SPARC64-NEXT:  ! %bb.0:
+; SPARC64-NEXT:    srlx %o0, 24, %o1
+; SPARC64-NEXT:    sethi 16320, %o2
+; SPARC64-NEXT:    and %o1, %o2, %o1
+; SPARC64-NEXT:    srlx %o0, 8, %o3
+; SPARC64-NEXT:    sethi 4177920, %o4
+; SPARC64-NEXT:    and %o3, %o4, %o3
+; SPARC64-NEXT:    or %o3, %o1, %o1
+; SPARC64-NEXT:    srlx %o0, 40, %o3
+; SPARC64-NEXT:    sethi 63, %o5
+; SPARC64-NEXT:    or %o5, 768, %o5
+; SPARC64-NEXT:    and %o3, %o5, %o3
+; SPARC64-NEXT:    srlx %o0, 56, %g2
+; SPARC64-NEXT:    or %o3, %g2, %o3
+; SPARC64-NEXT:    or %o1, %o3, %o1
+; SPARC64-NEXT:    and %o0, %o4, %o3
+; SPARC64-NEXT:    sllx %o3, 8, %o3
+; SPARC64-NEXT:    and %o0, %o2, %o2
+; SPARC64-NEXT:    sllx %o2, 24, %o2
+; SPARC64-NEXT:    or %o2, %o3, %o2
+; SPARC64-NEXT:    and %o0, %o5, %o3
+; SPARC64-NEXT:    sllx %o3, 40, %o3
+; SPARC64-NEXT:    sllx %o0, 56, %o0
+; SPARC64-NEXT:    or %o0, %o3, %o0
+; SPARC64-NEXT:    or %o0, %o2, %o0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    or %o0, %o1, %o0
+  %2 = tail call i64 @llvm.bswap.i64(i64 %0)
+  ret i64 %2
+}
+
+define i16 @u16_bswapload(ptr %0) #0 {
+; SPARC32-LABEL: u16_bswapload:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    lduh [%o0], %o0
+; SPARC32-NEXT:    srl %o0, 8, %o1
+; SPARC32-NEXT:    sll %o0, 8, %o0
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    or %o0, %o1, %o0
+;
+; SPARCEL-LABEL: u16_bswapload:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    lduh [%o0], %o0
+; SPARCEL-NEXT:    srl %o0, 8, %o1
+; SPARCEL-NEXT:    sll %o0, 8, %o0
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    or %o0, %o1, %o0
+;
+; SPARC64-LABEL: u16_bswapload:
+; SPARC64:       ! %bb.0:
+; SPARC64-NEXT:    lduh [%o0], %o0
+; SPARC64-NEXT:    srl %o0, 8, %o1
+; SPARC64-NEXT:    sll %o0, 8, %o0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    or %o0, %o1, %o0
+  %2 = load i16, ptr %0, align 2
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i32 @u32_bswapload(ptr %0) #0 {
+; SPARC32-LABEL: u32_bswapload:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    ld [%o0], %o0
+; SPARC32-NEXT:    srl %o0, 8, %o1
+; SPARC32-NEXT:    sethi 63, %o2
+; SPARC32-NEXT:    or %o2, 768, %o2
+; SPARC32-NEXT:    and %o1, %o2, %o1
+; SPARC32-NEXT:    srl %o0, 24, %o3
+; SPARC32-NEXT:    or %o1, %o3, %o1
+; SPARC32-NEXT:    and %o0, %o2, %o2
+; SPARC32-NEXT:    sll %o2, 8, %o2
+; SPARC32-NEXT:    sll %o0, 24, %o0
+; SPARC32-NEXT:    or %o0, %o2, %o0
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    or %o0, %o1, %o0
+;
+; SPARCEL-LABEL: u32_bswapload:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    ld [%o0], %o0
+; SPARCEL-NEXT:    srl %o0, 8, %o1
+; SPARCEL-NEXT:    sethi 63, %o2
+; SPARCEL-NEXT:    or %o2, 768, %o2
+; SPARCEL-NEXT:    and %o1, %o2, %o1
+; SPARCEL-NEXT:    srl %o0, 24, %o3
+; SPARCEL-NEXT:    or %o1, %o3, %o1
+; SPARCEL-NEXT:    and %o0, %o2, %o2
+; SPARCEL-NEXT:    sll %o2, 8, %o2
+; SPARCEL-NEXT:    sll %o0, 24, %o0
+; SPARCEL-NEXT:    or %o0, %o2, %o0
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    or %o0, %o1, %o0
+;
+; SPARC64-LABEL: u32_bswapload:
+; SPARC64:       ! %bb.0:
+; SPARC64-NEXT:    ld [%o0], %o0
+; SPARC64-NEXT:    srl %o0, 8, %o1
+; SPARC64-NEXT:    sethi 63, %o2
+; SPARC64-NEXT:    or %o2, 768, %o2
+; SPARC64-NEXT:    and %o1, %o2, %o1
+; SPARC64-NEXT:    srl %o0, 24, %o3
+; SPARC64-NEXT:    or %o1, %o3, %o1
+; SPARC64-NEXT:    and %o0, %o2, %o2
+; SPARC64-NEXT:    sll %o2, 8, %o2
+; SPARC64-NEXT:    sll %o0, 24, %o0
+; SPARC64-NEXT:    or %o0, %o2, %o0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    or %o0, %o1, %o0
+  %2 = load i32, ptr %0, align 4
+  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
+  ret i32 %3
+}
+
+define i64 @u64_bswapload(ptr %0) #0 {
+; SPARC32-LABEL: u64_bswapload:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    ldd [%o0], %o2
+; SPARC32-NEXT:    srl %o3, 8, %o0
+; SPARC32-NEXT:    sethi 63, %o1
+; SPARC32-NEXT:    or %o1, 768, %o1
+; SPARC32-NEXT:    and %o0, %o1, %o0
+; SPARC32-NEXT:    srl %o3, 24, %o4
+; SPARC32-NEXT:    or %o0, %o4, %o0
+; SPARC32-NEXT:    and %o3, %o1, %o4
+; SPARC32-NEXT:    sll %o4, 8, %o4
+; SPARC32-NEXT:    sll %o3, 24, %o5
+; SPARC32-NEXT:    or %o5, %o4, %o4
+; SPARC32-NEXT:    or %o4, %o0, %o0
+; SPARC32-NEXT:    srl %o2, 8, %o4
+; SPARC32-NEXT:    and %o4, %o1, %o4
+; SPARC32-NEXT:    srl %o2, 24, %o5
+; SPARC32-NEXT:    or %o4, %o5, %o4
+; SPARC32-NEXT:    and %o2, %o1, %o1
+; SPARC32-NEXT:    sll %o1, 8, %o1
+; SPARC32-NEXT:    sll %o2, 24, %o2
+; SPARC32-NEXT:    or %o2, %o1, %o1
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    or %o1, %o4, %o1
+;
+; SPARCEL-LABEL: u64_bswapload:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    ldd [%o0], %o2
+; SPARCEL-NEXT:    srl %o3, 8, %o0
+; SPARCEL-NEXT:    sethi 63, %o1
+; SPARCEL-NEXT:    or %o1, 768, %o1
+; SPARCEL-NEXT:    and %o0, %o1, %o0
+; SPARCEL-NEXT:    srl %o3, 24, %o4
+; SPARCEL-NEXT:    or %o0, %o4, %o0
+; SPARCEL-NEXT:    and %o3, %o1, %o4
+; SPARCEL-NEXT:    sll %o4, 8, %o4
+; SPARCEL-NEXT:    sll %o3, 24, %o5
+; SPARCEL-NEXT:    or %o5, %o4, %o4
+; SPARCEL-NEXT:    or %o4, %o0, %o0
+; SPARCEL-NEXT:    srl %o2, 8, %o4
+; SPARCEL-NEXT:    and %o4, %o1, %o4
+; SPARCEL-NEXT:    srl %o2, 24, %o5
+; SPARCEL-NEXT:    or %o4, %o5, %o4
+; SPARCEL-NEXT:    and %o2, %o1, %o1
+; SPARCEL-NEXT:    sll %o1, 8, %o1
+; SPARCEL-NEXT:    sll %o2, 24, %o2
+; SPARCEL-NEXT:    or %o2, %o1, %o1
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    or %o1, %o4, %o1
+;
+; SPARC64-LABEL: u64_bswapload:
+; SPARC64:         .register %g2, #scratch
+; SPARC64-NEXT:  ! %bb.0:
+; SPARC64-NEXT:    ldx [%o0], %o0
+; SPARC64-NEXT:    srlx %o0, 24, %o1
+; SPARC64-NEXT:    sethi 16320, %o2
+; SPARC64-NEXT:    and %o1, %o2, %o1
+; SPARC64-NEXT:    srlx %o0, 8, %o3
+; SPARC64-NEXT:    sethi 4177920, %o4
+; SPARC64-NEXT:    and %o3, %o4, %o3
+; SPARC64-NEXT:    or %o3, %o1, %o1
+; SPARC64-NEXT:    srlx %o0, 40, %o3
+; SPARC64-NEXT:    sethi 63, %o5
+; SPARC64-NEXT:    or %o5, 768, %o5
+; SPARC64-NEXT:    and %o3, %o5, %o3
+; SPARC64-NEXT:    srlx %o0, 56, %g2
+; SPARC64-NEXT:    or %o3, %g2, %o3
+; SPARC64-NEXT:    or %o1, %o3, %o1
+; SPARC64-NEXT:    and %o0, %o4, %o3
+; SPARC64-NEXT:    sllx %o3, 8, %o3
+; SPARC64-NEXT:    and %o0, %o2, %o2
+; SPARC64-NEXT:    sllx %o2, 24, %o2
+; SPARC64-NEXT:    or %o2, %o3, %o2
+; SPARC64-NEXT:    and %o0, %o5, %o3
+; SPARC64-NEXT:    sllx %o3, 40, %o3
+; SPARC64-NEXT:    sllx %o0, 56, %o0
+; SPARC64-NEXT:    or %o0, %o3, %o0
+; SPARC64-NEXT:    or %o0, %o2, %o0
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    or %o0, %o1, %o0
+  %2 = load i64, ptr %0, align 8
+  %3 = tail call i64 @llvm.bswap.i64(i64 %2)
+  ret i64 %3
+}
+
+define void @u16_bswapstore(ptr %0, i16 %1) #0 {
+; SPARC32-LABEL: u16_bswapstore:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    sethi 63, %o2
+; SPARC32-NEXT:    or %o2, 768, %o2
+; SPARC32-NEXT:    and %o1, %o2, %o2
+; SPARC32-NEXT:    srl %o2, 8, %o2
+; SPARC32-NEXT:    sll %o1, 8, %o1
+; SPARC32-NEXT:    or %o1, %o2, %o1
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    sth %o1, [%o0]
+;
+; SPARCEL-LABEL: u16_bswapstore:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    sethi 63, %o2
+; SPARCEL-NEXT:    or %o2, 768, %o2
+; SPARCEL-NEXT:    and %o1, %o2, %o2
+; SPARCEL-NEXT:    srl %o2, 8, %o2
+; SPARCEL-NEXT:    sll %o1, 8, %o1
+; SPARCEL-NEXT:    or %o1, %o2, %o1
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    sth %o1, [%o0]
+;
+; SPARC64-LABEL: u16_bswapstore:
+; SPARC64:       ! %bb.0:
+; SPARC64-NEXT:    sethi 63, %o2
+; SPARC64-NEXT:    or %o2, 768, %o2
+; SPARC64-NEXT:    and %o1, %o2, %o2
+; SPARC64-NEXT:    srl %o2, 8, %o2
+; SPARC64-NEXT:    sll %o1, 8, %o1
+; SPARC64-NEXT:    or %o1, %o2, %o1
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    sth %o1, [%o0]
+  %3 = tail call i16 @llvm.bswap.i16(i16 %1)
+  store i16 %3, ptr %0, align 2
+  ret void
+}
+
+define void @u32_bswapstore(ptr %0, i32 %1) #0 {
+; SPARC32-LABEL: u32_bswapstore:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    srl %o1, 8, %o2
+; SPARC32-NEXT:    sethi 63, %o3
+; SPARC32-NEXT:    or %o3, 768, %o3
+; SPARC32-NEXT:    and %o2, %o3, %o2
+; SPARC32-NEXT:    srl %o1, 24, %o4
+; SPARC32-NEXT:    or %o2, %o4, %o2
+; SPARC32-NEXT:    and %o1, %o3, %o3
+; SPARC32-NEXT:    sll %o3, 8, %o3
+; SPARC32-NEXT:    sll %o1, 24, %o1
+; SPARC32-NEXT:    or %o1, %o3, %o1
+; SPARC32-NEXT:    or %o1, %o2, %o1
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    st %o1, [%o0]
+;
+; SPARCEL-LABEL: u32_bswapstore:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    srl %o1, 8, %o2
+; SPARCEL-NEXT:    sethi 63, %o3
+; SPARCEL-NEXT:    or %o3, 768, %o3
+; SPARCEL-NEXT:    and %o2, %o3, %o2
+; SPARCEL-NEXT:    srl %o1, 24, %o4
+; SPARCEL-NEXT:    or %o2, %o4, %o2
+; SPARCEL-NEXT:    and %o1, %o3, %o3
+; SPARCEL-NEXT:    sll %o3, 8, %o3
+; SPARCEL-NEXT:    sll %o1, 24, %o1
+; SPARCEL-NEXT:    or %o1, %o3, %o1
+; SPARCEL-NEXT:    or %o1, %o2, %o1
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    st %o1, [%o0]
+;
+; SPARC64-LABEL: u32_bswapstore:
+; SPARC64:       ! %bb.0:
+; SPARC64-NEXT:    srl %o1, 8, %o2
+; SPARC64-NEXT:    sethi 63, %o3
+; SPARC64-NEXT:    or %o3, 768, %o3
+; SPARC64-NEXT:    and %o2, %o3, %o2
+; SPARC64-NEXT:    srl %o1, 24, %o4
+; SPARC64-NEXT:    or %o2, %o4, %o2
+; SPARC64-NEXT:    and %o1, %o3, %o3
+; SPARC64-NEXT:    sll %o3, 8, %o3
+; SPARC64-NEXT:    sll %o1, 24, %o1
+; SPARC64-NEXT:    or %o1, %o3, %o1
+; SPARC64-NEXT:    or %o1, %o2, %o1
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    st %o1, [%o0]
+  %3 = tail call i32 @llvm.bswap.i32(i32 %1)
+  store i32 %3, ptr %0, align 4
+  ret void
+}
+
+define void @u64_bswapstore(ptr %0, i64 %1) #0 {
+; SPARC32-LABEL: u64_bswapstore:
+; SPARC32:       ! %bb.0:
+; SPARC32-NEXT:    srl %o1, 8, %o3
+; SPARC32-NEXT:    sethi 63, %o4
+; SPARC32-NEXT:    or %o4, 768, %o4
+; SPARC32-NEXT:    and %o3, %o4, %o3
+; SPARC32-NEXT:    srl %o1, 24, %o5
+; SPARC32-NEXT:    or %o3, %o5, %o3
+; SPARC32-NEXT:    and %o1, %o4, %o5
+; SPARC32-NEXT:    sll %o5, 8, %o5
+; SPARC32-NEXT:    sll %o1, 24, %o1
+; SPARC32-NEXT:    or %o1, %o5, %o1
+; SPARC32-NEXT:    or %o1, %o3, %g3
+; SPARC32-NEXT:    srl %o2, 8, %o1
+; SPARC32-NEXT:    and %o1, %o4, %o1
+; SPARC32-NEXT:    srl %o2, 24, %o3
+; SPARC32-NEXT:    or %o1, %o3, %o1
+; SPARC32-NEXT:    and %o2, %o4, %o3
+; SPARC32-NEXT:    sll %o3, 8, %o3
+; SPARC32-NEXT:    sll %o2, 24, %o2
+; SPARC32-NEXT:    or %o2, %o3, %o2
+; SPARC32-NEXT:    or %o2, %o1, %g2
+; SPARC32-NEXT:    retl
+; SPARC32-NEXT:    std %g2, [%o0]
+;
+; SPARCEL-LABEL: u64_bswapstore:
+; SPARCEL:       ! %bb.0:
+; SPARCEL-NEXT:    srl %o1, 8, %o3
+; SPARCEL-NEXT:    sethi 63, %o4
+; SPARCEL-NEXT:    or %o4, 768, %o4
+; SPARCEL-NEXT:    and %o3, %o4, %o3
+; SPARCEL-NEXT:    srl %o1, 24, %o5
+; SPARCEL-NEXT:    or %o3, %o5, %o3
+; SPARCEL-NEXT:    and %o1, %o4, %o5
+; SPARCEL-NEXT:    sll %o5, 8, %o5
+; SPARCEL-NEXT:    sll %o1, 24, %o1
+; SPARCEL-NEXT:    or %o1, %o5, %o1
+; SPARCEL-NEXT:    or %o1, %o3, %g3
+; SPARCEL-NEXT:    srl %o2, 8, %o1
+; SPARCEL-NEXT:    and %o1, %o4, %o1
+; SPARCEL-NEXT:    srl %o2, 24, %o3
+; SPARCEL-NEXT:    or %o1, %o3, %o1
+; SPARCEL-NEXT:    and %o2, %o4, %o3
+; SPARCEL-NEXT:    sll %o3, 8, %o3
+; SPARCEL-NEXT:    sll %o2, 24, %o2
+; SPARCEL-NEXT:    or %o2, %o3, %o2
+; SPARCEL-NEXT:    or %o2, %o1, %g2
+; SPARCEL-NEXT:    retl
+; SPARCEL-NEXT:    std %g2, [%o0]
+;
+; SPARC64-LABEL: u64_bswapstore:
+; SPARC64:         .register %g2, #scratch
+; SPARC64-NEXT:    .register %g3, #scratch
+; SPARC64-NEXT:  ! %bb.0:
+; SPARC64-NEXT:    srlx %o1, 24, %o2
+; SPARC64-NEXT:    sethi 16320, %o3
+; SPARC64-NEXT:    and %o2, %o3, %o2
+; SPARC64-NEXT:    srlx %o1, 8, %o4
+; SPARC64-NEXT:    sethi 4177920, %o5
+; SPARC64-NEXT:    and %o4, %o5, %o4
+; SPARC64-NEXT:    or %o4, %o2, %o2
+; SPARC64-NEXT:    srlx %o1, 40, %o4
+; SPARC64-NEXT:    sethi 63, %g2
+; SPARC64-NEXT:    or %g2, 768, %g2
+; SPARC64-NEXT:    and %o4, %g2, %o4
+; SPARC64-NEXT:    srlx %o1, 56, %g3
+; SPARC64-NEXT:    or %o4, %g3, %o4
+; SPARC64-NEXT:    or %o2, %o4, %o2
+; SPARC64-NEXT:    and %o1, %o5, %o4
+; SPARC64-NEXT:    sllx %o4, 8, %o4
+; SPARC64-NEXT:    and %o1, %o3, %o3
+; SPARC64-NEXT:    sllx %o3, 24, %o3
+; SPARC64-NEXT:    or %o3, %o4, %o3
+; SPARC64-NEXT:    and %o1, %g2, %o4
+; SPARC64-NEXT:    sllx %o4, 40, %o4
+; SPARC64-NEXT:    sllx %o1, 56, %o1
+; SPARC64-NEXT:    or %o1, %o4, %o1
+; SPARC64-NEXT:    or %o1, %o3, %o1
+; SPARC64-NEXT:    or %o1, %o2, %o1
+; SPARC64-NEXT:    retl
+; SPARC64-NEXT:    stx %o1, [%o0]
+  %3 = tail call i64 @llvm.bswap.i64(i64 %1)
+  store i64 %3, ptr %0, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }

From a185f46661fab005a253fbb54009417f0964b03a Mon Sep 17 00:00:00 2001
From: Charles Zablit 
Date: Tue, 12 May 2026 17:30:55 +0100
Subject: [PATCH 476/538] [lldb][windows] fix 4-byte error-code read (#197177)

Reading `word_size` (8) bytes here would include 4 bytes of stack
garbage past the struct and produce bogus error codes.
---
 lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp
index 9e11b66068381..c82841ab029aa 100644
--- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp
+++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp
@@ -398,10 +398,10 @@ uint32_t PlatformWindows::DoLoadImage(Process *process,
   }
 
   if (!token) {
-    // XXX(compnerd) should we use the compiler to get the sizeof(unsigned)?
-    uint64_t error_code =
-        process->ReadUnsignedIntegerFromMemory(injected_result + 2 * word_size + sizeof(unsigned),
-                                               word_size, 0, status);
+    // ErrorCode is a 4-byte `unsigned` field in __lldb_LoadLibraryResult.
+    uint64_t error_code = process->ReadUnsignedIntegerFromMemory(
+        injected_result + 2 * word_size + sizeof(unsigned), sizeof(unsigned), 0,
+        status);
     if (status.Fail()) {
       error = Status::FromErrorStringWithFormat(
           "LoadLibrary error: could not read error status: %s",

From 2269467da859965bfcb75b895f3a294137c5ed11 Mon Sep 17 00:00:00 2001
From: David Rivera 
Date: Tue, 12 May 2026 12:36:26 -0400
Subject: [PATCH 477/538] [CIR][HIP] Handle HIP module constructor and
 destructor emission (#195391)

Related: https://github.com/llvm/llvm-project/issues/179278,
https://github.com/llvm/llvm-project/issues/175871

Similar to https://github.com/llvm/llvm-project/pull/188673, This adds
the HIP host-side module registration path in CIR lowering for the
non-RDC, included-fatbin case.

Generated sequence for HIP, non-RDC, with `-fcuda-include-gpubinary`:

  ```c
  void **__hip_gpubin_handle = nullptr;

  void __hip_module_ctor() {
      if (__hip_gpubin_handle == nullptr)
          __hip_gpubin_handle = __hipRegisterFatBinary(&__hip_fatbin_wrapper);

      __hip_register_globals(__hip_gpubin_handle); // we only register kernels so far.
      atexit(__hip_module_dtor);
  }

  void __hip_module_dtor() {
      if (__hip_gpubin_handle != nullptr) {
          __hipUnregisterFatBinary(__hip_gpubin_handle);
          __hip_gpubin_handle = nullptr;
      }
  }
```

Another divergence I added (but not depicted) is the per-kernel `__hipRegisterFunction` calls using the HIP kernel-handle global, not the device-stub function symbol.
---
 clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp        |   4 +-
 .../Dialect/Transforms/LoweringPrepare.cpp    | 159 ++++++++++++++++--
 clang/test/CIR/CodeGenCUDA/device-stub.cu     | 157 +++++++++++++++++
 clang/test/CIR/CodeGenCUDA/kernel-call.cu     |   4 +-
 clang/test/CIR/CodeGenHIP/simple.cpp          |   2 +-
 5 files changed, 310 insertions(+), 16 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index 1bae5e470aadd..0466c212fe165 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -333,9 +333,9 @@ mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
   CIRGenBuilderTy &builder = cgm.getBuilder();
   StringRef globalName = cgm.getMangledName(
       gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
+  cir::PointerType fnPtrTy = builder.getPointerTo(fn.getFunctionType());
   cir::GlobalOp globalOp = CIRGenModule::createGlobalOp(
-      cgm, fn.getLoc(), globalName, fn.getFunctionType(),
-      /*isConstant=*/true);
+      cgm, fn.getLoc(), globalName, fnPtrTy, /*isConstant=*/true);
 
   globalOp->setAttr("alignment", builder.getI64IntegerAttr(
                                      cgm.getPointerAlign().getQuantity()));
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index ec16c162dbad4..5ada3248d3185 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -151,6 +151,7 @@ struct LoweringPreparePass
   /// with the CUDA runtime.
   void buildCUDAModuleCtor();
   std::optional buildCUDAModuleDtor();
+  std::optional buildHIPModuleDtor();
   std::optional buildCUDARegisterGlobals();
   void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
                                         FuncOp regGlobalFunc);
@@ -1935,8 +1936,6 @@ static std::string addUnderscoredPrefix(llvm::StringRef prefix,
 void LoweringPreparePass::buildCUDAModuleCtor() {
   bool isHIP = astCtx->getLangOpts().HIP;
 
-  if (isHIP)
-    assert(!cir::MissingFeatures::hipModuleCtor());
   if (astCtx->getLangOpts().GPURelocatableDeviceCode)
     llvm_unreachable("GPU RDC NYI");
 
@@ -2063,8 +2062,67 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
   assert(!cir::MissingFeatures::opGlobalCtorPriority());
   if (isHIP) {
-    llvm_unreachable("HIP Module Constructor Support");
-  } else if (!astCtx->getLangOpts().GPURelocatableDeviceCode) {
+    // --- Create HIP CTOR ---
+    //   if (__hip_gpubin_handle == nullptr)
+    //     __hip_gpubin_handle = __hipRegisterFatBinary(&fatbinWrapper);
+    //   __hip_register_globals(__hip_gpubin_handle);
+    //   atexit(__hip_module_dtor);
+    mlir::Block *entryBlock = builder.getInsertionBlock();
+    mlir::Region *parent = entryBlock->getParent();
+    mlir::Block *ifBlock = builder.createBlock(parent);
+    mlir::Block *exitBlock = builder.createBlock(parent);
+    {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToEnd(entryBlock);
+      mlir::Value handle =
+          builder.createLoad(loc, builder.createGetGlobal(gpuBinHandle));
+      auto handlePtrTy = mlir::cast(handle.getType());
+      mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc);
+      mlir::Value isNull =
+          builder.createCompare(loc, cir::CmpOpKind::eq, handle, nullPtr);
+      cir::BrCondOp::create(builder, loc, isNull, ifBlock, exitBlock);
+    }
+    {
+      // Handle is null: load the fatbin and register it.
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToStart(ifBlock);
+      mlir::Value wrapper = builder.createGetGlobal(fatbinWrapper);
+      mlir::Value fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
+      cir::CallOp gpuBinaryHandleCall =
+          builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+      mlir::Value gpuBinaryHandle = gpuBinaryHandleCall.getResult();
+      // Store the value back to the global `__hip_gpubin_handle`.
+      mlir::Value gpuBinaryHandleGlobal = builder.createGetGlobal(gpuBinHandle);
+      builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
+      cir::BrOp::create(builder, loc, exitBlock);
+    }
+    {
+      // Exit block: load the (possibly newly-registered) handle, call
+      // __hip_register_globals, and register the module dtor with atexit().
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToStart(exitBlock);
+      mlir::Value gHandle =
+          builder.createLoad(loc, builder.createGetGlobal(gpuBinHandle));
+
+      if (std::optional regGlobal = buildCUDARegisterGlobals())
+        builder.createCallOp(loc, *regGlobal, gHandle);
+
+      if (std::optional dtor = buildHIPModuleDtor()) {
+        cir::CIRBaseBuilderTy globalBuilder(getContext());
+        globalBuilder.setInsertionPointToStart(mlirModule.getBody());
+        FuncOp atexit = buildRuntimeFunction(
+            globalBuilder, "atexit", loc,
+            FuncType::get(PointerType::get(dtor->getFunctionType()), intTy));
+        mlir::Value dtorFunc = GetGlobalOp::create(
+            builder, loc, PointerType::get(dtor->getFunctionType()),
+            mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr()));
+        builder.createCallOp(loc, atexit, dtorFunc);
+      }
+      cir::ReturnOp::create(builder, loc);
+    }
+    return;
+  }
+  if (!astCtx->getLangOpts().GPURelocatableDeviceCode) {
 
     // --- Create CUDA CTOR-DTOR ---
     // Register binary with CUDA runtime. This is substantially different in
@@ -2165,6 +2223,77 @@ std::optional LoweringPreparePass::buildCUDAModuleDtor() {
   return dtor;
 }
 
+/// Build the HIP module dtor:
+///
+///     void __hip_module_dtor() {
+///       if (__hip_gpubin_handle != nullptr) {
+///         __hipUnregisterFatBinary(__hip_gpubin_handle);
+///         __hip_gpubin_handle = nullptr;
+///       }
+///     }
+///
+/// Despite the name, OG doesn't treat this as a real destructor: putting it on
+/// the dtor list would cause a double-free. It is meant to be registered via
+/// atexit() at the end of the module ctor.
+std::optional LoweringPreparePass::buildHIPModuleDtor() {
+  if (!mlirModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
+    return {};
+
+  llvm::StringRef prefix = getCUDAPrefix(astCtx);
+
+  VoidType voidTy = VoidType::get(&getContext());
+  PointerType voidPtrPtrTy = PointerType::get(PointerType::get(voidTy));
+
+  mlir::Location loc = mlirModule.getLoc();
+
+  cir::CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(mlirModule.getBody());
+
+  // void __hipUnregisterFatBinary(void ** handle);
+  std::string unregisterFuncName =
+      addUnderscoredPrefix(prefix, "UnregisterFatBinary");
+  FuncOp unregisterFunc = buildRuntimeFunction(
+      builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy));
+
+  std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor");
+  FuncOp dtor =
+      buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy),
+                           GlobalLinkageKind::InternalLinkage);
+
+  std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle");
+  GlobalOp gpuBinGlobal = cast(mlirModule.lookupSymbol(gpubinName));
+
+  mlir::Block *entryBlock = dtor.addEntryBlock();
+  mlir::Block *ifBlock = builder.createBlock(&dtor.getBody());
+  mlir::Block *exitBlock = builder.createBlock(&dtor.getBody());
+
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(entryBlock);
+  mlir::Value handle =
+      builder.createLoad(loc, builder.createGetGlobal(gpuBinGlobal));
+  auto handlePtrTy = mlir::cast(handle.getType());
+  mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc);
+  mlir::Value isNotNull =
+      builder.createCompare(loc, cir::CmpOpKind::ne, handle, nullPtr);
+  cir::BrCondOp::create(builder, loc, isNotNull, ifBlock, exitBlock);
+
+  {
+    // Handle is non-null: unregister and clear it.
+    mlir::OpBuilder::InsertionGuard ifGuard(builder);
+    builder.setInsertionPointToStart(ifBlock);
+    builder.createCallOp(loc, unregisterFunc, handle);
+    builder.createStore(loc, nullPtr, builder.createGetGlobal(gpuBinGlobal));
+    cir::BrOp::create(builder, loc, exitBlock);
+  }
+  {
+    mlir::OpBuilder::InsertionGuard exitGuard(builder);
+    builder.setInsertionPointToStart(exitBlock);
+    cir::ReturnOp::create(builder, loc);
+  }
+
+  return dtor;
+}
+
 std::optional LoweringPreparePass::buildCUDARegisterGlobals() {
   // There is nothing to register.
   if (cudaKernelMap.empty())
@@ -2257,20 +2386,28 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
     mlir::Value deviceFunc = builder.createBitcast(
         builder.createGetGlobal(deviceFuncStr), voidPtrTy);
 
+    mlir::Value hostFunc;
     if (isHIP) {
-      llvm_unreachable("HIP kernel registration NYI");
+      // Under HIP, the kernel-handle is a GlobalOp shadow created by CIR
+      // codegen and named with the kernel-reference mangled name (e.g.
+      // `@_Z2fnv` pointing at the device-stub function
+      // `_Z17__device_stub__fnv`). The CUDAKernelNameAttr on the device-stub
+      // uses the same name, so we can resolve the shadow by symbol lookup.
+      auto funcHandle = cast(mlirModule.lookupSymbol(kernelName));
+      hostFunc =
+          builder.createBitcast(builder.createGetGlobal(funcHandle), voidPtrTy);
     } else {
-      mlir::Value hostFunc = builder.createBitcast(
+      hostFunc = builder.createBitcast(
           GetGlobalOp::create(
               builder, loc, PointerType::get(deviceStub.getFunctionType()),
               mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
           voidPtrTy);
-      builder.createCallOp(
-          loc, cudaRegisterFunction,
-          {fatbinHandle, hostFunc, deviceFunc, deviceFunc,
-           ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)),
-           cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
     }
+    builder.createCallOp(
+        loc, cudaRegisterFunction,
+        {fatbinHandle, hostFunc, deviceFunc, deviceFunc,
+         ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), cirNullPtr,
+         cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
   }
 }
 
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index 0e95c74324592..b635f5931df77 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -19,6 +19,22 @@
 // RUN:   -target-sdk-version=12.3 -o %t.nogpu.cir
 // RUN: FileCheck --input-file=%t.nogpu.cir %s --check-prefix=NOGPUBIN
 
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x hip \
+// RUN:   -fhip-new-launch-api -fcuda-include-gpubinary %t -o %t.hip.cir
+// RUN: FileCheck --input-file=%t.hip.cir %s --check-prefix=HIP-CIR
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fclangir -emit-llvm %s -x hip \
+// RUN:   -fhip-new-launch-api -fcuda-include-gpubinary %t -o %t.hip-cir.ll
+// RUN: FileCheck --input-file=%t.hip-cir.ll %s --check-prefix=HIP-LLVM
+//
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -x hip \
+// RUN:   -fhip-new-launch-api -fcuda-include-gpubinary %t -o %t.hip.ll
+// RUN: FileCheck --input-file=%t.hip.ll %s --check-prefix=HIP-OGCG
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-cir %s -x hip \
+// RUN:   -fhip-new-launch-api -o %t.nogpu.hip.cir
+// RUN: FileCheck --input-file=%t.nogpu.hip.cir %s --check-prefix=HIP-NOGPUBIN
+
 #include "Inputs/cuda.h"
 
 __global__ void kernelfunc(int i, int j, int k) {}
@@ -134,3 +150,144 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // NOGPUBIN-NOT: __cuda_register_globals
 // NOGPUBIN-NOT: __cuda_module_ctor
 // NOGPUBIN-NOT: __cuda_module_dtor
+
+// =============================================================================
+// HIP host-side registration (`buildCUDAModuleCtor` / `buildHIPModuleDtor` /
+// `buildCUDARegisterGlobalFunctions` HIP arms in CIR LoweringPrepare).
+// =============================================================================
+
+// HIP module ctor is registered with the default global-ctor priority.
+// HIP-CIR: cir.global_ctors = [#cir.global_ctor<"__hip_module_ctor", 65535>]
+
+// Runtime function decls.
+// HIP-CIR: cir.func private @atexit(!cir.ptr>) -> !s32i
+// HIP-CIR: cir.func private @__hipUnregisterFatBinary(!cir.ptr>)
+
+// Module dtor: only unregister when the handle is non-null, then null it out.
+// Reuses the SSA value loaded in the entry block for the unregister call.
+// HIP-CIR: cir.func internal private @__hip_module_dtor()
+// HIP-CIR:   %[[DH0:.*]] = cir.get_global @__hip_gpubin_handle
+// HIP-CIR:   %[[H0:.*]] = cir.load %[[DH0]]
+// HIP-CIR:   %[[NULL0:.*]] = cir.const #cir.ptr
+// HIP-CIR:   %[[NE:.*]] = cir.cmp ne %[[H0]], %[[NULL0]]
+// HIP-CIR:   cir.brcond %[[NE]] ^bb1, ^bb2
+// HIP-CIR: ^bb1:
+// HIP-CIR:   cir.call @__hipUnregisterFatBinary(%[[H0]])
+// HIP-CIR:   %[[DH1:.*]] = cir.get_global @__hip_gpubin_handle
+// HIP-CIR:   cir.store %[[NULL0]], %[[DH1]]
+// HIP-CIR:   cir.br ^bb2
+// HIP-CIR: ^bb2:
+// HIP-CIR:   cir.return
+
+// __hipRegisterFunction runtime declaration.
+// HIP-CIR: cir.func private @__hipRegisterFunction(!cir.ptr>, !cir.ptr, !cir.ptr, !cir.ptr, !s32i, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr) -> !s32i
+
+// __hip_register_globals: under -fhip-new-launch-api the host-side argument is
+// the kernel-handle GlobalOp shadow (e.g. @_Z10kernelfunciii) — not the
+// device-stub function pointer that the CUDA arm uses.
+// HIP-CIR: cir.global "private" constant cir_private @".str_Z10kernelfunciii" = #cir.const_array<"_Z10kernelfunciii" : !cir.array, trailing_zeros> : !cir.array
+// HIP-CIR: cir.func internal private @__hip_register_globals(%[[FATBIN:.*]]: !cir.ptr>
+// HIP-CIR:   %[[NULL1:.*]] = cir.const #cir.ptr : !cir.ptr
+// HIP-CIR:   %[[STR_ADDR:.*]] = cir.get_global @".str_Z10kernelfunciii"
+// HIP-CIR:   %[[DEVICE_FUNC:.*]] = cir.cast bitcast %[[STR_ADDR]]
+// HIP-CIR:   %[[KH:.*]] = cir.get_global @_Z10kernelfunciii : !cir.ptr>>
+// HIP-CIR:   %[[HOST_FUNC:.*]] = cir.cast bitcast %[[KH]]
+// HIP-CIR:   %[[MINUS_ONE:.*]] = cir.const #cir.int<-1> : !s32i
+// HIP-CIR:   cir.call @__hipRegisterFunction(%[[FATBIN]], %[[HOST_FUNC]], %[[DEVICE_FUNC]], %[[DEVICE_FUNC]], %[[MINUS_ONE]], %[[NULL1]], %[[NULL1]], %[[NULL1]], %[[NULL1]], %[[NULL1]])
+// HIP-CIR:   cir.return
+
+// Fatbin string + wrapper live in the HIP-specific sections; magic
+// 0x48495046 = 1212764230.
+// HIP-CIR: cir.global "private" constant cir_private @__hip_fatbin_str = #cir.const_array<"GPU binary would be here." : !cir.array> : !cir.array {alignment = 8 : i64, section = ".hip_fatbin"}
+// HIP-CIR: cir.global constant cir_private @__hip_fatbin_wrapper = #cir.const_record<{
+// HIP-CIR-SAME: #cir.int<1212764230> : !s32i,
+// HIP-CIR-SAME: #cir.int<1> : !s32i,
+// HIP-CIR-SAME: #cir.global_view<@__hip_fatbin_str> : !cir.ptr,
+// HIP-CIR-SAME: #cir.ptr : !cir.ptr
+// HIP-CIR-SAME: }> : !rec_anon_struct {section = ".hipFatBinSegment"}
+
+// HIP-CIR: cir.global "private" internal @__hip_gpubin_handle = #cir.ptr : !cir.ptr>
+// HIP-CIR: cir.func private @__hipRegisterFatBinary(!cir.ptr) -> !cir.ptr>
+
+// Module ctor: guard registration on a null handle, register globals from the
+// (possibly newly-stored) handle, then atexit(__hip_module_dtor).
+// HIP-CIR: cir.func internal private @__hip_module_ctor()
+// HIP-CIR:   %[[GHA:.*]] = cir.get_global @__hip_gpubin_handle
+// HIP-CIR:   %[[H:.*]] = cir.load %[[GHA]]
+// HIP-CIR:   %[[NULLPTR:.*]] = cir.const #cir.ptr
+// HIP-CIR:   %[[EQ:.*]] = cir.cmp eq %[[H]], %[[NULLPTR]]
+// HIP-CIR:   cir.brcond %[[EQ]] ^bb1, ^bb2
+// HIP-CIR: ^bb1:
+// HIP-CIR:   %[[WRAPPER:.*]] = cir.get_global @__hip_fatbin_wrapper
+// HIP-CIR:   %[[VOID_PTR:.*]] = cir.cast bitcast %[[WRAPPER]]
+// HIP-CIR:   %[[REG:.*]] = cir.call @__hipRegisterFatBinary(%[[VOID_PTR]])
+// HIP-CIR:   %[[GHA2:.*]] = cir.get_global @__hip_gpubin_handle
+// HIP-CIR:   cir.store %[[REG]], %[[GHA2]]
+// HIP-CIR:   cir.br ^bb2
+// HIP-CIR: ^bb2:
+// HIP-CIR:   %[[GHA3:.*]] = cir.get_global @__hip_gpubin_handle
+// HIP-CIR:   %[[H2:.*]] = cir.load %[[GHA3]]
+// HIP-CIR:   cir.call @__hip_register_globals(%[[H2]])
+// HIP-CIR:   %[[DTOR_PTR:.*]] = cir.get_global @__hip_module_dtor
+// HIP-CIR:   {{.*}} = cir.call @atexit(%[[DTOR_PTR]])
+// HIP-CIR:   cir.return
+
+// HIP-CIR: cir.global constant external @_Z10kernelfunciii = #cir.global_view<@_Z25__device_stub__kernelfunciii> : !cir.ptr> {alignment = 8 : i64}
+
+// HIP OGCG cross-check (LLVM IR matches what OG codegen emits for HIP).
+// HIP-OGCG: @{{.*}} = private constant [25 x i8] c"GPU binary would be here.", section ".hip_fatbin"
+// HIP-OGCG: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @{{.*}}, ptr null }, section ".hipFatBinSegment"
+// HIP-OGCG: @__hip_gpubin_handle = internal global ptr null
+// HIP-OGCG: @llvm.global_ctors = appending global {{.*}}@__hip_module_ctor
+
+// HIP-OGCG: define internal void @__hip_module_ctor()
+// HIP-OGCG:   load ptr, ptr @__hip_gpubin_handle
+// HIP-OGCG:   icmp eq ptr {{.*}}, null
+// HIP-OGCG:   call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper)
+// HIP-OGCG:   store ptr {{.*}}, ptr @__hip_gpubin_handle
+// HIP-OGCG:   call void @__hip_register_globals(
+// HIP-OGCG:   call i32 @atexit(ptr @__hip_module_dtor)
+// HIP-OGCG:   ret void
+
+// HIP-OGCG: define internal void @__hip_module_dtor()
+// HIP-OGCG:   load ptr, ptr @__hip_gpubin_handle
+// HIP-OGCG:   icmp ne ptr {{.*}}, null
+// HIP-OGCG:   call void @__hipUnregisterFatBinary
+// HIP-OGCG:   store ptr null, ptr @__hip_gpubin_handle
+
+// HIP LLVM lowering cross-check.
+// HIP-LLVM: @{{.*}} = private constant [25 x i8] c"GPU binary would be here.", section ".hip_fatbin", align 8
+// HIP-LLVM: @__hip_fatbin_wrapper = {{.*}}constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @{{.*}}, ptr null }, section ".hipFatBinSegment"
+// HIP-LLVM: @__hip_gpubin_handle = internal global ptr null
+// HIP-LLVM: @_Z10kernelfunciii = constant ptr @_Z25__device_stub__kernelfunciii, align 8
+// HIP-LLVM: @llvm.global_ctors = appending global {{.*}}@__hip_module_ctor
+
+// HIP-LLVM: define internal void @__hip_module_dtor()
+// HIP-LLVM: load ptr, ptr @__hip_gpubin_handle
+// HIP-LLVM: icmp ne ptr {{.*}}, null
+// HIP-LLVM: br i1 {{.*}}, label %{{.*}}, label %{{.*}}
+// HIP-LLVM: call void @__hipUnregisterFatBinary(ptr {{.*}})
+// HIP-LLVM: store ptr null, ptr @__hip_gpubin_handle
+// HIP-LLVM: ret void
+
+// HIP-LLVM: define internal void @__hip_register_globals(ptr %[[FATBIN:.*]])
+// HIP-LLVM: call{{.*}}@__hipRegisterFunction(ptr %[[FATBIN]], ptr @_Z10kernelfunciii, ptr @{{.*}}, ptr @{{.*}}, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
+// HIP-LLVM: ret void
+
+// HIP-LLVM: define internal void @__hip_module_ctor()
+// HIP-LLVM: load ptr, ptr @__hip_gpubin_handle
+// HIP-LLVM: icmp eq ptr {{.*}}, null
+// HIP-LLVM: br i1 {{.*}}, label %{{.*}}, label %{{.*}}
+// HIP-LLVM: call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper)
+// HIP-LLVM: store ptr {{.*}}, ptr @__hip_gpubin_handle
+// HIP-LLVM: load ptr, ptr @__hip_gpubin_handle
+// HIP-LLVM: call void @__hip_register_globals(ptr {{.*}})
+// HIP-LLVM: call i32 @atexit(ptr @__hip_module_dtor)
+// HIP-LLVM: ret void
+
+// No GPU binary: no fatbin, no handle, no registration scaffolding.
+// HIP-NOGPUBIN-NOT: __hip_fatbin
+// HIP-NOGPUBIN-NOT: __hip_gpubin_handle
+// HIP-NOGPUBIN-NOT: __hip_register_globals
+// HIP-NOGPUBIN-NOT: __hip_module_ctor
+// HIP-NOGPUBIN-NOT: __hip_module_dtor
diff --git a/clang/test/CIR/CodeGenCUDA/kernel-call.cu b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
index 34719a2d3acb1..924f26c6dafbc 100644
--- a/clang/test/CIR/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
@@ -66,11 +66,11 @@
 // CUDA-PTH: cir.call @cudaLaunchKernel_ptsz
 
 //
-// HIP-NEW: cir.global constant external @_Z6kernelif = #cir.global_view<@_Z21__device_stub__kernelif> : !cir.func<(!s32i, !cir.float)>
+// HIP-NEW: cir.global constant external @_Z6kernelif = #cir.global_view<@_Z21__device_stub__kernelif> : !cir.ptr> {alignment = 8 : i64}
 // HIP-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelif
 // HIP-NEW: cir.alloca !cir.ptr, {{.*}} ["stream"]
 // HIP-NEW: cir.call @__hipPopCallConfiguration({{.*}}) : (!cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>) -> !s32i
-// HIP-NEW: cir.get_global @_Z6kernelif : !cir.ptr>
+// HIP-NEW: cir.get_global @_Z6kernelif : !cir.ptr>>
 // HIP-NEW: cir.call @hipLaunchKernel({{.*}}) : (!cir.ptr {{.*}}, !rec_dim3, !rec_dim3, !cir.ptr>{{.*}}, !u64i{{.*}}, !cir.ptr{{.*}}) -> (!u32i {llvm.noundef})
 // HIP-PTH: cir.call @hipLaunchKernel_spt
 
diff --git a/clang/test/CIR/CodeGenHIP/simple.cpp b/clang/test/CIR/CodeGenHIP/simple.cpp
index f4d2645c7803a..e684fc2104f4f 100644
--- a/clang/test/CIR/CodeGenHIP/simple.cpp
+++ b/clang/test/CIR/CodeGenHIP/simple.cpp
@@ -46,7 +46,7 @@ __global__ void global_fn(int a) {}
 // CIR-HOST: %[[#CIRKernelArgs:]] = cir.alloca {{.*}}"kernel_args"
 // CIR-HOST: %[[#Decayed:]] = cir.cast array_to_ptrdecay %[[#CIRKernelArgs]]
 // CIR-HOST: cir.call @__hipPopCallConfiguration
-// CIR-HOST: cir.get_global @_Z9global_fni : !cir.ptr>
+// CIR-HOST: cir.get_global @_Z9global_fni : !cir.ptr>>
 // CIR-HOST: cir.call @hipLaunchKernel
 
 // OGCG-HOST: define dso_local void @_Z24__device_stub__global_fni

From eb5a94218d591c194bdfa4b5d3c02d5f49038b13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 12 May 2026 09:42:49 -0700
Subject: [PATCH 478/538] DAGCombiner: (srl/sra (add nuw/nsw X, c), d) --> (add
 nuw/nsw (srl/sra X, d), c >> d) (#196379)

Additional precondition:
* The LSBs of c are 0; equivalently: c >> d is exact

Alive2 for
* unsigned case: https://alive2.llvm.org/ce/z/YcJ8qA
* signed case: https://alive2.llvm.org/ce/z/fgpvyE

We already canonicalize (shl (add ...) ...) to (add (shl ...) ...).

Restrict this combine to the single-use case to minimize risk for now.
The main target of this combine is a fan-out tree of `add`s that all end
up being shifted by the same amount at the leaves. This change happens
to
improve a bunch of existing CodeGen tests in AMDGPU.

v2:
- remove a redundant check on the shift amount -- large shift amounts
result in poison anyway
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   36 +
 llvm/test/CodeGen/AMDGPU/ctpop16.ll           |   13 +-
 llvm/test/CodeGen/AMDGPU/fp_to_sint.ll        |  274 +-
 llvm/test/CodeGen/AMDGPU/fp_to_uint.ll        |  274 +-
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       |  607 ++-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  646 ++-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |  646 ++-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  102 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  213 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  213 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  102 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |   89 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  | 1977 ++++----
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 1490 +++---
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  844 ++--
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |   94 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  | 1617 +++----
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   | 3185 ++++++-------
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |  896 ++--
 llvm/test/CodeGen/AMDGPU/load-global-i8.ll    | 3596 +++++++-------
 llvm/test/CodeGen/AMDGPU/max.ll               |    9 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |   36 +-
 .../CodeGen/AMDGPU/r600.llvm.is.fpclass.ll    |  335 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   22 +-
 llvm/test/CodeGen/AMDGPU/shl.ll               |   96 +-
 llvm/test/CodeGen/AMDGPU/sra.ll               |   23 +-
 llvm/test/CodeGen/AMDGPU/srem.ll              | 4147 ++++++++---------
 llvm/test/CodeGen/AMDGPU/srl.ll               |   19 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll     |  202 +-
 29 files changed, 10091 insertions(+), 11712 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 707043736ed3c..48400dadbe352 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11428,6 +11428,24 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
     }
   }
 
+  // fold (sra (add nsw X, C), D) -> (add nsw (sra X, D), C s>> D)
+  // when C has D trailing zeros (so C s>> D is exact).
+  if (N1C && N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
+      N0->getFlags().hasNoSignedWrap()) {
+    if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
+      const APInt &ShAmt = N1C->getAPIntValue();
+      const APInt &AddVal = AddC->getAPIntValue();
+      if (ShAmt.ult(AddVal.countr_zero())) {
+        SDNodeFlags ShiftFlags = N->getFlags();
+        SDValue NewSra =
+            DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), N1, ShiftFlags);
+        SDValue NewC = DAG.getConstant(AddVal.ashr(ShAmt), DL, VT);
+        SDNodeFlags AddFlags = N0->getFlags();
+        return DAG.getNode(ISD::ADD, DL, VT, NewSra, NewC, AddFlags);
+      }
+    }
+  }
+
   // Simplify, based on bits shifted out of the LHS.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
@@ -11696,6 +11714,24 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     }
   }
 
+  // fold (srl (add nuw X, C), D) -> (add nuw (srl X, D), C u>> D)
+  // when C has D trailing zeros (so C >> D is exact).
+  if (N1C && N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
+      N0->getFlags().hasNoUnsignedWrap()) {
+    if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
+      const APInt &ShAmt = N1C->getAPIntValue();
+      const APInt &AddVal = AddC->getAPIntValue();
+      if (ShAmt.ult(AddVal.countr_zero())) {
+        SDNodeFlags ShiftFlags = N->getFlags();
+        SDValue NewSrl =
+            DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1, ShiftFlags);
+        SDValue NewC = DAG.getConstant(AddVal.lshr(ShAmt), DL, VT);
+        SDNodeFlags AddFlags = N0->getFlags();
+        return DAG.getNode(ISD::ADD, DL, VT, NewSrl, NewC, AddFlags);
+      }
+    }
+  }
+
   // fold operands of srl based on knowledge that the low bits are not
   // demanded.
   if (SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index fca57be5764f8..dddf0c3b0108c 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -840,7 +840,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
 ; EG-NEXT:    ALU 3, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
 ; EG-NEXT:    ALU 114, @16, KC0[], KC1[]
-; EG-NEXT:    ALU 34, @131, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 33, @131, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
 ; EG-NEXT:    CF_END
@@ -993,13 +993,12 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
 ; EG-NEXT:     AND_INT T1.W, T21.W, literal.x,
 ; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     AND_INT T0.Z, PV.X, literal.x,
 ; EG-NEXT:     BCNT_INT T1.W, PV.W,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    -65536(nan), 16(2.242078e-44)
-; EG-NEXT:     LSHR T22.X, PS, literal.x,
-; EG-NEXT:     OR_INT * T20.W, PV.Z, PV.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
+; EG-NEXT:    -65536(nan), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T22.X, T21.X, literal.x,
+; EG-NEXT:     OR_INT * T20.W, PS, PV.W,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T17.X, PV.W,
 ; EG-NEXT:     MOV * T0.X, T4.X,
 ; EG-NEXT:     MOV * T0.Z, T8.X,
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 0af603b4ccf5f..f1f9a347486a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -897,169 +897,167 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
 ;
 ; EG-LABEL: fp_to_sint_v4i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 99, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 54, @106, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
+; EG-NEXT:    ALU 102, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 49, @109, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    ALU clause starting at 6:
 ; EG-NEXT:     MOV * T0.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
-; EG-NEXT:     AND_INT * T2.W, KC0[3].Z, literal.y,
+; EG-NEXT:     BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
+; EG-NEXT:     AND_INT * T2.W, KC0[4].X, literal.y,
 ; EG-NEXT:    23(3.222986e-44), 8388607(1.175494e-38)
 ; EG-NEXT:     OR_INT T2.W, PS, literal.x,
 ; EG-NEXT:     ADD_INT * T3.W, PV.W, literal.y,
 ; EG-NEXT:    8388608(1.175494e-38), -150(nan)
 ; EG-NEXT:     ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT:     BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
-; EG-NEXT:     AND_INT T0.Z, PS, literal.z,
-; EG-NEXT:     NOT_INT T4.W, PS,
-; EG-NEXT:     LSHR * T5.W, PV.W, 1,
-; EG-NEXT:    -127(nan), 23(3.222986e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T0.Y, PS, literal.y,
+; EG-NEXT:     SUB_INT T0.Z, literal.z, T1.W,
+; EG-NEXT:     NOT_INT T1.W, PS,
+; EG-NEXT:     LSHR * T4.W, PV.W, 1,
+; EG-NEXT:    -127(nan), 31(4.344025e-44)
+; EG-NEXT:    150(2.101948e-43), 0(0.000000e+00)
 ; EG-NEXT:     BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
-; EG-NEXT:     AND_INT T1.Y, T3.W, literal.x,
-; EG-NEXT:     LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
-; EG-NEXT:     AND_INT T3.W, KC0[4].X, literal.y,
-; EG-NEXT:     ADD_INT * T4.W, PV.Y, literal.z,
-; EG-NEXT:    32(4.484155e-44), 8388607(1.175494e-38)
-; EG-NEXT:    -150(nan), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T2.Y, PS, literal.x,
-; EG-NEXT:     OR_INT T1.Z, PV.W, literal.y,
-; EG-NEXT:     CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
-; EG-NEXT:     SETGT_INT * T5.W, T0.X, literal.z,
-; EG-NEXT:    31(4.344025e-44), 8388608(1.175494e-38)
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T3.Y, PS, 0.0, PV.W,
-; EG-NEXT:     SUB_INT T2.Z, literal.x, T1.W,
-; EG-NEXT:     LSHL T1.W, PV.Z, PV.Y,
-; EG-NEXT:     AND_INT * T3.W, T4.W, literal.y,
-; EG-NEXT:    150(2.101948e-43), 32(4.484155e-44)
-; EG-NEXT:     CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT:     AND_INT T2.Y, PV.Z, literal.x,
-; EG-NEXT:     SUB_INT T3.Z, literal.y, T0.Y,
-; EG-NEXT:     NOT_INT T4.W, T4.W,
-; EG-NEXT:     LSHR * T6.W, T1.Z, 1,
-; EG-NEXT:    32(4.484155e-44), 150(2.101948e-43)
-; EG-NEXT:     BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
-; EG-NEXT:     ADD_INT T0.Y, T0.Y, literal.x,
-; EG-NEXT:     BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
-; EG-NEXT:     BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
-; EG-NEXT:     AND_INT * T4.W, PV.Z, literal.y,
-; EG-NEXT:    -127(nan), 32(4.484155e-44)
-; EG-NEXT:     CNDE_INT T3.X, PS, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
-; EG-NEXT:     SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT:     CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
-; EG-NEXT:     CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
+; EG-NEXT:     AND_INT T1.Y, PV.Z, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, T2.W, PV.Z,
+; EG-NEXT:     LSHL T1.W, T2.W, PV.Y,
+; EG-NEXT:     AND_INT * T2.W, T3.W, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT:     CNDE_INT T1.W, PS, PV.X, PV.W,
+; EG-NEXT:     SETGT_INT * T2.W, T0.X, literal.x,
 ; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T2.X, T5.W, PS, PV.W,
-; EG-NEXT:     ASHR T1.Y, KC0[3].Z, literal.x,
-; EG-NEXT:     CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT:     CNDE_INT T1.W, PV.Z, PV.X, T1.X,
+; EG-NEXT:     CNDE_INT T1.Z, PS, 0.0, PV.W,
+; EG-NEXT:     CNDE_INT T1.W, PS, PV.Z, PV.Y,
 ; EG-NEXT:     ASHR * T2.W, KC0[4].X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     XOR_INT T2.Y, PV.W, PS,
-; EG-NEXT:     XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT:     XOR_INT T1.W, PV.X, PV.Y,
-; EG-NEXT:     XOR_INT * T3.W, T3.Y, PV.Y,
-; EG-NEXT:     SUB_INT T3.Y, PS, T1.Y,
-; EG-NEXT:     SUBB_UINT T1.Z, PV.W, T1.Y,
-; EG-NEXT:     SUB_INT T3.W, PV.Z, T2.W,
-; EG-NEXT:     SUBB_UINT * T4.W, PV.Y, T2.W,
-; EG-NEXT:     SUB_INT T4.Y, PV.W, PS,
-; EG-NEXT:     SUB_INT T0.Z, PV.Y, PV.Z,
-; EG-NEXT:     BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT:     AND_INT * T4.W, KC0[3].Y, literal.y,
-; EG-NEXT:    23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT:     SETGT_INT T0.X, 0.0, T0.X,
-; EG-NEXT:     ADD_INT T3.Y, PV.W, literal.x,
-; EG-NEXT:     OR_INT T1.Z, PS, literal.y,
-; EG-NEXT:     BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
-; EG-NEXT:     ADD_INT * T4.W, PV.W, literal.w,
-; EG-NEXT:    -127(nan), 8388608(1.175494e-38)
-; EG-NEXT:    23(3.222986e-44), -150(nan)
-; EG-NEXT:     AND_INT T1.X, KC0[3].W, literal.x,
-; EG-NEXT:     ADD_INT T5.Y, PV.W, literal.y,
-; EG-NEXT:     SUB_INT T2.Z, literal.z, T3.W,
-; EG-NEXT:     NOT_INT T3.W, PS,
-; EG-NEXT:     LSHR * T5.W, PV.Z, 1,
+; EG-NEXT:     XOR_INT T0.Z, PV.W, PS,
+; EG-NEXT:     BFE_UINT T1.W, KC0[3].Z, literal.x, T0.W,
+; EG-NEXT:     XOR_INT * T3.W, PV.Z, PS,
+; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T0.Y, KC0[3].Z, literal.x,
+; EG-NEXT:     ADD_INT T1.Z, PV.W, literal.y,
+; EG-NEXT:     SUB_INT T3.W, PS, T2.W,
+; EG-NEXT:     SUBB_UINT * T4.W, PV.Z, T2.W,
 ; EG-NEXT:    8388607(1.175494e-38), -150(nan)
-; EG-NEXT:    150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
-; EG-NEXT:     AND_INT T6.Y, PV.Z, literal.x,
-; EG-NEXT:     AND_INT T3.Z, PV.Y, literal.y,
-; EG-NEXT:     OR_INT T3.W, PV.X, literal.z,
-; EG-NEXT:     AND_INT * T5.W, T4.W, literal.y,
-; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
+; EG-NEXT:     SUB_INT T1.Y, PV.W, PS,
+; EG-NEXT:     AND_INT T2.Z, PV.Z, literal.x,
+; EG-NEXT:     BFE_UINT T3.W, KC0[3].Y, literal.y, T0.W,
+; EG-NEXT:     OR_INT * T4.W, PV.Y, literal.z,
+; EG-NEXT:    31(4.344025e-44), 23(3.222986e-44)
 ; EG-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
-; EG-NEXT:     LSHL T7.Y, T1.Z, PS,
-; EG-NEXT:     AND_INT T1.Z, T4.W, literal.x,
-; EG-NEXT:     LSHL T4.W, PV.W, PV.Z,
-; EG-NEXT:     AND_INT * T5.W, T5.Y, literal.x,
+; EG-NEXT:     SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT:     AND_INT T0.Y, KC0[3].Y, literal.x,
+; EG-NEXT:     ADD_INT T3.Z, PV.W, literal.y,
+; EG-NEXT:     LSHL T5.W, PS, PV.Z,
+; EG-NEXT:     AND_INT * T6.W, T1.Z, literal.z,
+; EG-NEXT:    8388607(1.175494e-38), -150(nan)
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T3.X, PS, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT:     CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
-; EG-NEXT:    ALU clause starting at 106:
-; EG-NEXT:     CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGT_INT * T7.W, T3.Y, literal.x,
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T1.X, PS, 0.0, PV.W,
-; EG-NEXT:     CNDE_INT T6.Y, PS, T2.Z, T8.Y,
-; EG-NEXT:     SUB_INT T1.Z, literal.x, T0.W,
-; EG-NEXT:     NOT_INT T6.W, T5.Y,
-; EG-NEXT:     LSHR * T7.W, T3.W, 1,
+; EG-NEXT:     BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
+; EG-NEXT:     CNDE_INT T2.Y, PS, PV.W, 0.0,
+; EG-NEXT:     AND_INT T2.Z, PV.Z, literal.y,
+; EG-NEXT:     OR_INT T0.W, PV.Y, literal.z,
+; EG-NEXT:     SUB_INT * T7.W, literal.w, T1.W,
+; EG-NEXT:    23(3.222986e-44), 31(4.344025e-44)
+; EG-NEXT:    8388608(1.175494e-38), 150(2.101948e-43)
+; EG-NEXT:     NOT_INT T2.X, T1.Z,
+; EG-NEXT:     LSHR T0.Y, T4.W, 1,
+; EG-NEXT:     AND_INT T1.Z, PS, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T4.W, 0.0, T4.W, PS,
+; EG-NEXT:     ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT:    32(4.484155e-44), -127(nan)
+; EG-NEXT:     SETGT_INT T3.X, PS, literal.x,
+; EG-NEXT:     CNDE_INT T3.Y, PV.Z, PV.W, 0.0,
+; EG-NEXT:     BIT_ALIGN_INT T1.Z, 0.0, PV.Y, PV.X,
+; EG-NEXT:     LSHL T4.W, T0.W, T2.Z,
+; EG-NEXT:     AND_INT * T7.W, T3.Z, literal.y,
+; EG-NEXT:    23(3.222986e-44), 32(4.484155e-44)
+; EG-NEXT:     AND_INT T2.X, KC0[3].W, literal.x,
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T1.Z, T6.W, PV.Z, T5.W,
+; EG-NEXT:     CNDE_INT T5.W, PV.X, PV.Y, T2.Y,
+; EG-NEXT:     ASHR * T6.W, KC0[3].Z, literal.y,
+; EG-NEXT:    8388607(1.175494e-38), 31(4.344025e-44)
+; EG-NEXT:     XOR_INT T4.X, PV.W, PS,
+; EG-NEXT:     SUB_INT T2.Y, literal.x, T3.W,
+; EG-NEXT:     NOT_INT T2.Z, T3.Z,
+; EG-NEXT:     LSHR T5.W, T0.W, 1,
+; EG-NEXT:     CNDE_INT * T8.W, T3.X, 0.0, PV.Z, BS:VEC_021/SCL_122
 ; EG-NEXT:    150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT:     ASHR T2.X, KC0[3].Y, literal.x,
-; EG-NEXT:     ADD_INT T5.Y, T0.W, literal.y,
-; EG-NEXT:     BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
-; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
-; EG-NEXT:     AND_INT * T3.W, PV.Z, literal.z,
-; EG-NEXT:    31(4.344025e-44), -127(nan)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, PS, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
+; EG-NEXT:     XOR_INT T3.X, PS, T6.W,
+; EG-NEXT:     ADD_INT T3.Y, T3.W, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T1.Z, 0.0, PV.W, PV.Z,
+; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, T0.W, PV.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     AND_INT * T3.W, PV.Y, literal.y,
+; EG-NEXT:    -127(nan), 32(4.484155e-44)
+; EG-NEXT:     CNDE_INT T5.X, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T2.Y, T7.W, PV.Z, T4.W,
 ; EG-NEXT:     SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT:     XOR_INT T0.W, T6.Y, PV.X,
-; EG-NEXT:     XOR_INT * T3.W, T1.X, PV.X,
+; EG-NEXT:     SUB_INT T0.W, PV.X, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T4.X, T6.W,
 ; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T1.X, PS, T2.X,
-; EG-NEXT:     SUBB_UINT T6.Y, PV.W, T2.X,
-; EG-NEXT:     CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT:     CNDE_INT T3.W, PV.Z, PV.X, T3.X,
-; EG-NEXT:     ASHR * T4.W, KC0[3].W, literal.x,
+; EG-NEXT:     SUB_INT T3.X, PV.W, PS,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, 0.0, PV.Y,
+; EG-NEXT:     CNDE_INT T1.Z, PV.Z, PV.X, T0.Y,
+; EG-NEXT:     OR_INT T0.W, T2.X, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, T1.X, literal.y,
+; EG-NEXT:    8388608(1.175494e-38), -150(nan)
+; EG-NEXT:     ADD_INT * T2.X, T1.X, literal.x,
+; EG-NEXT:    -127(nan), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 109:
+; EG-NEXT:     AND_INT T0.Y, T3.W, literal.x,
+; EG-NEXT:     SUB_INT T2.Z, literal.y, T1.X,
+; EG-NEXT:     NOT_INT T4.W, T3.W,
+; EG-NEXT:     LSHR * T5.W, T0.W, 1,
+; EG-NEXT:    31(4.344025e-44), 150(2.101948e-43)
+; EG-NEXT:     BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT:     AND_INT T4.Y, PV.Z, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, 0.0, T0.W, PV.Z,
+; EG-NEXT:     LSHL T0.W, T0.W, PV.Y,
+; EG-NEXT:     AND_INT * T3.W, T3.W, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T5.X, KC0[3].Y, literal.x,
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT:     CNDE_INT T0.W, PS, PV.X, PV.W,
+; EG-NEXT:     SETGT_INT * T3.W, T2.X, literal.y,
+; EG-NEXT:    31(4.344025e-44), 23(3.222986e-44)
+; EG-NEXT:     CNDE_INT T1.X, PS, 0.0, PV.W,
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.Y,
+; EG-NEXT:     ASHR T2.Z, KC0[3].W, literal.x,
+; EG-NEXT:     XOR_INT T0.W, T1.Z, PV.X,
+; EG-NEXT:     XOR_INT * T3.W, T2.Y, PV.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     XOR_INT T3.X, PV.W, PS,
-; EG-NEXT:     XOR_INT T7.Y, PV.Z, PS,
-; EG-NEXT:     SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT:     SETGT_INT T6.X, 0.0, T1.W,
+; EG-NEXT:     SUB_INT T2.Y, PS, T5.X,
+; EG-NEXT:     SUBB_UINT T1.Z, PV.W, T5.X,
+; EG-NEXT:     XOR_INT T1.W, PV.Y, PV.Z,
+; EG-NEXT:     XOR_INT * T3.W, PV.X, PV.Z,
+; EG-NEXT:     SUB_INT T1.X, PS, T2.Z,
+; EG-NEXT:     SUBB_UINT T0.Y, PV.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.Z, PV.Y, PV.Z,
 ; EG-NEXT:     SETGT_INT T3.W, 0.0, T3.Y,
-; EG-NEXT:     CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
-; EG-NEXT:     SETGT_INT T1.X, 0.0, T0.Y,
+; EG-NEXT:     CNDE_INT * T4.W, PV.X, T3.X, 0.0,
+; EG-NEXT:     SUB_INT T3.X, T4.X, T6.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PV.Z, 0.0,
+; EG-NEXT:     SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT:     SETGT_INT T5.W, 0.0, T2.X,
+; EG-NEXT:     CNDE_INT * T6.W, T0.X, T1.Y, 0.0,
 ; EG-NEXT:     CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT:     SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T1.W, PV.Y, T4.W,
-; EG-NEXT:     SUBB_UINT * T5.W, PV.X, T4.W,
-; EG-NEXT:     SUB_INT T4.X, PV.W, PS,
-; EG-NEXT:     SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
-; EG-NEXT:     SUB_INT T0.W, T0.W, T2.X,
-; EG-NEXT:     CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
-; EG-NEXT:     CNDE_INT T6.X, T3.W, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT:     SUB_INT T0.W, T2.Y, T2.W,
+; EG-NEXT:     CNDE_INT T4.Z, T6.X, PV.X, 0.0,
+; EG-NEXT:     SUB_INT T2.W, T0.Z, T2.W,
+; EG-NEXT:     SUB_INT * T0.W, T0.W, T5.X,
+; EG-NEXT:     CNDE_INT T4.X, T3.W, PS, 0.0,
+; EG-NEXT:     CNDE_INT T6.Z, T0.X, PV.W, 0.0,
+; EG-NEXT:     SUB_INT * T0.W, T1.W, T2.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     CNDE_INT T6.X, T5.W, PV.W, 0.0,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T1.Z, T1.X, PV.W, 0.0,
-; EG-NEXT:     SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT T1.X, T0.Y, PV.W, 0.0,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T1.X, PS, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %conv = fptosi <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 165ba24babf6b..0ab44fe2dd902 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -677,169 +677,167 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
 ;
 ; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 99, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 54, @106, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
+; EG-NEXT:    ALU 102, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 49, @109, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    ALU clause starting at 6:
 ; EG-NEXT:     MOV * T0.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
-; EG-NEXT:     AND_INT * T2.W, KC0[3].Z, literal.y,
+; EG-NEXT:     BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
+; EG-NEXT:     AND_INT * T2.W, KC0[4].X, literal.y,
 ; EG-NEXT:    23(3.222986e-44), 8388607(1.175494e-38)
 ; EG-NEXT:     OR_INT T2.W, PS, literal.x,
 ; EG-NEXT:     ADD_INT * T3.W, PV.W, literal.y,
 ; EG-NEXT:    8388608(1.175494e-38), -150(nan)
 ; EG-NEXT:     ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT:     BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
-; EG-NEXT:     AND_INT T0.Z, PS, literal.z,
-; EG-NEXT:     NOT_INT T4.W, PS,
-; EG-NEXT:     LSHR * T5.W, PV.W, 1,
-; EG-NEXT:    -127(nan), 23(3.222986e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T0.Y, PS, literal.y,
+; EG-NEXT:     SUB_INT T0.Z, literal.z, T1.W,
+; EG-NEXT:     NOT_INT T1.W, PS,
+; EG-NEXT:     LSHR * T4.W, PV.W, 1,
+; EG-NEXT:    -127(nan), 31(4.344025e-44)
+; EG-NEXT:    150(2.101948e-43), 0(0.000000e+00)
 ; EG-NEXT:     BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
-; EG-NEXT:     AND_INT T1.Y, T3.W, literal.x,
-; EG-NEXT:     LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
-; EG-NEXT:     AND_INT T3.W, KC0[4].X, literal.y,
-; EG-NEXT:     ADD_INT * T4.W, PV.Y, literal.z,
-; EG-NEXT:    32(4.484155e-44), 8388607(1.175494e-38)
-; EG-NEXT:    -150(nan), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T2.Y, PS, literal.x,
-; EG-NEXT:     OR_INT T1.Z, PV.W, literal.y,
-; EG-NEXT:     CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
-; EG-NEXT:     SETGT_INT * T5.W, T0.X, literal.z,
-; EG-NEXT:    31(4.344025e-44), 8388608(1.175494e-38)
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T3.Y, PS, 0.0, PV.W,
-; EG-NEXT:     SUB_INT T2.Z, literal.x, T1.W,
-; EG-NEXT:     LSHL T1.W, PV.Z, PV.Y,
-; EG-NEXT:     AND_INT * T3.W, T4.W, literal.y,
-; EG-NEXT:    150(2.101948e-43), 32(4.484155e-44)
-; EG-NEXT:     CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT:     AND_INT T2.Y, PV.Z, literal.x,
-; EG-NEXT:     SUB_INT T3.Z, literal.y, T0.Y,
-; EG-NEXT:     NOT_INT T4.W, T4.W,
-; EG-NEXT:     LSHR * T6.W, T1.Z, 1,
-; EG-NEXT:    32(4.484155e-44), 150(2.101948e-43)
-; EG-NEXT:     BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
-; EG-NEXT:     ADD_INT T0.Y, T0.Y, literal.x,
-; EG-NEXT:     BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
-; EG-NEXT:     BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
-; EG-NEXT:     AND_INT * T4.W, PV.Z, literal.y,
-; EG-NEXT:    -127(nan), 32(4.484155e-44)
-; EG-NEXT:     CNDE_INT T3.X, PS, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
-; EG-NEXT:     SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT:     CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
-; EG-NEXT:     CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
+; EG-NEXT:     AND_INT T1.Y, PV.Z, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, T2.W, PV.Z,
+; EG-NEXT:     LSHL T1.W, T2.W, PV.Y,
+; EG-NEXT:     AND_INT * T2.W, T3.W, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT:     CNDE_INT T1.W, PS, PV.X, PV.W,
+; EG-NEXT:     SETGT_INT * T2.W, T0.X, literal.x,
 ; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T2.X, T5.W, PS, PV.W,
-; EG-NEXT:     ASHR T1.Y, KC0[3].Z, literal.x,
-; EG-NEXT:     CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT:     CNDE_INT T1.W, PV.Z, PV.X, T1.X,
+; EG-NEXT:     CNDE_INT T1.Z, PS, 0.0, PV.W,
+; EG-NEXT:     CNDE_INT T1.W, PS, PV.Z, PV.Y,
 ; EG-NEXT:     ASHR * T2.W, KC0[4].X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     XOR_INT T2.Y, PV.W, PS,
-; EG-NEXT:     XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT:     XOR_INT T1.W, PV.X, PV.Y,
-; EG-NEXT:     XOR_INT * T3.W, T3.Y, PV.Y,
-; EG-NEXT:     SUB_INT T3.Y, PS, T1.Y,
-; EG-NEXT:     SUBB_UINT T1.Z, PV.W, T1.Y,
-; EG-NEXT:     SUB_INT T3.W, PV.Z, T2.W,
-; EG-NEXT:     SUBB_UINT * T4.W, PV.Y, T2.W,
-; EG-NEXT:     SUB_INT T4.Y, PV.W, PS,
-; EG-NEXT:     SUB_INT T0.Z, PV.Y, PV.Z,
-; EG-NEXT:     BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT:     AND_INT * T4.W, KC0[3].Y, literal.y,
-; EG-NEXT:    23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT:     SETGT_INT T0.X, 0.0, T0.X,
-; EG-NEXT:     ADD_INT T3.Y, PV.W, literal.x,
-; EG-NEXT:     OR_INT T1.Z, PS, literal.y,
-; EG-NEXT:     BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
-; EG-NEXT:     ADD_INT * T4.W, PV.W, literal.w,
-; EG-NEXT:    -127(nan), 8388608(1.175494e-38)
-; EG-NEXT:    23(3.222986e-44), -150(nan)
-; EG-NEXT:     AND_INT T1.X, KC0[3].W, literal.x,
-; EG-NEXT:     ADD_INT T5.Y, PV.W, literal.y,
-; EG-NEXT:     SUB_INT T2.Z, literal.z, T3.W,
-; EG-NEXT:     NOT_INT T3.W, PS,
-; EG-NEXT:     LSHR * T5.W, PV.Z, 1,
+; EG-NEXT:     XOR_INT T0.Z, PV.W, PS,
+; EG-NEXT:     BFE_UINT T1.W, KC0[3].Z, literal.x, T0.W,
+; EG-NEXT:     XOR_INT * T3.W, PV.Z, PS,
+; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T0.Y, KC0[3].Z, literal.x,
+; EG-NEXT:     ADD_INT T1.Z, PV.W, literal.y,
+; EG-NEXT:     SUB_INT T3.W, PS, T2.W,
+; EG-NEXT:     SUBB_UINT * T4.W, PV.Z, T2.W,
 ; EG-NEXT:    8388607(1.175494e-38), -150(nan)
-; EG-NEXT:    150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
-; EG-NEXT:     AND_INT T6.Y, PV.Z, literal.x,
-; EG-NEXT:     AND_INT T3.Z, PV.Y, literal.y,
-; EG-NEXT:     OR_INT T3.W, PV.X, literal.z,
-; EG-NEXT:     AND_INT * T5.W, T4.W, literal.y,
-; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
+; EG-NEXT:     SUB_INT T1.Y, PV.W, PS,
+; EG-NEXT:     AND_INT T2.Z, PV.Z, literal.x,
+; EG-NEXT:     BFE_UINT T3.W, KC0[3].Y, literal.y, T0.W,
+; EG-NEXT:     OR_INT * T4.W, PV.Y, literal.z,
+; EG-NEXT:    31(4.344025e-44), 23(3.222986e-44)
 ; EG-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
-; EG-NEXT:     LSHL T7.Y, T1.Z, PS,
-; EG-NEXT:     AND_INT T1.Z, T4.W, literal.x,
-; EG-NEXT:     LSHL T4.W, PV.W, PV.Z,
-; EG-NEXT:     AND_INT * T5.W, T5.Y, literal.x,
+; EG-NEXT:     SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT:     AND_INT T0.Y, KC0[3].Y, literal.x,
+; EG-NEXT:     ADD_INT T3.Z, PV.W, literal.y,
+; EG-NEXT:     LSHL T5.W, PS, PV.Z,
+; EG-NEXT:     AND_INT * T6.W, T1.Z, literal.z,
+; EG-NEXT:    8388607(1.175494e-38), -150(nan)
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T3.X, PS, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT:     CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
-; EG-NEXT:    ALU clause starting at 106:
-; EG-NEXT:     CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGT_INT * T7.W, T3.Y, literal.x,
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T1.X, PS, 0.0, PV.W,
-; EG-NEXT:     CNDE_INT T6.Y, PS, T2.Z, T8.Y,
-; EG-NEXT:     SUB_INT T1.Z, literal.x, T0.W,
-; EG-NEXT:     NOT_INT T6.W, T5.Y,
-; EG-NEXT:     LSHR * T7.W, T3.W, 1,
+; EG-NEXT:     BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
+; EG-NEXT:     CNDE_INT T2.Y, PS, PV.W, 0.0,
+; EG-NEXT:     AND_INT T2.Z, PV.Z, literal.y,
+; EG-NEXT:     OR_INT T0.W, PV.Y, literal.z,
+; EG-NEXT:     SUB_INT * T7.W, literal.w, T1.W,
+; EG-NEXT:    23(3.222986e-44), 31(4.344025e-44)
+; EG-NEXT:    8388608(1.175494e-38), 150(2.101948e-43)
+; EG-NEXT:     NOT_INT T2.X, T1.Z,
+; EG-NEXT:     LSHR T0.Y, T4.W, 1,
+; EG-NEXT:     AND_INT T1.Z, PS, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T4.W, 0.0, T4.W, PS,
+; EG-NEXT:     ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT:    32(4.484155e-44), -127(nan)
+; EG-NEXT:     SETGT_INT T3.X, PS, literal.x,
+; EG-NEXT:     CNDE_INT T3.Y, PV.Z, PV.W, 0.0,
+; EG-NEXT:     BIT_ALIGN_INT T1.Z, 0.0, PV.Y, PV.X,
+; EG-NEXT:     LSHL T4.W, T0.W, T2.Z,
+; EG-NEXT:     AND_INT * T7.W, T3.Z, literal.y,
+; EG-NEXT:    23(3.222986e-44), 32(4.484155e-44)
+; EG-NEXT:     AND_INT T2.X, KC0[3].W, literal.x,
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T1.Z, T6.W, PV.Z, T5.W,
+; EG-NEXT:     CNDE_INT T5.W, PV.X, PV.Y, T2.Y,
+; EG-NEXT:     ASHR * T6.W, KC0[3].Z, literal.y,
+; EG-NEXT:    8388607(1.175494e-38), 31(4.344025e-44)
+; EG-NEXT:     XOR_INT T4.X, PV.W, PS,
+; EG-NEXT:     SUB_INT T2.Y, literal.x, T3.W,
+; EG-NEXT:     NOT_INT T2.Z, T3.Z,
+; EG-NEXT:     LSHR T5.W, T0.W, 1,
+; EG-NEXT:     CNDE_INT * T8.W, T3.X, 0.0, PV.Z, BS:VEC_021/SCL_122
 ; EG-NEXT:    150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT:     ASHR T2.X, KC0[3].Y, literal.x,
-; EG-NEXT:     ADD_INT T5.Y, T0.W, literal.y,
-; EG-NEXT:     BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
-; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
-; EG-NEXT:     AND_INT * T3.W, PV.Z, literal.z,
-; EG-NEXT:    31(4.344025e-44), -127(nan)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, PS, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
+; EG-NEXT:     XOR_INT T3.X, PS, T6.W,
+; EG-NEXT:     ADD_INT T3.Y, T3.W, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T1.Z, 0.0, PV.W, PV.Z,
+; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, T0.W, PV.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     AND_INT * T3.W, PV.Y, literal.y,
+; EG-NEXT:    -127(nan), 32(4.484155e-44)
+; EG-NEXT:     CNDE_INT T5.X, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T2.Y, T7.W, PV.Z, T4.W,
 ; EG-NEXT:     SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT:     XOR_INT T0.W, T6.Y, PV.X,
-; EG-NEXT:     XOR_INT * T3.W, T1.X, PV.X,
+; EG-NEXT:     SUB_INT T0.W, PV.X, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T4.X, T6.W,
 ; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T1.X, PS, T2.X,
-; EG-NEXT:     SUBB_UINT T6.Y, PV.W, T2.X,
-; EG-NEXT:     CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT:     CNDE_INT T3.W, PV.Z, PV.X, T3.X,
-; EG-NEXT:     ASHR * T4.W, KC0[3].W, literal.x,
+; EG-NEXT:     SUB_INT T3.X, PV.W, PS,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, 0.0, PV.Y,
+; EG-NEXT:     CNDE_INT T1.Z, PV.Z, PV.X, T0.Y,
+; EG-NEXT:     OR_INT T0.W, T2.X, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, T1.X, literal.y,
+; EG-NEXT:    8388608(1.175494e-38), -150(nan)
+; EG-NEXT:     ADD_INT * T2.X, T1.X, literal.x,
+; EG-NEXT:    -127(nan), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 109:
+; EG-NEXT:     AND_INT T0.Y, T3.W, literal.x,
+; EG-NEXT:     SUB_INT T2.Z, literal.y, T1.X,
+; EG-NEXT:     NOT_INT T4.W, T3.W,
+; EG-NEXT:     LSHR * T5.W, T0.W, 1,
+; EG-NEXT:    31(4.344025e-44), 150(2.101948e-43)
+; EG-NEXT:     BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT:     AND_INT T4.Y, PV.Z, literal.x,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, 0.0, T0.W, PV.Z,
+; EG-NEXT:     LSHL T0.W, T0.W, PV.Y,
+; EG-NEXT:     AND_INT * T3.W, T3.W, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T5.X, KC0[3].Y, literal.x,
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT:     CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT:     CNDE_INT T0.W, PS, PV.X, PV.W,
+; EG-NEXT:     SETGT_INT * T3.W, T2.X, literal.y,
+; EG-NEXT:    31(4.344025e-44), 23(3.222986e-44)
+; EG-NEXT:     CNDE_INT T1.X, PS, 0.0, PV.W,
+; EG-NEXT:     CNDE_INT T0.Y, PS, PV.Z, PV.Y,
+; EG-NEXT:     ASHR T2.Z, KC0[3].W, literal.x,
+; EG-NEXT:     XOR_INT T0.W, T1.Z, PV.X,
+; EG-NEXT:     XOR_INT * T3.W, T2.Y, PV.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     XOR_INT T3.X, PV.W, PS,
-; EG-NEXT:     XOR_INT T7.Y, PV.Z, PS,
-; EG-NEXT:     SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT:     SETGT_INT T6.X, 0.0, T1.W,
+; EG-NEXT:     SUB_INT T2.Y, PS, T5.X,
+; EG-NEXT:     SUBB_UINT T1.Z, PV.W, T5.X,
+; EG-NEXT:     XOR_INT T1.W, PV.Y, PV.Z,
+; EG-NEXT:     XOR_INT * T3.W, PV.X, PV.Z,
+; EG-NEXT:     SUB_INT T1.X, PS, T2.Z,
+; EG-NEXT:     SUBB_UINT T0.Y, PV.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.Z, PV.Y, PV.Z,
 ; EG-NEXT:     SETGT_INT T3.W, 0.0, T3.Y,
-; EG-NEXT:     CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
-; EG-NEXT:     SETGT_INT T1.X, 0.0, T0.Y,
+; EG-NEXT:     CNDE_INT * T4.W, PV.X, T3.X, 0.0,
+; EG-NEXT:     SUB_INT T3.X, T4.X, T6.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PV.Z, 0.0,
+; EG-NEXT:     SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT:     SETGT_INT T5.W, 0.0, T2.X,
+; EG-NEXT:     CNDE_INT * T6.W, T0.X, T1.Y, 0.0,
 ; EG-NEXT:     CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT:     SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T1.W, PV.Y, T4.W,
-; EG-NEXT:     SUBB_UINT * T5.W, PV.X, T4.W,
-; EG-NEXT:     SUB_INT T4.X, PV.W, PS,
-; EG-NEXT:     SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
-; EG-NEXT:     SUB_INT T0.W, T0.W, T2.X,
-; EG-NEXT:     CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
-; EG-NEXT:     CNDE_INT T6.X, T3.W, PV.W, 0.0,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT:     SUB_INT T0.W, T2.Y, T2.W,
+; EG-NEXT:     CNDE_INT T4.Z, T6.X, PV.X, 0.0,
+; EG-NEXT:     SUB_INT T2.W, T0.Z, T2.W,
+; EG-NEXT:     SUB_INT * T0.W, T0.W, T5.X,
+; EG-NEXT:     CNDE_INT T4.X, T3.W, PS, 0.0,
+; EG-NEXT:     CNDE_INT T6.Z, T0.X, PV.W, 0.0,
+; EG-NEXT:     SUB_INT * T0.W, T1.W, T2.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     CNDE_INT T6.X, T5.W, PV.W, 0.0,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T1.Z, T1.X, PV.W, 0.0,
-; EG-NEXT:     SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT T1.X, T0.Y, PV.W, 0.0,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T1.X, PS, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %conv = fptoui <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 6a5f2182961ba..ea0a8cbe663fd 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1142,7 +1142,7 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
 ;
 ; EG-LABEL: v3i32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
@@ -1151,28 +1151,24 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
 ; EG-NEXT:     MOV T0.X, KC0[3].Y,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T2.X, PS, literal.x,
 ; EG-NEXT:     MOV * T3.X, KC0[3].W,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: v3i32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
+; CM-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV T1.X, KC0[3].W,
-; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
-; CM-NEXT:     MOV * T2.X, KC0[3].Y,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT * T1.X, PV.X, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV T2.X, KC0[3].W,
+; CM-NEXT:     MOV * T3.Y, KC0[3].Z,
+; CM-NEXT:     MOV * T3.X, KC0[3].Y,
 entry:
   store <3 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -1221,7 +1217,7 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
 ;
 ; EG-LABEL: v3f32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
@@ -1230,28 +1226,24 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
 ; EG-NEXT:     MOV T0.X, KC0[3].Y,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T2.X, PS, literal.x,
 ; EG-NEXT:     MOV * T3.X, KC0[3].W,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: v3f32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
+; CM-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV T1.X, KC0[3].W,
-; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
-; CM-NEXT:     MOV * T2.X, KC0[3].Y,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT * T1.X, PV.X, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV T2.X, KC0[3].W,
+; CM-NEXT:     MOV * T3.Y, KC0[3].Z,
+; CM-NEXT:     MOV * T3.X, KC0[3].Y,
 entry:
   store <3 x float> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -1957,7 +1949,7 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32>
 ;
 ; EG-LABEL: v5i32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
@@ -1968,30 +1960,26 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32>
 ; EG-NEXT:     MOV T0.X, KC0[4].Y,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T2.X, PS, literal.x,
 ; EG-NEXT:     MOV * T3.X, KC0[5].Y,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: v5i32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
+; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T2.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T0.W, KC0[5].X,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
-; CM-NEXT:     MOV * T0.Z, KC0[4].W,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV T2.X, KC0[5].Y,
-; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
-; CM-NEXT:     MOV * T0.X, KC0[4].Y,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.W, KC0[5].X,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T2.X, PV.X, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[4].W,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV T3.X, KC0[5].Y,
+; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
+; CM-NEXT:     MOV * T1.X, KC0[4].Y,
 entry:
   store <5 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -2056,7 +2044,7 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float
 ;
 ; EG-LABEL: v5f32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
@@ -2067,30 +2055,26 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float
 ; EG-NEXT:     MOV T0.X, KC0[4].Y,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T2.X, PS, literal.x,
 ; EG-NEXT:     MOV * T3.X, KC0[5].Y,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: v5f32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
+; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T2.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T0.W, KC0[5].X,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
-; CM-NEXT:     MOV * T0.Z, KC0[4].W,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV T2.X, KC0[5].Y,
-; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
-; CM-NEXT:     MOV * T0.X, KC0[4].Y,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.W, KC0[5].X,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T2.X, PV.X, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[4].W,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV T3.X, KC0[5].Y,
+; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
+; CM-NEXT:     MOV * T1.X, KC0[4].Y,
 entry:
   store <5 x float> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -2178,7 +2162,7 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64>
 ;
 ; EG-LABEL: v5i64_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 14, @6, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
@@ -2186,53 +2170,46 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64>
 ; EG-NEXT:    PAD
 ; EG-NEXT:    ALU clause starting at 6:
 ; EG-NEXT:     MOV * T0.W, KC0[7].X,
-; EG-NEXT:     MOV * T0.Z, KC0[6].W,
-; EG-NEXT:     MOV T0.Y, KC0[6].Z,
+; EG-NEXT:     MOV T0.Z, KC0[6].W,
 ; EG-NEXT:     MOV * T1.W, KC0[8].X,
-; EG-NEXT:     MOV T0.X, KC0[6].Y,
+; EG-NEXT:     MOV T0.Y, KC0[6].Z,
 ; EG-NEXT:     MOV * T1.Z, KC0[7].W,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV T0.X, KC0[6].Y,
 ; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T1.X, KC0[7].Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     MOV T5.Y, KC0[8].Z,
-; EG-NEXT:     MOV * T5.X, KC0[8].Y,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, PS, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     MOV * T5.Y, KC0[8].Z,
+; EG-NEXT:     MOV * T5.X, KC0[8].Y,
 ;
 ; CM-LABEL: v5i64_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
+; CM-NEXT:    ALU 15, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T5.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    ALU clause starting at 6:
-; CM-NEXT:     MOV * T0.W, KC0[8].X,
-; CM-NEXT:     MOV T1.Y, KC0[8].Z,
-; CM-NEXT:     MOV * T0.Z, KC0[7].W,
-; CM-NEXT:     MOV T1.X, KC0[8].Y,
-; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
-; CM-NEXT:     MOV T0.X, KC0[7].Y,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T2.W, KC0[7].X,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
-; CM-NEXT:     MOV T2.Z, KC0[6].W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
-; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV * T2.X, KC0[6].Y,
-; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T0.Y, KC0[8].Z,
+; CM-NEXT:     MOV T0.X, KC0[8].Y,
+; CM-NEXT:     MOV * T1.W, KC0[8].X,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[7].W,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     MOV T1.Y, KC0[7].Z,
+; CM-NEXT:     MOV * T4.W, KC0[7].X,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     MOV T1.X, KC0[7].Y,
+; CM-NEXT:     MOV * T4.Z, KC0[6].W,
+; CM-NEXT:     ADD_INT T5.X, T2.X, literal.x,
+; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV * T4.X, KC0[6].Y,
 entry:
   store <5 x i64> %in, ptr addrspace(1) %out, align 8
   ret void
@@ -2320,7 +2297,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ;
 ; EG-LABEL: v5f64_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 14, @6, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
@@ -2328,53 +2305,46 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; EG-NEXT:    PAD
 ; EG-NEXT:    ALU clause starting at 6:
 ; EG-NEXT:     MOV * T0.W, KC0[7].X,
-; EG-NEXT:     MOV * T0.Z, KC0[6].W,
-; EG-NEXT:     MOV T0.Y, KC0[6].Z,
+; EG-NEXT:     MOV T0.Z, KC0[6].W,
 ; EG-NEXT:     MOV * T1.W, KC0[8].X,
-; EG-NEXT:     MOV T0.X, KC0[6].Y,
+; EG-NEXT:     MOV T0.Y, KC0[6].Z,
 ; EG-NEXT:     MOV * T1.Z, KC0[7].W,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV T0.X, KC0[6].Y,
 ; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T1.X, KC0[7].Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     MOV T5.Y, KC0[8].Z,
-; EG-NEXT:     MOV * T5.X, KC0[8].Y,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, PS, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     MOV * T5.Y, KC0[8].Z,
+; EG-NEXT:     MOV * T5.X, KC0[8].Y,
 ;
 ; CM-LABEL: v5f64_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
+; CM-NEXT:    ALU 15, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T5.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    ALU clause starting at 6:
-; CM-NEXT:     MOV * T0.W, KC0[8].X,
-; CM-NEXT:     MOV T1.Y, KC0[8].Z,
-; CM-NEXT:     MOV * T0.Z, KC0[7].W,
-; CM-NEXT:     MOV T1.X, KC0[8].Y,
-; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
-; CM-NEXT:     MOV T0.X, KC0[7].Y,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T2.W, KC0[7].X,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
-; CM-NEXT:     MOV T2.Z, KC0[6].W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
-; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV * T2.X, KC0[6].Y,
-; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T0.Y, KC0[8].Z,
+; CM-NEXT:     MOV T0.X, KC0[8].Y,
+; CM-NEXT:     MOV * T1.W, KC0[8].X,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[7].W,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     MOV T1.Y, KC0[7].Z,
+; CM-NEXT:     MOV * T4.W, KC0[7].X,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     MOV T1.X, KC0[7].Y,
+; CM-NEXT:     MOV * T4.Z, KC0[6].W,
+; CM-NEXT:     ADD_INT T5.X, T2.X, literal.x,
+; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV * T4.X, KC0[6].Y,
 entry:
   store <5 x double> %in, ptr addrspace(1) %out, align 8
   ret void
@@ -2945,31 +2915,29 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ;
 ; EG-LABEL: v8i32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    ALU clause starting at 4:
 ; EG-NEXT:     MOV * T0.W, KC0[5].X,
-; EG-NEXT:     MOV * T0.Z, KC0[4].W,
-; EG-NEXT:     MOV T0.Y, KC0[4].Z,
+; EG-NEXT:     MOV T0.Z, KC0[4].W,
 ; EG-NEXT:     MOV * T1.W, KC0[6].X,
-; EG-NEXT:     MOV T0.X, KC0[4].Y,
+; EG-NEXT:     MOV T0.Y, KC0[4].Z,
 ; EG-NEXT:     MOV * T1.Z, KC0[5].W,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV T0.X, KC0[4].Y,
 ; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T1.X, KC0[5].Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PS, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: v8i32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
+; CM-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
 ; CM-NEXT:     MOV * T0.W, KC0[6].X,
@@ -2977,15 +2945,13 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
 ; CM-NEXT:     MOV T0.X, KC0[5].Y,
 ; CM-NEXT:     MOV * T1.W, KC0[5].X,
-; CM-NEXT:     MOV T1.Z, KC0[4].W,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
-; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[4].W,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T1.X, KC0[4].Y,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   store <8 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -3056,31 +3022,29 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
 ;
 ; EG-LABEL: v8f32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    ALU clause starting at 4:
 ; EG-NEXT:     MOV * T0.W, KC0[5].X,
-; EG-NEXT:     MOV * T0.Z, KC0[4].W,
-; EG-NEXT:     MOV T0.Y, KC0[4].Z,
+; EG-NEXT:     MOV T0.Z, KC0[4].W,
 ; EG-NEXT:     MOV * T1.W, KC0[6].X,
-; EG-NEXT:     MOV T0.X, KC0[4].Y,
+; EG-NEXT:     MOV T0.Y, KC0[4].Z,
 ; EG-NEXT:     MOV * T1.Z, KC0[5].W,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV T0.X, KC0[4].Y,
 ; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T1.X, KC0[5].Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PS, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: v8f32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
+; CM-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
 ; CM-NEXT:     MOV * T0.W, KC0[6].X,
@@ -3088,15 +3052,13 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
 ; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
 ; CM-NEXT:     MOV T0.X, KC0[5].Y,
 ; CM-NEXT:     MOV * T1.W, KC0[5].X,
-; CM-NEXT:     MOV T1.Z, KC0[4].W,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
-; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[4].W,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T1.X, KC0[4].Y,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   store <8 x float> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -3650,7 +3612,7 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
 ; EG-NEXT:    TEX 0 @64
 ; EG-NEXT:    ALU 5, @154, KC0[], KC1[]
 ; EG-NEXT:    TEX 0 @66
-; EG-NEXT:    ALU 13, @160, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 12, @160, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
 ; EG-NEXT:    CF_END
@@ -3795,13 +3757,12 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
 ; EG-NEXT:     MOV T6.X, PV.Z,
 ; EG-NEXT:     MOV * T0.Y, T8.X,
 ; EG-NEXT:    ALU clause starting at 160:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
 ; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
 ; EG-NEXT:     AND_INT * T1.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), -65536(nan)
+; EG-NEXT:    4(5.605194e-45), -65536(nan)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T11.X, PV.W, PS,
 ; EG-NEXT:     MOV T8.X, PV.X,
@@ -3844,9 +3805,9 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
 ; CM-NEXT:    TEX 0 @64
 ; CM-NEXT:    ALU 5, @154, KC0[], KC1[]
 ; CM-NEXT:    TEX 0 @66
-; CM-NEXT:    ALU 14, @160, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
+; CM-NEXT:    ALU 12, @160, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T14.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 36:
 ; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
@@ -3989,14 +3950,12 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
 ; CM-NEXT:     MOV T6.X, PV.Z,
 ; CM-NEXT:     MOV * T0.Y, T8.X,
 ; CM-NEXT:    ALU clause starting at 160:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T13.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T14.X, PV.X, literal.x,
 ; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
 ; CM-NEXT:     AND_INT * T0.W, T11.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), -65536(nan)
+; CM-NEXT:    4(5.605194e-45), -65536(nan)
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     OR_INT * T11.X, PV.Z, PV.W,
 ; CM-NEXT:     MOV T8.X, PV.X,
@@ -4116,50 +4075,44 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
 ;
 ; EG-LABEL: v16i32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 23, @6, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    ALU clause starting at 6:
 ; EG-NEXT:     MOV * T0.W, KC0[7].X,
-; EG-NEXT:     MOV * T0.Z, KC0[6].W,
-; EG-NEXT:     MOV T0.Y, KC0[6].Z,
+; EG-NEXT:     MOV T0.Z, KC0[6].W,
 ; EG-NEXT:     MOV * T1.W, KC0[8].X,
-; EG-NEXT:     MOV T0.X, KC0[6].Y,
+; EG-NEXT:     MOV T0.Y, KC0[6].Z,
 ; EG-NEXT:     MOV * T1.Z, KC0[7].W,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV T0.X, KC0[6].Y,
 ; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     MOV * T3.W, KC0[9].X,
 ; EG-NEXT:     MOV T1.X, KC0[7].Y,
-; EG-NEXT:     MOV * T3.Z, KC0[8].W,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     MOV T3.Y, KC0[8].Z,
-; EG-NEXT:     MOV * T5.W, KC0[10].X,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     MOV T3.X, KC0[8].Y,
-; EG-NEXT:     MOV * T5.Z, KC0[9].W,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     MOV T5.Y, KC0[9].Z,
-; EG-NEXT:     MOV * T5.X, KC0[9].Y,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T2.W, KC0[9].X,
+; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T2.Z, KC0[8].W,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T4.X, PV.X, literal.x,
+; EG-NEXT:     MOV T2.Y, KC0[8].Z,
+; EG-NEXT:     MOV * T5.W, KC0[10].X,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV T2.X, KC0[8].Y,
+; EG-NEXT:     MOV T5.Z, KC0[9].W,
+; EG-NEXT:     ADD_INT * T6.X, T3.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV * T5.Y, KC0[9].Z,
+; EG-NEXT:     MOV T5.X, KC0[9].Y,
+; EG-NEXT:     ADD_INT * T7.X, T3.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: v16i32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 23, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T2.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T5.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 6:
@@ -4167,31 +4120,26 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
 ; CM-NEXT:     MOV * T0.Z, KC0[9].W,
 ; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
 ; CM-NEXT:     MOV T0.X, KC0[9].Y,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T2.W, KC0[9].X,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     MOV T2.Z, KC0[8].W,
-; CM-NEXT:     MOV * T1.W, KC0[8].X,
-; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
-; CM-NEXT:     MOV T2.Y, KC0[8].Z,
-; CM-NEXT:     MOV * T1.Z, KC0[7].W,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV T2.X, KC0[8].Y,
-; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
-; CM-NEXT:     MOV T1.X, KC0[7].Y,
-; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T4.W, KC0[7].X,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
-; CM-NEXT:     MOV T4.Z, KC0[6].W,
-; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
-; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV * T4.X, KC0[6].Y,
-; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.W, KC0[9].X,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[8].W,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     MOV T1.Y, KC0[8].Z,
+; CM-NEXT:     MOV * T4.W, KC0[8].X,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     MOV T1.X, KC0[8].Y,
+; CM-NEXT:     MOV * T4.Z, KC0[7].W,
+; CM-NEXT:     ADD_INT T5.X, T2.X, literal.x,
+; CM-NEXT:     MOV T4.Y, KC0[7].Z,
+; CM-NEXT:     MOV * T6.W, KC0[7].X,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     MOV T4.X, KC0[7].Y,
+; CM-NEXT:     MOV * T6.Z, KC0[6].W,
+; CM-NEXT:     ADD_INT T7.X, T2.X, literal.x,
+; CM-NEXT:     MOV * T6.Y, KC0[6].Z,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV * T6.X, KC0[6].Y,
 entry:
   store <16 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -4304,50 +4252,44 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
 ;
 ; EG-LABEL: v16f32_arg:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 23, @6, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    ALU clause starting at 6:
 ; EG-NEXT:     MOV * T0.W, KC0[7].X,
-; EG-NEXT:     MOV * T0.Z, KC0[6].W,
-; EG-NEXT:     MOV T0.Y, KC0[6].Z,
+; EG-NEXT:     MOV T0.Z, KC0[6].W,
 ; EG-NEXT:     MOV * T1.W, KC0[8].X,
-; EG-NEXT:     MOV T0.X, KC0[6].Y,
+; EG-NEXT:     MOV T0.Y, KC0[6].Z,
 ; EG-NEXT:     MOV * T1.Z, KC0[7].W,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV T0.X, KC0[6].Y,
 ; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     MOV * T3.W, KC0[9].X,
 ; EG-NEXT:     MOV T1.X, KC0[7].Y,
-; EG-NEXT:     MOV * T3.Z, KC0[8].W,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     MOV T3.Y, KC0[8].Z,
-; EG-NEXT:     MOV * T5.W, KC0[10].X,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     MOV T3.X, KC0[8].Y,
-; EG-NEXT:     MOV * T5.Z, KC0[9].W,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     MOV T5.Y, KC0[9].Z,
-; EG-NEXT:     MOV * T5.X, KC0[9].Y,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T2.W, KC0[9].X,
+; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T2.Z, KC0[8].W,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T4.X, PV.X, literal.x,
+; EG-NEXT:     MOV T2.Y, KC0[8].Z,
+; EG-NEXT:     MOV * T5.W, KC0[10].X,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV T2.X, KC0[8].Y,
+; EG-NEXT:     MOV T5.Z, KC0[9].W,
+; EG-NEXT:     ADD_INT * T6.X, T3.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV * T5.Y, KC0[9].Z,
+; EG-NEXT:     MOV T5.X, KC0[9].Y,
+; EG-NEXT:     ADD_INT * T7.X, T3.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: v16f32_arg:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 23, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T2.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T5.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 6:
@@ -4355,31 +4297,26 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
 ; CM-NEXT:     MOV * T0.Z, KC0[9].W,
 ; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
 ; CM-NEXT:     MOV T0.X, KC0[9].Y,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T2.W, KC0[9].X,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     MOV T2.Z, KC0[8].W,
-; CM-NEXT:     MOV * T1.W, KC0[8].X,
-; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
-; CM-NEXT:     MOV T2.Y, KC0[8].Z,
-; CM-NEXT:     MOV * T1.Z, KC0[7].W,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV T2.X, KC0[8].Y,
-; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
-; CM-NEXT:     MOV T1.X, KC0[7].Y,
-; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T4.W, KC0[7].X,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
-; CM-NEXT:     MOV T4.Z, KC0[6].W,
-; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
-; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     MOV * T4.X, KC0[6].Y,
-; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.W, KC0[9].X,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     MOV * T1.Z, KC0[8].W,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     MOV T1.Y, KC0[8].Z,
+; CM-NEXT:     MOV * T4.W, KC0[8].X,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     MOV T1.X, KC0[8].Y,
+; CM-NEXT:     MOV * T4.Z, KC0[7].W,
+; CM-NEXT:     ADD_INT T5.X, T2.X, literal.x,
+; CM-NEXT:     MOV T4.Y, KC0[7].Z,
+; CM-NEXT:     MOV * T6.W, KC0[7].X,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     MOV T4.X, KC0[7].Y,
+; CM-NEXT:     MOV * T6.Z, KC0[6].W,
+; CM-NEXT:     ADD_INT T7.X, T2.X, literal.x,
+; CM-NEXT:     MOV * T6.Y, KC0[6].Z,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT:     MOV * T6.X, KC0[6].Y,
 entry:
   store <16 x float> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -6092,14 +6029,14 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
 ; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @16
 ; EG-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
-; EG-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    ALU 1, @29, KC0[], KC1[]
 ; EG-NEXT:    TEX 0 @18
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
-; EG-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 0
+; EG-NEXT:    ALU 1, @31, KC0[], KC1[]
 ; EG-NEXT:    TEX 0 @20
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
-; EG-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 0
+; EG-NEXT:    ALU 0, @33, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
@@ -6108,46 +6045,40 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
 ; EG-NEXT:    Fetch clause starting at 16:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
 ; EG-NEXT:    Fetch clause starting at 18:
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 32, #1
 ; EG-NEXT:    Fetch clause starting at 20:
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 24:
 ; EG-NEXT:     MOV * T0.X, KC0[6].Y,
 ; EG-NEXT:    ALU clause starting at 25:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; EG-NEXT:    ALU clause starting at 29:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T1.X, T2.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 31:
+; EG-NEXT:     ADD_INT * T1.X, T2.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:    ALU clause starting at 33:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 37:
-; EG-NEXT:     MOV T1.X, KC0[10].Y,
-; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV * T1.X, KC0[10].Y,
 ;
 ; CM-LABEL: byref_natural_align_constant_v16i32_arg:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @16
 ; CM-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
-; CM-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
+; CM-NEXT:    ALU 1, @29, KC0[], KC1[]
 ; CM-NEXT:    TEX 0 @18
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
-; CM-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T1.X
+; CM-NEXT:    ALU 1, @31, KC0[], KC1[]
 ; CM-NEXT:    TEX 0 @20
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
-; CM-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T1.X
+; CM-NEXT:    ALU 0, @33, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @22
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
@@ -6156,32 +6087,26 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
 ; CM-NEXT:    Fetch clause starting at 16:
 ; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
 ; CM-NEXT:    Fetch clause starting at 18:
-; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 32, #1
 ; CM-NEXT:    Fetch clause starting at 20:
-; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
+; CM-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
 ; CM-NEXT:    Fetch clause starting at 22:
 ; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 24:
 ; CM-NEXT:     MOV * T0.X, KC0[6].Y,
 ; CM-NEXT:    ALU clause starting at 25:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T2.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; CM-NEXT:    ALU clause starting at 29:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T1.X, T2.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:    ALU clause starting at 31:
+; CM-NEXT:     ADD_INT * T1.X, T2.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:    ALU clause starting at 33:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 37:
 ; CM-NEXT:     MOV * T1.X, KC0[10].Y,
-; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %in = load <16 x i32>, ptr addrspace(4) %in.byref
   store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
   store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 0aa5681b4215c..91e4c0f1d7f7a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -1212,375 +1212,371 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ;
 ; R600-LABEL: s_exp_v3f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 99, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    ALU 69, @106, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT:    ALU 98, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    ALU 69, @105, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    PAD
 ; R600-NEXT:    ALU clause starting at 6:
 ; R600-NEXT:     AND_INT * T0.W, KC0[3].Y, literal.x,
 ; R600-NEXT:    -4096(nan), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T1.W, PV.W, literal.x,
-; R600-NEXT:     ADD * T2.W, KC0[3].Y, -PV.W,
+; R600-NEXT:     ADD * T1.W, KC0[3].Y, -PV.W,
+; R600-NEXT:     MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT:     MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT:    967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT:     RNDNE T4.W, PS,
+; R600-NEXT:     MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122
 ; R600-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT:     RNDNE * T3.W, PV.W,
-; R600-NEXT:     TRUNC T4.W, PV.W,
-; R600-NEXT:     MUL_IEEE * T5.W, T2.W, literal.x,
+; R600-NEXT:     MULADD_IEEE T0.W, T0.W, literal.x, PS,
+; R600-NEXT:     ADD * T1.W, T3.W, -PV.W,
 ; R600-NEXT:    967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT:     MULADD_IEEE T2.W, T2.W, literal.x, PS,
-; R600-NEXT:     FLT_TO_INT * T4.W, PV.W,
-; R600-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT:     MAX_INT T0.Z, PS, literal.x,
-; R600-NEXT:     MULADD_IEEE T0.W, T0.W, literal.y, PV.W,
-; R600-NEXT:     ADD * T1.W, T1.W, -T3.W,
-; R600-NEXT:    -330(nan), 967029397(3.122284e-04)
-; R600-NEXT:     ADD T0.Y, PS, PV.W,
-; R600-NEXT:     ADD_INT T0.Z, PV.Z, literal.x,
-; R600-NEXT:     ADD_INT T0.W, T4.W, literal.y,
-; R600-NEXT:     SETGT_UINT * T1.W, T4.W, literal.z,
-; R600-NEXT:    204(2.858649e-43), 102(1.429324e-43)
-; R600-NEXT:    -229(nan), 0(0.000000e+00)
-; R600-NEXT:     CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT:     SETGT_INT T0.W, T4.W, literal.x,
-; R600-NEXT:     EXP_IEEE * T0.X, PV.Y,
-; R600-NEXT:    -127(nan), 0(0.000000e+00)
+; R600-NEXT:     ADD T0.W, PS, PV.W,
+; R600-NEXT:     TRUNC * T1.W, T4.W,
+; R600-NEXT:     FLT_TO_INT T1.W, PS,
+; R600-NEXT:     EXP_IEEE * T0.X, PV.W,
+; R600-NEXT:     MAX_INT T0.W, PV.W, literal.x,
+; R600-NEXT:     MUL_IEEE * T2.W, PS, literal.y,
+; R600-NEXT:    -330(nan), 209715200(1.972152e-31)
 ; R600-NEXT:     MUL_IEEE T1.X, PS, literal.x,
-; R600-NEXT:     CNDE_INT T0.Y, PV.W, PV.Z, T4.W,
-; R600-NEXT:     MIN_INT T0.Z, T4.W, literal.y,
-; R600-NEXT:     AND_INT T2.W, KC0[3].W, literal.z,
-; R600-NEXT:     MUL_IEEE * T3.W, PS, literal.w,
-; R600-NEXT:    2130706432(1.701412e+38), 381(5.338947e-43)
-; R600-NEXT:    -4096(nan), 209715200(1.972152e-31)
-; R600-NEXT:     MUL_IEEE T2.X, PS, literal.x,
-; R600-NEXT:     ADD T1.Y, KC0[3].W, -PV.W,
-; R600-NEXT:     ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT:     ADD_INT T5.W, T4.W, literal.z,
-; R600-NEXT:     SETGT_UINT * T6.W, T4.W, literal.w,
-; R600-NEXT:    209715200(1.972152e-31), -254(nan)
-; R600-NEXT:    -127(nan), 254(3.559298e-43)
-; R600-NEXT:     CNDE_INT T3.X, PS, PV.W, PV.Z,
-; R600-NEXT:     SETGT_INT T2.Y, T4.W, literal.x,
-; R600-NEXT:     MUL_IEEE T0.Z, PV.Y, literal.y,
-; R600-NEXT:     MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212
-; R600-NEXT:    127(1.779649e-43), 967029397(3.122284e-04)
-; R600-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT:     CNDE_INT * T1.W, T1.W, T2.X, T3.W,
-; R600-NEXT:     CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122
-; R600-NEXT:     RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212
-; R600-NEXT:     MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z,
-; R600-NEXT:     CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212
-; R600-NEXT:     MUL_IEEE * T1.W, T1.X, literal.y,
-; R600-NEXT:    1069064192(1.442383e+00), 2130706432(1.701412e+38)
-; R600-NEXT:     CNDE_INT T1.X, T6.W, T1.X, PS,
-; R600-NEXT:     LSHL T0.Y, PV.W, literal.x,
-; R600-NEXT:     AND_INT T1.Z, KC0[3].Z, literal.y,
-; R600-NEXT:     MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT:     ADD * T1.W, T4.W, -PV.Y,
-; R600-NEXT:    23(3.222986e-44), -4096(nan)
-; R600-NEXT:    967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT:     ADD T1.Y, PS, PV.W,
-; R600-NEXT:     MUL_IEEE T0.Z, PV.Z, literal.x,
-; R600-NEXT:     ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT:     CNDE_INT * T1.W, T2.Y, T0.X, PV.X,
-; R600-NEXT:    1069064192(1.442383e+00), 1065353216(1.000000e+00)
-; R600-NEXT:     MUL_IEEE T0.X, PS, PV.W,
-; R600-NEXT:     ADD T0.Y, KC0[3].Z, -T1.Z,
-; R600-NEXT:     RNDNE T2.Z, PV.Z,
-; R600-NEXT:     TRUNC T0.W, T3.Y,
-; R600-NEXT:     EXP_IEEE * T1.X, PV.Y,
-; R600-NEXT:     SETGT T2.X, literal.x, KC0[3].Y,
-; R600-NEXT:     FLT_TO_INT T1.Y, PV.W,
-; R600-NEXT:     TRUNC T3.Z, PV.Z,
-; R600-NEXT:     MUL_IEEE T0.W, PV.Y, literal.y,
-; R600-NEXT:     MUL_IEEE * T1.W, PS, literal.z,
-; R600-NEXT:    -1026650416(-1.032789e+02), 967029397(3.122284e-04)
-; R600-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T3.X, PS, literal.x,
-; R600-NEXT:     MUL_IEEE T2.Y, T1.X, literal.y,
-; R600-NEXT:     MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W,
-; R600-NEXT:     FLT_TO_INT T0.W, PV.Z,
-; R600-NEXT:     MIN_INT * T2.W, PV.Y, literal.w,
-; R600-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
-; R600-NEXT:    1069064192(1.442383e+00), 381(5.338947e-43)
-; R600-NEXT:     ADD_INT T4.X, PS, literal.x,
-; R600-NEXT:     MAX_INT T0.Y, PV.W, literal.y,
-; R600-NEXT:     MULADD_IEEE T1.Z, T1.Z, literal.z, PV.Z,
-; R600-NEXT:     ADD T2.W, T0.Z, -T2.Z, BS:VEC_120/SCL_212
-; R600-NEXT:     MIN_INT * T3.W, PV.W, literal.w,
-; R600-NEXT:    -254(nan), -330(nan)
-; R600-NEXT:    967029397(3.122284e-04), 381(5.338947e-43)
-; R600-NEXT:     ADD_INT T5.X, PS, literal.x,
-; R600-NEXT:     ADD T3.Y, PV.W, PV.Z,
-; R600-NEXT:     ADD_INT T0.Z, PV.Y, literal.y,
-; R600-NEXT:     ADD_INT T2.W, T0.W, literal.z,
-; R600-NEXT:     SETGT_UINT * T3.W, T0.W, literal.w,
-; R600-NEXT:    -254(nan), 204(2.858649e-43)
+; R600-NEXT:     ADD_INT T0.Y, PV.W, literal.y,
+; R600-NEXT:     ADD_INT T0.Z, T1.W, literal.z,
+; R600-NEXT:     SETGT_UINT * T0.W, T1.W, literal.w,
+; R600-NEXT:    209715200(1.972152e-31), 204(2.858649e-43)
 ; R600-NEXT:    102(1.429324e-43), -229(nan)
-; R600-NEXT:     ADD_INT * T6.X, T0.W, literal.x,
+; R600-NEXT:     AND_INT * T3.W, KC0[3].W, literal.x,
+; R600-NEXT:    -4096(nan), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T2.X, T0.X, literal.x,
+; R600-NEXT:     AND_INT T1.Y, KC0[3].Z, literal.y,
+; R600-NEXT:     ADD T1.Z, KC0[3].W, -PV.W,
+; R600-NEXT:     CNDE_INT T4.W, T0.W, T0.Y, T0.Z,
+; R600-NEXT:     SETGT_INT * T5.W, T1.W, literal.z,
+; R600-NEXT:    2130706432(1.701412e+38), -4096(nan)
 ; R600-NEXT:    -127(nan), 0(0.000000e+00)
-; R600-NEXT:    ALU clause starting at 106:
-; R600-NEXT:     SETGT_UINT T0.Y, T0.W, literal.x,
-; R600-NEXT:     CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221
-; R600-NEXT:     SETGT_INT T2.W, T0.W, literal.y,
-; R600-NEXT:     EXP_IEEE * T1.Z, T3.Y,
-; R600-NEXT:    254(3.559298e-43), -127(nan)
-; R600-NEXT:     ADD_INT T7.X, T1.Y, literal.x,
-; R600-NEXT:     MUL_IEEE T3.Y, PS, literal.y,
-; R600-NEXT:     CNDE_INT T0.Z, PV.W, PV.Z, T0.W,
-; R600-NEXT:     CNDE_INT T4.W, PV.Y, T6.X, T5.X,
-; R600-NEXT:     SETGT_INT * T0.W, T0.W, literal.z,
-; R600-NEXT:    -127(nan), 209715200(1.972152e-31)
-; R600-NEXT:    127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT:     SETGT_UINT T5.X, T1.Y, literal.x,
-; R600-NEXT:     CNDE_INT T4.Y, PS, PV.Z, PV.W,
-; R600-NEXT:     MAX_INT T0.Z, T1.Y, literal.y,
-; R600-NEXT:     MUL_IEEE T4.W, PV.Y, literal.z,
-; R600-NEXT:     MUL_IEEE * T5.W, T1.Z, literal.w,
-; R600-NEXT:    254(3.559298e-43), -330(nan)
-; R600-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
-; R600-NEXT:     MUL_IEEE T6.X, PS, literal.x,
-; R600-NEXT:     CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122
-; R600-NEXT:     ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT:     ADD_INT T3.W, T1.Y, literal.z,
-; R600-NEXT:     SETGT_UINT * T4.W, T1.Y, literal.w,
-; R600-NEXT:    2130706432(1.701412e+38), 204(2.858649e-43)
-; R600-NEXT:    102(1.429324e-43), -229(nan)
-; R600-NEXT:     CNDE_INT T8.X, PS, PV.Z, PV.W,
-; R600-NEXT:     SETGT_INT T5.Y, T1.Y, literal.x,
-; R600-NEXT:     CNDE_INT T0.Z, T2.W, PV.Y, T1.Z,
-; R600-NEXT:     CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT:     LSHL * T3.W, T4.Y, literal.y,
-; R600-NEXT:    -127(nan), 23(3.222986e-44)
-; R600-NEXT:     ADD_INT T6.X, PS, literal.x,
-; R600-NEXT:     CNDE_INT T0.Y, T0.W, PV.Z, PV.W,
-; R600-NEXT:     CNDE_INT T0.Z, PV.Y, PV.X, T1.Y,
-; R600-NEXT:     CNDE_INT T0.W, T5.X, T7.X, T4.X,
-; R600-NEXT:     SETGT_INT * T2.W, T1.Y, literal.y,
-; R600-NEXT:    1065353216(1.000000e+00), 127(1.779649e-43)
-; R600-NEXT:     CNDE_INT T4.X, PS, PV.Z, PV.W,
-; R600-NEXT:     MUL_IEEE T0.Y, PV.Y, PV.X,
-; R600-NEXT:     SETGT T0.Z, literal.x, KC0[3].Z,
+; R600-NEXT:     CNDE_INT T3.X, PS, PV.W, T1.W,
+; R600-NEXT:     MUL_IEEE T0.Y, PV.Z, literal.x,
+; R600-NEXT:     MUL_IEEE T0.Z, T3.W, literal.y,
+; R600-NEXT:     MIN_INT T4.W, T1.W, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT:     ADD * T6.W, KC0[3].Z, -PV.Y,
+; R600-NEXT:    967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT:    381(5.338947e-43), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T4.X, PS, literal.x,
+; R600-NEXT:     MUL_IEEE T2.Y, T1.Y, literal.y,
+; R600-NEXT:     ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT:     ADD_INT * T4.W, T1.W, literal.w,
+; R600-NEXT:    967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT:    -254(nan), -127(nan)
+; R600-NEXT:     SETGT_UINT * T7.W, T1.W, literal.x,
+; R600-NEXT:    254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT:     CNDE_INT T5.X, PV.W, T4.W, T2.Z,
+; R600-NEXT:     RNDNE T3.Y, T2.Y,
+; R600-NEXT:     MULADD_IEEE T2.Z, T6.W, literal.x, T4.X,
+; R600-NEXT:     RNDNE T4.W, T0.Z,
+; R600-NEXT:     MULADD_IEEE * T6.W, T1.Z, literal.x, T0.Y, BS:VEC_021/SCL_122
+; R600-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
+; R600-NEXT:     SETGT_INT T4.X, T1.W, literal.x,
+; R600-NEXT:     MULADD_IEEE T0.Y, T3.W, literal.y, PS, BS:VEC_120/SCL_212
+; R600-NEXT:     ADD T0.Z, T0.Z, -PV.W,
+; R600-NEXT:     MULADD_IEEE T1.W, T1.Y, literal.y, PV.Z,
+; R600-NEXT:     ADD * T3.W, T2.Y, -PV.Y,
+; R600-NEXT:    127(1.779649e-43), 967029397(3.122284e-04)
+; R600-NEXT:     ADD T6.X, PS, PV.W,
+; R600-NEXT:     ADD T0.Y, PV.Z, PV.Y,
+; R600-NEXT:     CNDE_INT T0.Z, PV.X, T3.X, T5.X,
+; R600-NEXT:     MUL_IEEE * T1.W, T2.X, literal.x,
+; R600-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT:     CNDE_INT * T0.W, T0.W, T1.X, T2.W,
+; R600-NEXT:     CNDE_INT T0.X, T5.W, PV.W, T0.X,
+; R600-NEXT:     CNDE_INT T1.Y, T7.W, T2.X, T1.W, BS:VEC_102/SCL_221
+; R600-NEXT:     LSHL * T0.Z, T0.Z, literal.x,
+; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; R600-NEXT:     TRUNC T0.W, T4.W,
+; R600-NEXT:     EXP_IEEE * T0.Y, T0.Y,
+; R600-NEXT:     FLT_TO_INT T1.X, PV.W,
+; R600-NEXT:     TRUNC T2.Y, T3.Y, BS:VEC_120/SCL_212
+; R600-NEXT:     MUL_IEEE T1.Z, PS, literal.x,
+; R600-NEXT:     ADD_INT T0.W, T0.Z, literal.y,
+; R600-NEXT:     CNDE_INT * T1.W, T4.X, T0.X, T1.Y,
+; R600-NEXT:    209715200(1.972152e-31), 1065353216(1.000000e+00)
+; R600-NEXT:     MUL_IEEE T0.X, PS, PV.W,
+; R600-NEXT:     MUL_IEEE T1.Y, PV.Z, literal.x,
+; R600-NEXT:     FLT_TO_INT T0.Z, PV.Y,
+; R600-NEXT:     MAX_INT T0.W, PV.X, literal.y,
+; R600-NEXT:     EXP_IEEE * T1.W, T6.X,
+; R600-NEXT:    209715200(1.972152e-31), -330(nan)
+; R600-NEXT:     MUL_IEEE T2.X, T0.Y, literal.x,
+; R600-NEXT:     MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT:     ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT:     ADD_INT * T0.W, T1.X, literal.w,
+; R600-NEXT:    2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT:    204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT:     MAX_INT * T2.W, T0.Z, literal.x,
+; R600-NEXT:    -330(nan), 0(0.000000e+00)
+; R600-NEXT:     SETGT_UINT T3.X, T1.X, literal.x,
+; R600-NEXT:     ADD_INT T3.Y, PV.W, literal.y,
+; R600-NEXT:     ADD_INT T3.Z, T0.Z, literal.z,
+; R600-NEXT:     SETGT_UINT * T2.W, T0.Z, literal.x,
+; R600-NEXT:    -229(nan), 204(2.858649e-43)
+; R600-NEXT:    102(1.429324e-43), 0(0.000000e+00)
+; R600-NEXT:    ALU clause starting at 105:
+; R600-NEXT:     MIN_INT * T3.W, T0.Z, literal.x,
+; R600-NEXT:    381(5.338947e-43), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT T4.X, PV.W, literal.x,
+; R600-NEXT:     ADD_INT T4.Y, T0.Z, literal.y,
+; R600-NEXT:     SETGT_UINT T4.Z, T0.Z, literal.z,
+; R600-NEXT:     CNDE_INT T3.W, T2.W, T3.Y, T3.Z, BS:VEC_021/SCL_122
+; R600-NEXT:     SETGT_INT * T4.W, T0.Z, literal.y,
+; R600-NEXT:    -254(nan), -127(nan)
+; R600-NEXT:    254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT:     CNDE_INT T5.X, PS, PV.W, T0.Z,
+; R600-NEXT:     CNDE_INT T3.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT:     SETGT_INT T0.Z, T0.Z, literal.x,
+; R600-NEXT:     CNDE_INT T0.W, T3.X, T2.Z, T0.W,
+; R600-NEXT:     SETGT_INT * T3.W, T1.X, literal.y,
+; R600-NEXT:    127(1.779649e-43), -127(nan)
+; R600-NEXT:     CNDE_INT T4.X, PS, PV.W, T1.X,
+; R600-NEXT:     CNDE_INT T3.Y, PV.Z, PV.X, PV.Y,
+; R600-NEXT:     MIN_INT T2.Z, T1.X, literal.x,
 ; R600-NEXT:     MUL_IEEE T0.W, T2.Y, literal.y,
-; R600-NEXT:     CNDE_INT * T1.W, T4.W, T3.X, T1.W,
-; R600-NEXT:    -1026650416(-1.032789e+02), 2130706432(1.701412e+38)
-; R600-NEXT:     CNDE_INT T1.X, T5.Y, PS, T1.X,
-; R600-NEXT:     CNDE_INT T1.Y, T5.X, T2.Y, PV.W,
-; R600-NEXT:     CNDE T0.Z, PV.Z, PV.Y, 0.0,
-; R600-NEXT:     SETGT T0.W, KC0[3].Z, literal.x,
-; R600-NEXT:     LSHL * T1.W, PV.X, literal.y,
-; R600-NEXT:    1118925336(8.872284e+01), 23(3.222986e-44)
-; R600-NEXT:     ADD_INT T3.X, PS, literal.x,
-; R600-NEXT:     CNDE T0.Y, PV.W, PV.Z, literal.y,
-; R600-NEXT:     CNDE_INT T0.Z, T2.W, PV.X, PV.Y,
-; R600-NEXT:     CNDE T0.W, T2.X, T0.X, 0.0,
+; R600-NEXT:     MUL_IEEE * T5.W, T1.W, literal.z,
+; R600-NEXT:    381(5.338947e-43), 209715200(1.972152e-31)
+; R600-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T5.X, PS, literal.x,
+; R600-NEXT:     CNDE_INT T2.Y, T2.W, PV.W, T2.Y,
+; R600-NEXT:     ADD_INT T2.Z, PV.Z, literal.y,
+; R600-NEXT:     ADD_INT T0.W, T1.X, literal.z,
+; R600-NEXT:     SETGT_UINT * T2.W, T1.X, literal.w,
+; R600-NEXT:    2130706432(1.701412e+38), -254(nan)
+; R600-NEXT:    -127(nan), 254(3.559298e-43)
+; R600-NEXT:     CNDE_INT T6.X, PS, PV.W, PV.Z,
+; R600-NEXT:     SETGT_INT T4.Y, T1.X, literal.x,
+; R600-NEXT:     CNDE_INT T2.Z, T4.W, PV.Y, T1.W,
+; R600-NEXT:     CNDE_INT T0.W, T4.Z, T5.W, PV.X,
+; R600-NEXT:     LSHL * T1.W, T3.Y, literal.y,
+; R600-NEXT:    127(1.779649e-43), 23(3.222986e-44)
+; R600-NEXT:     ADD_INT T1.X, PS, literal.x,
+; R600-NEXT:     CNDE_INT T2.Y, T0.Z, PV.Z, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT:     CNDE_INT T0.Z, PV.Y, T4.X, PV.X,
+; R600-NEXT:     MUL_IEEE T0.W, T2.X, literal.y,
+; R600-NEXT:     CNDE_INT * T1.W, T3.X, T1.Y, T1.Z,
+; R600-NEXT:    1065353216(1.000000e+00), 2130706432(1.701412e+38)
+; R600-NEXT:     CNDE_INT T3.X, T3.W, PS, T0.Y,
+; R600-NEXT:     CNDE_INT T0.Y, T2.W, T2.X, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT:     LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT:     MUL_IEEE T0.W, PV.Y, PV.X,
+; R600-NEXT:     SETGT * T1.W, literal.y, KC0[3].Z,
+; R600-NEXT:    23(3.222986e-44), -1026650416(-1.032789e+02)
+; R600-NEXT:     SETGT T1.X, literal.x, KC0[3].Y,
+; R600-NEXT:     CNDE T1.Y, PS, PV.W, 0.0,
+; R600-NEXT:     SETGT T1.Z, KC0[3].Z, literal.y,
+; R600-NEXT:     ADD_INT T0.W, PV.Z, literal.z,
+; R600-NEXT:     CNDE_INT * T1.W, T4.Y, PV.X, PV.Y,
+; R600-NEXT:    -1026650416(-1.032789e+02), 1118925336(8.872284e+01)
+; R600-NEXT:    1065353216(1.000000e+00), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T2.X, PS, PV.W,
+; R600-NEXT:     CNDE T0.Y, PV.Z, PV.Y, literal.x,
+; R600-NEXT:     SETGT T0.Z, literal.y, KC0[3].W,
+; R600-NEXT:     CNDE T0.W, PV.X, T0.X, 0.0,
 ; R600-NEXT:     SETGT * T1.W, KC0[3].Y, literal.z,
-; R600-NEXT:    1065353216(1.000000e+00), 2139095040(INF)
+; R600-NEXT:    2139095040(INF), -1026650416(-1.032789e+02)
 ; R600-NEXT:    1118925336(8.872284e+01), 0(0.000000e+00)
 ; R600-NEXT:     CNDE T0.X, PS, PV.W, literal.x,
-; R600-NEXT:     MUL_IEEE T0.W, PV.Z, PV.X,
-; R600-NEXT:     SETGT * T1.W, literal.y, KC0[3].W,
-; R600-NEXT:    2139095040(INF), -1026650416(-1.032789e+02)
-; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
-; R600-NEXT:     CNDE T0.W, PS, PV.W, 0.0,
+; R600-NEXT:     CNDE T0.W, PV.Z, PV.X, 0.0,
 ; R600-NEXT:     SETGT * T1.W, KC0[3].W, literal.y,
-; R600-NEXT:    2(2.802597e-45), 1118925336(8.872284e+01)
-; R600-NEXT:     CNDE T2.X, PS, PV.W, literal.x,
-; R600-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; R600-NEXT:    2139095040(INF), 8(1.121039e-44)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:    2139095040(INF), 1118925336(8.872284e+01)
+; R600-NEXT:     CNDE T1.X, PS, PV.W, literal.x,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.y,
+; R600-NEXT:    2139095040(INF), 2(2.802597e-45)
+; R600-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: s_exp_v3f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 101, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    ALU 77, @108, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X
+; CM-NEXT:    ALU 98, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 77, @105, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    ALU clause starting at 6:
-; CM-NEXT:     AND_INT * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT:     AND_INT * T0.W, KC0[3].W, literal.x,
 ; CM-NEXT:    -4096(nan), 0(0.000000e+00)
-; CM-NEXT:     ADD * T1.W, KC0[3].Y, -PV.W,
-; CM-NEXT:     MUL_IEEE T0.Z, PV.W, literal.x,
-; CM-NEXT:     MUL_IEEE * T2.W, T0.W, literal.y,
-; CM-NEXT:    967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT:     RNDNE T1.Z, PV.W,
-; CM-NEXT:     MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
+; CM-NEXT:     MUL_IEEE * T1.W, PV.W, literal.x,
 ; CM-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT:     MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
-; CM-NEXT:     ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT:    967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT:     TRUNC T1.Z, T1.Z,
-; CM-NEXT:     ADD * T0.W, PV.W, PV.Z,
-; CM-NEXT:     EXP_IEEE T0.X, T0.W,
-; CM-NEXT:     EXP_IEEE T0.Y (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT:     FLT_TO_INT T0.Z, T1.Z,
-; CM-NEXT:     MUL_IEEE * T0.W, PV.X, literal.x,
-; CM-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT:     RNDNE T0.Z, PV.W,
+; CM-NEXT:     ADD * T2.W, KC0[3].W, -T0.W,
 ; CM-NEXT:     MUL_IEEE T0.Y, PV.W, literal.x,
-; CM-NEXT:     MAX_INT T1.Z, PV.Z, literal.y,
-; CM-NEXT:     MIN_INT * T1.W, PV.Z, literal.z,
-; CM-NEXT:    209715200(1.972152e-31), -330(nan)
+; CM-NEXT:     AND_INT T1.Z, KC0[3].Y, literal.y,
+; CM-NEXT:     TRUNC * T3.W, PV.Z,
+; CM-NEXT:    967029397(3.122284e-04), -4096(nan)
+; CM-NEXT:     FLT_TO_INT T1.Y, PV.W,
+; CM-NEXT:     ADD T2.Z, KC0[3].Y, -PV.Z,
+; CM-NEXT:     MULADD_IEEE * T2.W, T2.W, literal.x, PV.Y,
+; CM-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT:     MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
+; CM-NEXT:     MUL_IEEE T0.Y, PV.Z, literal.x,
+; CM-NEXT:     MAX_INT T3.Z, PV.Y, literal.y,
+; CM-NEXT:     MIN_INT * T0.W, PV.Y, literal.z,
+; CM-NEXT:    967029397(3.122284e-04), -330(nan)
 ; CM-NEXT:    381(5.338947e-43), 0(0.000000e+00)
 ; CM-NEXT:     ADD_INT T1.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T1.Y, PV.Z, literal.y,
-; CM-NEXT:     ADD_INT T1.Z, T0.Z, literal.z,
-; CM-NEXT:     SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT:     ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT:     ADD_INT T3.Z, T1.Y, literal.z,
+; CM-NEXT:     SETGT_UINT * T0.W, T1.Y, literal.w,
 ; CM-NEXT:    -254(nan), 204(2.858649e-43)
 ; CM-NEXT:    102(1.429324e-43), -229(nan)
-; CM-NEXT:     ADD_INT T2.X, T0.Z, literal.x,
-; CM-NEXT:     SETGT_UINT T2.Y, T0.Z, literal.y,
-; CM-NEXT:     CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; CM-NEXT:     SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT:     ADD_INT T2.X, T1.Y, literal.x,
+; CM-NEXT:     SETGT_UINT T3.Y, T1.Y, literal.y,
+; CM-NEXT:     CNDE_INT T3.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT:     SETGT_INT * T2.W, T1.Y, literal.x,
 ; CM-NEXT:    -127(nan), 254(3.559298e-43)
-; CM-NEXT:     MUL_IEEE T3.X, T0.X, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
-; CM-NEXT:     CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
-; CM-NEXT:     SETGT_INT * T3.W, T0.Z, literal.y,
-; CM-NEXT:    2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT:     MUL_IEEE T3.X, T1.Z, literal.x,
+; CM-NEXT:     CNDE_INT T2.Y, PV.W, PV.Z, T1.Y,
+; CM-NEXT:     CNDE_INT T3.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT:     SETGT_INT * T3.W, T1.Y, literal.y,
+; CM-NEXT:    1069064192(1.442383e+00), 127(1.779649e-43)
+; CM-NEXT:     ADD T1.X, T1.W, -T0.Z,
 ; CM-NEXT:     CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT:     MUL_IEEE T0.Z, PV.X, literal.x,
-; CM-NEXT:     CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
-; CM-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T0.X, T2.W, PV.W, T0.X,
-; CM-NEXT:     CNDE_INT T0.Y, T2.Y, T3.X, PV.Z,
-; CM-NEXT:     LSHL T0.Z, PV.Y, literal.x,
-; CM-NEXT:     AND_INT * T0.W, KC0[3].Z, literal.y,
-; CM-NEXT:    23(3.222986e-44), -4096(nan)
-; CM-NEXT:     ADD T1.Y, KC0[3].Z, -PV.W,
-; CM-NEXT:     ADD_INT T0.Z, PV.Z, literal.x,
-; CM-NEXT:     CNDE_INT * T1.W, T3.W, PV.X, PV.Y,
-; CM-NEXT:    1065353216(1.000000e+00), 0(0.000000e+00)
-; CM-NEXT:     MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT:     MUL_IEEE T0.Y, PV.Y, literal.x,
-; CM-NEXT:     MUL_IEEE T0.Z, T0.W, literal.y,
-; CM-NEXT:     AND_INT * T1.W, KC0[3].W, literal.z,
-; CM-NEXT:    967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT:    -4096(nan), 0(0.000000e+00)
-; CM-NEXT:     SETGT T1.X, literal.x, KC0[3].Y,
-; CM-NEXT:     ADD T2.Y, KC0[3].W, -PV.W,
-; CM-NEXT:     RNDNE T1.Z, PV.Z,
-; CM-NEXT:     MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y,
-; CM-NEXT:    -1026650416(-1.032789e+02), 1069064192(1.442383e+00)
-; CM-NEXT:     MULADD_IEEE T2.X, T0.W, literal.x, PV.W,
-; CM-NEXT:     ADD T0.Y, T0.Z, -PV.Z,
-; CM-NEXT:     MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT:     MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT:     TRUNC T3.X, T1.Z,
-; CM-NEXT:     RNDNE T1.Y, PV.W,
-; CM-NEXT:     MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z,
-; CM-NEXT:     ADD * T2.W, PV.Y, PV.X,
+; CM-NEXT:     RNDNE T0.Z, PV.X,
+; CM-NEXT:     MULADD_IEEE * T1.W, T2.Z, literal.x, T0.Y,
 ; CM-NEXT:    1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT:     EXP_IEEE T0.X (MASKED), T2.W,
-; CM-NEXT:     EXP_IEEE T0.Y, T2.W,
-; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T2.W,
-; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T2.W,
-; CM-NEXT:     MULADD_IEEE T2.X, T1.W, literal.x, T0.Z,
-; CM-NEXT:     ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212
-; CM-NEXT:     FLT_TO_INT T0.Z, T3.X,
-; CM-NEXT:     MUL_IEEE * T0.W, PV.Y, literal.y,
-; CM-NEXT:    967029397(3.122284e-04), 209715200(1.972152e-31)
-; CM-NEXT:     MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT:     SETGT_UINT T3.Y, PV.Z, literal.y,
-; CM-NEXT:     TRUNC T1.Z, T1.Y,
-; CM-NEXT:     ADD * T1.W, PV.Y, PV.X,
-; CM-NEXT:    209715200(1.972152e-31), -229(nan)
-; CM-NEXT:     EXP_IEEE T1.X (MASKED), T1.W,
-; CM-NEXT:     EXP_IEEE T1.Y, T1.W,
-; CM-NEXT:     EXP_IEEE T1.Z (MASKED), T1.W,
-; CM-NEXT:     EXP_IEEE * T1.W (MASKED), T1.W,
-; CM-NEXT:     FLT_TO_INT T2.X, T1.Z,
-; CM-NEXT:     MUL_IEEE T2.Y, PV.Y, literal.x,
-; CM-NEXT:     CNDE_INT T1.Z, T3.Y, T3.X, T0.W,
-; CM-NEXT:     SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    209715200(1.972152e-31), -127(nan)
-; CM-NEXT:     CNDE_INT T3.X, PV.W, PV.Z, T0.Y,
-; CM-NEXT:     MUL_IEEE * T4.Y, PV.Y, literal.x,
-; CM-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 108:
-; CM-NEXT:     SETGT_UINT T1.Z, T2.X, literal.x,
-; CM-NEXT:     MAX_INT * T1.W, T0.Z, literal.y,
-; CM-NEXT:    -229(nan), -330(nan)
-; CM-NEXT:     ADD_INT T4.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T5.Y, T0.Z, literal.y,
-; CM-NEXT:     CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y,
-; CM-NEXT:     SETGT_INT * T1.W, T2.X, literal.z,
-; CM-NEXT:    204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT:     MULADD_IEEE T2.X, T1.Z, literal.x, PV.W,
+; CM-NEXT:     ADD T0.Y, T3.X, -PV.Z,
+; CM-NEXT:     LSHL T1.Z, PV.Y, literal.y,
+; CM-NEXT:     ADD * T1.W, PV.X, T0.X,
+; CM-NEXT:    967029397(3.122284e-04), 23(3.222986e-44)
+; CM-NEXT:     EXP_IEEE T0.X, T1.W,
+; CM-NEXT:     EXP_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     ADD_INT T1.Y, T1.Z, literal.x,
+; CM-NEXT:     MUL_IEEE T1.Z, PV.X, literal.y,
+; CM-NEXT:     ADD * T1.W, T0.Y, T2.X,
+; CM-NEXT:    1065353216(1.000000e+00), 209715200(1.972152e-31)
+; CM-NEXT:     EXP_IEEE T0.X (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE T0.Y, T1.W,
+; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     TRUNC T1.X, T0.Z,
+; CM-NEXT:     MUL_IEEE T2.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     MUL_IEEE T0.Z, T0.X, literal.y,
+; CM-NEXT:     MUL_IEEE * T1.W, PV.Y, literal.x,
+; CM-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
+; CM-NEXT:     MUL_IEEE T2.X, PV.W, literal.x,
+; CM-NEXT:     MUL_IEEE T4.Y, PV.Z, literal.y,
+; CM-NEXT:     CNDE_INT T1.Z, T0.W, PV.Y, T1.Z,
+; CM-NEXT:     FLT_TO_INT * T0.W, PV.X,
+; CM-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
+; CM-NEXT:     SETGT_UINT T1.X, PV.W, literal.x,
+; CM-NEXT:     AND_INT T2.Y, KC0[3].Z, literal.y,
+; CM-NEXT:     CNDE_INT T1.Z, T2.W, PV.Z, T0.X,
+; CM-NEXT:     CNDE_INT * T2.W, T3.Y, T0.Z, PV.Y,
+; CM-NEXT:    -229(nan), -4096(nan)
+; CM-NEXT:     CNDE_INT T0.X, T3.W, PV.Z, PV.W,
+; CM-NEXT:     ADD T3.Y, KC0[3].Z, -PV.Y,
+; CM-NEXT:     CNDE_INT T0.Z, PV.X, T2.X, T1.W,
+; CM-NEXT:     SETGT_INT * T1.W, T0.W, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    -127(nan), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T5.X, PV.W, PV.Z, T1.Y,
+; CM-NEXT:     CNDE_INT T2.X, PV.W, PV.Z, T0.Y,
 ; CM-NEXT:     MUL_IEEE T0.Y, T0.Y, literal.x,
-; CM-NEXT:     MAX_INT T2.Z, T2.X, literal.y,
-; CM-NEXT:     CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212
-; CM-NEXT:    2130706432(1.701412e+38), -330(nan)
-; CM-NEXT:     CNDE_INT T4.X, T0.W, PV.W, T0.Z,
-; CM-NEXT:     ADD_INT T2.Y, PV.Z, literal.x,
-; CM-NEXT:     ADD_INT T2.Z, T2.X, literal.y,
-; CM-NEXT:     MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT:     MUL_IEEE T0.Z, T2.Y, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     MUL_IEEE * T2.W, PV.Y, literal.z,
+; CM-NEXT:    2130706432(1.701412e+38), 1069064192(1.442383e+00)
+; CM-NEXT:    967029397(3.122284e-04), 0(0.000000e+00)
+; CM-NEXT:     MULADD_IEEE T3.X, T3.Y, literal.x, PV.W,
+; CM-NEXT:     RNDNE T3.Y, PV.Z,
+; CM-NEXT:     MUL_IEEE T1.Z, PV.Y, literal.y,
+; CM-NEXT:     SETGT_UINT * T2.W, T0.W, literal.z,
+; CM-NEXT:    1069064192(1.442383e+00), 2130706432(1.701412e+38)
+; CM-NEXT:    254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT:     CNDE_INT T4.X, PV.W, T0.Y, PV.Z,
+; CM-NEXT:     TRUNC T0.Y, PV.Y,
+; CM-NEXT:     MULADD_IEEE T1.Z, T2.Y, literal.x, PV.X,
+; CM-NEXT:     MAX_INT * T3.W, T0.W, literal.y,
+; CM-NEXT:    967029397(3.122284e-04), -330(nan)
+; CM-NEXT:     ADD T3.X, T0.Z, -T3.Y,
+; CM-NEXT:     ADD_INT T2.Y, PV.W, literal.x,
+; CM-NEXT:     ADD_INT * T0.Z, T0.W, literal.y,
 ; CM-NEXT:    204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT:    ALU clause starting at 105:
+; CM-NEXT:     MIN_INT * T3.W, T0.W, literal.x,
 ; CM-NEXT:    381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT:     ADD_INT T6.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T3.Y, T2.X, literal.y,
-; CM-NEXT:     SETGT_UINT T3.Z, T2.X, literal.z,
-; CM-NEXT:     CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT:     ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T3.Y, T0.W, literal.y,
+; CM-NEXT:     CNDE_INT T0.Z, T1.X, T2.Y, T0.Z,
+; CM-NEXT:     ADD * T3.W, T3.X, T1.Z, BS:VEC_102/SCL_221
 ; CM-NEXT:    -254(nan), -127(nan)
-; CM-NEXT:    254(3.559298e-43), 0(0.000000e+00)
-; CM-NEXT:     MUL_IEEE T7.X, T1.Y, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, T1.W, PV.W, T2.X,
-; CM-NEXT:     CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
-; CM-NEXT:     MIN_INT * T0.W, T0.Z, literal.y,
-; CM-NEXT:    2130706432(1.701412e+38), 381(5.338947e-43)
-; CM-NEXT:     SETGT_INT T2.X, T2.X, literal.x,
-; CM-NEXT:     ADD_INT T2.Y, PV.W, literal.y,
-; CM-NEXT:     ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT:     EXP_IEEE T1.X, T3.W,
+; CM-NEXT:     EXP_IEEE T1.Y (MASKED), T3.W,
+; CM-NEXT:     EXP_IEEE T1.Z (MASKED), T3.W,
+; CM-NEXT:     EXP_IEEE * T1.W (MASKED), T3.W,
+; CM-NEXT:     CNDE_INT T3.X, T1.W, T0.Z, T0.W,
+; CM-NEXT:     CNDE_INT T2.Y, T2.W, T3.Y, T5.X, BS:VEC_120/SCL_212
+; CM-NEXT:     FLT_TO_INT T0.Z, T0.Y,
+; CM-NEXT:     MUL_IEEE * T1.W, PV.X, literal.x,
+; CM-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT:     SETGT_INT T5.X, T0.W, literal.x,
+; CM-NEXT:     MUL_IEEE T0.Y, PV.W, literal.y,
+; CM-NEXT:     MAX_INT T1.Z, PV.Z, literal.z,
+; CM-NEXT:     MIN_INT * T0.W, PV.Z, literal.w,
+; CM-NEXT:    127(1.779649e-43), 209715200(1.972152e-31)
+; CM-NEXT:    -330(nan), 381(5.338947e-43)
+; CM-NEXT:     ADD_INT T6.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T3.Y, PV.Z, literal.y,
+; CM-NEXT:     ADD_INT T1.Z, T0.Z, literal.z,
 ; CM-NEXT:     SETGT_UINT * T0.W, T0.Z, literal.w,
-; CM-NEXT:    127(1.779649e-43), -254(nan)
+; CM-NEXT:    -254(nan), 204(2.858649e-43)
+; CM-NEXT:    102(1.429324e-43), -229(nan)
+; CM-NEXT:     ADD_INT T7.X, T0.Z, literal.x,
+; CM-NEXT:     SETGT_UINT T4.Y, T0.Z, literal.y,
+; CM-NEXT:     CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT:     SETGT_INT * T2.W, T0.Z, literal.x,
 ; CM-NEXT:    -127(nan), 254(3.559298e-43)
-; CM-NEXT:     CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
-; CM-NEXT:     SETGT_INT T2.Y, T0.Z, literal.x,
-; CM-NEXT:     CNDE_INT T0.Z, PV.X, T1.Y, T1.Z,
-; CM-NEXT:     MUL_IEEE * T1.W, T7.X, literal.y,
-; CM-NEXT:    127(1.779649e-43), 2130706432(1.701412e+38)
-; CM-NEXT:     CNDE_INT T7.X, T3.Z, T7.X, PV.W,
-; CM-NEXT:     LSHL T1.Y, PV.Z, literal.x,
-; CM-NEXT:     CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122
-; CM-NEXT:     MUL_IEEE * T1.W, T0.Y, literal.y,
-; CM-NEXT:    23(3.222986e-44), 2130706432(1.701412e+38)
-; CM-NEXT:     CNDE_INT T4.X, T0.W, T0.Y, PV.W,
-; CM-NEXT:     LSHL T0.Y, PV.Z, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE_INT * T0.W, T2.X, T5.X, PV.X,
-; CM-NEXT:    23(3.222986e-44), 1065353216(1.000000e+00)
+; CM-NEXT:     MUL_IEEE T8.X, T1.X, literal.x,
+; CM-NEXT:     CNDE_INT T3.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT:     CNDE_INT T1.Z, PV.Y, PV.X, T6.X,
+; CM-NEXT:     SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT:    2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT:     CNDE_INT T6.X, PV.W, PV.Y, PV.Z,
+; CM-NEXT:     MUL_IEEE T3.Y, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T0.Z, T0.W, T0.Y, T1.W,
+; CM-NEXT:     CNDE_INT * T0.W, T5.X, T3.X, T2.Y,
+; CM-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     LSHL T3.X, PV.W, literal.x,
+; CM-NEXT:     CNDE_INT T0.Y, T2.W, PV.Z, T1.X,
+; CM-NEXT:     CNDE_INT T0.Z, T4.Y, T8.X, PV.Y,
+; CM-NEXT:     LSHL * T0.W, PV.X, literal.x,
+; CM-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT:     CNDE_INT T0.Y, T3.W, PV.Y, PV.Z,
+; CM-NEXT:     ADD_INT T0.Z, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT * T0.W, T5.X, T2.X, T4.X,
+; CM-NEXT:    1065353216(1.000000e+00), 0(0.000000e+00)
 ; CM-NEXT:     MUL_IEEE T2.X, PV.W, PV.Z,
-; CM-NEXT:     SETGT T1.Y, literal.x, KC0[3].W,
-; CM-NEXT:     ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE_INT * T0.W, T2.Y, T3.X, PV.X,
-; CM-NEXT:    -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
-; CM-NEXT:     MUL_IEEE T3.X, PV.W, PV.Z,
-; CM-NEXT:     SETGT T0.Y, literal.x, KC0[3].Z,
-; CM-NEXT:     CNDE T0.Z, PV.Y, PV.X, 0.0,
-; CM-NEXT:     SETGT * T0.W, KC0[3].W, literal.y,
-; CM-NEXT:    -1026650416(-1.032789e+02), 1118925336(8.872284e+01)
-; CM-NEXT:     CNDE T2.X, PV.W, PV.Z, literal.x,
-; CM-NEXT:     CNDE T0.Y, PV.Y, PV.X, 0.0,
-; CM-NEXT:     SETGT T0.Z, KC0[3].Z, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2139095040(INF), 1118925336(8.872284e+01)
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T3.X, PV.W, literal.x,
-; CM-NEXT:     CNDE T0.Y, PV.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE T0.Z, T1.X, T0.X, 0.0,
+; CM-NEXT:     MUL_IEEE T0.Y, PV.Y, PV.X,
+; CM-NEXT:     MUL_IEEE T0.Z, T0.X, T1.Y,
+; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].W,
+; CM-NEXT:    -1026650416(-1.032789e+02), 0(0.000000e+00)
+; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT:     SETGT T1.Y, literal.y, KC0[3].Z,
+; CM-NEXT:     CNDE T0.Z, PV.W, PV.Z, 0.0,
+; CM-NEXT:     SETGT * T0.W, KC0[3].W, literal.z,
+; CM-NEXT:    2(2.802597e-45), -1026650416(-1.032789e+02)
+; CM-NEXT:    1118925336(8.872284e+01), 0(0.000000e+00)
+; CM-NEXT:     CNDE T1.X, PV.W, PV.Z, literal.x,
+; CM-NEXT:     SETGT T2.Y, literal.y, KC0[3].Y,
+; CM-NEXT:     CNDE T0.Z, PV.Y, T0.Y, 0.0,
+; CM-NEXT:     SETGT * T0.W, KC0[3].Z, literal.z,
+; CM-NEXT:    2139095040(INF), -1026650416(-1.032789e+02)
+; CM-NEXT:    1118925336(8.872284e+01), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; CM-NEXT:     CNDE T4.Y, PV.W, PV.Z, literal.y,
+; CM-NEXT:     CNDE T0.Z, PV.Y, T2.X, 0.0,
 ; CM-NEXT:     SETGT * T0.W, KC0[3].Y, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 2139095040(INF)
 ; CM-NEXT:    1118925336(8.872284e+01), 0(0.000000e+00)
-; CM-NEXT:     CNDE * T0.X, PV.W, PV.Z, literal.x,
+; CM-NEXT:     CNDE * T4.X, PV.W, PV.Z, literal.x,
 ; CM-NEXT:    2139095040(INF), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %in)
   store <3 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a696a17d7bc6c..34799aed60793 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -1214,375 +1214,371 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; R600-LABEL: s_exp10_v3f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 99, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    ALU 69, @106, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT:    ALU 98, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    ALU 69, @105, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    PAD
 ; R600-NEXT:    ALU clause starting at 6:
 ; R600-NEXT:     AND_INT * T0.W, KC0[3].Y, literal.x,
 ; R600-NEXT:    -4096(nan), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T1.W, PV.W, literal.x,
-; R600-NEXT:     ADD * T2.W, KC0[3].Y, -PV.W,
+; R600-NEXT:     ADD * T1.W, KC0[3].Y, -PV.W,
+; R600-NEXT:     MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT:     MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT:    975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT:     RNDNE T4.W, PS,
+; R600-NEXT:     MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122
 ; R600-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT:     RNDNE * T3.W, PV.W,
-; R600-NEXT:     TRUNC T4.W, PV.W,
-; R600-NEXT:     MUL_IEEE * T5.W, T2.W, literal.x,
+; R600-NEXT:     MULADD_IEEE T0.W, T0.W, literal.x, PS,
+; R600-NEXT:     ADD * T1.W, T3.W, -PV.W,
 ; R600-NEXT:    975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT:     MULADD_IEEE T2.W, T2.W, literal.x, PS,
-; R600-NEXT:     FLT_TO_INT * T4.W, PV.W,
-; R600-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT:     MAX_INT T0.Z, PS, literal.x,
-; R600-NEXT:     MULADD_IEEE T0.W, T0.W, literal.y, PV.W,
-; R600-NEXT:     ADD * T1.W, T1.W, -T3.W,
-; R600-NEXT:    -330(nan), 975668412(6.390323e-04)
-; R600-NEXT:     ADD T0.Y, PS, PV.W,
-; R600-NEXT:     ADD_INT T0.Z, PV.Z, literal.x,
-; R600-NEXT:     ADD_INT T0.W, T4.W, literal.y,
-; R600-NEXT:     SETGT_UINT * T1.W, T4.W, literal.z,
-; R600-NEXT:    204(2.858649e-43), 102(1.429324e-43)
-; R600-NEXT:    -229(nan), 0(0.000000e+00)
-; R600-NEXT:     CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT:     SETGT_INT T0.W, T4.W, literal.x,
-; R600-NEXT:     EXP_IEEE * T0.X, PV.Y,
-; R600-NEXT:    -127(nan), 0(0.000000e+00)
+; R600-NEXT:     ADD T0.W, PS, PV.W,
+; R600-NEXT:     TRUNC * T1.W, T4.W,
+; R600-NEXT:     FLT_TO_INT T1.W, PS,
+; R600-NEXT:     EXP_IEEE * T0.X, PV.W,
+; R600-NEXT:     MAX_INT T0.W, PV.W, literal.x,
+; R600-NEXT:     MUL_IEEE * T2.W, PS, literal.y,
+; R600-NEXT:    -330(nan), 209715200(1.972152e-31)
 ; R600-NEXT:     MUL_IEEE T1.X, PS, literal.x,
-; R600-NEXT:     CNDE_INT T0.Y, PV.W, PV.Z, T4.W,
-; R600-NEXT:     MIN_INT T0.Z, T4.W, literal.y,
-; R600-NEXT:     AND_INT T2.W, KC0[3].W, literal.z,
-; R600-NEXT:     MUL_IEEE * T3.W, PS, literal.w,
-; R600-NEXT:    2130706432(1.701412e+38), 381(5.338947e-43)
-; R600-NEXT:    -4096(nan), 209715200(1.972152e-31)
-; R600-NEXT:     MUL_IEEE T2.X, PS, literal.x,
-; R600-NEXT:     ADD T1.Y, KC0[3].W, -PV.W,
-; R600-NEXT:     ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT:     ADD_INT T5.W, T4.W, literal.z,
-; R600-NEXT:     SETGT_UINT * T6.W, T4.W, literal.w,
-; R600-NEXT:    209715200(1.972152e-31), -254(nan)
-; R600-NEXT:    -127(nan), 254(3.559298e-43)
-; R600-NEXT:     CNDE_INT T3.X, PS, PV.W, PV.Z,
-; R600-NEXT:     SETGT_INT T2.Y, T4.W, literal.x,
-; R600-NEXT:     MUL_IEEE T0.Z, PV.Y, literal.y,
-; R600-NEXT:     MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212
-; R600-NEXT:    127(1.779649e-43), 975668412(6.390323e-04)
-; R600-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT:     CNDE_INT * T1.W, T1.W, T2.X, T3.W,
-; R600-NEXT:     CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122
-; R600-NEXT:     RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212
-; R600-NEXT:     MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z,
-; R600-NEXT:     CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212
-; R600-NEXT:     MUL_IEEE * T1.W, T1.X, literal.y,
-; R600-NEXT:    1079283712(3.321289e+00), 2130706432(1.701412e+38)
-; R600-NEXT:     CNDE_INT T1.X, T6.W, T1.X, PS,
-; R600-NEXT:     LSHL T0.Y, PV.W, literal.x,
-; R600-NEXT:     AND_INT T1.Z, KC0[3].Z, literal.y,
-; R600-NEXT:     MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT:     ADD * T1.W, T4.W, -PV.Y,
-; R600-NEXT:    23(3.222986e-44), -4096(nan)
-; R600-NEXT:    975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT:     ADD T1.Y, PS, PV.W,
-; R600-NEXT:     MUL_IEEE T0.Z, PV.Z, literal.x,
-; R600-NEXT:     ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT:     CNDE_INT * T1.W, T2.Y, T0.X, PV.X,
-; R600-NEXT:    1079283712(3.321289e+00), 1065353216(1.000000e+00)
-; R600-NEXT:     MUL_IEEE T0.X, PS, PV.W,
-; R600-NEXT:     ADD T0.Y, KC0[3].Z, -T1.Z,
-; R600-NEXT:     RNDNE T2.Z, PV.Z,
-; R600-NEXT:     TRUNC T0.W, T3.Y,
-; R600-NEXT:     EXP_IEEE * T1.X, PV.Y,
-; R600-NEXT:     SETGT T2.X, literal.x, KC0[3].Y,
-; R600-NEXT:     FLT_TO_INT T1.Y, PV.W,
-; R600-NEXT:     TRUNC T3.Z, PV.Z,
-; R600-NEXT:     MUL_IEEE T0.W, PV.Y, literal.y,
-; R600-NEXT:     MUL_IEEE * T1.W, PS, literal.z,
-; R600-NEXT:    -1036817932(-4.485347e+01), 975668412(6.390323e-04)
-; R600-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T3.X, PS, literal.x,
-; R600-NEXT:     MUL_IEEE T2.Y, T1.X, literal.y,
-; R600-NEXT:     MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W,
-; R600-NEXT:     FLT_TO_INT T0.W, PV.Z,
-; R600-NEXT:     MIN_INT * T2.W, PV.Y, literal.w,
-; R600-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
-; R600-NEXT:    1079283712(3.321289e+00), 381(5.338947e-43)
-; R600-NEXT:     ADD_INT T4.X, PS, literal.x,
-; R600-NEXT:     MAX_INT T0.Y, PV.W, literal.y,
-; R600-NEXT:     MULADD_IEEE T1.Z, T1.Z, literal.z, PV.Z,
-; R600-NEXT:     ADD T2.W, T0.Z, -T2.Z, BS:VEC_120/SCL_212
-; R600-NEXT:     MIN_INT * T3.W, PV.W, literal.w,
-; R600-NEXT:    -254(nan), -330(nan)
-; R600-NEXT:    975668412(6.390323e-04), 381(5.338947e-43)
-; R600-NEXT:     ADD_INT T5.X, PS, literal.x,
-; R600-NEXT:     ADD T3.Y, PV.W, PV.Z,
-; R600-NEXT:     ADD_INT T0.Z, PV.Y, literal.y,
-; R600-NEXT:     ADD_INT T2.W, T0.W, literal.z,
-; R600-NEXT:     SETGT_UINT * T3.W, T0.W, literal.w,
-; R600-NEXT:    -254(nan), 204(2.858649e-43)
+; R600-NEXT:     ADD_INT T0.Y, PV.W, literal.y,
+; R600-NEXT:     ADD_INT T0.Z, T1.W, literal.z,
+; R600-NEXT:     SETGT_UINT * T0.W, T1.W, literal.w,
+; R600-NEXT:    209715200(1.972152e-31), 204(2.858649e-43)
 ; R600-NEXT:    102(1.429324e-43), -229(nan)
-; R600-NEXT:     ADD_INT * T6.X, T0.W, literal.x,
+; R600-NEXT:     AND_INT * T3.W, KC0[3].W, literal.x,
+; R600-NEXT:    -4096(nan), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T2.X, T0.X, literal.x,
+; R600-NEXT:     AND_INT T1.Y, KC0[3].Z, literal.y,
+; R600-NEXT:     ADD T1.Z, KC0[3].W, -PV.W,
+; R600-NEXT:     CNDE_INT T4.W, T0.W, T0.Y, T0.Z,
+; R600-NEXT:     SETGT_INT * T5.W, T1.W, literal.z,
+; R600-NEXT:    2130706432(1.701412e+38), -4096(nan)
 ; R600-NEXT:    -127(nan), 0(0.000000e+00)
-; R600-NEXT:    ALU clause starting at 106:
-; R600-NEXT:     SETGT_UINT T0.Y, T0.W, literal.x,
-; R600-NEXT:     CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221
-; R600-NEXT:     SETGT_INT T2.W, T0.W, literal.y,
-; R600-NEXT:     EXP_IEEE * T1.Z, T3.Y,
-; R600-NEXT:    254(3.559298e-43), -127(nan)
-; R600-NEXT:     ADD_INT T7.X, T1.Y, literal.x,
-; R600-NEXT:     MUL_IEEE T3.Y, PS, literal.y,
-; R600-NEXT:     CNDE_INT T0.Z, PV.W, PV.Z, T0.W,
-; R600-NEXT:     CNDE_INT T4.W, PV.Y, T6.X, T5.X,
-; R600-NEXT:     SETGT_INT * T0.W, T0.W, literal.z,
-; R600-NEXT:    -127(nan), 209715200(1.972152e-31)
-; R600-NEXT:    127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT:     SETGT_UINT T5.X, T1.Y, literal.x,
-; R600-NEXT:     CNDE_INT T4.Y, PS, PV.Z, PV.W,
-; R600-NEXT:     MAX_INT T0.Z, T1.Y, literal.y,
-; R600-NEXT:     MUL_IEEE T4.W, PV.Y, literal.z,
-; R600-NEXT:     MUL_IEEE * T5.W, T1.Z, literal.w,
-; R600-NEXT:    254(3.559298e-43), -330(nan)
-; R600-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
-; R600-NEXT:     MUL_IEEE T6.X, PS, literal.x,
-; R600-NEXT:     CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122
-; R600-NEXT:     ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT:     ADD_INT T3.W, T1.Y, literal.z,
-; R600-NEXT:     SETGT_UINT * T4.W, T1.Y, literal.w,
-; R600-NEXT:    2130706432(1.701412e+38), 204(2.858649e-43)
-; R600-NEXT:    102(1.429324e-43), -229(nan)
-; R600-NEXT:     CNDE_INT T8.X, PS, PV.Z, PV.W,
-; R600-NEXT:     SETGT_INT T5.Y, T1.Y, literal.x,
-; R600-NEXT:     CNDE_INT T0.Z, T2.W, PV.Y, T1.Z,
-; R600-NEXT:     CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT:     LSHL * T3.W, T4.Y, literal.y,
-; R600-NEXT:    -127(nan), 23(3.222986e-44)
-; R600-NEXT:     ADD_INT T6.X, PS, literal.x,
-; R600-NEXT:     CNDE_INT T0.Y, T0.W, PV.Z, PV.W,
-; R600-NEXT:     CNDE_INT T0.Z, PV.Y, PV.X, T1.Y,
-; R600-NEXT:     CNDE_INT T0.W, T5.X, T7.X, T4.X,
-; R600-NEXT:     SETGT_INT * T2.W, T1.Y, literal.y,
-; R600-NEXT:    1065353216(1.000000e+00), 127(1.779649e-43)
-; R600-NEXT:     CNDE_INT T4.X, PS, PV.Z, PV.W,
-; R600-NEXT:     MUL_IEEE T0.Y, PV.Y, PV.X,
-; R600-NEXT:     SETGT T0.Z, literal.x, KC0[3].Z,
+; R600-NEXT:     CNDE_INT T3.X, PS, PV.W, T1.W,
+; R600-NEXT:     MUL_IEEE T0.Y, PV.Z, literal.x,
+; R600-NEXT:     MUL_IEEE T0.Z, T3.W, literal.y,
+; R600-NEXT:     MIN_INT T4.W, T1.W, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT:     ADD * T6.W, KC0[3].Z, -PV.Y,
+; R600-NEXT:    975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT:    381(5.338947e-43), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T4.X, PS, literal.x,
+; R600-NEXT:     MUL_IEEE T2.Y, T1.Y, literal.y,
+; R600-NEXT:     ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT:     ADD_INT * T4.W, T1.W, literal.w,
+; R600-NEXT:    975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT:    -254(nan), -127(nan)
+; R600-NEXT:     SETGT_UINT * T7.W, T1.W, literal.x,
+; R600-NEXT:    254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT:     CNDE_INT T5.X, PV.W, T4.W, T2.Z,
+; R600-NEXT:     RNDNE T3.Y, T2.Y,
+; R600-NEXT:     MULADD_IEEE T2.Z, T6.W, literal.x, T4.X,
+; R600-NEXT:     RNDNE T4.W, T0.Z,
+; R600-NEXT:     MULADD_IEEE * T6.W, T1.Z, literal.x, T0.Y, BS:VEC_021/SCL_122
+; R600-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
+; R600-NEXT:     SETGT_INT T4.X, T1.W, literal.x,
+; R600-NEXT:     MULADD_IEEE T0.Y, T3.W, literal.y, PS, BS:VEC_120/SCL_212
+; R600-NEXT:     ADD T0.Z, T0.Z, -PV.W,
+; R600-NEXT:     MULADD_IEEE T1.W, T1.Y, literal.y, PV.Z,
+; R600-NEXT:     ADD * T3.W, T2.Y, -PV.Y,
+; R600-NEXT:    127(1.779649e-43), 975668412(6.390323e-04)
+; R600-NEXT:     ADD T6.X, PS, PV.W,
+; R600-NEXT:     ADD T0.Y, PV.Z, PV.Y,
+; R600-NEXT:     CNDE_INT T0.Z, PV.X, T3.X, T5.X,
+; R600-NEXT:     MUL_IEEE * T1.W, T2.X, literal.x,
+; R600-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT:     CNDE_INT * T0.W, T0.W, T1.X, T2.W,
+; R600-NEXT:     CNDE_INT T0.X, T5.W, PV.W, T0.X,
+; R600-NEXT:     CNDE_INT T1.Y, T7.W, T2.X, T1.W, BS:VEC_102/SCL_221
+; R600-NEXT:     LSHL * T0.Z, T0.Z, literal.x,
+; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; R600-NEXT:     TRUNC T0.W, T4.W,
+; R600-NEXT:     EXP_IEEE * T0.Y, T0.Y,
+; R600-NEXT:     FLT_TO_INT T1.X, PV.W,
+; R600-NEXT:     TRUNC T2.Y, T3.Y, BS:VEC_120/SCL_212
+; R600-NEXT:     MUL_IEEE T1.Z, PS, literal.x,
+; R600-NEXT:     ADD_INT T0.W, T0.Z, literal.y,
+; R600-NEXT:     CNDE_INT * T1.W, T4.X, T0.X, T1.Y,
+; R600-NEXT:    209715200(1.972152e-31), 1065353216(1.000000e+00)
+; R600-NEXT:     MUL_IEEE T0.X, PS, PV.W,
+; R600-NEXT:     MUL_IEEE T1.Y, PV.Z, literal.x,
+; R600-NEXT:     FLT_TO_INT T0.Z, PV.Y,
+; R600-NEXT:     MAX_INT T0.W, PV.X, literal.y,
+; R600-NEXT:     EXP_IEEE * T1.W, T6.X,
+; R600-NEXT:    209715200(1.972152e-31), -330(nan)
+; R600-NEXT:     MUL_IEEE T2.X, T0.Y, literal.x,
+; R600-NEXT:     MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT:     ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT:     ADD_INT * T0.W, T1.X, literal.w,
+; R600-NEXT:    2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT:    204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT:     MAX_INT * T2.W, T0.Z, literal.x,
+; R600-NEXT:    -330(nan), 0(0.000000e+00)
+; R600-NEXT:     SETGT_UINT T3.X, T1.X, literal.x,
+; R600-NEXT:     ADD_INT T3.Y, PV.W, literal.y,
+; R600-NEXT:     ADD_INT T3.Z, T0.Z, literal.z,
+; R600-NEXT:     SETGT_UINT * T2.W, T0.Z, literal.x,
+; R600-NEXT:    -229(nan), 204(2.858649e-43)
+; R600-NEXT:    102(1.429324e-43), 0(0.000000e+00)
+; R600-NEXT:    ALU clause starting at 105:
+; R600-NEXT:     MIN_INT * T3.W, T0.Z, literal.x,
+; R600-NEXT:    381(5.338947e-43), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT T4.X, PV.W, literal.x,
+; R600-NEXT:     ADD_INT T4.Y, T0.Z, literal.y,
+; R600-NEXT:     SETGT_UINT T4.Z, T0.Z, literal.z,
+; R600-NEXT:     CNDE_INT T3.W, T2.W, T3.Y, T3.Z, BS:VEC_021/SCL_122
+; R600-NEXT:     SETGT_INT * T4.W, T0.Z, literal.y,
+; R600-NEXT:    -254(nan), -127(nan)
+; R600-NEXT:    254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT:     CNDE_INT T5.X, PS, PV.W, T0.Z,
+; R600-NEXT:     CNDE_INT T3.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT:     SETGT_INT T0.Z, T0.Z, literal.x,
+; R600-NEXT:     CNDE_INT T0.W, T3.X, T2.Z, T0.W,
+; R600-NEXT:     SETGT_INT * T3.W, T1.X, literal.y,
+; R600-NEXT:    127(1.779649e-43), -127(nan)
+; R600-NEXT:     CNDE_INT T4.X, PS, PV.W, T1.X,
+; R600-NEXT:     CNDE_INT T3.Y, PV.Z, PV.X, PV.Y,
+; R600-NEXT:     MIN_INT T2.Z, T1.X, literal.x,
 ; R600-NEXT:     MUL_IEEE T0.W, T2.Y, literal.y,
-; R600-NEXT:     CNDE_INT * T1.W, T4.W, T3.X, T1.W,
-; R600-NEXT:    -1036817932(-4.485347e+01), 2130706432(1.701412e+38)
-; R600-NEXT:     CNDE_INT T1.X, T5.Y, PS, T1.X,
-; R600-NEXT:     CNDE_INT T1.Y, T5.X, T2.Y, PV.W,
-; R600-NEXT:     CNDE T0.Z, PV.Z, PV.Y, 0.0,
-; R600-NEXT:     SETGT T0.W, KC0[3].Z, literal.x,
-; R600-NEXT:     LSHL * T1.W, PV.X, literal.y,
-; R600-NEXT:    1109008539(3.853184e+01), 23(3.222986e-44)
-; R600-NEXT:     ADD_INT T3.X, PS, literal.x,
-; R600-NEXT:     CNDE T0.Y, PV.W, PV.Z, literal.y,
-; R600-NEXT:     CNDE_INT T0.Z, T2.W, PV.X, PV.Y,
-; R600-NEXT:     CNDE T0.W, T2.X, T0.X, 0.0,
+; R600-NEXT:     MUL_IEEE * T5.W, T1.W, literal.z,
+; R600-NEXT:    381(5.338947e-43), 209715200(1.972152e-31)
+; R600-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T5.X, PS, literal.x,
+; R600-NEXT:     CNDE_INT T2.Y, T2.W, PV.W, T2.Y,
+; R600-NEXT:     ADD_INT T2.Z, PV.Z, literal.y,
+; R600-NEXT:     ADD_INT T0.W, T1.X, literal.z,
+; R600-NEXT:     SETGT_UINT * T2.W, T1.X, literal.w,
+; R600-NEXT:    2130706432(1.701412e+38), -254(nan)
+; R600-NEXT:    -127(nan), 254(3.559298e-43)
+; R600-NEXT:     CNDE_INT T6.X, PS, PV.W, PV.Z,
+; R600-NEXT:     SETGT_INT T4.Y, T1.X, literal.x,
+; R600-NEXT:     CNDE_INT T2.Z, T4.W, PV.Y, T1.W,
+; R600-NEXT:     CNDE_INT T0.W, T4.Z, T5.W, PV.X,
+; R600-NEXT:     LSHL * T1.W, T3.Y, literal.y,
+; R600-NEXT:    127(1.779649e-43), 23(3.222986e-44)
+; R600-NEXT:     ADD_INT T1.X, PS, literal.x,
+; R600-NEXT:     CNDE_INT T2.Y, T0.Z, PV.Z, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT:     CNDE_INT T0.Z, PV.Y, T4.X, PV.X,
+; R600-NEXT:     MUL_IEEE T0.W, T2.X, literal.y,
+; R600-NEXT:     CNDE_INT * T1.W, T3.X, T1.Y, T1.Z,
+; R600-NEXT:    1065353216(1.000000e+00), 2130706432(1.701412e+38)
+; R600-NEXT:     CNDE_INT T3.X, T3.W, PS, T0.Y,
+; R600-NEXT:     CNDE_INT T0.Y, T2.W, T2.X, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT:     LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT:     MUL_IEEE T0.W, PV.Y, PV.X,
+; R600-NEXT:     SETGT * T1.W, literal.y, KC0[3].Z,
+; R600-NEXT:    23(3.222986e-44), -1036817932(-4.485347e+01)
+; R600-NEXT:     SETGT T1.X, literal.x, KC0[3].Y,
+; R600-NEXT:     CNDE T1.Y, PS, PV.W, 0.0,
+; R600-NEXT:     SETGT T1.Z, KC0[3].Z, literal.y,
+; R600-NEXT:     ADD_INT T0.W, PV.Z, literal.z,
+; R600-NEXT:     CNDE_INT * T1.W, T4.Y, PV.X, PV.Y,
+; R600-NEXT:    -1036817932(-4.485347e+01), 1109008539(3.853184e+01)
+; R600-NEXT:    1065353216(1.000000e+00), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T2.X, PS, PV.W,
+; R600-NEXT:     CNDE T0.Y, PV.Z, PV.Y, literal.x,
+; R600-NEXT:     SETGT T0.Z, literal.y, KC0[3].W,
+; R600-NEXT:     CNDE T0.W, PV.X, T0.X, 0.0,
 ; R600-NEXT:     SETGT * T1.W, KC0[3].Y, literal.z,
-; R600-NEXT:    1065353216(1.000000e+00), 2139095040(INF)
+; R600-NEXT:    2139095040(INF), -1036817932(-4.485347e+01)
 ; R600-NEXT:    1109008539(3.853184e+01), 0(0.000000e+00)
 ; R600-NEXT:     CNDE T0.X, PS, PV.W, literal.x,
-; R600-NEXT:     MUL_IEEE T0.W, PV.Z, PV.X,
-; R600-NEXT:     SETGT * T1.W, literal.y, KC0[3].W,
-; R600-NEXT:    2139095040(INF), -1036817932(-4.485347e+01)
-; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
-; R600-NEXT:     CNDE T0.W, PS, PV.W, 0.0,
+; R600-NEXT:     CNDE T0.W, PV.Z, PV.X, 0.0,
 ; R600-NEXT:     SETGT * T1.W, KC0[3].W, literal.y,
-; R600-NEXT:    2(2.802597e-45), 1109008539(3.853184e+01)
-; R600-NEXT:     CNDE T2.X, PS, PV.W, literal.x,
-; R600-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; R600-NEXT:    2139095040(INF), 8(1.121039e-44)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:    2139095040(INF), 1109008539(3.853184e+01)
+; R600-NEXT:     CNDE T1.X, PS, PV.W, literal.x,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.y,
+; R600-NEXT:    2139095040(INF), 2(2.802597e-45)
+; R600-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: s_exp10_v3f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 101, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    ALU 77, @108, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X
+; CM-NEXT:    ALU 98, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 77, @105, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    ALU clause starting at 6:
-; CM-NEXT:     AND_INT * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT:     AND_INT * T0.W, KC0[3].W, literal.x,
 ; CM-NEXT:    -4096(nan), 0(0.000000e+00)
-; CM-NEXT:     ADD * T1.W, KC0[3].Y, -PV.W,
-; CM-NEXT:     MUL_IEEE T0.Z, PV.W, literal.x,
-; CM-NEXT:     MUL_IEEE * T2.W, T0.W, literal.y,
-; CM-NEXT:    975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT:     RNDNE T1.Z, PV.W,
-; CM-NEXT:     MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
+; CM-NEXT:     MUL_IEEE * T1.W, PV.W, literal.x,
 ; CM-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT:     MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
-; CM-NEXT:     ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT:    975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT:     TRUNC T1.Z, T1.Z,
-; CM-NEXT:     ADD * T0.W, PV.W, PV.Z,
-; CM-NEXT:     EXP_IEEE T0.X, T0.W,
-; CM-NEXT:     EXP_IEEE T0.Y (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT:     FLT_TO_INT T0.Z, T1.Z,
-; CM-NEXT:     MUL_IEEE * T0.W, PV.X, literal.x,
-; CM-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT:     RNDNE T0.Z, PV.W,
+; CM-NEXT:     ADD * T2.W, KC0[3].W, -T0.W,
 ; CM-NEXT:     MUL_IEEE T0.Y, PV.W, literal.x,
-; CM-NEXT:     MAX_INT T1.Z, PV.Z, literal.y,
-; CM-NEXT:     MIN_INT * T1.W, PV.Z, literal.z,
-; CM-NEXT:    209715200(1.972152e-31), -330(nan)
+; CM-NEXT:     AND_INT T1.Z, KC0[3].Y, literal.y,
+; CM-NEXT:     TRUNC * T3.W, PV.Z,
+; CM-NEXT:    975668412(6.390323e-04), -4096(nan)
+; CM-NEXT:     FLT_TO_INT T1.Y, PV.W,
+; CM-NEXT:     ADD T2.Z, KC0[3].Y, -PV.Z,
+; CM-NEXT:     MULADD_IEEE * T2.W, T2.W, literal.x, PV.Y,
+; CM-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT:     MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
+; CM-NEXT:     MUL_IEEE T0.Y, PV.Z, literal.x,
+; CM-NEXT:     MAX_INT T3.Z, PV.Y, literal.y,
+; CM-NEXT:     MIN_INT * T0.W, PV.Y, literal.z,
+; CM-NEXT:    975668412(6.390323e-04), -330(nan)
 ; CM-NEXT:    381(5.338947e-43), 0(0.000000e+00)
 ; CM-NEXT:     ADD_INT T1.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T1.Y, PV.Z, literal.y,
-; CM-NEXT:     ADD_INT T1.Z, T0.Z, literal.z,
-; CM-NEXT:     SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT:     ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT:     ADD_INT T3.Z, T1.Y, literal.z,
+; CM-NEXT:     SETGT_UINT * T0.W, T1.Y, literal.w,
 ; CM-NEXT:    -254(nan), 204(2.858649e-43)
 ; CM-NEXT:    102(1.429324e-43), -229(nan)
-; CM-NEXT:     ADD_INT T2.X, T0.Z, literal.x,
-; CM-NEXT:     SETGT_UINT T2.Y, T0.Z, literal.y,
-; CM-NEXT:     CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; CM-NEXT:     SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT:     ADD_INT T2.X, T1.Y, literal.x,
+; CM-NEXT:     SETGT_UINT T3.Y, T1.Y, literal.y,
+; CM-NEXT:     CNDE_INT T3.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT:     SETGT_INT * T2.W, T1.Y, literal.x,
 ; CM-NEXT:    -127(nan), 254(3.559298e-43)
-; CM-NEXT:     MUL_IEEE T3.X, T0.X, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
-; CM-NEXT:     CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
-; CM-NEXT:     SETGT_INT * T3.W, T0.Z, literal.y,
-; CM-NEXT:    2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT:     MUL_IEEE T3.X, T1.Z, literal.x,
+; CM-NEXT:     CNDE_INT T2.Y, PV.W, PV.Z, T1.Y,
+; CM-NEXT:     CNDE_INT T3.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT:     SETGT_INT * T3.W, T1.Y, literal.y,
+; CM-NEXT:    1079283712(3.321289e+00), 127(1.779649e-43)
+; CM-NEXT:     ADD T1.X, T1.W, -T0.Z,
 ; CM-NEXT:     CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT:     MUL_IEEE T0.Z, PV.X, literal.x,
-; CM-NEXT:     CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
-; CM-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T0.X, T2.W, PV.W, T0.X,
-; CM-NEXT:     CNDE_INT T0.Y, T2.Y, T3.X, PV.Z,
-; CM-NEXT:     LSHL T0.Z, PV.Y, literal.x,
-; CM-NEXT:     AND_INT * T0.W, KC0[3].Z, literal.y,
-; CM-NEXT:    23(3.222986e-44), -4096(nan)
-; CM-NEXT:     ADD T1.Y, KC0[3].Z, -PV.W,
-; CM-NEXT:     ADD_INT T0.Z, PV.Z, literal.x,
-; CM-NEXT:     CNDE_INT * T1.W, T3.W, PV.X, PV.Y,
-; CM-NEXT:    1065353216(1.000000e+00), 0(0.000000e+00)
-; CM-NEXT:     MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT:     MUL_IEEE T0.Y, PV.Y, literal.x,
-; CM-NEXT:     MUL_IEEE T0.Z, T0.W, literal.y,
-; CM-NEXT:     AND_INT * T1.W, KC0[3].W, literal.z,
-; CM-NEXT:    975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT:    -4096(nan), 0(0.000000e+00)
-; CM-NEXT:     SETGT T1.X, literal.x, KC0[3].Y,
-; CM-NEXT:     ADD T2.Y, KC0[3].W, -PV.W,
-; CM-NEXT:     RNDNE T1.Z, PV.Z,
-; CM-NEXT:     MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y,
-; CM-NEXT:    -1036817932(-4.485347e+01), 1079283712(3.321289e+00)
-; CM-NEXT:     MULADD_IEEE T2.X, T0.W, literal.x, PV.W,
-; CM-NEXT:     ADD T0.Y, T0.Z, -PV.Z,
-; CM-NEXT:     MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT:     MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT:     TRUNC T3.X, T1.Z,
-; CM-NEXT:     RNDNE T1.Y, PV.W,
-; CM-NEXT:     MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z,
-; CM-NEXT:     ADD * T2.W, PV.Y, PV.X,
+; CM-NEXT:     RNDNE T0.Z, PV.X,
+; CM-NEXT:     MULADD_IEEE * T1.W, T2.Z, literal.x, T0.Y,
 ; CM-NEXT:    1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT:     EXP_IEEE T0.X (MASKED), T2.W,
-; CM-NEXT:     EXP_IEEE T0.Y, T2.W,
-; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T2.W,
-; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T2.W,
-; CM-NEXT:     MULADD_IEEE T2.X, T1.W, literal.x, T0.Z,
-; CM-NEXT:     ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212
-; CM-NEXT:     FLT_TO_INT T0.Z, T3.X,
-; CM-NEXT:     MUL_IEEE * T0.W, PV.Y, literal.y,
-; CM-NEXT:    975668412(6.390323e-04), 209715200(1.972152e-31)
-; CM-NEXT:     MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT:     SETGT_UINT T3.Y, PV.Z, literal.y,
-; CM-NEXT:     TRUNC T1.Z, T1.Y,
-; CM-NEXT:     ADD * T1.W, PV.Y, PV.X,
-; CM-NEXT:    209715200(1.972152e-31), -229(nan)
-; CM-NEXT:     EXP_IEEE T1.X (MASKED), T1.W,
-; CM-NEXT:     EXP_IEEE T1.Y, T1.W,
-; CM-NEXT:     EXP_IEEE T1.Z (MASKED), T1.W,
-; CM-NEXT:     EXP_IEEE * T1.W (MASKED), T1.W,
-; CM-NEXT:     FLT_TO_INT T2.X, T1.Z,
-; CM-NEXT:     MUL_IEEE T2.Y, PV.Y, literal.x,
-; CM-NEXT:     CNDE_INT T1.Z, T3.Y, T3.X, T0.W,
-; CM-NEXT:     SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    209715200(1.972152e-31), -127(nan)
-; CM-NEXT:     CNDE_INT T3.X, PV.W, PV.Z, T0.Y,
-; CM-NEXT:     MUL_IEEE * T4.Y, PV.Y, literal.x,
-; CM-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 108:
-; CM-NEXT:     SETGT_UINT T1.Z, T2.X, literal.x,
-; CM-NEXT:     MAX_INT * T1.W, T0.Z, literal.y,
-; CM-NEXT:    -229(nan), -330(nan)
-; CM-NEXT:     ADD_INT T4.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T5.Y, T0.Z, literal.y,
-; CM-NEXT:     CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y,
-; CM-NEXT:     SETGT_INT * T1.W, T2.X, literal.z,
-; CM-NEXT:    204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT:     MULADD_IEEE T2.X, T1.Z, literal.x, PV.W,
+; CM-NEXT:     ADD T0.Y, T3.X, -PV.Z,
+; CM-NEXT:     LSHL T1.Z, PV.Y, literal.y,
+; CM-NEXT:     ADD * T1.W, PV.X, T0.X,
+; CM-NEXT:    975668412(6.390323e-04), 23(3.222986e-44)
+; CM-NEXT:     EXP_IEEE T0.X, T1.W,
+; CM-NEXT:     EXP_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     ADD_INT T1.Y, T1.Z, literal.x,
+; CM-NEXT:     MUL_IEEE T1.Z, PV.X, literal.y,
+; CM-NEXT:     ADD * T1.W, T0.Y, T2.X,
+; CM-NEXT:    1065353216(1.000000e+00), 209715200(1.972152e-31)
+; CM-NEXT:     EXP_IEEE T0.X (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE T0.Y, T1.W,
+; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     TRUNC T1.X, T0.Z,
+; CM-NEXT:     MUL_IEEE T2.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     MUL_IEEE T0.Z, T0.X, literal.y,
+; CM-NEXT:     MUL_IEEE * T1.W, PV.Y, literal.x,
+; CM-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
+; CM-NEXT:     MUL_IEEE T2.X, PV.W, literal.x,
+; CM-NEXT:     MUL_IEEE T4.Y, PV.Z, literal.y,
+; CM-NEXT:     CNDE_INT T1.Z, T0.W, PV.Y, T1.Z,
+; CM-NEXT:     FLT_TO_INT * T0.W, PV.X,
+; CM-NEXT:    209715200(1.972152e-31), 2130706432(1.701412e+38)
+; CM-NEXT:     SETGT_UINT T1.X, PV.W, literal.x,
+; CM-NEXT:     AND_INT T2.Y, KC0[3].Z, literal.y,
+; CM-NEXT:     CNDE_INT T1.Z, T2.W, PV.Z, T0.X,
+; CM-NEXT:     CNDE_INT * T2.W, T3.Y, T0.Z, PV.Y,
+; CM-NEXT:    -229(nan), -4096(nan)
+; CM-NEXT:     CNDE_INT T0.X, T3.W, PV.Z, PV.W,
+; CM-NEXT:     ADD T3.Y, KC0[3].Z, -PV.Y,
+; CM-NEXT:     CNDE_INT T0.Z, PV.X, T2.X, T1.W,
+; CM-NEXT:     SETGT_INT * T1.W, T0.W, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    -127(nan), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T5.X, PV.W, PV.Z, T1.Y,
+; CM-NEXT:     CNDE_INT T2.X, PV.W, PV.Z, T0.Y,
 ; CM-NEXT:     MUL_IEEE T0.Y, T0.Y, literal.x,
-; CM-NEXT:     MAX_INT T2.Z, T2.X, literal.y,
-; CM-NEXT:     CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212
-; CM-NEXT:    2130706432(1.701412e+38), -330(nan)
-; CM-NEXT:     CNDE_INT T4.X, T0.W, PV.W, T0.Z,
-; CM-NEXT:     ADD_INT T2.Y, PV.Z, literal.x,
-; CM-NEXT:     ADD_INT T2.Z, T2.X, literal.y,
-; CM-NEXT:     MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT:     MUL_IEEE T0.Z, T2.Y, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     MUL_IEEE * T2.W, PV.Y, literal.z,
+; CM-NEXT:    2130706432(1.701412e+38), 1079283712(3.321289e+00)
+; CM-NEXT:    975668412(6.390323e-04), 0(0.000000e+00)
+; CM-NEXT:     MULADD_IEEE T3.X, T3.Y, literal.x, PV.W,
+; CM-NEXT:     RNDNE T3.Y, PV.Z,
+; CM-NEXT:     MUL_IEEE T1.Z, PV.Y, literal.y,
+; CM-NEXT:     SETGT_UINT * T2.W, T0.W, literal.z,
+; CM-NEXT:    1079283712(3.321289e+00), 2130706432(1.701412e+38)
+; CM-NEXT:    254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT:     CNDE_INT T4.X, PV.W, T0.Y, PV.Z,
+; CM-NEXT:     TRUNC T0.Y, PV.Y,
+; CM-NEXT:     MULADD_IEEE T1.Z, T2.Y, literal.x, PV.X,
+; CM-NEXT:     MAX_INT * T3.W, T0.W, literal.y,
+; CM-NEXT:    975668412(6.390323e-04), -330(nan)
+; CM-NEXT:     ADD T3.X, T0.Z, -T3.Y,
+; CM-NEXT:     ADD_INT T2.Y, PV.W, literal.x,
+; CM-NEXT:     ADD_INT * T0.Z, T0.W, literal.y,
 ; CM-NEXT:    204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT:    ALU clause starting at 105:
+; CM-NEXT:     MIN_INT * T3.W, T0.W, literal.x,
 ; CM-NEXT:    381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT:     ADD_INT T6.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T3.Y, T2.X, literal.y,
-; CM-NEXT:     SETGT_UINT T3.Z, T2.X, literal.z,
-; CM-NEXT:     CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT:     ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T3.Y, T0.W, literal.y,
+; CM-NEXT:     CNDE_INT T0.Z, T1.X, T2.Y, T0.Z,
+; CM-NEXT:     ADD * T3.W, T3.X, T1.Z, BS:VEC_102/SCL_221
 ; CM-NEXT:    -254(nan), -127(nan)
-; CM-NEXT:    254(3.559298e-43), 0(0.000000e+00)
-; CM-NEXT:     MUL_IEEE T7.X, T1.Y, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, T1.W, PV.W, T2.X,
-; CM-NEXT:     CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
-; CM-NEXT:     MIN_INT * T0.W, T0.Z, literal.y,
-; CM-NEXT:    2130706432(1.701412e+38), 381(5.338947e-43)
-; CM-NEXT:     SETGT_INT T2.X, T2.X, literal.x,
-; CM-NEXT:     ADD_INT T2.Y, PV.W, literal.y,
-; CM-NEXT:     ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT:     EXP_IEEE T1.X, T3.W,
+; CM-NEXT:     EXP_IEEE T1.Y (MASKED), T3.W,
+; CM-NEXT:     EXP_IEEE T1.Z (MASKED), T3.W,
+; CM-NEXT:     EXP_IEEE * T1.W (MASKED), T3.W,
+; CM-NEXT:     CNDE_INT T3.X, T1.W, T0.Z, T0.W,
+; CM-NEXT:     CNDE_INT T2.Y, T2.W, T3.Y, T5.X, BS:VEC_120/SCL_212
+; CM-NEXT:     FLT_TO_INT T0.Z, T0.Y,
+; CM-NEXT:     MUL_IEEE * T1.W, PV.X, literal.x,
+; CM-NEXT:    209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT:     SETGT_INT T5.X, T0.W, literal.x,
+; CM-NEXT:     MUL_IEEE T0.Y, PV.W, literal.y,
+; CM-NEXT:     MAX_INT T1.Z, PV.Z, literal.z,
+; CM-NEXT:     MIN_INT * T0.W, PV.Z, literal.w,
+; CM-NEXT:    127(1.779649e-43), 209715200(1.972152e-31)
+; CM-NEXT:    -330(nan), 381(5.338947e-43)
+; CM-NEXT:     ADD_INT T6.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T3.Y, PV.Z, literal.y,
+; CM-NEXT:     ADD_INT T1.Z, T0.Z, literal.z,
 ; CM-NEXT:     SETGT_UINT * T0.W, T0.Z, literal.w,
-; CM-NEXT:    127(1.779649e-43), -254(nan)
+; CM-NEXT:    -254(nan), 204(2.858649e-43)
+; CM-NEXT:    102(1.429324e-43), -229(nan)
+; CM-NEXT:     ADD_INT T7.X, T0.Z, literal.x,
+; CM-NEXT:     SETGT_UINT T4.Y, T0.Z, literal.y,
+; CM-NEXT:     CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT:     SETGT_INT * T2.W, T0.Z, literal.x,
 ; CM-NEXT:    -127(nan), 254(3.559298e-43)
-; CM-NEXT:     CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
-; CM-NEXT:     SETGT_INT T2.Y, T0.Z, literal.x,
-; CM-NEXT:     CNDE_INT T0.Z, PV.X, T1.Y, T1.Z,
-; CM-NEXT:     MUL_IEEE * T1.W, T7.X, literal.y,
-; CM-NEXT:    127(1.779649e-43), 2130706432(1.701412e+38)
-; CM-NEXT:     CNDE_INT T7.X, T3.Z, T7.X, PV.W,
-; CM-NEXT:     LSHL T1.Y, PV.Z, literal.x,
-; CM-NEXT:     CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122
-; CM-NEXT:     MUL_IEEE * T1.W, T0.Y, literal.y,
-; CM-NEXT:    23(3.222986e-44), 2130706432(1.701412e+38)
-; CM-NEXT:     CNDE_INT T4.X, T0.W, T0.Y, PV.W,
-; CM-NEXT:     LSHL T0.Y, PV.Z, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE_INT * T0.W, T2.X, T5.X, PV.X,
-; CM-NEXT:    23(3.222986e-44), 1065353216(1.000000e+00)
+; CM-NEXT:     MUL_IEEE T8.X, T1.X, literal.x,
+; CM-NEXT:     CNDE_INT T3.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT:     CNDE_INT T1.Z, PV.Y, PV.X, T6.X,
+; CM-NEXT:     SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT:    2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT:     CNDE_INT T6.X, PV.W, PV.Y, PV.Z,
+; CM-NEXT:     MUL_IEEE T3.Y, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T0.Z, T0.W, T0.Y, T1.W,
+; CM-NEXT:     CNDE_INT * T0.W, T5.X, T3.X, T2.Y,
+; CM-NEXT:    2130706432(1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     LSHL T3.X, PV.W, literal.x,
+; CM-NEXT:     CNDE_INT T0.Y, T2.W, PV.Z, T1.X,
+; CM-NEXT:     CNDE_INT T0.Z, T4.Y, T8.X, PV.Y,
+; CM-NEXT:     LSHL * T0.W, PV.X, literal.x,
+; CM-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT:     CNDE_INT T0.Y, T3.W, PV.Y, PV.Z,
+; CM-NEXT:     ADD_INT T0.Z, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT * T0.W, T5.X, T2.X, T4.X,
+; CM-NEXT:    1065353216(1.000000e+00), 0(0.000000e+00)
 ; CM-NEXT:     MUL_IEEE T2.X, PV.W, PV.Z,
-; CM-NEXT:     SETGT T1.Y, literal.x, KC0[3].W,
-; CM-NEXT:     ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE_INT * T0.W, T2.Y, T3.X, PV.X,
-; CM-NEXT:    -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
-; CM-NEXT:     MUL_IEEE T3.X, PV.W, PV.Z,
-; CM-NEXT:     SETGT T0.Y, literal.x, KC0[3].Z,
-; CM-NEXT:     CNDE T0.Z, PV.Y, PV.X, 0.0,
-; CM-NEXT:     SETGT * T0.W, KC0[3].W, literal.y,
-; CM-NEXT:    -1036817932(-4.485347e+01), 1109008539(3.853184e+01)
-; CM-NEXT:     CNDE T2.X, PV.W, PV.Z, literal.x,
-; CM-NEXT:     CNDE T0.Y, PV.Y, PV.X, 0.0,
-; CM-NEXT:     SETGT T0.Z, KC0[3].Z, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2139095040(INF), 1109008539(3.853184e+01)
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T3.X, PV.W, literal.x,
-; CM-NEXT:     CNDE T0.Y, PV.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE T0.Z, T1.X, T0.X, 0.0,
+; CM-NEXT:     MUL_IEEE T0.Y, PV.Y, PV.X,
+; CM-NEXT:     MUL_IEEE T0.Z, T0.X, T1.Y,
+; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].W,
+; CM-NEXT:    -1036817932(-4.485347e+01), 0(0.000000e+00)
+; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT:     SETGT T1.Y, literal.y, KC0[3].Z,
+; CM-NEXT:     CNDE T0.Z, PV.W, PV.Z, 0.0,
+; CM-NEXT:     SETGT * T0.W, KC0[3].W, literal.z,
+; CM-NEXT:    2(2.802597e-45), -1036817932(-4.485347e+01)
+; CM-NEXT:    1109008539(3.853184e+01), 0(0.000000e+00)
+; CM-NEXT:     CNDE T1.X, PV.W, PV.Z, literal.x,
+; CM-NEXT:     SETGT T2.Y, literal.y, KC0[3].Y,
+; CM-NEXT:     CNDE T0.Z, PV.Y, T0.Y, 0.0,
+; CM-NEXT:     SETGT * T0.W, KC0[3].Z, literal.z,
+; CM-NEXT:    2139095040(INF), -1036817932(-4.485347e+01)
+; CM-NEXT:    1109008539(3.853184e+01), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; CM-NEXT:     CNDE T4.Y, PV.W, PV.Z, literal.y,
+; CM-NEXT:     CNDE T0.Z, PV.Y, T2.X, 0.0,
 ; CM-NEXT:     SETGT * T0.W, KC0[3].Y, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 2139095040(INF)
 ; CM-NEXT:    1109008539(3.853184e+01), 0(0.000000e+00)
-; CM-NEXT:     CNDE * T0.X, PV.W, PV.Z, literal.x,
+; CM-NEXT:     CNDE * T4.X, PV.W, PV.Z, literal.x,
 ; CM-NEXT:    2139095040(INF), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %result = call <3 x float> @llvm.exp10.v3f32(<3 x float> %in)
   store <3 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 7572dbe9f3b60..53b862e8bab2f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -576,9 +576,9 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; R600-LABEL: s_exp2_v3f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 29, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT:    ALU 27, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     SETGT T0.W, literal.x, KC0[3].Z,
@@ -586,75 +586,73 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; R600-NEXT:    -1023672320(-1.260000e+02), 0(0.000000e+00)
 ; R600-NEXT:     CNDE * T2.W, PV.W, 0.0, literal.x,
 ; R600-NEXT:    1115684864(6.400000e+01), 0(0.000000e+00)
-; R600-NEXT:     ADD T2.W, KC0[3].Z, PV.W,
-; R600-NEXT:     CNDE * T3.W, T1.W, 0.0, literal.x,
-; R600-NEXT:    1115684864(6.400000e+01), 0(0.000000e+00)
+; R600-NEXT:     ADD T0.Z, KC0[3].Z, PV.W,
+; R600-NEXT:     SETGT T2.W, literal.x, KC0[3].W,
+; R600-NEXT:     CNDE * T3.W, T1.W, 0.0, literal.y,
+; R600-NEXT:    -1023672320(-1.260000e+02), 1115684864(6.400000e+01)
 ; R600-NEXT:     ADD T0.Y, KC0[3].Y, PS,
-; R600-NEXT:     SETGT T0.Z, literal.x, KC0[3].W,
+; R600-NEXT:     CNDE T1.Z, PV.W, 0.0, literal.x,
 ; R600-NEXT:     CNDE T0.W, T0.W, 1.0, literal.y,
-; R600-NEXT:     EXP_IEEE * T0.X, PV.W,
-; R600-NEXT:    -1023672320(-1.260000e+02), 528482304(5.421011e-20)
+; R600-NEXT:     EXP_IEEE * T0.X, PV.Z,
+; R600-NEXT:    1115684864(6.400000e+01), 528482304(5.421011e-20)
 ; R600-NEXT:     MUL_IEEE T1.Y, PS, PV.W,
-; R600-NEXT:     CNDE T1.Z, PV.Z, 0.0, literal.x,
-; R600-NEXT:     CNDE T0.W, T1.W, 1.0, literal.y,
+; R600-NEXT:     ADD T0.Z, KC0[3].W, PV.Z,
+; R600-NEXT:     CNDE T0.W, T1.W, 1.0, literal.x,
 ; R600-NEXT:     EXP_IEEE * T0.X, PV.Y,
-; R600-NEXT:    1115684864(6.400000e+01), 528482304(5.421011e-20)
+; R600-NEXT:    528482304(5.421011e-20), 0(0.000000e+00)
 ; R600-NEXT:     MUL_IEEE T1.X, PS, PV.W,
-; R600-NEXT:     ADD T0.W, KC0[3].W, PV.Z,
-; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
-; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; R600-NEXT:     CNDE T1.W, T0.Z, 1.0, literal.x,
-; R600-NEXT:     EXP_IEEE * T0.Y, PV.W,
+; R600-NEXT:     CNDE T0.W, T2.W, 1.0, literal.x,
+; R600-NEXT:     EXP_IEEE * T0.X, PV.Z,
 ; R600-NEXT:    528482304(5.421011e-20), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T2.X, PS, PV.W,
-; R600-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; R600-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:     MUL_IEEE T0.X, PS, PV.W,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: s_exp2_v3f32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 35, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T1.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
 ; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].W,
 ; CM-NEXT:    -1023672320(-1.260000e+02), 0(0.000000e+00)
-; CM-NEXT:     CNDE T0.Y, PV.W, 0.0, literal.x,
-; CM-NEXT:     SETGT T0.Z, literal.y, KC0[3].Z,
-; CM-NEXT:     SETGT * T1.W, literal.y, KC0[3].Y,
-; CM-NEXT:    1115684864(6.400000e+01), -1023672320(-1.260000e+02)
-; CM-NEXT:     CNDE T0.X, PV.W, 0.0, literal.x,
-; CM-NEXT:     CNDE T1.Y, PV.Z, 0.0, literal.x,
-; CM-NEXT:     CNDE T1.Z, T0.W, 1.0, literal.y,
-; CM-NEXT:     ADD * T0.W, KC0[3].W, PV.Y,
-; CM-NEXT:    1115684864(6.400000e+01), 528482304(5.421011e-20)
-; CM-NEXT:     EXP_IEEE T0.X (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE T0.Y, T0.W,
-; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT:     MUL_IEEE T1.X, PV.Y, T1.Z,
-; CM-NEXT:     CNDE T0.Y, T0.Z, 1.0, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     ADD * T0.W, KC0[3].Z, T1.Y,
-; CM-NEXT:    528482304(5.421011e-20), 8(1.121039e-44)
-; CM-NEXT:     EXP_IEEE T0.X (MASKED), T0.W,
+; CM-NEXT:     CNDE * T1.W, PV.W, 0.0, literal.x,
+; CM-NEXT:    1115684864(6.400000e+01), 0(0.000000e+00)
+; CM-NEXT:     SETGT T0.Y, literal.x, KC0[3].Z,
+; CM-NEXT:     CNDE T0.Z, T0.W, 1.0, literal.y,
+; CM-NEXT:     ADD * T0.W, KC0[3].W, PV.W,
+; CM-NEXT:    -1023672320(-1.260000e+02), 528482304(5.421011e-20)
+; CM-NEXT:     EXP_IEEE T0.X, T0.W,
 ; CM-NEXT:     EXP_IEEE T0.Y (MASKED), T0.W,
 ; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT:     EXP_IEEE * T0.W, T0.W,
-; CM-NEXT:     LSHR T2.X, T0.Z, literal.x,
-; CM-NEXT:     MUL_IEEE T0.Y, PV.W, T0.Y,
-; CM-NEXT:     CNDE T0.Z, T1.W, 1.0, literal.y,
-; CM-NEXT:     ADD * T0.W, KC0[3].Y, T0.X,
+; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT:     MUL_IEEE T0.X, PV.X, T0.Z,
+; CM-NEXT:     CNDE T0.Z, T0.Y, 0.0, literal.x,
+; CM-NEXT:     SETGT * T0.W, literal.y, KC0[3].Y,
+; CM-NEXT:    1115684864(6.400000e+01), -1023672320(-1.260000e+02)
+; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; CM-NEXT:     CNDE T1.Y, PV.W, 0.0, literal.y,
+; CM-NEXT:     CNDE T1.Z, T0.Y, 1.0, literal.z,
+; CM-NEXT:     ADD * T1.W, KC0[3].Z, PV.Z,
+; CM-NEXT:    2(2.802597e-45), 1115684864(6.400000e+01)
+; CM-NEXT:    528482304(5.421011e-20), 0(0.000000e+00)
+; CM-NEXT:     EXP_IEEE T0.X (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE T0.Y, T1.W,
+; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     ADD_INT T2.X, T1.X, literal.x,
+; CM-NEXT:     MUL_IEEE T3.Y, PV.Y, T1.Z,
+; CM-NEXT:     CNDE T0.Z, T0.W, 1.0, literal.y,
+; CM-NEXT:     ADD * T0.W, KC0[3].Y, T1.Y,
 ; CM-NEXT:    2(2.802597e-45), 528482304(5.421011e-20)
-; CM-NEXT:     EXP_IEEE T0.X, T0.W,
-; CM-NEXT:     EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT:     EXP_IEEE T0.X (MASKED), T0.W,
+; CM-NEXT:     EXP_IEEE T0.Y, T0.W,
 ; CM-NEXT:     EXP_IEEE T0.Z (MASKED), T0.W,
 ; CM-NEXT:     EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT:     MUL_IEEE * T0.X, PV.X, T0.Z,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     MUL_IEEE * T3.X, PV.Y, T0.Z,
   %result = call <3 x float> @llvm.exp2.v3f32(<3 x float> %in)
   store <3 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index af6665c79feef..cef8aa0c8dd38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -1544,9 +1544,9 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ;
 ; R600-LABEL: s_log_v3f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 62, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT:    ALU 61, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     SETGT T0.W, literal.x, KC0[3].Z,
@@ -1554,141 +1554,138 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; R600-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
 ; R600-NEXT:     CNDE * T2.W, PV.W, 1.0, literal.x,
 ; R600-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T2.W, KC0[3].Z, PV.W,
-; R600-NEXT:     CNDE * T3.W, T1.W, 1.0, literal.x,
+; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].Z, PV.W,
+; R600-NEXT:     SETGT T2.W, literal.x, KC0[3].W,
+; R600-NEXT:     CNDE * T3.W, T1.W, 1.0, literal.y,
+; R600-NEXT:    8388608(1.175494e-38), 1333788672(4.294967e+09)
+; R600-NEXT:     MUL_IEEE T1.Z, KC0[3].Y, PS,
+; R600-NEXT:     CNDE T3.W, PV.W, 1.0, literal.x,
+; R600-NEXT:     LOG_IEEE * T0.X, PV.Z,
 ; R600-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].Y, PS,
-; R600-NEXT:     SETGT T3.W, literal.x, KC0[3].W,
-; R600-NEXT:     LOG_IEEE * T0.X, PV.W,
-; R600-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; R600-NEXT:     AND_INT T1.Z, PS, literal.x,
-; R600-NEXT:     CNDE T2.W, PV.W, 1.0, literal.y,
-; R600-NEXT:     LOG_IEEE * T0.Y, PV.Z,
-; R600-NEXT:    -4096(nan), 1333788672(4.294967e+09)
 ; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].W, PV.W,
-; R600-NEXT:     ADD T2.W, T0.X, -PV.Z,
-; R600-NEXT:     AND_INT * T4.W, PS, literal.x,
+; R600-NEXT:     AND_INT T3.W, PS, literal.x,
+; R600-NEXT:     LOG_IEEE * T0.Y, PV.Z,
 ; R600-NEXT:    -4096(nan), 0(0.000000e+00)
-; R600-NEXT:     ADD T2.Z, T0.Y, -PS,
-; R600-NEXT:     MUL_IEEE T5.W, PV.W, literal.x,
+; R600-NEXT:     ADD T1.Z, T0.X, -PV.W,
+; R600-NEXT:     AND_INT T4.W, PS, literal.x,
 ; R600-NEXT:     LOG_IEEE * T0.Z, PV.Z,
-; R600-NEXT:    939916788(3.194618e-05), 0(0.000000e+00)
-; R600-NEXT:     MULADD_IEEE T3.Z, T1.Z, literal.x, PV.W,
-; R600-NEXT:     AND_INT T5.W, PS, literal.y,
-; R600-NEXT:     MUL_IEEE * T6.W, PV.Z, literal.x,
+; R600-NEXT:    -4096(nan), 0(0.000000e+00)
+; R600-NEXT:     ADD T2.Z, T0.Y, -PV.W,
+; R600-NEXT:     MUL_IEEE T5.W, PV.Z, literal.x,
+; R600-NEXT:     AND_INT * T6.W, PS, literal.y,
 ; R600-NEXT:    939916788(3.194618e-05), -4096(nan)
+; R600-NEXT:     ADD T3.Z, T0.Z, -PS,
+; R600-NEXT:     MULADD_IEEE T5.W, T3.W, literal.x, PV.W,
+; R600-NEXT:     MUL_IEEE * T7.W, PV.Z, literal.x,
+; R600-NEXT:    939916788(3.194618e-05), 0(0.000000e+00)
 ; R600-NEXT:     MULADD_IEEE T4.Z, T4.W, literal.x, PS,
-; R600-NEXT:     ADD T6.W, T0.Z, -PV.W,
-; R600-NEXT:     MULADD_IEEE * T2.W, T2.W, literal.y, PV.Z, BS:VEC_021/SCL_122
+; R600-NEXT:     MULADD_IEEE T5.W, T1.Z, literal.y, PV.W,
+; R600-NEXT:     MUL_IEEE * T7.W, PV.Z, literal.x,
 ; R600-NEXT:    939916788(3.194618e-05), 1060204544(6.931152e-01)
-; R600-NEXT:     MULADD_IEEE T1.Y, T1.Z, literal.x, PS,
-; R600-NEXT:     SETGT T1.Z, literal.y, |T0.X|,
-; R600-NEXT:     MUL_IEEE T2.W, PV.W, literal.z,
-; R600-NEXT:     MULADD_IEEE * T7.W, T2.Z, literal.x, PV.Z, BS:VEC_021/SCL_122
-; R600-NEXT:    1060204544(6.931152e-01), 2139095040(INF)
-; R600-NEXT:    939916788(3.194618e-05), 0(0.000000e+00)
+; R600-NEXT:     MULADD_IEEE T1.Y, T6.W, literal.x, PS,
+; R600-NEXT:     MULADD_IEEE T1.Z, T3.W, literal.y, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT:     SETGT T3.W, literal.z, |T0.X|,
+; R600-NEXT:     MULADD_IEEE * T5.W, T2.Z, literal.y, PV.Z, BS:VEC_021/SCL_122
+; R600-NEXT:    939916788(3.194618e-05), 1060204544(6.931152e-01)
+; R600-NEXT:    2139095040(INF), 0(0.000000e+00)
 ; R600-NEXT:     MULADD_IEEE T1.X, T4.W, literal.x, PS,
 ; R600-NEXT:     SETGT T2.Y, literal.y, |T0.Y|,
-; R600-NEXT:     MULADD_IEEE T2.Z, T5.W, literal.z, PV.W, BS:VEC_120/SCL_212
-; R600-NEXT:     CNDE T2.W, PV.Z, T0.X, PV.Y,
-; R600-NEXT:     CNDE * T0.W, T0.W, 0.0, literal.w,
+; R600-NEXT:     CNDE T1.Z, PV.W, T0.X, PV.Z,
+; R600-NEXT:     CNDE T0.W, T0.W, 0.0, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT:     MULADD_IEEE * T3.W, T3.Z, literal.x, PV.Y, BS:VEC_021/SCL_122
 ; R600-NEXT:    1060204544(6.931152e-01), 2139095040(INF)
-; R600-NEXT:    939916788(3.194618e-05), 1102148120(2.218071e+01)
-; R600-NEXT:     ADD T1.Y, PV.W, -PS,
-; R600-NEXT:     MULADD_IEEE T1.Z, T6.W, literal.x, PV.Z,
+; R600-NEXT:    1102148120(2.218071e+01), 0(0.000000e+00)
+; R600-NEXT:     MULADD_IEEE T0.X, T6.W, literal.x, PS,
+; R600-NEXT:     ADD T1.Y, PV.Z, -PV.W,
+; R600-NEXT:     SETGT T1.Z, literal.y, |T0.Z|,
 ; R600-NEXT:     CNDE T0.W, PV.Y, T0.Y, PV.X,
-; R600-NEXT:     CNDE * T1.W, T1.W, 0.0, literal.y,
-; R600-NEXT:    1060204544(6.931152e-01), 1102148120(2.218071e+01)
-; R600-NEXT:     ADD T1.X, PV.W, -PS,
-; R600-NEXT:     MULADD_IEEE T0.W, T5.W, literal.x, PV.Z,
-; R600-NEXT:     SETGT * T1.W, literal.y, |T0.Z|,
+; R600-NEXT:     CNDE * T1.W, T1.W, 0.0, literal.z,
 ; R600-NEXT:    1060204544(6.931152e-01), 2139095040(INF)
-; R600-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; R600-NEXT:     CNDE T0.W, PS, T0.Z, PV.W,
-; R600-NEXT:     CNDE * T1.W, T3.W, 0.0, literal.y,
-; R600-NEXT:    2(2.802597e-45), 1102148120(2.218071e+01)
-; R600-NEXT:     ADD T2.X, PV.W, -PS,
-; R600-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; R600-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:    1102148120(2.218071e+01), 0(0.000000e+00)
+; R600-NEXT:     ADD T1.X, PV.W, -PS,
+; R600-NEXT:     CNDE T0.W, PV.Z, T0.Z, PV.X,
+; R600-NEXT:     CNDE * T1.W, T2.W, 0.0, literal.x,
+; R600-NEXT:    1102148120(2.218071e+01), 0(0.000000e+00)
+; R600-NEXT:     ADD T0.X, PV.W, -PS,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: s_log_v3f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 68, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T1.X
+; CM-NEXT:    ALU 66, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].Y,
+; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].W,
 ; CM-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; CM-NEXT:     CNDE T0.Z, PV.W, 1.0, literal.x,
-; CM-NEXT:     SETGT * T1.W, literal.y, KC0[3].W,
-; CM-NEXT:    1333788672(4.294967e+09), 8388608(1.175494e-38)
-; CM-NEXT:     CNDE T0.Y, PV.W, 1.0, literal.x,
-; CM-NEXT:     SETGT T1.Z, literal.y, KC0[3].Z,
-; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].Y, PV.Z,
-; CM-NEXT:    1333788672(4.294967e+09), 8388608(1.175494e-38)
-; CM-NEXT:     LOG_IEEE T0.X, T2.W,
-; CM-NEXT:     LOG_IEEE T0.Y (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T2.W,
-; CM-NEXT:     CNDE T1.Y, T1.Z, 1.0, literal.x,
-; CM-NEXT:     AND_INT T0.Z, PV.X, literal.y,
-; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].W, T0.Y,
+; CM-NEXT:     CNDE * T1.W, PV.W, 1.0, literal.x,
+; CM-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
+; CM-NEXT:     SETGT T0.Y, literal.x, KC0[3].Z,
+; CM-NEXT:     SETGT T0.Z, literal.x, KC0[3].Y,
+; CM-NEXT:     MUL_IEEE * T1.W, KC0[3].W, PV.W,
+; CM-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
+; CM-NEXT:     LOG_IEEE T0.X, T1.W,
+; CM-NEXT:     LOG_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     CNDE T1.Y, T0.Z, 1.0, literal.x,
+; CM-NEXT:     CNDE T1.Z, T0.Y, 1.0, literal.x,
+; CM-NEXT:     AND_INT * T1.W, PV.X, literal.y,
 ; CM-NEXT:    1333788672(4.294967e+09), -4096(nan)
-; CM-NEXT:     LOG_IEEE T0.X (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE T0.Y, T2.W,
-; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T2.W,
-; CM-NEXT:     ADD T2.Y, T0.X, -T0.Z,
-; CM-NEXT:     AND_INT T2.Z, PV.Y, literal.x,
-; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].Z, T1.Y,
-; CM-NEXT:    -4096(nan), 0(0.000000e+00)
+; CM-NEXT:     ADD T2.Y, T0.X, -PV.W,
+; CM-NEXT:     MUL_IEEE T1.Z, KC0[3].Z, PV.Z,
+; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].Y, PV.Y,
 ; CM-NEXT:     LOG_IEEE T1.X, T2.W,
 ; CM-NEXT:     LOG_IEEE T1.Y (MASKED), T2.W,
 ; CM-NEXT:     LOG_IEEE T1.Z (MASKED), T2.W,
 ; CM-NEXT:     LOG_IEEE * T1.W (MASKED), T2.W,
-; CM-NEXT:     ADD T1.Y, T0.Y, -T2.Z,
-; CM-NEXT:     AND_INT T3.Z, PV.X, literal.x,
-; CM-NEXT:     MUL_IEEE * T2.W, T2.Y, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LOG_IEEE T1.X (MASKED), T1.Z,
+; CM-NEXT:     LOG_IEEE T1.Y, T1.Z,
+; CM-NEXT:     LOG_IEEE T1.Z (MASKED), T1.Z,
+; CM-NEXT:     LOG_IEEE * T1.W (MASKED), T1.Z,
+; CM-NEXT:     AND_INT T3.Y, PV.Y, literal.x,
+; CM-NEXT:     AND_INT T1.Z, T1.X, literal.x,
+; CM-NEXT:     MUL_IEEE * T2.W, T2.Y, literal.y,
 ; CM-NEXT:    -4096(nan), 939916788(3.194618e-05)
-; CM-NEXT:     MULADD_IEEE T3.Y, T0.Z, literal.x, PV.W,
-; CM-NEXT:     ADD T4.Z, T1.X, -PV.Z,
-; CM-NEXT:     MUL_IEEE * T2.W, PV.Y, literal.x,
+; CM-NEXT:     MULADD_IEEE T4.Y, T1.W, literal.x, PV.W,
+; CM-NEXT:     ADD T2.Z, T1.X, -PV.Z,
+; CM-NEXT:     ADD * T2.W, T1.Y, -PV.Y,
 ; CM-NEXT:    939916788(3.194618e-05), 0(0.000000e+00)
-; CM-NEXT:     MULADD_IEEE T4.Y, T2.Z, literal.x, PV.W,
-; CM-NEXT:     MUL_IEEE T5.Z, PV.Z, literal.x,
-; CM-NEXT:     MULADD_IEEE * T2.W, T2.Y, literal.y, PV.Y,
+; CM-NEXT:     MUL_IEEE T5.Y, PV.W, literal.x,
+; CM-NEXT:     MUL_IEEE T3.Z, PV.Z, literal.x,
+; CM-NEXT:     MULADD_IEEE * T3.W, T2.Y, literal.y, PV.Y,
 ; CM-NEXT:    939916788(3.194618e-05), 1060204544(6.931152e-01)
-; CM-NEXT:     MULADD_IEEE T2.Y, T0.Z, literal.x, PV.W,
-; CM-NEXT:     MULADD_IEEE T0.Z, T3.Z, literal.y, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT:     MULADD_IEEE * T2.W, T1.Y, literal.x, PV.Y,
-; CM-NEXT:    1060204544(6.931152e-01), 939916788(3.194618e-05)
-; CM-NEXT:     SETGT T2.X, literal.x, |T0.X|,
-; CM-NEXT:     MULADD_IEEE T1.Y, T2.Z, literal.y, PV.W,
-; CM-NEXT:     SETGT T2.Z, literal.x, |T0.Y|,
-; CM-NEXT:     MULADD_IEEE * T2.W, T4.Z, literal.y, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT:    2139095040(INF), 1060204544(6.931152e-01)
-; CM-NEXT:     MULADD_IEEE T3.X, T3.Z, literal.x, PV.W,
-; CM-NEXT:     SETGT T3.Y, literal.y, |T1.X|,
-; CM-NEXT:     CNDE T0.Z, PV.Z, T0.Y, PV.Y,
-; CM-NEXT:     CNDE * T1.W, T1.W, 0.0, literal.z,
+; CM-NEXT:     MULADD_IEEE T2.X, T1.W, literal.x, PV.W,
+; CM-NEXT:     SETGT T2.Y, literal.y, |T0.X|,
+; CM-NEXT:     MULADD_IEEE T3.Z, T1.Z, literal.z, PV.Z,
+; CM-NEXT:     MULADD_IEEE * T1.W, T3.Y, literal.z, PV.Y,
 ; CM-NEXT:    1060204544(6.931152e-01), 2139095040(INF)
+; CM-NEXT:    939916788(3.194618e-05), 0(0.000000e+00)
+; CM-NEXT:     MULADD_IEEE T3.X, T2.W, literal.x, PV.W,
+; CM-NEXT:     MULADD_IEEE T4.Y, T2.Z, literal.x, PV.Z,
+; CM-NEXT:     CNDE T2.Z, PV.Y, T0.X, PV.X,
+; CM-NEXT:     CNDE * T0.W, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    1060204544(6.931152e-01), 1102148120(2.218071e+01)
+; CM-NEXT:     ADD T0.X, PV.Z, -PV.W,
+; CM-NEXT:     MULADD_IEEE T2.Y, T1.Z, literal.x, PV.Y,
+; CM-NEXT:     MULADD_IEEE T1.Z, T3.Y, literal.x, PV.X,
+; CM-NEXT:     SETGT * T0.W, literal.y, |T1.Y|,
+; CM-NEXT:    1060204544(6.931152e-01), 2139095040(INF)
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     SETGT T3.Y, literal.y, |T1.X|,
+; CM-NEXT:     CNDE T1.Z, PV.W, T1.Y, PV.Z,
+; CM-NEXT:     CNDE * T0.W, T0.Y, 0.0, literal.z,
+; CM-NEXT:    2(2.802597e-45), 2139095040(INF)
 ; CM-NEXT:    1102148120(2.218071e+01), 0(0.000000e+00)
-; CM-NEXT:     ADD T4.X, PV.Z, -PV.W,
-; CM-NEXT:     CNDE T0.Y, PV.Y, T1.X, PV.X,
-; CM-NEXT:     CNDE T0.Z, T1.Z, 0.0, literal.x,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; CM-NEXT:    1102148120(2.218071e+01), 8(1.121039e-44)
-; CM-NEXT:     LSHR T1.X, PV.W, literal.x,
-; CM-NEXT:     ADD T0.Y, PV.Y, -PV.Z,
-; CM-NEXT:     CNDE T0.Z, T2.X, T0.X, T2.Y,
-; CM-NEXT:     CNDE * T0.W, T0.W, 0.0, literal.y,
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     ADD T1.Y, PV.Z, -PV.W,
+; CM-NEXT:     CNDE T1.Z, PV.Y, T1.X, T2.Y,
+; CM-NEXT:     CNDE * T0.W, T0.Z, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 1102148120(2.218071e+01)
-; CM-NEXT:     ADD * T0.X, PV.Z, -PV.W,
-; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD * T1.X, PV.Z, -PV.W,
   %result = call <3 x float> @llvm.log.v3f32(<3 x float> %in)
   store <3 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index bbf19ff267b01..7da9a6a9e4eea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -1544,9 +1544,9 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; R600-LABEL: s_log10_v3f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 62, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT:    ALU 61, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     SETGT T0.W, literal.x, KC0[3].Z,
@@ -1554,141 +1554,138 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; R600-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
 ; R600-NEXT:     CNDE * T2.W, PV.W, 1.0, literal.x,
 ; R600-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T2.W, KC0[3].Z, PV.W,
-; R600-NEXT:     CNDE * T3.W, T1.W, 1.0, literal.x,
+; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].Z, PV.W,
+; R600-NEXT:     SETGT T2.W, literal.x, KC0[3].W,
+; R600-NEXT:     CNDE * T3.W, T1.W, 1.0, literal.y,
+; R600-NEXT:    8388608(1.175494e-38), 1333788672(4.294967e+09)
+; R600-NEXT:     MUL_IEEE T1.Z, KC0[3].Y, PS,
+; R600-NEXT:     CNDE T3.W, PV.W, 1.0, literal.x,
+; R600-NEXT:     LOG_IEEE * T0.X, PV.Z,
 ; R600-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].Y, PS,
-; R600-NEXT:     SETGT T3.W, literal.x, KC0[3].W,
-; R600-NEXT:     LOG_IEEE * T0.X, PV.W,
-; R600-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; R600-NEXT:     AND_INT T1.Z, PS, literal.x,
-; R600-NEXT:     CNDE T2.W, PV.W, 1.0, literal.y,
-; R600-NEXT:     LOG_IEEE * T0.Y, PV.Z,
-; R600-NEXT:    -4096(nan), 1333788672(4.294967e+09)
 ; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].W, PV.W,
-; R600-NEXT:     ADD T2.W, T0.X, -PV.Z,
-; R600-NEXT:     AND_INT * T4.W, PS, literal.x,
+; R600-NEXT:     AND_INT T3.W, PS, literal.x,
+; R600-NEXT:     LOG_IEEE * T0.Y, PV.Z,
 ; R600-NEXT:    -4096(nan), 0(0.000000e+00)
-; R600-NEXT:     ADD T2.Z, T0.Y, -PS,
-; R600-NEXT:     MUL_IEEE T5.W, PV.W, literal.x,
+; R600-NEXT:     ADD T1.Z, T0.X, -PV.W,
+; R600-NEXT:     AND_INT T4.W, PS, literal.x,
 ; R600-NEXT:     LOG_IEEE * T0.Z, PV.Z,
-; R600-NEXT:    916096251(4.605039e-06), 0(0.000000e+00)
-; R600-NEXT:     MULADD_IEEE T3.Z, T1.Z, literal.x, PV.W,
-; R600-NEXT:     AND_INT T5.W, PS, literal.y,
-; R600-NEXT:     MUL_IEEE * T6.W, PV.Z, literal.x,
+; R600-NEXT:    -4096(nan), 0(0.000000e+00)
+; R600-NEXT:     ADD T2.Z, T0.Y, -PV.W,
+; R600-NEXT:     MUL_IEEE T5.W, PV.Z, literal.x,
+; R600-NEXT:     AND_INT * T6.W, PS, literal.y,
 ; R600-NEXT:    916096251(4.605039e-06), -4096(nan)
+; R600-NEXT:     ADD T3.Z, T0.Z, -PS,
+; R600-NEXT:     MULADD_IEEE T5.W, T3.W, literal.x, PV.W,
+; R600-NEXT:     MUL_IEEE * T7.W, PV.Z, literal.x,
+; R600-NEXT:    916096251(4.605039e-06), 0(0.000000e+00)
 ; R600-NEXT:     MULADD_IEEE T4.Z, T4.W, literal.x, PS,
-; R600-NEXT:     ADD T6.W, T0.Z, -PV.W,
-; R600-NEXT:     MULADD_IEEE * T2.W, T2.W, literal.y, PV.Z, BS:VEC_021/SCL_122
+; R600-NEXT:     MULADD_IEEE T5.W, T1.Z, literal.y, PV.W,
+; R600-NEXT:     MUL_IEEE * T7.W, PV.Z, literal.x,
 ; R600-NEXT:    916096251(4.605039e-06), 1050288128(3.010254e-01)
-; R600-NEXT:     MULADD_IEEE T1.Y, T1.Z, literal.x, PS,
-; R600-NEXT:     SETGT T1.Z, literal.y, |T0.X|,
-; R600-NEXT:     MUL_IEEE T2.W, PV.W, literal.z,
-; R600-NEXT:     MULADD_IEEE * T7.W, T2.Z, literal.x, PV.Z, BS:VEC_021/SCL_122
-; R600-NEXT:    1050288128(3.010254e-01), 2139095040(INF)
-; R600-NEXT:    916096251(4.605039e-06), 0(0.000000e+00)
+; R600-NEXT:     MULADD_IEEE T1.Y, T6.W, literal.x, PS,
+; R600-NEXT:     MULADD_IEEE T1.Z, T3.W, literal.y, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT:     SETGT T3.W, literal.z, |T0.X|,
+; R600-NEXT:     MULADD_IEEE * T5.W, T2.Z, literal.y, PV.Z, BS:VEC_021/SCL_122
+; R600-NEXT:    916096251(4.605039e-06), 1050288128(3.010254e-01)
+; R600-NEXT:    2139095040(INF), 0(0.000000e+00)
 ; R600-NEXT:     MULADD_IEEE T1.X, T4.W, literal.x, PS,
 ; R600-NEXT:     SETGT T2.Y, literal.y, |T0.Y|,
-; R600-NEXT:     MULADD_IEEE T2.Z, T5.W, literal.z, PV.W, BS:VEC_120/SCL_212
-; R600-NEXT:     CNDE T2.W, PV.Z, T0.X, PV.Y,
-; R600-NEXT:     CNDE * T0.W, T0.W, 0.0, literal.w,
+; R600-NEXT:     CNDE T1.Z, PV.W, T0.X, PV.Z,
+; R600-NEXT:     CNDE T0.W, T0.W, 0.0, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT:     MULADD_IEEE * T3.W, T3.Z, literal.x, PV.Y, BS:VEC_021/SCL_122
 ; R600-NEXT:    1050288128(3.010254e-01), 2139095040(INF)
-; R600-NEXT:    916096251(4.605039e-06), 1092231323(9.632960e+00)
-; R600-NEXT:     ADD T1.Y, PV.W, -PS,
-; R600-NEXT:     MULADD_IEEE T1.Z, T6.W, literal.x, PV.Z,
+; R600-NEXT:    1092231323(9.632960e+00), 0(0.000000e+00)
+; R600-NEXT:     MULADD_IEEE T0.X, T6.W, literal.x, PS,
+; R600-NEXT:     ADD T1.Y, PV.Z, -PV.W,
+; R600-NEXT:     SETGT T1.Z, literal.y, |T0.Z|,
 ; R600-NEXT:     CNDE T0.W, PV.Y, T0.Y, PV.X,
-; R600-NEXT:     CNDE * T1.W, T1.W, 0.0, literal.y,
-; R600-NEXT:    1050288128(3.010254e-01), 1092231323(9.632960e+00)
-; R600-NEXT:     ADD T1.X, PV.W, -PS,
-; R600-NEXT:     MULADD_IEEE T0.W, T5.W, literal.x, PV.Z,
-; R600-NEXT:     SETGT * T1.W, literal.y, |T0.Z|,
+; R600-NEXT:     CNDE * T1.W, T1.W, 0.0, literal.z,
 ; R600-NEXT:    1050288128(3.010254e-01), 2139095040(INF)
-; R600-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; R600-NEXT:     CNDE T0.W, PS, T0.Z, PV.W,
-; R600-NEXT:     CNDE * T1.W, T3.W, 0.0, literal.y,
-; R600-NEXT:    2(2.802597e-45), 1092231323(9.632960e+00)
-; R600-NEXT:     ADD T2.X, PV.W, -PS,
-; R600-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; R600-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:    1092231323(9.632960e+00), 0(0.000000e+00)
+; R600-NEXT:     ADD T1.X, PV.W, -PS,
+; R600-NEXT:     CNDE T0.W, PV.Z, T0.Z, PV.X,
+; R600-NEXT:     CNDE * T1.W, T2.W, 0.0, literal.x,
+; R600-NEXT:    1092231323(9.632960e+00), 0(0.000000e+00)
+; R600-NEXT:     ADD T0.X, PV.W, -PS,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: s_log10_v3f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 68, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T1.X
+; CM-NEXT:    ALU 66, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].Y,
+; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].W,
 ; CM-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; CM-NEXT:     CNDE T0.Z, PV.W, 1.0, literal.x,
-; CM-NEXT:     SETGT * T1.W, literal.y, KC0[3].W,
-; CM-NEXT:    1333788672(4.294967e+09), 8388608(1.175494e-38)
-; CM-NEXT:     CNDE T0.Y, PV.W, 1.0, literal.x,
-; CM-NEXT:     SETGT T1.Z, literal.y, KC0[3].Z,
-; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].Y, PV.Z,
-; CM-NEXT:    1333788672(4.294967e+09), 8388608(1.175494e-38)
-; CM-NEXT:     LOG_IEEE T0.X, T2.W,
-; CM-NEXT:     LOG_IEEE T0.Y (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T2.W,
-; CM-NEXT:     CNDE T1.Y, T1.Z, 1.0, literal.x,
-; CM-NEXT:     AND_INT T0.Z, PV.X, literal.y,
-; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].W, T0.Y,
+; CM-NEXT:     CNDE * T1.W, PV.W, 1.0, literal.x,
+; CM-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
+; CM-NEXT:     SETGT T0.Y, literal.x, KC0[3].Z,
+; CM-NEXT:     SETGT T0.Z, literal.x, KC0[3].Y,
+; CM-NEXT:     MUL_IEEE * T1.W, KC0[3].W, PV.W,
+; CM-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
+; CM-NEXT:     LOG_IEEE T0.X, T1.W,
+; CM-NEXT:     LOG_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     CNDE T1.Y, T0.Z, 1.0, literal.x,
+; CM-NEXT:     CNDE T1.Z, T0.Y, 1.0, literal.x,
+; CM-NEXT:     AND_INT * T1.W, PV.X, literal.y,
 ; CM-NEXT:    1333788672(4.294967e+09), -4096(nan)
-; CM-NEXT:     LOG_IEEE T0.X (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE T0.Y, T2.W,
-; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T2.W,
-; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T2.W,
-; CM-NEXT:     ADD T2.Y, T0.X, -T0.Z,
-; CM-NEXT:     AND_INT T2.Z, PV.Y, literal.x,
-; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].Z, T1.Y,
-; CM-NEXT:    -4096(nan), 0(0.000000e+00)
+; CM-NEXT:     ADD T2.Y, T0.X, -PV.W,
+; CM-NEXT:     MUL_IEEE T1.Z, KC0[3].Z, PV.Z,
+; CM-NEXT:     MUL_IEEE * T2.W, KC0[3].Y, PV.Y,
 ; CM-NEXT:     LOG_IEEE T1.X, T2.W,
 ; CM-NEXT:     LOG_IEEE T1.Y (MASKED), T2.W,
 ; CM-NEXT:     LOG_IEEE T1.Z (MASKED), T2.W,
 ; CM-NEXT:     LOG_IEEE * T1.W (MASKED), T2.W,
-; CM-NEXT:     ADD T1.Y, T0.Y, -T2.Z,
-; CM-NEXT:     AND_INT T3.Z, PV.X, literal.x,
-; CM-NEXT:     MUL_IEEE * T2.W, T2.Y, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LOG_IEEE T1.X (MASKED), T1.Z,
+; CM-NEXT:     LOG_IEEE T1.Y, T1.Z,
+; CM-NEXT:     LOG_IEEE T1.Z (MASKED), T1.Z,
+; CM-NEXT:     LOG_IEEE * T1.W (MASKED), T1.Z,
+; CM-NEXT:     AND_INT T3.Y, PV.Y, literal.x,
+; CM-NEXT:     AND_INT T1.Z, T1.X, literal.x,
+; CM-NEXT:     MUL_IEEE * T2.W, T2.Y, literal.y,
 ; CM-NEXT:    -4096(nan), 916096251(4.605039e-06)
-; CM-NEXT:     MULADD_IEEE T3.Y, T0.Z, literal.x, PV.W,
-; CM-NEXT:     ADD T4.Z, T1.X, -PV.Z,
-; CM-NEXT:     MUL_IEEE * T2.W, PV.Y, literal.x,
+; CM-NEXT:     MULADD_IEEE T4.Y, T1.W, literal.x, PV.W,
+; CM-NEXT:     ADD T2.Z, T1.X, -PV.Z,
+; CM-NEXT:     ADD * T2.W, T1.Y, -PV.Y,
 ; CM-NEXT:    916096251(4.605039e-06), 0(0.000000e+00)
-; CM-NEXT:     MULADD_IEEE T4.Y, T2.Z, literal.x, PV.W,
-; CM-NEXT:     MUL_IEEE T5.Z, PV.Z, literal.x,
-; CM-NEXT:     MULADD_IEEE * T2.W, T2.Y, literal.y, PV.Y,
+; CM-NEXT:     MUL_IEEE T5.Y, PV.W, literal.x,
+; CM-NEXT:     MUL_IEEE T3.Z, PV.Z, literal.x,
+; CM-NEXT:     MULADD_IEEE * T3.W, T2.Y, literal.y, PV.Y,
 ; CM-NEXT:    916096251(4.605039e-06), 1050288128(3.010254e-01)
-; CM-NEXT:     MULADD_IEEE T2.Y, T0.Z, literal.x, PV.W,
-; CM-NEXT:     MULADD_IEEE T0.Z, T3.Z, literal.y, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT:     MULADD_IEEE * T2.W, T1.Y, literal.x, PV.Y,
-; CM-NEXT:    1050288128(3.010254e-01), 916096251(4.605039e-06)
-; CM-NEXT:     SETGT T2.X, literal.x, |T0.X|,
-; CM-NEXT:     MULADD_IEEE T1.Y, T2.Z, literal.y, PV.W,
-; CM-NEXT:     SETGT T2.Z, literal.x, |T0.Y|,
-; CM-NEXT:     MULADD_IEEE * T2.W, T4.Z, literal.y, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT:    2139095040(INF), 1050288128(3.010254e-01)
-; CM-NEXT:     MULADD_IEEE T3.X, T3.Z, literal.x, PV.W,
-; CM-NEXT:     SETGT T3.Y, literal.y, |T1.X|,
-; CM-NEXT:     CNDE T0.Z, PV.Z, T0.Y, PV.Y,
-; CM-NEXT:     CNDE * T1.W, T1.W, 0.0, literal.z,
+; CM-NEXT:     MULADD_IEEE T2.X, T1.W, literal.x, PV.W,
+; CM-NEXT:     SETGT T2.Y, literal.y, |T0.X|,
+; CM-NEXT:     MULADD_IEEE T3.Z, T1.Z, literal.z, PV.Z,
+; CM-NEXT:     MULADD_IEEE * T1.W, T3.Y, literal.z, PV.Y,
 ; CM-NEXT:    1050288128(3.010254e-01), 2139095040(INF)
+; CM-NEXT:    916096251(4.605039e-06), 0(0.000000e+00)
+; CM-NEXT:     MULADD_IEEE T3.X, T2.W, literal.x, PV.W,
+; CM-NEXT:     MULADD_IEEE T4.Y, T2.Z, literal.x, PV.Z,
+; CM-NEXT:     CNDE T2.Z, PV.Y, T0.X, PV.X,
+; CM-NEXT:     CNDE * T0.W, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    1050288128(3.010254e-01), 1092231323(9.632960e+00)
+; CM-NEXT:     ADD T0.X, PV.Z, -PV.W,
+; CM-NEXT:     MULADD_IEEE T2.Y, T1.Z, literal.x, PV.Y,
+; CM-NEXT:     MULADD_IEEE T1.Z, T3.Y, literal.x, PV.X,
+; CM-NEXT:     SETGT * T0.W, literal.y, |T1.Y|,
+; CM-NEXT:    1050288128(3.010254e-01), 2139095040(INF)
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     SETGT T3.Y, literal.y, |T1.X|,
+; CM-NEXT:     CNDE T1.Z, PV.W, T1.Y, PV.Z,
+; CM-NEXT:     CNDE * T0.W, T0.Y, 0.0, literal.z,
+; CM-NEXT:    2(2.802597e-45), 2139095040(INF)
 ; CM-NEXT:    1092231323(9.632960e+00), 0(0.000000e+00)
-; CM-NEXT:     ADD T4.X, PV.Z, -PV.W,
-; CM-NEXT:     CNDE T0.Y, PV.Y, T1.X, PV.X,
-; CM-NEXT:     CNDE T0.Z, T1.Z, 0.0, literal.x,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; CM-NEXT:    1092231323(9.632960e+00), 8(1.121039e-44)
-; CM-NEXT:     LSHR T1.X, PV.W, literal.x,
-; CM-NEXT:     ADD T0.Y, PV.Y, -PV.Z,
-; CM-NEXT:     CNDE T0.Z, T2.X, T0.X, T2.Y,
-; CM-NEXT:     CNDE * T0.W, T0.W, 0.0, literal.y,
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     ADD T1.Y, PV.Z, -PV.W,
+; CM-NEXT:     CNDE T1.Z, PV.Y, T1.X, T2.Y,
+; CM-NEXT:     CNDE * T0.W, T0.Z, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 1092231323(9.632960e+00)
-; CM-NEXT:     ADD * T0.X, PV.Z, -PV.W,
-; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD * T1.X, PV.Z, -PV.W,
   %result = call <3 x float> @llvm.log10.v3f32(<3 x float> %in)
   store <3 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index bc81ab88a1a13..b2ec9534aa848 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -766,9 +766,9 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; R600-LABEL: s_log2_v3f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 29, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT:    ALU 27, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     SETGT T0.W, literal.x, KC0[3].Z,
@@ -776,75 +776,73 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; R600-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
 ; R600-NEXT:     CNDE * T2.W, PV.W, 1.0, literal.x,
 ; R600-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
-; R600-NEXT:     MUL_IEEE T2.W, KC0[3].Z, PV.W,
-; R600-NEXT:     CNDE * T3.W, T1.W, 1.0, literal.x,
-; R600-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
+; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].Z, PV.W,
+; R600-NEXT:     SETGT T2.W, literal.x, KC0[3].W,
+; R600-NEXT:     CNDE * T3.W, T1.W, 1.0, literal.y,
+; R600-NEXT:    8388608(1.175494e-38), 1333788672(4.294967e+09)
 ; R600-NEXT:     MUL_IEEE T0.Y, KC0[3].Y, PS,
-; R600-NEXT:     SETGT T0.Z, literal.x, KC0[3].W,
+; R600-NEXT:     CNDE T1.Z, PV.W, 1.0, literal.x,
 ; R600-NEXT:     CNDE T0.W, T0.W, 0.0, literal.y,
-; R600-NEXT:     LOG_IEEE * T0.X, PV.W,
-; R600-NEXT:    8388608(1.175494e-38), 1107296256(3.200000e+01)
+; R600-NEXT:     LOG_IEEE * T0.X, PV.Z,
+; R600-NEXT:    1333788672(4.294967e+09), 1107296256(3.200000e+01)
 ; R600-NEXT:     ADD T1.Y, PS, -PV.W,
-; R600-NEXT:     CNDE T1.Z, PV.Z, 1.0, literal.x,
-; R600-NEXT:     CNDE T0.W, T1.W, 0.0, literal.y,
+; R600-NEXT:     MUL_IEEE T0.Z, KC0[3].W, PV.Z,
+; R600-NEXT:     CNDE T0.W, T1.W, 0.0, literal.x,
 ; R600-NEXT:     LOG_IEEE * T0.X, PV.Y,
-; R600-NEXT:    1333788672(4.294967e+09), 1107296256(3.200000e+01)
+; R600-NEXT:    1107296256(3.200000e+01), 0(0.000000e+00)
 ; R600-NEXT:     ADD T1.X, PS, -PV.W,
-; R600-NEXT:     MUL_IEEE T0.W, KC0[3].W, PV.Z,
-; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
-; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; R600-NEXT:     CNDE T1.W, T0.Z, 0.0, literal.x,
-; R600-NEXT:     LOG_IEEE * T0.Y, PV.W,
+; R600-NEXT:     CNDE T0.W, T2.W, 0.0, literal.x,
+; R600-NEXT:     LOG_IEEE * T0.X, PV.Z,
 ; R600-NEXT:    1107296256(3.200000e+01), 0(0.000000e+00)
-; R600-NEXT:     ADD T2.X, PS, -PV.W,
-; R600-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; R600-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:     ADD T0.X, PS, -PV.W,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: s_log2_v3f32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 35, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T1.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
 ; CM-NEXT:     SETGT * T0.W, literal.x, KC0[3].W,
 ; CM-NEXT:    8388608(1.175494e-38), 0(0.000000e+00)
-; CM-NEXT:     CNDE T0.Y, PV.W, 1.0, literal.x,
-; CM-NEXT:     SETGT T0.Z, literal.y, KC0[3].Z,
-; CM-NEXT:     SETGT * T1.W, literal.y, KC0[3].Y,
-; CM-NEXT:    1333788672(4.294967e+09), 8388608(1.175494e-38)
-; CM-NEXT:     CNDE T0.X, PV.W, 1.0, literal.x,
-; CM-NEXT:     CNDE T1.Y, PV.Z, 1.0, literal.x,
-; CM-NEXT:     CNDE T1.Z, T0.W, 0.0, literal.y,
-; CM-NEXT:     MUL_IEEE * T0.W, KC0[3].W, PV.Y,
-; CM-NEXT:    1333788672(4.294967e+09), 1107296256(3.200000e+01)
-; CM-NEXT:     LOG_IEEE T0.X (MASKED), T0.W,
-; CM-NEXT:     LOG_IEEE T0.Y, T0.W,
-; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT:     ADD T1.X, PV.Y, -T1.Z,
-; CM-NEXT:     CNDE T0.Y, T0.Z, 0.0, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     MUL_IEEE * T0.W, KC0[3].Z, T1.Y,
-; CM-NEXT:    1107296256(3.200000e+01), 8(1.121039e-44)
-; CM-NEXT:     LOG_IEEE T0.X (MASKED), T0.W,
+; CM-NEXT:     CNDE * T1.W, PV.W, 1.0, literal.x,
+; CM-NEXT:    1333788672(4.294967e+09), 0(0.000000e+00)
+; CM-NEXT:     SETGT T0.Y, literal.x, KC0[3].Z,
+; CM-NEXT:     CNDE T0.Z, T0.W, 0.0, literal.y,
+; CM-NEXT:     MUL_IEEE * T0.W, KC0[3].W, PV.W,
+; CM-NEXT:    8388608(1.175494e-38), 1107296256(3.200000e+01)
+; CM-NEXT:     LOG_IEEE T0.X, T0.W,
 ; CM-NEXT:     LOG_IEEE T0.Y (MASKED), T0.W,
 ; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT:     LOG_IEEE * T0.W, T0.W,
-; CM-NEXT:     LSHR T2.X, T0.Z, literal.x,
-; CM-NEXT:     ADD T0.Y, PV.W, -T0.Y,
-; CM-NEXT:     CNDE T0.Z, T1.W, 0.0, literal.y,
-; CM-NEXT:     MUL_IEEE * T0.W, KC0[3].Y, T0.X,
+; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT:     ADD T0.X, PV.X, -T0.Z,
+; CM-NEXT:     CNDE T0.Z, T0.Y, 1.0, literal.x,
+; CM-NEXT:     SETGT * T0.W, literal.y, KC0[3].Y,
+; CM-NEXT:    1333788672(4.294967e+09), 8388608(1.175494e-38)
+; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; CM-NEXT:     CNDE T1.Y, PV.W, 1.0, literal.y,
+; CM-NEXT:     CNDE T1.Z, T0.Y, 0.0, literal.z,
+; CM-NEXT:     MUL_IEEE * T1.W, KC0[3].Z, PV.Z,
+; CM-NEXT:    2(2.802597e-45), 1333788672(4.294967e+09)
+; CM-NEXT:    1107296256(3.200000e+01), 0(0.000000e+00)
+; CM-NEXT:     LOG_IEEE T0.X (MASKED), T1.W,
+; CM-NEXT:     LOG_IEEE T0.Y, T1.W,
+; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT:     ADD_INT T2.X, T1.X, literal.x,
+; CM-NEXT:     ADD T3.Y, PV.Y, -T1.Z,
+; CM-NEXT:     CNDE T0.Z, T0.W, 0.0, literal.y,
+; CM-NEXT:     MUL_IEEE * T0.W, KC0[3].Y, T1.Y,
 ; CM-NEXT:    2(2.802597e-45), 1107296256(3.200000e+01)
-; CM-NEXT:     LOG_IEEE T0.X, T0.W,
-; CM-NEXT:     LOG_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT:     LOG_IEEE T0.X (MASKED), T0.W,
+; CM-NEXT:     LOG_IEEE T0.Y, T0.W,
 ; CM-NEXT:     LOG_IEEE T0.Z (MASKED), T0.W,
 ; CM-NEXT:     LOG_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT:     ADD * T0.X, PV.X, -T0.Z,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD * T3.X, PV.Y, -T0.Z,
   %result = call <3 x float> @llvm.log2.v3f32(<3 x float> %in)
   store <3 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index d1a16b687f930..802ffc1e7ea7b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -708,61 +708,58 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
 ;
 ; R600-LABEL: round_v8f32:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 50, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    ALU 49, @4, KC0[CB0:0-32], KC1[]
 ; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T1.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     TRUNC * T0.W, KC0[6].X,
-; R600-NEXT:     ADD T0.Z, KC0[6].X, -PV.W,
-; R600-NEXT:     TRUNC * T1.W, KC0[5].X,
-; R600-NEXT:     TRUNC * T2.W, KC0[4].W,
-; R600-NEXT:     ADD T1.Z, KC0[4].W, -PV.W,
-; R600-NEXT:     ADD T3.W, KC0[5].X, -T1.W,
-; R600-NEXT:     SETGE * T4.W, |T0.Z|, 0.5,
-; R600-NEXT:     BFI_INT T0.Y, literal.x, PS, KC0[6].X,
-; R600-NEXT:     SETGE T0.Z, |PV.W|, 0.5,
-; R600-NEXT:     SETGE T3.W, |PV.Z|, 0.5,
-; R600-NEXT:     TRUNC * T4.W, KC0[5].Y,
+; R600-NEXT:     ADD T1.W, KC0[6].X, -PV.W,
+; R600-NEXT:     TRUNC * T2.W, KC0[5].X,
+; R600-NEXT:     ADD T3.W, KC0[5].X, -PS,
+; R600-NEXT:     SETGE * T1.W, |PV.W|, 0.5,
+; R600-NEXT:     BFI_INT T0.Z, literal.x, PS, KC0[6].X,
+; R600-NEXT:     SETGE T1.W, |PV.W|, 0.5,
+; R600-NEXT:     TRUNC * T3.W, KC0[5].Y,
 ; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
-; R600-NEXT:     ADD T1.Y, KC0[5].Y, -PS,
-; R600-NEXT:     BFI_INT T1.Z, literal.x, PV.W, KC0[4].W,
-; R600-NEXT:     BFI_INT T3.W, literal.x, PV.Z, KC0[5].X,
-; R600-NEXT:     TRUNC * T5.W, KC0[4].Z,
+; R600-NEXT:     ADD T0.Y, KC0[5].Y, -PS,
+; R600-NEXT:     BFI_INT T1.Z, literal.x, PV.W, KC0[5].X,
+; R600-NEXT:     TRUNC * T1.W, KC0[4].Y,
 ; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
-; R600-NEXT:     TRUNC T0.Z, KC0[4].Y,
-; R600-NEXT:     TRUNC * T6.W, KC0[5].W,
-; R600-NEXT:     ADD * T7.W, KC0[4].Z, -T5.W,
-; R600-NEXT:     TRUNC T0.X, KC0[5].Z,
-; R600-NEXT:     SETGE T2.Y, |PV.W|, 0.5,
-; R600-NEXT:     ADD T2.Z, KC0[5].W, -T6.W, BS:VEC_102/SCL_221
-; R600-NEXT:     ADD T7.W, KC0[4].Y, -T0.Z,
-; R600-NEXT:     ADD * T3.W, T1.W, T3.W,
-; R600-NEXT:     SETGE T1.X, |PV.W|, 0.5,
-; R600-NEXT:     SETGE T4.Y, |PV.Z|, 0.5,
-; R600-NEXT:     ADD T3.Z, T2.W, T1.Z,
-; R600-NEXT:     BFI_INT T1.W, literal.x, PV.Y, KC0[4].Z,
-; R600-NEXT:     ADD * T2.W, KC0[5].Z, -PV.X,
+; R600-NEXT:     TRUNC * T4.W, KC0[4].W,
+; R600-NEXT:     ADD T1.Y, KC0[4].W, -PV.W,
+; R600-NEXT:     TRUNC T2.Z, KC0[4].Z,
+; R600-NEXT:     TRUNC * T5.W, KC0[5].W,
+; R600-NEXT:     ADD * T6.W, KC0[4].Y, -T1.W,
+; R600-NEXT:     SETGE T0.X, |PV.W|, 0.5,
+; R600-NEXT:     ADD T2.Y, KC0[5].W, -T5.W,
+; R600-NEXT:     ADD T3.Z, KC0[4].Z, -T2.Z,
+; R600-NEXT:     SETGE T6.W, |T1.Y|, 0.5,
+; R600-NEXT:     TRUNC * T7.W, KC0[5].Z,
+; R600-NEXT:     ADD T1.X, KC0[5].Z, -PS,
+; R600-NEXT:     BFI_INT T1.Y, literal.x, PV.W, KC0[4].W,
+; R600-NEXT:     SETGE T3.Z, |PV.Z|, 0.5,
+; R600-NEXT:     SETGE T6.W, |PV.Y|, 0.5,
+; R600-NEXT:     ADD * T8.W, T2.W, T1.Z,
 ; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
-; R600-NEXT:     SETGE T2.X, |PS|, 0.5,
-; R600-NEXT:     ADD T3.Y, T5.W, PV.W,
-; R600-NEXT:     BFI_INT T1.Z, literal.x, PV.Y, KC0[5].W,
-; R600-NEXT:     BFI_INT T1.W, literal.x, PV.X, KC0[4].Y,
-; R600-NEXT:     ADD * T0.W, T0.W, T0.Y,
+; R600-NEXT:     BFI_INT T2.X, literal.x, PV.W, KC0[5].W,
+; R600-NEXT:     BFI_INT T2.Y, literal.x, PV.Z, KC0[4].Z,
+; R600-NEXT:     ADD T8.Z, T4.W, PV.Y,
+; R600-NEXT:     SETGE T2.W, |PV.X|, 0.5,
+; R600-NEXT:     ADD * T0.W, T0.W, T0.Z,
 ; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
-; R600-NEXT:     ADD T3.X, T0.Z, PV.W,
-; R600-NEXT:     ADD T0.Z, T6.W, PV.Z,
-; R600-NEXT:     BFI_INT T1.W, literal.x, PV.X, KC0[5].Z,
-; R600-NEXT:     SETGE * T2.W, |T1.Y|, 0.5,
+; R600-NEXT:     BFI_INT T1.X, literal.x, PV.W, KC0[5].Z,
+; R600-NEXT:     ADD T8.Y, T2.Z, PV.Y,
+; R600-NEXT:     ADD T0.Z, T5.W, PV.X,
+; R600-NEXT:     BFI_INT T2.W, literal.x, T0.X, KC0[4].Y,
+; R600-NEXT:     SETGE * T4.W, |T0.Y|, 0.5,
 ; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
-; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
-; R600-NEXT:     ADD T0.Y, T0.X, PV.W,
-; R600-NEXT:     BFI_INT * T1.W, literal.y, PS, KC0[5].Y,
-; R600-NEXT:    2(2.802597e-45), 2147483647(nan)
-; R600-NEXT:     ADD T0.X, T4.W, PV.W,
-; R600-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; R600-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; R600-NEXT:     LSHR * T2.X, PV.W, literal.x,
+; R600-NEXT:     ADD T8.X, T1.W, PV.W,
+; R600-NEXT:     ADD T0.Y, T7.W, PV.X, BS:VEC_120/SCL_212
+; R600-NEXT:     BFI_INT * T1.W, literal.x, PS, KC0[5].Y,
+; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
+; R600-NEXT:     ADD T0.X, T3.W, PV.W,
+; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
   store <8 x float> %result, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index c41905bdb078e..b62f80606fc42 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1217,7 +1217,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -1228,12 +1228,11 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     BFE_UINT * T1.Y, T0.X, 1, 1,
 ; EG-NEXT:     AND_INT T1.X, T0.X, 1,
+; EG-NEXT:     LSHR * T0.X, T0.X, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v3i1_to_v3i32:
@@ -1334,25 +1333,24 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T0.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XY, T1.X, 1
+; EG-NEXT:    ALU 9, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XY, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT * T2.X, PV.W, 0.0, 1,
-; EG-NEXT:     BFE_INT T3.X, T0.X, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T0.X, 1,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T3.Y, PV.W, 0.0, 1,
+; EG-NEXT:     BFE_INT * T1.X, PV.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T2.X, T0.X, 0.0, 1,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T0.W, T0.X, 1,
+; EG-NEXT:     ADD_INT T0.X, T3.X, literal.x,
+; EG-NEXT:     BFE_INT * T2.Y, PV.W, 0.0, 1,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32:
@@ -1676,7 +1674,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -1687,22 +1685,20 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     BFE_UINT * T6.W, T5.X, literal.x, 1,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT * T6.Z, T5.X, literal.x, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T6.Z, T5.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T5.W, T5.X, literal.y, 1,
+; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
 ; EG-NEXT:     BFE_UINT T6.Y, T5.X, 1, 1,
-; EG-NEXT:     BFE_UINT * T5.W, T5.X, literal.x, 1,
-; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T5.Z, T5.X, literal.x, 1,
+; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T6.X, T5.X, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T5.X, literal.x, 1,
-; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
-; EG-NEXT:    6(8.407791e-45), 2(2.802597e-45)
 ; EG-NEXT:     BFE_UINT * T5.Y, T5.X, literal.x, 1,
 ; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T5.X, T5.X, literal.x, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
+; EG-NEXT:     ADD_INT * T8.X, PS, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
 ; GFX12:       ; %bb.0:
@@ -1860,25 +1856,25 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_INT T6.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
 ; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T5.X, 0.0, 1,
 ; EG-NEXT:     BFE_INT T6.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T7.X, T5.X, 0.0, 1,
+; EG-NEXT:     BFE_INT T6.Y, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T5.X, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T5.X, literal.y,
-; EG-NEXT:    3(4.203895e-45), 5(7.006492e-45)
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     BFE_INT T6.Y, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T5.X, literal.x,
-; EG-NEXT:     BFE_INT T7.W, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T5.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT:    3(4.203895e-45), 4(5.605194e-45)
 ; EG-NEXT:     BFE_INT T6.X, PS, 0.0, 1,
-; EG-NEXT:     BFE_INT T7.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T5.X, 1,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T5.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T7.W, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T7.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T5.X, 1,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T5.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, 1,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32:
 ; GFX12:       ; %bb.0:
@@ -2038,11 +2034,11 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 36, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 31, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T14.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T11.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_16 T7.X, T7.X, 0, #1
@@ -2051,41 +2047,36 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 11:
 ; EG-NEXT:     BFE_UINT * T8.W, T7.X, literal.x, 1,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT * T8.Z, T7.X, literal.x, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T8.Z, T7.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T9.W, T7.X, literal.y, 1,
+; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
 ; EG-NEXT:     BFE_UINT T8.Y, T7.X, 1, 1,
-; EG-NEXT:     BFE_UINT * T9.W, T7.X, literal.x, 1,
-; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T9.Z, T7.X, literal.x, 1,
+; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T8.X, T7.X, 1,
-; EG-NEXT:     BFE_UINT T9.Z, T7.X, literal.x, 1,
-; EG-NEXT:     LSHR * T10.X, KC0[2].Y, literal.y,
-; EG-NEXT:    6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT:     BFE_UINT T9.Y, T7.X, literal.x, 1,
-; EG-NEXT:     BFE_UINT * T11.W, T7.X, literal.y, 1,
-; EG-NEXT:    5(7.006492e-45), 11(1.541428e-44)
+; EG-NEXT:     BFE_UINT * T9.Y, T7.X, literal.x, 1,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T9.X, T7.X, literal.x, 1,
-; EG-NEXT:     BFE_UINT T11.Z, T7.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    4(5.605194e-45), 10(1.401298e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T11.Y, T7.X, literal.y, 1,
-; EG-NEXT:     LSHR * T7.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT:     BFE_UINT T10.W, T7.X, literal.y, 1,
+; EG-NEXT:     LSHR * T11.X, KC0[2].Y, literal.z,
+; EG-NEXT:    4(5.605194e-45), 11(1.541428e-44)
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T10.Z, T7.X, literal.x, 1,
+; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, T11.X, literal.x,
+; EG-NEXT:     BFE_UINT T10.Y, T7.X, literal.y, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T7.W, T7.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:    4(5.605194e-45), 9(1.261169e-44)
 ; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T11.X, T7.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT T10.X, T7.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T7.Z, T7.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T13.X, T11.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT * T7.Y, T7.X, literal.y, 1,
-; EG-NEXT:    2(2.802597e-45), 13(1.821688e-44)
+; EG-NEXT:     BFE_UINT * T7.Y, T7.X, literal.x, 1,
+; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T7.X, T7.X, literal.x, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    12(1.681558e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T14.X, T11.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
 ; GFX12:       ; %bb.0:
@@ -2260,11 +2251,11 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 51, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 47, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T14.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T11.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T9.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T10.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_16 T7.X, T7.X, 0, #1
@@ -2277,52 +2268,48 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
 ; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T8.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T8.Y, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T7.X, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
-; EG-NEXT:    11(1.541428e-44), 5(7.006492e-45)
-; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT:     BFE_INT T8.Y, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T7.X, literal.y,
-; EG-NEXT:     BFE_INT T10.W, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
-; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:    11(1.541428e-44), 4(5.605194e-45)
 ; EG-NEXT:     BFE_INT T8.X, PS, 0.0, 1,
-; EG-NEXT:     BFE_INT T10.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T7.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    9(1.261169e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T11.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T10.Y, PV.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T9.W, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T10.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T9.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T7.X, literal.y,
+; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT:     ADD_INT T11.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T9.Y, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T7.X, literal.y,
 ; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 15(2.101948e-44)
+; EG-NEXT:    4(5.605194e-45), 15(2.101948e-44)
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T10.X, PS, 0.0, 1,
+; EG-NEXT:     BFE_INT T9.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T12.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
 ; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T13.X, T7.X, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T7.X, literal.x,
 ; EG-NEXT:     BFE_INT T12.Z, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T7.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T7.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
 ; EG-NEXT:    3(4.203895e-45), 13(1.821688e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T14.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T12.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T7.X, literal.x,
-; EG-NEXT:     BFE_INT T13.W, PV.Y, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T7.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T14.X, T10.X, literal.x,
+; EG-NEXT:     BFE_INT T12.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T7.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T13.W, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T7.X, literal.z,
+; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T12.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T13.Z, PV.Z, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T7.X, 1,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T7.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T7.X, T10.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT * T13.Y, PV.W, 0.0, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v16i1_to_v16i32:
 ; GFX12:       ; %bb.0:
@@ -2611,15 +2598,15 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 76, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 59, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T24.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T18.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T15.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_32 T11.X, T11.X, 0, #1
@@ -2628,81 +2615,64 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 15:
 ; EG-NEXT:     BFE_UINT * T12.W, T11.X, literal.x, 1,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT * T12.Z, T11.X, literal.x, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T12.Z, T11.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T13.W, T11.X, literal.y, 1,
+; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
 ; EG-NEXT:     BFE_UINT T12.Y, T11.X, 1, 1,
-; EG-NEXT:     BFE_UINT * T13.W, T11.X, literal.x, 1,
-; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T13.Z, T11.X, literal.x, 1,
+; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T12.X, T11.X, 1,
-; EG-NEXT:     BFE_UINT T13.Z, T11.X, literal.x, 1,
-; EG-NEXT:     LSHR * T14.X, KC0[2].Y, literal.y,
-; EG-NEXT:    6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT:     BFE_UINT T13.Y, T11.X, literal.x, 1,
-; EG-NEXT:     BFE_UINT * T15.W, T11.X, literal.y, 1,
-; EG-NEXT:    5(7.006492e-45), 11(1.541428e-44)
+; EG-NEXT:     BFE_UINT * T13.Y, T11.X, literal.x, 1,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T13.X, T11.X, literal.x, 1,
-; EG-NEXT:     BFE_UINT T15.Z, T11.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    4(5.605194e-45), 10(1.401298e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T15.Y, T11.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T17.W, T11.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT:     BFE_UINT T14.W, T11.X, literal.y, 1,
+; EG-NEXT:     LSHR * T15.X, KC0[2].Y, literal.z,
+; EG-NEXT:    4(5.605194e-45), 11(1.541428e-44)
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T14.Z, T11.X, literal.x, 1,
+; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T16.X, T15.X, literal.x,
+; EG-NEXT:     BFE_UINT T14.Y, T11.X, literal.y, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_UINT * T17.W, T11.X, literal.z, 1, BS:VEC_120/SCL_212
+; EG-NEXT:    4(5.605194e-45), 9(1.261169e-44)
 ; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T15.X, T11.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT T14.X, T11.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T17.Z, T11.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T18.X, T15.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T17.Y, T11.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T19.W, T11.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T17.Y, T11.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T19.W, T11.X, literal.y, 1,
+; EG-NEXT:    13(1.821688e-44), 19(2.662467e-44)
 ; EG-NEXT:     BFE_UINT T17.X, T11.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T19.Z, T11.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T20.X, T15.X, literal.x,
 ; EG-NEXT:    12(1.681558e-44), 18(2.522337e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T19.Y, T11.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T21.W, T11.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 17(2.382207e-44)
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T19.Y, T11.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T21.W, T11.X, literal.y, 1,
+; EG-NEXT:    17(2.382207e-44), 23(3.222986e-44)
 ; EG-NEXT:     BFE_UINT T19.X, T11.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T21.Z, T11.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T22.X, T15.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 22(3.082857e-44)
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T21.Y, T11.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T23.W, T11.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T21.Y, T11.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T23.W, T11.X, literal.y, 1,
+; EG-NEXT:    21(2.942727e-44), 27(3.783506e-44)
 ; EG-NEXT:     BFE_UINT T21.X, T11.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T23.Z, T11.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T24.X, T15.X, literal.x,
 ; EG-NEXT:    20(2.802597e-44), 26(3.643376e-44)
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T23.Y, T11.X, literal.y, 1,
-; EG-NEXT:     LSHR * T11.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 25(3.503246e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T23.Y, T11.X, literal.x, 1,
+; EG-NEXT:     LSHR * T11.W, T11.X, literal.y,
+; EG-NEXT:    25(3.503246e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_UINT T23.X, T11.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T11.Z, T11.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T25.X, T15.X, literal.x,
 ; EG-NEXT:    24(3.363116e-44), 30(4.203895e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT * T11.Y, T11.X, literal.y, 1,
-; EG-NEXT:    2(2.802597e-45), 29(4.063766e-44)
+; EG-NEXT:     BFE_UINT * T11.Y, T11.X, literal.x, 1,
+; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T11.X, T11.X, literal.x, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    28(3.923636e-44), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T26.X, T15.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
 ; GFX12:       ; %bb.0:
@@ -3048,132 +3018,118 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ;
 ; EG-LABEL: constant_sextload_v32i1_to_v32i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 0 @14
-; EG-NEXT:    ALU 99, @17, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 5, @117, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @12
+; EG-NEXT:    ALU 94, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T19.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T17.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T17.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T15.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T13.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T14.X, 1
 ; EG-NEXT:    CF_END
-; EG-NEXT:    PAD
-; EG-NEXT:    Fetch clause starting at 14:
+; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_32 T11.X, T11.X, 0, #1
-; EG-NEXT:    ALU clause starting at 16:
+; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 17:
+; EG-NEXT:    ALU clause starting at 15:
 ; EG-NEXT:     LSHR * T0.W, T11.X, literal.x,
 ; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T12.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T11.X, literal.x,
 ; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T12.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T12.Y, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T11.X, literal.y,
-; EG-NEXT:    11(1.541428e-44), 5(7.006492e-45)
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     BFE_INT T12.Y, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; EG-NEXT:     BFE_INT T14.W, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
-; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:    11(1.541428e-44), 4(5.605194e-45)
 ; EG-NEXT:     BFE_INT T12.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T11.X, literal.x,
-; EG-NEXT:     BFE_INT T14.Z, PV.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T13.W, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T13.Z, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T11.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    15(2.101948e-44), 9(1.261169e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T15.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T14.Y, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T1.W, T11.X, literal.z,
+; EG-NEXT:    2(2.802597e-45), 15(2.101948e-44)
+; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T15.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T13.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; EG-NEXT:     BFE_INT T16.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T16.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 14(1.961818e-44)
+; EG-NEXT:    4(5.605194e-45), 14(1.961818e-44)
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T14.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T11.X, literal.x,
+; EG-NEXT:     BFE_INT T13.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T16.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T11.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T11.X, literal.y,
 ; EG-NEXT:    19(2.662467e-44), 13(1.821688e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T17.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T16.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; EG-NEXT:     BFE_INT T18.W, PV.Y, 0.0, 1,
+; EG-NEXT:     ADD_INT T17.X, T14.X, literal.x,
+; EG-NEXT:     BFE_INT T16.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T18.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 18(2.522337e-44)
+; EG-NEXT:    8(1.121039e-44), 18(2.522337e-44)
 ; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T16.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T11.X, literal.x,
 ; EG-NEXT:     BFE_INT T18.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T11.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T11.X, literal.y,
 ; EG-NEXT:    23(3.222986e-44), 17(2.382207e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T19.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T18.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; EG-NEXT:     BFE_INT T20.W, PV.Y, 0.0, 1,
+; EG-NEXT:     ADD_INT T19.X, T14.X, literal.x,
+; EG-NEXT:     BFE_INT T18.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T20.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 22(3.082857e-44)
+; EG-NEXT:    12(1.681558e-44), 22(3.082857e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T18.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T11.X, literal.x,
 ; EG-NEXT:     BFE_INT T20.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T11.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T11.X, literal.y,
 ; EG-NEXT:    27(3.783506e-44), 21(2.942727e-44)
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T21.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T20.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; EG-NEXT:     BFE_INT T22.W, PV.Y, 0.0, 1,
+; EG-NEXT:     ADD_INT T21.X, T14.X, literal.x,
+; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T22.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 26(3.643376e-44)
+; EG-NEXT:    16(2.242078e-44), 26(3.643376e-44)
 ; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T20.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T22.Z, PV.Z, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    25(3.503246e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T23.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, T14.X, literal.y,
+; EG-NEXT:    25(3.503246e-44), 20(2.802597e-44)
 ; EG-NEXT:     BFE_INT T22.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T11.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT:     LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T22.X, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
 ; EG-NEXT:     ASHR * T24.W, T11.X, literal.y,
 ; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T25.X, T11.X, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T11.X, literal.x,
 ; EG-NEXT:     BFE_INT T24.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T11.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T11.X, literal.y,
 ; EG-NEXT:    3(4.203895e-45), 29(4.063766e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T24.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T11.X, literal.x,
-; EG-NEXT:     BFE_INT T25.W, PV.Y, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T11.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T26.X, T14.X, literal.x,
+; EG-NEXT:     BFE_INT T24.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T25.W, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T11.X, literal.z,
+; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T24.X, PS, 0.0, 1,
-; EG-NEXT:     BFE_INT * T25.Z, PV.Z, 0.0, 1,
-; EG-NEXT:    ALU clause starting at 117:
+; EG-NEXT:     BFE_INT T25.Z, PV.Z, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T11.X, 1,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T11.X, T14.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT * T25.Y, PV.W, 0.0, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32:
 ; GFX12:       ; %bb.0:
@@ -3706,8 +3662,8 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
-; EG-NEXT:    ALU 96, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 57, @122, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 89, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 32, @115, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T47.X, 0
@@ -3721,9 +3677,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T32.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 22:
@@ -3733,160 +3689,128 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 25:
 ; EG-NEXT:     BFE_UINT * T19.W, T21.X, literal.x, 1,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT * T19.Z, T21.X, literal.x, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T19.Z, T21.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T20.W, T21.X, literal.y, 1,
+; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
 ; EG-NEXT:     BFE_UINT T19.Y, T21.X, 1, 1,
-; EG-NEXT:     BFE_UINT * T20.W, T21.X, literal.x, 1,
-; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T20.Z, T21.X, literal.x, 1,
+; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T19.X, T21.X, 1,
-; EG-NEXT:     BFE_UINT T20.Z, T21.X, literal.x, 1,
-; EG-NEXT:     LSHR * T22.X, KC0[2].Y, literal.y,
-; EG-NEXT:    6(8.407791e-45), 2(2.802597e-45)
-; EG-NEXT:     BFE_UINT T20.Y, T21.X, literal.x, 1,
-; EG-NEXT:     BFE_UINT * T23.W, T21.X, literal.y, 1,
-; EG-NEXT:    5(7.006492e-45), 11(1.541428e-44)
+; EG-NEXT:     BFE_UINT * T20.Y, T21.X, literal.x, 1,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T20.X, T21.X, literal.x, 1,
-; EG-NEXT:     BFE_UINT T23.Z, T21.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    4(5.605194e-45), 10(1.401298e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T23.Y, T21.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T25.W, T21.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT:     BFE_UINT T22.W, T21.X, literal.y, 1,
+; EG-NEXT:     LSHR * T23.X, KC0[2].Y, literal.z,
+; EG-NEXT:    4(5.605194e-45), 11(1.541428e-44)
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T22.Z, T21.X, literal.x, 1,
+; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T24.X, T23.X, literal.x,
+; EG-NEXT:     BFE_UINT T22.Y, T21.X, literal.y, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_UINT * T25.W, T21.X, literal.z, 1, BS:VEC_120/SCL_212
+; EG-NEXT:    4(5.605194e-45), 9(1.261169e-44)
 ; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T23.X, T21.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT T22.X, T21.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T25.Z, T21.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T26.X, T23.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T25.Y, T21.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T27.W, T21.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T25.Y, T21.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T27.W, T21.X, literal.y, 1,
+; EG-NEXT:    13(1.821688e-44), 19(2.662467e-44)
 ; EG-NEXT:     BFE_UINT T25.X, T21.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T27.Z, T21.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T28.X, T23.X, literal.x,
 ; EG-NEXT:    12(1.681558e-44), 18(2.522337e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T27.Y, T21.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T29.W, T21.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 17(2.382207e-44)
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T27.Y, T21.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T29.W, T21.X, literal.y, 1,
+; EG-NEXT:    17(2.382207e-44), 23(3.222986e-44)
 ; EG-NEXT:     BFE_UINT T27.X, T21.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T29.Z, T21.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T30.X, T23.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 22(3.082857e-44)
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T29.Y, T21.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T31.W, T21.X, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T29.Y, T21.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T31.W, T21.X, literal.y, 1,
+; EG-NEXT:    21(2.942727e-44), 27(3.783506e-44)
 ; EG-NEXT:     BFE_UINT T29.X, T21.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T31.Z, T21.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T32.X, T23.X, literal.x,
 ; EG-NEXT:    20(2.802597e-44), 26(3.643376e-44)
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T31.Y, T21.X, literal.y, 1,
-; EG-NEXT:     LSHR * T33.W, T21.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 25(3.503246e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T31.Y, T21.X, literal.x, 1,
+; EG-NEXT:     LSHR * T33.W, T21.X, literal.y,
+; EG-NEXT:    25(3.503246e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_UINT T31.X, T21.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T33.Z, T21.X, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T34.X, T23.X, literal.x,
 ; EG-NEXT:    24(3.363116e-44), 30(4.203895e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T33.Y, T21.X, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T35.W, T21.Y, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 29(4.063766e-44)
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T33.Y, T21.X, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T35.W, T21.Y, literal.y, 1,
+; EG-NEXT:    29(4.063766e-44), 3(4.203895e-45)
 ; EG-NEXT:     BFE_UINT T33.X, T21.X, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T35.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T21.X, T23.X, literal.x,
 ; EG-NEXT:    28(3.923636e-44), 2(2.802597e-45)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
 ; EG-NEXT:     BFE_UINT T35.Y, T21.Y, 1, 1,
-; EG-NEXT:     BFE_UINT T36.W, T21.Y, literal.y, 1,
-; EG-NEXT:     AND_INT * T35.X, T21.Y, 1,
-; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
+; EG-NEXT:     BFE_UINT * T36.W, T21.Y, literal.x, 1,
+; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T35.X, T21.Y, 1,
 ; EG-NEXT:     BFE_UINT T36.Z, T21.Y, literal.x, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    6(8.407791e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T36.Y, T21.Y, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T38.W, T21.Y, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 5(7.006492e-45)
-; EG-NEXT:    11(1.541428e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T37.X, T23.X, literal.y,
+; EG-NEXT:    6(8.407791e-45), 32(4.484155e-44)
+; EG-NEXT:     BFE_UINT T36.Y, T21.Y, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T38.W, T21.Y, literal.y, 1,
+; EG-NEXT:    5(7.006492e-45), 11(1.541428e-44)
 ; EG-NEXT:     BFE_UINT T36.X, T21.Y, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T38.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T39.X, T23.X, literal.z,
 ; EG-NEXT:    4(5.605194e-45), 10(1.401298e-44)
-; EG-NEXT:    144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 122:
-; EG-NEXT:     LSHR T39.X, T0.W, literal.x,
-; EG-NEXT:     BFE_UINT T38.Y, T21.Y, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T40.W, T21.Y, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
-; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T38.Y, T21.Y, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T40.W, T21.Y, literal.y, 1,
+; EG-NEXT:    9(1.261169e-44), 15(2.101948e-44)
 ; EG-NEXT:     BFE_UINT T38.X, T21.Y, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T40.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T41.X, T23.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 14(1.961818e-44)
-; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T40.Y, T21.Y, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T42.W, T21.Y, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 13(1.821688e-44)
-; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T40.X, T21.Y, literal.x, 1,
-; EG-NEXT:     BFE_UINT T42.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    12(1.681558e-44), 18(2.522337e-44)
-; EG-NEXT:    176(2.466285e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T43.X, PV.W, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T40.Y, T21.Y, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T42.W, T21.Y, literal.y, 1,
+; EG-NEXT:    13(1.821688e-44), 19(2.662467e-44)
+; EG-NEXT:     BFE_UINT * T40.X, T21.Y, literal.x, 1,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 115:
+; EG-NEXT:     BFE_UINT * T42.Z, T21.Y, literal.x, 1,
+; EG-NEXT:    18(2.522337e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T43.X, T23.X, literal.x,
 ; EG-NEXT:     BFE_UINT T42.Y, T21.Y, literal.y, 1,
 ; EG-NEXT:     BFE_UINT * T44.W, T21.Y, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 17(2.382207e-44)
+; EG-NEXT:    44(6.165713e-44), 17(2.382207e-44)
 ; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T42.X, T21.Y, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T44.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T45.X, T23.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 22(3.082857e-44)
-; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T45.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T44.Y, T21.Y, literal.y, 1,
-; EG-NEXT:     BFE_UINT * T46.W, T21.Y, literal.z, 1,
-; EG-NEXT:    2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T44.Y, T21.Y, literal.x, 1,
+; EG-NEXT:     BFE_UINT * T46.W, T21.Y, literal.y, 1,
+; EG-NEXT:    21(2.942727e-44), 27(3.783506e-44)
 ; EG-NEXT:     BFE_UINT T44.X, T21.Y, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T46.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T47.X, T23.X, literal.z,
 ; EG-NEXT:    20(2.802597e-44), 26(3.643376e-44)
-; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T46.Y, T21.Y, literal.y, 1,
-; EG-NEXT:     LSHR * T48.W, T21.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 25(3.503246e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T46.Y, T21.Y, literal.x, 1,
+; EG-NEXT:     LSHR * T48.W, T21.Y, literal.y,
+; EG-NEXT:    25(3.503246e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_UINT T46.X, T21.Y, literal.x, 1,
 ; EG-NEXT:     BFE_UINT T48.Z, T21.Y, literal.y, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T49.X, T23.X, literal.z,
 ; EG-NEXT:    24(3.363116e-44), 30(4.203895e-44)
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT * T48.Y, T21.Y, literal.y, 1,
-; EG-NEXT:    2(2.802597e-45), 29(4.063766e-44)
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T48.Y, T21.Y, literal.x, 1,
+; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T48.X, T21.Y, literal.x, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    28(3.923636e-44), 240(3.363116e-43)
-; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T50.X, T23.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 60(8.407791e-44)
 ;
 ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
 ; GFX12:       ; %bb.0:
@@ -4544,26 +4468,26 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
-; EG-NEXT:    ALU 99, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 98, @125, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 13, @224, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 101, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 85, @127, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T49.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T19.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T46.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T40.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T38.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T35.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T33.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T31.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T21.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_64 T19.XY, T19.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 24:
@@ -4575,215 +4499,189 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; EG-NEXT:     LSHR * T0.W, T19.X, literal.x,
 ; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T20.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T19.X, literal.x,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T20.Y, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T19.X, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T19.X, literal.y,
-; EG-NEXT:    11(1.541428e-44), 5(7.006492e-45)
-; EG-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
-; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.X, literal.y,
-; EG-NEXT:     BFE_INT T22.W, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
-; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:    11(1.541428e-44), 4(5.605194e-45)
 ; EG-NEXT:     BFE_INT T20.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.X, literal.x,
-; EG-NEXT:     BFE_INT T22.Z, PV.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T21.W, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T19.X, literal.x,
+; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T21.Z, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T19.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    15(2.101948e-44), 9(1.261169e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T23.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T22.Y, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T1.W, T19.X, literal.z,
+; EG-NEXT:    2(2.802597e-45), 15(2.101948e-44)
+; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T23.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T21.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T19.X, literal.y,
-; EG-NEXT:     BFE_INT T24.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T24.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 14(1.961818e-44)
+; EG-NEXT:    4(5.605194e-45), 14(1.961818e-44)
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T22.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.X, literal.x,
+; EG-NEXT:     BFE_INT T21.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T24.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.X, literal.y,
 ; EG-NEXT:    19(2.662467e-44), 13(1.821688e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T25.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T24.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.X, literal.y,
-; EG-NEXT:     BFE_INT T26.W, PV.Y, 0.0, 1,
+; EG-NEXT:     ADD_INT T25.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T24.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T19.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T26.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 18(2.522337e-44)
+; EG-NEXT:    8(1.121039e-44), 18(2.522337e-44)
 ; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T24.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.X, literal.x,
 ; EG-NEXT:     BFE_INT T26.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.X, literal.y,
 ; EG-NEXT:    23(3.222986e-44), 17(2.382207e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T27.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T26.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.X, literal.y,
-; EG-NEXT:     BFE_INT T28.W, PV.Y, 0.0, 1,
+; EG-NEXT:     ADD_INT T27.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T26.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T19.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T28.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 22(3.082857e-44)
+; EG-NEXT:    12(1.681558e-44), 22(3.082857e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T26.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.X, literal.x,
 ; EG-NEXT:     BFE_INT T28.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.X, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.X, literal.y,
 ; EG-NEXT:    27(3.783506e-44), 21(2.942727e-44)
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T29.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T28.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.X, literal.y,
-; EG-NEXT:     BFE_INT T30.W, PV.Y, 0.0, 1,
+; EG-NEXT:     ADD_INT T29.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T28.Y, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.Z, T19.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T30.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 26(3.643376e-44)
+; EG-NEXT:    16(2.242078e-44), 26(3.643376e-44)
 ; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T28.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T30.Z, PV.Z, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T19.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    25(3.503246e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T31.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T31.X, T22.X, literal.y,
+; EG-NEXT:    25(3.503246e-44), 20(2.802597e-44)
 ; EG-NEXT:     BFE_INT T30.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.X, literal.y,
-; EG-NEXT:     LSHR T0.W, T19.X, literal.z,
-; EG-NEXT:     ASHR * T32.W, T19.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 30(4.203895e-44)
-; EG-NEXT:    24(3.363116e-44), 31(4.344025e-44)
+; EG-NEXT:     LSHR T0.Z, T19.X, literal.x,
+; EG-NEXT:     LSHR T0.W, T19.X, literal.y,
+; EG-NEXT:     ASHR * T32.W, T19.X, literal.z,
+; EG-NEXT:    30(4.203895e-44), 24(3.363116e-44)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T30.X, PV.W, 0.0, 1,
 ; EG-NEXT:     BFE_INT T32.Z, PV.Z, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T19.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    29(4.063766e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T33.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T33.X, T22.X, literal.y,
+; EG-NEXT:    29(4.063766e-44), 24(3.363116e-44)
 ; EG-NEXT:     BFE_INT T32.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.Y, literal.y,
-; EG-NEXT:     LSHR * T1.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
-; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.X, literal.y,
+; EG-NEXT:    7(9.809089e-45), 28(3.923636e-44)
 ; EG-NEXT:     BFE_INT T32.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.Y, literal.x,
 ; EG-NEXT:     BFE_INT T34.W, PV.W, 0.0, 1,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    6(8.407791e-45), 112(1.569454e-43)
-; EG-NEXT:    ALU clause starting at 125:
-; EG-NEXT:     LSHR T35.X, T0.W, literal.x,
-; EG-NEXT:     LSHR T0.Y, T19.Y, literal.y,
-; EG-NEXT:     BFE_INT T34.Z, T0.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.Y, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 11(1.541428e-44)
-; EG-NEXT:    5(7.006492e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T36.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T34.Y, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T19.Y, literal.x,
+; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T35.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T34.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.y,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.z,
+; EG-NEXT:    28(3.923636e-44), 11(1.541428e-44)
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T36.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T34.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T19.Y, literal.y,
-; EG-NEXT:     BFE_INT T37.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T37.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
+; EG-NEXT:    32(4.484155e-44), 10(1.401298e-44)
 ; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T34.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.Y, literal.x,
-; EG-NEXT:     BFE_INT T37.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:    ALU clause starting at 127:
+; EG-NEXT:     BFE_INT T34.X, T0.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T37.Z, T0.Z, 0.0, 1,
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.y,
 ; EG-NEXT:    15(2.101948e-44), 9(1.261169e-44)
-; EG-NEXT:    144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T38.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T37.Y, PV.W, 0.0, 1,
+; EG-NEXT:     ADD_INT T38.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T37.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T19.Y, literal.y,
-; EG-NEXT:     BFE_INT T39.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T39.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 14(1.961818e-44)
+; EG-NEXT:    36(5.044674e-44), 14(1.961818e-44)
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T37.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.Y, literal.x,
 ; EG-NEXT:     BFE_INT T39.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.y,
 ; EG-NEXT:    19(2.662467e-44), 13(1.821688e-44)
-; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T40.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T39.Y, PV.W, 0.0, 1,
+; EG-NEXT:     ADD_INT T40.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T39.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T19.Y, literal.y,
-; EG-NEXT:     BFE_INT T41.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T41.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 18(2.522337e-44)
+; EG-NEXT:    40(5.605194e-44), 18(2.522337e-44)
 ; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T39.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.Y, literal.x,
 ; EG-NEXT:     BFE_INT T41.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.y,
 ; EG-NEXT:    23(3.222986e-44), 17(2.382207e-44)
-; EG-NEXT:    176(2.466285e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T42.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T41.Y, PV.W, 0.0, 1,
+; EG-NEXT:     ADD_INT T42.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T41.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T19.Y, literal.y,
-; EG-NEXT:     BFE_INT T43.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T43.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 22(3.082857e-44)
+; EG-NEXT:    44(6.165713e-44), 22(3.082857e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T41.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.Y, literal.x,
 ; EG-NEXT:     BFE_INT T43.Z, PV.Z, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T19.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.y,
 ; EG-NEXT:    27(3.783506e-44), 21(2.942727e-44)
-; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T44.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T43.Y, PV.W, 0.0, 1,
+; EG-NEXT:     ADD_INT T44.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T43.Y, PS, 0.0, 1,
 ; EG-NEXT:     LSHR T0.Z, T19.Y, literal.y,
-; EG-NEXT:     BFE_INT T45.W, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T45.W, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR * T0.W, T19.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 26(3.643376e-44)
+; EG-NEXT:    48(6.726233e-44), 26(3.643376e-44)
 ; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T43.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T45.Z, PV.Z, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    25(3.503246e-44), 208(2.914701e-43)
-; EG-NEXT:     LSHR T46.X, PS, literal.x,
+; EG-NEXT:     ADD_INT * T46.X, T22.X, literal.y,
+; EG-NEXT:    25(3.503246e-44), 52(7.286752e-44)
 ; EG-NEXT:     BFE_INT T45.Y, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T19.Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT:     LSHR * T0.W, T19.Y, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T45.X, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.Z, T19.Y, literal.x,
-; EG-NEXT:     LSHR T0.W, T19.X, 1,
-; EG-NEXT:     LSHR * T1.W, T19.Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 3(4.203895e-45)
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T19.X, 1,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T47.X, T19.X, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T19.X, literal.x,
-; EG-NEXT:     LSHR T1.Z, T19.X, literal.y,
-; EG-NEXT:     LSHR T2.W, T19.Y, literal.z,
-; EG-NEXT:     ASHR * T48.W, T19.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 3(4.203895e-45)
-; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_INT T19.X, T19.Y, 0.0, 1,
-; EG-NEXT:     LSHR T1.Y, T19.Y, literal.x,
+; EG-NEXT:     LSHR T0.Y, T19.Y, literal.x,
+; EG-NEXT:     LSHR T0.Z, T19.X, literal.x,
+; EG-NEXT:     LSHR T2.W, T19.Y, literal.y,
+; EG-NEXT:     ASHR * T48.W, T19.Y, literal.z,
+; EG-NEXT:    3(4.203895e-45), 30(4.203895e-44)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T49.X, T19.Y, 0.0, 1,
+; EG-NEXT:     LSHR T1.Y, T19.X, literal.x,
 ; EG-NEXT:     BFE_INT T48.Z, PV.W, 0.0, 1,
 ; EG-NEXT:     BFE_INT T47.W, PV.Z, 0.0, 1,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    29(4.063766e-44), 224(3.138909e-43)
-; EG-NEXT:     LSHR * T49.X, PS, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 224:
-; EG-NEXT:     BFE_INT T48.Y, T1.Y, 0.0, 1,
-; EG-NEXT:     BFE_INT T47.Z, T0.Y, 0.0, 1, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT T19.W, T1.W, 0.0, 1,
-; EG-NEXT:     LSHR * T1.W, T19.Y, literal.x,
-; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T2.W, T19.Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 29(4.063766e-44)
+; EG-NEXT:     ADD_INT T19.X, T22.X, literal.x,
+; EG-NEXT:     BFE_INT T48.Y, PS, 0.0, 1,
+; EG-NEXT:     BFE_INT T47.Z, PV.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T49.W, T0.Y, 0.0, 1,
+; EG-NEXT:     LSHR * T2.W, T19.Y, literal.y,
+; EG-NEXT:    56(7.847271e-44), 28(3.923636e-44)
 ; EG-NEXT:     BFE_INT T48.X, PS, 0.0, 1,
-; EG-NEXT:     BFE_INT T47.Y, T0.W, 0.0, 1,
-; EG-NEXT:     BFE_INT T19.Z, T0.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T47.Y, T1.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T49.Z, T0.W, 0.0, 1, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR T0.W, T19.Y, 1,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T50.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T19.Y, PV.W, 0.0, 1,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T50.X, T22.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T49.Y, PV.W, 0.0, 1,
 ;
 ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32:
 ; GFX12:       ; %bb.0:
@@ -5658,7 +5556,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -5674,11 +5572,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T0.Y, 0.0,
 ; EG-NEXT:     MOV * T1.W, 0.0,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v3i1_to_v3i64:
 ; GFX12:       ; %bb.0:
@@ -5811,7 +5708,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -5822,19 +5719,17 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     BFE_INT T1.X, T0.X, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T0.X, 1,
-; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T0.X, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T0.X, PS, 0.0, 1,
 ; EG-NEXT:     BFE_INT T1.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T0.X, PV.W, 0.0, 1,
-; EG-NEXT:     MOV T1.Y, T1.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     MOV T0.Y, PV.X,
+; EG-NEXT:     MOV * T1.Y, T1.X,
+; EG-NEXT:     ADD_INT T3.X, T2.X, literal.x,
+; EG-NEXT:     MOV T0.Y, T0.X, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T1.W, T1.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v3i1_to_v3i64:
 ; GFX12:       ; %bb.0:
@@ -5945,7 +5840,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 13, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -5964,11 +5859,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV T0.Y, 0.0,
 ; EG-NEXT:     MOV T1.W, 0.0,
 ; EG-NEXT:     MOV * T0.W, 0.0,
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v4i1_to_v4i64:
 ; GFX12:       ; %bb.0:
@@ -6119,7 +6013,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
@@ -6139,13 +6033,12 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV T2.Y, PV.X,
 ; EG-NEXT:     BFE_INT * T1.Z, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     MOV T1.Y, T1.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T1.Y, T1.X,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PV.X, literal.x,
 ; EG-NEXT:     MOV T1.W, T1.Z,
 ; EG-NEXT:     MOV * T2.W, T2.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v4i1_to_v4i64:
 ; GFX12:       ; %bb.0:
@@ -6289,7 +6182,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 30, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 26, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
@@ -6320,17 +6213,13 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV * T7.W, 0.0,
 ; EG-NEXT:     MOV T8.W, 0.0,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T9.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T10.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T11.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T12.X, T9.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v8i1_to_v8i64:
 ; GFX12:       ; %bb.0:
@@ -6503,55 +6392,53 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 37, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 35, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T11.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T6.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T10.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T5.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T0.W, T5.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 7(9.809089e-45)
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T6.Z, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T6.X, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T7.X, T5.X, 0.0, 1,
 ; EG-NEXT:     BFE_INT T8.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T5.X, literal.x,
-; EG-NEXT:     LSHR * T1.W, T5.X, literal.y,
-; EG-NEXT:    3(4.203895e-45), 6(8.407791e-45)
-; EG-NEXT:     BFE_INT T8.X, PS, 0.0, 1,
-; EG-NEXT:     BFE_INT T9.Z, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T8.X, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T5.X, 1,
-; EG-NEXT:     LSHR * T1.W, T5.X, literal.x,
+; EG-NEXT:     LSHR * T9.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T9.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T8.Y, PV.X,
 ; EG-NEXT:     BFE_INT T7.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T5.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    5(7.006492e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T10.X, PS, literal.x,
-; EG-NEXT:     MOV T9.Y, PV.X,
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T10.X, T9.X, literal.x,
+; EG-NEXT:     MOV T6.Y, T6.X, BS:VEC_120/SCL_212
 ; EG-NEXT:     BFE_INT T5.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T0.W, T5.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT:     LSHR * T0.W, T5.X, literal.x, BS:VEC_201
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T5.X, PV.W, 0.0, 1,
 ; EG-NEXT:     MOV T7.Y, T7.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T11.X, T9.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T5.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
 ; EG-NEXT:     MOV T7.W, T7.Z,
-; EG-NEXT:     MOV * T9.W, T9.Z,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T12.X, PV.Z, literal.x,
+; EG-NEXT:     MOV * T6.W, T6.Z,
+; EG-NEXT:     ADD_INT T12.X, T9.X, literal.x,
 ; EG-NEXT:     MOV T5.W, T5.Z,
 ; EG-NEXT:     MOV * T8.W, T8.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v8i1_to_v8i64:
 ; GFX12:       ; %bb.0:
@@ -6788,7 +6675,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 62, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 52, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
@@ -6843,29 +6730,19 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV * T13.W, 0.0,
 ; EG-NEXT:     MOV T14.W, 0.0,
 ; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     LSHR T15.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T22.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T15.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T18.X, T15.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T15.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T20.X, T15.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T15.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT * T22.X, T15.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
 ; GFX12:       ; %bb.0:
@@ -7218,7 +7095,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 78, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 68, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0
@@ -7233,33 +7110,26 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T0.W, T7.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 15(2.101948e-44)
+; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T9.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T10.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T11.X, T8.X, literal.x,
+; EG-NEXT:     ADD_INT * T12.X, T8.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T13.X, T7.X, 0.0, 1,
 ; EG-NEXT:     BFE_INT T14.Z, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T7.X, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
 ; EG-NEXT:    11(1.541428e-44), 14(1.961818e-44)
 ; EG-NEXT:     BFE_INT T14.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T0.Y, T7.X, literal.x,
 ; EG-NEXT:     BFE_INT T15.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T7.X, literal.y,
-; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
-; EG-NEXT:    12(1.681558e-44), 7(9.809089e-45)
-; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.W, T7.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
+; EG-NEXT:    7(9.809089e-45), 10(1.401298e-44)
 ; EG-NEXT:     BFE_INT T15.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T14.Y, PV.X,
 ; EG-NEXT:     BFE_INT T16.Z, PV.W, 0.0, 1,
@@ -7276,42 +7146,39 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV T16.Y, PV.X,
 ; EG-NEXT:     BFE_INT T13.Z, PV.W, 0.0, 1,
 ; EG-NEXT:     LSHR T0.W, T7.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    5(7.006492e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T18.X, PS, literal.x,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 5(7.006492e-45)
+; EG-NEXT:     ADD_INT T18.X, T8.X, literal.x,
 ; EG-NEXT:     MOV T17.Y, PV.X,
-; EG-NEXT:     BFE_INT T19.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T7.X, literal.y,
-; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT:     BFE_INT T19.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR T1.W, T7.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T7.X, literal.z,
+; EG-NEXT:    20(2.802597e-44), 9(1.261169e-44)
 ; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T19.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T13.Y, T13.X,
 ; EG-NEXT:     BFE_INT T7.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
+; EG-NEXT:     LSHR T1.W, T7.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T7.X, literal.y,
 ; EG-NEXT:    13(1.821688e-44), 8(1.121039e-44)
 ; EG-NEXT:     BFE_INT T7.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T19.Y, PV.X,
 ; EG-NEXT:     BFE_INT T20.Z, PV.W, 0.0, 1,
 ; EG-NEXT:     MOV T13.W, T13.Z,
 ; EG-NEXT:     MOV * T17.W, T17.Z,
-; EG-NEXT:     BFE_INT T20.X, T0.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T20.X, T0.W, 0.0, 1,
 ; EG-NEXT:     MOV T7.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
 ; EG-NEXT:     MOV T19.W, T19.Z,
 ; EG-NEXT:     MOV * T16.W, T16.Z,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T21.X, PV.Z, literal.x,
+; EG-NEXT:     ADD_INT T21.X, T8.X, literal.x,
 ; EG-NEXT:     MOV T20.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
 ; EG-NEXT:     MOV T7.W, T7.Z,
 ; EG-NEXT:     MOV * T15.W, T15.Z,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T22.X, PV.Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T22.X, T8.X, literal.x,
 ; EG-NEXT:     MOV T20.W, T20.Z,
 ; EG-NEXT:     MOV * T14.W, T14.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v16i1_to_v16i64:
 ; GFX12:       ; %bb.0:
@@ -7725,7 +7592,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
 ; EG-NEXT:    ALU 96, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 30, @122, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 7, @122, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T41.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T40.X, 0
@@ -7829,55 +7696,32 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV * T25.W, 0.0,
 ; EG-NEXT:     MOV T26.W, 0.0,
 ; EG-NEXT:     MOV * T11.W, 0.0,
-; EG-NEXT:     LSHR T27.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR * T32.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T27.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T28.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T29.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T30.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T31.X, T27.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T32.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T33.X, T27.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T34.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T35.X, T27.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T36.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T37.X, T27.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
 ; EG-NEXT:    ALU clause starting at 122:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR * T42.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T38.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T39.X, T27.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T40.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T41.X, T27.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT * T42.X, T27.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64:
 ; GFX12:       ; %bb.0:
@@ -8470,8 +8314,8 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
-; EG-NEXT:    ALU 92, @25, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 65, @118, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 97, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 39, @123, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T41.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T34.X, 0
@@ -8495,166 +8339,145 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 24:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 25:
-; EG-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.Y, T11.X, literal.y,
-; EG-NEXT:     LSHR T0.Z, T11.X, literal.z,
-; EG-NEXT:     LSHR * T0.W, T11.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 28(3.923636e-44)
-; EG-NEXT:    29(4.063766e-44), 24(3.363116e-44)
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T1.Y, T11.X, literal.y,
-; EG-NEXT:     LSHR T1.Z, T11.X, literal.z,
-; EG-NEXT:     LSHR * T1.W, T11.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 25(3.503246e-44)
-; EG-NEXT:    20(2.802597e-44), 21(2.942727e-44)
-; EG-NEXT:     LSHR * T2.W, T11.X, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T12.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T13.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T14.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T15.X, T12.X, literal.x,
+; EG-NEXT:     ADD_INT * T16.X, T12.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T17.X, T12.X, literal.x,
+; EG-NEXT:     ADD_INT * T18.X, T12.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T19.X, T12.X, literal.x,
+; EG-NEXT:     ADD_INT * T20.X, T12.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T21.X, T12.X, literal.x,
+; EG-NEXT:     ADD_INT * T22.X, T12.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T23.X, T12.X, literal.x,
+; EG-NEXT:     LSHR T0.W, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T11.X, literal.z,
+; EG-NEXT:    44(6.165713e-44), 28(3.923636e-44)
+; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T24.X, T12.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T0.Z, T11.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T11.X, literal.w, BS:VEC_120/SCL_212
+; EG-NEXT:    48(6.726233e-44), 24(3.363116e-44)
+; EG-NEXT:    25(3.503246e-44), 20(2.802597e-44)
+; EG-NEXT:     LSHR * T3.W, T11.X, literal.x,
+; EG-NEXT:    21(2.942727e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T25.X, T11.X, 0.0, 1,
-; EG-NEXT:     LSHR T2.Y, T11.X, literal.x,
+; EG-NEXT:     LSHR T1.Y, T11.X, literal.x,
 ; EG-NEXT:     ASHR T26.Z, T11.X, literal.y,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.z,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.w,
-; EG-NEXT:    17(2.382207e-44), 31(4.344025e-44)
+; EG-NEXT:     LSHR T4.W, T11.X, literal.z,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.w,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:    27(3.783506e-44), 30(4.203895e-44)
 ; EG-NEXT:     BFE_INT T26.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T3.Y, T11.X, literal.x,
+; EG-NEXT:     LSHR T2.Y, T11.X, literal.x,
 ; EG-NEXT:     BFE_INT T27.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.y,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.z,
-; EG-NEXT:    12(1.681558e-44), 23(3.222986e-44)
+; EG-NEXT:     LSHR T4.W, T11.X, literal.y,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.z,
+; EG-NEXT:    17(2.382207e-44), 23(3.222986e-44)
 ; EG-NEXT:    26(3.643376e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T27.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T26.Y, PV.X,
 ; EG-NEXT:     BFE_INT T28.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.y,
+; EG-NEXT:     LSHR T4.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.y,
 ; EG-NEXT:    19(2.662467e-44), 22(3.082857e-44)
 ; EG-NEXT:     BFE_INT T28.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T27.Y, PV.X,
 ; EG-NEXT:     BFE_INT T29.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.y,
+; EG-NEXT:     LSHR T4.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.y,
 ; EG-NEXT:    15(2.101948e-44), 18(2.522337e-44)
 ; EG-NEXT:     BFE_INT T29.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T28.Y, PV.X,
 ; EG-NEXT:     BFE_INT T30.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.y,
+; EG-NEXT:     LSHR T4.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.y,
 ; EG-NEXT:    11(1.541428e-44), 14(1.961818e-44)
 ; EG-NEXT:     BFE_INT T30.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T29.Y, PV.X,
 ; EG-NEXT:     BFE_INT T31.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.y,
+; EG-NEXT:     LSHR T4.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.y,
 ; EG-NEXT:    7(9.809089e-45), 10(1.401298e-44)
 ; EG-NEXT:     BFE_INT T31.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T30.Y, PV.X,
 ; EG-NEXT:     BFE_INT T32.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.y,
+; EG-NEXT:     LSHR T4.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.y,
 ; EG-NEXT:    3(4.203895e-45), 6(8.407791e-45)
-; EG-NEXT:    ALU clause starting at 118:
-; EG-NEXT:     BFE_INT T32.X, T4.W, 0.0, 1,
-; EG-NEXT:     MOV T31.Y, T31.X,
-; EG-NEXT:     BFE_INT T33.Z, T3.W, 0.0, 1, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T3.W, T11.X, 1, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.x,
+; EG-NEXT:     BFE_INT T32.X, PS, 0.0, 1,
+; EG-NEXT:     MOV T31.Y, PV.X,
+; EG-NEXT:     BFE_INT T33.Z, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR T4.W, T11.X, 1,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T33.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T32.Y, PV.X,
 ; EG-NEXT:     BFE_INT T25.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    5(7.006492e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T34.X, PS, literal.x,
+; EG-NEXT:     LSHR T4.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 5(7.006492e-45)
+; EG-NEXT:     ADD_INT T34.X, T12.X, literal.x,
 ; EG-NEXT:     MOV T33.Y, PV.X,
-; EG-NEXT:     BFE_INT T35.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.y,
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
+; EG-NEXT:     BFE_INT T35.Z, PS, 0.0, 1,
+; EG-NEXT:     LSHR T5.W, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT:    52(7.286752e-44), 9(1.261169e-44)
 ; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T35.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T25.Y, T25.X,
 ; EG-NEXT:     BFE_INT T11.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T4.W, T11.X, literal.y,
-; EG-NEXT:    13(1.821688e-44), 8(1.121039e-44)
-; EG-NEXT:     BFE_INT T11.X, PS, 0.0, 1,
-; EG-NEXT:     MOV T35.Y, PV.X,
-; EG-NEXT:     BFE_INT T36.Z, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR * T5.W, T11.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 123:
+; EG-NEXT:     LSHR * T6.W, T11.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T11.X, PV.W, 0.0, 1,
+; EG-NEXT:     MOV T35.Y, T35.X,
+; EG-NEXT:     BFE_INT T36.Z, T5.W, 0.0, 1,
 ; EG-NEXT:     MOV T25.W, T25.Z,
 ; EG-NEXT:     MOV * T33.W, T33.Z,
-; EG-NEXT:     BFE_INT T36.X, T3.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T36.X, T4.W, 0.0, 1,
 ; EG-NEXT:     MOV T11.Y, PV.X,
-; EG-NEXT:     BFE_INT T37.Z, T2.Y, 0.0, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T37.Z, T2.Y, 0.0, 1,
 ; EG-NEXT:     MOV T35.W, T35.Z,
 ; EG-NEXT:     MOV * T32.W, T32.Z,
-; EG-NEXT:     BFE_INT T37.X, T2.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T37.X, T1.Y, 0.0, 1,
 ; EG-NEXT:     MOV T36.Y, PV.X,
-; EG-NEXT:     BFE_INT T38.Z, T1.W, 0.0, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T38.Z, T3.W, 0.0, 1,
 ; EG-NEXT:     MOV T11.W, T11.Z,
 ; EG-NEXT:     MOV * T31.W, T31.Z,
-; EG-NEXT:     BFE_INT T38.X, T1.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T38.X, T2.W, 0.0, 1,
 ; EG-NEXT:     MOV T37.Y, PV.X,
-; EG-NEXT:     BFE_INT T39.Z, T1.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T39.Z, T0.Z, 0.0, 1,
 ; EG-NEXT:     MOV T36.W, T36.Z, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T30.W, T30.Z,
-; EG-NEXT:     BFE_INT T39.X, T0.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T39.X, T0.Y, 0.0, 1,
 ; EG-NEXT:     MOV T38.Y, PV.X,
-; EG-NEXT:     BFE_INT T40.Z, T0.Z, 0.0, 1,
-; EG-NEXT:     MOV T37.W, T37.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T40.Z, T1.W, 0.0, 1,
+; EG-NEXT:     MOV T37.W, T37.Z,
 ; EG-NEXT:     MOV * T29.W, T29.Z,
-; EG-NEXT:     BFE_INT T40.X, T0.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T40.X, T0.W, 0.0, 1,
 ; EG-NEXT:     MOV T39.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
 ; EG-NEXT:     MOV T38.W, T38.Z,
 ; EG-NEXT:     MOV * T28.W, T28.Z,
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T41.X, PV.Z, literal.x,
+; EG-NEXT:     ADD_INT T41.X, T12.X, literal.x,
 ; EG-NEXT:     MOV T40.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
 ; EG-NEXT:     MOV T39.W, T39.Z,
 ; EG-NEXT:     MOV * T27.W, T27.Z,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR T42.X, PV.Z, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T42.X, T12.X, literal.x,
 ; EG-NEXT:     MOV T40.W, T40.Z,
 ; EG-NEXT:     MOV * T26.W, T26.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64:
 ; GFX12:       ; %bb.0:
@@ -9444,7 +9267,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; EG-NEXT:    TEX 0 @38
 ; EG-NEXT:    ALU 95, @41, KC0[], KC1[]
 ; EG-NEXT:    ALU 99, @137, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 60, @237, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 13, @237, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T82.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T81.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T80.X, 0
@@ -9645,103 +9468,56 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV * T49.W, 0.0,
 ; EG-NEXT:     MOV T50.W, 0.0,
 ; EG-NEXT:     MOV * T20.W, 0.0,
-; EG-NEXT:     LSHR T51.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T52.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T54.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T55.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T56.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T57.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T58.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T59.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T60.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T61.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR * T62.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T51.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T52.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T53.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T54.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T55.X, T51.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T56.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T57.X, T51.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T58.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T59.X, T51.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T60.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T61.X, T51.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T62.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T63.X, T51.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T64.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T65.X, T51.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT T66.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T67.X, T51.X, literal.y,
+; EG-NEXT:    60(8.407791e-44), 64(8.968310e-44)
+; EG-NEXT:     ADD_INT T68.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T69.X, T51.X, literal.y,
+; EG-NEXT:    68(9.528830e-44), 72(1.008935e-43)
+; EG-NEXT:     ADD_INT T70.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T71.X, T51.X, literal.y,
+; EG-NEXT:    76(1.064987e-43), 80(1.121039e-43)
+; EG-NEXT:     ADD_INT T72.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T73.X, T51.X, literal.y,
+; EG-NEXT:    84(1.177091e-43), 88(1.233143e-43)
 ; EG-NEXT:    ALU clause starting at 237:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T63.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T64.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T65.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR T66.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 256(3.587324e-43)
-; EG-NEXT:     LSHR T67.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 272(3.811532e-43)
-; EG-NEXT:     LSHR T68.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 288(4.035740e-43)
-; EG-NEXT:     LSHR T69.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 304(4.259947e-43)
-; EG-NEXT:     LSHR T70.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 320(4.484155e-43)
-; EG-NEXT:     LSHR T71.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 336(4.708363e-43)
-; EG-NEXT:     LSHR T72.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 352(4.932571e-43)
-; EG-NEXT:     LSHR T73.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 368(5.156778e-43)
-; EG-NEXT:     LSHR T74.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 384(5.380986e-43)
-; EG-NEXT:     LSHR T75.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 400(5.605194e-43)
-; EG-NEXT:     LSHR T76.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 416(5.829402e-43)
-; EG-NEXT:     LSHR T77.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 432(6.053609e-43)
-; EG-NEXT:     LSHR T78.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 448(6.277817e-43)
-; EG-NEXT:     LSHR T79.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 464(6.502025e-43)
-; EG-NEXT:     LSHR T80.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 480(6.726233e-43)
-; EG-NEXT:     LSHR T81.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 496(6.950440e-43)
-; EG-NEXT:     LSHR * T82.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T74.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T75.X, T51.X, literal.y,
+; EG-NEXT:    92(1.289195e-43), 96(1.345247e-43)
+; EG-NEXT:     ADD_INT T76.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T77.X, T51.X, literal.y,
+; EG-NEXT:    100(1.401298e-43), 104(1.457350e-43)
+; EG-NEXT:     ADD_INT T78.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T79.X, T51.X, literal.y,
+; EG-NEXT:    108(1.513402e-43), 112(1.569454e-43)
+; EG-NEXT:     ADD_INT T80.X, T51.X, literal.x,
+; EG-NEXT:     ADD_INT * T81.X, T51.X, literal.y,
+; EG-NEXT:    116(1.625506e-43), 120(1.681558e-43)
+; EG-NEXT:     ADD_INT * T82.X, T51.X, literal.x,
+; EG-NEXT:    124(1.737610e-43), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64:
 ; GFX12:       ; %bb.0:
@@ -10930,11 +10706,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ;
 ; EG-LABEL: constant_sextload_v64i1_to_v64i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 22, @40, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @40, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @38
-; EG-NEXT:    ALU 89, @63, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 99, @153, KC0[], KC1[]
-; EG-NEXT:    ALU 107, @253, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 90, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 100, @132, KC0[], KC1[]
+; EG-NEXT:    ALU 78, @233, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T82.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T80.XYZW, T81.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T73.X, 0
@@ -10946,355 +10722,306 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T43.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T76.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T55.XYZW, T41.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T75.XYZW, T39.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T38.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T74.XYZW, T37.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T35.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T34.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T72.XYZW, T33.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T32.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T71.XYZW, T31.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T30.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T70.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T69.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T68.XYZW, T25.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T23.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T67.XYZW, T21.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T20.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T19.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T75.XYZW, T40.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T74.XYZW, T38.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T37.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T36.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T35.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T72.XYZW, T34.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T33.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T71.XYZW, T32.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T31.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T70.XYZW, T30.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T29.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T69.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T68.XYZW, T26.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T67.XYZW, T22.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T20.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 38:
-; EG-NEXT:     VTX_READ_64 T40.XY, T26.X, 0, #1
+; EG-NEXT:     VTX_READ_64 T19.XY, T19.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 40:
-; EG-NEXT:     LSHR T19.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T26.X, KC0[2].Z,
+; EG-NEXT:     MOV * T19.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 41:
+; EG-NEXT:     LSHR * T20.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 63:
-; EG-NEXT:     LSHR T26.X, T0.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 256(3.587324e-43)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 272(3.811532e-43)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 288(4.035740e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 304(4.259947e-43)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 320(4.484155e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 336(4.708363e-43)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 352(4.932571e-43)
-; EG-NEXT:     LSHR T42.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.Z, T40.Y, literal.y,
-; EG-NEXT:     LSHR T0.W, T40.Y, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 28(3.923636e-44)
-; EG-NEXT:    29(4.063766e-44), 368(5.156778e-43)
-; EG-NEXT:     LSHR T43.X, PS, literal.x,
-; EG-NEXT:     LSHR T0.Y, T40.Y, literal.y,
-; EG-NEXT:     LSHR T1.Z, T40.Y, literal.z,
-; EG-NEXT:     LSHR * T1.W, T40.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    25(3.503246e-44), 20(2.802597e-44)
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    384(5.380986e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T1.Y, T40.Y, literal.y,
-; EG-NEXT:     LSHR T2.Z, T40.Y, literal.z,
-; EG-NEXT:     LSHR * T2.W, T40.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 21(2.942727e-44)
-; EG-NEXT:    16(2.242078e-44), 17(2.382207e-44)
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.x,
-; EG-NEXT:    400(5.605194e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T45.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T2.Y, T40.Y, literal.y,
-; EG-NEXT:     LSHR T3.Z, T40.Y, literal.z,
-; EG-NEXT:     LSHR * T3.W, T40.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T21.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T22.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T23.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T24.X, T20.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T25.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T26.X, T20.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T27.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T28.X, T20.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T29.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T30.X, T20.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T31.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T32.X, T20.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T33.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T34.X, T20.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT T35.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T36.X, T20.X, literal.y,
+; EG-NEXT:    60(8.407791e-44), 64(8.968310e-44)
+; EG-NEXT:     ADD_INT T37.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T38.X, T20.X, literal.y,
+; EG-NEXT:    68(9.528830e-44), 72(1.008935e-43)
+; EG-NEXT:     ADD_INT T39.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T40.X, T20.X, literal.y,
+; EG-NEXT:    76(1.064987e-43), 80(1.121039e-43)
+; EG-NEXT:     ADD_INT T41.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T42.X, T20.X, literal.y,
+; EG-NEXT:    84(1.177091e-43), 88(1.233143e-43)
+; EG-NEXT:     ADD_INT T43.X, T20.X, literal.x,
+; EG-NEXT:     ADD_INT * T44.X, T20.X, literal.y,
+; EG-NEXT:    92(1.289195e-43), 96(1.345247e-43)
+; EG-NEXT:     LSHR T0.Y, T19.Y, literal.x,
+; EG-NEXT:     LSHR T0.Z, T19.Y, literal.y,
+; EG-NEXT:     LSHR T0.W, T19.Y, literal.z,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.w,
+; EG-NEXT:    28(3.923636e-44), 29(4.063766e-44)
+; EG-NEXT:    24(3.363116e-44), 25(3.503246e-44)
+; EG-NEXT:     ADD_INT T45.X, T20.X, literal.x,
+; EG-NEXT:     LSHR T1.Y, T19.Y, literal.y,
+; EG-NEXT:     LSHR T1.Z, T19.Y, literal.z,
+; EG-NEXT:     LSHR * T2.W, T19.Y, literal.w,
+; EG-NEXT:    100(1.401298e-43), 20(2.802597e-44)
+; EG-NEXT:    21(2.942727e-44), 16(2.242078e-44)
+; EG-NEXT:     LSHR * T3.W, T19.Y, literal.x,
+; EG-NEXT:    17(2.382207e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T46.X, T20.X, literal.x,
+; EG-NEXT:     LSHR T2.Y, T19.Y, literal.y,
+; EG-NEXT:     LSHR T2.Z, T19.Y, literal.z,
+; EG-NEXT:     LSHR * T4.W, T19.Y, literal.w,
+; EG-NEXT:    104(1.457350e-43), 12(1.681558e-44)
 ; EG-NEXT:    13(1.821688e-44), 8(1.121039e-44)
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
-; EG-NEXT:    416(5.829402e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T3.Y, T40.Y, literal.y,
-; EG-NEXT:     LSHR T4.Z, T40.Y, literal.z,
-; EG-NEXT:     LSHR * T4.W, T40.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
-; EG-NEXT:    4(5.605194e-45), 5(7.006492e-45)
-; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.x,
-; EG-NEXT:    432(6.053609e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T4.Y, KC0[2].Y, literal.y,
-; EG-NEXT:     LSHR T5.Z, T40.Y, 1,
-; EG-NEXT:     LSHR T5.W, T40.X, literal.z,
-; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 464(6.502025e-43)
-; EG-NEXT:    28(3.923636e-44), 448(6.277817e-43)
-; EG-NEXT:    ALU clause starting at 153:
-; EG-NEXT:     LSHR T48.X, T6.W, literal.x,
-; EG-NEXT:     LSHR T5.Y, T40.X, literal.y,
-; EG-NEXT:     LSHR T6.Z, T40.X, literal.z,
-; EG-NEXT:     LSHR * T6.W, T40.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 29(4.063766e-44)
+; EG-NEXT:     LSHR * T5.W, T19.Y, literal.x,
+; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T47.X, T20.X, literal.x,
+; EG-NEXT:     LSHR T3.Y, T19.Y, literal.y,
+; EG-NEXT:     LSHR T3.Z, T19.Y, literal.z,
+; EG-NEXT:     LSHR T6.W, T19.Y, 1,
+; EG-NEXT:     LSHR * T7.W, T19.X, literal.w,
+; EG-NEXT:    108(1.513402e-43), 4(5.605194e-45)
+; EG-NEXT:    5(7.006492e-45), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T48.X, T20.X, literal.x,
+; EG-NEXT:     LSHR T4.Y, T19.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T4.Z, T19.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T8.W, T19.X, literal.w, BS:VEC_120/SCL_212
+; EG-NEXT:    112(1.569454e-43), 29(4.063766e-44)
 ; EG-NEXT:    24(3.363116e-44), 25(3.503246e-44)
-; EG-NEXT:     LSHR * T7.W, T40.X, literal.x,
+; EG-NEXT:     LSHR * T9.W, T19.X, literal.x,
 ; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T49.X, T40.X, 0.0, 1,
-; EG-NEXT:     LSHR T6.Y, T40.X, literal.x,
-; EG-NEXT:     ASHR T50.Z, T40.Y, literal.y,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.z,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.w,
+; EG-NEXT:     BFE_INT T49.X, T19.X, 0.0, 1,
+; EG-NEXT:     LSHR T5.Y, T19.X, literal.x,
+; EG-NEXT:     ASHR T50.Z, T19.Y, literal.y,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.z,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.w,
 ; EG-NEXT:    21(2.942727e-44), 31(4.344025e-44)
 ; EG-NEXT:    27(3.783506e-44), 30(4.203895e-44)
 ; EG-NEXT:     BFE_INT T50.X, PS, 0.0, 1,
-; EG-NEXT:     LSHR T7.Y, T40.X, literal.x,
+; EG-NEXT:     LSHR T6.Y, T19.X, literal.x,
 ; EG-NEXT:     BFE_INT T51.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.y,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.z,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.y,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 23(3.222986e-44)
 ; EG-NEXT:    26(3.643376e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T51.X, PS, 0.0, 1,
-; EG-NEXT:     MOV T50.Y, PV.X,
-; EG-NEXT:     BFE_INT T52.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.x,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.y,
+; EG-NEXT:     MOV * T50.Y, PV.X,
+; EG-NEXT:    ALU clause starting at 132:
+; EG-NEXT:     BFE_INT T52.Z, T10.W, 0.0, 1,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.y,
 ; EG-NEXT:    19(2.662467e-44), 22(3.082857e-44)
 ; EG-NEXT:     BFE_INT T52.X, PS, 0.0, 1,
-; EG-NEXT:     MOV T51.Y, PV.X,
+; EG-NEXT:     MOV T51.Y, T51.X,
 ; EG-NEXT:     BFE_INT T53.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.x,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.y,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.y,
 ; EG-NEXT:    15(2.101948e-44), 18(2.522337e-44)
 ; EG-NEXT:     BFE_INT T53.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T52.Y, PV.X,
 ; EG-NEXT:     BFE_INT T54.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.x,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.y,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.y,
 ; EG-NEXT:    11(1.541428e-44), 14(1.961818e-44)
 ; EG-NEXT:     BFE_INT T54.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T53.Y, PV.X,
 ; EG-NEXT:     BFE_INT T55.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.x,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.y,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.y,
 ; EG-NEXT:    7(9.809089e-45), 10(1.401298e-44)
 ; EG-NEXT:     BFE_INT T55.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T54.Y, PV.X,
 ; EG-NEXT:     BFE_INT T56.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.Y, literal.x,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.y,
+; EG-NEXT:     LSHR T10.W, T19.Y, literal.x,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.y,
 ; EG-NEXT:    3(4.203895e-45), 6(8.407791e-45)
 ; EG-NEXT:     BFE_INT T56.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T55.Y, PV.X,
 ; EG-NEXT:     BFE_INT T57.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T8.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T9.W, T40.Y, literal.y,
+; EG-NEXT:     LSHR T10.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T11.W, T19.Y, literal.y,
 ; EG-NEXT:    17(2.382207e-44), 2(2.802597e-45)
 ; EG-NEXT:     BFE_INT T57.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T56.Y, PV.X,
-; EG-NEXT:     ASHR T58.Z, T40.X, literal.x,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.y,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.z,
+; EG-NEXT:     ASHR T58.Z, T19.X, literal.x,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.y,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.z,
 ; EG-NEXT:    31(4.344025e-44), 27(3.783506e-44)
 ; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T58.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T57.Y, PV.X,
 ; EG-NEXT:     BFE_INT T59.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
 ; EG-NEXT:    23(3.222986e-44), 26(3.643376e-44)
 ; EG-NEXT:     BFE_INT T59.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T58.Y, PV.X,
 ; EG-NEXT:     BFE_INT T60.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
 ; EG-NEXT:    19(2.662467e-44), 22(3.082857e-44)
 ; EG-NEXT:     BFE_INT T60.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T59.Y, PV.X,
 ; EG-NEXT:     BFE_INT T61.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
 ; EG-NEXT:    15(2.101948e-44), 18(2.522337e-44)
 ; EG-NEXT:     BFE_INT T61.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T60.Y, PV.X,
 ; EG-NEXT:     BFE_INT T62.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
 ; EG-NEXT:    11(1.541428e-44), 14(1.961818e-44)
 ; EG-NEXT:     BFE_INT T62.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T61.Y, PV.X,
 ; EG-NEXT:     BFE_INT T63.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
 ; EG-NEXT:    7(9.809089e-45), 10(1.401298e-44)
 ; EG-NEXT:     BFE_INT T63.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T62.Y, PV.X,
 ; EG-NEXT:     BFE_INT T64.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR * T9.W, T40.X, literal.x,
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 253:
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.x,
-; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T64.X, PV.W, 0.0, 1,
-; EG-NEXT:     MOV T63.Y, T63.X,
-; EG-NEXT:     BFE_INT T65.Z, T9.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, 1, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.x,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
+; EG-NEXT:    3(4.203895e-45), 6(8.407791e-45)
+; EG-NEXT:     BFE_INT T64.X, PS, 0.0, 1,
+; EG-NEXT:     MOV T63.Y, PV.X,
+; EG-NEXT:     BFE_INT T65.Z, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR T11.W, T19.X, 1,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T65.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T64.Y, PV.X,
 ; EG-NEXT:     BFE_INT T49.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T9.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T11.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T12.W, T19.X, literal.y,
 ; EG-NEXT:    12(1.681558e-44), 5(7.006492e-45)
-; EG-NEXT:     BFE_INT T66.X, T40.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T66.X, T19.Y, 0.0, 1,
 ; EG-NEXT:     MOV T65.Y, PV.X,
 ; EG-NEXT:     BFE_INT T67.Z, PS, 0.0, 1,
-; EG-NEXT:     LSHR T10.W, T40.X, literal.x,
-; EG-NEXT:     LSHR * T11.W, T40.X, literal.y,
+; EG-NEXT:     LSHR T12.W, T19.X, literal.x,
+; EG-NEXT:     LSHR * T13.W, T19.X, literal.y,
 ; EG-NEXT:    9(1.261169e-44), 4(5.605194e-45)
 ; EG-NEXT:     BFE_INT T67.X, PS, 0.0, 1,
 ; EG-NEXT:     MOV T49.Y, T49.X,
-; EG-NEXT:     BFE_INT T40.Z, PV.W, 0.0, 1,
-; EG-NEXT:     LSHR T10.W, T40.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T11.W, T40.X, literal.y,
+; EG-NEXT:     BFE_INT T19.Z, PV.W, 0.0, 1,
+; EG-NEXT:     LSHR T12.W, T19.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T13.W, T19.X, literal.y,
 ; EG-NEXT:    13(1.821688e-44), 8(1.121039e-44)
-; EG-NEXT:     BFE_INT T40.X, PS, 0.0, 1,
-; EG-NEXT:     MOV T67.Y, PV.X,
-; EG-NEXT:     BFE_INT T68.Z, PV.W, 0.0, 1,
+; EG-NEXT:    ALU clause starting at 233:
+; EG-NEXT:     BFE_INT T19.X, T13.W, 0.0, 1,
+; EG-NEXT:     MOV T67.Y, T67.X,
+; EG-NEXT:     BFE_INT T68.Z, T12.W, 0.0, 1, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV T49.W, T49.Z,
 ; EG-NEXT:     MOV * T65.W, T65.Z,
-; EG-NEXT:     BFE_INT T68.X, T9.W, 0.0, 1,
-; EG-NEXT:     MOV T40.Y, PV.X,
-; EG-NEXT:     BFE_INT T69.Z, T8.W, 0.0, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T68.X, T11.W, 0.0, 1,
+; EG-NEXT:     MOV T19.Y, PV.X,
+; EG-NEXT:     BFE_INT T69.Z, T10.W, 0.0, 1, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV T67.W, T67.Z,
 ; EG-NEXT:     MOV * T64.W, T64.Z,
-; EG-NEXT:     BFE_INT T69.X, T7.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T69.X, T6.Y, 0.0, 1,
 ; EG-NEXT:     MOV T68.Y, PV.X,
-; EG-NEXT:     BFE_INT T70.Z, T6.Y, 0.0, 1, BS:VEC_120/SCL_212
-; EG-NEXT:     MOV T40.W, T40.Z,
+; EG-NEXT:     BFE_INT T70.Z, T5.Y, 0.0, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     MOV T19.W, T19.Z,
 ; EG-NEXT:     MOV * T63.W, T63.Z,
-; EG-NEXT:     BFE_INT T70.X, T7.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T70.X, T9.W, 0.0, 1,
 ; EG-NEXT:     MOV T69.Y, PV.X,
-; EG-NEXT:     BFE_INT T71.Z, T6.W, 0.0, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T71.Z, T8.W, 0.0, 1, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV T68.W, T68.Z,
 ; EG-NEXT:     MOV * T62.W, T62.Z,
-; EG-NEXT:     BFE_INT T71.X, T6.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T71.X, T4.Z, 0.0, 1,
 ; EG-NEXT:     MOV T70.Y, PV.X,
-; EG-NEXT:     BFE_INT T72.Z, T5.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T72.Z, T4.Y, 0.0, 1,
 ; EG-NEXT:     MOV T69.W, T69.Z, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T61.W, T61.Z,
-; EG-NEXT:     BFE_INT T72.X, T5.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T72.X, T7.W, 0.0, 1,
 ; EG-NEXT:     MOV T71.Y, PV.X,
-; EG-NEXT:     BFE_INT T66.Z, T5.Z, 0.0, 1,
-; EG-NEXT:     MOV T70.W, T70.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T66.Z, T6.W, 0.0, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     MOV T70.W, T70.Z,
 ; EG-NEXT:     MOV * T60.W, T60.Z,
-; EG-NEXT:     LSHR T73.X, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT T73.X, T20.X, literal.x,
 ; EG-NEXT:     MOV T72.Y, PV.X,
-; EG-NEXT:     BFE_INT T74.Z, T4.W, 0.0, 1,
-; EG-NEXT:     MOV T71.W, T71.Z,
+; EG-NEXT:     BFE_INT T74.Z, T3.Z, 0.0, 1,
+; EG-NEXT:     MOV T71.W, T71.Z, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T59.W, T59.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T74.X, T4.Z, 0.0, 1,
+; EG-NEXT:    116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T74.X, T3.Y, 0.0, 1,
 ; EG-NEXT:     MOV T66.Y, T66.X,
-; EG-NEXT:     BFE_INT T75.Z, T3.Y, 0.0, 1,
-; EG-NEXT:     MOV T72.W, T72.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T75.Z, T5.W, 0.0, 1,
+; EG-NEXT:     MOV T72.W, T72.Z,
 ; EG-NEXT:     MOV * T58.W, T58.Z,
-; EG-NEXT:     BFE_INT T75.X, T3.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T75.X, T4.W, 0.0, 1,
 ; EG-NEXT:     MOV T74.Y, PV.X,
-; EG-NEXT:     BFE_INT T76.Z, T3.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T76.Z, T2.Z, 0.0, 1,
 ; EG-NEXT:     MOV T66.W, T66.Z, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T57.W, T57.Z,
 ; EG-NEXT:     BFE_INT T76.X, T2.Y, 0.0, 1,
 ; EG-NEXT:     MOV T75.Y, PV.X,
-; EG-NEXT:     BFE_INT T77.Z, T2.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T77.Z, T3.W, 0.0, 1,
 ; EG-NEXT:     MOV T74.W, T74.Z,
 ; EG-NEXT:     MOV * T56.W, T56.Z,
-; EG-NEXT:     BFE_INT T77.X, T2.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T77.X, T2.W, 0.0, 1,
 ; EG-NEXT:     MOV T76.Y, PV.X,
-; EG-NEXT:     BFE_INT T78.Z, T1.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T78.Z, T1.Z, 0.0, 1,
 ; EG-NEXT:     MOV T75.W, T75.Z, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T55.W, T55.Z,
-; EG-NEXT:     BFE_INT T78.X, T1.W, 0.0, 1,
+; EG-NEXT:     BFE_INT T78.X, T1.Y, 0.0, 1,
 ; EG-NEXT:     MOV T77.Y, PV.X,
-; EG-NEXT:     BFE_INT T79.Z, T1.Z, 0.0, 1,
-; EG-NEXT:     MOV T76.W, T76.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T79.Z, T1.W, 0.0, 1,
+; EG-NEXT:     MOV T76.W, T76.Z,
 ; EG-NEXT:     MOV * T54.W, T54.Z,
-; EG-NEXT:     BFE_INT T79.X, T0.Y, 0.0, 1,
+; EG-NEXT:     BFE_INT T79.X, T0.W, 0.0, 1,
 ; EG-NEXT:     MOV T78.Y, PV.X,
-; EG-NEXT:     BFE_INT T80.Z, T0.W, 0.0, 1,
-; EG-NEXT:     MOV T77.W, T77.Z,
+; EG-NEXT:     BFE_INT T80.Z, T0.Z, 0.0, 1,
+; EG-NEXT:     MOV T77.W, T77.Z, BS:VEC_120/SCL_212
 ; EG-NEXT:     MOV * T53.W, T53.Z,
-; EG-NEXT:     BFE_INT T80.X, T0.Z, 0.0, 1,
+; EG-NEXT:     BFE_INT T80.X, T0.Y, 0.0, 1,
 ; EG-NEXT:     MOV T79.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
-; EG-NEXT:     MOV T78.W, T78.Z, BS:VEC_120/SCL_212
+; EG-NEXT:     MOV T78.W, T78.Z,
 ; EG-NEXT:     MOV * T52.W, T52.Z,
-; EG-NEXT:    480(6.726233e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T81.X, PV.Z, literal.x,
+; EG-NEXT:     ADD_INT T81.X, T20.X, literal.x,
 ; EG-NEXT:     MOV T80.Y, PV.X,
-; EG-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
 ; EG-NEXT:     MOV T79.W, T79.Z,
 ; EG-NEXT:     MOV * T51.W, T51.Z,
-; EG-NEXT:    2(2.802597e-45), 496(6.950440e-43)
-; EG-NEXT:     LSHR T82.X, PV.Z, literal.x,
+; EG-NEXT:    120(1.681558e-43), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T82.X, T20.X, literal.x,
 ; EG-NEXT:     MOV T80.W, T80.Z,
 ; EG-NEXT:     MOV * T50.W, T50.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    124(1.737610e-43), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
 ; GFX12:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 9d18dd541d15e..9e0870ad62358 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -518,7 +518,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -528,11 +528,10 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_load_v16i16:
 ; GFX12:       ; %bb.0: ; %entry
@@ -1381,9 +1380,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ;
 ; EG-LABEL: constant_zextload_v3i16_to_v3i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 4, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @6
-; EG-NEXT:    ALU 2, @17, KC0[], KC1[]
+; EG-NEXT:    ALU 2, @15, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XY, T0.X, 1
 ; EG-NEXT:    CF_END
@@ -1395,11 +1394,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     MOV * T1.X, KC0[2].Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 17:
-; EG-NEXT:     LSHR T4.X, T0.W, literal.x,
-; EG-NEXT:     MOV * T3.Y, T1.X,
+; EG-NEXT:    ALU clause starting at 15:
+; EG-NEXT:     ADD_INT T4.X, T0.X, literal.x,
+; EG-NEXT:     MOV * T3.Y, T1.X, BS:VEC_120/SCL_212
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32:
@@ -1483,9 +1480,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @6
-; EG-NEXT:    ALU 9, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    ALU 8, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
@@ -1496,13 +1493,12 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 13:
 ; EG-NEXT:     BFE_INT * T0.Y, T1.X, 0.0, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T1.X, T2.X, 0.0, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT:     BFE_INT T2.X, T2.X, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32:
@@ -1847,7 +1843,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -1858,22 +1854,20 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     LSHR * T8.W, T7.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T8.Z, T7.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T8.Z, T7.Y, literal.x,
+; EG-NEXT:     LSHR * T9.W, T7.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T8.Y, T7.X, literal.x,
-; EG-NEXT:     LSHR * T9.W, T7.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T9.Z, T7.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
 ; EG-NEXT:     AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT:     AND_INT T9.Z, T7.W, literal.x,
-; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR * T9.Y, T7.Z, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T9.X, T7.Z, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T9.Y, T7.Z, literal.y,
+; EG-NEXT:     AND_INT * T9.X, T7.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T10.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T10.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GFX12:       ; %bb.0:
@@ -2008,7 +2002,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 19, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -2030,13 +2024,12 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     BFE_INT T8.Y, PS, 0.0, literal.y,
-; EG-NEXT:     LSHR T1.Z, T7.Z, literal.y,
 ; EG-NEXT:     BFE_INT T9.W, PV.Z, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T10.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T7.Z, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T10.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT * T9.Y, PS, 0.0, literal.y,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GFX12:       ; %bb.0:
@@ -2240,11 +2233,11 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 35, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 31, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T11.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T12.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T15.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
@@ -2254,40 +2247,36 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; EG-NEXT:    ALU clause starting at 13:
 ; EG-NEXT:     LSHR * T13.W, T12.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T13.Z, T12.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T13.Z, T12.Y, literal.x,
+; EG-NEXT:     LSHR * T14.W, T12.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T13.Y, T12.X, literal.x,
-; EG-NEXT:     LSHR * T14.W, T12.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T14.Z, T12.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
 ; EG-NEXT:     AND_INT T13.X, T12.X, literal.x,
-; EG-NEXT:     AND_INT T14.Z, T12.W, literal.x,
-; EG-NEXT:     LSHR * T12.X, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR T14.Y, T12.Z, literal.x,
-; EG-NEXT:     LSHR * T15.W, T11.Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T14.X, T12.Z, literal.x,
-; EG-NEXT:     AND_INT T15.Z, T11.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T14.Y, T12.Z, literal.y,
+; EG-NEXT:     AND_INT * T14.X, T12.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T15.Y, T11.X, literal.y,
+; EG-NEXT:     LSHR * T12.W, T11.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T15.X, KC0[2].Y, literal.x,
+; EG-NEXT:     AND_INT * T12.Z, T11.Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
+; EG-NEXT:     LSHR T12.Y, T11.X, literal.y,
 ; EG-NEXT:     LSHR T17.W, T11.W, literal.y,
-; EG-NEXT:     AND_INT * T15.X, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     AND_INT * T12.X, T11.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T17.Z, T11.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T17.Z, T11.W, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T11.X, T15.X, literal.x,
 ; EG-NEXT:     LSHR T17.Y, T11.Z, literal.y,
 ; EG-NEXT:     AND_INT * T17.X, T11.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T18.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T18.X, T15.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GFX12:       ; %bb.0:
@@ -2505,7 +2494,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 39, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 35, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
@@ -2517,20 +2506,18 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; EG-NEXT:    ALU clause starting at 12:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 13:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T15.X, T11.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Y, T12.W, literal.x,
-; EG-NEXT:     BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T16.Z, T11.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.W, T12.Y, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T11.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T16.X, T11.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T1.Y, T11.W, literal.x,
+; EG-NEXT:     LSHR T0.Y, T11.W, literal.x,
 ; EG-NEXT:     BFE_INT T17.Z, T12.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T15.W, PS, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T11.X, literal.x,
@@ -2543,20 +2530,18 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T18.X, T12.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T16.Y, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Z, T12.X, literal.x,
-; EG-NEXT:     BFE_INT T17.W, T0.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T17.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.Z, T12.Z, literal.y,
-; EG-NEXT:     BFE_INT T18.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T0.Z, T12.W, literal.x,
+; EG-NEXT:     BFE_INT T17.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T12.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T11.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT T17.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T18.W, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T12.Z, literal.y,
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T12.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT * T18.Y, PS, 0.0, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GFX12:       ; %bb.0:
@@ -2920,15 +2905,15 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 71, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 62, @21, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T34.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T32.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T19.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T20.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
@@ -2940,76 +2925,67 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; EG-NEXT:    ALU clause starting at 21:
 ; EG-NEXT:     LSHR * T23.W, T20.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T23.Z, T20.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T23.Z, T20.Y, literal.x,
+; EG-NEXT:     LSHR * T24.W, T20.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T23.Y, T20.X, literal.x,
-; EG-NEXT:     LSHR * T24.W, T20.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T24.Z, T20.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
 ; EG-NEXT:     AND_INT T23.X, T20.X, literal.x,
-; EG-NEXT:     AND_INT T24.Z, T20.W, literal.x,
-; EG-NEXT:     LSHR * T20.X, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR T24.Y, T20.Z, literal.x,
-; EG-NEXT:     LSHR * T25.W, T19.Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T24.X, T20.Z, literal.x,
-; EG-NEXT:     AND_INT T25.Z, T19.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T24.Y, T20.Z, literal.y,
+; EG-NEXT:     AND_INT * T24.X, T20.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T25.Y, T19.X, literal.y,
+; EG-NEXT:     LSHR * T20.W, T19.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T25.X, KC0[2].Y, literal.x,
+; EG-NEXT:     AND_INT * T20.Z, T19.Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT:     ADD_INT T26.X, PV.X, literal.x,
+; EG-NEXT:     LSHR T20.Y, T19.X, literal.y,
 ; EG-NEXT:     LSHR T27.W, T19.W, literal.y,
-; EG-NEXT:     AND_INT * T25.X, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     AND_INT * T20.X, T19.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T27.Z, T19.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T27.Z, T19.W, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T19.X, T25.X, literal.x,
 ; EG-NEXT:     LSHR T27.Y, T19.Z, literal.y,
 ; EG-NEXT:     LSHR T28.W, T22.Y, literal.y,
 ; EG-NEXT:     AND_INT * T27.X, T19.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T28.Z, T22.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T28.Y, T22.X, literal.y,
+; EG-NEXT:     AND_INT * T28.Z, T22.Y, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T29.X, T25.X, literal.x,
+; EG-NEXT:     LSHR T28.Y, T22.X, literal.y, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR T30.W, T22.W, literal.y,
 ; EG-NEXT:     AND_INT * T28.X, T22.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T30.Z, T22.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T30.Y, T22.Z, literal.y,
-; EG-NEXT:     LSHR T31.W, T21.Y, literal.y,
-; EG-NEXT:     AND_INT * T30.X, T22.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     AND_INT * T30.Z, T22.W, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T22.X, T25.X, literal.x,
+; EG-NEXT:     LSHR T30.Y, T22.Z, literal.x,
+; EG-NEXT:     LSHR T31.W, T21.Y, literal.x,
+; EG-NEXT:     AND_INT * T30.X, T22.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT:     AND_INT * T31.Z, T21.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T31.Z, T21.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T31.Y, T21.X, literal.y,
+; EG-NEXT:     ADD_INT T32.X, T25.X, literal.x,
+; EG-NEXT:     LSHR T31.Y, T21.X, literal.y, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR T33.W, T21.W, literal.y,
 ; EG-NEXT:     AND_INT * T31.X, T21.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T33.Z, T21.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T33.Z, T21.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T21.X, T25.X, literal.x,
 ; EG-NEXT:     LSHR T33.Y, T21.Z, literal.y,
 ; EG-NEXT:     AND_INT * T33.X, T21.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T34.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T34.X, T25.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GFX12:       ; %bb.0:
@@ -3402,108 +3378,96 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ;
 ; EG-LABEL: constant_sextload_v32i16_to_v32i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 8, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 73, @29, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T22.X, 0
+; EG-NEXT:    ALU 69, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T27.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T20.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T23.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
-; EG-NEXT:     VTX_READ_128 T23.XYZW, T22.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T24.XYZW, T22.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T25.XYZW, T22.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T22.XYZW, T22.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 48, #1
 ; EG-NEXT:    ALU clause starting at 20:
-; EG-NEXT:     LSHR T19.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T22.X, KC0[2].Z,
+; EG-NEXT:     MOV * T19.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 21:
+; EG-NEXT:     LSHR * T23.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 29:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.W, T22.W, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T28.X, PS, literal.x,
-; EG-NEXT:     LSHR T0.Y, T22.Y, literal.y,
-; EG-NEXT:     BFE_INT T29.Z, T25.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T1.W, T24.W, literal.y,
-; EG-NEXT:     LSHR * T2.W, T24.Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T29.X, T25.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T1.Y, T23.W, literal.x,
-; EG-NEXT:     BFE_INT T30.Z, T25.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T3.W, T23.Y, literal.x,
-; EG-NEXT:     LSHR * T4.W, T25.Y, literal.x,
+; EG-NEXT:     ADD_INT T24.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T26.X, T23.X, literal.x,
+; EG-NEXT:     ADD_INT * T27.X, T23.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T28.X, T23.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T19.Y, literal.y,
+; EG-NEXT:     BFE_INT T29.Z, T22.Y, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T0.W, T21.W, literal.y,
+; EG-NEXT:     LSHR * T1.W, T21.Y, literal.y,
+; EG-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT T29.X, T22.X, 0.0, literal.x,
+; EG-NEXT:     LSHR T1.Y, T20.W, literal.x,
+; EG-NEXT:     BFE_INT T30.Z, T22.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T2.W, T20.Y, literal.x,
+; EG-NEXT:     LSHR * T3.W, T22.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T30.X, T25.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T2.Y, T25.W, literal.x,
-; EG-NEXT:     BFE_INT T31.Z, T23.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.X, T22.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR T2.Y, T22.W, literal.x,
+; EG-NEXT:     BFE_INT T31.Z, T20.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T29.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T4.W, T25.X, literal.x,
+; EG-NEXT:     LSHR * T3.W, T22.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T31.X, T23.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T31.X, T20.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T29.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T32.Z, T23.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T32.Z, T20.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T30.W, PV.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T4.W, T25.Z, literal.x,
+; EG-NEXT:     LSHR * T3.W, T22.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T32.X, T23.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T32.X, T20.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T30.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T25.Z, T24.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.W, T3.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T23.X, literal.x,
+; EG-NEXT:     BFE_INT T22.Z, T21.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T31.W, T2.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T2.W, T20.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T25.X, T24.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.X, T21.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T33.Z, T24.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T33.Z, T21.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T32.W, T1.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T23.Z, literal.x,
+; EG-NEXT:     LSHR * T2.W, T20.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T33.X, T24.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T33.X, T21.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T32.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T23.Z, T22.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T25.W, T2.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T2.W, T24.X, literal.x,
+; EG-NEXT:     BFE_INT T20.Z, T19.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.W, T1.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T1.W, T21.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T23.X, T22.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T25.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T34.Z, T22.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T33.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T1.W, T24.Z, literal.x,
+; EG-NEXT:     BFE_INT T20.X, T19.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T34.Z, T19.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T33.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T21.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T34.X, T22.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T34.X, T19.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T33.Y, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Z, T22.X, literal.x,
-; EG-NEXT:     BFE_INT T23.W, T0.Y, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T22.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T23.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.Z, T22.Z, literal.y,
-; EG-NEXT:     BFE_INT T34.W, T0.W, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T24.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T34.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T0.Z, T19.W, literal.x,
+; EG-NEXT:     BFE_INT T20.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T19.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T19.X, T23.X, literal.x,
+; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T34.W, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T19.Z, literal.y,
+; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T21.X, T23.X, literal.x,
+; EG-NEXT:     BFE_INT * T34.Y, PS, 0.0, literal.y,
+; EG-NEXT:    28(3.923636e-44), 16(2.242078e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GFX12:       ; %bb.0:
@@ -4185,185 +4149,158 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; EG-LABEL: constant_zextload_v64i16_to_v64i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 3 @22
-; EG-NEXT:    ALU 55, @39, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 3 @30
-; EG-NEXT:    ALU 87, @95, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @22
+; EG-NEXT:    ALU 34, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 4 @28
+; EG-NEXT:    ALU 81, @74, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T66.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T49.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T45.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T64.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T50.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T46.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T61.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T51.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T47.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T48.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T39.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T38.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T49.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T52.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T44.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T40.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
-; EG-NEXT:     VTX_READ_128 T38.XYZW, T37.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T39.XYZW, T37.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T40.XYZW, T37.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T41.XYZW, T37.X, 16, #1
-; EG-NEXT:    Fetch clause starting at 30:
-; EG-NEXT:     VTX_READ_128 T49.XYZW, T37.X, 112, #1
-; EG-NEXT:     VTX_READ_128 T50.XYZW, T37.X, 96, #1
-; EG-NEXT:     VTX_READ_128 T51.XYZW, T37.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T52.XYZW, T37.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T37.XYZW, T38.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T39.XYZW, T38.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T40.XYZW, T38.X, 16, #1
+; EG-NEXT:    Fetch clause starting at 28:
+; EG-NEXT:     VTX_READ_128 T45.XYZW, T38.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T46.XYZW, T38.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T47.XYZW, T38.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T48.XYZW, T38.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T49.XYZW, T38.X, 48, #1
 ; EG-NEXT:    ALU clause starting at 38:
-; EG-NEXT:     MOV * T37.X, KC0[2].Z,
+; EG-NEXT:     MOV * T38.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 39:
-; EG-NEXT:     LSHR * T35.W, T38.Y, literal.x,
+; EG-NEXT:     LSHR * T35.W, T37.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T35.Z, T37.Y, literal.x,
+; EG-NEXT:     LSHR * T36.W, T37.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LSHR T35.Y, T37.X, literal.x,
+; EG-NEXT:     AND_INT * T36.Z, T37.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT:     AND_INT T35.X, T37.X, literal.x,
+; EG-NEXT:     LSHR T36.Y, T37.Z, literal.y,
+; EG-NEXT:     AND_INT * T36.X, T37.Z, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LSHR * T37.W, T40.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T35.Z, T38.Y, literal.x,
+; EG-NEXT:     LSHR T41.X, KC0[2].Y, literal.x,
+; EG-NEXT:     AND_INT * T37.Z, T40.Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT:     ADD_INT T42.X, PV.X, literal.x,
+; EG-NEXT:     LSHR T37.Y, T40.X, literal.y,
+; EG-NEXT:     LSHR T43.W, T40.W, literal.y,
+; EG-NEXT:     AND_INT * T37.X, T40.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T35.Y, T38.X, literal.x,
-; EG-NEXT:     LSHR * T36.W, T38.W, literal.x,
+; EG-NEXT:     AND_INT * T43.Z, T40.W, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T40.X, T41.X, literal.x,
+; EG-NEXT:     LSHR T43.Y, T40.Z, literal.y,
+; EG-NEXT:     AND_INT * T43.X, T40.Z, literal.z,
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T38.W, T39.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T35.X, T38.X, literal.x,
-; EG-NEXT:     AND_INT T36.Z, T38.W, literal.x,
-; EG-NEXT:     LSHR * T38.X, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR T36.Y, T38.Z, literal.x,
-; EG-NEXT:     LSHR * T42.W, T41.Y, literal.x,
+; EG-NEXT:     ADD_INT T44.X, T41.X, literal.x,
+; EG-NEXT:     AND_INT * T38.Z, T39.Y, literal.y,
+; EG-NEXT:    12(1.681558e-44), 65535(9.183409e-41)
+; EG-NEXT:    ALU clause starting at 74:
+; EG-NEXT:     LSHR T38.Y, T39.X, literal.x,
+; EG-NEXT:     LSHR * T50.W, T39.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T36.X, T38.Z, literal.x,
-; EG-NEXT:     AND_INT T42.Z, T41.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     AND_INT T38.X, T39.X, literal.x,
+; EG-NEXT:     AND_INT T50.Z, T39.W, literal.x,
+; EG-NEXT:     ADD_INT * T39.X, T41.X, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR T43.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T42.Y, T41.X, literal.y,
-; EG-NEXT:     LSHR T44.W, T41.W, literal.y,
-; EG-NEXT:     AND_INT * T42.X, T41.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T44.Z, T41.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T50.Y, T39.Z, literal.x,
+; EG-NEXT:     LSHR * T51.W, T49.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T50.X, T39.Z, literal.x,
+; EG-NEXT:     AND_INT T51.Z, T49.Y, literal.x,
+; EG-NEXT:     ADD_INT * T52.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 20(2.802597e-44)
+; EG-NEXT:     LSHR T51.Y, T49.X, literal.x,
+; EG-NEXT:     LSHR * T53.W, T49.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T51.X, T49.X, literal.x,
+; EG-NEXT:     AND_INT T53.Z, T49.W, literal.x,
+; EG-NEXT:     ADD_INT * T49.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT:     LSHR T53.Y, T49.Z, literal.x,
+; EG-NEXT:     LSHR * T54.W, T48.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T53.X, T49.Z, literal.x,
+; EG-NEXT:     AND_INT T54.Z, T48.Y, literal.x,
+; EG-NEXT:     ADD_INT * T55.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 28(3.923636e-44)
+; EG-NEXT:     LSHR T54.Y, T48.X, literal.x,
+; EG-NEXT:     LSHR * T56.W, T48.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T54.X, T48.X, literal.x,
+; EG-NEXT:     AND_INT T56.Z, T48.W, literal.x,
+; EG-NEXT:     ADD_INT * T48.X, T41.X, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T44.Y, T41.Z, literal.y,
-; EG-NEXT:     LSHR T45.W, T40.Y, literal.y,
-; EG-NEXT:     AND_INT * T44.X, T41.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T45.Z, T40.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T56.Y, T48.Z, literal.x,
+; EG-NEXT:     LSHR * T57.W, T47.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T56.X, T48.Z, literal.x,
+; EG-NEXT:     AND_INT T57.Z, T47.Y, literal.x,
+; EG-NEXT:     ADD_INT * T58.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 36(5.044674e-44)
+; EG-NEXT:     LSHR T57.Y, T47.X, literal.x,
+; EG-NEXT:     LSHR * T59.W, T47.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T57.X, T47.X, literal.x,
+; EG-NEXT:     AND_INT T59.Z, T47.W, literal.x,
+; EG-NEXT:     ADD_INT * T47.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT:     LSHR T59.Y, T47.Z, literal.x,
+; EG-NEXT:     LSHR * T60.W, T46.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T59.X, T47.Z, literal.x,
+; EG-NEXT:     AND_INT T60.Z, T46.Y, literal.x,
+; EG-NEXT:     ADD_INT * T61.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 44(6.165713e-44)
+; EG-NEXT:     LSHR T60.Y, T46.X, literal.x,
+; EG-NEXT:     LSHR * T62.W, T46.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T60.X, T46.X, literal.x,
+; EG-NEXT:     AND_INT T62.Z, T46.W, literal.x,
+; EG-NEXT:     ADD_INT * T46.X, T41.X, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T45.Y, T40.X, literal.y,
-; EG-NEXT:     LSHR T47.W, T40.W, literal.y,
-; EG-NEXT:     AND_INT * T45.X, T40.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T47.Z, T40.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T47.Y, T40.Z, literal.y,
-; EG-NEXT:     AND_INT * T47.X, T40.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T37.W, T39.Y, literal.y,
-; EG-NEXT:    80(1.121039e-43), 16(2.242078e-44)
-; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T37.Z, T39.Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT:    ALU clause starting at 95:
-; EG-NEXT:     LSHR T37.Y, T39.X, literal.x,
-; EG-NEXT:     LSHR * T53.W, T39.W, literal.x,
+; EG-NEXT:     LSHR T62.Y, T46.Z, literal.x,
+; EG-NEXT:     LSHR * T63.W, T45.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T37.X, T39.X, literal.x,
-; EG-NEXT:     AND_INT T53.Z, T39.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T53.Y, T39.Z, literal.y,
-; EG-NEXT:     LSHR T54.W, T52.Y, literal.y,
-; EG-NEXT:     AND_INT * T53.X, T39.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T54.Z, T52.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
-; EG-NEXT:     LSHR T55.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T54.Y, T52.X, literal.y,
-; EG-NEXT:     LSHR T56.W, T52.W, literal.y,
-; EG-NEXT:     AND_INT * T54.X, T52.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T56.Z, T52.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
-; EG-NEXT:     LSHR T52.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T56.Y, T52.Z, literal.y,
-; EG-NEXT:     LSHR T57.W, T51.Y, literal.y,
-; EG-NEXT:     AND_INT * T56.X, T52.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T57.Z, T51.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
-; EG-NEXT:     LSHR T58.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T57.Y, T51.X, literal.y,
-; EG-NEXT:     LSHR T59.W, T51.W, literal.y,
-; EG-NEXT:     AND_INT * T57.X, T51.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T59.Z, T51.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
-; EG-NEXT:     LSHR T51.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T59.Y, T51.Z, literal.y,
-; EG-NEXT:     LSHR T60.W, T50.Y, literal.y,
-; EG-NEXT:     AND_INT * T59.X, T51.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T60.Z, T50.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
-; EG-NEXT:     LSHR T61.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T60.Y, T50.X, literal.y,
-; EG-NEXT:     LSHR T62.W, T50.W, literal.y,
-; EG-NEXT:     AND_INT * T60.X, T50.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T62.Z, T50.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
-; EG-NEXT:     LSHR T50.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T62.Y, T50.Z, literal.y,
-; EG-NEXT:     LSHR T63.W, T49.Y, literal.y,
-; EG-NEXT:     AND_INT * T62.X, T50.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T63.Z, T49.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
-; EG-NEXT:     LSHR T64.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T63.Y, T49.X, literal.y,
-; EG-NEXT:     LSHR T65.W, T49.W, literal.y,
-; EG-NEXT:     AND_INT * T63.X, T49.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T65.Z, T49.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 224(3.138909e-43)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T65.Y, T49.Z, literal.y,
-; EG-NEXT:     AND_INT * T65.X, T49.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T66.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T62.X, T46.Z, literal.x,
+; EG-NEXT:     AND_INT T63.Z, T45.Y, literal.x,
+; EG-NEXT:     ADD_INT * T64.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 52(7.286752e-44)
+; EG-NEXT:     LSHR T63.Y, T45.X, literal.x,
+; EG-NEXT:     LSHR * T65.W, T45.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T63.X, T45.X, literal.x,
+; EG-NEXT:     AND_INT T65.Z, T45.W, literal.x,
+; EG-NEXT:     ADD_INT * T45.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT:     LSHR * T65.Y, T45.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T65.X, T45.Z, literal.x,
+; EG-NEXT:     ADD_INT * T66.X, T41.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 60(8.407791e-44)
 ;
 ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GFX12:       ; %bb.0:
@@ -5103,206 +5040,181 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ;
 ; EG-LABEL: constant_sextload_v64i16_to_v64i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 17, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @22
-; EG-NEXT:    ALU 75, @56, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 71, @132, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T48.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T41.X, 0
+; EG-NEXT:    ALU 73, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 65, @113, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T36.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T35.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T56.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T55.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T54.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T53.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T53.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T52.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T51.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T51.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T50.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T49.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T39.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T38.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T37.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T35.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T49.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T48.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T47.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T46.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T45.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T44.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T43.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 22:
-; EG-NEXT:     VTX_READ_128 T42.XYZW, T41.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T43.XYZW, T41.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T44.XYZW, T41.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T45.XYZW, T41.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T46.XYZW, T41.X, 64, #1
-; EG-NEXT:     VTX_READ_128 T47.XYZW, T41.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T48.XYZW, T41.X, 96, #1
-; EG-NEXT:     VTX_READ_128 T41.XYZW, T41.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T41.XYZW, T35.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T40.XYZW, T35.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T42.XYZW, T35.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T35.XYZW, T35.X, 112, #1
 ; EG-NEXT:    ALU clause starting at 38:
-; EG-NEXT:     LSHR T35.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T41.X, KC0[2].Z,
+; EG-NEXT:     MOV * T35.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 39:
+; EG-NEXT:     LSHR * T43.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 56:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T50.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T51.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T52.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.Y, T41.W, literal.y,
-; EG-NEXT:     LSHR T0.Z, T41.Y, literal.y,
-; EG-NEXT:     LSHR T0.W, T48.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    176(2.466285e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T54.X, PS, literal.x,
-; EG-NEXT:     LSHR T1.Y, T48.Y, literal.y,
-; EG-NEXT:     LSHR T1.Z, T47.W, literal.y,
-; EG-NEXT:     LSHR T1.W, T47.Y, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T55.X, PS, literal.x,
-; EG-NEXT:     LSHR T2.Y, T46.W, literal.y,
-; EG-NEXT:     LSHR T2.Z, T46.Y, literal.y,
-; EG-NEXT:     LSHR T2.W, T45.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T56.X, PS, literal.x,
-; EG-NEXT:     LSHR T3.Y, T45.Y, literal.y,
-; EG-NEXT:     BFE_INT T57.Z, T44.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T3.W, T43.W, literal.y,
-; EG-NEXT:     LSHR * T4.W, T43.Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T57.X, T44.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T4.Y, T42.W, literal.x,
-; EG-NEXT:     BFE_INT T58.Z, T44.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T5.W, T42.Y, literal.x,
-; EG-NEXT:     LSHR * T6.W, T44.Y, literal.x,
+; EG-NEXT:     ADD_INT T44.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T45.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T46.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T47.X, T43.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T48.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T49.X, T43.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T50.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T51.X, T43.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T52.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T53.X, T43.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T54.X, T43.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T35.Y, literal.y,
+; EG-NEXT:     LSHR T0.Z, T36.W, literal.y,
+; EG-NEXT:     LSHR T0.W, T36.Y, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T37.W, literal.y,
+; EG-NEXT:    44(6.165713e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T55.X, T43.X, literal.x,
+; EG-NEXT:     LSHR T1.Y, T37.Y, literal.y,
+; EG-NEXT:     LSHR T1.Z, T38.W, literal.y,
+; EG-NEXT:     LSHR T2.W, T38.Y, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T39.W, literal.y,
+; EG-NEXT:    48(6.726233e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T56.X, T43.X, literal.x,
+; EG-NEXT:     LSHR T2.Y, T39.Y, literal.y,
+; EG-NEXT:     BFE_INT T57.Z, T42.Y, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T4.W, T40.W, literal.y,
+; EG-NEXT:     LSHR * T5.W, T40.Y, literal.y,
+; EG-NEXT:    52(7.286752e-44), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT T57.X, T42.X, 0.0, literal.x,
+; EG-NEXT:     LSHR T3.Y, T41.W, literal.x,
+; EG-NEXT:     BFE_INT T58.Z, T42.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T6.W, T41.Y, literal.x,
+; EG-NEXT:     LSHR * T7.W, T42.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T58.X, T44.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T5.Y, T44.W, literal.x,
-; EG-NEXT:     BFE_INT T59.Z, T42.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T58.X, T42.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR T4.Y, T42.W, literal.x,
+; EG-NEXT:     BFE_INT T59.Z, T41.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T57.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T44.X, literal.x,
+; EG-NEXT:     LSHR * T7.W, T42.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T59.X, T42.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T59.X, T41.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T57.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T60.Z, T42.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T60.Z, T41.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T58.W, PV.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T44.Z, literal.x,
+; EG-NEXT:     LSHR * T7.W, T42.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T60.X, T42.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T60.X, T41.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T58.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T44.Z, T43.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T59.W, T5.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T5.W, T42.X, literal.x,
+; EG-NEXT:     BFE_INT T42.Z, T40.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T59.W, T6.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T6.W, T41.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T44.X, T43.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.X, T40.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T59.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T61.Z, T43.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T60.W, T4.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T5.W, T42.Z, literal.x,
+; EG-NEXT:     BFE_INT T61.Z, T40.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T60.W, T3.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T6.W, T41.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T61.X, T43.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T61.X, T40.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T60.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.Z, T45.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT * T44.W, T4.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.Z, T39.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.W, T5.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T5.W, T40.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 132:
-; EG-NEXT:     LSHR * T4.W, T43.X, literal.x,
+; EG-NEXT:     BFE_INT T41.X, T39.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT * T42.Y, PS, 0.0, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T42.X, T45.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T44.Y, PV.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T62.Z, T45.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T61.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.W, T43.Z, literal.x,
+; EG-NEXT:    ALU clause starting at 113:
+; EG-NEXT:     BFE_INT T62.Z, T39.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T61.W, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T4.W, T40.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T62.X, T45.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T62.X, T39.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T61.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.Z, T46.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.W, T45.X, literal.x,
+; EG-NEXT:     BFE_INT T40.Z, T38.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T4.W, T39.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T43.X, T46.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T63.Z, T46.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T62.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T2.W, T45.Z, literal.x,
+; EG-NEXT:     BFE_INT T40.X, T38.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T63.Z, T38.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T62.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T39.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T63.X, T46.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T63.X, T38.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T62.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.Z, T47.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T2.W, T46.X, literal.x,
+; EG-NEXT:     BFE_INT T39.Z, T37.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.W, T2.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T2.W, T38.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T45.X, T47.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T64.Z, T47.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T63.W, T2.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T2.W, T46.Z, literal.x,
+; EG-NEXT:     BFE_INT T39.X, T37.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T64.Z, T37.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T63.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T2.W, T38.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T64.X, T47.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T64.X, T37.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T63.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.Z, T48.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.W, T1.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T1.W, T47.X, literal.x,
+; EG-NEXT:     BFE_INT T38.Z, T36.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T37.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T46.X, T48.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T65.Z, T48.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T64.W, T1.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T1.W, T47.Z, literal.x,
+; EG-NEXT:     BFE_INT T38.X, T36.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T65.Z, T36.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T64.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T37.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T65.X, T48.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T65.X, T36.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T64.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T47.Z, T41.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T1.W, T48.X, literal.x,
+; EG-NEXT:     BFE_INT T37.Z, T35.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T38.W, T0.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T36.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T47.X, T41.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T66.Z, T41.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T65.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T0.W, T48.Z, literal.x,
+; EG-NEXT:     BFE_INT T37.X, T35.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T38.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T66.Z, T35.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T65.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T36.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T66.X, T41.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T66.X, T35.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T65.Y, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR T1.Z, T41.X, literal.x,
-; EG-NEXT:     BFE_INT T47.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 224(3.138909e-43)
-; EG-NEXT:     LSHR T41.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T47.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.Z, T41.Z, literal.y,
-; EG-NEXT:     BFE_INT T66.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T48.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T66.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T0.Z, T35.W, literal.x,
+; EG-NEXT:     BFE_INT T37.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T35.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T35.X, T43.X, literal.x,
+; EG-NEXT:     BFE_INT T37.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T66.W, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T35.Z, literal.y,
+; EG-NEXT:    56(7.847271e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T36.X, T43.X, literal.x,
+; EG-NEXT:     BFE_INT * T66.Y, PS, 0.0, literal.y,
+; EG-NEXT:    60(8.407791e-44), 16(2.242078e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GFX12:       ; %bb.0:
@@ -6114,7 +6026,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 13, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -6133,11 +6045,10 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV T5.Y, 0.0,
 ; EG-NEXT:     MOV T6.W, 0.0,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T8.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GFX12:       ; %bb.0:
@@ -6265,32 +6176,31 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T5.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     ASHR * T5.W, T5.X, literal.x,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ASHR T5.Z, T5.X, literal.y,
-; EG-NEXT:     ASHR * T7.W, T5.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ASHR * T6.W, T5.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
-; EG-NEXT:     ASHR * T7.Z, T5.Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT:     ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     ASHR T6.Z, T5.Y, literal.x,
+; EG-NEXT:     ASHR * T5.W, T5.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T7.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR * T5.Z, T5.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT:     ASHR * T5.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T8.X, T7.X, literal.x,
+; EG-NEXT:     ASHR * T6.Y, T6.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GFX12:       ; %bb.0:
@@ -6451,7 +6361,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 30, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 26, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T13.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
@@ -6482,17 +6392,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV * T9.W, 0.0,
 ; EG-NEXT:     MOV T10.W, 0.0,
 ; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T11.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T13.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T14.X, T11.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GFX12:       ; %bb.0:
@@ -6698,7 +6604,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 29, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T9.X, 0
@@ -6709,19 +6615,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T10.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T10.Z, T7.X, literal.y,
-; EG-NEXT:     ASHR * T12.W, T7.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T9.X, PV.X, literal.x,
+; EG-NEXT:     ASHR T10.W, T7.X, literal.y,
+; EG-NEXT:     ADD_INT * T11.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T10.Z, T7.X, literal.x,
+; EG-NEXT:     ASHR * T12.W, T7.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T10.X, T7.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T12.Z, T7.Y, literal.x,
 ; EG-NEXT:     ASHR * T13.W, T7.Z, literal.y,
@@ -6737,12 +6640,11 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T14.X, T7.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T13.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T7.X, T8.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T14.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T14.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GFX12:       ; %bb.0:
@@ -7012,7 +6914,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @12
-; EG-NEXT:    ALU 62, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 52, @17, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
@@ -7068,29 +6970,19 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; EG-NEXT:     MOV * T17.W, 0.0,
 ; EG-NEXT:     MOV T18.W, 0.0,
 ; EG-NEXT:     MOV * T11.W, 0.0,
-; EG-NEXT:     LSHR T19.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T19.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T20.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T22.X, T19.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, T19.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T24.X, T19.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T19.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT * T26.X, T19.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GFX12:       ; %bb.0:
@@ -7459,7 +7351,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @12
-; EG-NEXT:    ALU 65, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 55, @17, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
@@ -7475,31 +7367,22 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; EG-NEXT:    ALU clause starting at 16:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 17:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T19.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T19.Z, T11.X, literal.y,
-; EG-NEXT:     ASHR * T21.W, T11.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, T13.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T18.X, T13.X, literal.x,
+; EG-NEXT:     ASHR T19.W, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T20.X, T13.X, literal.z,
+; EG-NEXT:    20(2.802597e-44), 31(4.344025e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T19.Z, T11.X, literal.x,
+; EG-NEXT:     ASHR * T21.W, T11.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T19.X, T11.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T21.Z, T11.Y, literal.x,
 ; EG-NEXT:     ASHR * T22.W, T11.Z, literal.y,
@@ -7535,12 +7418,11 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T26.X, T12.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T25.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T12.X, T13.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T26.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T26.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GFX12:       ; %bb.0:
@@ -8032,33 +7914,33 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; EG-LABEL: constant_zextload_v32i16_to_v32i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 2 @22
-; EG-NEXT:    ALU 33, @31, KC0[], KC1[]
-; EG-NEXT:    TEX 0 @28
-; EG-NEXT:    ALU 92, @65, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @22
+; EG-NEXT:    ALU 9, @31, KC0[], KC1[]
+; EG-NEXT:    TEX 2 @24
+; EG-NEXT:    ALU 94, @41, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T50.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T49.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T48.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T49.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T48.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T47.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T46.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T45.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T44.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T43.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T42.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T43.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T41.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T39.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T38.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T37.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T35.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T35.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T21.XYZW, T20.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T22.XYZW, T20.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T23.XYZW, T20.X, 32, #1
-; EG-NEXT:    Fetch clause starting at 28:
-; EG-NEXT:     VTX_READ_128 T29.XYZW, T20.X, 0, #1
+; EG-NEXT:    Fetch clause starting at 24:
+; EG-NEXT:     VTX_READ_128 T23.XYZW, T20.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T24.XYZW, T20.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T25.XYZW, T20.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 30:
 ; EG-NEXT:     MOV * T20.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 31:
@@ -8066,130 +7948,108 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T19.X, T21.W, literal.x,
 ; EG-NEXT:     MOV T19.Y, 0.0,
-; EG-NEXT:     LSHR T24.Z, T21.Z, literal.y,
-; EG-NEXT:     AND_INT * T24.X, T21.Z, literal.x,
+; EG-NEXT:     LSHR T22.Z, T21.Z, literal.y,
+; EG-NEXT:     AND_INT * T22.X, T21.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T24.Y, 0.0,
-; EG-NEXT:     LSHR * T25.Z, T21.Y, literal.x,
+; EG-NEXT:     MOV T22.Y, 0.0,
+; EG-NEXT:     LSHR * T20.Z, T21.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T25.X, T21.Y, literal.x,
-; EG-NEXT:     MOV T25.Y, 0.0,
+; EG-NEXT:    ALU clause starting at 41:
+; EG-NEXT:     AND_INT T20.X, T21.Y, literal.x,
+; EG-NEXT:     MOV T20.Y, 0.0,
 ; EG-NEXT:     LSHR T21.Z, T21.X, literal.y,
 ; EG-NEXT:     AND_INT * T21.X, T21.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T21.Y, 0.0,
-; EG-NEXT:     LSHR * T26.Z, T23.W, literal.x,
+; EG-NEXT:     LSHR * T26.Z, T25.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T26.X, T23.W, literal.x,
+; EG-NEXT:     AND_INT T26.X, T25.W, literal.x,
 ; EG-NEXT:     MOV T26.Y, 0.0,
-; EG-NEXT:     LSHR T27.Z, T23.Z, literal.y,
-; EG-NEXT:     AND_INT * T27.X, T23.Z, literal.x,
+; EG-NEXT:     LSHR T27.Z, T25.Z, literal.y,
+; EG-NEXT:     AND_INT * T27.X, T25.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T27.Y, 0.0,
-; EG-NEXT:     LSHR * T28.Z, T23.Y, literal.x,
+; EG-NEXT:     LSHR * T28.Z, T25.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T28.X, T23.Y, literal.x,
+; EG-NEXT:     AND_INT T28.X, T25.Y, literal.x,
 ; EG-NEXT:     MOV T28.Y, 0.0,
-; EG-NEXT:     LSHR T23.Z, T23.X, literal.y,
-; EG-NEXT:     AND_INT * T23.X, T23.X, literal.x,
+; EG-NEXT:     LSHR T25.Z, T25.X, literal.y,
+; EG-NEXT:     AND_INT * T25.X, T25.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T23.Y, 0.0,
-; EG-NEXT:     LSHR * T20.Z, T22.W, literal.x,
+; EG-NEXT:     MOV T25.Y, 0.0,
+; EG-NEXT:     LSHR * T29.Z, T24.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 65:
-; EG-NEXT:     AND_INT T20.X, T22.W, literal.x,
-; EG-NEXT:     MOV T20.Y, 0.0,
-; EG-NEXT:     LSHR T30.Z, T22.Z, literal.y,
-; EG-NEXT:     AND_INT * T30.X, T22.Z, literal.x,
+; EG-NEXT:     AND_INT T29.X, T24.W, literal.x,
+; EG-NEXT:     MOV T29.Y, 0.0,
+; EG-NEXT:     LSHR T30.Z, T24.Z, literal.y,
+; EG-NEXT:     AND_INT * T30.X, T24.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T30.Y, 0.0,
-; EG-NEXT:     LSHR * T31.Z, T22.Y, literal.x,
+; EG-NEXT:     LSHR * T31.Z, T24.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T31.X, T22.Y, literal.x,
+; EG-NEXT:     AND_INT T31.X, T24.Y, literal.x,
 ; EG-NEXT:     MOV T31.Y, 0.0,
-; EG-NEXT:     LSHR T22.Z, T22.X, literal.y,
-; EG-NEXT:     AND_INT * T22.X, T22.X, literal.x,
+; EG-NEXT:     LSHR T24.Z, T24.X, literal.y,
+; EG-NEXT:     AND_INT * T24.X, T24.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T22.Y, 0.0,
-; EG-NEXT:     LSHR * T32.Z, T29.W, literal.x,
+; EG-NEXT:     MOV T24.Y, 0.0,
+; EG-NEXT:     LSHR * T32.Z, T23.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T32.X, T29.W, literal.x,
+; EG-NEXT:     AND_INT T32.X, T23.W, literal.x,
 ; EG-NEXT:     MOV T32.Y, 0.0,
-; EG-NEXT:     LSHR T33.Z, T29.Z, literal.y,
-; EG-NEXT:     AND_INT * T33.X, T29.Z, literal.x,
+; EG-NEXT:     LSHR T33.Z, T23.Z, literal.y,
+; EG-NEXT:     AND_INT * T33.X, T23.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T33.Y, 0.0,
-; EG-NEXT:     LSHR * T34.Z, T29.Y, literal.x,
+; EG-NEXT:     LSHR * T34.Z, T23.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T34.X, T29.Y, literal.x,
+; EG-NEXT:     AND_INT T34.X, T23.Y, literal.x,
 ; EG-NEXT:     MOV T34.Y, 0.0,
-; EG-NEXT:     LSHR T29.Z, T29.X, literal.y,
-; EG-NEXT:     AND_INT * T29.X, T29.X, literal.x,
+; EG-NEXT:     LSHR T23.Z, T23.X, literal.y,
+; EG-NEXT:     AND_INT * T23.X, T23.X, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T29.Y, 0.0,
+; EG-NEXT:     MOV T23.Y, 0.0,
 ; EG-NEXT:     MOV T19.W, 0.0,
-; EG-NEXT:     MOV * T24.W, 0.0,
-; EG-NEXT:     MOV T25.W, 0.0,
+; EG-NEXT:     MOV * T22.W, 0.0,
+; EG-NEXT:     MOV T20.W, 0.0,
 ; EG-NEXT:     MOV * T21.W, 0.0,
 ; EG-NEXT:     MOV T26.W, 0.0,
 ; EG-NEXT:     MOV * T27.W, 0.0,
 ; EG-NEXT:     MOV T28.W, 0.0,
-; EG-NEXT:     MOV * T23.W, 0.0,
-; EG-NEXT:     MOV T20.W, 0.0,
+; EG-NEXT:     MOV * T25.W, 0.0,
+; EG-NEXT:     MOV T29.W, 0.0,
 ; EG-NEXT:     MOV * T30.W, 0.0,
 ; EG-NEXT:     MOV T31.W, 0.0,
-; EG-NEXT:     MOV * T22.W, 0.0,
+; EG-NEXT:     MOV * T24.W, 0.0,
 ; EG-NEXT:     MOV T32.W, 0.0,
 ; EG-NEXT:     MOV * T33.W, 0.0,
 ; EG-NEXT:     MOV T34.W, 0.0,
-; EG-NEXT:     MOV * T29.W, 0.0,
-; EG-NEXT:     LSHR T35.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T42.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T43.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T45.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T23.W, 0.0,
+; EG-NEXT:     LSHR * T35.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T36.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T37.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T38.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T39.X, T35.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T40.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T41.X, T35.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T42.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T43.X, T35.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T44.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T45.X, T35.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T46.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T47.X, T35.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T48.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T49.X, T35.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT * T50.X, T35.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GFX12:       ; %bb.0:
@@ -8886,9 +8746,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
-; EG-NEXT:    ALU 55, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 33, @31, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @24
-; EG-NEXT:    ALU 74, @87, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 74, @65, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T38.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T34.X, 0
@@ -8903,8 +8763,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T24.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T22.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T21.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
@@ -8915,68 +8775,47 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; EG-NEXT:    ALU clause starting at 30:
 ; EG-NEXT:     MOV * T19.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 31:
-; EG-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T35.W, T20.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T35.Z, T20.X, literal.y,
-; EG-NEXT:     ASHR * T37.W, T20.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T35.X, T20.X, 0.0, literal.x,
-; EG-NEXT:     ASHR * T37.Z, T20.Y, literal.x,
+; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T24.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T21.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T26.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T27.X, T21.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T28.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T29.X, T21.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T30.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T31.X, T21.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T32.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T33.X, T21.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T34.X, T21.X, literal.x,
+; EG-NEXT:     ASHR T35.W, T20.Y, literal.y,
+; EG-NEXT:     ADD_INT * T36.X, T21.X, literal.z,
+; EG-NEXT:    52(7.286752e-44), 31(4.344025e-44)
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T35.Z, T20.Y, literal.x,
+; EG-NEXT:     ASHR * T37.W, T20.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT T35.X, T20.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR * T37.Z, T20.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T37.X, T20.Y, 0.0, literal.x,
-; EG-NEXT:     ASHR T35.Y, PV.X, literal.y,
+; EG-NEXT:     BFE_INT T37.X, T20.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR * T19.W, T20.Z, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    ALU clause starting at 87:
-; EG-NEXT:     ASHR T19.Z, T20.Z, literal.x,
-; EG-NEXT:     ASHR * T41.W, T20.W, literal.y,
-; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    ALU clause starting at 65:
+; EG-NEXT:     ASHR T37.Y, T37.X, literal.x,
+; EG-NEXT:     ASHR T19.Z, T20.Z, literal.y,
+; EG-NEXT:     ASHR * T41.W, T20.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T19.X, T20.Z, 0.0, literal.x,
-; EG-NEXT:     ASHR T37.Y, T37.X, literal.y,
+; EG-NEXT:     ASHR T35.Y, T35.X, literal.y,
 ; EG-NEXT:     ASHR T41.Z, T20.W, literal.x,
 ; EG-NEXT:     ASHR * T42.W, T40.X, literal.y, BS:VEC_120/SCL_212
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
@@ -9041,12 +8880,11 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T50.X, T38.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T49.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T38.X, T21.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T50.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T50.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GFX12:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index a44fd4b1b64f3..917c690ec5e89 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -252,21 +252,19 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 6, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:     MOV * T2.X, T0.Z,
+; EG-NEXT:     MOV T1.X, T0.Z,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_load_v3i32:
@@ -496,7 +494,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -506,11 +504,10 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_load_v8i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
@@ -664,7 +661,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 7, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 4, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 1
@@ -677,14 +674,11 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR * T5.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T4.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T5.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
 ;
 ; GFX9-HSA-LABEL: constant_load_v9i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
@@ -848,30 +842,26 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; EG-LABEL: constant_load_v10i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 8, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
+; EG-NEXT:    ALU 2, @17, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T4.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
-; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T1.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T1.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 14:
-; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T4.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T5.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.X, KC0[2].Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 17:
+; EG-NEXT:     ADD_INT T4.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T5.X, T0.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 4(5.605194e-45)
 ;
 ; GFX9-HSA-LABEL: constant_load_v10i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
@@ -1045,11 +1035,11 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 7, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T4.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
@@ -1058,20 +1048,14 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T4.X, T0.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T4.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T5.X, T0.Z,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T6.X, T3.X, literal.x,
+; EG-NEXT:     ADD_INT * T7.X, T3.X, literal.y,
+; EG-NEXT:    10(1.401298e-44), 4(5.605194e-45)
 ;
 ; GFX9-HSA-LABEL: constant_load_v11i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
@@ -1245,9 +1229,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; EG-LABEL: constant_load_v12i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 7, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 4, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 1, @22, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @19, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
@@ -1258,17 +1242,14 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ; EG-NEXT:     VTX_READ_128 T4.XYZW, T2.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 14:
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T2.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 22:
-; EG-NEXT:     LSHR * T5.X, T0.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 19:
+; EG-NEXT:     ADD_INT * T5.X, T0.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_load_v12i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
@@ -1466,9 +1447,9 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; EG-LABEL: constant_load_v16i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 10, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 5, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @8
-; EG-NEXT:    ALU 1, @27, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @22, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
@@ -1480,20 +1461,15 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; EG-NEXT:     VTX_READ_128 T6.XYZW, T3.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T3.XYZW, T3.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 16:
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T3.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 27:
-; EG-NEXT:     LSHR * T7.X, T0.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     MOV * T3.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 22:
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_load_v16i32:
 ; GFX9-HSA:       ; %bb.0: ; %entry
@@ -2297,7 +2273,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
@@ -2314,11 +2290,10 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV T1.W, 0.0,
 ; EG-NEXT:     MOV * T2.Z, T0.Y,
 ; EG-NEXT:     MOV * T2.W, 0.0,
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -2461,31 +2436,30 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
+; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
+; EG-NEXT:     ASHR * T1.W, T0.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ASHR T1.Y, T0.X, literal.y,
-; EG-NEXT:     ASHR T3.W, T0.W, literal.y,
-; EG-NEXT:     MOV * T1.X, T0.X,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:     ASHR * T3.Y, T0.Z, literal.x,
+; EG-NEXT:     ASHR T1.Y, T0.Z, literal.x,
+; EG-NEXT:     ASHR * T2.W, T0.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T3.X, T0.Z,
-; EG-NEXT:     MOV T1.Z, T0.Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T3.Z, T0.W,
+; EG-NEXT:     MOV T1.X, T0.Z,
+; EG-NEXT:     ASHR T2.Y, T0.X, literal.x,
+; EG-NEXT:     MOV * T2.X, T0.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T2.Z, T0.Y,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T1.Z, T0.W,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -2665,7 +2639,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 26, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 22, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
@@ -2693,17 +2667,13 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV T4.W, 0.0,
 ; EG-NEXT:     MOV * T5.Z, T0.Y,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -2928,7 +2898,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 31, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 27, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0
@@ -2940,38 +2910,34 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 12:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 13:
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T4.W, T0.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; EG-NEXT:     ASHR T4.W, T0.Y, literal.y,
+; EG-NEXT:     ADD_INT * T5.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T4.Y, T0.X, literal.x,
+; EG-NEXT:     ASHR * T6.W, T0.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T4.Y, T0.X, literal.y,
-; EG-NEXT:     ASHR T6.W, T0.W, literal.y,
-; EG-NEXT:     MOV * T4.X, T0.X,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     MOV T4.X, T0.X,
 ; EG-NEXT:     ASHR T6.Y, T0.Z, literal.x,
-; EG-NEXT:     ASHR * T7.W, T1.Y, literal.x,
+; EG-NEXT:     ASHR T7.W, T1.Y, literal.x,
+; EG-NEXT:     MOV * T6.X, T0.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T6.X, T0.Z,
 ; EG-NEXT:     ASHR T7.Y, T1.X, literal.x,
 ; EG-NEXT:     MOV T4.Z, T0.Y,
-; EG-NEXT:     ASHR T8.W, T1.W, literal.x,
-; EG-NEXT:     MOV * T7.X, T1.X,
+; EG-NEXT:     ASHR * T8.W, T1.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T7.X, T1.X,
 ; EG-NEXT:     ASHR T8.Y, T1.Z, literal.x,
-; EG-NEXT:     MOV * T6.Z, T0.W,
+; EG-NEXT:     MOV T6.Z, T0.W,
+; EG-NEXT:     MOV * T8.X, T1.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T8.X, T1.Z,
-; EG-NEXT:     MOV T7.Z, T1.Y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T7.Z, T1.Y,
+; EG-NEXT:     ADD_INT T0.X, T2.X, literal.x,
 ; EG-NEXT:     MOV * T8.Z, T1.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -3349,7 +3315,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 63, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 53, @21, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0
@@ -3367,70 +3333,60 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; EG-NEXT:    ALU clause starting at 20:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 21:
-; EG-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T10.W, T0.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT:     LSHR * T4.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T5.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T7.X, T4.X, literal.x,
+; EG-NEXT:     ADD_INT * T8.X, T4.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T9.X, T4.X, literal.x,
+; EG-NEXT:     ASHR T10.W, T0.Y, literal.y,
+; EG-NEXT:     ADD_INT * T11.X, T4.X, literal.z,
+; EG-NEXT:    20(2.802597e-44), 31(4.344025e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T10.Y, T0.X, literal.x,
+; EG-NEXT:     ASHR * T12.W, T0.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T10.Y, T0.X, literal.y,
-; EG-NEXT:     ASHR T12.W, T0.W, literal.y,
-; EG-NEXT:     MOV * T10.X, T0.X,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     MOV T10.X, T0.X,
 ; EG-NEXT:     ASHR T12.Y, T0.Z, literal.x,
-; EG-NEXT:     ASHR * T13.W, T3.Y, literal.x,
+; EG-NEXT:     ASHR T13.W, T3.Y, literal.x,
+; EG-NEXT:     MOV * T12.X, T0.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T12.X, T0.Z,
 ; EG-NEXT:     ASHR T13.Y, T3.X, literal.x,
 ; EG-NEXT:     MOV T10.Z, T0.Y,
-; EG-NEXT:     ASHR T14.W, T3.W, literal.x,
-; EG-NEXT:     MOV * T13.X, T3.X,
+; EG-NEXT:     ASHR * T14.W, T3.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T13.X, T3.X,
 ; EG-NEXT:     ASHR T14.Y, T3.Z, literal.x,
 ; EG-NEXT:     MOV T12.Z, T0.W,
-; EG-NEXT:     ASHR * T0.W, T2.Y, literal.x,
+; EG-NEXT:     ASHR T0.W, T2.Y, literal.x,
+; EG-NEXT:     MOV * T14.X, T3.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T14.X, T3.Z,
 ; EG-NEXT:     ASHR T0.Y, T2.X, literal.x,
 ; EG-NEXT:     MOV T13.Z, T3.Y,
-; EG-NEXT:     ASHR T15.W, T2.W, literal.x,
-; EG-NEXT:     MOV * T0.X, T2.X,
+; EG-NEXT:     ASHR * T15.W, T2.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T0.X, T2.X,
 ; EG-NEXT:     ASHR T15.Y, T2.Z, literal.x,
 ; EG-NEXT:     MOV T14.Z, T3.W,
-; EG-NEXT:     ASHR * T3.W, T1.Y, literal.x,
+; EG-NEXT:     ASHR T3.W, T1.Y, literal.x,
+; EG-NEXT:     MOV * T15.X, T2.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T15.X, T2.Z,
 ; EG-NEXT:     ASHR T3.Y, T1.X, literal.x,
 ; EG-NEXT:     MOV T0.Z, T2.Y,
-; EG-NEXT:     ASHR T16.W, T1.W, literal.x,
-; EG-NEXT:     MOV * T3.X, T1.X,
+; EG-NEXT:     ASHR * T16.W, T1.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T3.X, T1.X,
 ; EG-NEXT:     ASHR T16.Y, T1.Z, literal.x,
-; EG-NEXT:     MOV * T15.Z, T2.W,
+; EG-NEXT:     MOV T15.Z, T2.W,
+; EG-NEXT:     MOV * T16.X, T1.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T16.X, T1.Z,
-; EG-NEXT:     MOV T3.Z, T1.Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T3.Z, T1.Y,
+; EG-NEXT:     ADD_INT T1.X, T4.X, literal.x,
 ; EG-NEXT:     MOV * T16.Z, T1.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -3799,7 +3755,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 54, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 44, @21, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0
@@ -3849,29 +3805,19 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; EG-NEXT:     MOV T10.W, 0.0,
 ; EG-NEXT:     MOV * T11.Z, T2.Y,
 ; EG-NEXT:     MOV * T11.W, 0.0,
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T15.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T12.X, T0.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T13.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T14.X, T0.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -4509,167 +4455,144 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ;
 ; EG-LABEL: constant_sextload_v32i32_to_v32i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 32, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 10, @36, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @20
-; EG-NEXT:    ALU 96, @69, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
+; EG-NEXT:    ALU 95, @47, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T23.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T17.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T16.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T15.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 20:
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 112, #1
-; EG-NEXT:     VTX_READ_128 T13.XYZW, T11.X, 96, #1
-; EG-NEXT:     VTX_READ_128 T14.XYZW, T11.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T15.XYZW, T11.X, 64, #1
-; EG-NEXT:     VTX_READ_128 T16.XYZW, T11.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T17.XYZW, T11.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T18.XYZW, T11.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T7.XYZW, T6.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T8.XYZW, T6.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T9.XYZW, T6.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T10.XYZW, T6.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T6.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T6.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T13.XYZW, T6.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T6.XYZW, T6.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 36:
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T11.X, KC0[2].Z,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 69:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    176(2.466285e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T22.W, T11.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     MOV * T6.X, KC0[2].Z,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 47:
+; EG-NEXT:     ADD_INT T14.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.y,
+; EG-NEXT:    24(3.363116e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T16.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, T0.X, literal.y,
+; EG-NEXT:    32(4.484155e-44), 36(5.044674e-44)
+; EG-NEXT:     ADD_INT T18.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T0.X, literal.y,
+; EG-NEXT:    40(5.605194e-44), 44(6.165713e-44)
+; EG-NEXT:     ADD_INT T20.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T0.X, literal.y,
+; EG-NEXT:    48(6.726233e-44), 52(7.286752e-44)
+; EG-NEXT:     ASHR * T22.W, T6.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T22.Y, T11.X, literal.y,
-; EG-NEXT:     ASHR T24.W, T11.W, literal.y,
-; EG-NEXT:     MOV * T22.X, T11.X,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:     ASHR T24.Y, T11.Z, literal.x,
-; EG-NEXT:     ASHR * T25.W, T18.Y, literal.x,
+; EG-NEXT:     ADD_INT T23.X, T0.X, literal.x,
+; EG-NEXT:     ASHR T22.Y, T6.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ASHR T24.W, T6.W, literal.y,
+; EG-NEXT:     MOV * T22.X, T6.X,
+; EG-NEXT:    56(7.847271e-44), 31(4.344025e-44)
+; EG-NEXT:     ASHR T24.Y, T6.Z, literal.x,
+; EG-NEXT:     ASHR * T25.W, T13.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T24.X, T11.Z,
-; EG-NEXT:     ASHR T25.Y, T18.X, literal.x,
-; EG-NEXT:     MOV T22.Z, T11.Y,
-; EG-NEXT:     ASHR T26.W, T18.W, literal.x,
-; EG-NEXT:     MOV * T25.X, T18.X,
+; EG-NEXT:     MOV T24.X, T6.Z,
+; EG-NEXT:     ASHR T25.Y, T13.X, literal.x,
+; EG-NEXT:     MOV T22.Z, T6.Y,
+; EG-NEXT:     ASHR T26.W, T13.W, literal.x,
+; EG-NEXT:     MOV * T25.X, T13.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T26.Y, T18.Z, literal.x,
-; EG-NEXT:     MOV T24.Z, T11.W,
-; EG-NEXT:     ASHR * T11.W, T17.Y, literal.x,
+; EG-NEXT:     ASHR T26.Y, T13.Z, literal.x,
+; EG-NEXT:     MOV T24.Z, T6.W,
+; EG-NEXT:     ASHR * T6.W, T12.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T26.X, T18.Z,
-; EG-NEXT:     ASHR T11.Y, T17.X, literal.x,
-; EG-NEXT:     MOV T25.Z, T18.Y,
-; EG-NEXT:     ASHR T27.W, T17.W, literal.x,
-; EG-NEXT:     MOV * T11.X, T17.X,
+; EG-NEXT:     MOV T26.X, T13.Z,
+; EG-NEXT:     ASHR T6.Y, T12.X, literal.x,
+; EG-NEXT:     MOV T25.Z, T13.Y,
+; EG-NEXT:     ASHR T27.W, T12.W, literal.x,
+; EG-NEXT:     MOV * T6.X, T12.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T27.Y, T17.Z, literal.x,
-; EG-NEXT:     MOV T26.Z, T18.W,
-; EG-NEXT:     ASHR * T18.W, T16.Y, literal.x,
+; EG-NEXT:     ASHR T27.Y, T12.Z, literal.x,
+; EG-NEXT:     MOV T26.Z, T13.W,
+; EG-NEXT:     ASHR * T13.W, T11.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T27.X, T17.Z,
-; EG-NEXT:     ASHR T18.Y, T16.X, literal.x,
-; EG-NEXT:     MOV T11.Z, T17.Y,
-; EG-NEXT:     ASHR T28.W, T16.W, literal.x,
-; EG-NEXT:     MOV * T18.X, T16.X,
+; EG-NEXT:     MOV T27.X, T12.Z,
+; EG-NEXT:     ASHR T13.Y, T11.X, literal.x,
+; EG-NEXT:     MOV T6.Z, T12.Y,
+; EG-NEXT:     ASHR T28.W, T11.W, literal.x,
+; EG-NEXT:     MOV * T13.X, T11.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T28.Y, T16.Z, literal.x,
-; EG-NEXT:     MOV T27.Z, T17.W,
-; EG-NEXT:     ASHR * T17.W, T15.Y, literal.x,
+; EG-NEXT:     ASHR T28.Y, T11.Z, literal.x,
+; EG-NEXT:     MOV T27.Z, T12.W,
+; EG-NEXT:     ASHR * T12.W, T10.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T28.X, T16.Z,
-; EG-NEXT:     ASHR T17.Y, T15.X, literal.x,
-; EG-NEXT:     MOV T18.Z, T16.Y,
-; EG-NEXT:     ASHR T29.W, T15.W, literal.x,
-; EG-NEXT:     MOV * T17.X, T15.X,
+; EG-NEXT:     MOV T28.X, T11.Z,
+; EG-NEXT:     ASHR T12.Y, T10.X, literal.x,
+; EG-NEXT:     MOV T13.Z, T11.Y,
+; EG-NEXT:     ASHR T29.W, T10.W, literal.x,
+; EG-NEXT:     MOV * T12.X, T10.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T29.Y, T15.Z, literal.x,
-; EG-NEXT:     MOV T28.Z, T16.W,
-; EG-NEXT:     ASHR * T16.W, T14.Y, literal.x,
+; EG-NEXT:     ASHR T29.Y, T10.Z, literal.x,
+; EG-NEXT:     MOV T28.Z, T11.W,
+; EG-NEXT:     ASHR * T11.W, T9.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T29.X, T15.Z,
-; EG-NEXT:     ASHR T16.Y, T14.X, literal.x,
-; EG-NEXT:     MOV T17.Z, T15.Y,
-; EG-NEXT:     ASHR T30.W, T14.W, literal.x,
-; EG-NEXT:     MOV * T16.X, T14.X,
+; EG-NEXT:     MOV T29.X, T10.Z,
+; EG-NEXT:     ASHR T11.Y, T9.X, literal.x,
+; EG-NEXT:     MOV T12.Z, T10.Y,
+; EG-NEXT:     ASHR T30.W, T9.W, literal.x,
+; EG-NEXT:     MOV * T11.X, T9.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T30.Y, T14.Z, literal.x,
-; EG-NEXT:     MOV T29.Z, T15.W,
-; EG-NEXT:     ASHR * T15.W, T13.Y, literal.x,
+; EG-NEXT:     ASHR T30.Y, T9.Z, literal.x,
+; EG-NEXT:     MOV T29.Z, T10.W,
+; EG-NEXT:     ASHR * T10.W, T8.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T30.X, T14.Z,
-; EG-NEXT:     ASHR T15.Y, T13.X, literal.x,
-; EG-NEXT:     MOV T16.Z, T14.Y,
-; EG-NEXT:     ASHR T31.W, T13.W, literal.x,
-; EG-NEXT:     MOV * T15.X, T13.X,
+; EG-NEXT:     MOV T30.X, T9.Z,
+; EG-NEXT:     ASHR T10.Y, T8.X, literal.x,
+; EG-NEXT:     MOV T11.Z, T9.Y,
+; EG-NEXT:     ASHR T31.W, T8.W, literal.x,
+; EG-NEXT:     MOV * T10.X, T8.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T31.Y, T13.Z, literal.x,
-; EG-NEXT:     MOV T30.Z, T14.W,
-; EG-NEXT:     ASHR * T14.W, T12.Y, literal.x,
+; EG-NEXT:     ASHR T31.Y, T8.Z, literal.x,
+; EG-NEXT:     MOV T30.Z, T9.W,
+; EG-NEXT:     ASHR * T9.W, T7.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T31.X, T13.Z,
-; EG-NEXT:     ASHR T14.Y, T12.X, literal.x,
-; EG-NEXT:     MOV T15.Z, T13.Y,
-; EG-NEXT:     ASHR T32.W, T12.W, literal.x,
-; EG-NEXT:     MOV * T14.X, T12.X,
+; EG-NEXT:     MOV T31.X, T8.Z,
+; EG-NEXT:     ASHR T9.Y, T7.X, literal.x,
+; EG-NEXT:     MOV T10.Z, T8.Y,
+; EG-NEXT:     ASHR T32.W, T7.W, literal.x,
+; EG-NEXT:     MOV * T9.X, T7.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T32.Y, T12.Z, literal.x,
-; EG-NEXT:     MOV * T31.Z, T13.W,
+; EG-NEXT:     ASHR T32.Y, T7.Z, literal.x,
+; EG-NEXT:     MOV * T31.Z, T8.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T32.X, T12.Z,
-; EG-NEXT:     MOV T14.Z, T12.Y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T32.Z, T12.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV T32.X, T7.Z,
+; EG-NEXT:     MOV T9.Z, T7.Y,
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV * T32.Z, T7.W,
 ;
 ; GFX9-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -5357,153 +5280,127 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ;
 ; EG-LABEL: constant_zextload_v32i32_to_v32i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 2 @22
-; EG-NEXT:    ALU 10, @39, KC0[], KC1[]
-; EG-NEXT:    TEX 4 @28
-; EG-NEXT:    ALU 99, @50, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0
+; EG-NEXT:    ALU 0, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 7 @20
+; EG-NEXT:    ALU 88, @37, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T31.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T30.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T29.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
-; EG-NEXT:    Fetch clause starting at 22:
+; EG-NEXT:    Fetch clause starting at 20:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 112, #1
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 96, #1
-; EG-NEXT:    Fetch clause starting at 28:
-; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T11.XYZW, T0.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T13.XYZW, T0.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 64, #1
-; EG-NEXT:    ALU clause starting at 38:
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T4.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T5.XYZW, T0.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T6.XYZW, T0.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T7.XYZW, T0.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 96, #1
+; EG-NEXT:    ALU clause starting at 36:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 39:
-; EG-NEXT:     MOV T4.X, T1.Z,
-; EG-NEXT:     MOV T4.Y, 0.0,
-; EG-NEXT:     MOV * T5.X, T1.X,
-; EG-NEXT:     MOV * T5.Y, 0.0,
-; EG-NEXT:     MOV T6.X, T3.Z,
-; EG-NEXT:     MOV T6.Y, 0.0,
-; EG-NEXT:     MOV * T7.X, T3.X,
-; EG-NEXT:     MOV * T7.Y, 0.0,
-; EG-NEXT:     MOV T8.X, T2.Z,
+; EG-NEXT:    ALU clause starting at 37:
+; EG-NEXT:     MOV T8.X, T1.Z,
 ; EG-NEXT:     MOV T8.Y, 0.0,
-; EG-NEXT:     MOV * T9.X, T2.X,
-; EG-NEXT:    ALU clause starting at 50:
+; EG-NEXT:     MOV * T9.X, T1.X,
 ; EG-NEXT:     MOV * T9.Y, 0.0,
-; EG-NEXT:     MOV T14.X, T0.Z,
+; EG-NEXT:     MOV T10.X, T0.Z,
+; EG-NEXT:     MOV T10.Y, 0.0,
+; EG-NEXT:     MOV * T11.X, T0.X,
+; EG-NEXT:     MOV * T11.Y, 0.0,
+; EG-NEXT:     MOV T12.X, T7.Z,
+; EG-NEXT:     MOV T12.Y, 0.0,
+; EG-NEXT:     MOV * T13.X, T7.X,
+; EG-NEXT:     MOV * T13.Y, 0.0,
+; EG-NEXT:     MOV T14.X, T6.Z,
 ; EG-NEXT:     MOV T14.Y, 0.0,
-; EG-NEXT:     MOV * T15.X, T0.X,
+; EG-NEXT:     MOV * T15.X, T6.X,
 ; EG-NEXT:     MOV * T15.Y, 0.0,
-; EG-NEXT:     MOV T16.X, T13.Z,
+; EG-NEXT:     MOV T16.X, T5.Z,
 ; EG-NEXT:     MOV T16.Y, 0.0,
-; EG-NEXT:     MOV * T17.X, T13.X,
+; EG-NEXT:     MOV * T17.X, T5.X,
 ; EG-NEXT:     MOV * T17.Y, 0.0,
-; EG-NEXT:     MOV T18.X, T12.Z,
+; EG-NEXT:     MOV T18.X, T4.Z,
 ; EG-NEXT:     MOV T18.Y, 0.0,
-; EG-NEXT:     MOV * T19.X, T12.X,
+; EG-NEXT:     MOV * T19.X, T4.X,
 ; EG-NEXT:     MOV * T19.Y, 0.0,
-; EG-NEXT:     MOV T20.X, T11.Z,
+; EG-NEXT:     MOV T20.X, T3.Z,
 ; EG-NEXT:     MOV T20.Y, 0.0,
-; EG-NEXT:     MOV * T21.X, T11.X,
+; EG-NEXT:     MOV * T21.X, T3.X,
 ; EG-NEXT:     MOV * T21.Y, 0.0,
-; EG-NEXT:     MOV T22.X, T10.Z,
+; EG-NEXT:     MOV T22.X, T2.Z,
 ; EG-NEXT:     MOV T22.Y, 0.0,
-; EG-NEXT:     MOV * T23.X, T10.X,
+; EG-NEXT:     MOV * T23.X, T2.X,
 ; EG-NEXT:     MOV T23.Y, 0.0,
-; EG-NEXT:     MOV T4.Z, T1.W,
-; EG-NEXT:     MOV T4.W, 0.0,
-; EG-NEXT:     MOV * T5.Z, T1.Y,
-; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     MOV T6.Z, T3.W,
-; EG-NEXT:     MOV T6.W, 0.0,
-; EG-NEXT:     MOV * T7.Z, T3.Y,
-; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     MOV T8.Z, T2.W,
+; EG-NEXT:     MOV T8.Z, T1.W,
 ; EG-NEXT:     MOV T8.W, 0.0,
-; EG-NEXT:     MOV * T9.Z, T2.Y,
+; EG-NEXT:     MOV * T9.Z, T1.Y,
 ; EG-NEXT:     MOV * T9.W, 0.0,
-; EG-NEXT:     MOV T14.Z, T0.W,
+; EG-NEXT:     MOV T10.Z, T0.W,
+; EG-NEXT:     MOV T10.W, 0.0,
+; EG-NEXT:     MOV * T11.Z, T0.Y,
+; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     MOV T12.Z, T7.W,
+; EG-NEXT:     MOV T12.W, 0.0,
+; EG-NEXT:     MOV * T13.Z, T7.Y,
+; EG-NEXT:     MOV * T13.W, 0.0,
+; EG-NEXT:     MOV T14.Z, T6.W,
 ; EG-NEXT:     MOV T14.W, 0.0,
-; EG-NEXT:     MOV * T15.Z, T0.Y,
+; EG-NEXT:     MOV * T15.Z, T6.Y,
 ; EG-NEXT:     MOV * T15.W, 0.0,
-; EG-NEXT:     MOV T16.Z, T13.W,
+; EG-NEXT:     MOV T16.Z, T5.W,
 ; EG-NEXT:     MOV T16.W, 0.0,
-; EG-NEXT:     MOV * T17.Z, T13.Y,
+; EG-NEXT:     MOV * T17.Z, T5.Y,
 ; EG-NEXT:     MOV * T17.W, 0.0,
-; EG-NEXT:     MOV T18.Z, T12.W,
+; EG-NEXT:     MOV T18.Z, T4.W,
 ; EG-NEXT:     MOV T18.W, 0.0,
-; EG-NEXT:     MOV * T19.Z, T12.Y,
+; EG-NEXT:     MOV * T19.Z, T4.Y,
 ; EG-NEXT:     MOV * T19.W, 0.0,
-; EG-NEXT:     MOV T20.Z, T11.W,
+; EG-NEXT:     MOV T20.Z, T3.W,
 ; EG-NEXT:     MOV T20.W, 0.0,
-; EG-NEXT:     MOV * T21.Z, T11.Y,
+; EG-NEXT:     MOV * T21.Z, T3.Y,
 ; EG-NEXT:     MOV * T21.W, 0.0,
-; EG-NEXT:     MOV T22.Z, T10.W,
+; EG-NEXT:     MOV T22.Z, T2.W,
 ; EG-NEXT:     MOV T22.W, 0.0,
-; EG-NEXT:     MOV * T23.Z, T10.Y,
+; EG-NEXT:     MOV * T23.Z, T2.Y,
 ; EG-NEXT:     MOV * T23.W, 0.0,
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR * T31.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, T0.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T7.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T24.X, T0.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T25.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T26.X, T0.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T27.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T28.X, T0.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T29.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T30.X, T0.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT * T31.X, T0.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX9-HSA:       ; %bb.0:
@@ -5934,9 +5831,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; EG-LABEL: constant_load_v32i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 22, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @28, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @12
-; EG-NEXT:    ALU 1, @51, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @40, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
@@ -5956,32 +5853,21 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; EG-NEXT:     VTX_READ_128 T14.XYZW, T7.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 28:
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T7.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 51:
-; EG-NEXT:     LSHR * T15.X, T0.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, T0.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     MOV * T7.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 40:
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX9-HSA-LABEL: constant_load_v32i32:
 ; GFX9-HSA:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 6f310f35327ce..230232fe21524 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -235,21 +235,20 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_load_v3i64:
 ; GFX12:       ; %bb.0: ; %entry
@@ -350,7 +349,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -360,11 +359,10 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_load_v4i64:
 ; GFX12:       ; %bb.0: ; %entry
@@ -510,9 +508,9 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; EG-LABEL: constant_load_v8i64:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 10, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 5, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @8
-; EG-NEXT:    ALU 1, @27, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @22, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
@@ -524,20 +522,15 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; EG-NEXT:     VTX_READ_128 T6.XYZW, T3.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T3.XYZW, T3.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 16:
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T3.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 27:
-; EG-NEXT:     LSHR * T7.X, T0.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     MOV * T3.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 22:
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_load_v8i64:
 ; GFX12:       ; %bb.0: ; %entry
@@ -797,9 +790,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ;
 ; EG-LABEL: constant_load_v16i64:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 22, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @28, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @12
-; EG-NEXT:    ALU 1, @51, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @40, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
@@ -819,32 +812,21 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; EG-NEXT:     VTX_READ_128 T14.XYZW, T7.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 28:
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T7.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 51:
-; EG-NEXT:     LSHR * T15.X, T0.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, T0.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     MOV * T7.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 40:
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_load_v16i64:
 ; GFX12:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 25136d6194ad0..6584a1e2b760e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1139,7 +1139,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
 ; EG-NEXT:    CF_END
@@ -1152,13 +1152,12 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT * T5.Y, T4.X, literal.x, PV.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T5.X, T4.X, literal.x,
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.y,
-; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
+; EG-NEXT:     AND_INT * T5.X, T4.X, literal.x,
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T4.X, T4.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT:     ADD_INT * T7.X, PS, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32:
@@ -1244,25 +1243,25 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
 ; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T4.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XY, T5.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XY, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T4.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T0.W, T4.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT * T6.X, PV.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T4.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T5.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T4.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.W, T4.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T6.X, T4.X, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT:     LSHR * T0.W, T4.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T4.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, literal.y,
+; EG-NEXT:     ADD_INT T4.X, T7.X, literal.x,
+; EG-NEXT:     BFE_INT * T6.Y, PV.W, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
@@ -1604,7 +1603,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
 ; EG-NEXT:    CF_END
@@ -1618,22 +1617,19 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_UINT * T6.Z, T5.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T6.Y, T5.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T7.Z, T5.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T6.W, T5.X, literal.z,
+; EG-NEXT:     BFE_UINT * T7.Z, T5.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T7.Y, T5.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T6.W, T5.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:     AND_INT T6.X, T5.X, literal.x,
-; EG-NEXT:     BFE_UINT T7.Y, T5.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T7.W, T5.Y, literal.x,
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T7.X, T5.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T7.W, T5.Y, literal.y,
+; EG-NEXT:     AND_INT * T7.X, T5.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T8.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
 ; GFX12:       ; %bb.0:
@@ -1766,7 +1762,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 23, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 21, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
 ; EG-NEXT:    CF_END
@@ -1792,13 +1788,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     BFE_INT T6.Y, PS, 0.0, literal.y,
 ; EG-NEXT:     BFE_INT T7.Z, PV.Y, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.W, T5.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR * T0.W, T5.Y, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T8.X, PS, literal.x,
+; EG-NEXT:     ADD_INT T8.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
 ; GFX12:       ; %bb.0:
@@ -2003,7 +1997,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 39, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T11.X, 0
@@ -2019,41 +2013,35 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; EG-NEXT:     BFE_UINT * T8.Z, T7.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T8.Y, T7.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T9.Z, T7.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T8.W, T7.X, literal.z,
+; EG-NEXT:     BFE_UINT * T9.Z, T7.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T9.Y, T7.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T8.W, T7.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:     AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT:     BFE_UINT T9.Y, T7.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T10.Z, T7.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T9.W, T7.Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T9.X, T7.Y, literal.x,
-; EG-NEXT:     BFE_UINT T10.Y, T7.Z, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:     LSHR T9.W, T7.Y, literal.y,
+; EG-NEXT:     AND_INT * T9.X, T7.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T10.Z, T7.Z, literal.x, T0.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_UINT * T10.Y, T7.Z, literal.y, T0.W,
+; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T11.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_UINT T12.Z, T7.W, literal.y, T0.W,
 ; EG-NEXT:     LSHR T10.W, T7.Z, literal.z,
 ; EG-NEXT:     AND_INT * T10.X, T7.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T12.Y, T7.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
+; EG-NEXT:     BFE_UINT * T12.Y, T7.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T13.X, T7.X, literal.x,
 ; EG-NEXT:     LSHR T12.W, T7.W, literal.y,
 ; EG-NEXT:     AND_INT * T12.X, T7.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T14.X, T7.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
 ; GFX12:       ; %bb.0:
@@ -2272,65 +2260,59 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 47, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
+; EG-NEXT:    ALU 41, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T8.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR T0.W, T7.W, literal.y,
-; EG-NEXT:     LSHR * T1.W, T7.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT * T8.X, T7.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T9.X, T7.X, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.Y, T7.W, literal.y,
-; EG-NEXT:     LSHR T0.Z, T7.Z, literal.z,
-; EG-NEXT:     LSHR T2.W, T7.Y, literal.x,
-; EG-NEXT:     LSHR * T3.W, T7.X, literal.y,
-; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.Z, T7.W, literal.z,
+; EG-NEXT:     LSHR T0.W, T7.Z, literal.y,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T10.X, T7.Y, 0.0, literal.x,
 ; EG-NEXT:     LSHR T1.Y, T7.Z, literal.y,
 ; EG-NEXT:     LSHR T1.Z, T7.Y, literal.y,
 ; EG-NEXT:     BFE_INT T9.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T7.X, literal.z,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T11.X, T7.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR T2.Y, T7.Y, literal.y,
 ; EG-NEXT:     BFE_INT T9.Z, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T10.W, PV.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T7.X, literal.x,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T12.X, T7.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T9.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T10.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T11.W, T1.Y, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T7.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T10.Y, T2.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T11.Z, T0.Z, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T12.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T13.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T12.Z, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T0.W, T7.W, literal.y, BS:VEC_201
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T14.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T12.Y, PV.W, 0.0, literal.y,
+; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T9.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T10.Z, PV.Y, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T11.W, T1.Y, 0.0, literal.y,
+; EG-NEXT:     LSHR * T1.W, T7.Y, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T12.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T10.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T11.Z, T0.W, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T8.W, T0.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T7.Z, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T13.X, T7.X, literal.x,
+; EG-NEXT:     BFE_INT T11.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T8.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR T0.W, T7.W, literal.x,
+; EG-NEXT:     ADD_INT * T14.X, T7.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 12(1.681558e-44)
+; EG-NEXT:     BFE_INT * T8.Y, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
 ; GFX12:       ; %bb.0:
@@ -2693,7 +2675,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @12
-; EG-NEXT:    ALU 75, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 64, @17, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0
@@ -2714,77 +2696,66 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; EG-NEXT:     BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T13.Y, T11.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T14.Z, T11.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T13.W, T11.X, literal.z,
+; EG-NEXT:     BFE_UINT * T14.Z, T11.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T14.Y, T11.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T13.W, T11.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:     AND_INT T13.X, T11.X, literal.x,
-; EG-NEXT:     BFE_UINT T14.Y, T11.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T11.X, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T15.Z, T11.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T14.W, T11.Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T14.X, T11.Y, literal.x,
-; EG-NEXT:     BFE_UINT T15.Y, T11.Z, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:     LSHR T14.W, T11.Y, literal.y,
+; EG-NEXT:     AND_INT * T14.X, T11.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T15.Z, T11.Z, literal.x, T0.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_UINT * T15.Y, T11.Z, literal.y, T0.W,
+; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_UINT T17.Z, T11.W, literal.y, T0.W,
 ; EG-NEXT:     LSHR T15.W, T11.Z, literal.z,
 ; EG-NEXT:     AND_INT * T15.X, T11.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T17.Y, T11.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T17.W, T11.W, literal.z,
+; EG-NEXT:     BFE_UINT * T17.Y, T11.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T18.X, T11.X, literal.x,
+; EG-NEXT:     BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T17.W, T11.W, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     AND_INT * T17.X, T11.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T19.Y, T12.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
+; EG-NEXT:     BFE_UINT * T19.Y, T12.X, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T20.X, T11.X, literal.x,
 ; EG-NEXT:     BFE_UINT T21.Z, T12.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T19.W, T12.X, literal.z,
+; EG-NEXT:     LSHR T19.W, T12.X, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     AND_INT * T19.X, T12.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T21.Y, T12.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T22.Z, T12.Z, literal.y, T0.W,
-; EG-NEXT:     LSHR T21.W, T12.Y, literal.z,
-; EG-NEXT:     AND_INT * T21.X, T12.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T22.Y, T12.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
+; EG-NEXT:     BFE_UINT * T21.Y, T12.Y, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, T11.X, literal.x,
+; EG-NEXT:     BFE_UINT T22.Z, T12.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR T21.W, T12.Y, literal.y,
+; EG-NEXT:     AND_INT * T21.X, T12.Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T22.Y, T12.Z, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T23.X, T11.X, literal.x,
 ; EG-NEXT:     BFE_UINT T24.Z, T12.W, literal.y, T0.W,
 ; EG-NEXT:     LSHR T22.W, T12.Z, literal.z,
 ; EG-NEXT:     AND_INT * T22.X, T12.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    20(2.802597e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T24.Y, T12.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T24.W, T12.W, literal.y,
-; EG-NEXT:     AND_INT * T24.X, T12.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T24.Y, T12.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T25.X, T11.X, literal.x,
+; EG-NEXT:     LSHR T24.W, T12.W, literal.x,
+; EG-NEXT:     AND_INT * T24.X, T12.W, literal.y,
+; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
+; EG-NEXT:     ADD_INT * T26.X, T11.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX12:       ; %bb.0:
@@ -3178,9 +3149,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @18, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @14
-; EG-NEXT:    ALU 18, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 9, @19, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @16
-; EG-NEXT:    ALU 75, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 73, @29, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 0
@@ -3197,102 +3168,91 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 18:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 19:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.Z, T12.W, literal.y,
-; EG-NEXT:     LSHR T0.W, T12.Z, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T16.X, PS, literal.x,
-; EG-NEXT:     LSHR T0.Y, T12.W, literal.y,
-; EG-NEXT:     LSHR T1.Z, T12.Z, literal.z,
-; EG-NEXT:     LSHR T1.W, T12.Y, literal.w,
-; EG-NEXT:     LSHR * T2.W, T12.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT:    ALU clause starting at 38:
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.x,
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T1.Y, T12.Y, literal.y,
-; EG-NEXT:     LSHR T2.Z, T12.Y, literal.z,
-; EG-NEXT:     LSHR T3.W, T12.X, literal.y,
-; EG-NEXT:     LSHR * T4.W, T12.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; EG-NEXT:     LSHR T0.W, T12.W, literal.y,
+; EG-NEXT:     LSHR * T1.W, T12.W, literal.z,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 29:
+; EG-NEXT:     LSHR T2.W, T12.Z, literal.x,
+; EG-NEXT:     LSHR * T3.W, T12.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T17.X, T13.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T12.Y, literal.x,
+; EG-NEXT:     LSHR T0.Z, T12.Y, literal.y,
+; EG-NEXT:     LSHR T4.W, T12.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T5.W, T12.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     BFE_INT T18.X, T11.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T2.Y, T11.W, literal.y,
-; EG-NEXT:     LSHR T3.Z, T11.W, literal.z,
-; EG-NEXT:     LSHR T5.W, T11.Z, literal.y,
-; EG-NEXT:     LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT:     LSHR T1.Y, T11.W, literal.y,
+; EG-NEXT:     LSHR T1.Z, T11.W, literal.z,
+; EG-NEXT:     LSHR T6.W, T11.Z, literal.y,
+; EG-NEXT:     LSHR * T7.W, T11.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T19.X, T11.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR T3.Y, T11.Z, literal.y,
-; EG-NEXT:     LSHR T4.Z, T11.Y, literal.y,
+; EG-NEXT:     LSHR T2.Y, T11.Z, literal.y,
+; EG-NEXT:     LSHR T2.Z, T11.Y, literal.y,
 ; EG-NEXT:     BFE_INT T18.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT:     LSHR * T7.W, T11.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T20.X, T11.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T4.Y, T11.Y, literal.y,
+; EG-NEXT:     LSHR T3.Y, T11.Y, literal.y,
 ; EG-NEXT:     BFE_INT T18.Z, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T19.W, PV.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T7.W, T11.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T21.X, T11.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T18.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T19.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T20.W, T3.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T11.Y, literal.x,
+; EG-NEXT:     BFE_INT T20.W, T2.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.W, T11.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T22.X, T12.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T19.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T20.Z, T5.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.W, T3.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T5.W, T11.Z, literal.x,
+; EG-NEXT:     BFE_INT T20.Z, T6.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T21.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T6.W, T11.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T11.X, T12.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT T22.W, T4.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.W, literal.x,
+; EG-NEXT:     BFE_INT T21.Z, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T22.W, T5.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T23.X, T12.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T22.Z, T3.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T11.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.W, T12.X, literal.x,
+; EG-NEXT:     BFE_INT T22.Z, T4.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T11.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T4.W, T12.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T24.X, T12.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T22.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T11.Z, T1.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T23.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T12.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T23.Z, T1.Z, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T24.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T25.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T23.Y, T0.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T24.Z, T0.Z, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T11.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T23.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T12.Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT T11.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T23.Z, T2.W, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T24.W, T1.W, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T12.Z, literal.y,
+; EG-NEXT:    20(2.802597e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T25.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT T23.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T24.Z, T0.W, 0.0, literal.y,
 ; EG-NEXT:     LSHR T0.W, T12.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T24.Y, PV.W, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T26.X, T13.X, literal.z,
+; EG-NEXT:    24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T24.Y, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX12:       ; %bb.0:
@@ -3975,185 +3935,166 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @22
-; EG-NEXT:    ALU 59, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 36, @31, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @26
-; EG-NEXT:    ALU 88, @91, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 92, @68, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T32.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T29.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T30.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T35.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T27.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T24.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
-; EG-NEXT:     VTX_READ_128 T22.XYZW, T21.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T23.XYZW, T21.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T23.XYZW, T22.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T24.XYZW, T22.X, 0, #1
 ; EG-NEXT:    Fetch clause starting at 26:
-; EG-NEXT:     VTX_READ_128 T32.XYZW, T21.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T33.XYZW, T21.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T29.XYZW, T22.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T30.XYZW, T22.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 30:
-; EG-NEXT:     MOV * T21.X, KC0[2].Z,
+; EG-NEXT:     MOV * T22.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 31:
 ; EG-NEXT:     MOV * T0.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT * T19.Z, T23.X, literal.x, PV.W,
+; EG-NEXT:     BFE_UINT * T19.Z, T24.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T19.Y, T23.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T20.Z, T23.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T19.W, T23.X, literal.z,
+; EG-NEXT:     BFE_UINT T19.Y, T24.X, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T20.Z, T24.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT:     BFE_UINT T20.Y, T24.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T19.W, T24.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T19.X, T24.X, literal.x,
+; EG-NEXT:     LSHR T20.W, T24.Y, literal.y,
+; EG-NEXT:     AND_INT * T20.X, T24.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T21.Z, T24.Z, literal.x, T0.W,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T24.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_UINT * T21.Y, T24.Z, literal.y, T0.W,
+; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T25.X, PV.X, literal.x,
+; EG-NEXT:     BFE_UINT T26.Z, T24.W, literal.y, T0.W,
+; EG-NEXT:     LSHR T21.W, T24.Z, literal.z,
+; EG-NEXT:     AND_INT * T21.X, T24.Z, literal.w,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
+; EG-NEXT:     BFE_UINT * T26.Y, T24.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T27.X, T24.X, literal.x,
+; EG-NEXT:     LSHR T26.W, T24.W, literal.y,
+; EG-NEXT:     AND_INT * T26.X, T24.W, literal.z,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T22.Z, T23.X, literal.x, T0.W,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T28.X, T24.X, literal.x,
+; EG-NEXT:     BFE_UINT * T22.Y, T23.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; EG-NEXT:    12(1.681558e-44), 8(1.121039e-44)
+; EG-NEXT:    ALU clause starting at 68:
+; EG-NEXT:     BFE_UINT T31.Z, T23.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T22.W, T23.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T22.X, T23.X, literal.x,
+; EG-NEXT:     BFE_UINT T31.Y, T23.Y, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T23.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T32.Z, T23.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T31.W, T23.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T31.X, T23.Y, literal.x,
+; EG-NEXT:     BFE_UINT T32.Y, T23.Z, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T33.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T34.Z, T23.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T32.W, T23.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T32.X, T23.Z, literal.x,
+; EG-NEXT:     BFE_UINT T34.Y, T23.W, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T35.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T19.X, T23.X, literal.x,
-; EG-NEXT:     BFE_UINT T20.Y, T23.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T23.X, KC0[2].Y, literal.z,
+; EG-NEXT:     BFE_UINT T36.Z, T30.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T34.W, T23.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T34.X, T23.W, literal.x,
+; EG-NEXT:     BFE_UINT T36.Y, T30.X, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T37.X, T24.X, literal.z,
 ; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T24.Z, T23.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T20.W, T23.Y, literal.y,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T38.Z, T30.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T36.W, T30.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T20.X, T23.Y, literal.x,
-; EG-NEXT:     BFE_UINT T24.Y, T23.Z, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     AND_INT T36.X, T30.X, literal.x,
+; EG-NEXT:     BFE_UINT T38.Y, T30.Y, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T30.X, T24.X, literal.z,
 ; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T26.Z, T23.W, literal.y, T0.W,
-; EG-NEXT:     LSHR T24.W, T23.Z, literal.z,
-; EG-NEXT:     AND_INT * T24.X, T23.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T26.Y, T23.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T26.W, T23.W, literal.z,
-; EG-NEXT:     AND_INT * T26.X, T23.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T28.Y, T22.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T30.Z, T22.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T28.W, T22.X, literal.z,
-; EG-NEXT:     AND_INT * T28.X, T22.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T30.Y, T22.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T30.W, T22.Y, literal.y,
-; EG-NEXT:     AND_INT * T30.X, T22.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T21.Z, T22.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT * T21.Y, T22.Z, literal.y, T0.W,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    ALU clause starting at 91:
-; EG-NEXT:     BFE_UINT T34.Z, T22.W, literal.x, T0.W,
-; EG-NEXT:     LSHR * T21.W, T22.Z, literal.y,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T39.Z, T30.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T38.W, T30.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T21.X, T22.Z, literal.x,
-; EG-NEXT:     BFE_UINT T34.Y, T22.W, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     AND_INT T38.X, T30.Y, literal.x,
+; EG-NEXT:     BFE_UINT T39.Y, T30.Z, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T40.X, T24.X, literal.z,
 ; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T34.W, T22.W, literal.z,
-; EG-NEXT:     AND_INT * T34.X, T22.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T36.Y, T33.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 112(1.569454e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T38.Z, T33.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T36.W, T33.X, literal.z,
-; EG-NEXT:     AND_INT * T36.X, T33.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T38.Y, T33.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 128(1.793662e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T39.Z, T33.Z, literal.y, T0.W,
-; EG-NEXT:     LSHR T38.W, T33.Y, literal.z,
-; EG-NEXT:     AND_INT * T38.X, T33.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T39.Y, T33.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 144(2.017870e-43)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T41.Z, T33.W, literal.y, T0.W,
-; EG-NEXT:     LSHR T39.W, T33.Z, literal.z,
-; EG-NEXT:     AND_INT * T39.X, T33.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T41.Y, T33.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 160(2.242078e-43)
-; EG-NEXT:     LSHR T42.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T43.Z, T32.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T41.W, T33.W, literal.z,
-; EG-NEXT:     AND_INT * T41.X, T33.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T43.Y, T32.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 176(2.466285e-43)
-; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T45.Z, T32.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T43.W, T32.X, literal.z,
-; EG-NEXT:     AND_INT * T43.X, T32.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T45.Y, T32.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 192(2.690493e-43)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T46.Z, T32.Z, literal.y, T0.W,
-; EG-NEXT:     LSHR T45.W, T32.Y, literal.z,
-; EG-NEXT:     AND_INT * T45.X, T32.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T46.Y, T32.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 208(2.914701e-43)
-; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T48.Z, T32.W, literal.y, T0.W,
-; EG-NEXT:     LSHR T46.W, T32.Z, literal.z,
-; EG-NEXT:     AND_INT * T46.X, T32.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T48.Y, T32.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 224(3.138909e-43)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T48.W, T32.W, literal.y,
-; EG-NEXT:     AND_INT * T48.X, T32.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T41.Z, T30.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T39.W, T30.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T39.X, T30.Z, literal.x,
+; EG-NEXT:     BFE_UINT T41.Y, T30.W, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T42.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T43.Z, T29.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T41.W, T30.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T41.X, T30.W, literal.x,
+; EG-NEXT:     BFE_UINT T43.Y, T29.X, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T44.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T45.Z, T29.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T43.W, T29.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T43.X, T29.X, literal.x,
+; EG-NEXT:     BFE_UINT T45.Y, T29.Y, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T29.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T46.Z, T29.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T45.W, T29.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T45.X, T29.Y, literal.x,
+; EG-NEXT:     BFE_UINT T46.Y, T29.Z, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T47.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T48.Z, T29.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T46.W, T29.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T46.X, T29.Z, literal.x,
+; EG-NEXT:     BFE_UINT T48.Y, T29.W, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T49.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T48.W, T29.W, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T48.X, T29.W, literal.x,
+; EG-NEXT:     ADD_INT * T50.X, T24.X, literal.y,
+; EG-NEXT:    255(3.573311e-43), 60(8.407791e-44)
 ;
 ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32:
 ; GFX12:       ; %bb.0:
@@ -4889,231 +4830,207 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; EG-LABEL: constant_sextload_v64i8_to_v64i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @32, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @24
-; EG-NEXT:    ALU 40, @33, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @28
-; EG-NEXT:    ALU 76, @74, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 72, @151, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @24
+; EG-NEXT:    ALU 17, @33, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @26
+; EG-NEXT:    ALU 76, @51, KC0[], KC1[]
+; EG-NEXT:    ALU 71, @128, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T35.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T30.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T25.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T23.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T22.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T32.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T31.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T26.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T21.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 24:
-; EG-NEXT:     VTX_READ_128 T20.XYZW, T21.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T19.XYZW, T21.X, 48, #1
-; EG-NEXT:    Fetch clause starting at 28:
-; EG-NEXT:     VTX_READ_128 T31.XYZW, T21.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T21.XYZW, T21.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T19.XYZW, T20.X, 48, #1
+; EG-NEXT:    Fetch clause starting at 26:
+; EG-NEXT:     VTX_READ_128 T29.XYZW, T20.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T30.XYZW, T20.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T20.XYZW, T20.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 32:
-; EG-NEXT:     MOV * T21.X, KC0[2].Z,
+; EG-NEXT:     MOV * T20.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 33:
-; EG-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T24.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T21.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T26.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T27.X, T21.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T28.X, T21.X, literal.x,
 ; EG-NEXT:     LSHR T0.Y, T19.W, literal.y,
-; EG-NEXT:     LSHR T0.Z, T19.Z, literal.z,
-; EG-NEXT:     LSHR * T0.W, T19.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T1.Y, T19.Z, literal.y,
+; EG-NEXT:     LSHR T0.Z, T19.W, literal.z,
+; EG-NEXT:     LSHR T0.W, T19.Z, literal.y,
+; EG-NEXT:     LSHR * T1.W, T19.Z, literal.z,
+; EG-NEXT:    28(3.923636e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 51:
+; EG-NEXT:     ADD_INT T31.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T1.Y, T19.Y, literal.y,
 ; EG-NEXT:     LSHR T1.Z, T19.Y, literal.z,
-; EG-NEXT:     LSHR * T1.W, T19.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    128(1.793662e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T2.Y, T19.Y, literal.y,
-; EG-NEXT:     LSHR T2.Z, T19.Y, literal.z,
-; EG-NEXT:     LSHR T2.W, T19.X, literal.y,
+; EG-NEXT:     LSHR T2.W, T19.X, literal.y, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR * T3.W, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    32(4.484155e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 74:
-; EG-NEXT:     LSHR T3.Y, T20.W, literal.x,
-; EG-NEXT:     LSHR T3.Z, T20.W, literal.y,
-; EG-NEXT:     LSHR T4.W, T20.Z, literal.x,
-; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.z,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:    144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T32.X, PS, literal.x,
-; EG-NEXT:     LSHR T4.Y, T20.Z, literal.y,
-; EG-NEXT:     LSHR T4.Z, T20.Y, literal.z,
-; EG-NEXT:     LSHR T5.W, T20.Y, literal.y,
-; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 160(2.242078e-43)
-; EG-NEXT:     LSHR T33.X, PS, literal.x,
-; EG-NEXT:     LSHR T5.Y, T20.X, literal.y,
-; EG-NEXT:     LSHR T5.Z, T20.X, literal.z,
-; EG-NEXT:     LSHR T6.W, T21.W, literal.y,
-; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 176(2.466285e-43)
-; EG-NEXT:     LSHR T34.X, PS, literal.x,
-; EG-NEXT:     LSHR T6.Y, T21.W, literal.y,
-; EG-NEXT:     LSHR T6.Z, T21.Z, literal.z,
-; EG-NEXT:     LSHR T7.W, T21.Z, literal.y,
-; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 192(2.690493e-43)
-; EG-NEXT:     LSHR T35.X, PS, literal.x,
-; EG-NEXT:     LSHR T7.Y, T21.Y, literal.y,
-; EG-NEXT:     LSHR T7.Z, T21.Y, literal.z,
-; EG-NEXT:     LSHR T8.W, T21.X, literal.y,
-; EG-NEXT:     LSHR * T9.W, T21.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T32.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T2.Y, T20.W, literal.y,
+; EG-NEXT:     LSHR T2.Z, T20.W, literal.z,
+; EG-NEXT:     LSHR T4.W, T20.Z, literal.y,
+; EG-NEXT:     LSHR * T5.W, T20.Z, literal.z,
+; EG-NEXT:    36(5.044674e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T36.X, T31.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T8.Y, T31.W, literal.y,
-; EG-NEXT:     LSHR T8.Z, T31.W, literal.z,
-; EG-NEXT:     LSHR T10.W, T31.Z, literal.y,
-; EG-NEXT:     LSHR * T11.W, T31.X, literal.z,
+; EG-NEXT:     ADD_INT T33.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T3.Y, T20.Y, literal.y,
+; EG-NEXT:     LSHR T3.Z, T20.Y, literal.z,
+; EG-NEXT:     LSHR T6.W, T20.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T7.W, T20.X, literal.z,
+; EG-NEXT:    40(5.605194e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T34.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T4.Y, T30.W, literal.y,
+; EG-NEXT:     LSHR T4.Z, T30.W, literal.z,
+; EG-NEXT:     LSHR T8.W, T30.Z, literal.y,
+; EG-NEXT:     LSHR * T9.W, T30.Z, literal.z,
+; EG-NEXT:    44(6.165713e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T35.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T5.Y, T30.Y, literal.y,
+; EG-NEXT:     LSHR T5.Z, T30.Y, literal.z,
+; EG-NEXT:     LSHR T10.W, T30.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T11.W, T30.X, literal.z,
+; EG-NEXT:    48(6.726233e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T36.X, T29.X, 0.0, literal.x,
+; EG-NEXT:     LSHR T6.Y, T29.W, literal.y,
+; EG-NEXT:     LSHR T6.Z, T29.W, literal.z,
+; EG-NEXT:     LSHR T12.W, T29.Z, literal.y,
+; EG-NEXT:     LSHR * T13.W, T29.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T37.X, T31.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR T9.Y, T31.Z, literal.y,
-; EG-NEXT:     LSHR T9.Z, T31.Y, literal.y,
+; EG-NEXT:     BFE_INT T37.X, T29.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR T7.Y, T29.Z, literal.y,
+; EG-NEXT:     LSHR T7.Z, T29.Y, literal.y,
 ; EG-NEXT:     BFE_INT T36.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T11.W, T31.X, literal.z,
+; EG-NEXT:     LSHR * T13.W, T29.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T38.X, T31.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T10.Y, T31.Y, literal.y,
+; EG-NEXT:     BFE_INT T38.X, T29.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR T8.Y, T29.Y, literal.y,
 ; EG-NEXT:     BFE_INT T36.Z, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T37.W, PV.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T11.W, T31.X, literal.x,
+; EG-NEXT:     LSHR * T13.W, T29.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T39.X, T31.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.X, T29.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T36.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T37.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T38.W, T9.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T11.W, T31.Y, literal.x,
+; EG-NEXT:     BFE_INT T38.W, T7.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T13.W, T29.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T40.X, T21.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.X, T30.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T37.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T38.Z, T10.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T39.W, T8.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T10.W, T31.Z, literal.x,
+; EG-NEXT:     BFE_INT T38.Z, T12.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T12.W, T29.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T31.X, T21.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T29.X, T30.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T38.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T39.Z, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT T40.W, T9.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T9.W, T31.W, literal.x,
+; EG-NEXT:     BFE_INT T39.Z, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T40.W, T11.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T11.W, T29.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T41.X, T21.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.X, T30.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T39.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T40.Z, T8.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT * T31.W, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT * T40.Z, T10.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 151:
-; EG-NEXT:     LSHR * T8.W, T21.X, literal.x,
+; EG-NEXT:    ALU clause starting at 128:
+; EG-NEXT:     BFE_INT T29.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T10.W, T30.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T42.X, T21.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T40.Y, PV.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.Z, T7.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T41.W, T7.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T7.W, T21.Y, literal.x,
+; EG-NEXT:     BFE_INT T42.X, T30.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T29.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.W, T9.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T9.W, T30.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T43.X, T20.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T41.Z, T6.Z, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.W, T6.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T7.W, T21.Z, literal.x,
+; EG-NEXT:     BFE_INT T29.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.Z, T8.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.W, T4.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T8.W, T30.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T21.X, T20.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.X, T20.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T41.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.Z, T6.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.W, T5.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T21.W, literal.x,
+; EG-NEXT:     BFE_INT T42.Z, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T43.W, T7.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.W, T30.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T44.X, T20.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.Z, T5.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.W, T5.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T5.W, T20.X, literal.x,
+; EG-NEXT:     BFE_INT T43.Z, T6.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T6.W, T20.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T45.X, T20.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.Z, T4.Z, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T44.W, T4.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T44.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR * T5.W, T20.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T46.X, T19.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T44.Z, T4.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T45.W, T2.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T4.W, T20.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T20.X, T19.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T44.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.Z, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T45.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; EG-NEXT:     BFE_INT T46.W, T3.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T3.W, T20.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T47.X, T19.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T45.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T46.Z, T2.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T20.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T20.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR * T2.W, T19.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T48.X, T19.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T20.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T20.Z, T1.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T47.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 208(2.914701e-43)
-; EG-NEXT:     LSHR T19.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T20.Y, T1.Z, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T47.Z, T1.Y, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T48.W, T0.W, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T49.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T47.Y, T0.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T1.W, T19.Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T19.X, T21.X, literal.x,
+; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T47.Z, T0.W, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T48.W, T0.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T19.Z, literal.y,
+; EG-NEXT:    52(7.286752e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T49.X, T21.X, literal.x,
+; EG-NEXT:     BFE_INT T47.Y, PS, 0.0, literal.y,
 ; EG-NEXT:     BFE_INT T48.Z, T0.Y, 0.0, literal.y,
 ; EG-NEXT:     LSHR T0.W, T19.W, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T50.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T48.Y, PV.W, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T50.X, T21.X, literal.z,
+; EG-NEXT:    56(7.847271e-44), 8(1.121039e-44)
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T48.Y, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32:
 ; GFX12:       ; %bb.0:
@@ -5965,7 +5882,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 1
 ; EG-NEXT:    CF_END
@@ -5987,11 +5904,10 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV T5.W, 0.0,
 ; EG-NEXT:     MOV * T4.W, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T7.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX12:       ; %bb.0:
@@ -6117,33 +6033,33 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
 ; EG-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T4.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T4.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT:     ASHR T4.W, T4.X, literal.y,
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.z,
+; EG-NEXT:     ASHR * T5.W, T4.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T6.X, T4.X, 0.0, literal.x,
+; EG-NEXT:     ASHR T5.Z, T4.X, literal.y,
+; EG-NEXT:     LSHR * T0.W, T4.X, literal.z,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T5.X, PV.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.Y, PV.X, literal.y,
+; EG-NEXT:     LSHR T0.W, T4.X, literal.x,
+; EG-NEXT:     LSHR * T4.X, KC0[2].Y, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ASHR T5.Y, PV.X, literal.x,
-; EG-NEXT:     ASHR T4.Z, T4.X, literal.y,
-; EG-NEXT:     LSHR T0.W, T4.X, literal.z,
-; EG-NEXT:     LSHR * T1.W, T4.X, literal.w,
-; EG-NEXT:    31(4.344025e-44), 24(3.363116e-44)
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T4.X, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T5.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T4.Y, PV.X, literal.y,
-; EG-NEXT:     ASHR * T5.W, PV.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT * T6.Z, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T7.X, T4.X, literal.x,
+; EG-NEXT:     ASHR T5.Y, T5.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ASHR * T6.W, PV.Z, literal.y,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
 ; GFX12:       ; %bb.0:
@@ -6304,7 +6220,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 34, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 30, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
@@ -6339,17 +6255,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
 ; EG-NEXT:     MOV T8.W, 0.0,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T9.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T10.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T11.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T12.X, T9.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX12:       ; %bb.0:
@@ -6564,33 +6476,29 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 39, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 34, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T9.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T6.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T5.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T7.Y, PV.X, literal.y,
+; EG-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT:     ADD_INT T8.X, PS, literal.x,
+; EG-NEXT:     ASHR T6.Y, PV.X, literal.y,
 ; EG-NEXT:     LSHR T0.W, T5.Y, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T9.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T7.Z, PV.W, 0.0, literal.y,
-; EG-NEXT:     ASHR * T10.W, T5.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T9.X, PS, literal.z,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T6.Z, PV.W, 0.0, literal.x,
+; EG-NEXT:     ASHR * T10.W, T5.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T11.X, T5.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T10.Z, T5.X, literal.y,
 ; EG-NEXT:     LSHR T0.W, T5.X, literal.z,
@@ -6607,14 +6515,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_INT T5.X, PS, 0.0, literal.x,
 ; EG-NEXT:     ASHR T10.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T11.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T12.X, T7.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT:     ASHR T11.W, PV.Z, literal.y,
-; EG-NEXT:     ASHR * T7.W, T7.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T5.Y, PV.X, literal.x,
+; EG-NEXT:     ASHR T11.W, PV.Z, literal.x,
+; EG-NEXT:     ASHR * T6.W, T6.Z, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX12:       ; %bb.0:
@@ -6888,7 +6795,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 68, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 58, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
@@ -6949,29 +6856,19 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV * T13.W, 0.0,
 ; EG-NEXT:     MOV T14.W, 0.0,
 ; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     LSHR T15.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T22.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T15.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T18.X, T15.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T15.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T20.X, T15.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T15.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT * T22.X, T15.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX12:       ; %bb.0:
@@ -7355,7 +7252,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 78, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 67, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
@@ -7370,39 +7267,30 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T9.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T10.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T11.X, T8.X, literal.x,
+; EG-NEXT:     ADD_INT * T12.X, T8.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT * T13.X, T7.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T14.X, T7.Y, 0.0, literal.x,
 ; EG-NEXT:     ASHR T13.Y, PV.X, literal.y,
 ; EG-NEXT:     LSHR T0.W, T7.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T15.X, T8.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T15.X, PS, literal.x,
-; EG-NEXT:     ASHR T14.Y, PV.X, literal.y,
-; EG-NEXT:     BFE_INT T13.Z, PV.W, 0.0, literal.z,
-; EG-NEXT:     LSHR T0.W, T7.Y, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:    8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T16.X, PS, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T14.Y, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T13.Z, PV.W, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T7.Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T8.X, literal.x,
 ; EG-NEXT:     BFE_INT T14.Z, PV.W, 0.0, literal.y,
-; EG-NEXT:     ASHR * T17.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ASHR * T17.W, T7.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:    24(3.363116e-44), 8(1.121039e-44)
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T18.X, T7.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T17.Z, T7.X, literal.y,
@@ -7420,22 +7308,20 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     BFE_INT T19.X, PS, 0.0, literal.x,
 ; EG-NEXT:     ASHR T17.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T18.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.z,
 ; EG-NEXT:     ASHR * T20.W, T7.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T7.X, T7.Z, 0.0, literal.x,
 ; EG-NEXT:     ASHR T19.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T20.Z, T7.Z, literal.z,
-; EG-NEXT:     LSHR T1.W, T7.Z, literal.w,
+; EG-NEXT:     LSHR T0.W, T7.Z, literal.w,
 ; EG-NEXT:     ASHR * T21.W, T7.W, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T20.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T7.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T21.Z, T7.W, literal.z,
-; EG-NEXT:     LSHR T1.W, T7.Z, literal.x,
-; EG-NEXT:     LSHR * T2.W, T7.W, literal.w,
+; EG-NEXT:     LSHR T0.W, T7.Z, literal.x,
+; EG-NEXT:     LSHR * T1.W, T7.W, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T21.X, PS, 0.0, literal.x,
@@ -7444,11 +7330,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     ASHR T18.W, T18.Z, literal.y,
 ; EG-NEXT:     ASHR * T14.W, T14.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHR T22.X, T0.W, literal.x,
+; EG-NEXT:     ADD_INT T22.X, T8.X, literal.x,
 ; EG-NEXT:     ASHR T21.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T7.W, PV.Z, literal.y,
 ; EG-NEXT:     ASHR * T13.W, T13.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    28(3.923636e-44), 31(4.344025e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX12:       ; %bb.0:
@@ -7944,9 +7830,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-LABEL: constant_zextload_v32i8_to_v32i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @22
-; EG-NEXT:    ALU 103, @27, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 33, @131, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @22
+; EG-NEXT:    ALU 12, @27, KC0[], KC1[]
+; EG-NEXT:    TEX 0 @24
+; EG-NEXT:    ALU 102, @40, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T41.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T40.X, 0
@@ -7954,7 +7841,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T38.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T37.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T35.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T35.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T34.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T33.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
@@ -7962,88 +7849,90 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T30.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T29.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T27.X, 1
 ; EG-NEXT:    CF_END
-; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 22:
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT:    Fetch clause starting at 24:
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 26:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 27:
 ; EG-NEXT:     MOV * T0.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T13.X, T11.W, literal.x, PV.W,
-; EG-NEXT:     LSHR * T13.Z, T11.W, literal.y,
+; EG-NEXT:     BFE_UINT T13.X, T12.W, literal.x, PV.W,
+; EG-NEXT:     LSHR * T13.Z, T12.W, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T13.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T14.Z, T11.W, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T14.Z, T12.W, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T14.X, T11.W, literal.x,
+; EG-NEXT:     AND_INT T14.X, T12.W, literal.x,
 ; EG-NEXT:     MOV * T14.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T15.X, T11.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T15.Z, T11.Z, literal.y,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T15.X, T12.Z, literal.x, T0.W,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 40:
+; EG-NEXT:     LSHR * T15.Z, T12.Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T15.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T16.Z, T11.Z, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T16.Z, T12.Z, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T16.X, T11.Z, literal.x,
+; EG-NEXT:     AND_INT T16.X, T12.Z, literal.x,
 ; EG-NEXT:     MOV * T16.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T17.X, T11.Y, literal.x, T0.W,
-; EG-NEXT:     LSHR * T17.Z, T11.Y, literal.y,
+; EG-NEXT:     BFE_UINT T17.X, T12.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T17.Z, T12.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T17.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T18.Z, T11.Y, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T18.Z, T12.Y, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T18.X, T11.Y, literal.x,
+; EG-NEXT:     AND_INT T18.X, T12.Y, literal.x,
 ; EG-NEXT:     MOV * T18.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T19.X, T11.X, literal.x, T0.W,
-; EG-NEXT:     LSHR * T19.Z, T11.X, literal.y,
+; EG-NEXT:     BFE_UINT T19.X, T12.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T19.Z, T12.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T19.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T11.X, T11.X, literal.x,
-; EG-NEXT:     MOV * T11.Y, 0.0,
+; EG-NEXT:     AND_INT T12.X, T12.X, literal.x,
+; EG-NEXT:     MOV * T12.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T20.X, T12.W, literal.x, T0.W,
-; EG-NEXT:     LSHR * T20.Z, T12.W, literal.y,
+; EG-NEXT:     BFE_UINT T20.X, T11.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T20.Z, T11.W, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T20.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T21.Z, T12.W, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T21.Z, T11.W, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T21.X, T12.W, literal.x,
+; EG-NEXT:     AND_INT T21.X, T11.W, literal.x,
 ; EG-NEXT:     MOV * T21.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T22.X, T12.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T22.Z, T12.Z, literal.y,
+; EG-NEXT:     BFE_UINT T22.X, T11.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T22.Z, T11.Z, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T22.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T23.Z, T12.Z, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T23.Z, T11.Z, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T23.X, T12.Z, literal.x,
+; EG-NEXT:     AND_INT T23.X, T11.Z, literal.x,
 ; EG-NEXT:     MOV * T23.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T24.X, T12.Y, literal.x, T0.W,
-; EG-NEXT:     LSHR * T24.Z, T12.Y, literal.y,
+; EG-NEXT:     BFE_UINT T24.X, T11.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T24.Z, T11.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T24.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T25.Z, T12.Y, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T25.Z, T11.Y, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T25.X, T12.Y, literal.x,
+; EG-NEXT:     AND_INT T25.X, T11.Y, literal.x,
 ; EG-NEXT:     MOV * T25.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T26.X, T12.X, literal.x, T0.W,
-; EG-NEXT:     LSHR * T26.Z, T12.X, literal.y,
+; EG-NEXT:     BFE_UINT T26.X, T11.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T26.Z, T11.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T26.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T12.X, T12.X, literal.x,
-; EG-NEXT:     MOV T12.Y, 0.0,
+; EG-NEXT:     AND_INT T11.X, T11.X, literal.x,
+; EG-NEXT:     MOV T11.Y, 0.0,
 ; EG-NEXT:     MOV T13.W, 0.0,
 ; EG-NEXT:     MOV * T14.W, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
@@ -8052,7 +7941,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV T17.W, 0.0,
 ; EG-NEXT:     MOV * T18.W, 0.0,
 ; EG-NEXT:     MOV T19.W, 0.0,
-; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     MOV * T12.W, 0.0,
 ; EG-NEXT:     MOV T20.W, 0.0,
 ; EG-NEXT:     MOV * T21.W, 0.0,
 ; EG-NEXT:     MOV T22.W, 0.0,
@@ -8060,56 +7949,32 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV T24.W, 0.0,
 ; EG-NEXT:     MOV * T25.W, 0.0,
 ; EG-NEXT:     MOV T26.W, 0.0,
-; EG-NEXT:     MOV * T12.W, 0.0,
-; EG-NEXT:     LSHR T27.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR * T31.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 131:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR * T42.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     LSHR * T27.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T28.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T29.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T30.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T31.X, T27.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T32.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T33.X, T27.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T34.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T35.X, T27.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T36.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T37.X, T27.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T38.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T39.X, T27.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T40.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T41.X, T27.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT * T42.X, T27.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
 ;
 ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX12:       ; %bb.0:
@@ -8837,8 +8702,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @22
-; EG-NEXT:    ALU 84, @27, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 71, @112, KC0[], KC1[]
+; EG-NEXT:    ALU 83, @27, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 50, @111, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T31.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T30.X, 0
@@ -8863,44 +8728,26 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:    ALU clause starting at 26:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 27:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR * T25.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, T13.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T18.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T13.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T20.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T13.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T22.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, T13.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T24.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T13.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
 ; EG-NEXT:     BFE_INT * T26.X, T11.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T27.X, T11.Y, 0.0, literal.x,
@@ -8916,78 +8763,74 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:     ASHR T28.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T27.Z, PV.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T30.X, T13.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T30.X, PS, literal.x,
-; EG-NEXT:     ASHR T29.Y, PV.X, literal.y,
-; EG-NEXT:     BFE_INT T28.Z, PV.W, 0.0, literal.z,
-; EG-NEXT:     LSHR T0.W, T12.W, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:    8(1.121039e-44), 224(3.138909e-43)
-; EG-NEXT:     LSHR T31.X, PS, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T29.Y, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T28.Z, PV.W, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T12.W, literal.y,
+; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T31.X, T13.X, literal.x,
 ; EG-NEXT:     BFE_INT T29.Z, PV.W, 0.0, literal.y,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:     ASHR * T32.W, T12.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    240(3.363116e-43), 31(4.344025e-44)
+; EG-NEXT:     ASHR * T32.W, T12.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:    56(7.847271e-44), 8(1.121039e-44)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T33.X, T12.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.Y, T11.Z, literal.x, BS:VEC_120/SCL_212
 ; EG-NEXT:     ASHR T32.Z, T12.X, literal.y,
-; EG-NEXT:     LSHR T1.W, T12.X, literal.z,
+; EG-NEXT:     LSHR T0.W, T12.X, literal.z,
 ; EG-NEXT:     ASHR * T34.W, T12.Y, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T32.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T33.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T34.Z, T12.Y, literal.z,
-; EG-NEXT:     LSHR T1.W, T12.Z, literal.x,
-; EG-NEXT:     LSHR * T2.W, T12.Y, literal.w,
+; EG-NEXT:     LSHR T0.W, T12.Z, literal.x,
+; EG-NEXT:     LSHR * T1.W, T12.Y, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT * T34.X, PS, 0.0, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 112:
-; EG-NEXT:     ASHR T32.Y, T32.X, literal.x,
-; EG-NEXT:     BFE_INT T33.Z, T1.W, 0.0, literal.y,
-; EG-NEXT:     LSHR T1.W, T11.W, literal.z, BS:VEC_120/SCL_212
-; EG-NEXT:     ASHR * T35.W, T12.Z, literal.x,
-; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
+; EG-NEXT:     BFE_INT T34.X, PS, 0.0, literal.x,
+; EG-NEXT:     ASHR T32.Y, PV.X, literal.y,
+; EG-NEXT:     BFE_INT T33.Z, PV.W, 0.0, literal.x,
+; EG-NEXT:     LSHR T0.W, T11.W, literal.z,
+; EG-NEXT:     ASHR * T35.W, T12.Z, literal.y,
+; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T36.X, T12.X, 0.0, literal.x,
-; EG-NEXT:     ASHR T34.Y, T34.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ASHR T34.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T35.Z, T12.Z, literal.z,
-; EG-NEXT:     LSHR T2.W, T12.Z, literal.w,
+; EG-NEXT:     LSHR T1.W, T12.Z, literal.w,
 ; EG-NEXT:     ASHR * T37.W, T12.W, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T35.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T36.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T37.Z, T12.W, literal.z,
-; EG-NEXT:     LSHR T2.W, T12.X, literal.x,
-; EG-NEXT:     LSHR * T3.W, T12.W, literal.w,
+; EG-NEXT:     LSHR T1.W, T12.X, literal.x,
+; EG-NEXT:     LSHR * T2.W, T12.W, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T37.X, PS, 0.0, literal.x,
-; EG-NEXT:     ASHR T35.Y, PV.X, literal.y,
-; EG-NEXT:     BFE_INT T36.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     LSHR T2.W, T11.Z, literal.z,
-; EG-NEXT:     ASHR * T12.W, T11.X, literal.y,
-; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT * T37.X, PS, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 111:
+; EG-NEXT:     ASHR T35.Y, T35.X, literal.x,
+; EG-NEXT:     BFE_INT T36.Z, T1.W, 0.0, literal.y,
+; EG-NEXT:     LSHR T1.W, T11.Z, literal.z,
+; EG-NEXT:     ASHR * T12.W, T11.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T38.X, T12.Y, 0.0, literal.x,
-; EG-NEXT:     ASHR T37.Y, PV.X, literal.y,
-; EG-NEXT:     ASHR T12.Z, T11.X, literal.z,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.w,
+; EG-NEXT:     ASHR T37.Y, T37.X, literal.y,
+; EG-NEXT:     ASHR T12.Z, T11.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T2.W, T11.X, literal.w, BS:VEC_120/SCL_212
 ; EG-NEXT:     ASHR * T39.W, T11.Y, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T12.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T38.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T39.Z, T11.Y, literal.z,
-; EG-NEXT:     LSHR T3.W, T12.Y, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T4.W, T11.Y, literal.w,
+; EG-NEXT:     LSHR T2.W, T12.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T11.Y, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T39.X, PS, 0.0, literal.x,
@@ -9003,24 +8846,24 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; EG-NEXT:     ASHR * T41.W, T11.W, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T40.X, T2.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.X, T1.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T11.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T41.Z, T11.W, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     ASHR T33.W, T33.Z, literal.y,
 ; EG-NEXT:     ASHR * T29.W, T29.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T41.X, T1.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.X, T0.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T40.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T11.Z, T0.Y, 0.0, literal.x,
 ; EG-NEXT:     ASHR T28.W, T28.Z, literal.y,
 ; EG-NEXT:     ASHR * T27.W, T27.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHR T42.X, T0.W, literal.x,
+; EG-NEXT:     ADD_INT T42.X, T13.X, literal.x,
 ; EG-NEXT:     ASHR T41.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T11.W, PV.Z, literal.y,
 ; EG-NEXT:     ASHR * T26.W, T26.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    60(8.407791e-44), 31(4.344025e-44)
 ;
 ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX12:       ; %bb.0:
@@ -10712,9 +10555,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
 ; EG-NEXT:    ALU 103, @12, KC0[], KC1[]
-; EG-NEXT:    ALU 20, @116, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT:    ALU 19, @116, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
@@ -10833,17 +10676,16 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; EG-NEXT:     OR_INT * T0.W, PV.W, T0.W,
 ; EG-NEXT:     MOV * T5.X, PV.W,
 ; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     LSHR T0.W, T19.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T21.X, PS, literal.x,
+; EG-NEXT:     LSHR * T0.W, T19.W, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     AND_INT T1.W, PV.Y, literal.y,
 ; EG-NEXT:     AND_INT * T0.W, PV.W, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; EG-NEXT:    16711680(2.341805e-38), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
 ; EG-NEXT:     OR_INT * T19.W, PV.W, PS,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T5.X, PV.W,
 ; EG-NEXT:     MOV * T20.X, T16.X,
 ; EG-NEXT:     MOV * T20.Z, T12.X,
@@ -11102,9 +10944,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
 ; EG-NEXT:    ALU 104, @12, KC0[], KC1[]
-; EG-NEXT:    ALU 46, @117, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT:    ALU 45, @117, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
@@ -11250,17 +11092,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
 ; EG-NEXT:     MOV * T5.X, PV.W,
 ; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     ASHR T0.W, T19.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T21.X, PS, literal.x,
+; EG-NEXT:     ASHR * T0.W, T19.W, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     AND_INT T1.W, PV.Y, literal.y,
 ; EG-NEXT:     LSHL * T0.W, PV.W, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
 ; EG-NEXT:     OR_INT * T19.W, PV.W, PS,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T5.X, PV.W,
 ; EG-NEXT:     MOV * T20.X, T16.X,
 ; EG-NEXT:     MOV * T20.Z, T12.X,
@@ -11639,11 +11480,11 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; EG-NEXT:    TEX 1 @10
 ; EG-NEXT:    ALU 103, @16, KC0[], KC1[]
 ; EG-NEXT:    ALU 104, @120, KC0[], KC1[]
-; EG-NEXT:    ALU 41, @225, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 37, @225, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T40.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 10:
 ; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 16, #1
@@ -11878,24 +11719,20 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; EG-NEXT:    -65536(nan), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T0.W, PV.W, T0.W,
 ; EG-NEXT:     MOV * T21.X, PV.W,
-; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.W, T35.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T41.X, PS, literal.x,
-; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT:     AND_INT T0.W, PV.W, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT:    16711680(2.341805e-38), 32(4.484155e-44)
-; EG-NEXT:     LSHR T42.X, PS, literal.x,
-; EG-NEXT:     OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT:     MOV * T0.Y, PV.X,
+; EG-NEXT:     LSHR * T39.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T40.X, PV.X, literal.x,
+; EG-NEXT:     LSHR T0.W, T35.W, literal.y,
+; EG-NEXT:     ADD_INT * T41.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT:     ADD_INT T42.X, T39.X, literal.x,
+; EG-NEXT:     OR_INT * T35.W, PV.W, PS,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T21.X, PV.W,
 ; EG-NEXT:     MOV * T36.X, T16.X,
 ; EG-NEXT:     MOV * T36.Z, T12.X,
@@ -12367,11 +12204,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; EG-NEXT:    TEX 1 @10
 ; EG-NEXT:    ALU 104, @16, KC0[], KC1[]
 ; EG-NEXT:    ALU 104, @121, KC0[], KC1[]
-; EG-NEXT:    ALU 95, @226, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 91, @226, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T40.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 10:
 ; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 16, #1
@@ -12661,24 +12498,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
 ; EG-NEXT:     MOV * T21.X, PV.W,
-; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ASHR T0.W, T35.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    24(3.363116e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T41.X, PS, literal.x,
-; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT:     LSHL T0.W, PV.W, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T42.X, PS, literal.x,
-; EG-NEXT:     OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT:     MOV * T0.Y, PV.X,
+; EG-NEXT:     LSHR * T39.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T40.X, PV.X, literal.x,
+; EG-NEXT:     ASHR T0.W, T35.W, literal.y,
+; EG-NEXT:     ADD_INT * T41.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 24(3.363116e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T42.X, T39.X, literal.x,
+; EG-NEXT:     OR_INT * T35.W, PV.W, PS,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T21.X, PV.W,
 ; EG-NEXT:     MOV * T36.X, T16.X,
 ; EG-NEXT:     MOV * T36.Z, T12.X,
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 0be898359d028..33defc2ecf6a9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -586,9 +586,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
@@ -596,17 +596,16 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_load_v16i16:
 ; CM:       ; %bb.0: ; %entry
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @6
-; CM-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
 ; CM-NEXT:    CF_END
@@ -616,11 +615,10 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; CM-NEXT:    ALU clause starting at 10:
 ; CM-NEXT:     MOV * T0.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 11:
-; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 entry:
   %ld = load <16 x i16>, ptr addrspace(1) %in
   store <16 x i16> %ld, ptr addrspace(1) %out
@@ -770,7 +768,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
@@ -780,19 +778,18 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Y,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T2.X, KC0[2].Z, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Z, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_load_v16i16_align2:
 ; CM:       ; %bb.0: ; %entry
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @6
-; CM-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
@@ -800,12 +797,10 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; CM-NEXT:    ALU clause starting at 10:
 ; CM-NEXT:     MOV * T0.X, KC0[2].Y,
 ; CM-NEXT:    ALU clause starting at 11:
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T2.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T3.X, KC0[2].Z, literal.x,
+; CM-NEXT:     LSHR * T2.X, KC0[2].Z, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 entry:
   %ld =  load <16 x i16>, ptr addrspace(1) %in, align 2
   store <16 x i16> %ld, ptr addrspace(1) %out, align 32
@@ -1437,9 +1432,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ;
 ; EG-LABEL: global_zextload_v3i16_to_v3i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 4, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @6
-; EG-NEXT:    ALU 2, @17, KC0[], KC1[]
+; EG-NEXT:    ALU 2, @15, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XY, T0.X, 1
 ; EG-NEXT:    CF_END
@@ -1451,34 +1446,30 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     MOV * T1.X, KC0[2].Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 17:
-; EG-NEXT:     LSHR T4.X, T0.W, literal.x,
-; EG-NEXT:     MOV * T3.Y, T1.X,
+; EG-NEXT:    ALU clause starting at 15:
+; EG-NEXT:     ADD_INT T4.X, T0.X, literal.x,
+; EG-NEXT:     MOV * T3.Y, T1.X, BS:VEC_120/SCL_212
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v3i16_to_v3i32:
 ; CM:       ; %bb.0: ; %entry
-; CM-NEXT:    ALU 4, @12, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 2 @6
-; CM-NEXT:    ALU 2, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T4.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
+; CM-NEXT:    ALU 2, @15, KC0[], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T0.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_16 T2.X, T1.X, 4, #1
 ; CM-NEXT:     VTX_READ_16 T3.X, T1.X, 0, #1
 ; CM-NEXT:     VTX_READ_16 T1.X, T1.X, 2, #1
 ; CM-NEXT:    ALU clause starting at 12:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T1.X, KC0[2].Z,
-; CM-NEXT:    ALU clause starting at 17:
-; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
-; CM-NEXT:     MOV * T3.Y, T1.X,
+; CM-NEXT:    ALU clause starting at 15:
+; CM-NEXT:     ADD_INT T4.X, T0.X, literal.x,
+; CM-NEXT:     MOV * T3.Y, T1.X, BS:VEC_120/SCL_212
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %ld = load <3 x i16>, ptr addrspace(1) %in
@@ -1552,9 +1543,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @6
-; EG-NEXT:    ALU 9, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    ALU 8, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
@@ -1565,22 +1556,21 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; EG-NEXT:    ALU clause starting at 13:
 ; EG-NEXT:     BFE_INT * T0.Y, T1.X, 0.0, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T1.X, T2.X, 0.0, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT:     BFE_INT T2.X, T2.X, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v3i16_to_v3i32:
 ; CM:       ; %bb.0: ; %entry
 ; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 2 @6
-; CM-NEXT:    ALU 9, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T3.X
+; CM-NEXT:    ALU 8, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 4, #1
@@ -1589,16 +1579,15 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; CM-NEXT:    ALU clause starting at 12:
 ; CM-NEXT:     MOV * T0.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 13:
-; CM-NEXT:     BFE_INT T1.X, T1.X, 0.0, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; CM-NEXT:     LSHR T3.X, PV.W, literal.x,
+; CM-NEXT:     BFE_INT * T1.X, T1.X, 0.0, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T4.X, PV.X, literal.x,
 ; CM-NEXT:     BFE_INT * T0.Y, T0.X, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; CM-NEXT:     BFE_INT * T0.X, T2.X, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %ld = load <3 x i16>, ptr addrspace(1) %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -1929,7 +1918,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -1940,30 +1929,28 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     LSHR * T8.W, T7.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T8.Z, T7.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T8.Z, T7.Y, literal.x,
+; EG-NEXT:     LSHR * T9.W, T7.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T8.Y, T7.X, literal.x,
-; EG-NEXT:     LSHR * T9.W, T7.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T9.Z, T7.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
 ; EG-NEXT:     AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT:     AND_INT T9.Z, T7.W, literal.x,
-; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR * T9.Y, T7.Z, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T9.X, T7.Z, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T9.Y, T7.Z, literal.y,
+; EG-NEXT:     AND_INT * T9.X, T7.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T10.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T10.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v8i16_to_v8i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T10.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T9.X
+; CM-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T9.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
@@ -1974,20 +1961,19 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT * T8.Z, T7.W, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR T8.Y, T7.Z, literal.x,
-; CM-NEXT:     LSHR * T7.W, T7.Y, literal.x,
+; CM-NEXT:     LSHR * T8.Y, T7.Z, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T8.X, T7.Z, literal.x,
-; CM-NEXT:     AND_INT T7.Z, T7.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT:     LSHR * T7.W, T7.Y, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
+; CM-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
+; CM-NEXT:     AND_INT * T7.Z, T7.Y, literal.y,
+; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; CM-NEXT:     ADD_INT T10.X, PV.X, literal.x,
 ; CM-NEXT:     LSHR * T7.Y, T7.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; CM-NEXT:     AND_INT * T7.X, T7.X, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T10.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(1) %out
@@ -2081,7 +2067,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 19, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -2103,21 +2089,20 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     BFE_INT T8.Y, PS, 0.0, literal.y,
-; EG-NEXT:     LSHR T1.Z, T7.Z, literal.y,
 ; EG-NEXT:     BFE_INT T9.W, PV.Z, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T10.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T7.Z, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T10.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT * T9.Y, PS, 0.0, literal.y,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ;
 ; CM-LABEL: global_sextload_v8i16_to_v8i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 19, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
+; CM-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T10.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T7.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
@@ -2127,23 +2112,22 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; CM-NEXT:     BFE_INT * T8.Z, T7.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T8.X, T7.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T0.Y, T7.Y, literal.x,
 ; CM-NEXT:     BFE_INT T9.Z, T7.Y, 0.0, literal.x,
 ; CM-NEXT:     LSHR * T0.W, T7.W, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T9.X, T7.X, 0.0, literal.x,
-; CM-NEXT:     LSHR T1.Y, T7.Z, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR T0.Y, T7.Y, literal.x,
+; CM-NEXT:     LSHR T0.Z, T7.Z, literal.x,
 ; CM-NEXT:     BFE_INT * T8.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T10.X, PV.Z, literal.x,
-; CM-NEXT:     BFE_INT T8.Y, PV.Y, 0.0, literal.y,
+; CM-NEXT:     LSHR T10.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_INT T8.Y, PV.Z, 0.0, literal.y,
 ; CM-NEXT:     LSHR T0.Z, T7.X, literal.y,
-; CM-NEXT:     BFE_INT * T9.W, T0.Y, 0.0, literal.y,
+; CM-NEXT:     BFE_INT * T9.W, PV.Y, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T7.X, PV.X, literal.x,
 ; CM-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    4(5.605194e-45), 16(2.242078e-44)
   %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(1) %out
@@ -2284,11 +2268,11 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 35, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 31, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T11.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T12.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T15.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
@@ -2298,50 +2282,46 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 13:
 ; EG-NEXT:     LSHR * T13.W, T12.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T13.Z, T12.Y, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T13.Z, T12.Y, literal.x,
+; EG-NEXT:     LSHR * T14.W, T12.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T13.Y, T12.X, literal.x,
-; EG-NEXT:     LSHR * T14.W, T12.W, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T14.Z, T12.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
 ; EG-NEXT:     AND_INT T13.X, T12.X, literal.x,
-; EG-NEXT:     AND_INT T14.Z, T12.W, literal.x,
-; EG-NEXT:     LSHR * T12.X, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR T14.Y, T12.Z, literal.x,
-; EG-NEXT:     LSHR * T15.W, T11.Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T14.X, T12.Z, literal.x,
-; EG-NEXT:     AND_INT T15.Z, T11.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR T14.Y, T12.Z, literal.y,
+; EG-NEXT:     AND_INT * T14.X, T12.Z, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T15.Y, T11.X, literal.y,
+; EG-NEXT:     LSHR * T12.W, T11.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T15.X, KC0[2].Y, literal.x,
+; EG-NEXT:     AND_INT * T12.Z, T11.Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
+; EG-NEXT:     LSHR T12.Y, T11.X, literal.y,
 ; EG-NEXT:     LSHR T17.W, T11.W, literal.y,
-; EG-NEXT:     AND_INT * T15.X, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     AND_INT * T12.X, T11.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T17.Z, T11.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     AND_INT * T17.Z, T11.W, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T11.X, T15.X, literal.x,
 ; EG-NEXT:     LSHR T17.Y, T11.Z, literal.y,
 ; EG-NEXT:     AND_INT * T17.X, T11.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T18.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T18.X, T15.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v16i16_to_v16i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @8
-; CM-NEXT:    ALU 33, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T17.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T16.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T14.X
+; CM-NEXT:    ALU 30, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T18.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T17.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T15.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
@@ -2353,36 +2333,33 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT * T13.Z, T12.W, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR T13.Y, T12.Z, literal.x,
-; CM-NEXT:     LSHR * T12.W, T12.Y, literal.x,
+; CM-NEXT:     LSHR * T13.Y, T12.Z, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T13.X, T12.Z, literal.x,
-; CM-NEXT:     AND_INT T12.Z, T12.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T12.W, T12.Y, literal.y,
+; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
+; CM-NEXT:     AND_INT * T12.Z, T12.Y, literal.y,
+; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; CM-NEXT:     ADD_INT T15.X, PV.X, literal.x,
 ; CM-NEXT:     LSHR T12.Y, T12.X, literal.y,
-; CM-NEXT:     LSHR * T15.W, T11.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     LSHR * T16.W, T11.W, literal.y,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; CM-NEXT:     AND_INT T12.X, T12.X, literal.x,
-; CM-NEXT:     AND_INT T15.Z, T11.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T15.Y, T11.Z, literal.y,
+; CM-NEXT:     AND_INT * T16.Z, T11.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T17.X, T14.X, literal.x,
+; CM-NEXT:     LSHR T16.Y, T11.Z, literal.y,
 ; CM-NEXT:     LSHR * T11.W, T11.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T15.X, T11.Z, literal.x,
-; CM-NEXT:     AND_INT T11.Z, T11.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
-; CM-NEXT:     LSHR * T11.Y, T11.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T16.X, T11.Z, literal.x,
+; CM-NEXT:     AND_INT * T11.Z, T11.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T18.X, T14.X, literal.x,
+; CM-NEXT:     LSHR * T11.Y, T11.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; CM-NEXT:     AND_INT * T11.X, T11.X, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T18.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2523,7 +2500,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 39, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 35, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
@@ -2535,20 +2512,18 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 12:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 13:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T15.X, T11.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Y, T12.W, literal.x,
-; EG-NEXT:     BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T16.Z, T11.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.W, T12.Y, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T11.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T16.X, T11.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T1.Y, T11.W, literal.x,
+; EG-NEXT:     LSHR T0.Y, T11.W, literal.x,
 ; EG-NEXT:     BFE_INT T17.Z, T12.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T15.W, PS, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T1.W, T11.X, literal.x,
@@ -2561,30 +2536,28 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T18.X, T12.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T16.Y, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Z, T12.X, literal.x,
-; EG-NEXT:     BFE_INT T17.W, T0.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T17.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.Z, T12.Z, literal.y,
-; EG-NEXT:     BFE_INT T18.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T0.Z, T12.W, literal.x,
+; EG-NEXT:     BFE_INT T17.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T12.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T11.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT T17.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T18.W, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T12.Z, literal.y,
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T12.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT * T18.Y, PS, 0.0, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ;
 ; CM-LABEL: global_sextload_v16i16_to_v16i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @8
-; CM-NEXT:    ALU 40, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T11.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T13.X
+; CM-NEXT:    ALU 35, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T11.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T18.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T14.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
@@ -2592,47 +2565,42 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; CM-NEXT:    ALU clause starting at 12:
 ; CM-NEXT:     MOV * T11.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 13:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T13.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Y, T11.Y, literal.y,
-; CM-NEXT:     LSHR T0.Z, T11.Z, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T0.W, T11.Y, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T1.Y, T11.W, literal.y,
+; CM-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; CM-NEXT:     LSHR T0.Y, T11.W, literal.y,
 ; CM-NEXT:     BFE_INT T15.Z, T12.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:     LSHR * T0.W, T12.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     LSHR * T1.W, T12.X, literal.y,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; CM-NEXT:     BFE_INT T15.X, T12.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T2.Y, T12.Y, literal.x,
+; CM-NEXT:     LSHR T1.Y, T12.Y, literal.x,
 ; CM-NEXT:     BFE_INT T16.Z, T12.Y, 0.0, literal.x,
-; CM-NEXT:     LSHR * T1.W, T12.W, literal.x,
+; CM-NEXT:     LSHR * T2.W, T12.W, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T16.X, T12.X, 0.0, literal.x,
-; CM-NEXT:     LSHR T3.Y, T12.Z, literal.x,
+; CM-NEXT:     LSHR T2.Y, T12.Z, literal.x,
 ; CM-NEXT:     BFE_INT T12.Z, T11.W, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T15.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T12.X, T11.Z, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T15.Y, PV.Y, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T17.Z, T11.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T16.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T16.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T17.X, T11.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T16.Y, T0.W, 0.0, literal.x,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT * T12.W, T1.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T16.Y, T1.W, 0.0, literal.x,
+; CM-NEXT:     LSHR T0.Z, T11.Z, literal.x,
+; CM-NEXT:     BFE_INT * T12.W, T0.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T18.X, PV.Z, literal.x,
-; CM-NEXT:     BFE_INT T12.Y, T0.Z, 0.0, literal.y,
-; CM-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; CM-NEXT:     BFE_INT * T17.W, T0.Y, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T18.X, T13.X, literal.x,
+; CM-NEXT:     BFE_INT T12.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT:     LSHR T0.Z, T11.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T17.W, T0.W, 0.0, literal.y,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T11.X, T13.X, literal.x,
 ; CM-NEXT:     BFE_INT * T17.Y, PV.Z, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    4(5.605194e-45), 16(2.242078e-44)
   %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2867,15 +2835,15 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 72, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 59, @21, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T34.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T30.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T29.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T24.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
@@ -2889,89 +2857,76 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT * T23.Z, T20.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T23.Y, T20.Z, literal.x,
-; EG-NEXT:     LSHR * T20.W, T20.Y, literal.x,
+; EG-NEXT:     LSHR * T23.Y, T20.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T23.X, T20.Z, literal.x,
+; EG-NEXT:     LSHR T20.W, T20.Y, literal.y,
+; EG-NEXT:     LSHR * T24.X, KC0[2].Y, literal.z,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T20.Z, T20.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR * T25.W, T19.W, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T20.Y, T20.X, literal.y,
-; EG-NEXT:     LSHR T25.W, T19.W, literal.y,
+; EG-NEXT:     ADD_INT T26.X, T24.X, literal.x,
+; EG-NEXT:     LSHR T20.Y, T20.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     AND_INT T25.Z, T19.W, literal.z,
 ; EG-NEXT:     AND_INT * T20.X, T20.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T25.Z, T19.W, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR T25.Y, T19.Z, literal.y,
-; EG-NEXT:     LSHR T19.W, T19.Y, literal.y,
-; EG-NEXT:     AND_INT * T25.X, T19.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LSHR T25.Y, T19.Z, literal.x,
+; EG-NEXT:     LSHR * T19.W, T19.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T25.X, T19.Z, literal.x,
 ; EG-NEXT:     AND_INT T19.Z, T19.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T19.Y, T19.X, literal.y,
-; EG-NEXT:     LSHR T28.W, T22.W, literal.y,
-; EG-NEXT:     AND_INT * T19.X, T19.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T27.X, T24.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 12(1.681558e-44)
+; EG-NEXT:     LSHR T19.Y, T19.X, literal.x,
+; EG-NEXT:     LSHR * T28.W, T22.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T19.X, T19.X, literal.x,
 ; EG-NEXT:     AND_INT T28.Z, T22.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T28.Y, T22.Z, literal.y,
-; EG-NEXT:     LSHR T22.W, T22.Y, literal.y,
-; EG-NEXT:     AND_INT * T28.X, T22.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T29.X, T24.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT:     LSHR T28.Y, T22.Z, literal.x,
+; EG-NEXT:     LSHR * T22.W, T22.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T28.X, T22.Z, literal.x,
 ; EG-NEXT:     AND_INT T22.Z, T22.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T22.Y, T22.X, literal.y,
-; EG-NEXT:     LSHR T31.W, T21.W, literal.y,
-; EG-NEXT:     AND_INT * T22.X, T22.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T30.X, T24.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 20(2.802597e-44)
+; EG-NEXT:     LSHR T22.Y, T22.X, literal.x,
+; EG-NEXT:     LSHR * T31.W, T21.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T22.X, T22.X, literal.x,
 ; EG-NEXT:     AND_INT T31.Z, T21.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T31.Y, T21.Z, literal.y,
-; EG-NEXT:     LSHR T21.W, T21.Y, literal.y,
-; EG-NEXT:     AND_INT * T31.X, T21.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T32.X, T24.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LSHR T31.Y, T21.Z, literal.x,
+; EG-NEXT:     LSHR * T21.W, T21.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T31.X, T21.Z, literal.x,
 ; EG-NEXT:     AND_INT T21.Z, T21.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T21.Y, T21.X, literal.y,
-; EG-NEXT:     AND_INT * T21.X, T21.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T34.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T33.X, T24.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 28(3.923636e-44)
+; EG-NEXT:     LSHR * T21.Y, T21.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T21.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T34.X, T24.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
 ;
 ; CM-LABEL: global_zextload_v32i16_to_v32i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 3 @12
-; CM-NEXT:    ALU 65, @21, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T21.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T32.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T29.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T19.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T20.X
+; CM-NEXT:    ALU 59, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T20.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T34.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T22.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T30.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T19.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T27.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T25.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 12:
 ; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 48, #1
@@ -2985,68 +2940,62 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT * T23.Z, T20.Y, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR T23.Y, T20.X, literal.x,
-; CM-NEXT:     LSHR * T24.W, T20.W, literal.x,
+; CM-NEXT:     LSHR * T23.Y, T20.X, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T23.X, T20.X, literal.x,
-; CM-NEXT:     AND_INT T24.Z, T20.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
-; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T24.W, T20.W, literal.y,
+; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT:     LSHR T20.X, KC0[2].Y, literal.x,
+; CM-NEXT:     AND_INT * T24.Z, T20.W, literal.y,
+; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; CM-NEXT:     ADD_INT T25.X, PV.X, literal.x,
 ; CM-NEXT:     LSHR T24.Y, T20.Z, literal.y,
-; CM-NEXT:     LSHR * T25.W, T19.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     LSHR * T26.W, T19.Y, literal.y,
+; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; CM-NEXT:     AND_INT T24.X, T20.Z, literal.x,
-; CM-NEXT:     AND_INT T25.Z, T19.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
-; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T25.Y, T19.X, literal.y,
-; CM-NEXT:     LSHR * T27.W, T19.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T25.X, T19.X, literal.x,
-; CM-NEXT:     AND_INT T27.Z, T19.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
-; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T27.Y, T19.Z, literal.y,
-; CM-NEXT:     LSHR * T28.W, T22.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T27.X, T19.Z, literal.x,
-; CM-NEXT:     AND_INT T28.Z, T22.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
-; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T28.Y, T22.X, literal.y,
-; CM-NEXT:     LSHR * T30.W, T22.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T28.X, T22.X, literal.x,
-; CM-NEXT:     AND_INT T30.Z, T22.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T30.Y, T22.Z, literal.y,
-; CM-NEXT:     LSHR * T31.W, T21.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T30.X, T22.Z, literal.x,
-; CM-NEXT:     AND_INT T31.Z, T21.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; CM-NEXT:     LSHR T32.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T31.Y, T21.X, literal.y,
+; CM-NEXT:     AND_INT * T26.Z, T19.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T27.X, T20.X, literal.x,
+; CM-NEXT:     LSHR T26.Y, T19.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T28.W, T19.W, literal.y,
+; CM-NEXT:    28(3.923636e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T26.X, T19.X, literal.x,
+; CM-NEXT:     AND_INT * T28.Z, T19.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T19.X, T20.X, literal.x,
+; CM-NEXT:     LSHR T28.Y, T19.Z, literal.x,
+; CM-NEXT:     LSHR * T29.W, T22.Y, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     AND_INT T28.X, T19.Z, literal.x,
+; CM-NEXT:     AND_INT * T29.Z, T22.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T30.X, T20.X, literal.x,
+; CM-NEXT:     LSHR T29.Y, T22.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T31.W, T22.W, literal.y,
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T29.X, T22.X, literal.x,
+; CM-NEXT:     AND_INT * T31.Z, T22.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T22.X, T20.X, literal.x,
+; CM-NEXT:     LSHR T31.Y, T22.Z, literal.y,
+; CM-NEXT:     LSHR * T32.W, T21.Y, literal.y,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T31.X, T22.Z, literal.x,
+; CM-NEXT:     AND_INT T32.Z, T21.Y, literal.x,
 ; CM-NEXT:     LSHR * T33.W, T21.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T31.X, T21.X, literal.x,
-; CM-NEXT:     AND_INT * T33.Z, T21.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T34.X, T20.X, literal.x,
+; CM-NEXT:     LSHR T32.Y, T21.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     AND_INT * T33.Z, T21.W, literal.z,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
+; CM-NEXT:     AND_INT T32.X, T21.X, literal.x,
 ; CM-NEXT:     LSHR * T33.Y, T21.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T33.X, T21.Z, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T34.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     AND_INT * T33.X, T21.Z, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T21.X, T20.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3279,221 +3228,191 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ;
 ; EG-LABEL: global_sextload_v32i16_to_v32i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 9, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 73, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T22.X, 0
+; EG-NEXT:    ALU 69, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T19.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T34.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T28.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T20.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T26.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T24.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
-; EG-NEXT:     VTX_READ_128 T23.XYZW, T22.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T24.XYZW, T22.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T25.XYZW, T22.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T22.XYZW, T22.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 48, #1
 ; EG-NEXT:    ALU clause starting at 20:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T20.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T22.X, KC0[2].Z,
+; EG-NEXT:     MOV * T19.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 21:
+; EG-NEXT:     LSHR * T23.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 30:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.W, T22.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T28.X, PS, literal.x,
-; EG-NEXT:     LSHR T0.Y, T22.W, literal.y,
-; EG-NEXT:     BFE_INT T29.Z, T25.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T1.W, T24.Y, literal.y,
-; EG-NEXT:     LSHR * T2.W, T24.W, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T29.X, T25.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T1.Y, T23.Y, literal.x,
-; EG-NEXT:     BFE_INT T30.Z, T25.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T3.W, T23.W, literal.x,
-; EG-NEXT:     LSHR * T4.W, T25.W, literal.x,
+; EG-NEXT:     ADD_INT T24.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T26.X, T23.X, literal.x,
+; EG-NEXT:     ADD_INT * T27.X, T23.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T28.X, T23.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T19.W, literal.x,
+; EG-NEXT:     BFE_INT T29.Z, T22.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T0.W, T21.Y, literal.x,
+; EG-NEXT:     LSHR * T1.W, T21.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T30.X, T25.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T2.Y, T25.Y, literal.x,
-; EG-NEXT:     BFE_INT T31.Z, T23.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T29.X, T22.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR T1.Y, T20.Y, literal.x,
+; EG-NEXT:     BFE_INT T30.Z, T22.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T2.W, T20.W, literal.x,
+; EG-NEXT:     LSHR * T3.W, T22.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T30.X, T22.X, 0.0, literal.x,
+; EG-NEXT:     LSHR T2.Y, T22.Y, literal.x,
+; EG-NEXT:     BFE_INT T31.Z, T20.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T29.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T4.W, T25.Z, literal.x,
+; EG-NEXT:     LSHR * T3.W, T22.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T31.X, T23.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T31.X, T20.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T29.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T25.Z, T23.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.Z, T20.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T30.W, PV.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T4.W, T25.X, literal.x,
+; EG-NEXT:     LSHR * T3.W, T22.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T25.X, T23.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.X, T20.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T30.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T32.Z, T24.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.W, T23.Z, literal.x,
+; EG-NEXT:     BFE_INT T32.Z, T21.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T31.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T20.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T32.X, T24.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T32.X, T21.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T23.Z, T24.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T25.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.W, T23.X, literal.x,
+; EG-NEXT:     BFE_INT T20.Z, T21.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T20.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T23.X, T24.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T25.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T33.Z, T22.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T32.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T2.W, T24.Z, literal.x,
+; EG-NEXT:     BFE_INT T20.X, T21.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T22.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T33.Z, T19.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T32.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T21.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T33.X, T22.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T33.X, T19.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T32.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T24.Z, T22.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T23.W, T1.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T1.W, T24.X, literal.x,
+; EG-NEXT:     BFE_INT T21.Z, T19.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T20.W, T0.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T21.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T24.X, T22.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T23.Y, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.Z, T22.Z, literal.x,
-; EG-NEXT:     BFE_INT T33.W, T0.Y, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 112(1.569454e-43)
-; EG-NEXT:     LSHR T34.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T33.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.Z, T22.X, literal.y,
-; EG-NEXT:     BFE_INT T24.W, T0.W, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T24.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT T21.X, T19.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, literal.x,
+; EG-NEXT:     LSHR T0.Z, T19.Y, literal.x,
+; EG-NEXT:     BFE_INT T33.W, T0.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T19.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T34.X, T23.X, literal.x,
+; EG-NEXT:     BFE_INT T33.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T21.W, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T19.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T19.X, T23.X, literal.x,
+; EG-NEXT:     BFE_INT * T21.Y, PS, 0.0, literal.y,
+; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ;
 ; CM-LABEL: global_sextload_v32i16_to_v32i32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 0, @22, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 0 @14
-; CM-NEXT:    ALU 7, @23, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 2 @16
-; CM-NEXT:    ALU 76, @31, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    TEX 3 @12
+; CM-NEXT:    ALU 71, @21, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T28.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T27.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T24.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T19.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T28.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T27.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T25.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T24.X
 ; CM-NEXT:    CF_END
-; CM-NEXT:    Fetch clause starting at 14:
-; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; CM-NEXT:    Fetch clause starting at 16:
-; CM-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 48, #1
-; CM-NEXT:     VTX_READ_128 T23.XYZW, T19.X, 32, #1
-; CM-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 16, #1
-; CM-NEXT:    ALU clause starting at 22:
+; CM-NEXT:    Fetch clause starting at 12:
+; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 16, #1
+; CM-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 0, #1
+; CM-NEXT:    ALU clause starting at 20:
 ; CM-NEXT:     MOV * T19.X, KC0[2].Z,
-; CM-NEXT:    ALU clause starting at 23:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Y, T20.Z, literal.y,
-; CM-NEXT:     LSHR T0.Z, T20.W, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 31:
-; CM-NEXT:     LSHR T24.X, T0.W, literal.x,
-; CM-NEXT:     LSHR T1.Y, T20.Y, literal.y,
-; CM-NEXT:     LSHR T1.Z, T19.Z, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T2.Y, T19.W, literal.y,
-; CM-NEXT:     LSHR T2.Z, T19.X, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T3.Y, T19.Y, literal.y,
-; CM-NEXT:     LSHR T3.Z, T23.Z, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T27.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T4.Y, T23.W, literal.y,
-; CM-NEXT:     LSHR T4.Z, T23.X, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T28.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T5.Y, T23.Y, literal.y,
-; CM-NEXT:     BFE_INT T29.Z, T22.Y, 0.0, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    ALU clause starting at 21:
+; CM-NEXT:     LSHR * T23.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T24.X, PV.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T25.X, T23.X, literal.x,
+; CM-NEXT:     LSHR T0.Y, T19.W, literal.y,
+; CM-NEXT:     LSHR T0.Z, T19.Y, literal.y,
 ; CM-NEXT:     LSHR * T0.W, T22.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     BFE_INT T29.X, T22.X, 0.0, literal.x,
-; CM-NEXT:     LSHR T6.Y, T22.W, literal.x,
-; CM-NEXT:     BFE_INT T30.Z, T22.W, 0.0, literal.x,
+; CM-NEXT:    28(3.923636e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T26.X, T23.X, literal.x,
+; CM-NEXT:     LSHR T1.Y, T22.W, literal.x,
+; CM-NEXT:     LSHR T1.Z, T22.X, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:     LSHR * T1.W, T22.Y, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T30.X, T22.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T7.Y, T22.X, literal.x,
-; CM-NEXT:     BFE_INT T22.Z, T23.Y, 0.0, literal.x,
+; CM-NEXT:     ADD_INT T27.X, T23.X, literal.x,
+; CM-NEXT:     LSHR T2.Y, T21.Z, literal.y,
+; CM-NEXT:     LSHR T2.Z, T21.W, literal.y,
+; CM-NEXT:     LSHR * T2.W, T21.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T28.X, T23.X, literal.x,
+; CM-NEXT:     LSHR T3.Y, T21.Y, literal.y,
+; CM-NEXT:     BFE_INT T29.Z, T20.Y, 0.0, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T3.W, T20.Z, literal.y,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     BFE_INT T29.X, T20.X, 0.0, literal.x,
+; CM-NEXT:     LSHR T4.Y, T20.W, literal.x,
+; CM-NEXT:     BFE_INT T30.Z, T20.W, 0.0, literal.x,
+; CM-NEXT:     LSHR * T4.W, T20.Y, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T30.X, T20.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR T5.Y, T20.X, literal.x,
+; CM-NEXT:     BFE_INT T20.Z, T21.Y, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T29.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T22.X, T23.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.X, T21.X, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T29.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T31.Z, T23.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T30.W, T6.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T31.Z, T21.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T30.W, T4.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T31.X, T23.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T30.Y, T0.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T23.Z, T19.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T22.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T31.X, T21.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T30.Y, T3.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T21.Z, T22.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T20.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T23.X, T19.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T22.Y, T4.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T32.Z, T19.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T31.W, T4.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T21.X, T22.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.Y, T2.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T32.Z, T22.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T31.W, T2.Z, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T32.X, T19.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T31.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T19.Z, T20.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T23.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T32.X, T22.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T31.Y, T2.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T22.Z, T19.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T21.W, T1.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T19.X, T20.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T23.Y, T2.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T33.Z, T20.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T32.W, T2.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T22.X, T19.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T21.Y, T1.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T33.Z, T19.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T32.W, T1.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T33.X, T20.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T32.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     LSHR T1.Z, T20.X, literal.x,
-; CM-NEXT:     BFE_INT * T19.W, T1.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T33.X, T19.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T32.Y, T0.W, 0.0, literal.x,
+; CM-NEXT:     LSHR T1.Z, T19.X, literal.x,
+; CM-NEXT:     BFE_INT * T22.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T20.X, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT T19.Y, PV.Z, 0.0, literal.y,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     BFE_INT * T33.W, T0.Z, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T34.X, PV.Z, literal.x,
-; CM-NEXT:     BFE_INT * T33.Y, T0.Y, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T19.X, T23.X, literal.x,
+; CM-NEXT:     BFE_INT T22.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT:     LSHR T0.Z, T19.Z, literal.y,
+; CM-NEXT:     BFE_INT * T33.W, T0.Y, 0.0, literal.y,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T34.X, T23.X, literal.x,
+; CM-NEXT:     BFE_INT * T33.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT:    4(5.605194e-45), 16(2.242078e-44)
   %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3967,37 +3886,37 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; EG-LABEL: global_zextload_v64i16_to_v64i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 3 @22
-; EG-NEXT:    ALU 56, @39, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 3 @30
-; EG-NEXT:    ALU 87, @96, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T66.X, 0
+; EG-NEXT:    TEX 2 @22
+; EG-NEXT:    ALU 35, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 4 @28
+; EG-NEXT:    ALU 81, @75, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T66.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T65.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T64.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T64.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T62.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T61.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T61.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T58.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T53.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T55.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T53.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T52.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T50.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T44.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T43.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T40.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T42.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T36.XYZW, T37.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T38.XYZW, T37.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T39.XYZW, T37.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T40.XYZW, T37.X, 16, #1
-; EG-NEXT:    Fetch clause starting at 30:
-; EG-NEXT:     VTX_READ_128 T49.XYZW, T37.X, 112, #1
-; EG-NEXT:     VTX_READ_128 T50.XYZW, T37.X, 96, #1
-; EG-NEXT:     VTX_READ_128 T51.XYZW, T37.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T52.XYZW, T37.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T38.XYZW, T37.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T39.XYZW, T37.X, 16, #1
+; EG-NEXT:    Fetch clause starting at 28:
+; EG-NEXT:     VTX_READ_128 T45.XYZW, T37.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T46.XYZW, T37.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T47.XYZW, T37.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T48.XYZW, T37.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T49.XYZW, T37.X, 48, #1
 ; EG-NEXT:    ALU clause starting at 38:
 ; EG-NEXT:     MOV * T37.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 39:
@@ -4005,317 +3924,276 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT * T35.Z, T36.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T35.Y, T36.Z, literal.x,
-; EG-NEXT:     LSHR * T36.W, T36.Y, literal.x,
+; EG-NEXT:     LSHR * T35.Y, T36.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T35.X, T36.Z, literal.x,
+; EG-NEXT:     LSHR T36.W, T36.Y, literal.y,
+; EG-NEXT:     LSHR * T40.X, KC0[2].Y, literal.z,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T36.Z, T36.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     LSHR * T41.W, T39.W, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T36.Y, T36.X, literal.y,
-; EG-NEXT:     LSHR T42.W, T40.W, literal.y,
+; EG-NEXT:     ADD_INT T42.X, T40.X, literal.x,
+; EG-NEXT:     LSHR T36.Y, T36.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     AND_INT T41.Z, T39.W, literal.z,
 ; EG-NEXT:     AND_INT * T36.X, T36.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T42.Z, T40.W, literal.x,
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T43.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR T42.Y, T40.Z, literal.y,
-; EG-NEXT:     LSHR T40.W, T40.Y, literal.y,
-; EG-NEXT:     AND_INT * T42.X, T40.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T40.Z, T40.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T40.Y, T40.X, literal.y,
-; EG-NEXT:     LSHR T45.W, T39.W, literal.y,
-; EG-NEXT:     AND_INT * T40.X, T40.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T45.Z, T39.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T45.Y, T39.Z, literal.y,
-; EG-NEXT:     LSHR T39.W, T39.Y, literal.y,
-; EG-NEXT:     AND_INT * T45.X, T39.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LSHR T41.Y, T39.Z, literal.x,
+; EG-NEXT:     LSHR * T39.W, T39.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T41.X, T39.Z, literal.x,
 ; EG-NEXT:     AND_INT T39.Z, T39.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
-; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T39.Y, T39.X, literal.y,
-; EG-NEXT:     AND_INT * T39.X, T39.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT * T43.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 12(1.681558e-44)
+; EG-NEXT:     LSHR * T39.Y, T39.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T39.X, T39.X, literal.x,
+; EG-NEXT:     LSHR T37.W, T38.W, literal.y,
+; EG-NEXT:     ADD_INT * T44.X, T40.X, literal.z,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT * T37.Z, T38.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T37.W, T38.W, literal.y,
-; EG-NEXT:    64(8.968310e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T37.Z, T38.W, literal.y,
-; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT:    ALU clause starting at 96:
+; EG-NEXT:    ALU clause starting at 75:
 ; EG-NEXT:     LSHR T37.Y, T38.Z, literal.x,
 ; EG-NEXT:     LSHR * T38.W, T38.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T37.X, T38.Z, literal.x,
 ; EG-NEXT:     AND_INT T38.Z, T38.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
-; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T38.Y, T38.X, literal.y,
-; EG-NEXT:     LSHR T54.W, T52.W, literal.y,
-; EG-NEXT:     AND_INT * T38.X, T38.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T54.Z, T52.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT:     LSHR T55.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T54.Y, T52.Z, literal.y,
-; EG-NEXT:     LSHR T52.W, T52.Y, literal.y,
-; EG-NEXT:     AND_INT * T54.X, T52.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T52.Z, T52.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
-; EG-NEXT:     LSHR T56.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T52.Y, T52.X, literal.y,
-; EG-NEXT:     LSHR T57.W, T51.W, literal.y,
-; EG-NEXT:     AND_INT * T52.X, T52.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T57.Z, T51.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
-; EG-NEXT:     LSHR T58.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T57.Y, T51.Z, literal.y,
-; EG-NEXT:     LSHR T51.W, T51.Y, literal.y,
-; EG-NEXT:     AND_INT * T57.X, T51.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T51.Z, T51.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
-; EG-NEXT:     LSHR T59.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T51.Y, T51.X, literal.y,
-; EG-NEXT:     LSHR T60.W, T50.W, literal.y,
-; EG-NEXT:     AND_INT * T51.X, T51.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T60.Z, T50.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
-; EG-NEXT:     LSHR T61.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T60.Y, T50.Z, literal.y,
-; EG-NEXT:     LSHR T50.W, T50.Y, literal.y,
-; EG-NEXT:     AND_INT * T60.X, T50.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T50.Z, T50.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
-; EG-NEXT:     LSHR T62.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T50.Y, T50.X, literal.y,
-; EG-NEXT:     LSHR T63.W, T49.W, literal.y,
-; EG-NEXT:     AND_INT * T50.X, T50.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T63.Z, T49.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
-; EG-NEXT:     LSHR T64.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T63.Y, T49.Z, literal.y,
-; EG-NEXT:     LSHR T49.W, T49.Y, literal.y,
-; EG-NEXT:     AND_INT * T63.X, T49.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T50.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 20(2.802597e-44)
+; EG-NEXT:     LSHR T38.Y, T38.X, literal.x,
+; EG-NEXT:     LSHR * T51.W, T49.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T38.X, T38.X, literal.x,
+; EG-NEXT:     AND_INT T51.Z, T49.W, literal.x,
+; EG-NEXT:     ADD_INT * T52.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LSHR T51.Y, T49.Z, literal.x,
+; EG-NEXT:     LSHR * T49.W, T49.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T51.X, T49.Z, literal.x,
 ; EG-NEXT:     AND_INT T49.Z, T49.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    65535(9.183409e-41), 240(3.363116e-43)
-; EG-NEXT:     LSHR T65.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T49.Y, T49.X, literal.y,
-; EG-NEXT:     AND_INT * T49.X, T49.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T66.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T53.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 28(3.923636e-44)
+; EG-NEXT:     LSHR T49.Y, T49.X, literal.x,
+; EG-NEXT:     LSHR * T54.W, T48.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T49.X, T49.X, literal.x,
+; EG-NEXT:     AND_INT T54.Z, T48.W, literal.x,
+; EG-NEXT:     ADD_INT * T55.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT:     LSHR T54.Y, T48.Z, literal.x,
+; EG-NEXT:     LSHR * T48.W, T48.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T54.X, T48.Z, literal.x,
+; EG-NEXT:     AND_INT T48.Z, T48.Y, literal.x,
+; EG-NEXT:     ADD_INT * T56.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 36(5.044674e-44)
+; EG-NEXT:     LSHR T48.Y, T48.X, literal.x,
+; EG-NEXT:     LSHR * T57.W, T47.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T48.X, T48.X, literal.x,
+; EG-NEXT:     AND_INT T57.Z, T47.W, literal.x,
+; EG-NEXT:     ADD_INT * T58.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LSHR T57.Y, T47.Z, literal.x,
+; EG-NEXT:     LSHR * T47.W, T47.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T57.X, T47.Z, literal.x,
+; EG-NEXT:     AND_INT T47.Z, T47.Y, literal.x,
+; EG-NEXT:     ADD_INT * T59.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 44(6.165713e-44)
+; EG-NEXT:     LSHR T47.Y, T47.X, literal.x,
+; EG-NEXT:     LSHR * T60.W, T46.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T47.X, T47.X, literal.x,
+; EG-NEXT:     AND_INT T60.Z, T46.W, literal.x,
+; EG-NEXT:     ADD_INT * T61.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT:     LSHR T60.Y, T46.Z, literal.x,
+; EG-NEXT:     LSHR * T46.W, T46.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T60.X, T46.Z, literal.x,
+; EG-NEXT:     AND_INT T46.Z, T46.Y, literal.x,
+; EG-NEXT:     ADD_INT * T62.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 52(7.286752e-44)
+; EG-NEXT:     LSHR T46.Y, T46.X, literal.x,
+; EG-NEXT:     LSHR * T63.W, T45.W, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T46.X, T46.X, literal.x,
+; EG-NEXT:     AND_INT T63.Z, T45.W, literal.x,
+; EG-NEXT:     ADD_INT * T64.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LSHR T63.Y, T45.Z, literal.x,
+; EG-NEXT:     LSHR * T45.W, T45.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T63.X, T45.Z, literal.x,
+; EG-NEXT:     AND_INT T45.Z, T45.Y, literal.x,
+; EG-NEXT:     ADD_INT * T65.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 60(8.407791e-44)
+; EG-NEXT:     LSHR * T45.Y, T45.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T45.X, T45.X, literal.x,
+; EG-NEXT:     ADD_INT * T66.X, T40.X, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 56(7.847271e-44)
 ;
 ; CM-LABEL: global_zextload_v64i16_to_v64i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 3 @22
-; CM-NEXT:    ALU 50, @39, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 3 @30
-; CM-NEXT:    ALU 78, @90, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T48.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T62, T64.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T60, T49.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T59, T61.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T57, T50.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T56, T58.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T54, T51.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T53, T55.X
+; CM-NEXT:    TEX 2 @22
+; CM-NEXT:    ALU 32, @39, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    TEX 4 @28
+; CM-NEXT:    ALU 82, @72, KC0[], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T65, T45.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T64, T36.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T66.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T61, T46.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T60, T62.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T58, T47.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T57, T59.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T55, T48.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T54, T56.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T52, T49.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T51, T53.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T37.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T52.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T38.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T46.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T39.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T43.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T36.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T50.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T38.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T43.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T41.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 22:
 ; CM-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 112, #1
-; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 64, #1
-; CM-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 80, #1
-; CM-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 96, #1
-; CM-NEXT:    Fetch clause starting at 30:
-; CM-NEXT:     VTX_READ_128 T48.XYZW, T35.X, 0, #1
-; CM-NEXT:     VTX_READ_128 T49.XYZW, T35.X, 16, #1
-; CM-NEXT:     VTX_READ_128 T50.XYZW, T35.X, 32, #1
-; CM-NEXT:     VTX_READ_128 T51.XYZW, T35.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 80, #1
+; CM-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 96, #1
+; CM-NEXT:    Fetch clause starting at 28:
+; CM-NEXT:     VTX_READ_128 T45.XYZW, T35.X, 0, #1
+; CM-NEXT:     VTX_READ_128 T46.XYZW, T35.X, 16, #1
+; CM-NEXT:     VTX_READ_128 T47.XYZW, T35.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T48.XYZW, T35.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T49.XYZW, T35.X, 64, #1
 ; CM-NEXT:    ALU clause starting at 38:
 ; CM-NEXT:     MOV * T35.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 39:
-; CM-NEXT:     LSHR * T40.W, T36.Y, literal.x,
+; CM-NEXT:     LSHR * T39.W, T36.Y, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT * T40.Z, T36.Y, literal.x,
+; CM-NEXT:     AND_INT * T39.Z, T36.Y, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR T40.Y, T36.X, literal.x,
-; CM-NEXT:     LSHR * T41.W, T36.W, literal.x,
+; CM-NEXT:     LSHR * T39.Y, T36.X, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T40.X, T36.X, literal.x,
-; CM-NEXT:     AND_INT T41.Z, T36.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 224(3.138909e-43)
-; CM-NEXT:     LSHR T36.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T41.Y, T36.Z, literal.y,
-; CM-NEXT:     LSHR * T42.W, T39.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T41.X, T36.Z, literal.x,
-; CM-NEXT:     AND_INT T42.Z, T39.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 240(3.363116e-43)
-; CM-NEXT:     LSHR T43.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T42.Y, T39.X, literal.y,
-; CM-NEXT:     LSHR * T44.W, T39.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T42.X, T39.X, literal.x,
-; CM-NEXT:     AND_INT T44.Z, T39.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T44.Y, T39.Z, literal.y,
-; CM-NEXT:     LSHR * T45.W, T38.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T44.X, T39.Z, literal.x,
-; CM-NEXT:     AND_INT T45.Z, T38.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
-; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T45.Y, T38.X, literal.y,
-; CM-NEXT:     LSHR * T47.W, T38.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T45.X, T38.X, literal.x,
-; CM-NEXT:     AND_INT T47.Z, T38.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
-; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T47.Y, T38.Z, literal.y,
+; CM-NEXT:     AND_INT T39.X, T36.X, literal.x,
+; CM-NEXT:     LSHR * T40.W, T36.W, literal.y,
+; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT:     LSHR T36.X, KC0[2].Y, literal.x,
+; CM-NEXT:     AND_INT * T40.Z, T36.W, literal.y,
+; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; CM-NEXT:     ADD_INT T41.X, PV.X, literal.x,
+; CM-NEXT:     LSHR T40.Y, T36.Z, literal.y,
+; CM-NEXT:     LSHR * T42.W, T38.Y, literal.y,
+; CM-NEXT:    56(7.847271e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T40.X, T36.Z, literal.x,
+; CM-NEXT:     AND_INT * T42.Z, T38.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T43.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T42.Y, T38.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T44.W, T38.W, literal.y,
+; CM-NEXT:    60(8.407791e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T42.X, T38.X, literal.x,
+; CM-NEXT:     AND_INT * T44.Z, T38.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T38.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T44.Y, T38.Z, literal.y,
 ; CM-NEXT:     LSHR * T35.W, T37.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T47.X, T38.Z, literal.x,
-; CM-NEXT:     AND_INT T35.Z, T37.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
-; CM-NEXT:    ALU clause starting at 90:
-; CM-NEXT:     LSHR T52.X, T0.W, literal.x,
-; CM-NEXT:     LSHR T35.Y, T37.X, literal.y,
-; CM-NEXT:     LSHR * T53.W, T37.W, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    48(6.726233e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T44.X, T38.Z, literal.x,
+; CM-NEXT:     AND_INT * T35.Z, T37.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:    ALU clause starting at 72:
+; CM-NEXT:     ADD_INT T50.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T35.Y, T37.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T51.W, T37.W, literal.y,
+; CM-NEXT:    52(7.286752e-44), 16(2.242078e-44)
 ; CM-NEXT:     AND_INT T35.X, T37.X, literal.x,
-; CM-NEXT:     AND_INT T53.Z, T37.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
-; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T53.Y, T37.Z, literal.y,
-; CM-NEXT:     LSHR * T54.W, T51.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T53.X, T37.Z, literal.x,
-; CM-NEXT:     AND_INT T54.Z, T51.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
-; CM-NEXT:     LSHR T55.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T54.Y, T51.X, literal.y,
-; CM-NEXT:     LSHR * T56.W, T51.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T54.X, T51.X, literal.x,
-; CM-NEXT:     AND_INT T56.Z, T51.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
-; CM-NEXT:     LSHR T51.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T56.Y, T51.Z, literal.y,
-; CM-NEXT:     LSHR * T57.W, T50.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T56.X, T51.Z, literal.x,
-; CM-NEXT:     AND_INT T57.Z, T50.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
-; CM-NEXT:     LSHR T58.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T57.Y, T50.X, literal.y,
-; CM-NEXT:     LSHR * T59.W, T50.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T57.X, T50.X, literal.x,
-; CM-NEXT:     AND_INT T59.Z, T50.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
-; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T59.Y, T50.Z, literal.y,
-; CM-NEXT:     LSHR * T60.W, T49.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T59.X, T50.Z, literal.x,
-; CM-NEXT:     AND_INT T60.Z, T49.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
-; CM-NEXT:     LSHR T61.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T60.Y, T49.X, literal.y,
-; CM-NEXT:     LSHR * T62.W, T49.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T60.X, T49.X, literal.x,
-; CM-NEXT:     AND_INT T62.Z, T49.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; CM-NEXT:     LSHR T49.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T62.Y, T49.Z, literal.y,
-; CM-NEXT:     LSHR * T63.W, T48.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T62.X, T49.Z, literal.x,
-; CM-NEXT:     AND_INT T63.Z, T48.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
-; CM-NEXT:     LSHR T64.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T63.Y, T48.X, literal.y,
-; CM-NEXT:     LSHR * T65.W, T48.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T63.X, T48.X, literal.x,
-; CM-NEXT:     AND_INT * T65.Z, T48.W, literal.x,
+; CM-NEXT:     AND_INT * T51.Z, T37.W, literal.x,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     LSHR T48.X, KC0[2].Y, literal.x,
-; CM-NEXT:     LSHR * T65.Y, T48.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T65.X, T48.Z, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT:     ADD_INT T37.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T51.Y, T37.Z, literal.y,
+; CM-NEXT:     LSHR * T52.W, T49.Y, literal.y,
+; CM-NEXT:    40(5.605194e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T51.X, T37.Z, literal.x,
+; CM-NEXT:     AND_INT * T52.Z, T49.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T53.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T52.Y, T49.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T54.W, T49.W, literal.y,
+; CM-NEXT:    44(6.165713e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T52.X, T49.X, literal.x,
+; CM-NEXT:     AND_INT * T54.Z, T49.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T49.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T54.Y, T49.Z, literal.y,
+; CM-NEXT:     LSHR * T55.W, T48.Y, literal.y,
+; CM-NEXT:    32(4.484155e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T54.X, T49.Z, literal.x,
+; CM-NEXT:     AND_INT * T55.Z, T48.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T56.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T55.Y, T48.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T57.W, T48.W, literal.y,
+; CM-NEXT:    36(5.044674e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T55.X, T48.X, literal.x,
+; CM-NEXT:     AND_INT * T57.Z, T48.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T48.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T57.Y, T48.Z, literal.y,
+; CM-NEXT:     LSHR * T58.W, T47.Y, literal.y,
+; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T57.X, T48.Z, literal.x,
+; CM-NEXT:     AND_INT * T58.Z, T47.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T59.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T58.Y, T47.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T60.W, T47.W, literal.y,
+; CM-NEXT:    28(3.923636e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T58.X, T47.X, literal.x,
+; CM-NEXT:     AND_INT * T60.Z, T47.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T47.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T60.Y, T47.Z, literal.x,
+; CM-NEXT:     LSHR * T61.W, T46.Y, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     AND_INT T60.X, T47.Z, literal.x,
+; CM-NEXT:     AND_INT * T61.Z, T46.Y, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T62.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T61.Y, T46.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T63.W, T46.W, literal.y,
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T61.X, T46.X, literal.x,
+; CM-NEXT:     AND_INT * T63.Z, T46.W, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T46.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T63.Y, T46.Z, literal.y,
+; CM-NEXT:     LSHR * T64.W, T45.Y, literal.y,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T63.X, T46.Z, literal.x,
+; CM-NEXT:     AND_INT T64.Z, T45.Y, literal.x,
+; CM-NEXT:     LSHR * T65.W, T45.W, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T66.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T66.X, T36.X, literal.x,
+; CM-NEXT:     LSHR T64.Y, T45.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     AND_INT * T65.Z, T45.W, literal.z,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     AND_INT T64.X, T45.X, literal.x,
+; CM-NEXT:     LSHR * T65.Y, T45.Z, literal.y,
+; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT:     AND_INT * T65.X, T45.Z, literal.x,
+; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T45.X, T36.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4773,421 +4651,363 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ;
 ; EG-LABEL: global_sextload_v64i16_to_v64i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 18, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @22
-; EG-NEXT:    ALU 75, @57, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 71, @133, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T41.X, 0
+; EG-NEXT:    ALU 73, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 65, @113, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T35.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T66.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T56.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T56.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T55.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T54.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T54.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T53.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T52.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T52.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T51.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T50.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T50.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T49.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T39.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T38.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T37.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T35.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T48.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T47.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T46.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T45.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T43.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T44.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 22:
-; EG-NEXT:     VTX_READ_128 T42.XYZW, T41.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T43.XYZW, T41.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T44.XYZW, T41.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T45.XYZW, T41.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T46.XYZW, T41.X, 64, #1
-; EG-NEXT:     VTX_READ_128 T47.XYZW, T41.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T48.XYZW, T41.X, 96, #1
-; EG-NEXT:     VTX_READ_128 T41.XYZW, T41.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T41.XYZW, T35.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T40.XYZW, T35.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T42.XYZW, T35.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T35.XYZW, T35.X, 112, #1
 ; EG-NEXT:    ALU clause starting at 38:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T36.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T41.X, KC0[2].Z,
+; EG-NEXT:     MOV * T35.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 39:
+; EG-NEXT:     LSHR * T43.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 57:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T50.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T51.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T52.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.Y, T41.Y, literal.y,
-; EG-NEXT:     LSHR T0.Z, T41.W, literal.y,
-; EG-NEXT:     LSHR T0.W, T48.Y, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T54.X, PS, literal.x,
-; EG-NEXT:     LSHR T1.Y, T48.W, literal.y,
-; EG-NEXT:     LSHR T1.Z, T47.Y, literal.y,
-; EG-NEXT:     LSHR T1.W, T47.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T55.X, PS, literal.x,
-; EG-NEXT:     LSHR T2.Y, T46.Y, literal.y,
-; EG-NEXT:     LSHR T2.Z, T46.W, literal.y,
-; EG-NEXT:     LSHR T2.W, T45.Y, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T56.X, PS, literal.x,
-; EG-NEXT:     LSHR T3.Y, T45.W, literal.y,
-; EG-NEXT:     BFE_INT T57.Z, T44.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T3.W, T43.Y, literal.y,
-; EG-NEXT:     LSHR * T4.W, T43.W, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T57.X, T44.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T4.Y, T42.Y, literal.x,
-; EG-NEXT:     BFE_INT T58.Z, T44.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T5.W, T42.W, literal.x,
-; EG-NEXT:     LSHR * T6.W, T44.W, literal.x,
+; EG-NEXT:     ADD_INT T44.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T45.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T46.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T47.X, T43.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T48.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T49.X, T43.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T50.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T51.X, T43.X, literal.y,
+; EG-NEXT:    24(3.363116e-44), 36(5.044674e-44)
+; EG-NEXT:     ADD_INT T52.X, T43.X, literal.x,
+; EG-NEXT:     ADD_INT * T53.X, T43.X, literal.y,
+; EG-NEXT:    32(4.484155e-44), 44(6.165713e-44)
+; EG-NEXT:     ADD_INT T54.X, T43.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T35.W, literal.y,
+; EG-NEXT:     LSHR T0.Z, T36.Y, literal.y,
+; EG-NEXT:     LSHR T0.W, T36.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T37.Y, literal.y,
+; EG-NEXT:    40(5.605194e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T55.X, T43.X, literal.x,
+; EG-NEXT:     LSHR T1.Y, T37.W, literal.y,
+; EG-NEXT:     LSHR T1.Z, T38.Y, literal.y,
+; EG-NEXT:     LSHR T2.W, T38.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T39.Y, literal.y,
+; EG-NEXT:    52(7.286752e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T56.X, T43.X, literal.x,
+; EG-NEXT:     LSHR T2.Y, T39.W, literal.y,
+; EG-NEXT:     BFE_INT T57.Z, T42.W, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T4.W, T40.Y, literal.y,
+; EG-NEXT:     LSHR * T5.W, T40.W, literal.y,
+; EG-NEXT:    48(6.726233e-44), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT T57.X, T42.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR T3.Y, T41.Y, literal.x,
+; EG-NEXT:     BFE_INT T58.Z, T42.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T6.W, T41.W, literal.x,
+; EG-NEXT:     LSHR * T7.W, T42.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T58.X, T44.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T5.Y, T44.Y, literal.x,
-; EG-NEXT:     BFE_INT T59.Z, T42.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T58.X, T42.X, 0.0, literal.x,
+; EG-NEXT:     LSHR T4.Y, T42.Y, literal.x,
+; EG-NEXT:     BFE_INT T59.Z, T41.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T57.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T44.Z, literal.x,
+; EG-NEXT:     LSHR * T7.W, T42.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T59.X, T42.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T59.X, T41.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T57.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T44.Z, T42.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.Z, T41.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T58.W, PV.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T44.X, literal.x,
+; EG-NEXT:     LSHR * T7.W, T42.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T44.X, T42.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.X, T41.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T58.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T60.Z, T43.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T59.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T5.W, T42.Z, literal.x,
+; EG-NEXT:     BFE_INT T60.Z, T40.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T59.W, T6.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T6.W, T41.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T60.X, T43.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T60.X, T40.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T59.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.Z, T43.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T44.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T5.W, T42.X, literal.x,
+; EG-NEXT:     BFE_INT T41.Z, T40.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T6.W, T41.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T42.X, T43.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T44.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T61.Z, T45.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT * T60.W, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T41.X, T40.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T61.Z, T39.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T60.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T5.W, T40.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 133:
-; EG-NEXT:     LSHR * T4.W, T43.Z, literal.x,
+; EG-NEXT:     BFE_INT T61.X, T39.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT * T60.Y, PS, 0.0, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T61.X, T45.Z, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T60.Y, PV.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.Z, T45.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.W, T3.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T43.X, literal.x,
+; EG-NEXT:    ALU clause starting at 113:
+; EG-NEXT:     BFE_INT T40.Z, T39.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.W, T4.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T4.W, T40.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T43.X, T45.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T62.Z, T46.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T61.W, T3.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T45.Z, literal.x,
+; EG-NEXT:     BFE_INT T40.X, T39.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T62.Z, T38.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T61.W, T2.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T4.W, T39.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T62.X, T46.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T62.X, T38.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T61.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.Z, T46.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.W, T2.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T2.W, T45.X, literal.x,
+; EG-NEXT:     BFE_INT T39.Z, T38.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.W, T3.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T3.W, T39.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T45.X, T46.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T63.Z, T47.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T62.W, T2.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T2.W, T46.Z, literal.x,
+; EG-NEXT:     BFE_INT T39.X, T38.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T63.Z, T37.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T62.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T38.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T63.X, T47.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T63.X, T37.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T62.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.Z, T47.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T2.W, T46.X, literal.x,
+; EG-NEXT:     BFE_INT T38.Z, T37.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T2.W, T38.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T46.X, T47.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T64.Z, T48.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T63.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T1.W, T47.Z, literal.x,
+; EG-NEXT:     BFE_INT T38.X, T37.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T64.Z, T36.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T63.W, T1.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T2.W, T37.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T64.X, T48.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T64.X, T36.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T63.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T47.Z, T48.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T1.W, T47.X, literal.x,
+; EG-NEXT:     BFE_INT T37.Z, T36.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T38.W, T1.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T1.W, T37.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T47.X, T48.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T65.Z, T41.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T64.W, T1.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T1.W, T48.Z, literal.x,
+; EG-NEXT:     BFE_INT T37.X, T36.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T38.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T65.Z, T35.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T36.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T65.X, T41.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T65.X, T35.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T64.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T48.Z, T41.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T47.W, T0.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T0.W, T48.X, literal.x,
+; EG-NEXT:     BFE_INT T36.Z, T35.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T37.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T36.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T48.X, T41.X, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T47.Y, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR T1.Z, T41.Z, literal.x,
-; EG-NEXT:     BFE_INT T65.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 240(3.363116e-43)
-; EG-NEXT:     LSHR T66.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T65.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.Z, T41.X, literal.y,
-; EG-NEXT:     BFE_INT T48.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T41.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T48.Y, PV.Z, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT T36.X, T35.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T37.Y, PS, 0.0, literal.x,
+; EG-NEXT:     LSHR T0.Z, T35.Y, literal.x,
+; EG-NEXT:     BFE_INT T65.W, T0.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T0.W, T35.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T66.X, T43.X, literal.x,
+; EG-NEXT:     BFE_INT T65.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T36.W, PV.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T35.X, literal.y,
+; EG-NEXT:    60(8.407791e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T35.X, T43.X, literal.x,
+; EG-NEXT:     BFE_INT * T36.Y, PS, 0.0, literal.y,
+; EG-NEXT:    56(7.847271e-44), 16(2.242078e-44)
 ;
 ; CM-LABEL: global_sextload_v64i16_to_v64i32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 0, @40, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 1 @24
-; CM-NEXT:    ALU 15, @41, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 5 @28
-; CM-NEXT:    ALU 82, @57, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    ALU 72, @140, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    TEX 7 @22
+; CM-NEXT:    ALU 75, @39, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 67, @115, KC0[], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T35.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T64, T56.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T55.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T54.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T53.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T62, T52.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T51.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T61, T50.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T49.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T60, T48.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T47.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T59, T46.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T40.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T58, T39.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T57, T38.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T43.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T64, T35.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T56.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T55.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T54.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T62, T53.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T52.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T61, T51.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T50.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T60, T49.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T48.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T59, T47.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T46.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T58, T45.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T57, T44.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
-; CM-NEXT:    Fetch clause starting at 24:
-; CM-NEXT:     VTX_READ_128 T36.XYZW, T37.X, 16, #1
-; CM-NEXT:     VTX_READ_128 T35.XYZW, T37.X, 0, #1
-; CM-NEXT:    Fetch clause starting at 28:
-; CM-NEXT:     VTX_READ_128 T41.XYZW, T37.X, 112, #1
-; CM-NEXT:     VTX_READ_128 T42.XYZW, T37.X, 96, #1
-; CM-NEXT:     VTX_READ_128 T43.XYZW, T37.X, 80, #1
-; CM-NEXT:     VTX_READ_128 T44.XYZW, T37.X, 64, #1
-; CM-NEXT:     VTX_READ_128 T45.XYZW, T37.X, 48, #1
-; CM-NEXT:     VTX_READ_128 T37.XYZW, T37.X, 32, #1
-; CM-NEXT:    ALU clause starting at 40:
-; CM-NEXT:     MOV * T37.X, KC0[2].Z,
-; CM-NEXT:    ALU clause starting at 41:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Y, T35.Z, literal.y,
-; CM-NEXT:     LSHR T0.Z, T35.W, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T1.Y, T35.Y, literal.y,
-; CM-NEXT:     LSHR T1.Z, T36.Z, literal.y,
-; CM-NEXT:     LSHR * T0.W, T36.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    ALU clause starting at 57:
-; CM-NEXT:     LSHR T2.Z, T36.X, literal.x,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; CM-NEXT:    16(2.242078e-44), 208(2.914701e-43)
-; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T2.Y, T36.Y, literal.y,
-; CM-NEXT:     LSHR T3.Z, T37.Z, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    160(2.242078e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T47.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T3.Y, T37.W, literal.y,
-; CM-NEXT:     LSHR T4.Z, T37.X, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    176(2.466285e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T48.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T4.Y, T37.Y, literal.y,
-; CM-NEXT:     LSHR T5.Z, T45.Z, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    128(1.793662e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T49.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T5.Y, T45.W, literal.y,
-; CM-NEXT:     LSHR T6.Z, T45.X, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    144(2.017870e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T6.Y, T45.Y, literal.y,
-; CM-NEXT:     LSHR T7.Z, T44.Z, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T51.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T7.Y, T44.W, literal.y,
-; CM-NEXT:     LSHR T8.Z, T44.X, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T52.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T8.Y, T44.Y, literal.y,
-; CM-NEXT:     LSHR T9.Z, T43.Z, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T53.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T9.Y, T43.W, literal.y,
-; CM-NEXT:     LSHR T10.Z, T43.X, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T54.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T10.Y, T43.Y, literal.y,
-; CM-NEXT:     LSHR T11.Z, T42.Z, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T55.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T11.Y, T42.W, literal.y,
-; CM-NEXT:     LSHR T12.Z, T42.X, literal.y,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    Fetch clause starting at 22:
+; CM-NEXT:     VTX_READ_128 T42.XYZW, T35.X, 112, #1
+; CM-NEXT:     VTX_READ_128 T41.XYZW, T35.X, 96, #1
+; CM-NEXT:     VTX_READ_128 T40.XYZW, T35.X, 80, #1
+; CM-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 64, #1
+; CM-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 16, #1
+; CM-NEXT:     VTX_READ_128 T35.XYZW, T35.X, 0, #1
+; CM-NEXT:    ALU clause starting at 38:
+; CM-NEXT:     MOV * T35.X, KC0[2].Z,
+; CM-NEXT:    ALU clause starting at 39:
+; CM-NEXT:     LSHR * T43.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T44.X, PV.X, literal.x,
+; CM-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T45.X, T43.X, literal.x,
+; CM-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T46.X, T43.X, literal.x,
 ; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T56.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T12.Y, T42.Y, literal.y,
-; CM-NEXT:     BFE_INT T57.Z, T41.Y, 0.0, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:     LSHR * T1.W, T41.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     BFE_INT T57.X, T41.X, 0.0, literal.x,
-; CM-NEXT:     LSHR T13.Y, T41.W, literal.x,
-; CM-NEXT:     BFE_INT T58.Z, T41.W, 0.0, literal.x,
-; CM-NEXT:     LSHR * T2.W, T41.Y, literal.x,
+; CM-NEXT:     ADD_INT T47.X, T43.X, literal.x,
+; CM-NEXT:     LSHR * T0.W, T35.W, literal.y,
+; CM-NEXT:    52(7.286752e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T48.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T0.Y, T35.Y, literal.y,
+; CM-NEXT:     LSHR T0.Z, T36.Z, literal.y,
+; CM-NEXT:     LSHR * T1.W, T36.W, literal.y,
+; CM-NEXT:    40(5.605194e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T49.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T1.Y, T36.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T1.Z, T36.Y, literal.y,
+; CM-NEXT:     LSHR * T2.W, T37.Z, literal.y,
+; CM-NEXT:    44(6.165713e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T50.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T2.Y, T37.W, literal.y,
+; CM-NEXT:     LSHR T2.Z, T37.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T3.W, T37.Y, literal.y,
+; CM-NEXT:    32(4.484155e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T51.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T3.Y, T38.Z, literal.y,
+; CM-NEXT:     LSHR T3.Z, T38.W, literal.y,
+; CM-NEXT:     LSHR * T4.W, T38.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    36(5.044674e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T52.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T4.Y, T38.Y, literal.y,
+; CM-NEXT:     LSHR T4.Z, T39.Z, literal.y,
+; CM-NEXT:     LSHR * T5.W, T39.W, literal.y,
+; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T53.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T5.Y, T39.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T5.Z, T39.Y, literal.y,
+; CM-NEXT:     LSHR * T6.W, T40.Z, literal.y,
+; CM-NEXT:    28(3.923636e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T54.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T6.Y, T40.W, literal.x,
+; CM-NEXT:     LSHR T6.Z, T40.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T7.W, T40.Y, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T55.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T7.Y, T41.Z, literal.y,
+; CM-NEXT:     LSHR T7.Z, T41.W, literal.y,
+; CM-NEXT:     LSHR * T8.W, T41.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T56.X, T43.X, literal.x,
+; CM-NEXT:     LSHR T8.Y, T41.Y, literal.y,
+; CM-NEXT:     BFE_INT T57.Z, T42.Y, 0.0, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T9.W, T42.Z, literal.y,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     BFE_INT T57.X, T42.X, 0.0, literal.x,
+; CM-NEXT:     LSHR T9.Y, T42.W, literal.x,
+; CM-NEXT:     BFE_INT T58.Z, T42.W, 0.0, literal.x,
+; CM-NEXT:     LSHR * T10.W, T42.Y, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T58.X, T41.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T14.Y, T41.X, literal.x,
-; CM-NEXT:     BFE_INT T41.Z, T42.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T58.X, T42.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR T10.Y, T42.X, literal.x,
+; CM-NEXT:     BFE_INT T42.Z, T41.Y, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T57.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T41.X, T42.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T42.X, T41.X, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T57.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T59.Z, T42.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T58.W, T13.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T59.Z, T41.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T58.W, T9.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 140:
-; CM-NEXT:     BFE_INT T59.X, T42.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T58.Y, T1.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T42.Z, T43.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T41.W, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T59.X, T41.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T58.Y, T9.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T41.Z, T40.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T42.W, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T42.X, T43.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T41.Y, T12.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T60.Z, T43.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T59.W, T11.Y, 0.0, literal.x,
+; CM-NEXT:    ALU clause starting at 115:
+; CM-NEXT:     BFE_INT T41.X, T40.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T42.Y, T8.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T60.Z, T40.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T59.W, T7.Z, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T60.X, T43.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T59.Y, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T43.Z, T44.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T42.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T60.X, T40.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T59.Y, T7.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T40.Z, T39.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T41.W, T7.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T43.X, T44.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T42.Y, T10.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T61.Z, T44.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T60.W, T9.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T40.X, T39.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T41.Y, T6.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T61.Z, T39.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T60.W, T6.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T61.X, T44.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T60.Y, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T44.Z, T45.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T43.W, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T61.X, T39.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T60.Y, T6.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T39.Z, T38.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T40.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T44.X, T45.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T43.Y, T8.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T62.Z, T45.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T61.W, T7.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T39.X, T38.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T40.Y, T5.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T62.Z, T38.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T61.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T62.X, T45.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T45.Z, T37.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T62.X, T38.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T61.Y, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T38.Z, T37.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T39.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T45.X, T37.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T44.Y, T6.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T63.Z, T37.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T62.W, T5.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T38.X, T37.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T39.Y, T4.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T63.Z, T37.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T62.W, T3.Z, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T63.X, T37.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T37.Z, T36.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T62.Y, T3.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.Z, T36.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T38.W, T3.W, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T37.X, T36.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T45.Y, T4.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T38.Y, T2.Z, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T64.Z, T36.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T63.W, T3.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T63.W, T2.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T64.X, T36.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T63.Y, T2.W, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T36.Z, T35.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T37.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T36.X, T35.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T37.Y, T2.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.Y, T1.Y, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T65.Z, T35.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T64.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T65.X, T35.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     LSHR T1.Z, T35.X, literal.x,
-; CM-NEXT:     BFE_INT * T36.W, T1.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T64.Y, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T0.Z, T35.X, literal.x,
+; CM-NEXT:     BFE_INT * T36.W, T0.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T35.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T35.X, T43.X, literal.x,
 ; CM-NEXT:     BFE_INT T36.Y, PV.Z, 0.0, literal.y,
-; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     BFE_INT * T65.W, T0.Z, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR T66.X, PV.Z, literal.x,
-; CM-NEXT:     BFE_INT * T65.Y, T0.Y, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     LSHR T0.Z, T35.Z, literal.y,
+; CM-NEXT:     BFE_INT * T65.W, T0.W, 0.0, literal.y,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T66.X, T43.X, literal.x,
+; CM-NEXT:     BFE_INT * T65.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT:    4(5.605194e-45), 16(2.242078e-44)
   %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, ptr addrspace(1) %out
@@ -5899,7 +5719,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 13, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
@@ -5918,19 +5738,18 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; EG-NEXT:     MOV T5.Y, 0.0,
 ; EG-NEXT:     MOV T6.W, 0.0,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T8.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v4i16_to_v4i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T8.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
+; CM-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
@@ -5948,12 +5767,10 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; CM-NEXT:     MOV * T6.W, 0.0,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T5.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T7.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T8.X, PV.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
@@ -6050,63 +5867,61 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T5.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     ASHR * T5.W, T5.X, literal.x,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ASHR T5.Z, T5.X, literal.y,
-; EG-NEXT:     ASHR * T7.W, T5.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ASHR * T6.W, T5.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
-; EG-NEXT:     ASHR * T7.Z, T5.Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT:     ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     ASHR T6.Z, T5.Y, literal.x,
+; EG-NEXT:     ASHR * T5.W, T5.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T7.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR * T5.Z, T5.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT:     ASHR * T5.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T8.X, T7.X, literal.x,
+; EG-NEXT:     ASHR * T6.Y, T6.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
 ;
 ; CM-LABEL: global_sextload_v4i16_to_v4i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
+; CM-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T8.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 8:
 ; CM-NEXT:     MOV * T5.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 9:
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT:     ASHR * T5.W, T5.X, literal.x,
+; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; CM-NEXT:     ASHR T5.Z, T5.X, literal.x,
 ; CM-NEXT:     ASHR * T6.W, T5.Y, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T7.X, PV.Z, literal.x,
-; CM-NEXT:     ASHR T6.Z, T5.Y, literal.y,
-; CM-NEXT:     ASHR * T5.W, T5.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
-; CM-NEXT:     ASHR * T5.Z, T5.X, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
+; CM-NEXT:     ASHR * T6.Z, T5.Y, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT * T6.X, T5.Y, 0.0, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T6.Y, PV.X, literal.y,
-; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; CM-NEXT:     ASHR * T5.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:     ADD_INT T8.X, PV.X, literal.x,
+; CM-NEXT:     ASHR * T5.Y, T5.X, literal.y,
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
@@ -6237,7 +6052,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 30, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 26, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T13.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
@@ -6268,27 +6083,23 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:     MOV * T9.W, 0.0,
 ; EG-NEXT:     MOV T10.W, 0.0,
 ; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T11.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T13.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T14.X, T11.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v8i16_to_v8i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
-; CM-NEXT:    ALU 32, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T13.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T12.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T11.X
+; CM-NEXT:    ALU 28, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T11.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T14.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T12.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
@@ -6316,18 +6127,14 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; CM-NEXT:     MOV * T9.W, 0.0,
 ; CM-NEXT:     MOV * T10.W, 0.0,
 ; CM-NEXT:     MOV * T7.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T11.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T12.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T13.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T14.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T11.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T12.X, PV.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T13.X, T11.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T14.X, T11.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
@@ -6500,7 +6307,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 29, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T9.X, 0
@@ -6511,19 +6318,16 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T10.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T10.Z, T7.X, literal.y,
-; EG-NEXT:     ASHR * T12.W, T7.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T9.X, PV.X, literal.x,
+; EG-NEXT:     ASHR T10.W, T7.X, literal.y,
+; EG-NEXT:     ADD_INT * T11.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T10.Z, T7.X, literal.x,
+; EG-NEXT:     ASHR * T12.W, T7.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T10.X, T7.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T12.Z, T7.Y, literal.x,
 ; EG-NEXT:     ASHR * T13.W, T7.Z, literal.y,
@@ -6539,42 +6343,36 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T14.X, T7.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T13.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T7.X, T8.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T14.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T14.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v8i16_to_v8i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
-; CM-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T11.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T9.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T8.X
+; CM-NEXT:    ALU 28, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T14.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T11.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T9.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 10:
 ; CM-NEXT:     MOV * T7.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 11:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T8.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     ASHR * T10.W, T7.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T11.X, PV.Z, literal.x,
+; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T9.X, PV.X, literal.x,
+; CM-NEXT:     ASHR * T10.W, T7.W, literal.y,
+; CM-NEXT:    12(1.681558e-44), 31(4.344025e-44)
+; CM-NEXT:     ADD_INT T11.X, T8.X, literal.x,
 ; CM-NEXT:     ASHR T10.Z, T7.W, literal.y,
 ; CM-NEXT:     ASHR * T12.W, T7.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T10.X, T7.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T12.Z, T7.Z, literal.x,
@@ -6592,9 +6390,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; CM-NEXT:     BFE_INT T7.X, T7.X, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T13.Y, PV.X, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T14.X, T8.X, literal.x,
 ; CM-NEXT:     ASHR * T7.Y, PV.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
@@ -6811,7 +6609,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @12
-; EG-NEXT:    ALU 62, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 52, @17, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
@@ -6867,43 +6665,33 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV * T17.W, 0.0,
 ; EG-NEXT:     MOV T18.W, 0.0,
 ; EG-NEXT:     MOV * T11.W, 0.0,
-; EG-NEXT:     LSHR T19.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T19.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T20.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T22.X, T19.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, T19.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T24.X, T19.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T19.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT * T26.X, T19.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v16i16_to_v16i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @12
-; CM-NEXT:    ALU 64, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T25.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T24.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T23.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T21.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T19.X
+; CM-NEXT:    ALU 56, @17, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T19.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T25.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T24.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T22.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T20.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 12:
 ; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
@@ -6952,30 +6740,22 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; CM-NEXT:     MOV * T17.W, 0.0,
 ; CM-NEXT:     MOV * T18.W, 0.0,
 ; CM-NEXT:     MOV * T11.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T23.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T25.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T26.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T19.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T20.X, PV.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T21.X, T19.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T22.X, T19.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T23.X, T19.X, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T24.X, T19.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T25.X, T19.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T26.X, T19.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
@@ -7276,7 +7056,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @12
-; EG-NEXT:    ALU 65, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 55, @17, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
@@ -7292,31 +7072,22 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 16:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 17:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T19.W, T11.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T19.Z, T11.X, literal.y,
-; EG-NEXT:     ASHR * T21.W, T11.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, T13.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T18.X, T13.X, literal.x,
+; EG-NEXT:     ASHR T19.W, T11.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T20.X, T13.X, literal.z,
+; EG-NEXT:    20(2.802597e-44), 31(4.344025e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T19.Z, T11.X, literal.x,
+; EG-NEXT:     ASHR * T21.W, T11.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T19.X, T11.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T21.Z, T11.Y, literal.x,
 ; EG-NEXT:     ASHR * T22.W, T11.Z, literal.y,
@@ -7352,26 +7123,25 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T26.X, T12.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T25.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T12.X, T13.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T26.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T26.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v16i16_to_v16i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @12
-; CM-NEXT:    ALU 65, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T17.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T16.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T15.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T13.X
+; CM-NEXT:    ALU 56, @17, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T20.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T18.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T17.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T16.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T15.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T14.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 12:
 ; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
@@ -7379,32 +7149,23 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; CM-NEXT:    ALU clause starting at 16:
 ; CM-NEXT:     MOV * T11.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 17:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T13.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T18.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     ASHR * T19.W, T11.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T20.X, PV.Z, literal.x,
+; CM-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T14.X, PV.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T15.X, T13.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T16.X, T13.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T17.X, T13.X, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T18.X, T13.X, literal.x,
+; CM-NEXT:     ASHR * T19.W, T11.W, literal.y,
+; CM-NEXT:    12(1.681558e-44), 31(4.344025e-44)
+; CM-NEXT:     ADD_INT T20.X, T13.X, literal.x,
 ; CM-NEXT:     ASHR T19.Z, T11.W, literal.y,
 ; CM-NEXT:     ASHR * T21.W, T11.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T19.X, T11.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T21.Z, T11.Z, literal.x,
@@ -7442,9 +7203,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; CM-NEXT:     BFE_INT T12.X, T12.X, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T25.Y, PV.X, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T26.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T26.X, T13.X, literal.x,
 ; CM-NEXT:     ASHR * T12.Y, PV.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
@@ -7850,33 +7611,33 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; EG-LABEL: global_zextload_v32i16_to_v32i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 2 @22
-; EG-NEXT:    ALU 33, @31, KC0[], KC1[]
-; EG-NEXT:    TEX 0 @28
-; EG-NEXT:    ALU 93, @65, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @22
+; EG-NEXT:    ALU 9, @31, KC0[], KC1[]
+; EG-NEXT:    TEX 2 @24
+; EG-NEXT:    ALU 94, @41, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T50.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T49.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T48.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T49.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T48.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T47.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T46.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T45.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T44.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T43.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T42.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T43.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T41.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T39.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T38.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T37.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T35.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T35.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T36.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T21.XYZW, T20.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T22.XYZW, T20.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T23.XYZW, T20.X, 32, #1
-; EG-NEXT:    Fetch clause starting at 28:
-; EG-NEXT:     VTX_READ_128 T29.XYZW, T20.X, 0, #1
+; EG-NEXT:    Fetch clause starting at 24:
+; EG-NEXT:     VTX_READ_128 T23.XYZW, T20.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T24.XYZW, T20.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T25.XYZW, T20.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 30:
 ; EG-NEXT:     MOV * T20.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 31:
@@ -7884,295 +7645,256 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T19.X, T21.Z, literal.x,
 ; EG-NEXT:     MOV T19.Y, 0.0,
-; EG-NEXT:     LSHR T24.Z, T21.W, literal.y,
-; EG-NEXT:     AND_INT * T24.X, T21.W, literal.x,
+; EG-NEXT:     LSHR T22.Z, T21.W, literal.y,
+; EG-NEXT:     AND_INT * T22.X, T21.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T24.Y, 0.0,
-; EG-NEXT:     LSHR * T25.Z, T21.X, literal.x,
+; EG-NEXT:     MOV T22.Y, 0.0,
+; EG-NEXT:     LSHR * T20.Z, T21.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T25.X, T21.X, literal.x,
-; EG-NEXT:     MOV T25.Y, 0.0,
+; EG-NEXT:    ALU clause starting at 41:
+; EG-NEXT:     AND_INT T20.X, T21.X, literal.x,
+; EG-NEXT:     MOV T20.Y, 0.0,
 ; EG-NEXT:     LSHR T21.Z, T21.Y, literal.y,
 ; EG-NEXT:     AND_INT * T21.X, T21.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T21.Y, 0.0,
-; EG-NEXT:     LSHR * T26.Z, T23.Z, literal.x,
+; EG-NEXT:     LSHR * T26.Z, T25.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T26.X, T23.Z, literal.x,
+; EG-NEXT:     AND_INT T26.X, T25.Z, literal.x,
 ; EG-NEXT:     MOV T26.Y, 0.0,
-; EG-NEXT:     LSHR T27.Z, T23.W, literal.y,
-; EG-NEXT:     AND_INT * T27.X, T23.W, literal.x,
+; EG-NEXT:     LSHR T27.Z, T25.W, literal.y,
+; EG-NEXT:     AND_INT * T27.X, T25.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T27.Y, 0.0,
-; EG-NEXT:     LSHR * T28.Z, T23.X, literal.x,
+; EG-NEXT:     LSHR * T28.Z, T25.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T28.X, T23.X, literal.x,
+; EG-NEXT:     AND_INT T28.X, T25.X, literal.x,
 ; EG-NEXT:     MOV T28.Y, 0.0,
-; EG-NEXT:     LSHR T23.Z, T23.Y, literal.y,
-; EG-NEXT:     AND_INT * T23.X, T23.Y, literal.x,
+; EG-NEXT:     LSHR T25.Z, T25.Y, literal.y,
+; EG-NEXT:     AND_INT * T25.X, T25.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T23.Y, 0.0,
-; EG-NEXT:     LSHR * T20.Z, T22.Z, literal.x,
+; EG-NEXT:     MOV T25.Y, 0.0,
+; EG-NEXT:     LSHR * T29.Z, T24.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 65:
-; EG-NEXT:     AND_INT T20.X, T22.Z, literal.x,
-; EG-NEXT:     MOV T20.Y, 0.0,
-; EG-NEXT:     LSHR T30.Z, T22.W, literal.y,
-; EG-NEXT:     AND_INT * T30.X, T22.W, literal.x,
+; EG-NEXT:     AND_INT T29.X, T24.Z, literal.x,
+; EG-NEXT:     MOV T29.Y, 0.0,
+; EG-NEXT:     LSHR T30.Z, T24.W, literal.y,
+; EG-NEXT:     AND_INT * T30.X, T24.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T30.Y, 0.0,
-; EG-NEXT:     LSHR * T31.Z, T22.X, literal.x,
+; EG-NEXT:     LSHR * T31.Z, T24.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T31.X, T22.X, literal.x,
+; EG-NEXT:     AND_INT T31.X, T24.X, literal.x,
 ; EG-NEXT:     MOV T31.Y, 0.0,
-; EG-NEXT:     LSHR T22.Z, T22.Y, literal.y,
-; EG-NEXT:     AND_INT * T22.X, T22.Y, literal.x,
+; EG-NEXT:     LSHR T24.Z, T24.Y, literal.y,
+; EG-NEXT:     AND_INT * T24.X, T24.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T22.Y, 0.0,
-; EG-NEXT:     LSHR * T32.Z, T29.Z, literal.x,
+; EG-NEXT:     MOV T24.Y, 0.0,
+; EG-NEXT:     LSHR * T32.Z, T23.Z, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T32.X, T29.Z, literal.x,
+; EG-NEXT:     AND_INT T32.X, T23.Z, literal.x,
 ; EG-NEXT:     MOV T32.Y, 0.0,
-; EG-NEXT:     LSHR T33.Z, T29.W, literal.y,
-; EG-NEXT:     AND_INT * T33.X, T29.W, literal.x,
+; EG-NEXT:     LSHR T33.Z, T23.W, literal.y,
+; EG-NEXT:     AND_INT * T33.X, T23.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     MOV T33.Y, 0.0,
-; EG-NEXT:     LSHR * T34.Z, T29.X, literal.x,
+; EG-NEXT:     LSHR * T34.Z, T23.X, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T34.X, T29.X, literal.x,
+; EG-NEXT:     AND_INT T34.X, T23.X, literal.x,
 ; EG-NEXT:     MOV T34.Y, 0.0,
-; EG-NEXT:     LSHR T29.Z, T29.Y, literal.y,
-; EG-NEXT:     AND_INT * T29.X, T29.Y, literal.x,
+; EG-NEXT:     LSHR T23.Z, T23.Y, literal.y,
+; EG-NEXT:     AND_INT * T23.X, T23.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT:     MOV T29.Y, 0.0,
+; EG-NEXT:     MOV T23.Y, 0.0,
 ; EG-NEXT:     MOV T19.W, 0.0,
-; EG-NEXT:     MOV * T24.W, 0.0,
-; EG-NEXT:     MOV T25.W, 0.0,
+; EG-NEXT:     MOV * T22.W, 0.0,
+; EG-NEXT:     MOV T20.W, 0.0,
 ; EG-NEXT:     MOV * T21.W, 0.0,
 ; EG-NEXT:     MOV T26.W, 0.0,
 ; EG-NEXT:     MOV * T27.W, 0.0,
 ; EG-NEXT:     MOV T28.W, 0.0,
-; EG-NEXT:     MOV * T23.W, 0.0,
-; EG-NEXT:     MOV T20.W, 0.0,
+; EG-NEXT:     MOV * T25.W, 0.0,
+; EG-NEXT:     MOV T29.W, 0.0,
 ; EG-NEXT:     MOV * T30.W, 0.0,
 ; EG-NEXT:     MOV T31.W, 0.0,
-; EG-NEXT:     MOV * T22.W, 0.0,
+; EG-NEXT:     MOV * T24.W, 0.0,
 ; EG-NEXT:     MOV T32.W, 0.0,
 ; EG-NEXT:     MOV * T33.W, 0.0,
 ; EG-NEXT:     MOV T34.W, 0.0,
-; EG-NEXT:     MOV * T29.W, 0.0,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T36.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T42.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T43.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T45.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T23.W, 0.0,
+; EG-NEXT:     LSHR * T35.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T36.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T37.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T38.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T39.X, T35.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T40.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T41.X, T35.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T42.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T43.X, T35.X, literal.y,
+; EG-NEXT:    24(3.363116e-44), 36(5.044674e-44)
+; EG-NEXT:     ADD_INT T44.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T45.X, T35.X, literal.y,
+; EG-NEXT:    32(4.484155e-44), 44(6.165713e-44)
+; EG-NEXT:     ADD_INT T46.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T47.X, T35.X, literal.y,
+; EG-NEXT:    40(5.605194e-44), 52(7.286752e-44)
+; EG-NEXT:     ADD_INT T48.X, T35.X, literal.x,
+; EG-NEXT:     ADD_INT * T49.X, T35.X, literal.y,
+; EG-NEXT:    48(6.726233e-44), 60(8.407791e-44)
+; EG-NEXT:     ADD_INT * T50.X, T35.X, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v32i16_to_v32i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 2 @22
-; CM-NEXT:    ALU 33, @31, KC0[], KC1[]
-; CM-NEXT:    TEX 0 @28
-; CM-NEXT:    ALU 94, @65, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T50.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T49.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T48.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T47.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T46.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T45.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T44.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T43.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T42.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T41.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T40.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T39.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T38.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T37.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T34, T36.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T22.X
+; CM-NEXT:    TEX 0 @22
+; CM-NEXT:    ALU 9, @31, KC0[], KC1[]
+; CM-NEXT:    TEX 2 @24
+; CM-NEXT:    ALU 102, @41, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T50.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T49.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T48.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T47.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T46.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T45.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T44.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T43.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T42.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T41.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T40.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T39.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T38.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T34, T37.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T36.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 22:
 ; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
-; CM-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 32, #1
-; CM-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 16, #1
-; CM-NEXT:    Fetch clause starting at 28:
-; CM-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 48, #1
+; CM-NEXT:    Fetch clause starting at 24:
+; CM-NEXT:     VTX_READ_128 T23.XYZW, T19.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T24.XYZW, T19.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T25.XYZW, T19.X, 16, #1
 ; CM-NEXT:    ALU clause starting at 30:
 ; CM-NEXT:     MOV * T19.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 31:
-; CM-NEXT:     LSHR * T23.Z, T20.Y, literal.x,
+; CM-NEXT:     LSHR * T21.Z, T20.Y, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T23.X, T20.Y, literal.x,
-; CM-NEXT:     MOV T23.Y, 0.0,
-; CM-NEXT:     LSHR * T24.Z, T20.X, literal.y,
+; CM-NEXT:     AND_INT T21.X, T20.Y, literal.x,
+; CM-NEXT:     MOV T21.Y, 0.0,
+; CM-NEXT:     LSHR * T22.Z, T20.X, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T24.X, T20.X, literal.x,
-; CM-NEXT:     MOV T24.Y, 0.0,
-; CM-NEXT:     LSHR * T25.Z, T20.W, literal.y,
+; CM-NEXT:     AND_INT T22.X, T20.X, literal.x,
+; CM-NEXT:     MOV T22.Y, 0.0,
+; CM-NEXT:     LSHR * T19.Z, T20.W, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T25.X, T20.W, literal.x,
-; CM-NEXT:     MOV T25.Y, 0.0,
+; CM-NEXT:    ALU clause starting at 41:
+; CM-NEXT:     AND_INT T19.X, T20.W, literal.x,
+; CM-NEXT:     MOV T19.Y, 0.0,
 ; CM-NEXT:     LSHR * T26.Z, T20.Z, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; CM-NEXT:     AND_INT T26.X, T20.Z, literal.x,
 ; CM-NEXT:     MOV T26.Y, 0.0,
-; CM-NEXT:     LSHR * T20.Z, T22.Y, literal.y,
+; CM-NEXT:     LSHR * T20.Z, T25.Y, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T20.X, T22.Y, literal.x,
+; CM-NEXT:     AND_INT T20.X, T25.Y, literal.x,
 ; CM-NEXT:     MOV T20.Y, 0.0,
-; CM-NEXT:     LSHR * T27.Z, T22.X, literal.y,
+; CM-NEXT:     LSHR * T27.Z, T25.X, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T27.X, T22.X, literal.x,
+; CM-NEXT:     AND_INT T27.X, T25.X, literal.x,
 ; CM-NEXT:     MOV T27.Y, 0.0,
-; CM-NEXT:     LSHR * T28.Z, T22.W, literal.y,
+; CM-NEXT:     LSHR * T28.Z, T25.W, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T28.X, T22.W, literal.x,
+; CM-NEXT:     AND_INT T28.X, T25.W, literal.x,
 ; CM-NEXT:     MOV T28.Y, 0.0,
-; CM-NEXT:     LSHR * T29.Z, T22.Z, literal.y,
+; CM-NEXT:     LSHR * T29.Z, T25.Z, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T29.X, T22.Z, literal.x,
+; CM-NEXT:     AND_INT T29.X, T25.Z, literal.x,
 ; CM-NEXT:     MOV T29.Y, 0.0,
-; CM-NEXT:     LSHR * T19.Z, T21.Y, literal.y,
+; CM-NEXT:     LSHR * T25.Z, T24.Y, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:    ALU clause starting at 65:
-; CM-NEXT:     AND_INT T19.X, T21.Y, literal.x,
-; CM-NEXT:     MOV T19.Y, 0.0,
-; CM-NEXT:     LSHR * T30.Z, T21.X, literal.y,
+; CM-NEXT:     AND_INT T25.X, T24.Y, literal.x,
+; CM-NEXT:     MOV T25.Y, 0.0,
+; CM-NEXT:     LSHR * T30.Z, T24.X, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T30.X, T21.X, literal.x,
+; CM-NEXT:     AND_INT T30.X, T24.X, literal.x,
 ; CM-NEXT:     MOV T30.Y, 0.0,
-; CM-NEXT:     LSHR * T31.Z, T21.W, literal.y,
+; CM-NEXT:     LSHR * T31.Z, T24.W, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T31.X, T21.W, literal.x,
+; CM-NEXT:     AND_INT T31.X, T24.W, literal.x,
 ; CM-NEXT:     MOV T31.Y, 0.0,
-; CM-NEXT:     LSHR * T32.Z, T21.Z, literal.y,
+; CM-NEXT:     LSHR * T32.Z, T24.Z, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T32.X, T21.Z, literal.x,
+; CM-NEXT:     AND_INT T32.X, T24.Z, literal.x,
 ; CM-NEXT:     MOV T32.Y, 0.0,
-; CM-NEXT:     LSHR * T21.Z, T22.Y, literal.y,
+; CM-NEXT:     LSHR * T24.Z, T23.Y, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T21.X, T22.Y, literal.x,
-; CM-NEXT:     MOV T21.Y, 0.0,
-; CM-NEXT:     LSHR * T33.Z, T22.X, literal.y,
+; CM-NEXT:     AND_INT T24.X, T23.Y, literal.x,
+; CM-NEXT:     MOV T24.Y, 0.0,
+; CM-NEXT:     LSHR * T33.Z, T23.X, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T33.X, T22.X, literal.x,
+; CM-NEXT:     AND_INT T33.X, T23.X, literal.x,
 ; CM-NEXT:     MOV T33.Y, 0.0,
-; CM-NEXT:     LSHR * T34.Z, T22.W, literal.y,
+; CM-NEXT:     LSHR * T34.Z, T23.W, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T34.X, T22.W, literal.x,
+; CM-NEXT:     AND_INT T34.X, T23.W, literal.x,
 ; CM-NEXT:     MOV T34.Y, 0.0,
-; CM-NEXT:     LSHR * T35.Z, T22.Z, literal.y,
+; CM-NEXT:     LSHR * T35.Z, T23.Z, literal.y,
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT:     AND_INT T35.X, T22.Z, literal.x,
+; CM-NEXT:     AND_INT T35.X, T23.Z, literal.x,
 ; CM-NEXT:     MOV T35.Y, 0.0,
-; CM-NEXT:     MOV * T23.W, 0.0,
+; CM-NEXT:     MOV * T21.W, 0.0,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT:     MOV * T24.W, 0.0,
-; CM-NEXT:     MOV * T25.W, 0.0,
+; CM-NEXT:     MOV * T22.W, 0.0,
+; CM-NEXT:     MOV * T19.W, 0.0,
 ; CM-NEXT:     MOV * T26.W, 0.0,
 ; CM-NEXT:     MOV * T20.W, 0.0,
 ; CM-NEXT:     MOV * T27.W, 0.0,
 ; CM-NEXT:     MOV * T28.W, 0.0,
 ; CM-NEXT:     MOV * T29.W, 0.0,
-; CM-NEXT:     MOV * T19.W, 0.0,
+; CM-NEXT:     MOV * T25.W, 0.0,
 ; CM-NEXT:     MOV * T30.W, 0.0,
 ; CM-NEXT:     MOV * T31.W, 0.0,
 ; CM-NEXT:     MOV * T32.W, 0.0,
-; CM-NEXT:     MOV * T21.W, 0.0,
+; CM-NEXT:     MOV * T24.W, 0.0,
 ; CM-NEXT:     MOV * T33.W, 0.0,
 ; CM-NEXT:     MOV * T34.W, 0.0,
 ; CM-NEXT:     MOV * T35.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; CM-NEXT:     LSHR T36.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; CM-NEXT:     LSHR T41.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; CM-NEXT:     LSHR T42.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T43.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; CM-NEXT:     LSHR T44.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T45.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T47.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR * T48.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR T49.X, KC0[2].Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T50.X, PV.W, literal.x,
+; CM-NEXT:     LSHR * T23.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T36.X, PV.X, literal.x,
+; CM-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T37.X, T23.X, literal.x,
+; CM-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T38.X, T23.X, literal.x,
+; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T39.X, T23.X, literal.x,
+; CM-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T40.X, T23.X, literal.x,
+; CM-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T41.X, T23.X, literal.x,
+; CM-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T42.X, T23.X, literal.x,
+; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T43.X, T23.X, literal.x,
+; CM-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T44.X, T23.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T45.X, T23.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T46.X, T23.X, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T47.X, T23.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T48.X, T23.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T49.X, T23.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T50.X, T23.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(1) %out
@@ -8727,9 +8449,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @22
-; EG-NEXT:    ALU 56, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 33, @31, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @24
-; EG-NEXT:    ALU 74, @88, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 74, @65, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T38.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T34.X, 0
@@ -8744,8 +8466,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T24.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
@@ -8756,69 +8478,47 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 30:
 ; EG-NEXT:     MOV * T19.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 31:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T22.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T35.W, T20.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T35.Z, T20.Y, literal.y,
-; EG-NEXT:     ASHR * T37.W, T20.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T35.X, T20.Y, 0.0, literal.x,
-; EG-NEXT:     ASHR * T37.Z, T20.X, literal.x,
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T24.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T21.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T26.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T27.X, T21.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T28.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T29.X, T21.X, literal.y,
+; EG-NEXT:    24(3.363116e-44), 36(5.044674e-44)
+; EG-NEXT:     ADD_INT T30.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T31.X, T21.X, literal.y,
+; EG-NEXT:    32(4.484155e-44), 44(6.165713e-44)
+; EG-NEXT:     ADD_INT T32.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T33.X, T21.X, literal.y,
+; EG-NEXT:    40(5.605194e-44), 52(7.286752e-44)
+; EG-NEXT:     ADD_INT T34.X, T21.X, literal.x,
+; EG-NEXT:     ASHR T35.W, T20.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T36.X, T21.X, literal.z,
+; EG-NEXT:    48(6.726233e-44), 31(4.344025e-44)
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T35.Z, T20.X, literal.x,
+; EG-NEXT:     ASHR * T37.W, T20.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT T35.X, T20.X, 0.0, literal.x,
+; EG-NEXT:     ASHR * T37.Z, T20.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T37.X, T20.X, 0.0, literal.x,
-; EG-NEXT:     ASHR T35.Y, PV.X, literal.y,
+; EG-NEXT:     BFE_INT T37.X, T20.Y, 0.0, literal.x,
 ; EG-NEXT:     ASHR * T19.W, T20.W, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    ALU clause starting at 88:
-; EG-NEXT:     ASHR T19.Z, T20.W, literal.x,
-; EG-NEXT:     ASHR * T41.W, T20.Z, literal.y,
-; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    ALU clause starting at 65:
+; EG-NEXT:     ASHR T37.Y, T37.X, literal.x,
+; EG-NEXT:     ASHR T19.Z, T20.W, literal.y,
+; EG-NEXT:     ASHR * T41.W, T20.Z, literal.x,
+; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T19.X, T20.W, 0.0, literal.x,
-; EG-NEXT:     ASHR T37.Y, T37.X, literal.y,
+; EG-NEXT:     ASHR T35.Y, T35.X, literal.y,
 ; EG-NEXT:     ASHR T41.Z, T20.Z, literal.x,
 ; EG-NEXT:     ASHR * T20.W, T40.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
@@ -8883,36 +8583,35 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T50.X, T38.Z, 0.0, literal.x,
 ; EG-NEXT:     ASHR T49.Y, PV.X, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T38.X, T21.X, literal.z,
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ASHR * T50.Y, PV.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T50.Y, PV.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v32i16_to_v32i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @22
-; CM-NEXT:    ALU 55, @31, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 39, @31, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 2 @24
-; CM-NEXT:    ALU 73, @87, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 73, @71, KC0[], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T50.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T49, T36.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T48, T34.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T33.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T32.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T46, T31.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T30.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T29.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T28.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T27.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T25.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T24.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T49, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T48, T36.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T34.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T33.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T46, T32.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T31.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T30.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T29.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T28.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T27.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T25.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T24.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T23.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T21.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 22:
 ; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 48, #1
@@ -8923,68 +8622,53 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; CM-NEXT:    ALU clause starting at 30:
 ; CM-NEXT:     MOV * T19.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 31:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; CM-NEXT:     LSHR T23.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; CM-NEXT:     LSHR T27.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; CM-NEXT:     LSHR T28.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; CM-NEXT:     LSHR T30.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T31.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T32.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T33.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T34.X, PV.W, literal.x,
-; CM-NEXT:     ASHR * T35.W, T20.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; CM-NEXT:     LSHR T36.X, KC0[2].Y, literal.x,
-; CM-NEXT:     ASHR T35.Z, T20.Z, literal.y,
-; CM-NEXT:     ASHR * T37.W, T20.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     LSHR * T21.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T22.X, PV.X, literal.x,
+; CM-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T23.X, T21.X, literal.x,
+; CM-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T24.X, T21.X, literal.x,
+; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T25.X, T21.X, literal.x,
+; CM-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T26.X, T21.X, literal.x,
+; CM-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T27.X, T21.X, literal.x,
+; CM-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T28.X, T21.X, literal.x,
+; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T29.X, T21.X, literal.x,
+; CM-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T30.X, T21.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T31.X, T21.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T32.X, T21.X, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T33.X, T21.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T34.X, T21.X, literal.x,
+; CM-NEXT:     ASHR * T35.W, T20.W, literal.y,
+; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
+; CM-NEXT:     ADD_INT T36.X, T21.X, literal.x,
+; CM-NEXT:     ASHR T35.Z, T20.W, literal.y,
+; CM-NEXT:     ASHR * T37.W, T20.Z, literal.z,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T35.X, T20.Z, 0.0, literal.x,
-; CM-NEXT:     ASHR * T37.Z, T20.W, literal.x,
+; CM-NEXT:     BFE_INT T35.X, T20.W, 0.0, literal.x,
+; CM-NEXT:     ASHR * T37.Z, T20.Z, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T37.X, T20.W, 0.0, literal.x,
-; CM-NEXT:     ASHR T35.Y, PV.X, literal.y,
+; CM-NEXT:     BFE_INT T37.X, T20.Z, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T19.W, T20.X, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT:    ALU clause starting at 87:
-; CM-NEXT:     ASHR T19.Z, T20.X, literal.x,
-; CM-NEXT:     ASHR * T20.W, T20.Y, literal.y,
-; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; CM-NEXT:    ALU clause starting at 71:
+; CM-NEXT:     ASHR T37.Y, T37.X, literal.x,
+; CM-NEXT:     ASHR T19.Z, T20.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     ASHR * T20.W, T20.Y, literal.x,
+; CM-NEXT:    31(4.344025e-44), 16(2.242078e-44)
 ; CM-NEXT:     BFE_INT T19.X, T20.X, 0.0, literal.x,
-; CM-NEXT:     ASHR T37.Y, T37.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     ASHR T35.Y, T35.X, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:     ASHR T20.Z, T20.Y, literal.x,
 ; CM-NEXT:     ASHR * T41.W, T40.Z, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
@@ -9048,12 +8732,11 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; CM-NEXT:     ASHR * T38.Z, T38.Y, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; CM-NEXT:     BFE_INT T38.X, T38.Y, 0.0, literal.x,
-; CM-NEXT:     ASHR T49.Y, PV.X, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT:     ASHR * T49.Y, PV.X, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T50.X, T21.X, literal.x,
 ; CM-NEXT:     ASHR * T38.Y, PV.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0499b007575c8..3923290976363 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -226,21 +226,19 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 6, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:     MOV * T2.X, T0.Z,
+; EG-NEXT:     MOV T1.X, T0.Z,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.X, PS, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_load_v3i32:
@@ -412,9 +410,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
+; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
@@ -422,11 +420,10 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_load_v8i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
@@ -533,12 +530,12 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
 ;
 ; EG-LABEL: global_load_v9i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 8, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 5, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 1, @23, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T0.X, 1
+; EG-NEXT:    ALU 1, @20, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
@@ -546,18 +543,15 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
 ; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 16, #1
 ; EG-NEXT:     VTX_READ_32 T3.X, T3.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 14:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV * T3.X, PS,
-; EG-NEXT:    ALU clause starting at 23:
-; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 20:
+; EG-NEXT:     ADD_INT * T5.X, T0.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_load_v9i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
@@ -667,29 +661,26 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
 ;
 ; EG-LABEL: global_load_v10i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 2, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 7, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
+; EG-NEXT:    ALU 2, @17, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T4.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
-; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T1.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T1.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 14:
-; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.X, KC0[2].Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 17:
+; EG-NEXT:     ADD_INT T4.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T5.X, T0.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 4(5.605194e-45)
 ;
 ; GCN-HSA-LABEL: global_load_v10i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
@@ -802,11 +793,11 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 12, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
+; EG-NEXT:    ALU 7, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T4.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
@@ -815,19 +806,14 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T4.X, T0.Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T4.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T5.X, T0.Z,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T6.X, T3.X, literal.x,
+; EG-NEXT:     ADD_INT * T7.X, T3.X, literal.y,
+; EG-NEXT:    10(1.401298e-44), 4(5.605194e-45)
 ;
 ; GCN-HSA-LABEL: global_load_v11i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
@@ -938,12 +924,12 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
 ;
 ; EG-LABEL: global_load_v12i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 7, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 4, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 2 @8
-; EG-NEXT:    ALU 1, @22, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; EG-NEXT:    ALU 1, @19, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
@@ -951,17 +937,14 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     VTX_READ_128 T4.XYZW, T2.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 14:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T2.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 22:
-; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 19:
+; EG-NEXT:     ADD_INT * T5.X, T0.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_load_v12i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
@@ -1088,13 +1071,13 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ;
 ; EG-LABEL: global_load_v16i32:
 ; EG:       ; %bb.0: ; %entry
-; EG-NEXT:    ALU 11, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 5, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @8
-; EG-NEXT:    ALU 1, @28, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @22, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T4.XYZW, T3.X, 32, #1
@@ -1102,21 +1085,15 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     VTX_READ_128 T6.XYZW, T3.X, 0, #1
 ; EG-NEXT:     VTX_READ_128 T3.XYZW, T3.X, 16, #1
 ; EG-NEXT:    ALU clause starting at 16:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
 ; EG-NEXT:     MOV * T3.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 28:
-; EG-NEXT:     LSHR * T7.X, T0.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 22:
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_load_v16i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
@@ -1776,7 +1753,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
@@ -1793,11 +1770,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; EG-NEXT:     MOV T1.W, 0.0,
 ; EG-NEXT:     MOV * T2.Z, T0.Y,
 ; EG-NEXT:     MOV * T2.W, 0.0,
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
@@ -1907,31 +1883,30 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
+; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
+; EG-NEXT:     ASHR * T1.W, T0.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ASHR T1.Y, T0.X, literal.y,
-; EG-NEXT:     ASHR T3.W, T0.W, literal.y,
-; EG-NEXT:     MOV * T1.X, T0.X,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:     ASHR * T3.Y, T0.Z, literal.x,
+; EG-NEXT:     ASHR T1.Y, T0.Z, literal.x,
+; EG-NEXT:     ASHR * T2.W, T0.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T3.X, T0.Z,
-; EG-NEXT:     MOV T1.Z, T0.Y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T3.Z, T0.W,
+; EG-NEXT:     MOV T1.X, T0.Z,
+; EG-NEXT:     ASHR T2.Y, T0.X, literal.x,
+; EG-NEXT:     MOV * T2.X, T0.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T2.Z, T0.Y,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; EG-NEXT:     MOV * T1.Z, T0.W,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
@@ -2078,7 +2053,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 26, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 22, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
@@ -2106,17 +2081,13 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:     MOV T4.W, 0.0,
 ; EG-NEXT:     MOV * T5.Z, T0.Y,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
@@ -2283,7 +2254,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @8
-; EG-NEXT:    ALU 31, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 27, @13, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0
@@ -2295,38 +2266,34 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:    ALU clause starting at 12:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 13:
-; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T4.W, T0.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; EG-NEXT:     ASHR T4.W, T0.Y, literal.y,
+; EG-NEXT:     ADD_INT * T5.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T4.Y, T0.X, literal.x,
+; EG-NEXT:     ASHR * T6.W, T0.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T4.Y, T0.X, literal.y,
-; EG-NEXT:     ASHR T6.W, T0.W, literal.y,
-; EG-NEXT:     MOV * T4.X, T0.X,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     MOV T4.X, T0.X,
 ; EG-NEXT:     ASHR T6.Y, T0.Z, literal.x,
-; EG-NEXT:     ASHR * T7.W, T1.Y, literal.x,
+; EG-NEXT:     ASHR T7.W, T1.Y, literal.x,
+; EG-NEXT:     MOV * T6.X, T0.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T6.X, T0.Z,
 ; EG-NEXT:     ASHR T7.Y, T1.X, literal.x,
 ; EG-NEXT:     MOV T4.Z, T0.Y,
-; EG-NEXT:     ASHR T8.W, T1.W, literal.x,
-; EG-NEXT:     MOV * T7.X, T1.X,
+; EG-NEXT:     ASHR * T8.W, T1.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T7.X, T1.X,
 ; EG-NEXT:     ASHR T8.Y, T1.Z, literal.x,
-; EG-NEXT:     MOV * T6.Z, T0.W,
+; EG-NEXT:     MOV T6.Z, T0.W,
+; EG-NEXT:     MOV * T8.X, T1.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T8.X, T1.Z,
-; EG-NEXT:     MOV T7.Z, T1.Y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T7.Z, T1.Y,
+; EG-NEXT:     ADD_INT T0.X, T2.X, literal.x,
 ; EG-NEXT:     MOV * T8.Z, T1.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
@@ -2594,15 +2561,15 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 64, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 53, @21, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T6.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T4.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
@@ -2612,71 +2579,60 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; EG-NEXT:    ALU clause starting at 20:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 21:
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T4.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T4.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T10.W, T0.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT:     ADD_INT T5.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T7.X, T4.X, literal.x,
+; EG-NEXT:     ADD_INT * T8.X, T4.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T9.X, T4.X, literal.x,
+; EG-NEXT:     ASHR T10.W, T0.W, literal.y,
+; EG-NEXT:     ADD_INT * T11.X, T4.X, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T10.Y, T0.Z, literal.x,
+; EG-NEXT:     ASHR * T12.W, T0.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T10.Y, T0.Z, literal.y,
-; EG-NEXT:     ASHR T12.W, T0.Y, literal.y,
-; EG-NEXT:     MOV * T10.X, T0.Z,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     MOV T10.X, T0.Z,
 ; EG-NEXT:     ASHR T12.Y, T0.X, literal.x,
-; EG-NEXT:     ASHR * T13.W, T3.W, literal.x,
+; EG-NEXT:     ASHR T13.W, T3.W, literal.x,
+; EG-NEXT:     MOV * T12.X, T0.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T12.X, T0.X,
 ; EG-NEXT:     ASHR T13.Y, T3.Z, literal.x,
 ; EG-NEXT:     MOV T10.Z, T0.W,
-; EG-NEXT:     ASHR T14.W, T3.Y, literal.x,
-; EG-NEXT:     MOV * T13.X, T3.Z,
+; EG-NEXT:     ASHR * T14.W, T3.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T13.X, T3.Z,
 ; EG-NEXT:     ASHR T14.Y, T3.X, literal.x,
 ; EG-NEXT:     MOV T12.Z, T0.Y,
-; EG-NEXT:     ASHR * T0.W, T2.W, literal.x,
+; EG-NEXT:     ASHR T0.W, T2.W, literal.x,
+; EG-NEXT:     MOV * T14.X, T3.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T14.X, T3.X,
 ; EG-NEXT:     ASHR T0.Y, T2.Z, literal.x,
 ; EG-NEXT:     MOV T13.Z, T3.W,
-; EG-NEXT:     ASHR T15.W, T2.Y, literal.x,
-; EG-NEXT:     MOV * T0.X, T2.Z,
+; EG-NEXT:     ASHR * T15.W, T2.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T0.X, T2.Z,
 ; EG-NEXT:     ASHR T15.Y, T2.X, literal.x,
 ; EG-NEXT:     MOV T14.Z, T3.Y,
-; EG-NEXT:     ASHR * T3.W, T1.W, literal.x,
+; EG-NEXT:     ASHR T3.W, T1.W, literal.x,
+; EG-NEXT:     MOV * T15.X, T2.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T15.X, T2.X,
 ; EG-NEXT:     ASHR T3.Y, T1.Z, literal.x,
 ; EG-NEXT:     MOV T0.Z, T2.W,
-; EG-NEXT:     ASHR T16.W, T1.Y, literal.x,
-; EG-NEXT:     MOV * T3.X, T1.Z,
+; EG-NEXT:     ASHR * T16.W, T1.Y, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T3.X, T1.Z,
 ; EG-NEXT:     ASHR T16.Y, T1.X, literal.x,
-; EG-NEXT:     MOV * T15.Z, T2.Y,
+; EG-NEXT:     MOV T15.Z, T2.Y,
+; EG-NEXT:     MOV * T16.X, T1.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T16.X, T1.X,
-; EG-NEXT:     MOV T3.Z, T1.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T3.Z, T1.W,
+; EG-NEXT:     ADD_INT T1.X, T4.X, literal.x,
 ; EG-NEXT:     MOV * T16.Z, T1.Y,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
@@ -2934,15 +2890,15 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @12
-; EG-NEXT:    ALU 55, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 44, @21, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T0.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
@@ -2983,31 +2939,20 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV T10.Z, T2.Y,
 ; EG-NEXT:     MOV T10.W, 0.0,
 ; EG-NEXT:     MOV * T11.Z, T2.W,
-; EG-NEXT:     MOV T11.W, 0.0,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PS, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR * T15.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T12.X, T0.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T13.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T14.X, T0.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
@@ -3493,168 +3438,144 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ;
 ; EG-LABEL: global_sextload_v32i32_to_v32i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 33, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 10, @36, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @20
-; EG-NEXT:    ALU 96, @70, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
+; EG-NEXT:    ALU 95, @47, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T23.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T17.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T16.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T15.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 20:
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 112, #1
-; EG-NEXT:     VTX_READ_128 T13.XYZW, T11.X, 96, #1
-; EG-NEXT:     VTX_READ_128 T14.XYZW, T11.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T15.XYZW, T11.X, 64, #1
-; EG-NEXT:     VTX_READ_128 T16.XYZW, T11.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T17.XYZW, T11.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T18.XYZW, T11.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T7.XYZW, T6.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T8.XYZW, T6.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T9.XYZW, T6.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T10.XYZW, T6.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T6.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T6.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T13.XYZW, T6.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T6.XYZW, T6.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 36:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T11.X, KC0[2].Z,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 70:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:     ASHR * T22.W, T11.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     MOV * T6.X, KC0[2].Z,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 47:
+; EG-NEXT:     ADD_INT T14.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T16.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, T0.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T18.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T0.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T20.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T0.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 48(6.726233e-44)
+; EG-NEXT:     ASHR * T22.W, T6.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T22.Y, T11.Z, literal.y,
-; EG-NEXT:     ASHR T24.W, T11.Y, literal.y,
-; EG-NEXT:     MOV * T22.X, T11.Z,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:     ASHR T24.Y, T11.X, literal.x,
-; EG-NEXT:     ASHR * T25.W, T18.W, literal.x,
+; EG-NEXT:     ADD_INT T23.X, T0.X, literal.x,
+; EG-NEXT:     ASHR T22.Y, T6.Z, literal.y,
+; EG-NEXT:     ASHR T24.W, T6.Y, literal.y,
+; EG-NEXT:     MOV * T22.X, T6.Z,
+; EG-NEXT:    60(8.407791e-44), 31(4.344025e-44)
+; EG-NEXT:     ASHR T24.Y, T6.X, literal.x,
+; EG-NEXT:     ASHR * T25.W, T13.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T24.X, T11.X,
-; EG-NEXT:     ASHR T25.Y, T18.Z, literal.x,
-; EG-NEXT:     MOV T22.Z, T11.W,
-; EG-NEXT:     ASHR T26.W, T18.Y, literal.x,
-; EG-NEXT:     MOV * T25.X, T18.Z,
+; EG-NEXT:     MOV T24.X, T6.X,
+; EG-NEXT:     ASHR T25.Y, T13.Z, literal.x,
+; EG-NEXT:     MOV T22.Z, T6.W,
+; EG-NEXT:     ASHR T26.W, T13.Y, literal.x,
+; EG-NEXT:     MOV * T25.X, T13.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T26.Y, T18.X, literal.x,
-; EG-NEXT:     MOV T24.Z, T11.Y,
-; EG-NEXT:     ASHR * T11.W, T17.W, literal.x,
+; EG-NEXT:     ASHR T26.Y, T13.X, literal.x,
+; EG-NEXT:     MOV T24.Z, T6.Y,
+; EG-NEXT:     ASHR * T6.W, T12.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T26.X, T18.X,
-; EG-NEXT:     ASHR T11.Y, T17.Z, literal.x,
-; EG-NEXT:     MOV T25.Z, T18.W,
-; EG-NEXT:     ASHR T27.W, T17.Y, literal.x,
-; EG-NEXT:     MOV * T11.X, T17.Z,
+; EG-NEXT:     MOV T26.X, T13.X,
+; EG-NEXT:     ASHR T6.Y, T12.Z, literal.x,
+; EG-NEXT:     MOV T25.Z, T13.W,
+; EG-NEXT:     ASHR T27.W, T12.Y, literal.x,
+; EG-NEXT:     MOV * T6.X, T12.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T27.Y, T17.X, literal.x,
-; EG-NEXT:     MOV T26.Z, T18.Y,
-; EG-NEXT:     ASHR * T18.W, T16.W, literal.x,
+; EG-NEXT:     ASHR T27.Y, T12.X, literal.x,
+; EG-NEXT:     MOV T26.Z, T13.Y,
+; EG-NEXT:     ASHR * T13.W, T11.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T27.X, T17.X,
-; EG-NEXT:     ASHR T18.Y, T16.Z, literal.x,
-; EG-NEXT:     MOV T11.Z, T17.W,
-; EG-NEXT:     ASHR T28.W, T16.Y, literal.x,
-; EG-NEXT:     MOV * T18.X, T16.Z,
+; EG-NEXT:     MOV T27.X, T12.X,
+; EG-NEXT:     ASHR T13.Y, T11.Z, literal.x,
+; EG-NEXT:     MOV T6.Z, T12.W,
+; EG-NEXT:     ASHR T28.W, T11.Y, literal.x,
+; EG-NEXT:     MOV * T13.X, T11.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T28.Y, T16.X, literal.x,
-; EG-NEXT:     MOV T27.Z, T17.Y,
-; EG-NEXT:     ASHR * T17.W, T15.W, literal.x,
+; EG-NEXT:     ASHR T28.Y, T11.X, literal.x,
+; EG-NEXT:     MOV T27.Z, T12.Y,
+; EG-NEXT:     ASHR * T12.W, T10.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T28.X, T16.X,
-; EG-NEXT:     ASHR T17.Y, T15.Z, literal.x,
-; EG-NEXT:     MOV T18.Z, T16.W,
-; EG-NEXT:     ASHR T29.W, T15.Y, literal.x,
-; EG-NEXT:     MOV * T17.X, T15.Z,
+; EG-NEXT:     MOV T28.X, T11.X,
+; EG-NEXT:     ASHR T12.Y, T10.Z, literal.x,
+; EG-NEXT:     MOV T13.Z, T11.W,
+; EG-NEXT:     ASHR T29.W, T10.Y, literal.x,
+; EG-NEXT:     MOV * T12.X, T10.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T29.Y, T15.X, literal.x,
-; EG-NEXT:     MOV T28.Z, T16.Y,
-; EG-NEXT:     ASHR * T16.W, T14.W, literal.x,
+; EG-NEXT:     ASHR T29.Y, T10.X, literal.x,
+; EG-NEXT:     MOV T28.Z, T11.Y,
+; EG-NEXT:     ASHR * T11.W, T9.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T29.X, T15.X,
-; EG-NEXT:     ASHR T16.Y, T14.Z, literal.x,
-; EG-NEXT:     MOV T17.Z, T15.W,
-; EG-NEXT:     ASHR T30.W, T14.Y, literal.x,
-; EG-NEXT:     MOV * T16.X, T14.Z,
+; EG-NEXT:     MOV T29.X, T10.X,
+; EG-NEXT:     ASHR T11.Y, T9.Z, literal.x,
+; EG-NEXT:     MOV T12.Z, T10.W,
+; EG-NEXT:     ASHR T30.W, T9.Y, literal.x,
+; EG-NEXT:     MOV * T11.X, T9.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T30.Y, T14.X, literal.x,
-; EG-NEXT:     MOV T29.Z, T15.Y,
-; EG-NEXT:     ASHR * T15.W, T13.W, literal.x,
+; EG-NEXT:     ASHR T30.Y, T9.X, literal.x,
+; EG-NEXT:     MOV T29.Z, T10.Y,
+; EG-NEXT:     ASHR * T10.W, T8.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T30.X, T14.X,
-; EG-NEXT:     ASHR T15.Y, T13.Z, literal.x,
-; EG-NEXT:     MOV T16.Z, T14.W,
-; EG-NEXT:     ASHR T31.W, T13.Y, literal.x,
-; EG-NEXT:     MOV * T15.X, T13.Z,
+; EG-NEXT:     MOV T30.X, T9.X,
+; EG-NEXT:     ASHR T10.Y, T8.Z, literal.x,
+; EG-NEXT:     MOV T11.Z, T9.W,
+; EG-NEXT:     ASHR T31.W, T8.Y, literal.x,
+; EG-NEXT:     MOV * T10.X, T8.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T31.Y, T13.X, literal.x,
-; EG-NEXT:     MOV T30.Z, T14.Y,
-; EG-NEXT:     ASHR * T14.W, T12.W, literal.x,
+; EG-NEXT:     ASHR T31.Y, T8.X, literal.x,
+; EG-NEXT:     MOV T30.Z, T9.Y,
+; EG-NEXT:     ASHR * T9.W, T7.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T31.X, T13.X,
-; EG-NEXT:     ASHR T14.Y, T12.Z, literal.x,
-; EG-NEXT:     MOV T15.Z, T13.W,
-; EG-NEXT:     ASHR T32.W, T12.Y, literal.x,
-; EG-NEXT:     MOV * T14.X, T12.Z,
+; EG-NEXT:     MOV T31.X, T8.X,
+; EG-NEXT:     ASHR T9.Y, T7.Z, literal.x,
+; EG-NEXT:     MOV T10.Z, T8.W,
+; EG-NEXT:     ASHR T32.W, T7.Y, literal.x,
+; EG-NEXT:     MOV * T9.X, T7.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T32.Y, T12.X, literal.x,
-; EG-NEXT:     MOV * T31.Z, T13.Y,
+; EG-NEXT:     ASHR T32.Y, T7.X, literal.x,
+; EG-NEXT:     MOV * T31.Z, T8.Y,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     MOV T32.X, T12.X,
-; EG-NEXT:     MOV T14.Z, T12.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     MOV * T32.Z, T12.Y,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV T32.X, T7.X,
+; EG-NEXT:     MOV T9.Z, T7.W,
+; EG-NEXT:     ADD_INT * T7.X, T0.X, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV * T32.Z, T7.Y,
 ;
 ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
 ; GCN-GFX900-HSA:       ; %bb.0:
@@ -4236,154 +4157,127 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ;
 ; EG-LABEL: global_zextload_v32i32_to_v32i64:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 2 @22
-; EG-NEXT:    ALU 10, @39, KC0[], KC1[]
-; EG-NEXT:    TEX 4 @28
-; EG-NEXT:    ALU 100, @50, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0
+; EG-NEXT:    ALU 0, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 7 @20
+; EG-NEXT:    ALU 88, @37, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T31.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T30.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T29.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
-; EG-NEXT:    Fetch clause starting at 22:
+; EG-NEXT:    Fetch clause starting at 20:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 112, #1
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 80, #1
-; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 96, #1
-; EG-NEXT:    Fetch clause starting at 28:
-; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T11.XYZW, T0.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T13.XYZW, T0.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 64, #1
-; EG-NEXT:    ALU clause starting at 38:
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T4.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T5.XYZW, T0.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T6.XYZW, T0.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T7.XYZW, T0.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 96, #1
+; EG-NEXT:    ALU clause starting at 36:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
-; EG-NEXT:    ALU clause starting at 39:
-; EG-NEXT:     MOV T4.X, T1.X,
-; EG-NEXT:     MOV T4.Y, 0.0,
-; EG-NEXT:     MOV * T5.X, T1.Z,
-; EG-NEXT:     MOV * T5.Y, 0.0,
-; EG-NEXT:     MOV T6.X, T3.X,
-; EG-NEXT:     MOV T6.Y, 0.0,
-; EG-NEXT:     MOV * T7.X, T3.Z,
-; EG-NEXT:     MOV * T7.Y, 0.0,
-; EG-NEXT:     MOV T8.X, T2.X,
+; EG-NEXT:    ALU clause starting at 37:
+; EG-NEXT:     MOV T8.X, T1.X,
 ; EG-NEXT:     MOV T8.Y, 0.0,
-; EG-NEXT:     MOV * T9.X, T2.Z,
-; EG-NEXT:    ALU clause starting at 50:
+; EG-NEXT:     MOV * T9.X, T1.Z,
 ; EG-NEXT:     MOV * T9.Y, 0.0,
-; EG-NEXT:     MOV T14.X, T0.X,
+; EG-NEXT:     MOV T10.X, T0.X,
+; EG-NEXT:     MOV T10.Y, 0.0,
+; EG-NEXT:     MOV * T11.X, T0.Z,
+; EG-NEXT:     MOV * T11.Y, 0.0,
+; EG-NEXT:     MOV T12.X, T7.X,
+; EG-NEXT:     MOV T12.Y, 0.0,
+; EG-NEXT:     MOV * T13.X, T7.Z,
+; EG-NEXT:     MOV * T13.Y, 0.0,
+; EG-NEXT:     MOV T14.X, T6.X,
 ; EG-NEXT:     MOV T14.Y, 0.0,
-; EG-NEXT:     MOV * T15.X, T0.Z,
+; EG-NEXT:     MOV * T15.X, T6.Z,
 ; EG-NEXT:     MOV * T15.Y, 0.0,
-; EG-NEXT:     MOV T16.X, T13.X,
+; EG-NEXT:     MOV T16.X, T5.X,
 ; EG-NEXT:     MOV T16.Y, 0.0,
-; EG-NEXT:     MOV * T17.X, T13.Z,
+; EG-NEXT:     MOV * T17.X, T5.Z,
 ; EG-NEXT:     MOV * T17.Y, 0.0,
-; EG-NEXT:     MOV T18.X, T12.X,
+; EG-NEXT:     MOV T18.X, T4.X,
 ; EG-NEXT:     MOV T18.Y, 0.0,
-; EG-NEXT:     MOV * T19.X, T12.Z,
+; EG-NEXT:     MOV * T19.X, T4.Z,
 ; EG-NEXT:     MOV * T19.Y, 0.0,
-; EG-NEXT:     MOV T20.X, T11.X,
+; EG-NEXT:     MOV T20.X, T3.X,
 ; EG-NEXT:     MOV T20.Y, 0.0,
-; EG-NEXT:     MOV * T21.X, T11.Z,
+; EG-NEXT:     MOV * T21.X, T3.Z,
 ; EG-NEXT:     MOV * T21.Y, 0.0,
-; EG-NEXT:     MOV T22.X, T10.X,
+; EG-NEXT:     MOV T22.X, T2.X,
 ; EG-NEXT:     MOV T22.Y, 0.0,
-; EG-NEXT:     MOV * T23.X, T10.Z,
+; EG-NEXT:     MOV * T23.X, T2.Z,
 ; EG-NEXT:     MOV T23.Y, 0.0,
-; EG-NEXT:     MOV T4.Z, T1.Y,
-; EG-NEXT:     MOV T4.W, 0.0,
-; EG-NEXT:     MOV * T5.Z, T1.W,
-; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     MOV T6.Z, T3.Y,
-; EG-NEXT:     MOV T6.W, 0.0,
-; EG-NEXT:     MOV * T7.Z, T3.W,
-; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     MOV T8.Z, T2.Y,
+; EG-NEXT:     MOV T8.Z, T1.Y,
 ; EG-NEXT:     MOV T8.W, 0.0,
-; EG-NEXT:     MOV * T9.Z, T2.W,
+; EG-NEXT:     MOV * T9.Z, T1.W,
 ; EG-NEXT:     MOV * T9.W, 0.0,
-; EG-NEXT:     MOV T14.Z, T0.Y,
+; EG-NEXT:     MOV T10.Z, T0.Y,
+; EG-NEXT:     MOV T10.W, 0.0,
+; EG-NEXT:     MOV * T11.Z, T0.W,
+; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     MOV T12.Z, T7.Y,
+; EG-NEXT:     MOV T12.W, 0.0,
+; EG-NEXT:     MOV * T13.Z, T7.W,
+; EG-NEXT:     MOV * T13.W, 0.0,
+; EG-NEXT:     MOV T14.Z, T6.Y,
 ; EG-NEXT:     MOV T14.W, 0.0,
-; EG-NEXT:     MOV * T15.Z, T0.W,
+; EG-NEXT:     MOV * T15.Z, T6.W,
 ; EG-NEXT:     MOV * T15.W, 0.0,
-; EG-NEXT:     MOV T16.Z, T13.Y,
+; EG-NEXT:     MOV T16.Z, T5.Y,
 ; EG-NEXT:     MOV T16.W, 0.0,
-; EG-NEXT:     MOV * T17.Z, T13.W,
+; EG-NEXT:     MOV * T17.Z, T5.W,
 ; EG-NEXT:     MOV * T17.W, 0.0,
-; EG-NEXT:     MOV T18.Z, T12.Y,
+; EG-NEXT:     MOV T18.Z, T4.Y,
 ; EG-NEXT:     MOV T18.W, 0.0,
-; EG-NEXT:     MOV * T19.Z, T12.W,
+; EG-NEXT:     MOV * T19.Z, T4.W,
 ; EG-NEXT:     MOV * T19.W, 0.0,
-; EG-NEXT:     MOV T20.Z, T11.Y,
+; EG-NEXT:     MOV T20.Z, T3.Y,
 ; EG-NEXT:     MOV T20.W, 0.0,
-; EG-NEXT:     MOV * T21.Z, T11.W,
+; EG-NEXT:     MOV * T21.Z, T3.W,
 ; EG-NEXT:     MOV * T21.W, 0.0,
-; EG-NEXT:     MOV T22.Z, T10.Y,
+; EG-NEXT:     MOV T22.Z, T2.Y,
 ; EG-NEXT:     MOV T22.W, 0.0,
-; EG-NEXT:     MOV * T23.Z, T10.W,
-; EG-NEXT:     MOV T23.W, 0.0,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PS, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR * T31.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T23.Z, T2.W,
+; EG-NEXT:     MOV * T23.W, 0.0,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, T0.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T7.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T24.X, T0.X, literal.y,
+; EG-NEXT:    24(3.363116e-44), 36(5.044674e-44)
+; EG-NEXT:     ADD_INT T25.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T26.X, T0.X, literal.y,
+; EG-NEXT:    32(4.484155e-44), 44(6.165713e-44)
+; EG-NEXT:     ADD_INT T27.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T28.X, T0.X, literal.y,
+; EG-NEXT:    40(5.605194e-44), 52(7.286752e-44)
+; EG-NEXT:     ADD_INT T29.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T30.X, T0.X, literal.y,
+; EG-NEXT:    48(6.726233e-44), 60(8.407791e-44)
+; EG-NEXT:     ADD_INT * T31.X, T0.X, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
@@ -4637,17 +4531,17 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ;
 ; EG-LABEL: global_load_v32i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 23, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 11, @28, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 7 @12
-; EG-NEXT:    ALU 1, @52, KC0[], KC1[]
+; EG-NEXT:    ALU 1, @40, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T3.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T2.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_128 T8.XYZW, T7.X, 96, #1
@@ -4659,33 +4553,21 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     VTX_READ_128 T14.XYZW, T7.X, 0, #1
 ; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 16, #1
 ; EG-NEXT:    ALU clause starting at 28:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T2.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T4.X, T0.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T5.X, T0.X, literal.x,
+; EG-NEXT:     ADD_INT * T6.X, T0.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 52:
-; EG-NEXT:     LSHR * T15.X, T0.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 40:
+; EG-NEXT:     ADD_INT * T15.X, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ;
 ; GCN-HSA-LABEL: global_load_v32i32:
 ; GCN-HSA:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 5398cf85ace7b..7978ac89c163b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1299,7 +1299,7 @@ define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(ptr addrspace(1) %out,
 ; EG:       ; %bb.0: ; %entry
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
 ; EG-NEXT:    CF_END
@@ -1312,22 +1312,21 @@ define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(ptr addrspace(1) %out,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT * T5.Y, T4.X, literal.x, PV.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T5.X, T4.X, literal.x,
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.y,
-; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
+; EG-NEXT:     AND_INT * T5.X, T4.X, literal.x,
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T4.X, T4.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT:     ADD_INT * T7.X, PS, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v3i8_to_v3i32:
 ; CM:       ; %bb.0: ; %entry
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T6.X
+; CM-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T6.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T7.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
@@ -1336,16 +1335,15 @@ define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(ptr addrspace(1) %out,
 ; CM-NEXT:    ALU clause starting at 9:
 ; CM-NEXT:     MOV * T0.W, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T5.X, T4.X, literal.x, PV.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
+; CM-NEXT:     BFE_UINT * T5.X, T4.X, literal.x, PV.W,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T7.X, PV.X, literal.x,
 ; CM-NEXT:     BFE_UINT * T4.Y, T4.X, literal.y, T0.W,
 ; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
 ; CM-NEXT:     AND_INT * T4.X, T4.X, literal.x,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %ld = load <3 x i8>, ptr addrspace(1) %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -1420,34 +1418,34 @@ define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(ptr addrspace(1) %out,
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
 ; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T4.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XY, T5.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XY, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T4.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T0.W, T4.X, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT * T6.X, PV.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T0.W, T4.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T5.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T4.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T0.W, T4.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T6.X, T4.X, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT:     LSHR * T0.W, T4.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T4.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, literal.y,
+; EG-NEXT:     ADD_INT T4.X, T7.X, literal.x,
+; EG-NEXT:     BFE_INT * T6.Y, PV.W, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
 ;
 ; CM-LABEL: global_sextload_v3i8_to_v3i32:
 ; CM:       ; %bb.0: ; %entry
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T4.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T6.X
+; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
@@ -1456,16 +1454,15 @@ define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(ptr addrspace(1) %out,
 ; CM-NEXT:    ALU clause starting at 9:
 ; CM-NEXT:     LSHR * T0.W, T4.X, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T5.X, PV.W, 0.0, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_INT * T5.X, PV.W, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T6.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T7.X, T4.X, 0.0, literal.x,
-; CM-NEXT:     LSHR * T0.W, T4.X, literal.x,
+; CM-NEXT:     BFE_INT * T6.X, T4.X, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, literal.y,
+; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T0.W, T4.X, literal.y,
+; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T4.X, PV.X, literal.x,
+; CM-NEXT:     BFE_INT * T6.Y, PV.W, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
 entry:
   %ld = load <3 x i8>, ptr addrspace(1) %in
@@ -1800,7 +1797,7 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
 ; EG-NEXT:    CF_END
@@ -1814,30 +1811,27 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; EG-NEXT:     BFE_UINT * T6.Z, T5.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T6.Y, T5.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T7.Z, T5.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T6.W, T5.X, literal.z,
+; EG-NEXT:     BFE_UINT * T7.Z, T5.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T7.Y, T5.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T6.W, T5.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:     AND_INT T6.X, T5.X, literal.x,
-; EG-NEXT:     BFE_UINT T7.Y, T5.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T7.W, T5.Y, literal.x,
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T7.X, T5.Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T7.W, T5.Y, literal.y,
+; EG-NEXT:     AND_INT * T7.X, T5.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T8.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v8i8_to_v8i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
+; CM-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T8.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
@@ -1849,22 +1843,19 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; CM-NEXT:     BFE_UINT * T6.Z, T5.Y, literal.x, PV.W,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_UINT T6.Y, T5.Y, literal.x, T0.W,
-; CM-NEXT:     BFE_UINT T5.Z, T5.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T6.W, T5.Y, literal.z,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T6.W, T5.Y, literal.y,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT T6.X, T5.Y, literal.x,
-; CM-NEXT:     BFE_UINT T5.Y, T5.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T7.X, PV.W, literal.x,
+; CM-NEXT:     BFE_UINT * T5.Z, T5.X, literal.y, T0.W,
+; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_UINT * T5.Y, T5.X, literal.y, T0.W,
+; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T8.X, PV.X, literal.x,
 ; CM-NEXT:     LSHR * T5.W, T5.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; CM-NEXT:    4(5.605194e-45), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT * T5.X, T5.X, literal.x,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1958,7 +1949,7 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 23, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 21, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
 ; EG-NEXT:    CF_END
@@ -1984,21 +1975,19 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; EG-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     BFE_INT T6.Y, PS, 0.0, literal.y,
 ; EG-NEXT:     BFE_INT T7.Z, PV.Y, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.W, T5.Y, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     LSHR * T0.W, T5.Y, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T8.X, PS, literal.x,
+; EG-NEXT:     ADD_INT T8.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
 ;
 ; CM-LABEL: global_sextload_v8i8_to_v8i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 23, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T5.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T9.X
+; CM-NEXT:    ALU 21, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T9.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T5.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
@@ -2006,29 +1995,27 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; CM-NEXT:     MOV * T5.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 9:
 ; CM-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
-; CM-NEXT:     LSHR T0.Z, T5.X, literal.y,
-; CM-NEXT:     LSHR * T0.W, T5.Y, literal.z,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T0.W, T5.Y, literal.y,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:     BFE_INT T7.X, T5.X, 0.0, literal.x,
 ; CM-NEXT:     LSHR T0.Y, T5.X, literal.y,
-; CM-NEXT:     LSHR T1.Z, T5.Y, literal.z,
+; CM-NEXT:     LSHR T0.Z, T5.Y, literal.z,
 ; CM-NEXT:     BFE_INT * T6.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T8.X, T5.Y, literal.x,
-; CM-NEXT:     ADD_INT T1.Y, KC0[2].Y, literal.y,
-; CM-NEXT:     BFE_INT T6.Z, PV.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T7.W, PV.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:     LSHR T9.X, PV.Y, literal.x,
-; CM-NEXT:     BFE_INT T6.Y, PV.X, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T7.Z, T0.Z, 0.0, literal.y,
+; CM-NEXT:     LSHR T8.X, T5.X, literal.x,
+; CM-NEXT:     LSHR T1.Y, T5.Y, literal.y,
+; CM-NEXT:     BFE_INT T6.Z, PV.Z, 0.0, literal.y,
+; CM-NEXT:     BFE_INT * T7.W, PV.Y, 0.0, literal.y,
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_INT T6.Y, PV.Y, 0.0, literal.y,
+; CM-NEXT:     BFE_INT T7.Z, PV.X, 0.0, literal.y,
 ; CM-NEXT:     LSHR * T0.W, T5.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T5.X, PV.X, literal.x,
 ; CM-NEXT:     BFE_INT * T7.Y, PV.W, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:    4(5.605194e-45), 8(1.121039e-44)
   %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(1) %out
@@ -2160,7 +2147,7 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 39, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T11.X, 0
@@ -2176,51 +2163,45 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_UINT * T8.Z, T7.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T8.Y, T7.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T9.Z, T7.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T8.W, T7.X, literal.z,
+; EG-NEXT:     BFE_UINT * T9.Z, T7.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T9.Y, T7.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T8.W, T7.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:     AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT:     BFE_UINT T9.Y, T7.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T10.Z, T7.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T9.W, T7.Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T9.X, T7.Y, literal.x,
-; EG-NEXT:     BFE_UINT T10.Y, T7.Z, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:     LSHR T9.W, T7.Y, literal.y,
+; EG-NEXT:     AND_INT * T9.X, T7.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T10.Z, T7.Z, literal.x, T0.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_UINT * T10.Y, T7.Z, literal.y, T0.W,
+; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T11.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_UINT T12.Z, T7.W, literal.y, T0.W,
 ; EG-NEXT:     LSHR T10.W, T7.Z, literal.z,
 ; EG-NEXT:     AND_INT * T10.X, T7.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T12.Y, T7.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
+; EG-NEXT:     BFE_UINT * T12.Y, T7.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T13.X, T7.X, literal.x,
 ; EG-NEXT:     LSHR T12.W, T7.W, literal.y,
 ; EG-NEXT:     AND_INT * T12.X, T7.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T14.X, T7.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v16i8_to_v16i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
-; CM-NEXT:    ALU 40, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T13.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T12.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
+; CM-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T10.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T14.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T11.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
@@ -2232,42 +2213,35 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; CM-NEXT:     BFE_UINT * T8.Z, T7.W, literal.x, PV.W,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_UINT T8.Y, T7.W, literal.x, T0.W,
-; CM-NEXT:     BFE_UINT T9.Z, T7.Z, literal.y, T0.W,
-; CM-NEXT:     LSHR * T8.W, T7.W, literal.z,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T8.W, T7.W, literal.y,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT T8.X, T7.W, literal.x,
-; CM-NEXT:     BFE_UINT T9.Y, T7.Z, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T10.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T11.Z, T7.Y, literal.y, T0.W,
+; CM-NEXT:     BFE_UINT * T9.Z, T7.Z, literal.y, T0.W,
+; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT:     LSHR T10.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_UINT * T9.Y, T7.Z, literal.y, T0.W,
+; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T11.X, PV.X, literal.x,
+; CM-NEXT:     BFE_UINT T12.Z, T7.Y, literal.y, T0.W,
 ; CM-NEXT:     LSHR * T9.W, T7.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T9.X, T7.Z, literal.x,
-; CM-NEXT:     BFE_UINT T11.Y, T7.Y, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     BFE_UINT * T12.Y, T7.Y, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T12.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T7.Z, T7.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T11.W, T7.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T13.X, T10.X, literal.x,
+; CM-NEXT:     BFE_UINT T7.Z, T7.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T12.W, T7.Y, literal.z,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T11.X, T7.Y, literal.x,
-; CM-NEXT:     BFE_UINT T7.Y, T7.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T12.X, T7.Y, literal.x,
+; CM-NEXT:     BFE_UINT * T7.Y, T7.X, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T13.X, PV.W, literal.x,
-; CM-NEXT:     LSHR * T7.W, T7.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; CM-NEXT:     ADD_INT T14.X, T10.X, literal.x,
+; CM-NEXT:     LSHR * T7.W, T7.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    4(5.605194e-45), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT * T7.X, T7.X, literal.x,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T14.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2399,130 +2373,119 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 47, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
+; EG-NEXT:    ALU 41, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T8.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR T0.W, T7.W, literal.y,
-; EG-NEXT:     LSHR * T1.W, T7.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     BFE_INT * T8.X, T7.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T9.X, T7.X, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.Y, T7.W, literal.y,
-; EG-NEXT:     LSHR T0.Z, T7.Z, literal.z,
-; EG-NEXT:     LSHR T2.W, T7.Y, literal.x,
-; EG-NEXT:     LSHR * T3.W, T7.X, literal.y,
-; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.Z, T7.W, literal.z,
+; EG-NEXT:     LSHR T0.W, T7.Z, literal.y,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T10.X, T7.Y, 0.0, literal.x,
 ; EG-NEXT:     LSHR T1.Y, T7.Z, literal.y,
 ; EG-NEXT:     LSHR T1.Z, T7.Y, literal.y,
 ; EG-NEXT:     BFE_INT T9.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T7.X, literal.z,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T11.X, T7.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR T2.Y, T7.Y, literal.y,
 ; EG-NEXT:     BFE_INT T9.Z, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T10.W, PV.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T3.W, T7.X, literal.x,
+; EG-NEXT:     LSHR * T1.W, T7.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T12.X, T7.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T9.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T10.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T11.W, T1.Y, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T7.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T10.Y, T2.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T11.Z, T0.Z, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T12.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T13.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T12.Z, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T0.W, T7.W, literal.y, BS:VEC_201
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T14.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T12.Y, PV.W, 0.0, literal.y,
+; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_INT T9.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T10.Z, PV.Y, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T11.W, T1.Y, 0.0, literal.y,
+; EG-NEXT:     LSHR * T1.W, T7.Y, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T12.X, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T10.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T11.Z, T0.W, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T8.W, T0.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T7.Z, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T13.X, T7.X, literal.x,
+; EG-NEXT:     BFE_INT T11.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T8.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR T0.W, T7.W, literal.x,
+; EG-NEXT:     ADD_INT * T14.X, T7.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 12(1.681558e-44)
+; EG-NEXT:     BFE_INT * T8.Y, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v16i8_to_v16i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
-; CM-NEXT:    ALU 48, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T15.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T9.X
+; CM-NEXT:    ALU 43, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T12.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T7.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T14.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T13.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 10:
 ; CM-NEXT:     MOV * T7.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 11:
-; CM-NEXT:     LSHR * T0.W, T7.X, literal.x,
+; CM-NEXT:     LSHR T0.Z, T7.X, literal.x,
+; CM-NEXT:     LSHR * T0.W, T7.Y, literal.y,
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     BFE_INT T8.X, T7.X, 0.0, literal.x,
+; CM-NEXT:     LSHR T0.Y, T7.X, literal.y,
+; CM-NEXT:     LSHR T1.Z, T7.Y, literal.z,
+; CM-NEXT:     LSHR * T1.W, T7.Z, literal.x,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T8.X, T7.Y, literal.x,
-; CM-NEXT:     ADD_INT T0.Y, KC0[2].Y, literal.y,
-; CM-NEXT:     LSHR T0.Z, T7.X, literal.z,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 48(6.726233e-44)
-; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
+; CM-NEXT:     BFE_INT T9.X, T7.W, 0.0, literal.x,
 ; CM-NEXT:     LSHR T1.Y, T7.Y, literal.y,
-; CM-NEXT:     LSHR T1.Z, T7.Z, literal.z,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; CM-NEXT:     BFE_INT T10.X, T7.W, 0.0, literal.x,
-; CM-NEXT:     LSHR T2.Y, T7.Y, literal.y,
 ; CM-NEXT:     LSHR T2.Z, T7.Z, literal.z,
 ; CM-NEXT:     LSHR * T2.W, T7.W, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T11.X, T7.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T3.Y, T7.Z, literal.y,
+; CM-NEXT:     BFE_INT T10.X, T7.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR T2.Y, T7.Z, literal.y,
 ; CM-NEXT:     LSHR T3.Z, T7.W, literal.z,
-; CM-NEXT:     BFE_INT * T10.W, PV.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T9.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T12.X, T7.Y, 0.0, literal.x,
-; CM-NEXT:     LSHR T4.Y, T7.W, literal.x,
-; CM-NEXT:     BFE_INT T10.Z, PV.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T11.W, PV.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T13.X, T7.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T10.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T11.Z, T2.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T12.W, T2.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T14.X, T1.W, literal.x,
-; CM-NEXT:     BFE_INT T11.Y, T1.Z, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T12.Z, T1.Y, 0.0, literal.y,
-; CM-NEXT:     BFE_INT * T13.W, T0.Z, 0.0, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T15.X, T0.Y, literal.x,
-; CM-NEXT:     BFE_INT T12.Y, T8.X, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T13.Z, T0.W, 0.0, literal.y,
-; CM-NEXT:     LSHR * T0.W, T7.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT * T13.Y, PV.W, 0.0, literal.y,
+; CM-NEXT:     BFE_INT T11.X, T7.Y, 0.0, literal.x,
+; CM-NEXT:     LSHR T3.Y, T7.W, literal.x,
+; CM-NEXT:     BFE_INT T9.Z, PV.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T10.W, PV.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_INT T9.Y, PV.Y, 0.0, literal.y,
+; CM-NEXT:     BFE_INT T10.Z, T2.Z, 0.0, literal.y,
+; CM-NEXT:     BFE_INT * T11.W, T1.Y, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T13.X, PV.X, literal.x,
+; CM-NEXT:     BFE_INT T10.Y, T1.W, 0.0, literal.y,
+; CM-NEXT:     BFE_INT T11.Z, T1.Z, 0.0, literal.y,
+; CM-NEXT:     BFE_INT * T8.W, T0.Y, 0.0, literal.y,
+; CM-NEXT:    12(1.681558e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T14.X, T12.X, literal.x,
+; CM-NEXT:     BFE_INT T11.Y, T0.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T8.Z, T0.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T7.X, T12.X, literal.x,
+; CM-NEXT:     BFE_INT * T8.Y, PV.W, 0.0, literal.y,
+; CM-NEXT:    4(5.605194e-45), 8(1.121039e-44)
   %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2740,7 +2703,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @12
-; EG-NEXT:    ALU 75, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 64, @17, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0
@@ -2761,91 +2724,80 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T13.Y, T11.X, literal.x, T0.W,
-; EG-NEXT:     BFE_UINT T14.Z, T11.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T13.W, T11.X, literal.z,
+; EG-NEXT:     BFE_UINT * T14.Z, T11.Y, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T14.Y, T11.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T13.W, T11.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:     AND_INT T13.X, T11.X, literal.x,
-; EG-NEXT:     BFE_UINT T14.Y, T11.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR * T11.X, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T15.Z, T11.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T14.W, T11.Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T14.X, T11.Y, literal.x,
-; EG-NEXT:     BFE_UINT T15.Y, T11.Z, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:     LSHR T14.W, T11.Y, literal.y,
+; EG-NEXT:     AND_INT * T14.X, T11.Y, literal.x,
+; EG-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T15.Z, T11.Z, literal.x, T0.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
+; EG-NEXT:     BFE_UINT * T15.Y, T11.Z, literal.y, T0.W,
+; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
 ; EG-NEXT:     BFE_UINT T17.Z, T11.W, literal.y, T0.W,
 ; EG-NEXT:     LSHR T15.W, T11.Z, literal.z,
 ; EG-NEXT:     AND_INT * T15.X, T11.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T17.Y, T11.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T17.W, T11.W, literal.z,
+; EG-NEXT:     BFE_UINT * T17.Y, T11.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T18.X, T11.X, literal.x,
+; EG-NEXT:     BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T17.W, T11.W, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     AND_INT * T17.X, T11.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T19.Y, T12.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
+; EG-NEXT:     BFE_UINT * T19.Y, T12.X, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T20.X, T11.X, literal.x,
 ; EG-NEXT:     BFE_UINT T21.Z, T12.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T19.W, T12.X, literal.z,
+; EG-NEXT:     LSHR T19.W, T12.X, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     AND_INT * T19.X, T12.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T21.Y, T12.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T22.Z, T12.Z, literal.y, T0.W,
-; EG-NEXT:     LSHR T21.W, T12.Y, literal.z,
-; EG-NEXT:     AND_INT * T21.X, T12.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T22.Y, T12.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
+; EG-NEXT:     BFE_UINT * T21.Y, T12.Y, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, T11.X, literal.x,
+; EG-NEXT:     BFE_UINT T22.Z, T12.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR T21.W, T12.Y, literal.y,
+; EG-NEXT:     AND_INT * T21.X, T12.Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T22.Y, T12.Z, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T23.X, T11.X, literal.x,
 ; EG-NEXT:     BFE_UINT T24.Z, T12.W, literal.y, T0.W,
 ; EG-NEXT:     LSHR T22.W, T12.Z, literal.z,
 ; EG-NEXT:     AND_INT * T22.X, T12.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    20(2.802597e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T24.Y, T12.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T24.W, T12.W, literal.y,
-; EG-NEXT:     AND_INT * T24.X, T12.W, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T24.Y, T12.W, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T25.X, T11.X, literal.x,
+; EG-NEXT:     LSHR T24.W, T12.W, literal.x,
+; EG-NEXT:     AND_INT * T24.X, T12.W, literal.y,
+; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
+; EG-NEXT:     ADD_INT * T26.X, T11.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v32i8_to_v32i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @12
-; CM-NEXT:    ALU 80, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T25.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T24.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T17.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T15.X
+; CM-NEXT:    ALU 63, @17, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T15.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T19.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T18.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T16.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 12:
 ; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
@@ -2858,82 +2810,65 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; CM-NEXT:     BFE_UINT * T13.Z, T11.W, literal.x, PV.W,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_UINT T13.Y, T11.W, literal.x, T0.W,
-; CM-NEXT:     BFE_UINT T14.Z, T11.Z, literal.y, T0.W,
-; CM-NEXT:     LSHR * T13.W, T11.W, literal.z,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T13.W, T11.W, literal.y,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT T13.X, T11.W, literal.x,
-; CM-NEXT:     BFE_UINT T14.Y, T11.Z, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T16.Z, T11.Y, literal.y, T0.W,
+; CM-NEXT:     BFE_UINT * T14.Z, T11.Z, literal.y, T0.W,
+; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT:     LSHR T15.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_UINT * T14.Y, T11.Z, literal.y, T0.W,
+; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T16.X, PV.X, literal.x,
+; CM-NEXT:     BFE_UINT T17.Z, T11.Y, literal.y, T0.W,
 ; CM-NEXT:     LSHR * T14.W, T11.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    28(3.923636e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T14.X, T11.Z, literal.x,
-; CM-NEXT:     BFE_UINT T16.Y, T11.Y, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     BFE_UINT * T17.Y, T11.Y, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T11.Z, T11.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T16.W, T11.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T16.X, T11.Y, literal.x,
-; CM-NEXT:     BFE_UINT T11.Y, T11.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     ADD_INT T18.X, T15.X, literal.x,
+; CM-NEXT:     BFE_UINT T11.Z, T11.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T17.W, T11.Y, literal.x,
+; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T17.X, T11.Y, literal.x,
+; CM-NEXT:     BFE_UINT * T11.Y, T11.X, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T18.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T19.Z, T12.W, literal.y, T0.W,
-; CM-NEXT:     LSHR * T11.W, T11.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T19.X, T15.X, literal.x,
+; CM-NEXT:     BFE_UINT T20.Z, T12.W, literal.y, T0.W,
+; CM-NEXT:     LSHR * T11.W, T11.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T11.X, T11.X, literal.x,
-; CM-NEXT:     BFE_UINT T19.Y, T12.W, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     BFE_UINT * T20.Y, T12.W, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T21.Z, T12.Z, literal.y, T0.W,
-; CM-NEXT:     LSHR * T19.W, T12.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T19.X, T12.W, literal.x,
-; CM-NEXT:     BFE_UINT T21.Y, T12.Z, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     ADD_INT T21.X, T15.X, literal.x,
+; CM-NEXT:     BFE_UINT T22.Z, T12.Z, literal.x, T0.W,
+; CM-NEXT:     LSHR * T20.W, T12.W, literal.y,
+; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; CM-NEXT:     AND_INT T20.X, T12.W, literal.x,
+; CM-NEXT:     BFE_UINT * T22.Y, T12.Z, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T23.Z, T12.Y, literal.y, T0.W,
-; CM-NEXT:     LSHR * T21.W, T12.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T23.X, T15.X, literal.x,
+; CM-NEXT:     BFE_UINT T24.Z, T12.Y, literal.y, T0.W,
+; CM-NEXT:     LSHR * T22.W, T12.Z, literal.z,
+; CM-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T21.X, T12.Z, literal.x,
-; CM-NEXT:     BFE_UINT T23.Y, T12.Y, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T22.X, T12.Z, literal.x,
+; CM-NEXT:     BFE_UINT * T24.Y, T12.Y, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T12.Z, T12.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T23.W, T12.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T25.X, T15.X, literal.x,
+; CM-NEXT:     BFE_UINT T12.Z, T12.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T24.W, T12.Y, literal.z,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T23.X, T12.Y, literal.x,
-; CM-NEXT:     BFE_UINT T12.Y, T12.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T24.X, T12.Y, literal.x,
+; CM-NEXT:     BFE_UINT * T12.Y, T12.X, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
-; CM-NEXT:     LSHR * T12.W, T12.X, literal.y,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; CM-NEXT:     ADD_INT T26.X, T15.X, literal.x,
+; CM-NEXT:     LSHR * T12.W, T12.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    4(5.605194e-45), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT * T12.X, T12.X, literal.x,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T26.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3151,9 +3086,9 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @18, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @14
-; EG-NEXT:    ALU 18, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 9, @19, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @16
-; EG-NEXT:    ALU 75, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 73, @29, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 0
@@ -3170,118 +3105,107 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; EG-NEXT:    ALU clause starting at 18:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 19:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T0.Z, T12.W, literal.y,
-; EG-NEXT:     LSHR T0.W, T12.Z, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T16.X, PS, literal.x,
-; EG-NEXT:     LSHR T0.Y, T12.W, literal.y,
-; EG-NEXT:     LSHR T1.Z, T12.Z, literal.z,
-; EG-NEXT:     LSHR T1.W, T12.Y, literal.w,
-; EG-NEXT:     LSHR * T2.W, T12.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT:    ALU clause starting at 38:
-; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.x,
-; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T1.Y, T12.Y, literal.y,
-; EG-NEXT:     LSHR T2.Z, T12.Y, literal.z,
-; EG-NEXT:     LSHR T3.W, T12.X, literal.y,
-; EG-NEXT:     LSHR * T4.W, T12.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; EG-NEXT:     LSHR T0.W, T12.W, literal.y,
+; EG-NEXT:     LSHR * T1.W, T12.W, literal.z,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 29:
+; EG-NEXT:     LSHR T2.W, T12.Z, literal.x,
+; EG-NEXT:     LSHR * T3.W, T12.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T17.X, T13.X, literal.x,
+; EG-NEXT:     LSHR T0.Y, T12.Y, literal.x,
+; EG-NEXT:     LSHR T0.Z, T12.Y, literal.y,
+; EG-NEXT:     LSHR T4.W, T12.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T5.W, T12.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     BFE_INT T18.X, T11.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T2.Y, T11.W, literal.y,
-; EG-NEXT:     LSHR T3.Z, T11.W, literal.z,
-; EG-NEXT:     LSHR T5.W, T11.Z, literal.y,
-; EG-NEXT:     LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT:     LSHR T1.Y, T11.W, literal.y,
+; EG-NEXT:     LSHR T1.Z, T11.W, literal.z,
+; EG-NEXT:     LSHR T6.W, T11.Z, literal.y,
+; EG-NEXT:     LSHR * T7.W, T11.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T19.X, T11.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR T3.Y, T11.Z, literal.y,
-; EG-NEXT:     LSHR T4.Z, T11.Y, literal.y,
+; EG-NEXT:     LSHR T2.Y, T11.Z, literal.y,
+; EG-NEXT:     LSHR T2.Z, T11.Y, literal.y,
 ; EG-NEXT:     BFE_INT T18.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT:     LSHR * T7.W, T11.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T20.X, T11.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR T4.Y, T11.Y, literal.y,
+; EG-NEXT:     LSHR T3.Y, T11.Y, literal.y,
 ; EG-NEXT:     BFE_INT T18.Z, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T19.W, PV.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T11.X, literal.x,
+; EG-NEXT:     LSHR * T7.W, T11.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T21.X, T11.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T18.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T19.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T20.W, T3.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T11.Y, literal.x,
+; EG-NEXT:     BFE_INT T20.W, T2.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.W, T11.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T22.X, T12.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T19.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T20.Z, T5.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.W, T3.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T5.W, T11.Z, literal.x,
+; EG-NEXT:     BFE_INT T20.Z, T6.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T21.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T6.W, T11.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T11.X, T12.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT T22.W, T4.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T4.W, T11.W, literal.x,
+; EG-NEXT:     BFE_INT T21.Z, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T22.W, T5.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T5.W, T11.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T23.X, T12.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T22.Z, T3.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T11.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.W, T12.X, literal.x,
+; EG-NEXT:     BFE_INT T22.Z, T4.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T11.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T4.W, T12.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T24.X, T12.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T22.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T11.Z, T1.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T23.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T12.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T23.Z, T1.Z, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T24.W, T0.Y, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T25.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T23.Y, T0.W, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T24.Z, T0.Z, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T11.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T23.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T12.Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T12.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT T11.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T23.Z, T2.W, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T24.W, T1.W, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T1.W, T12.Z, literal.y,
+; EG-NEXT:    20(2.802597e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T25.X, T13.X, literal.x,
+; EG-NEXT:     BFE_INT T23.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T24.Z, T0.W, 0.0, literal.y,
 ; EG-NEXT:     LSHR T0.W, T12.W, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T26.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T24.Y, PV.W, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T26.X, T13.X, literal.z,
+; EG-NEXT:    24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T24.Y, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v32i8_to_v32i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @18, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @14
-; CM-NEXT:    ALU 19, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 8, @19, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @16
-; CM-NEXT:    ALU 78, @39, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T12.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T27.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T19.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T17.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T16.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T15.X
+; CM-NEXT:    ALU 78, @28, KC0[], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T12.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T25.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T17.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T16.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T15.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T14.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 14:
 ; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
@@ -3290,106 +3214,95 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; CM-NEXT:    ALU clause starting at 18:
 ; CM-NEXT:     MOV * T11.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 19:
-; CM-NEXT:     LSHR * T0.W, T12.X, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T13.X, T12.Y, literal.x,
-; CM-NEXT:     ADD_INT T0.Y, KC0[2].Y, literal.y,
-; CM-NEXT:     LSHR T0.Z, T12.X, literal.z,
-; CM-NEXT:     LSHR * T1.W, T12.Y, literal.y,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T14.X, T12.Z, literal.x,
-; CM-NEXT:     ADD_INT T1.Y, KC0[2].Y, literal.y,
+; CM-NEXT:     LSHR T0.Z, T12.X, literal.x,
+; CM-NEXT:     LSHR * T0.W, T12.Y, literal.y,
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR T0.Y, T12.X, literal.y,
 ; CM-NEXT:     LSHR T1.Z, T12.Y, literal.z,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.w,
-; CM-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; CM-NEXT:    24(3.363116e-44), 112(1.569454e-43)
-; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T2.Y, T12.Z, literal.y,
-; CM-NEXT:     LSHR T2.Z, T12.W, literal.z,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 96(1.345247e-43)
-; CM-NEXT:    ALU clause starting at 39:
-; CM-NEXT:     LSHR T16.X, T2.W, literal.x,
-; CM-NEXT:     LSHR T3.Y, T12.Z, literal.y,
-; CM-NEXT:     LSHR T3.Z, T12.W, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:    16(2.242078e-44), 80(1.121039e-43)
-; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T4.Y, T11.X, literal.y,
-; CM-NEXT:     LSHR T4.Z, T12.W, literal.z,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    24(3.363116e-44), 64(8.968310e-44)
-; CM-NEXT:     LSHR T18.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T5.Y, T11.X, literal.y,
-; CM-NEXT:     LSHR T5.Z, T11.Y, literal.z,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T6.Y, T11.X, literal.y,
-; CM-NEXT:     LSHR T6.Z, T11.Y, literal.z,
-; CM-NEXT:     LSHR * T2.W, T11.Z, literal.w,
+; CM-NEXT:     LSHR * T1.W, T12.Z, literal.w,
 ; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; CM-NEXT:     BFE_INT T20.X, T11.W, 0.0, literal.x,
-; CM-NEXT:     LSHR T7.Y, T11.Y, literal.y,
-; CM-NEXT:     LSHR T7.Z, T11.Z, literal.z,
-; CM-NEXT:     LSHR * T3.W, T11.W, literal.y,
+; CM-NEXT:    ALU clause starting at 28:
+; CM-NEXT:     ADD_INT T14.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T1.Y, T12.Y, literal.y,
+; CM-NEXT:     LSHR T2.Z, T12.Z, literal.z,
+; CM-NEXT:     LSHR * T2.W, T12.W, literal.w,
+; CM-NEXT:    28(3.923636e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T15.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T2.Y, T12.Z, literal.x,
+; CM-NEXT:     LSHR T3.Z, T12.W, literal.y,
+; CM-NEXT:     LSHR * T3.W, T11.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T3.Y, T12.W, literal.y,
+; CM-NEXT:     LSHR T4.Z, T11.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T4.W, T11.Y, literal.w,
+; CM-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T17.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T4.Y, T11.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T5.Z, T11.Y, literal.x,
+; CM-NEXT:     LSHR * T5.W, T11.Z, literal.z,
+; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T18.X, T11.W, 0.0, literal.x,
+; CM-NEXT:     LSHR T5.Y, T11.Y, literal.y,
+; CM-NEXT:     LSHR T6.Z, T11.Z, literal.z,
+; CM-NEXT:     LSHR * T6.W, T11.W, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T21.X, T11.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T8.Y, T11.Z, literal.y,
-; CM-NEXT:     LSHR T8.Z, T11.W, literal.z,
-; CM-NEXT:     BFE_INT * T20.W, PV.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T19.X, T11.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR T6.Y, T11.Z, literal.y,
+; CM-NEXT:     LSHR T7.Z, T11.W, literal.z,
+; CM-NEXT:     BFE_INT * T18.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T22.X, T11.Y, 0.0, literal.x,
-; CM-NEXT:     LSHR T9.Y, T11.W, literal.x,
-; CM-NEXT:     BFE_INT T20.Z, PV.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T21.W, PV.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.X, T11.Y, 0.0, literal.x,
+; CM-NEXT:     LSHR T7.Y, T11.W, literal.x,
+; CM-NEXT:     BFE_INT T18.Z, PV.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T19.W, PV.Y, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T11.X, T11.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T20.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T21.Z, T7.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T22.W, T7.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T23.X, T12.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T21.Y, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T22.Z, T6.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T11.W, T6.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T24.X, T12.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T22.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T11.Z, T5.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T23.W, T4.Z, 0.0, literal.x, BS:VEC_201
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T25.X, T12.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T11.Y, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T23.Z, T3.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T24.W, T3.Y, 0.0, literal.x, BS:VEC_201
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T26.X, T12.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T23.Y, T2.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T24.Z, T2.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T25.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T27.X, T1.Y, literal.x,
-; CM-NEXT:     BFE_INT T24.Y, T14.X, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T25.Z, T1.W, 0.0, literal.y,
-; CM-NEXT:     BFE_INT * T26.W, T0.Z, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T14.X, T0.Y, literal.x,
-; CM-NEXT:     BFE_INT T25.Y, T13.X, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T26.Z, T0.W, 0.0, literal.y,
-; CM-NEXT:     LSHR * T0.W, T12.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT * T26.Y, PV.W, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     BFE_INT T18.Y, PV.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T19.Z, T6.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T20.W, T5.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T21.X, T12.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T19.Y, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T20.Z, T5.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T11.W, T4.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T22.X, T12.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.Y, T4.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T11.Z, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T21.W, T3.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T23.X, T12.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T11.Y, T3.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T21.Z, T3.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T22.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T24.X, T12.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T21.Y, T2.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T22.Z, T2.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T23.W, T1.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T25.X, T13.X, literal.x,
+; CM-NEXT:     BFE_INT T22.Y, T1.W, 0.0, literal.y,
+; CM-NEXT:     BFE_INT T23.Z, T1.Z, 0.0, literal.y,
+; CM-NEXT:     BFE_INT * T24.W, T0.Y, 0.0, literal.y,
+; CM-NEXT:    12(1.681558e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T26.X, T13.X, literal.x,
+; CM-NEXT:     BFE_INT T23.Y, T0.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T24.Z, T0.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR * T0.W, T12.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T12.X, T13.X, literal.x,
+; CM-NEXT:     BFE_INT * T24.Y, PV.W, 0.0, literal.y,
+; CM-NEXT:    4(5.605194e-45), 8(1.121039e-44)
   %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3806,32 +3719,32 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @22
-; EG-NEXT:    ALU 59, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 37, @31, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @26
-; EG-NEXT:    ALU 88, @91, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 92, @69, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T32.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T29.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T46.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T42.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T33.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T30.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T31.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T24.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T35.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T22.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T26.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T22.XYZW, T21.X, 16, #1
 ; EG-NEXT:     VTX_READ_128 T23.XYZW, T21.X, 0, #1
 ; EG-NEXT:    Fetch clause starting at 26:
-; EG-NEXT:     VTX_READ_128 T32.XYZW, T21.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T33.XYZW, T21.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T29.XYZW, T21.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T30.XYZW, T21.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 30:
 ; EG-NEXT:     MOV * T21.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 31:
@@ -3840,182 +3753,164 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_UINT * T19.Z, T23.Y, literal.x, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T19.Y, T23.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T19.W, T23.Y, literal.y,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T19.X, T23.Y, literal.x,
 ; EG-NEXT:     BFE_UINT T20.Z, T23.X, literal.y, T0.W,
-; EG-NEXT:     LSHR * T19.W, T23.Y, literal.z,
+; EG-NEXT:     LSHR * T24.X, KC0[2].Y, literal.z,
+; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T20.Y, T23.X, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T25.Z, T23.W, literal.y, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T19.X, T23.Y, literal.x,
-; EG-NEXT:     BFE_UINT T20.Y, T23.X, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T25.Z, T23.W, literal.y, T0.W,
-; EG-NEXT:     LSHR T20.W, T23.X, literal.z,
+; EG-NEXT:     ADD_INT T26.X, T24.X, literal.x,
+; EG-NEXT:     BFE_UINT T25.Y, T23.W, literal.y, T0.W,
+; EG-NEXT:     LSHR T20.W, T23.X, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     AND_INT * T20.X, T23.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
 ; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT * T25.Y, T23.W, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT T27.Z, T23.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T25.W, T23.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T25.X, T23.W, literal.x,
+; EG-NEXT:     BFE_UINT T27.Y, T23.Z, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T23.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T27.W, T23.Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T27.X, T23.Z, literal.x,
+; EG-NEXT:     BFE_UINT T21.Z, T22.Y, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T28.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T23.X, KC0[2].Y, literal.x,
-; EG-NEXT:     BFE_UINT T26.Z, T23.Z, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T25.W, T23.W, literal.z,
-; EG-NEXT:     AND_INT * T25.X, T23.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T26.Y, T23.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T28.Z, T22.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T26.W, T23.Z, literal.z,
-; EG-NEXT:     AND_INT * T26.X, T23.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T28.Y, T22.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T30.Z, T22.X, literal.y, T0.W,
-; EG-NEXT:     LSHR T28.W, T22.Y, literal.z,
-; EG-NEXT:     AND_INT * T28.X, T22.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T30.Y, T22.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T30.W, T22.X, literal.y,
-; EG-NEXT:     AND_INT * T30.X, T22.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T21.Z, T22.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    16(2.242078e-44), 64(8.968310e-44)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT * T21.Y, T22.W, literal.y, T0.W,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    ALU clause starting at 91:
+; EG-NEXT:     BFE_UINT * T21.Y, T22.Y, literal.x, T0.W,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 69:
+; EG-NEXT:     BFE_UINT T31.Z, T22.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T21.W, T22.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T21.X, T22.Y, literal.x,
+; EG-NEXT:     BFE_UINT T31.Y, T22.X, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T32.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T33.Z, T22.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T31.W, T22.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T31.X, T22.X, literal.x,
+; EG-NEXT:     BFE_UINT T33.Y, T22.W, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T22.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_UINT T34.Z, T22.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T21.W, T22.W, literal.y,
+; EG-NEXT:     LSHR * T33.W, T22.W, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:     AND_INT T21.X, T22.W, literal.x,
+; EG-NEXT:     AND_INT T33.X, T22.W, literal.x,
 ; EG-NEXT:     BFE_UINT T34.Y, T22.Z, literal.y, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T35.X, T24.X, literal.z,
 ; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T36.Z, T33.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T34.W, T22.Z, literal.z,
-; EG-NEXT:     AND_INT * T34.X, T22.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T36.Y, T33.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T38.Z, T33.X, literal.y, T0.W,
-; EG-NEXT:     LSHR T36.W, T33.Y, literal.z,
-; EG-NEXT:     AND_INT * T36.X, T33.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T38.Y, T33.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 144(2.017870e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T40.Z, T33.W, literal.y, T0.W,
-; EG-NEXT:     LSHR T38.W, T33.X, literal.z,
-; EG-NEXT:     AND_INT * T38.X, T33.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T40.Y, T33.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 128(1.793662e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T41.Z, T33.Z, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T40.W, T33.W, literal.z,
-; EG-NEXT:     AND_INT * T40.X, T33.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T41.Y, T33.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 176(2.466285e-43)
-; EG-NEXT:     LSHR T42.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T43.Z, T32.Y, literal.y, T0.W,
-; EG-NEXT:     LSHR T41.W, T33.Z, literal.z,
-; EG-NEXT:     AND_INT * T41.X, T33.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T43.Y, T32.Y, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 160(2.242078e-43)
-; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T45.Z, T32.X, literal.y, T0.W,
-; EG-NEXT:     LSHR T43.W, T32.Y, literal.z,
-; EG-NEXT:     AND_INT * T43.X, T32.Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T45.Y, T32.X, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 208(2.914701e-43)
-; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T47.Z, T32.W, literal.y, T0.W,
-; EG-NEXT:     LSHR T45.W, T32.X, literal.z,
-; EG-NEXT:     AND_INT * T45.X, T32.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T47.Y, T32.W, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 192(2.690493e-43)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     BFE_UINT T48.Z, T32.Z, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHR T47.W, T32.W, literal.z,
-; EG-NEXT:     AND_INT * T47.X, T32.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT:     BFE_UINT T48.Y, T32.Z, literal.x, T0.W,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 240(3.363116e-43)
-; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T48.W, T32.Z, literal.y,
-; EG-NEXT:     AND_INT * T48.X, T32.Z, literal.z,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T36.Z, T30.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T34.W, T22.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T34.X, T22.Z, literal.x,
+; EG-NEXT:     BFE_UINT T36.Y, T30.Y, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T37.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T38.Z, T30.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T36.W, T30.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T36.X, T30.Y, literal.x,
+; EG-NEXT:     BFE_UINT T38.Y, T30.X, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T39.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T40.Z, T30.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T38.W, T30.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T38.X, T30.X, literal.x,
+; EG-NEXT:     BFE_UINT T40.Y, T30.W, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T30.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T41.Z, T30.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T40.W, T30.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T40.X, T30.W, literal.x,
+; EG-NEXT:     BFE_UINT T41.Y, T30.Z, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T42.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T43.Z, T29.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T41.W, T30.Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T41.X, T30.Z, literal.x,
+; EG-NEXT:     BFE_UINT T43.Y, T29.Y, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T44.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T45.Z, T29.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T43.W, T29.Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T43.X, T29.Y, literal.x,
+; EG-NEXT:     BFE_UINT T45.Y, T29.X, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T46.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T47.Z, T29.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T45.W, T29.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T45.X, T29.X, literal.x,
+; EG-NEXT:     BFE_UINT T47.Y, T29.W, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T29.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T48.Z, T29.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T47.W, T29.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     AND_INT T47.X, T29.W, literal.x,
+; EG-NEXT:     BFE_UINT T48.Y, T29.Z, literal.y, T0.W,
+; EG-NEXT:     ADD_INT * T49.X, T24.X, literal.z,
+; EG-NEXT:    255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T48.W, T29.Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T48.X, T29.Z, literal.x,
+; EG-NEXT:     ADD_INT * T50.X, T24.X, literal.y,
+; EG-NEXT:    255(3.573311e-43), 56(7.847271e-44)
 ;
 ; CM-LABEL: global_zextload_v64i8_to_v64i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @22
-; CM-NEXT:    ALU 63, @31, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 36, @31, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @26
-; CM-NEXT:    ALU 95, @95, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T49, T50.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T32.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T48.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T46.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T44.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T33.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T39.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T37.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T34.X
+; CM-NEXT:    ALU 91, @68, KC0[], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T49, T29.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T48, T24.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T46, T50.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T47.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T45.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T30.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T42.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T40.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T38.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T34, T20.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T35.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T33.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T31.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T29.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T21.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T24.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T27.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 22:
 ; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 32, #1
 ; CM-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 48, #1
 ; CM-NEXT:    Fetch clause starting at 26:
-; CM-NEXT:     VTX_READ_128 T32.XYZW, T19.X, 0, #1
-; CM-NEXT:     VTX_READ_128 T33.XYZW, T19.X, 16, #1
+; CM-NEXT:     VTX_READ_128 T29.XYZW, T19.X, 0, #1
+; CM-NEXT:     VTX_READ_128 T30.XYZW, T19.X, 16, #1
 ; CM-NEXT:    ALU clause starting at 30:
 ; CM-NEXT:     MOV * T19.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 31:
@@ -4024,162 +3919,131 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; CM-NEXT:     BFE_UINT * T22.Z, T21.Z, literal.x, PV.W,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_UINT T22.Y, T21.Z, literal.x, T0.W,
-; CM-NEXT:     BFE_UINT T23.Z, T21.W, literal.y, T0.W,
-; CM-NEXT:     LSHR * T22.W, T21.Z, literal.z,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T22.W, T21.Z, literal.y,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:     AND_INT T22.X, T21.Z, literal.x,
-; CM-NEXT:     BFE_UINT T23.Y, T21.W, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T25.Z, T21.X, literal.y, T0.W,
+; CM-NEXT:     BFE_UINT * T23.Z, T21.W, literal.y, T0.W,
+; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT:     LSHR T24.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_UINT * T23.Y, T21.W, literal.y, T0.W,
+; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T25.X, PV.X, literal.x,
+; CM-NEXT:     BFE_UINT T26.Z, T21.X, literal.y, T0.W,
 ; CM-NEXT:     LSHR * T23.W, T21.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    56(7.847271e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; CM-NEXT:     AND_INT T23.X, T21.W, literal.x,
-; CM-NEXT:     BFE_UINT T25.Y, T21.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     BFE_UINT * T26.Y, T21.X, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T27.Z, T21.Y, literal.y, T0.W,
-; CM-NEXT:     LSHR * T25.W, T21.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T27.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T28.Z, T21.Y, literal.y, T0.W,
+; CM-NEXT:     LSHR * T26.W, T21.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    60(8.407791e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T25.X, T21.X, literal.x,
-; CM-NEXT:     BFE_UINT T27.Y, T21.Y, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T26.X, T21.X, literal.x,
+; CM-NEXT:     BFE_UINT * T28.Y, T21.Y, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    192(2.690493e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T28.Z, T20.Z, literal.y, T0.W,
-; CM-NEXT:     LSHR * T27.W, T21.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T21.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T19.Z, T20.Z, literal.y, T0.W,
+; CM-NEXT:     LSHR * T28.W, T21.Y, literal.z,
+; CM-NEXT:    48(6.726233e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T27.X, T21.Y, literal.x,
-; CM-NEXT:     BFE_UINT T28.Y, T20.Z, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T28.X, T21.Y, literal.x,
+; CM-NEXT:     BFE_UINT * T19.Y, T20.Z, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    208(2.914701e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T30.Z, T20.W, literal.y, T0.W,
-; CM-NEXT:     LSHR * T28.W, T20.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    ALU clause starting at 68:
+; CM-NEXT:     ADD_INT T31.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T32.Z, T20.W, literal.y, T0.W,
+; CM-NEXT:     LSHR * T19.W, T20.Z, literal.z,
+; CM-NEXT:    52(7.286752e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T28.X, T20.Z, literal.x,
-; CM-NEXT:     BFE_UINT T30.Y, T20.W, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T19.X, T20.Z, literal.x,
+; CM-NEXT:     BFE_UINT * T32.Y, T20.W, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    160(2.242078e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T31.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T19.Z, T20.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T30.W, T20.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T33.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T34.Z, T20.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T32.W, T20.W, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    40(5.605194e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T30.X, T20.W, literal.x,
-; CM-NEXT:     BFE_UINT T19.Y, T20.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T32.X, T20.W, literal.x,
+; CM-NEXT:     BFE_UINT * T34.Y, T20.X, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    176(2.466285e-43), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 95:
-; CM-NEXT:     LSHR T34.X, T1.W, literal.x,
-; CM-NEXT:     BFE_UINT T35.Z, T20.Y, literal.y, T0.W,
-; CM-NEXT:     LSHR * T19.W, T20.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T35.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T36.Z, T20.Y, literal.y, T0.W,
+; CM-NEXT:     LSHR * T34.W, T20.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    44(6.165713e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T19.X, T20.X, literal.x,
-; CM-NEXT:     BFE_UINT T35.Y, T20.Y, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T34.X, T20.X, literal.x,
+; CM-NEXT:     BFE_UINT * T36.Y, T20.Y, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    128(1.793662e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T36.Z, T33.Z, literal.y, T0.W,
-; CM-NEXT:     LSHR * T35.W, T20.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T20.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T37.Z, T30.Z, literal.y, T0.W,
+; CM-NEXT:     LSHR * T36.W, T20.Y, literal.z,
+; CM-NEXT:    32(4.484155e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T35.X, T20.Y, literal.x,
-; CM-NEXT:     BFE_UINT T36.Y, T33.Z, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T36.X, T20.Y, literal.x,
+; CM-NEXT:     BFE_UINT * T37.Y, T30.Z, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    144(2.017870e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T38.Z, T33.W, literal.y, T0.W,
-; CM-NEXT:     LSHR * T36.W, T33.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T38.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T39.Z, T30.W, literal.y, T0.W,
+; CM-NEXT:     LSHR * T37.W, T30.Z, literal.z,
+; CM-NEXT:    36(5.044674e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T36.X, T33.Z, literal.x,
-; CM-NEXT:     BFE_UINT T38.Y, T33.W, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T37.X, T30.Z, literal.x,
+; CM-NEXT:     BFE_UINT * T39.Y, T30.W, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T40.Z, T33.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T38.W, T33.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T38.X, T33.W, literal.x,
-; CM-NEXT:     BFE_UINT T40.Y, T33.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     ADD_INT T40.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T41.Z, T30.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T39.W, T30.W, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; CM-NEXT:     AND_INT T39.X, T30.W, literal.x,
+; CM-NEXT:     BFE_UINT * T41.Y, T30.X, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T41.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T42.Z, T33.Y, literal.y, T0.W,
-; CM-NEXT:     LSHR * T40.W, T33.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T42.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T43.Z, T30.Y, literal.y, T0.W,
+; CM-NEXT:     LSHR * T41.W, T30.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    28(3.923636e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T40.X, T33.X, literal.x,
-; CM-NEXT:     BFE_UINT T42.Y, T33.Y, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T41.X, T30.X, literal.x,
+; CM-NEXT:     BFE_UINT * T43.Y, T30.Y, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    64(8.968310e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T33.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T43.Z, T32.Z, literal.y, T0.W,
-; CM-NEXT:     LSHR * T42.W, T33.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T42.X, T33.Y, literal.x,
-; CM-NEXT:     BFE_UINT T43.Y, T32.Z, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     ADD_INT T30.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T44.Z, T29.Z, literal.x, T0.W,
+; CM-NEXT:     LSHR * T43.W, T30.Y, literal.y,
+; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; CM-NEXT:     AND_INT T43.X, T30.Y, literal.x,
+; CM-NEXT:     BFE_UINT * T44.Y, T29.Z, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T44.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T45.Z, T32.W, literal.y, T0.W,
-; CM-NEXT:     LSHR * T43.W, T32.Z, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T45.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T46.Z, T29.W, literal.y, T0.W,
+; CM-NEXT:     LSHR * T44.W, T29.Z, literal.z,
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T43.X, T32.Z, literal.x,
-; CM-NEXT:     BFE_UINT T45.Y, T32.W, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T44.X, T29.Z, literal.x,
+; CM-NEXT:     BFE_UINT * T46.Y, T29.W, literal.y, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T47.Z, T32.X, literal.y, T0.W,
-; CM-NEXT:     LSHR * T45.W, T32.W, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:     ADD_INT T47.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T48.Z, T29.X, literal.y, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T46.W, T29.W, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T45.X, T32.W, literal.x,
-; CM-NEXT:     BFE_UINT T47.Y, T32.X, literal.y, T0.W,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T46.X, T29.W, literal.x,
+; CM-NEXT:     BFE_UINT T48.Y, T29.X, literal.y, T0.W,
+; CM-NEXT:     BFE_UINT * T49.Z, T29.Y, literal.z, T0.W,
 ; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T48.X, PV.W, literal.x,
-; CM-NEXT:     BFE_UINT T49.Z, T32.Y, literal.y, T0.W,
-; CM-NEXT:     LSHR * T47.W, T32.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T50.X, T24.X, literal.x,
+; CM-NEXT:     BFE_UINT T49.Y, T29.Y, literal.y, T0.W,
+; CM-NEXT:     LSHR * T48.W, T29.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    12(1.681558e-44), 8(1.121039e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T47.X, T32.X, literal.x,
-; CM-NEXT:     BFE_UINT * T49.Y, T32.Y, literal.y, T0.W,
-; CM-NEXT:    255(3.573311e-43), 8(1.121039e-44)
-; CM-NEXT:     LSHR T32.X, KC0[2].Y, literal.x,
-; CM-NEXT:     LSHR * T49.W, T32.Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:     AND_INT T49.X, T32.Y, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T50.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     AND_INT T48.X, T29.X, literal.x,
+; CM-NEXT:     LSHR * T49.W, T29.Y, literal.y,
+; CM-NEXT:    255(3.573311e-43), 24(3.363116e-44)
+; CM-NEXT:     AND_INT * T49.X, T29.Y, literal.x,
+; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T29.X, T24.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <64 x i8>, ptr addrspace(1) %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4596,11 +4460,11 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; EG-LABEL: global_sextload_v64i8_to_v64i32:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @32, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @24
-; EG-NEXT:    ALU 41, @33, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @28
-; EG-NEXT:    ALU 76, @75, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 72, @152, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @24
+; EG-NEXT:    ALU 16, @33, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @26
+; EG-NEXT:    ALU 76, @50, KC0[], KC1[]
+; EG-NEXT:    ALU 71, @127, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T19.X, 0
@@ -4608,457 +4472,408 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T32.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T30.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T27.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T26.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T25.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T26.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 24:
-; EG-NEXT:     VTX_READ_128 T20.XYZW, T21.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T19.XYZW, T21.X, 48, #1
-; EG-NEXT:    Fetch clause starting at 28:
-; EG-NEXT:     VTX_READ_128 T31.XYZW, T21.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T21.XYZW, T21.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T19.XYZW, T20.X, 48, #1
+; EG-NEXT:    Fetch clause starting at 26:
+; EG-NEXT:     VTX_READ_128 T29.XYZW, T20.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T30.XYZW, T20.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T20.XYZW, T20.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 32:
-; EG-NEXT:     MOV * T21.X, KC0[2].Z,
+; EG-NEXT:     MOV * T20.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 33:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T23.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 12(1.681558e-44)
+; EG-NEXT:     ADD_INT T24.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T21.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 20(2.802597e-44)
+; EG-NEXT:     ADD_INT T26.X, T21.X, literal.x,
+; EG-NEXT:     ADD_INT * T27.X, T21.X, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     ADD_INT T28.X, T21.X, literal.x,
 ; EG-NEXT:     LSHR T0.Y, T19.Z, literal.y,
-; EG-NEXT:     LSHR T0.Z, T19.W, literal.z,
-; EG-NEXT:     LSHR * T0.W, T19.Z, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T1.Y, T19.W, literal.y,
-; EG-NEXT:     LSHR T1.Z, T19.X, literal.z,
-; EG-NEXT:     LSHR * T1.W, T19.W, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT:    144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T2.Y, T19.X, literal.y,
-; EG-NEXT:     LSHR T2.Z, T19.X, literal.z,
+; EG-NEXT:     LSHR T0.Z, T19.Z, literal.x,
+; EG-NEXT:     LSHR T0.W, T19.W, literal.y,
+; EG-NEXT:     LSHR * T1.W, T19.W, literal.x,
+; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
+; EG-NEXT:    ALU clause starting at 50:
+; EG-NEXT:     ADD_INT T31.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T1.Y, T19.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T1.Z, T19.X, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     LSHR T2.W, T19.Y, literal.y,
 ; EG-NEXT:     LSHR * T3.W, T19.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:    36(5.044674e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 75:
-; EG-NEXT:     LSHR T3.Y, T20.Z, literal.x,
-; EG-NEXT:     LSHR T3.Z, T20.Z, literal.y,
-; EG-NEXT:     LSHR T4.W, T20.W, literal.x,
-; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.z,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT:    128(1.793662e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T32.X, PS, literal.x,
-; EG-NEXT:     LSHR T4.Y, T20.W, literal.y,
-; EG-NEXT:     LSHR T4.Z, T20.X, literal.z,
-; EG-NEXT:     LSHR T5.W, T20.X, literal.y,
-; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 176(2.466285e-43)
-; EG-NEXT:     LSHR T33.X, PS, literal.x,
-; EG-NEXT:     LSHR T5.Y, T20.Y, literal.y,
-; EG-NEXT:     LSHR T5.Z, T20.Y, literal.z,
-; EG-NEXT:     LSHR T6.W, T21.Z, literal.y,
-; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:    24(3.363116e-44), 160(2.242078e-43)
-; EG-NEXT:     LSHR T34.X, PS, literal.x,
-; EG-NEXT:     LSHR T6.Y, T21.Z, literal.y,
-; EG-NEXT:     LSHR T6.Z, T21.W, literal.z,
-; EG-NEXT:     LSHR T7.W, T21.W, literal.y,
-; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT:    16(2.242078e-44), 208(2.914701e-43)
-; EG-NEXT:     LSHR T35.X, PS, literal.x,
-; EG-NEXT:     LSHR T7.Y, T21.X, literal.y,
-; EG-NEXT:     LSHR T7.Z, T21.X, literal.z,
-; EG-NEXT:     LSHR T8.W, T21.Y, literal.y,
-; EG-NEXT:     LSHR * T9.W, T21.Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T32.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T2.Y, T20.Z, literal.y,
+; EG-NEXT:     LSHR T2.Z, T20.Z, literal.z,
+; EG-NEXT:     LSHR T4.W, T20.W, literal.y,
+; EG-NEXT:     LSHR * T5.W, T20.W, literal.z,
+; EG-NEXT:    32(4.484155e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T33.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T3.Y, T20.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T3.Z, T20.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T6.W, T20.Y, literal.y,
+; EG-NEXT:     LSHR * T7.W, T20.Y, literal.z,
+; EG-NEXT:    44(6.165713e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T34.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T4.Y, T30.Z, literal.y,
+; EG-NEXT:     LSHR T4.Z, T30.Z, literal.z,
+; EG-NEXT:     LSHR T8.W, T30.W, literal.y,
+; EG-NEXT:     LSHR * T9.W, T30.W, literal.z,
+; EG-NEXT:    40(5.605194e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T36.X, T31.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR T8.Y, T31.Z, literal.y,
-; EG-NEXT:     LSHR T8.Z, T31.Z, literal.z,
-; EG-NEXT:     LSHR T10.W, T31.W, literal.y,
-; EG-NEXT:     LSHR * T11.W, T31.Y, literal.z,
+; EG-NEXT:     ADD_INT T35.X, T21.X, literal.x,
+; EG-NEXT:     LSHR T5.Y, T30.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T5.Z, T30.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T10.W, T30.Y, literal.y,
+; EG-NEXT:     LSHR * T11.W, T30.Y, literal.z,
+; EG-NEXT:    52(7.286752e-44), 16(2.242078e-44)
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T36.X, T29.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR T6.Y, T29.Z, literal.y,
+; EG-NEXT:     LSHR T6.Z, T29.Z, literal.z,
+; EG-NEXT:     LSHR T12.W, T29.W, literal.y,
+; EG-NEXT:     LSHR * T13.W, T29.Y, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T37.X, T31.X, 0.0, literal.x,
-; EG-NEXT:     LSHR T9.Y, T31.W, literal.y,
-; EG-NEXT:     LSHR T9.Z, T31.X, literal.y,
+; EG-NEXT:     BFE_INT T37.X, T29.X, 0.0, literal.x,
+; EG-NEXT:     LSHR T7.Y, T29.W, literal.y,
+; EG-NEXT:     LSHR T7.Z, T29.X, literal.y,
 ; EG-NEXT:     BFE_INT T36.W, PS, 0.0, literal.x,
-; EG-NEXT:     LSHR * T11.W, T31.Y, literal.z,
+; EG-NEXT:     LSHR * T13.W, T29.Y, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T38.X, T31.W, 0.0, literal.x,
-; EG-NEXT:     LSHR T10.Y, T31.X, literal.y,
+; EG-NEXT:     BFE_INT T38.X, T29.W, 0.0, literal.x,
+; EG-NEXT:     LSHR T8.Y, T29.X, literal.y,
 ; EG-NEXT:     BFE_INT T36.Z, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T37.W, PV.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T11.W, T31.Y, literal.x,
+; EG-NEXT:     LSHR * T13.W, T29.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T39.X, T31.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.X, T29.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T36.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T37.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T38.W, T9.Y, 0.0, literal.x,
-; EG-NEXT:     LSHR * T11.W, T31.X, literal.x,
+; EG-NEXT:     BFE_INT T38.W, T7.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T13.W, T29.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T31.X, T21.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T29.X, T30.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T37.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T38.Z, T10.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T39.W, T8.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T10.W, T31.W, literal.x,
+; EG-NEXT:     BFE_INT T38.Z, T12.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T39.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T12.W, T29.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T40.X, T21.X, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.X, T30.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T38.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T39.Z, T8.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.W, T9.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T9.W, T31.Z, literal.x,
+; EG-NEXT:     BFE_INT T39.Z, T6.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T29.W, T11.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T11.W, T29.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T41.X, T21.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.X, T30.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T39.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.Z, T8.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT * T40.W, T7.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT * T29.Z, T10.W, 0.0, literal.x, BS:VEC_120/SCL_212
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 152:
-; EG-NEXT:     LSHR * T8.W, T21.Y, literal.x,
+; EG-NEXT:    ALU clause starting at 127:
+; EG-NEXT:     BFE_INT T40.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T10.W, T30.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T42.X, T21.Z, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T31.Y, PV.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T40.Z, T7.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T41.W, T7.W, 0.0, literal.x,
-; EG-NEXT:     LSHR * T7.W, T21.X, literal.x,
+; EG-NEXT:     BFE_INT T42.X, T30.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T29.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.Z, T5.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.W, T9.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T9.W, T30.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T21.X, T20.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.X, T20.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T40.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T41.Z, T6.Z, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T7.W, T21.W, literal.x,
+; EG-NEXT:     BFE_INT T41.Z, T8.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T42.W, T4.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T8.W, T30.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T43.X, T20.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T41.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T42.Z, T6.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.W, T5.Z, 0.0, literal.x,
-; EG-NEXT:     LSHR * T6.W, T21.Z, literal.x,
+; EG-NEXT:     BFE_INT T42.Z, T4.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.W, T7.W, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.W, T30.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T44.X, T20.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.Z, T5.Y, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T5.W, T20.Y, literal.x,
+; EG-NEXT:     BFE_INT T30.Z, T6.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T43.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     LSHR * T6.W, T20.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T45.X, T20.Z, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T43.Z, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT T44.W, T4.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T30.Y, PS, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T43.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T44.W, T5.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T5.W, T20.X, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T20.X, T19.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T43.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T44.Z, T4.W, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T45.W, T2.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T4.W, T20.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T46.X, T19.X, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T44.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T45.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T45.Z, T2.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T20.W, T3.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T3.W, T20.Z, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T47.X, T19.W, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T45.Y, PS, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T20.Z, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BFE_INT T46.W, T2.Z, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T46.W, T1.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T2.W, T19.Y, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T48.X, T19.Z, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T20.Y, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T46.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T46.Z, T1.Y, 0.0, literal.x,
 ; EG-NEXT:     BFE_INT T47.W, T1.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 192(2.690493e-43)
-; EG-NEXT:     LSHR T19.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T46.Y, T1.Z, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T47.Z, T1.Y, 0.0, literal.y,
-; EG-NEXT:     BFE_INT T48.W, T0.W, 0.0, literal.y,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T49.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T47.Y, T0.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T1.W, T19.X, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T19.X, T21.X, literal.x,
+; EG-NEXT:     BFE_INT T46.Y, PS, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T47.Z, T0.W, 0.0, literal.y,
+; EG-NEXT:     BFE_INT T48.W, T0.Z, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T19.W, literal.y,
+; EG-NEXT:    48(6.726233e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T49.X, T21.X, literal.x,
+; EG-NEXT:     BFE_INT T47.Y, PS, 0.0, literal.y,
 ; EG-NEXT:     BFE_INT T48.Z, T0.Y, 0.0, literal.y,
-; EG-NEXT:     LSHR T0.W, T19.Z, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T50.X, PS, literal.x,
-; EG-NEXT:     BFE_INT * T48.Y, PV.W, 0.0, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     LSHR T0.W, T19.Z, literal.y,
+; EG-NEXT:     ADD_INT * T50.X, T21.X, literal.z,
+; EG-NEXT:    60(8.407791e-44), 8(1.121039e-44)
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT * T48.Y, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v64i8_to_v64i32:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @32, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 1 @24
-; CM-NEXT:    ALU 39, @33, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 1 @28
-; CM-NEXT:    ALU 84, @73, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    ALU 73, @158, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T50.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T21.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T49, T23.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T48, T38.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T37.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T36.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T46, T35.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T34.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T33.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T32.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T31.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T30.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T29.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T27.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T26.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T25.X
+; CM-NEXT:    TEX 0 @24
+; CM-NEXT:    ALU 17, @33, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    TEX 2 @26
+; CM-NEXT:    ALU 84, @51, KC0[], KC1[]
+; CM-NEXT:    ALU 72, @136, KC0[], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T50.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T46, T49.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T48.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T35.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T34.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T33.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T32.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T31.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T30.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T29.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T28.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T27.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T26.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T22.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    Fetch clause starting at 24:
-; CM-NEXT:     VTX_READ_128 T19.XYZW, T22.X, 0, #1
-; CM-NEXT:     VTX_READ_128 T20.XYZW, T22.X, 16, #1
-; CM-NEXT:    Fetch clause starting at 28:
-; CM-NEXT:     VTX_READ_128 T28.XYZW, T22.X, 48, #1
-; CM-NEXT:     VTX_READ_128 T22.XYZW, T22.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T19.XYZW, T20.X, 0, #1
+; CM-NEXT:    Fetch clause starting at 26:
+; CM-NEXT:     VTX_READ_128 T24.XYZW, T20.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T25.XYZW, T20.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T20.XYZW, T20.X, 16, #1
 ; CM-NEXT:    ALU clause starting at 32:
-; CM-NEXT:     MOV * T22.X, KC0[2].Z,
+; CM-NEXT:     MOV * T20.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 33:
-; CM-NEXT:     LSHR T0.Y, T19.Y, literal.x,
-; CM-NEXT:     LSHR T0.Z, T19.Y, literal.y,
-; CM-NEXT:     LSHR * T0.W, T19.X, literal.x,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:     LSHR T21.X, T19.Y, literal.x,
+; CM-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR T0.Y, T19.Y, literal.y,
+; CM-NEXT:     LSHR T0.Z, T19.X, literal.z,
+; CM-NEXT:     LSHR * T0.W, T19.Y, literal.w,
+; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT:     ADD_INT T22.X, PV.X, literal.x,
 ; CM-NEXT:     LSHR T1.Y, T19.X, literal.y,
 ; CM-NEXT:     LSHR T1.Z, T19.W, literal.z,
-; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; CM-NEXT:     LSHR T23.X, T19.X, literal.x,
+; CM-NEXT:     LSHR * T1.W, T19.X, literal.w,
+; CM-NEXT:    56(7.847271e-44), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT:     ADD_INT T23.X, T21.X, literal.x,
 ; CM-NEXT:     LSHR T2.Y, T19.W, literal.y,
 ; CM-NEXT:     LSHR T2.Z, T19.Z, literal.z,
-; CM-NEXT:     LSHR * T2.W, T19.W, literal.x,
-; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T24.X, T19.Z, literal.x,
-; CM-NEXT:     LSHR T3.Y, T20.Y, literal.y,
-; CM-NEXT:     LSHR T3.Z, T19.Z, literal.z,
-; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.w,
+; CM-NEXT:     LSHR * T2.W, T19.W, literal.w,
+; CM-NEXT:    60(8.407791e-44), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT:    ALU clause starting at 51:
+; CM-NEXT:     LSHR T3.Z, T19.Z, literal.x,
+; CM-NEXT:     LSHR * T3.W, T20.Y, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; CM-NEXT:    24(3.363116e-44), 224(3.138909e-43)
-; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T26.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T3.Y, T19.Z, literal.y,
+; CM-NEXT:     LSHR T4.Z, T20.Y, literal.z,
+; CM-NEXT:     LSHR * T4.W, T20.X, literal.w, BS:VEC_120/SCL_212
+; CM-NEXT:    48(6.726233e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T27.X, T21.X, literal.x,
 ; CM-NEXT:     LSHR T4.Y, T20.Y, literal.y,
-; CM-NEXT:     LSHR T4.Z, T20.X, literal.z,
-; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 240(3.363116e-43)
-; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T5.Y, T20.Y, literal.y,
-; CM-NEXT:     LSHR T5.Z, T20.X, literal.z,
-; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:    16(2.242078e-44), 192(2.690493e-43)
-; CM-NEXT:     LSHR T27.X, PV.W, literal.x,
+; CM-NEXT:     LSHR T5.Z, T20.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T5.W, T20.W, literal.w,
+; CM-NEXT:    52(7.286752e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T28.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T5.Y, T20.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T6.Z, T20.W, literal.z,
+; CM-NEXT:     LSHR * T6.W, T20.Z, literal.w,
+; CM-NEXT:    40(5.605194e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T29.X, T21.X, literal.x,
 ; CM-NEXT:     LSHR T6.Y, T20.W, literal.y,
-; CM-NEXT:     LSHR T6.Z, T20.X, literal.z,
-; CM-NEXT:     LSHR * T3.W, T20.W, literal.w,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     LSHR T7.Z, T20.Z, literal.z,
+; CM-NEXT:     LSHR * T7.W, T25.Y, literal.w,
+; CM-NEXT:    44(6.165713e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T30.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T7.Y, T20.Z, literal.y,
+; CM-NEXT:     LSHR T8.Z, T25.Y, literal.z,
+; CM-NEXT:     LSHR * T8.W, T25.X, literal.w, BS:VEC_120/SCL_212
+; CM-NEXT:    32(4.484155e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T31.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T8.Y, T25.Y, literal.y,
+; CM-NEXT:     LSHR T9.Z, T25.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T9.W, T25.W, literal.w,
+; CM-NEXT:    36(5.044674e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T32.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T9.Y, T25.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T10.Z, T25.W, literal.y,
+; CM-NEXT:     LSHR * T10.W, T25.Z, literal.z,
 ; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; CM-NEXT:    ALU clause starting at 73:
-; CM-NEXT:     LSHR T7.Z, T20.Z, literal.x,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
-; CM-NEXT:    8(1.121039e-44), 208(2.914701e-43)
-; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T7.Y, T20.W, literal.y,
-; CM-NEXT:     LSHR T8.Z, T20.Z, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:    16(2.242078e-44), 160(2.242078e-43)
-; CM-NEXT:     LSHR T30.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T8.Y, T22.Y, literal.y,
-; CM-NEXT:     LSHR T9.Z, T20.Z, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    24(3.363116e-44), 176(2.466285e-43)
-; CM-NEXT:     LSHR T31.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T9.Y, T22.Y, literal.y,
-; CM-NEXT:     LSHR T10.Z, T22.X, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 128(1.793662e-43)
-; CM-NEXT:     LSHR T32.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T10.Y, T22.Y, literal.y,
-; CM-NEXT:     LSHR T11.Z, T22.X, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:    16(2.242078e-44), 144(2.017870e-43)
-; CM-NEXT:     LSHR T33.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T11.Y, T22.W, literal.y,
-; CM-NEXT:     LSHR T12.Z, T22.X, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    24(3.363116e-44), 96(1.345247e-43)
-; CM-NEXT:     LSHR T34.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T12.Y, T22.W, literal.y,
-; CM-NEXT:     LSHR T13.Z, T22.Z, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 112(1.569454e-43)
-; CM-NEXT:     LSHR T35.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T13.Y, T22.W, literal.y,
-; CM-NEXT:     LSHR T14.Z, T22.Z, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:    16(2.242078e-44), 64(8.968310e-44)
-; CM-NEXT:     LSHR T36.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T14.Y, T28.Y, literal.y,
-; CM-NEXT:     LSHR T15.Z, T22.Z, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    24(3.363116e-44), 80(1.121039e-43)
-; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T15.Y, T28.Y, literal.y,
-; CM-NEXT:     LSHR T16.Z, T28.X, literal.z,
-; CM-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T16.Y, T28.Y, literal.y,
-; CM-NEXT:     LSHR T17.Z, T28.X, literal.z,
-; CM-NEXT:     LSHR * T4.W, T28.W, literal.w,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T33.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T10.Y, T25.W, literal.y,
+; CM-NEXT:     LSHR T11.Z, T25.Z, literal.z,
+; CM-NEXT:     LSHR * T11.W, T24.Y, literal.w,
+; CM-NEXT:    28(3.923636e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T34.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T11.Y, T25.Z, literal.y,
+; CM-NEXT:     LSHR T12.Z, T24.Y, literal.x,
+; CM-NEXT:     LSHR * T12.W, T24.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T35.X, T21.X, literal.x,
+; CM-NEXT:     LSHR T12.Y, T24.Y, literal.y,
+; CM-NEXT:     LSHR T13.Z, T24.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T13.W, T24.W, literal.w,
+; CM-NEXT:    20(2.802597e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
-; CM-NEXT:     BFE_INT T39.X, T28.Z, 0.0, literal.x,
-; CM-NEXT:     LSHR T17.Y, T28.X, literal.y,
-; CM-NEXT:     LSHR T18.Z, T28.W, literal.z,
-; CM-NEXT:     LSHR * T5.W, T28.Z, literal.y,
+; CM-NEXT:     BFE_INT T36.X, T24.Z, 0.0, literal.x,
+; CM-NEXT:     LSHR T13.Y, T24.X, literal.y,
+; CM-NEXT:     LSHR T14.Z, T24.W, literal.z,
+; CM-NEXT:     LSHR * T14.W, T24.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T40.X, T28.W, 0.0, literal.x,
-; CM-NEXT:     LSHR T18.Y, T28.W, literal.y,
-; CM-NEXT:     LSHR T21.Z, T28.Z, literal.z,
-; CM-NEXT:     BFE_INT * T39.W, PV.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.X, T24.W, 0.0, literal.x,
+; CM-NEXT:     LSHR T14.Y, T24.W, literal.y,
+; CM-NEXT:     LSHR T15.Z, T24.Z, literal.z,
+; CM-NEXT:     BFE_INT * T36.W, PV.W, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T28.X, T28.X, 0.0, literal.x,
-; CM-NEXT:     LSHR T21.Y, T28.Z, literal.x,
-; CM-NEXT:     BFE_INT T39.Z, PV.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T40.W, PV.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T41.X, T28.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T39.Y, PV.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T40.Z, T18.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T28.W, T17.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:    ALU clause starting at 158:
-; CM-NEXT:     BFE_INT T42.X, T22.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T40.Y, T4.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T28.Z, T17.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT * T41.W, T16.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T43.X, T22.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T28.Y, T16.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T41.Z, T15.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T42.W, T15.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T22.X, T22.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T41.Y, T14.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T42.Z, T14.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T43.W, T13.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T44.X, T22.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T42.Y, T13.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T43.Z, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT * T22.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T45.X, T20.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T43.Y, T11.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T22.Z, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT * T44.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T46.X, T20.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T22.Y, T10.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T44.Z, T9.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T45.W, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T24.X, T24.X, 0.0, literal.x,
+; CM-NEXT:     LSHR T15.Y, T24.Z, literal.x,
+; CM-NEXT:     BFE_INT T36.Z, PV.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T37.W, PV.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T38.X, T24.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T36.Y, PV.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.Z, T14.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T24.W, T13.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:    ALU clause starting at 136:
+; CM-NEXT:     BFE_INT T39.X, T25.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.Y, T13.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T24.Z, T13.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T38.W, T12.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T40.X, T25.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T24.Y, T12.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T38.Z, T12.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T39.W, T11.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T25.X, T25.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T38.Y, T11.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T39.Z, T11.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T40.W, T10.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T41.X, T25.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T39.Y, T10.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T40.Z, T10.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T25.W, T9.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T42.X, T20.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T40.Y, T9.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T25.Z, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T41.W, T8.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T43.X, T20.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T25.Y, T8.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T41.Z, T8.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T42.W, T7.Y, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T20.X, T20.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T44.Y, T8.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T45.Z, T8.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T46.W, T7.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T47.X, T20.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T45.Y, T7.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T46.Z, T3.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T20.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T41.Y, T7.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T42.Z, T7.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T43.W, T6.Y, 0.0, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T48.X, T19.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T46.Y, T6.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T20.Z, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT * T47.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T49.X, T19.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T20.Y, T4.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T47.Z, T4.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T48.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T19.X, T19.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T47.Y, T3.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T48.Z, T24.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT * T49.W, T2.W, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T24.X, T19.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T48.Y, T2.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T49.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT * T19.W, T23.X, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T23.X, T1.W, literal.x,
-; CM-NEXT:     BFE_INT T49.Y, T1.Z, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T19.Z, T1.Y, 0.0, literal.y,
-; CM-NEXT:     BFE_INT * T24.W, T21.X, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT T19.Y, T0.W, 0.0, literal.y,
-; CM-NEXT:     BFE_INT T24.Z, T0.Z, 0.0, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
-; CM-NEXT:     BFE_INT * T24.Y, T0.Y, 0.0, literal.y,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     BFE_INT T44.X, T20.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T42.Y, T6.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T43.Z, T6.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T20.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T45.X, T19.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T43.Y, T5.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.Z, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T44.W, T4.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T46.X, T19.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.Y, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T44.Z, T4.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T45.W, T3.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T47.X, T19.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T44.Y, T3.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T45.Z, T3.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T46.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T19.X, T19.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T45.Y, T2.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T46.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T47.W, T1.W, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T48.X, T21.X, literal.x,
+; CM-NEXT:     BFE_INT T46.Y, T1.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T47.Z, T1.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T19.W, T0.W, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T49.X, T21.X, literal.x,
+; CM-NEXT:     BFE_INT T47.Y, T0.Z, 0.0, literal.y,
+; CM-NEXT:     BFE_INT T19.Z, T0.Y, 0.0, literal.y,
+; CM-NEXT:     LSHR * T0.W, T19.Y, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:    12(1.681558e-44), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T50.X, T21.X, literal.x,
+; CM-NEXT:     BFE_INT * T19.Y, PV.W, 0.0, literal.y,
+; CM-NEXT:    4(5.605194e-45), 8(1.121039e-44)
   %load = load <64 x i8>, ptr addrspace(1) %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, ptr addrspace(1) %out
@@ -5799,7 +5614,7 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 1
 ; EG-NEXT:    CF_END
@@ -5821,19 +5636,18 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out,
 ; EG-NEXT:     MOV T5.W, 0.0,
 ; EG-NEXT:     MOV * T4.W, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T7.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v4i8_to_v4i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T4.X
+; CM-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T4.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
@@ -5853,12 +5667,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out,
 ; CM-NEXT:     MOV T6.Y, 0.0,
 ; CM-NEXT:     MOV * T5.W, 0.0,
 ; CM-NEXT:     MOV * T6.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T4.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T4.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T7.X, PV.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5961,66 +5773,65 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out,
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
 ; EG-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T4.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T4.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT:     ASHR T4.W, T4.X, literal.y,
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.z,
+; EG-NEXT:     ASHR * T5.W, T4.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T6.X, T4.X, 0.0, literal.x,
+; EG-NEXT:     ASHR T5.Z, T4.X, literal.y,
+; EG-NEXT:     LSHR * T0.W, T4.X, literal.z,
+; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T5.X, PV.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.Y, PV.X, literal.y,
+; EG-NEXT:     LSHR T0.W, T4.X, literal.x,
+; EG-NEXT:     LSHR * T4.X, KC0[2].Y, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ASHR T5.Y, PV.X, literal.x,
-; EG-NEXT:     ASHR T4.Z, T4.X, literal.y,
-; EG-NEXT:     LSHR T0.W, T4.X, literal.z,
-; EG-NEXT:     LSHR * T1.W, T4.X, literal.w,
-; EG-NEXT:    31(4.344025e-44), 24(3.363116e-44)
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T4.X, PS, 0.0, literal.x,
-; EG-NEXT:     BFE_INT T5.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T4.Y, PV.X, literal.y,
-; EG-NEXT:     ASHR * T5.W, PV.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT * T6.Z, PV.W, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T7.X, T4.X, literal.x,
+; EG-NEXT:     ASHR T5.Y, T5.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ASHR * T6.W, PV.Z, literal.y,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
 ;
 ; CM-LABEL: global_sextload_v4i8_to_v4i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 18, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T4.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T6.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 6:
 ; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 8:
 ; CM-NEXT:     MOV * T4.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 9:
-; CM-NEXT:     BFE_INT T5.X, T4.X, 0.0, literal.x,
-; CM-NEXT:     LSHR T0.Y, T4.X, literal.x,
-; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     ASHR * T4.W, T4.X, literal.z,
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:     ASHR * T5.W, T4.X, literal.x,
 ; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T6.X, PV.Z, literal.x,
-; CM-NEXT:     ASHR T5.Y, PV.X, literal.y,
-; CM-NEXT:     ASHR T4.Z, T4.X, literal.z,
-; CM-NEXT:     LSHR * T0.W, T4.X, literal.w,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; CM-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; CM-NEXT:     BFE_INT T4.X, PV.W, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T5.Z, T0.Y, 0.0, literal.x,
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
-; CM-NEXT:     ASHR T4.Y, PV.X, literal.y,
-; CM-NEXT:     ASHR * T5.W, PV.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:     BFE_INT T6.X, T4.X, 0.0, literal.x,
+; CM-NEXT:     ASHR T5.Z, T4.X, literal.y,
+; CM-NEXT:     LSHR * T0.W, T4.X, literal.z,
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     BFE_INT T5.X, PV.W, 0.0, literal.x,
+; CM-NEXT:     ASHR T6.Y, PV.X, literal.y,
+; CM-NEXT:     LSHR * T0.W, T4.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
+; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_INT * T6.Z, PV.W, 0.0, literal.y,
+; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     ADD_INT T7.X, PV.X, literal.x,
+; CM-NEXT:     ASHR T5.Y, T5.X, literal.y,
+; CM-NEXT:     ASHR * T6.W, PV.Z, literal.y,
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
@@ -6151,7 +5962,7 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 34, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 30, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
@@ -6186,27 +5997,23 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
 ; EG-NEXT:     MOV T8.W, 0.0,
 ; EG-NEXT:     MOV * T5.W, 0.0,
-; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T9.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T10.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T11.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT * T12.X, T9.X, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v8i8_to_v8i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
-; CM-NEXT:    ALU 35, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T12.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T11.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T9.X
+; CM-NEXT:    ALU 31, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T9.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T12.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T11.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T10.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
@@ -6237,18 +6044,14 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; CM-NEXT:     MOV * T7.W, 0.0,
 ; CM-NEXT:     MOV * T8.W, 0.0,
 ; CM-NEXT:     MOV * T5.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T10.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T11.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T12.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T9.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T10.X, PV.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T11.X, T9.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T12.X, T9.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
@@ -6439,33 +6242,29 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
-; EG-NEXT:    ALU 39, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 34, @11, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T12.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T9.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T6.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T7.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 8:
 ; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 10:
 ; EG-NEXT:     MOV * T5.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 11:
-; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T7.Y, PV.X, literal.y,
+; EG-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
+; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT:     ADD_INT T8.X, PS, literal.x,
+; EG-NEXT:     ASHR T6.Y, PV.X, literal.y,
 ; EG-NEXT:     LSHR T0.W, T5.Y, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T9.X, PS, literal.x,
-; EG-NEXT:     BFE_INT T7.Z, PV.W, 0.0, literal.y,
-; EG-NEXT:     ASHR * T10.W, T5.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T9.X, PS, literal.z,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T6.Z, PV.W, 0.0, literal.x,
+; EG-NEXT:     ASHR * T10.W, T5.X, literal.y,
+; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T11.X, T5.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T10.Z, T5.X, literal.y,
 ; EG-NEXT:     LSHR T0.W, T5.X, literal.z,
@@ -6482,49 +6281,43 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; EG-NEXT:     BFE_INT T5.X, PS, 0.0, literal.x,
 ; EG-NEXT:     ASHR T10.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T11.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T12.X, T7.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
-; EG-NEXT:     ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT:     ASHR T11.W, PV.Z, literal.y,
-; EG-NEXT:     ASHR * T7.W, T7.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T5.Y, PV.X, literal.x,
+; EG-NEXT:     ASHR T11.W, PV.Z, literal.x,
+; EG-NEXT:     ASHR * T6.W, T6.Z, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_sextload_v8i8_to_v8i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
-; CM-NEXT:    ALU 39, @11, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T12.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T9.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T6.X
+; CM-NEXT:    ALU 34, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T7.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T12.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T9.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T8.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 8:
 ; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 10:
 ; CM-NEXT:     MOV * T5.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 11:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T6.X, PV.W, literal.x,
+; CM-NEXT:     BFE_INT * T6.X, T5.Y, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    8(1.121039e-44), 32(4.484155e-44)
-; CM-NEXT:     LSHR T8.X, PV.W, literal.x,
-; CM-NEXT:     ASHR T7.Y, PV.X, literal.y,
-; CM-NEXT:     LSHR T0.Z, T5.Y, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Y, T5.X, literal.y,
-; CM-NEXT:     BFE_INT T7.Z, PV.Z, 0.0, literal.y,
-; CM-NEXT:     ASHR * T10.W, T5.Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T8.X, PV.X, literal.x,
+; CM-NEXT:     ASHR T6.Y, T6.X, literal.y,
+; CM-NEXT:     LSHR * T0.W, T5.Y, literal.z,
+; CM-NEXT:    12(1.681558e-44), 31(4.344025e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T9.X, T7.X, literal.x,
+; CM-NEXT:     LSHR T0.Y, T5.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T6.Z, PV.W, 0.0, literal.x,
+; CM-NEXT:     ASHR * T10.W, T5.Y, literal.y,
+; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:     BFE_INT T11.X, T5.X, 0.0, literal.x,
 ; CM-NEXT:     LSHR T1.Y, T5.Y, literal.y,
 ; CM-NEXT:     ASHR T10.Z, T5.Y, literal.z,
@@ -6540,12 +6333,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; CM-NEXT:     BFE_INT T5.X, PV.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T10.Y, PV.X, literal.y,
 ; CM-NEXT:     BFE_INT T11.Z, T0.Y, 0.0, literal.x,
-; CM-NEXT:     ASHR * T7.W, T7.Z, literal.y,
+; CM-NEXT:     ASHR * T6.W, T6.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T12.X, T7.X, literal.x,
 ; CM-NEXT:     ASHR T5.Y, PV.X, literal.y,
 ; CM-NEXT:     ASHR * T11.W, PV.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
@@ -6752,7 +6545,7 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 68, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 58, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
@@ -6813,43 +6606,33 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV * T13.W, 0.0,
 ; EG-NEXT:     MOV T14.W, 0.0,
 ; EG-NEXT:     MOV * T7.W, 0.0,
-; EG-NEXT:     LSHR T15.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR * T22.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T15.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T16.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T18.X, T15.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T15.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T20.X, T15.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T15.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT * T22.X, T15.X, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v16i8_to_v16i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @12
-; CM-NEXT:    ALU 69, @15, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T21.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T19.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T17.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T16.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T15.X
+; CM-NEXT:    ALU 61, @15, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T15.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T22.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T20.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T19.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T18.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T17.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T16.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 12:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
@@ -6902,30 +6685,22 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; CM-NEXT:     MOV * T7.W, 0.0,
 ; CM-NEXT:     MOV * T13.W, 0.0,
 ; CM-NEXT:     MOV * T14.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T18.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T21.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T22.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T15.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T16.X, PV.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T17.X, T15.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T18.X, T15.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T19.X, T15.X, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T20.X, T15.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T21.X, T15.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T22.X, T15.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
@@ -7248,7 +7023,7 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @12
-; EG-NEXT:    ALU 78, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 67, @15, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T22.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
@@ -7263,39 +7038,30 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T7.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T9.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T10.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T11.X, T8.X, literal.x,
+; EG-NEXT:     ADD_INT * T12.X, T8.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT * T13.X, T7.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T14.X, T7.Y, 0.0, literal.x,
 ; EG-NEXT:     ASHR T13.Y, PV.X, literal.y,
 ; EG-NEXT:     LSHR T0.W, T7.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T15.X, T8.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T15.X, PS, literal.x,
-; EG-NEXT:     ASHR T14.Y, PV.X, literal.y,
-; EG-NEXT:     BFE_INT T13.Z, PV.W, 0.0, literal.z,
-; EG-NEXT:     LSHR T0.W, T7.Y, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:    8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT:     LSHR T16.X, PS, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T14.Y, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T13.Z, PV.W, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T7.Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T8.X, literal.x,
 ; EG-NEXT:     BFE_INT T14.Z, PV.W, 0.0, literal.y,
-; EG-NEXT:     ASHR * T17.W, T7.X, literal.z,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT:     ASHR * T17.W, T7.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:    24(3.363116e-44), 8(1.121039e-44)
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T18.X, T7.X, 0.0, literal.x,
 ; EG-NEXT:     ASHR T17.Z, T7.X, literal.y,
@@ -7313,22 +7079,20 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_INT T19.X, PS, 0.0, literal.x,
 ; EG-NEXT:     ASHR T17.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T18.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.z,
 ; EG-NEXT:     ASHR * T20.W, T7.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T7.X, T7.Z, 0.0, literal.x,
 ; EG-NEXT:     ASHR T19.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T20.Z, T7.Z, literal.z,
-; EG-NEXT:     LSHR T1.W, T7.Z, literal.w,
+; EG-NEXT:     LSHR T0.W, T7.Z, literal.w,
 ; EG-NEXT:     ASHR * T21.W, T7.W, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T20.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T7.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T21.Z, T7.W, literal.z,
-; EG-NEXT:     LSHR T1.W, T7.Z, literal.x,
-; EG-NEXT:     LSHR * T2.W, T7.W, literal.w,
+; EG-NEXT:     LSHR T0.W, T7.Z, literal.x,
+; EG-NEXT:     LSHR * T1.W, T7.W, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T21.X, PS, 0.0, literal.x,
@@ -7337,73 +7101,63 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; EG-NEXT:     ASHR T18.W, T18.Z, literal.y,
 ; EG-NEXT:     ASHR * T14.W, T14.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHR T22.X, T0.W, literal.x,
+; EG-NEXT:     ADD_INT T22.X, T8.X, literal.x,
 ; EG-NEXT:     ASHR T21.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T7.W, PV.Z, literal.y,
 ; EG-NEXT:     ASHR * T13.W, T13.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    28(3.923636e-44), 31(4.344025e-44)
 ;
 ; CM-LABEL: global_sextload_v16i8_to_v16i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @12
-; CM-NEXT:    ALU 79, @15, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T16.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T15.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T12.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T11.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T10.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T9.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T8.X
+; CM-NEXT:    ALU 69, @15, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T8.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T22.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T16.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T15.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T12.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T11.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T10.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T9.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 12:
 ; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
 ; CM-NEXT:    ALU clause starting at 14:
 ; CM-NEXT:     MOV * T7.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 15:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T8.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T10.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T11.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T12.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Z, T7.X, literal.y,
-; CM-NEXT:     LSHR * T0.W, T7.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T9.X, PV.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T10.X, T8.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T11.X, T8.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T12.X, T8.X, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T13.X, T7.Y, 0.0, literal.x,
-; CM-NEXT:     LSHR T0.Y, T7.Y, literal.y,
-; CM-NEXT:     LSHR T1.Z, T7.Z, literal.x,
-; CM-NEXT:     LSHR * T1.W, T7.W, literal.x,
+; CM-NEXT:     LSHR T0.Y, T7.X, literal.x,
+; CM-NEXT:     LSHR T0.Z, T7.X, literal.y,
+; CM-NEXT:     LSHR * T0.W, T7.Y, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:     BFE_INT T14.X, T7.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T13.Y, PV.X, literal.y,
-; CM-NEXT:     LSHR T2.Z, T7.Y, literal.x,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
+; CM-NEXT:     LSHR T1.Z, T7.Z, literal.x,
+; CM-NEXT:     LSHR * T1.W, T7.Y, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
+; CM-NEXT:     ADD_INT T15.X, T8.X, literal.x,
 ; CM-NEXT:     ASHR T14.Y, PV.X, literal.y,
-; CM-NEXT:     BFE_INT T13.Z, PV.Z, 0.0, literal.z,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
+; CM-NEXT:     BFE_INT T13.Z, PV.W, 0.0, literal.z,
+; CM-NEXT:     LSHR * T1.W, T7.W, literal.z,
+; CM-NEXT:    12(1.681558e-44), 31(4.344025e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T16.X, T8.X, literal.x,
 ; CM-NEXT:     LSHR T1.Y, T7.Z, literal.y,
-; CM-NEXT:     BFE_INT T14.Z, T1.W, 0.0, literal.z,
-; CM-NEXT:     ASHR * T17.W, T7.W, literal.w, BS:VEC_120/SCL_212
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
+; CM-NEXT:     BFE_INT T14.Z, PV.W, 0.0, literal.x,
+; CM-NEXT:     ASHR * T17.W, T7.W, literal.z,
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T18.X, T7.Z, 0.0, literal.x,
 ; CM-NEXT:     LSHR T2.Y, T7.W, literal.y,
 ; CM-NEXT:     ASHR T17.Z, T7.W, literal.z,
@@ -7427,21 +7181,21 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; CM-NEXT:     ASHR * T7.W, T7.X, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T20.X, T0.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T20.X, T0.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T21.Y, PV.X, literal.y,
 ; CM-NEXT:     ASHR T7.Z, T7.X, literal.z,
 ; CM-NEXT:     ASHR * T18.W, T18.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T7.X, T0.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T7.X, T0.Z, 0.0, literal.x,
 ; CM-NEXT:     ASHR T20.Y, PV.X, literal.y,
-; CM-NEXT:     BFE_INT T21.Z, T0.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T21.Z, T0.Y, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T13.W, T13.Z, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T22.X, T8.X, literal.x,
 ; CM-NEXT:     ASHR T7.Y, PV.X, literal.y,
 ; CM-NEXT:     ASHR * T21.W, PV.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
@@ -7838,9 +7592,10 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-LABEL: global_zextload_v32i8_to_v32i64:
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    TEX 1 @22
-; EG-NEXT:    ALU 103, @27, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 33, @131, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @22
+; EG-NEXT:    ALU 12, @27, KC0[], KC1[]
+; EG-NEXT:    TEX 0 @24
+; EG-NEXT:    ALU 102, @40, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T41.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T40.X, 0
@@ -7848,7 +7603,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T38.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T37.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T35.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T35.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T34.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T33.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
@@ -7856,88 +7611,90 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T30.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T29.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T27.X, 1
 ; EG-NEXT:    CF_END
-; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 22:
-; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT:    Fetch clause starting at 24:
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 26:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 27:
 ; EG-NEXT:     MOV * T0.W, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T13.X, T11.W, literal.x, PV.W,
-; EG-NEXT:     LSHR * T13.Z, T11.W, literal.y,
+; EG-NEXT:     BFE_UINT T13.X, T12.W, literal.x, PV.W,
+; EG-NEXT:     LSHR * T13.Z, T12.W, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T13.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T14.Z, T11.W, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T14.Z, T12.W, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T14.X, T11.W, literal.x,
+; EG-NEXT:     AND_INT T14.X, T12.W, literal.x,
 ; EG-NEXT:     MOV * T14.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T15.X, T11.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T15.Z, T11.Z, literal.y,
-; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     BFE_UINT * T15.X, T12.Z, literal.x, T0.W,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 40:
+; EG-NEXT:     LSHR * T15.Z, T12.Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T15.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T16.Z, T11.Z, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T16.Z, T12.Z, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T16.X, T11.Z, literal.x,
+; EG-NEXT:     AND_INT T16.X, T12.Z, literal.x,
 ; EG-NEXT:     MOV * T16.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T17.X, T11.Y, literal.x, T0.W,
-; EG-NEXT:     LSHR * T17.Z, T11.Y, literal.y,
+; EG-NEXT:     BFE_UINT T17.X, T12.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T17.Z, T12.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T17.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T18.Z, T11.Y, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T18.Z, T12.Y, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T18.X, T11.Y, literal.x,
+; EG-NEXT:     AND_INT T18.X, T12.Y, literal.x,
 ; EG-NEXT:     MOV * T18.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T19.X, T11.X, literal.x, T0.W,
-; EG-NEXT:     LSHR * T19.Z, T11.X, literal.y,
+; EG-NEXT:     BFE_UINT T19.X, T12.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T19.Z, T12.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T19.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T11.X, T11.X, literal.x,
-; EG-NEXT:     MOV * T11.Y, 0.0,
+; EG-NEXT:     AND_INT T12.X, T12.X, literal.x,
+; EG-NEXT:     MOV * T12.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T20.X, T12.W, literal.x, T0.W,
-; EG-NEXT:     LSHR * T20.Z, T12.W, literal.y,
+; EG-NEXT:     BFE_UINT T20.X, T11.W, literal.x, T0.W,
+; EG-NEXT:     LSHR * T20.Z, T11.W, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T20.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T21.Z, T12.W, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T21.Z, T11.W, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T21.X, T12.W, literal.x,
+; EG-NEXT:     AND_INT T21.X, T11.W, literal.x,
 ; EG-NEXT:     MOV * T21.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T22.X, T12.Z, literal.x, T0.W,
-; EG-NEXT:     LSHR * T22.Z, T12.Z, literal.y,
+; EG-NEXT:     BFE_UINT T22.X, T11.Z, literal.x, T0.W,
+; EG-NEXT:     LSHR * T22.Z, T11.Z, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T22.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T23.Z, T12.Z, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T23.Z, T11.Z, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T23.X, T12.Z, literal.x,
+; EG-NEXT:     AND_INT T23.X, T11.Z, literal.x,
 ; EG-NEXT:     MOV * T23.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T24.X, T12.Y, literal.x, T0.W,
-; EG-NEXT:     LSHR * T24.Z, T12.Y, literal.y,
+; EG-NEXT:     BFE_UINT T24.X, T11.Y, literal.x, T0.W,
+; EG-NEXT:     LSHR * T24.Z, T11.Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T24.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T25.Z, T12.Y, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T25.Z, T11.Y, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T25.X, T12.Y, literal.x,
+; EG-NEXT:     AND_INT T25.X, T11.Y, literal.x,
 ; EG-NEXT:     MOV * T25.Y, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T26.X, T12.X, literal.x, T0.W,
-; EG-NEXT:     LSHR * T26.Z, T12.X, literal.y,
+; EG-NEXT:     BFE_UINT T26.X, T11.X, literal.x, T0.W,
+; EG-NEXT:     LSHR * T26.Z, T11.X, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; EG-NEXT:     MOV T26.Y, 0.0,
-; EG-NEXT:     BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
+; EG-NEXT:     BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T12.X, T12.X, literal.x,
-; EG-NEXT:     MOV T12.Y, 0.0,
+; EG-NEXT:     AND_INT T11.X, T11.X, literal.x,
+; EG-NEXT:     MOV T11.Y, 0.0,
 ; EG-NEXT:     MOV T13.W, 0.0,
 ; EG-NEXT:     MOV * T14.W, 0.0,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
@@ -7946,7 +7703,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV T17.W, 0.0,
 ; EG-NEXT:     MOV * T18.W, 0.0,
 ; EG-NEXT:     MOV T19.W, 0.0,
-; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     MOV * T12.W, 0.0,
 ; EG-NEXT:     MOV T20.W, 0.0,
 ; EG-NEXT:     MOV * T21.W, 0.0,
 ; EG-NEXT:     MOV T22.W, 0.0,
@@ -7954,167 +7711,145 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV T24.W, 0.0,
 ; EG-NEXT:     MOV * T25.W, 0.0,
 ; EG-NEXT:     MOV T26.W, 0.0,
-; EG-NEXT:     MOV * T12.W, 0.0,
-; EG-NEXT:     LSHR T27.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR * T31.X, PV.W, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 131:
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT:     LSHR * T42.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T11.W, 0.0,
+; EG-NEXT:     LSHR * T27.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T28.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T29.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T30.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T31.X, T27.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T32.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T33.X, T27.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T34.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T35.X, T27.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T36.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T37.X, T27.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T38.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T39.X, T27.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
+; EG-NEXT:     ADD_INT T40.X, T27.X, literal.x,
+; EG-NEXT:     ADD_INT * T41.X, T27.X, literal.y,
+; EG-NEXT:    52(7.286752e-44), 56(7.847271e-44)
+; EG-NEXT:     ADD_INT * T42.X, T27.X, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
 ;
 ; CM-LABEL: global_zextload_v32i8_to_v32i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    TEX 1 @22
-; CM-NEXT:    ALU 103, @27, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    ALU 33, @131, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T42.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T41.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T40.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T39.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T38.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T37.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T36.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T35.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T34.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T33.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T32.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T31.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T30.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T29.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T28.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T27.X
+; CM-NEXT:    TEX 0 @22
+; CM-NEXT:    ALU 12, @27, KC0[], KC1[]
+; CM-NEXT:    TEX 0 @24
+; CM-NEXT:    ALU 108, @40, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T27.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T42.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T41.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T40.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T39.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T38.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T37.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T36.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T35.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T34.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T33.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T32.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T31.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T30.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T29.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T28.X
 ; CM-NEXT:    CF_END
-; CM-NEXT:    PAD
 ; CM-NEXT:    Fetch clause starting at 22:
-; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; CM-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
+; CM-NEXT:    Fetch clause starting at 24:
+; CM-NEXT:     VTX_READ_128 T15.XYZW, T11.X, 16, #1
 ; CM-NEXT:    ALU clause starting at 26:
 ; CM-NEXT:     MOV * T11.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 27:
 ; CM-NEXT:     MOV * T0.W, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
+; CM-NEXT:     BFE_UINT * T13.Z, T12.X, literal.x, PV.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T13.X, T11.X, literal.x,
+; CM-NEXT:     AND_INT T13.X, T12.X, literal.x,
 ; CM-NEXT:     MOV * T13.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T14.X, T11.X, literal.x, T0.W,
-; CM-NEXT:     LSHR * T14.Z, T11.X, literal.y,
+; CM-NEXT:     BFE_UINT T14.X, T12.X, literal.x, T0.W,
+; CM-NEXT:     LSHR * T14.Z, T12.X, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; CM-NEXT:     MOV T14.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T15.Z, T11.Y, literal.x, T0.W,
+; CM-NEXT:     BFE_UINT * T11.Z, T12.Y, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T15.X, T11.Y, literal.x,
-; CM-NEXT:     MOV * T15.Y, 0.0,
+; CM-NEXT:    ALU clause starting at 40:
+; CM-NEXT:     AND_INT T11.X, T12.Y, literal.x,
+; CM-NEXT:     MOV * T11.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T16.X, T11.Y, literal.x, T0.W,
-; CM-NEXT:     LSHR * T16.Z, T11.Y, literal.y,
+; CM-NEXT:     BFE_UINT T16.X, T12.Y, literal.x, T0.W,
+; CM-NEXT:     LSHR * T16.Z, T12.Y, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; CM-NEXT:     MOV T16.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T17.Z, T11.Z, literal.x, T0.W,
+; CM-NEXT:     BFE_UINT * T17.Z, T12.Z, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T17.X, T11.Z, literal.x,
+; CM-NEXT:     AND_INT T17.X, T12.Z, literal.x,
 ; CM-NEXT:     MOV * T17.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T11.X, T11.Z, literal.x, T0.W,
-; CM-NEXT:     LSHR * T11.Z, T11.Z, literal.y,
+; CM-NEXT:     BFE_UINT T12.X, T12.Z, literal.x, T0.W,
+; CM-NEXT:     LSHR * T12.Z, T12.Z, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; CM-NEXT:     MOV T11.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T18.Z, T11.W, literal.x, T0.W,
+; CM-NEXT:     MOV T12.Y, 0.0,
+; CM-NEXT:     BFE_UINT * T18.Z, T12.W, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T18.X, T11.W, literal.x,
+; CM-NEXT:     AND_INT T18.X, T12.W, literal.x,
 ; CM-NEXT:     MOV * T18.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T19.X, T11.W, literal.x, T0.W,
-; CM-NEXT:     LSHR * T19.Z, T11.W, literal.y,
+; CM-NEXT:     BFE_UINT T19.X, T12.W, literal.x, T0.W,
+; CM-NEXT:     LSHR * T19.Z, T12.W, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; CM-NEXT:     MOV T19.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T20.Z, T12.X, literal.x, T0.W,
+; CM-NEXT:     BFE_UINT * T20.Z, T15.X, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T20.X, T12.X, literal.x,
+; CM-NEXT:     AND_INT T20.X, T15.X, literal.x,
 ; CM-NEXT:     MOV * T20.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T21.X, T12.X, literal.x, T0.W,
-; CM-NEXT:     LSHR * T21.Z, T12.X, literal.y,
+; CM-NEXT:     BFE_UINT T21.X, T15.X, literal.x, T0.W,
+; CM-NEXT:     LSHR * T21.Z, T15.X, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; CM-NEXT:     MOV T21.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T22.Z, T12.Y, literal.x, T0.W,
+; CM-NEXT:     BFE_UINT * T22.Z, T15.Y, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T22.X, T12.Y, literal.x,
+; CM-NEXT:     AND_INT T22.X, T15.Y, literal.x,
 ; CM-NEXT:     MOV * T22.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T23.X, T12.Y, literal.x, T0.W,
-; CM-NEXT:     LSHR * T23.Z, T12.Y, literal.y,
+; CM-NEXT:     BFE_UINT T23.X, T15.Y, literal.x, T0.W,
+; CM-NEXT:     LSHR * T23.Z, T15.Y, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; CM-NEXT:     MOV T23.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T24.Z, T12.Z, literal.x, T0.W,
+; CM-NEXT:     BFE_UINT * T24.Z, T15.Z, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T24.X, T12.Z, literal.x,
+; CM-NEXT:     AND_INT T24.X, T15.Z, literal.x,
 ; CM-NEXT:     MOV * T24.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T12.X, T12.Z, literal.x, T0.W,
-; CM-NEXT:     LSHR * T12.Z, T12.Z, literal.y,
+; CM-NEXT:     BFE_UINT T15.X, T15.Z, literal.x, T0.W,
+; CM-NEXT:     LSHR * T15.Z, T15.Z, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
-; CM-NEXT:     MOV T12.Y, 0.0,
-; CM-NEXT:     BFE_UINT * T25.Z, T12.W, literal.x, T0.W,
+; CM-NEXT:     MOV T15.Y, 0.0,
+; CM-NEXT:     BFE_UINT * T25.Z, T15.W, literal.x, T0.W,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     AND_INT T25.X, T12.W, literal.x,
+; CM-NEXT:     AND_INT T25.X, T15.W, literal.x,
 ; CM-NEXT:     MOV * T25.Y, 0.0,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:     BFE_UINT T26.X, T12.W, literal.x, T0.W,
-; CM-NEXT:     LSHR * T26.Z, T12.W, literal.y,
+; CM-NEXT:     BFE_UINT T26.X, T15.W, literal.x, T0.W,
+; CM-NEXT:     LSHR * T26.Z, T15.W, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 24(3.363116e-44)
 ; CM-NEXT:     MOV T26.Y, 0.0,
 ; CM-NEXT:     MOV * T13.W, 0.0,
 ; CM-NEXT:     MOV * T14.W, 0.0,
-; CM-NEXT:     MOV * T15.W, 0.0,
+; CM-NEXT:     MOV * T11.W, 0.0,
 ; CM-NEXT:     MOV * T16.W, 0.0,
 ; CM-NEXT:     MOV * T17.W, 0.0,
-; CM-NEXT:     MOV * T11.W, 0.0,
+; CM-NEXT:     MOV * T12.W, 0.0,
 ; CM-NEXT:     MOV * T18.W, 0.0,
 ; CM-NEXT:     MOV * T19.W, 0.0,
 ; CM-NEXT:     MOV * T20.W, 0.0,
@@ -8122,58 +7857,41 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; CM-NEXT:     MOV * T22.W, 0.0,
 ; CM-NEXT:     MOV * T23.W, 0.0,
 ; CM-NEXT:     MOV * T24.W, 0.0,
-; CM-NEXT:     MOV * T12.W, 0.0,
+; CM-NEXT:     MOV * T15.W, 0.0,
 ; CM-NEXT:     MOV * T25.W, 0.0,
 ; CM-NEXT:     MOV * T26.W, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T27.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; CM-NEXT:     LSHR T28.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; CM-NEXT:     LSHR T30.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; CM-NEXT:    ALU clause starting at 131:
-; CM-NEXT:     LSHR T31.X, T0.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; CM-NEXT:     LSHR T32.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; CM-NEXT:     LSHR T33.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; CM-NEXT:     LSHR T34.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; CM-NEXT:     LSHR T35.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; CM-NEXT:     LSHR T36.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:     LSHR * T41.X, PV.W, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT:     LSHR * T42.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHR * T27.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T28.X, PV.X, literal.x,
+; CM-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T29.X, T27.X, literal.x,
+; CM-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T30.X, T27.X, literal.x,
+; CM-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T31.X, T27.X, literal.x,
+; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T32.X, T27.X, literal.x,
+; CM-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T33.X, T27.X, literal.x,
+; CM-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T34.X, T27.X, literal.x,
+; CM-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T35.X, T27.X, literal.x,
+; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T36.X, T27.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T37.X, T27.X, literal.x,
+; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T38.X, T27.X, literal.x,
+; CM-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T39.X, T27.X, literal.x,
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T40.X, T27.X, literal.x,
+; CM-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T41.X, T27.X, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T42.X, T27.X, literal.x,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(1) %out
@@ -8782,8 +8500,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @22
-; EG-NEXT:    ALU 84, @27, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    ALU 71, @112, KC0[], KC1[]
+; EG-NEXT:    ALU 83, @27, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 50, @111, KC0[], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T31.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T30.X, 0
@@ -8808,44 +8526,26 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:    ALU clause starting at 26:
 ; EG-NEXT:     MOV * T11.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 27:
-; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT:     LSHR * T25.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T14.X, PV.X, literal.x,
+; EG-NEXT:     ADD_INT * T15.X, PV.X, literal.y,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T16.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T17.X, T13.X, literal.y,
+; EG-NEXT:    12(1.681558e-44), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T18.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T19.X, T13.X, literal.y,
+; EG-NEXT:    20(2.802597e-44), 24(3.363116e-44)
+; EG-NEXT:     ADD_INT T20.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T21.X, T13.X, literal.y,
+; EG-NEXT:    28(3.923636e-44), 32(4.484155e-44)
+; EG-NEXT:     ADD_INT T22.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T23.X, T13.X, literal.y,
+; EG-NEXT:    36(5.044674e-44), 40(5.605194e-44)
+; EG-NEXT:     ADD_INT T24.X, T13.X, literal.x,
+; EG-NEXT:     ADD_INT * T25.X, T13.X, literal.y,
+; EG-NEXT:    44(6.165713e-44), 48(6.726233e-44)
 ; EG-NEXT:     BFE_INT * T26.X, T11.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T27.X, T11.Y, 0.0, literal.x,
@@ -8861,78 +8561,74 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:     ASHR T28.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T27.Z, PV.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.W, T11.X, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT:     ADD_INT * T30.X, T13.X, literal.z,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT:     LSHR T30.X, PS, literal.x,
-; EG-NEXT:     ASHR T29.Y, PV.X, literal.y,
-; EG-NEXT:     BFE_INT T28.Z, PV.W, 0.0, literal.z,
-; EG-NEXT:     LSHR T0.W, T12.W, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:    8(1.121039e-44), 224(3.138909e-43)
-; EG-NEXT:     LSHR T31.X, PS, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T29.Y, PV.X, literal.x,
+; EG-NEXT:     BFE_INT T28.Z, PV.W, 0.0, literal.y,
+; EG-NEXT:     LSHR * T0.W, T12.W, literal.y,
+; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
+; EG-NEXT:     ADD_INT T31.X, T13.X, literal.x,
 ; EG-NEXT:     BFE_INT T29.Z, PV.W, 0.0, literal.y,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.z,
-; EG-NEXT:     ASHR * T32.W, T12.X, literal.w,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:    240(3.363116e-43), 31(4.344025e-44)
+; EG-NEXT:     ASHR * T32.W, T12.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:    56(7.847271e-44), 8(1.121039e-44)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T33.X, T12.Z, 0.0, literal.x,
 ; EG-NEXT:     LSHR T0.Y, T11.Z, literal.x, BS:VEC_120/SCL_212
 ; EG-NEXT:     ASHR T32.Z, T12.X, literal.y,
-; EG-NEXT:     LSHR T1.W, T12.X, literal.z,
+; EG-NEXT:     LSHR T0.W, T12.X, literal.z,
 ; EG-NEXT:     ASHR * T34.W, T12.Y, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
 ; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
 ; EG-NEXT:     BFE_INT T32.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T33.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T34.Z, T12.Y, literal.z,
-; EG-NEXT:     LSHR T1.W, T12.Z, literal.x,
-; EG-NEXT:     LSHR * T2.W, T12.Y, literal.w,
+; EG-NEXT:     LSHR T0.W, T12.Z, literal.x,
+; EG-NEXT:     LSHR * T1.W, T12.Y, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT * T34.X, PS, 0.0, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 112:
-; EG-NEXT:     ASHR T32.Y, T32.X, literal.x,
-; EG-NEXT:     BFE_INT T33.Z, T1.W, 0.0, literal.y,
-; EG-NEXT:     LSHR T1.W, T11.W, literal.z, BS:VEC_120/SCL_212
-; EG-NEXT:     ASHR * T35.W, T12.Z, literal.x,
-; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
+; EG-NEXT:     BFE_INT T34.X, PS, 0.0, literal.x,
+; EG-NEXT:     ASHR T32.Y, PV.X, literal.y,
+; EG-NEXT:     BFE_INT T33.Z, PV.W, 0.0, literal.x,
+; EG-NEXT:     LSHR T0.W, T11.W, literal.z,
+; EG-NEXT:     ASHR * T35.W, T12.Z, literal.y,
+; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T36.X, T12.X, 0.0, literal.x,
-; EG-NEXT:     ASHR T34.Y, T34.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ASHR T34.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T35.Z, T12.Z, literal.z,
-; EG-NEXT:     LSHR T2.W, T12.Z, literal.w,
+; EG-NEXT:     LSHR T1.W, T12.Z, literal.w,
 ; EG-NEXT:     ASHR * T37.W, T12.W, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T35.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T36.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T37.Z, T12.W, literal.z,
-; EG-NEXT:     LSHR T2.W, T12.X, literal.x,
-; EG-NEXT:     LSHR * T3.W, T12.W, literal.w,
+; EG-NEXT:     LSHR T1.W, T12.X, literal.x,
+; EG-NEXT:     LSHR * T2.W, T12.W, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT:     BFE_INT T37.X, PS, 0.0, literal.x,
-; EG-NEXT:     ASHR T35.Y, PV.X, literal.y,
-; EG-NEXT:     BFE_INT T36.Z, PV.W, 0.0, literal.x,
-; EG-NEXT:     LSHR T2.W, T11.Z, literal.z,
-; EG-NEXT:     ASHR * T12.W, T11.X, literal.y,
-; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_INT * T37.X, PS, 0.0, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 111:
+; EG-NEXT:     ASHR T35.Y, T35.X, literal.x,
+; EG-NEXT:     BFE_INT T36.Z, T1.W, 0.0, literal.y,
+; EG-NEXT:     LSHR T1.W, T11.Z, literal.z,
+; EG-NEXT:     ASHR * T12.W, T11.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 8(1.121039e-44)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; EG-NEXT:     BFE_INT T38.X, T12.Y, 0.0, literal.x,
-; EG-NEXT:     ASHR T37.Y, PV.X, literal.y,
-; EG-NEXT:     ASHR T12.Z, T11.X, literal.z,
-; EG-NEXT:     LSHR T3.W, T11.X, literal.w,
+; EG-NEXT:     ASHR T37.Y, T37.X, literal.y,
+; EG-NEXT:     ASHR T12.Z, T11.X, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T2.W, T11.X, literal.w, BS:VEC_120/SCL_212
 ; EG-NEXT:     ASHR * T39.W, T11.Y, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T12.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T38.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T39.Z, T11.Y, literal.z,
-; EG-NEXT:     LSHR T3.W, T12.Y, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T4.W, T11.Y, literal.w,
+; EG-NEXT:     LSHR T2.W, T12.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR * T3.W, T11.Y, literal.w,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
 ; EG-NEXT:     BFE_INT T39.X, PS, 0.0, literal.x,
@@ -8948,47 +8644,47 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; EG-NEXT:     ASHR * T41.W, T11.W, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T40.X, T2.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T40.X, T1.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T11.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T41.Z, T11.W, literal.z, BS:VEC_120/SCL_212
 ; EG-NEXT:     ASHR T33.W, T33.Z, literal.y,
 ; EG-NEXT:     ASHR * T29.W, T29.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_INT T41.X, T1.W, 0.0, literal.x,
+; EG-NEXT:     BFE_INT T41.X, T0.W, 0.0, literal.x,
 ; EG-NEXT:     ASHR T40.Y, PV.X, literal.y,
 ; EG-NEXT:     BFE_INT T11.Z, T0.Y, 0.0, literal.x,
 ; EG-NEXT:     ASHR T28.W, T28.Z, literal.y,
 ; EG-NEXT:     ASHR * T27.W, T27.Z, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHR T42.X, T0.W, literal.x,
+; EG-NEXT:     ADD_INT T42.X, T13.X, literal.x,
 ; EG-NEXT:     ASHR T41.Y, PV.X, literal.y,
 ; EG-NEXT:     ASHR T11.W, PV.Z, literal.y,
 ; EG-NEXT:     ASHR * T26.W, T26.Z, literal.y,
-; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:    60(8.407791e-44), 31(4.344025e-44)
 ;
 ; CM-LABEL: global_sextload_v32i8_to_v32i64:
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 1 @22
-; CM-NEXT:    ALU 84, @27, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    ALU 74, @112, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T42.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T31.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T30.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T25.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T24.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T23.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T22.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T21.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T20.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T19.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T18.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T17.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T16.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T34, T15.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T14.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T13.X
+; CM-NEXT:    ALU 86, @27, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 52, @114, KC0[], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T13.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T42.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T31.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T30.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T25.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T24.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T23.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T22.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T21.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T20.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T19.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T18.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T17.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T34, T16.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T15.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T14.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    Fetch clause starting at 22:
@@ -8997,95 +8693,74 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; CM-NEXT:    ALU clause starting at 26:
 ; CM-NEXT:     MOV * T11.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 27:
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    240(3.363116e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T13.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 224(3.138909e-43)
-; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 208(2.914701e-43)
-; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 192(2.690493e-43)
-; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 176(2.466285e-43)
-; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 160(2.242078e-43)
-; CM-NEXT:     LSHR T18.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 144(2.017870e-43)
-; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 128(1.793662e-43)
-; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 112(1.569454e-43)
-; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Z, T11.X, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Y, T11.X, literal.y,
-; CM-NEXT:     LSHR T1.Z, T11.Y, literal.y,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
-; CM-NEXT:     LSHR T23.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T1.Y, T11.Z, literal.y,
-; CM-NEXT:     LSHR T2.Z, T11.Z, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:    16(2.242078e-44), 64(8.968310e-44)
-; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T2.Y, T11.W, literal.y,
-; CM-NEXT:     LSHR T3.Z, T11.W, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T3.Y, T12.X, literal.y,
-; CM-NEXT:     LSHR T4.Z, T12.Y, literal.y,
-; CM-NEXT:     LSHR * T0.W, T12.X, literal.z,
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     LSHR * T13.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T14.X, PV.X, literal.x,
+; CM-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T15.X, T13.X, literal.x,
+; CM-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T16.X, T13.X, literal.x,
+; CM-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T17.X, T13.X, literal.x,
+; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T18.X, T13.X, literal.x,
+; CM-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T19.X, T13.X, literal.x,
+; CM-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T20.X, T13.X, literal.x,
+; CM-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T21.X, T13.X, literal.x,
+; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT * T22.X, T13.X, literal.x,
+; CM-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T23.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T0.Z, T11.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR * T0.W, T11.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    24(3.363116e-44), 8(1.121039e-44)
+; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T24.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T0.Y, T11.Y, literal.y,
+; CM-NEXT:     LSHR T1.Z, T11.Z, literal.z,
+; CM-NEXT:     LSHR * T1.W, T11.Z, literal.y,
+; CM-NEXT:    20(2.802597e-44), 16(2.242078e-44)
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T25.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T1.Y, T11.W, literal.x,
+; CM-NEXT:     LSHR T2.Z, T11.W, literal.y,
+; CM-NEXT:     LSHR * T2.W, T12.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
 ; CM-NEXT:     BFE_INT T26.X, T12.W, 0.0, literal.x,
-; CM-NEXT:     LSHR T4.Y, T12.Y, literal.x,
-; CM-NEXT:     ADD_INT T5.Z, KC0[2].Y, literal.y,
-; CM-NEXT:     LSHR * T1.W, T11.Y, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     LSHR T2.Y, T12.Y, literal.y,
+; CM-NEXT:     LSHR T3.Z, T12.X, literal.x,
+; CM-NEXT:     LSHR * T3.W, T12.Y, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
 ; CM-NEXT:     BFE_INT T27.X, T12.Z, 0.0, literal.x,
 ; CM-NEXT:     ASHR T26.Y, PV.X, literal.y,
-; CM-NEXT:     ADD_INT T6.Z, KC0[2].Y, literal.z,
-; CM-NEXT:     LSHR * T2.W, T12.W, literal.x,
+; CM-NEXT:     LSHR T4.Z, T11.Y, literal.x,
+; CM-NEXT:     LSHR * T4.W, T12.W, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T28.X, T11.Y, 0.0, literal.x,
 ; CM-NEXT:     ASHR T27.Y, PV.X, literal.y,
 ; CM-NEXT:     BFE_INT T26.Z, PV.W, 0.0, literal.x,
-; CM-NEXT:     LSHR * T2.W, T12.Z, literal.x,
+; CM-NEXT:     LSHR * T4.W, T12.Z, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:     BFE_INT T29.X, T12.Y, 0.0, literal.x,
 ; CM-NEXT:     ASHR T28.Y, PV.X, literal.y,
 ; CM-NEXT:     BFE_INT T27.Z, PV.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T26.W, PV.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T30.X, T6.Z, literal.x,
+; CM-NEXT:     ADD_INT T30.X, T13.X, literal.x,
 ; CM-NEXT:     ASHR T29.Y, PV.X, literal.y,
-; CM-NEXT:     BFE_INT T28.Z, T1.W, 0.0, literal.z,
+; CM-NEXT:     BFE_INT T28.Z, T4.Z, 0.0, literal.z,
 ; CM-NEXT:     ASHR * T27.W, PV.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    12(1.681558e-44), 31(4.344025e-44)
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T31.X, T5.Z, literal.x,
-; CM-NEXT:     LSHR * T5.Y, T12.Z, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-; CM-NEXT:    ALU clause starting at 112:
-; CM-NEXT:     BFE_INT T29.Z, T4.Y, 0.0, literal.x,
-; CM-NEXT:     ASHR * T32.W, T12.W, literal.y,
-; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
+; CM-NEXT:     ADD_INT T31.X, T13.X, literal.x,
+; CM-NEXT:     LSHR T3.Y, T12.Z, literal.y,
+; CM-NEXT:     BFE_INT T29.Z, T3.W, 0.0, literal.x,
+; CM-NEXT:     ASHR * T32.W, T12.W, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), 16(2.242078e-44)
+; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T33.X, T12.X, 0.0, literal.x,
 ; CM-NEXT:     LSHR T4.Y, T12.W, literal.y,
 ; CM-NEXT:     ASHR T32.Z, T12.W, literal.z,
@@ -9098,27 +8773,28 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; CM-NEXT:     ASHR * T29.W, T29.Z, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T34.X, T5.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T34.X, T3.Y, 0.0, literal.x,
 ; CM-NEXT:     ASHR T32.Y, PV.X, literal.y,
-; CM-NEXT:     BFE_INT T33.Z, T0.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T33.Z, T3.Z, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T35.W, T12.Y, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:     BFE_INT T36.X, T11.W, 0.0, literal.x,
-; CM-NEXT:     ASHR T34.Y, PV.X, literal.y,
-; CM-NEXT:     ASHR T35.Z, T12.Y, literal.z,
-; CM-NEXT:     ASHR * T12.W, T12.X, literal.y,
-; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T35.X, T4.Z, 0.0, literal.x,
-; CM-NEXT:     ASHR T36.Y, PV.X, literal.y,
-; CM-NEXT:     ASHR T12.Z, T12.X, literal.z,
-; CM-NEXT:     ASHR * T33.W, T33.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT * T36.X, T11.W, 0.0, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:    ALU clause starting at 114:
+; CM-NEXT:     ASHR T34.Y, T34.X, literal.x,
+; CM-NEXT:     ASHR T35.Z, T12.Y, literal.y,
+; CM-NEXT:     ASHR * T12.W, T12.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    31(4.344025e-44), 24(3.363116e-44)
+; CM-NEXT:     BFE_INT T35.X, T2.Y, 0.0, literal.x,
+; CM-NEXT:     ASHR T36.Y, T36.X, literal.y,
+; CM-NEXT:     ASHR T12.Z, T12.X, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT:     ASHR * T33.W, T33.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T12.X, T3.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T12.X, T2.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T35.Y, PV.X, literal.y,
-; CM-NEXT:     BFE_INT T36.Z, T3.Z, 0.0, literal.x,
-; CM-NEXT:     ASHR * T37.W, T11.W, literal.y,
+; CM-NEXT:     BFE_INT T36.Z, T2.Z, 0.0, literal.x,
+; CM-NEXT:     ASHR * T37.W, T11.W, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:     BFE_INT T38.X, T11.Z, 0.0, literal.x,
 ; CM-NEXT:     ASHR T12.Y, PV.X, literal.y,
@@ -9126,16 +8802,16 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; CM-NEXT:     ASHR * T39.W, T11.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T37.X, T2.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.X, T1.Y, 0.0, literal.x,
 ; CM-NEXT:     ASHR T38.Y, PV.X, literal.y,
 ; CM-NEXT:     ASHR T39.Z, T11.Z, literal.z,
 ; CM-NEXT:     ASHR * T36.W, T36.Z, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T39.X, T2.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T39.X, T1.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T37.Y, PV.X, literal.y,
-; CM-NEXT:     BFE_INT T38.Z, T1.Y, 0.0, literal.x,
-; CM-NEXT:     ASHR * T40.W, T11.Y, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T38.Z, T1.Z, 0.0, literal.x,
+; CM-NEXT:     ASHR * T40.W, T11.Y, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:     BFE_INT T41.X, T11.X, 0.0, literal.x,
 ; CM-NEXT:     ASHR T39.Y, PV.X, literal.y,
@@ -9143,21 +8819,21 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; CM-NEXT:     ASHR * T11.W, T11.X, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T40.X, T1.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T40.X, T0.Y, 0.0, literal.x,
 ; CM-NEXT:     ASHR T41.Y, PV.X, literal.y,
 ; CM-NEXT:     ASHR T11.Z, T11.X, literal.z,
-; CM-NEXT:     ASHR * T38.W, T38.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT:     ASHR * T38.W, T38.Z, literal.y,
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T11.X, T0.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T11.X, T0.W, 0.0, literal.x,
 ; CM-NEXT:     ASHR T40.Y, PV.X, literal.y,
 ; CM-NEXT:     BFE_INT T41.Z, T0.Z, 0.0, literal.x,
 ; CM-NEXT:     ASHR * T28.W, T28.Z, literal.y, BS:VEC_120/SCL_212
 ; CM-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; CM-NEXT:     LSHR T42.X, KC0[2].Y, literal.x,
+; CM-NEXT:     ADD_INT T42.X, T13.X, literal.x,
 ; CM-NEXT:     ASHR T11.Y, PV.X, literal.y,
 ; CM-NEXT:     ASHR * T41.W, PV.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; CM-NEXT:    4(5.605194e-45), 31(4.344025e-44)
   %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(1) %out
@@ -10994,9 +10670,9 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
 ; EG-NEXT:    ALU 103, @12, KC0[], KC1[]
-; EG-NEXT:    ALU 20, @116, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT:    ALU 19, @116, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
@@ -11115,17 +10791,16 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; EG-NEXT:     OR_INT * T0.W, PV.W, T0.W,
 ; EG-NEXT:     MOV * T5.X, PV.W,
 ; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     LSHR T0.W, T19.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T21.X, PS, literal.x,
+; EG-NEXT:     LSHR * T0.W, T19.W, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     AND_INT T1.W, PV.Y, literal.y,
 ; EG-NEXT:     AND_INT * T0.W, PV.W, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; EG-NEXT:    16711680(2.341805e-38), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
 ; EG-NEXT:     OR_INT * T19.W, PV.W, PS,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T5.X, PV.W,
 ; EG-NEXT:     MOV * T20.X, T16.X,
 ; EG-NEXT:     MOV * T20.Z, T12.X,
@@ -11137,7 +10812,7 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; CM-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
 ; CM-NEXT:    ALU 101, @12, KC0[], KC1[]
-; CM-NEXT:    ALU 20, @114, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 19, @114, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T22.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T21.X
 ; CM-NEXT:    CF_END
@@ -11259,14 +10934,13 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; CM-NEXT:     LSHR * T0.W, T19.W, literal.x,
 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; CM-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
-; CM-NEXT:     AND_INT T0.Y, PV.Y, literal.y,
-; CM-NEXT:     AND_INT T0.Z, PV.W, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.w,
+; CM-NEXT:     AND_INT T0.Z, PV.Y, literal.y,
+; CM-NEXT:     AND_INT * T0.W, PV.W, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; CM-NEXT:    16711680(2.341805e-38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     OR_INT * T19.W, PV.Y, PV.Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:    16711680(2.341805e-38), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T22.X, PV.X, literal.x,
+; CM-NEXT:     OR_INT * T19.W, PV.Z, PV.W,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T5.X, PV.W,
 ; CM-NEXT:     MOV T20.X, T16.X,
 ; CM-NEXT:     MOV * T20.Z, T12.X, BS:VEC_120/SCL_212
@@ -11507,9 +11181,9 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @8
 ; EG-NEXT:    ALU 104, @12, KC0[], KC1[]
-; EG-NEXT:    ALU 46, @117, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT:    ALU 45, @117, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 8:
@@ -11655,17 +11329,16 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
 ; EG-NEXT:     MOV * T5.X, PV.W,
 ; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     ASHR T0.W, T19.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T21.X, PS, literal.x,
+; EG-NEXT:     ASHR * T0.W, T19.W, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     AND_INT T1.W, PV.Y, literal.y,
 ; EG-NEXT:     LSHL * T0.W, PV.W, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T22.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT T22.X, PV.X, literal.x,
 ; EG-NEXT:     OR_INT * T19.W, PV.W, PS,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; EG-NEXT:     MOV T5.X, PV.W,
 ; EG-NEXT:     MOV * T20.X, T16.X,
 ; EG-NEXT:     MOV * T20.Z, T12.X,
@@ -11677,7 +11350,7 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; CM-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    TEX 0 @8
 ; CM-NEXT:    ALU 104, @12, KC0[], KC1[]
-; CM-NEXT:    ALU 46, @117, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 45, @117, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T22.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T21.X
 ; CM-NEXT:    CF_END
@@ -11828,14 +11501,13 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; CM-NEXT:     ASHR * T0.W, T19.W, literal.x,
 ; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
 ; CM-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
-; CM-NEXT:     AND_INT T0.Y, PV.Y, literal.y,
-; CM-NEXT:     LSHL T0.Z, PV.W, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT:     AND_INT T0.Z, PV.Y, literal.y,
+; CM-NEXT:     LSHL * T0.W, PV.W, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
-; CM-NEXT:     OR_INT * T19.W, PV.Y, PV.Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T22.X, PV.X, literal.x,
+; CM-NEXT:     OR_INT * T19.W, PV.Z, PV.W,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T5.X, PV.W,
 ; CM-NEXT:     MOV T20.X, T16.X,
 ; CM-NEXT:     MOV * T20.Z, T12.X, BS:VEC_120/SCL_212
@@ -12182,11 +11854,11 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; EG-NEXT:    TEX 1 @10
 ; EG-NEXT:    ALU 103, @16, KC0[], KC1[]
 ; EG-NEXT:    ALU 104, @120, KC0[], KC1[]
-; EG-NEXT:    ALU 41, @225, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 37, @225, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T40.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 10:
 ; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 16, #1
@@ -12421,24 +12093,20 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; EG-NEXT:    -65536(nan), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T0.W, PV.W, T0.W,
 ; EG-NEXT:     MOV * T21.X, PV.W,
-; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.W, T35.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T41.X, PS, literal.x,
-; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT:     AND_INT T0.W, PV.W, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT:    16711680(2.341805e-38), 32(4.484155e-44)
-; EG-NEXT:     LSHR T42.X, PS, literal.x,
-; EG-NEXT:     OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT:     MOV * T0.Y, PV.X,
+; EG-NEXT:     LSHR * T39.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T40.X, PV.X, literal.x,
+; EG-NEXT:     LSHR T0.W, T35.W, literal.y,
+; EG-NEXT:     ADD_INT * T41.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 8(1.121039e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT:     ADD_INT T42.X, T39.X, literal.x,
+; EG-NEXT:     OR_INT * T35.W, PV.W, PS,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T21.X, PV.W,
 ; EG-NEXT:     MOV * T36.X, T16.X,
 ; EG-NEXT:     MOV * T36.Z, T12.X,
@@ -12455,11 +12123,11 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; CM-NEXT:    TEX 1 @10
 ; CM-NEXT:    ALU 101, @16, KC0[], KC1[]
 ; CM-NEXT:    ALU 101, @118, KC0[], KC1[]
-; CM-NEXT:    ALU 40, @220, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 36, @220, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T42.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T40.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T39.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T39.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T41.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T40.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 10:
 ; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 16, #1
@@ -12688,24 +12356,20 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; CM-NEXT:    -65536(nan), 16(2.242078e-44)
 ; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
 ; CM-NEXT:     MOV * T21.X, PV.W,
-; CM-NEXT:     MOV T0.Y, PV.X,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
-; CM-NEXT:     LSHR * T0.W, T35.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; CM-NEXT:     LSHR T41.X, KC0[2].Y, literal.x,
-; CM-NEXT:     AND_INT T0.Y, T0.Y, literal.y,
-; CM-NEXT:     AND_INT T0.Z, PV.W, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.w,
-; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; CM-NEXT:    16711680(2.341805e-38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T42.X, PV.W, literal.x,
-; CM-NEXT:     OR_INT * T35.W, PV.Y, PV.Z,
+; CM-NEXT:     MOV * T0.Y, PV.X,
+; CM-NEXT:     LSHR * T39.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T40.X, PV.X, literal.x,
+; CM-NEXT:     LSHR * T0.W, T35.W, literal.x,
+; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T41.X, T39.X, literal.x,
+; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
+; CM-NEXT:     AND_INT * T0.W, PV.W, literal.z,
+; CM-NEXT:    12(1.681558e-44), 65535(9.183409e-41)
+; CM-NEXT:    16711680(2.341805e-38), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T42.X, T39.X, literal.x,
+; CM-NEXT:     OR_INT * T35.W, PV.Z, PV.W,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T21.X, PV.W,
 ; CM-NEXT:     MOV T36.X, T16.X,
 ; CM-NEXT:     MOV * T36.Z, T12.X, BS:VEC_120/SCL_212
@@ -13146,11 +12810,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; EG-NEXT:    TEX 1 @10
 ; EG-NEXT:    ALU 104, @16, KC0[], KC1[]
 ; EG-NEXT:    ALU 104, @121, KC0[], KC1[]
-; EG-NEXT:    ALU 95, @226, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 91, @226, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T39.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T40.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 10:
 ; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 16, #1
@@ -13440,24 +13104,20 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
 ; EG-NEXT:     MOV * T21.X, PV.W,
-; EG-NEXT:     MOV T0.Y, PV.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ASHR T0.W, T35.W, literal.x,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    24(3.363116e-44), 48(6.726233e-44)
-; EG-NEXT:     LSHR T41.X, PS, literal.x,
-; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT:     LSHL T0.W, PV.W, literal.z,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT:     LSHR T42.X, PS, literal.x,
-; EG-NEXT:     OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT:     MOV * T0.Y, PV.X,
+; EG-NEXT:     LSHR * T39.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T40.X, PV.X, literal.x,
+; EG-NEXT:     ASHR T0.W, T35.W, literal.y,
+; EG-NEXT:     ADD_INT * T41.X, PV.X, literal.z,
+; EG-NEXT:    4(5.605194e-45), 24(3.363116e-44)
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     ADD_INT T42.X, T39.X, literal.x,
+; EG-NEXT:     OR_INT * T35.W, PV.W, PS,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT:     MOV T21.X, PV.W,
 ; EG-NEXT:     MOV * T36.X, T16.X,
 ; EG-NEXT:     MOV * T36.Z, T12.X,
@@ -13474,11 +13134,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; CM-NEXT:    TEX 1 @10
 ; CM-NEXT:    ALU 104, @16, KC0[], KC1[]
 ; CM-NEXT:    ALU 104, @121, KC0[], KC1[]
-; CM-NEXT:    ALU 95, @226, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 91, @226, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T42.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T40.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T39.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T39.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T41.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T40.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    Fetch clause starting at 10:
 ; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 16, #1
@@ -13768,24 +13428,20 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
 ; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
 ; CM-NEXT:     MOV * T21.X, PV.W,
-; CM-NEXT:     MOV T0.Y, PV.X,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
-; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
+; CM-NEXT:     MOV * T0.Y, PV.X,
+; CM-NEXT:     LSHR * T39.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T40.X, PV.X, literal.x,
 ; CM-NEXT:     ASHR * T0.W, T35.W, literal.y,
-; CM-NEXT:    2(2.802597e-45), 24(3.363116e-44)
-; CM-NEXT:     LSHR T41.X, KC0[2].Y, literal.x,
-; CM-NEXT:     AND_INT T0.Y, T0.Y, literal.y,
-; CM-NEXT:     LSHL T0.Z, PV.W, literal.z,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
-; CM-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
+; CM-NEXT:    8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT:     ADD_INT T41.X, T39.X, literal.x,
+; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
+; CM-NEXT:     LSHL * T0.W, PV.W, literal.z,
+; CM-NEXT:    12(1.681558e-44), 65535(9.183409e-41)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T42.X, PV.W, literal.x,
-; CM-NEXT:     OR_INT * T35.W, PV.Y, PV.Z,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T42.X, T39.X, literal.x,
+; CM-NEXT:     OR_INT * T35.W, PV.Z, PV.W,
+; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T21.X, PV.W,
 ; CM-NEXT:     MOV T36.X, T16.X,
 ; CM-NEXT:     MOV * T36.Z, T12.X, BS:VEC_120/SCL_212
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index 093803c82b8c1..84ef83d47a8db 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -664,17 +664,16 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32
 ;
 ; EG-LABEL: s_test_umax_uge_v3i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    ALU clause starting at 4:
 ; EG-NEXT:     MAX_UINT * T0.Y, KC0[3].Z, KC0[4].Z,
 ; EG-NEXT:     MAX_UINT * T0.X, KC0[3].Y, KC0[4].Y,
-; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T2.X, PV.X, literal.x,
 ; EG-NEXT:     MAX_UINT * T3.X, KC0[3].W, KC0[4].W,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %cmp = icmp uge <3 x i32> %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 343daeaa27cc9..6ebb6aaa4520f 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2069,9 +2069,9 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 9, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    ALU 7, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
@@ -2084,13 +2084,11 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MIN_UINT * T0.Y, T2.Y, T1.Y,
 ; EG-NEXT:     MIN_UINT T0.X, T2.X, T1.X,
+; EG-NEXT:     MIN_UINT * T1.X, T2.Z, T1.Z,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     MIN_UINT * T2.X, T2.Z, T1.Z,
+; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, T0.W, literal.x,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CI-LABEL: v_test_umin_ule_v3i32:
@@ -3390,25 +3388,23 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
 ; EG-LABEL: s_test_umin_ult_v8i32:
 ; EG:       ; %bb.0:
-; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    ALU clause starting at 4:
 ; EG-NEXT:     MIN_UINT * T0.W, KC0[5].X, KC0[7].X,
 ; EG-NEXT:     MIN_UINT * T0.Z, KC0[4].W, KC0[6].W,
 ; EG-NEXT:     MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z,
 ; EG-NEXT:     MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y,
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     MIN_UINT * T2.W, KC0[6].X, KC0[8].X,
-; EG-NEXT:     MIN_UINT * T2.Z, KC0[5].W, KC0[7].W,
-; EG-NEXT:     MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z,
-; EG-NEXT:     MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:     MIN_UINT * T1.W, KC0[6].X, KC0[8].X,
+; EG-NEXT:     MIN_UINT * T1.Z, KC0[5].W, KC0[7].W,
+; EG-NEXT:     MIN_UINT * T1.Y, KC0[5].Z, KC0[7].Z,
+; EG-NEXT:     MIN_UINT * T1.X, KC0[5].Y, KC0[7].Y,
+; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; CI-LABEL: s_test_umin_ult_v8i32:
 ; CI:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/r600.llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/r600.llvm.is.fpclass.ll
index 08039833e2a66..900aea624d433 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.llvm.is.fpclass.ll
@@ -69,27 +69,25 @@ define amdgpu_kernel void @issue135083_v2f32(ptr addrspace(1) %out, <2 x float>
 define amdgpu_kernel void @issue135083_v3f32(ptr addrspace(1) %out, <3 x float> %x) {
 ; CM-LABEL: issue135083_v3f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 15, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T1.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
 ; CM-NEXT:     LSHL * T0.W, KC0[3].W, 1,
-; CM-NEXT:     LSHL T0.Z, KC0[3].Z, 1,
 ; CM-NEXT:     SETGT_UINT * T0.W, PV.W, literal.x,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
 ; CM-NEXT:     CNDE_INT T0.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHL T0.Y, KC0[3].Y, 1,
-; CM-NEXT:     SETGT_UINT T0.Z, PV.Z, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 8(1.121039e-44)
-; CM-NEXT:     LSHR T1.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T2.Y, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T0.W, PV.Y, literal.y,
+; CM-NEXT:     LSHL * T0.W, KC0[3].Z, 1,
+; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHL T0.Z, KC0[3].Y, 1,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.W, literal.y,
 ; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
-; CM-NEXT:     CNDE_INT * T2.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T2.X, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T3.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Z, literal.y,
+; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     CNDE_INT * T3.X, PV.W, 1, 0.0,
   %result = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> %x, i32 504)
   %zext = zext <3 x i1> %result to <3 x i32>
   store <3 x i32> %zext, ptr addrspace(1) %out, align 16
@@ -131,34 +129,32 @@ define amdgpu_kernel void @issue135083_v4f32(ptr addrspace(1) %out, <4 x float>
 define amdgpu_kernel void @issue135083_v5f32(ptr addrspace(1) %out, <5 x float> %x) {
 ; CM-LABEL: issue135083_v5f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 22, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
+; CM-NEXT:    ALU 20, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     LSHL T0.Z, KC0[4].Y, 1,
+; CM-NEXT:     LSHL T0.Z, KC0[5].Y, 1,
 ; CM-NEXT:     LSHL * T0.W, KC0[5].X, 1,
 ; CM-NEXT:     SETGT_UINT T0.Y, PV.W, literal.x,
-; CM-NEXT:     LSHL T1.Z, KC0[5].Y, 1,
-; CM-NEXT:     LSHL * T0.W, KC0[4].W, 1,
+; CM-NEXT:     LSHL T1.Z, KC0[4].W, 1,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Z, literal.x,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     SETGT_UINT T0.X, PV.W, literal.x,
+; CM-NEXT:     CNDE_INT T0.X, PV.W, 1, 0.0,
 ; CM-NEXT:     LSHL T1.Y, KC0[4].Z, 1,
-; CM-NEXT:     SETGT_UINT T1.Z, PV.Z, literal.x,
+; CM-NEXT:     SETGT_UINT T0.Z, PV.Z, literal.x,
 ; CM-NEXT:     CNDE_INT * T1.W, PV.Y, 1, 0.0,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T2.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T0.Y, PV.Y, literal.x,
-; CM-NEXT:     CNDE_INT T1.Z, PV.X, 1, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T0.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T0.W, T0.Z, literal.y,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHL T0.Y, KC0[4].Y, 1,
+; CM-NEXT:     CNDE_INT T1.Z, PV.Z, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Y, literal.y,
 ; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T1.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Y, literal.y,
+; CM-NEXT:    4(5.605194e-45), -16777217(-1.701412e+38)
 ; CM-NEXT:     CNDE_INT * T1.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %result = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> %x, i32 504)
   %zext = zext <5 x i1> %result to <5 x i32>
   store <5 x i32> %zext, ptr addrspace(1) %out, align 32
@@ -168,37 +164,36 @@ define amdgpu_kernel void @issue135083_v5f32(ptr addrspace(1) %out, <5 x float>
 define amdgpu_kernel void @issue135083_v6f32(ptr addrspace(1) %out, <6 x float> %x) {
 ; CM-LABEL: issue135083_v6f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 25, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 24, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T0.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     LSHL T0.Z, KC0[4].Y, 1,
 ; CM-NEXT:     LSHL * T0.W, KC0[5].Z, 1,
-; CM-NEXT:     LSHL T1.Z, KC0[5].X, 1,
-; CM-NEXT:     LSHL * T1.W, KC0[4].W, 1,
+; CM-NEXT:     LSHL T0.Y, KC0[5].Y, 1,
+; CM-NEXT:     SETGT_UINT T0.Z, PV.W, literal.x,
+; CM-NEXT:     LSHL * T0.W, KC0[5].X, 1,
+; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
 ; CM-NEXT:     SETGT_UINT T0.X, PV.W, literal.x,
-; CM-NEXT:     SETGT_UINT T0.Y, PV.Z, literal.x,
-; CM-NEXT:     LSHL T1.Z, KC0[5].Y, 1,
-; CM-NEXT:     SETGT_UINT * T0.W, T0.W, literal.x,
+; CM-NEXT:     CNDE_INT T1.Y, PV.Z, 1, 0.0,
+; CM-NEXT:     LSHL T0.Z, KC0[4].W, 1,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Y, literal.x,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T1.X, KC0[4].Z, 1,
-; CM-NEXT:     CNDE_INT T2.Y, PV.W, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T1.Z, PV.Z, literal.x,
-; CM-NEXT:     CNDE_INT * T1.W, PV.Y, 1, 0.0,
+; CM-NEXT:     CNDE_INT T1.X, PV.W, 1, 0.0,
+; CM-NEXT:     LSHL T0.Y, KC0[4].Z, 1,
+; CM-NEXT:     SETGT_UINT T0.Z, PV.Z, literal.x,
+; CM-NEXT:     CNDE_INT * T0.W, PV.X, 1, 0.0,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T2.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T0.Y, PV.X, literal.x,
-; CM-NEXT:     CNDE_INT T1.Z, T0.X, 1, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T0.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T0.W, T0.Z, literal.y,
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHL T2.Y, KC0[4].Y, 1,
+; CM-NEXT:     CNDE_INT T0.Z, PV.Z, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T1.W, PV.Y, literal.y,
 ; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
-; CM-NEXT:     CNDE_INT * T1.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T0.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T1.W, PV.Y, literal.y,
+; CM-NEXT:    4(5.605194e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     CNDE_INT * T0.X, PV.W, 1, 0.0,
   %result = call <6 x i1> @llvm.is.fpclass.v6f32(<6 x float> %x, i32 504)
   %zext = zext <6 x i1> %result to <6 x i32>
   store <6 x i32> %zext, ptr addrspace(1) %out, align 32
@@ -208,46 +203,43 @@ define amdgpu_kernel void @issue135083_v6f32(ptr addrspace(1) %out, <6 x float>
 define amdgpu_kernel void @issue135083_v7f32(ptr addrspace(1) %out, <7 x float> %x) {
 ; CM-LABEL: issue135083_v7f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 32, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T0.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
+; CM-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T1.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T5.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    ALU clause starting at 6:
-; CM-NEXT:     LSHL T0.Z, KC0[4].Y, 1,
-; CM-NEXT:     LSHL * T0.W, KC0[4].W, 1,
-; CM-NEXT:     SETGT_UINT T0.Y, PV.W, literal.x,
-; CM-NEXT:     LSHL T1.Z, KC0[5].W, 1,
-; CM-NEXT:     LSHL * T0.W, KC0[5].X, 1,
+; CM-NEXT:     LSHL * T0.W, KC0[5].W, 1,
+; CM-NEXT:     LSHL T0.Y, KC0[4].Z, 1,
+; CM-NEXT:     LSHL T0.Z, KC0[4].W, 1,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.W, literal.x,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T0.X, KC0[4].Z, 1,
-; CM-NEXT:     SETGT_UINT T1.Y, PV.W, literal.x,
-; CM-NEXT:     LSHL T2.Z, KC0[5].Z, 1,
-; CM-NEXT:     SETGT_UINT * T0.W, PV.Z, literal.x,
+; CM-NEXT:     CNDE_INT T0.X, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT T1.Y, PV.Z, literal.x,
+; CM-NEXT:     LSHL T0.Z, KC0[5].Z, 1,
+; CM-NEXT:     LSHL * T0.W, KC0[5].X, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T1.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHL T2.Y, KC0[5].Y, 1,
-; CM-NEXT:     SETGT_UINT T1.Z, PV.Z, literal.x,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 24(3.363116e-44)
-; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T3.Y, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T1.Z, PV.Y, literal.y,
-; CM-NEXT:     CNDE_INT * T4.W, T1.Y, 1, 0.0,
+; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; CM-NEXT:     SETGT_UINT T2.Y, PV.W, literal.y,
+; CM-NEXT:     LSHL T1.Z, KC0[5].Y, 1,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Z, literal.y,
 ; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     ADD_INT T2.X, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T3.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT T0.Z, PV.Z, literal.y,
+; CM-NEXT:     CNDE_INT * T4.W, PV.Y, 1, 0.0,
+; CM-NEXT:    6(8.407791e-45), -16777217(-1.701412e+38)
 ; CM-NEXT:     CNDE_INT T3.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T1.Y, T0.X, literal.x,
-; CM-NEXT:     CNDE_INT T4.Z, T0.Y, 1, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T0.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T4.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T0.W, T0.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     LSHL T2.Y, KC0[4].Y, 1,
+; CM-NEXT:     CNDE_INT T4.Z, T1.Y, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, T0.Y, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T5.X, T1.X, literal.x,
+; CM-NEXT:     CNDE_INT T4.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Y, literal.y,
+; CM-NEXT:    4(5.605194e-45), -16777217(-1.701412e+38)
 ; CM-NEXT:     CNDE_INT * T4.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %result = call <7 x i1> @llvm.is.fpclass.v7f32(<7 x float> %x, i32 504)
   %zext = zext <7 x i1> %result to <7 x i32>
   store <7 x i32> %zext, ptr addrspace(1) %out, align 32
@@ -257,45 +249,44 @@ define amdgpu_kernel void @issue135083_v7f32(ptr addrspace(1) %out, <7 x float>
 define amdgpu_kernel void @issue135083_v8f32(ptr addrspace(1) %out, <8 x float> %x) {
 ; CM-LABEL: issue135083_v8f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 33, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
+; CM-NEXT:    ALU 32, @4, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 4:
-; CM-NEXT:     LSHL T0.Z, KC0[6].X, 1,
-; CM-NEXT:     LSHL * T0.W, KC0[4].W, 1,
-; CM-NEXT:     LSHL T0.X, KC0[4].Y, 1,
-; CM-NEXT:     SETGT_UINT T0.Y, PV.W, literal.x,
-; CM-NEXT:     SETGT_UINT T0.Z, PV.Z, literal.x,
+; CM-NEXT:     LSHL * T0.W, KC0[6].X, 1,
+; CM-NEXT:     SETGT_UINT T0.Z, PV.W, literal.x,
 ; CM-NEXT:     LSHL * T0.W, KC0[5].W, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T1.X, KC0[5].Z, 1,
-; CM-NEXT:     SETGT_UINT T1.Y, PV.W, literal.x,
-; CM-NEXT:     LSHL T1.Z, KC0[5].X, 1,
-; CM-NEXT:     CNDE_INT * T1.W, PV.Z, 1, 0.0,
-; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     SETGT_UINT T2.X, PV.Z, literal.x,
-; CM-NEXT:     LSHL T2.Y, KC0[5].Y, 1,
-; CM-NEXT:     CNDE_INT T1.Z, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T0.W, PV.X, literal.x,
+; CM-NEXT:     SETGT_UINT T0.Y, PV.W, literal.x,
+; CM-NEXT:     LSHL T1.Z, KC0[5].Z, 1,
+; CM-NEXT:     CNDE_INT * T0.W, PV.Z, 1, 0.0,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T3.X, KC0[4].Z, 1,
-; CM-NEXT:     CNDE_INT T1.Y, PV.W, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T0.Z, PV.Y, literal.x,
-; CM-NEXT:     CNDE_INT * T0.W, PV.X, 1, 0.0,
+; CM-NEXT:     LSHL T0.X, KC0[5].Y, 1,
+; CM-NEXT:     SETGT_UINT T1.Y, PV.Z, literal.x,
+; CM-NEXT:     CNDE_INT T0.Z, PV.Y, 1, 0.0,
+; CM-NEXT:     LSHL * T1.W, KC0[5].X, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T1.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T2.Y, PV.X, literal.x,
-; CM-NEXT:     CNDE_INT T0.Z, T0.Y, 1, 0.0,
-; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
+; CM-NEXT:     SETGT_UINT T1.X, PV.W, literal.x,
 ; CM-NEXT:     CNDE_INT T0.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T2.W, T0.X, literal.y,
+; CM-NEXT:     LSHL T1.Z, KC0[4].W, 1,
+; CM-NEXT:     SETGT_UINT * T1.W, PV.X, literal.x,
+; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     CNDE_INT T0.X, PV.W, 1, 0.0,
+; CM-NEXT:     LSHL T1.Y, KC0[4].Z, 1,
+; CM-NEXT:     SETGT_UINT T1.Z, PV.Z, literal.x,
+; CM-NEXT:     CNDE_INT * T1.W, PV.X, 1, 0.0,
+; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHL T2.Y, KC0[4].Y, 1,
+; CM-NEXT:     CNDE_INT T1.Z, PV.Z, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T2.W, PV.Y, literal.y,
 ; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
-; CM-NEXT:     CNDE_INT * T0.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T3.X, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T1.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T2.W, PV.Y, literal.y,
+; CM-NEXT:    4(5.605194e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     CNDE_INT * T1.X, PV.W, 1, 0.0,
   %result = call <8 x i1> @llvm.is.fpclass.v3f32(<8 x float> %x, i32 504)
   %zext = zext <8 x i1> %result to <8 x i32>
   store <8 x i32> %zext, ptr addrspace(1) %out, align 32
@@ -305,82 +296,78 @@ define amdgpu_kernel void @issue135083_v8f32(ptr addrspace(1) %out, <8 x float>
 define amdgpu_kernel void @issue135083_v16f32(ptr addrspace(1) %out, <16 x float> %x) {
 ; CM-LABEL: issue135083_v16f32:
 ; CM:       ; %bb.0:
-; CM-NEXT:    ALU 68, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 64, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T7.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T0.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T6.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T6.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T5.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    ALU clause starting at 6:
-; CM-NEXT:     LSHL T0.Z, KC0[6].Y, 1,
 ; CM-NEXT:     LSHL * T0.W, KC0[6].W, 1,
-; CM-NEXT:     SETGT_UINT T0.X, PV.W, literal.x,
 ; CM-NEXT:     LSHL T0.Y, KC0[6].Z, 1,
-; CM-NEXT:     LSHL T1.Z, KC0[7].Y, 1,
+; CM-NEXT:     SETGT_UINT T0.Z, PV.W, literal.x,
 ; CM-NEXT:     LSHL * T0.W, KC0[7].X, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     SETGT_UINT T1.X, PV.W, literal.x,
+; CM-NEXT:     SETGT_UINT T0.X, PV.W, literal.x,
 ; CM-NEXT:     LSHL T1.Y, KC0[7].Z, 1,
-; CM-NEXT:     LSHL T2.Z, KC0[8].X, 1,
+; CM-NEXT:     LSHL T1.Z, KC0[10].X, 1,
 ; CM-NEXT:     LSHL * T0.W, KC0[7].W, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     SETGT_UINT T2.X, PV.W, literal.x,
+; CM-NEXT:     SETGT_UINT T1.X, PV.W, literal.x,
 ; CM-NEXT:     SETGT_UINT T2.Y, PV.Z, literal.x,
-; CM-NEXT:     LSHL T2.Z, KC0[10].X, 1,
-; CM-NEXT:     LSHL * T0.W, KC0[8].W, 1,
+; CM-NEXT:     LSHL T1.Z, KC0[9].W, 1,
+; CM-NEXT:     LSHL * T0.W, KC0[8].X, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T3.X, KC0[8].Y, 1,
-; CM-NEXT:     SETGT_UINT T3.Y, PV.W, literal.x,
-; CM-NEXT:     SETGT_UINT T2.Z, PV.Z, literal.x,
-; CM-NEXT:     LSHL * T0.W, KC0[9].W, 1,
+; CM-NEXT:     SETGT_UINT T2.X, PV.W, literal.x,
+; CM-NEXT:     SETGT_UINT T3.Y, PV.Z, literal.x,
+; CM-NEXT:     LSHL T1.Z, KC0[9].Z, 1,
+; CM-NEXT:     CNDE_INT * T3.W, PV.Y, 1, 0.0,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T4.X, KC0[9].Z, 1,
-; CM-NEXT:     SETGT_UINT T4.Y, PV.W, literal.x,
-; CM-NEXT:     LSHL T3.Z, KC0[9].X, 1,
-; CM-NEXT:     CNDE_INT * T4.W, PV.Z, 1, 0.0,
+; CM-NEXT:     LSHL T3.X, KC0[9].Y, 1,
+; CM-NEXT:     SETGT_UINT T2.Y, PV.Z, literal.x,
+; CM-NEXT:     CNDE_INT T3.Z, PV.Y, 1, 0.0,
+; CM-NEXT:     LSHL * T0.W, KC0[9].X, 1,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     SETGT_UINT T5.X, PV.Z, literal.x,
-; CM-NEXT:     LSHL T5.Y, KC0[9].Y, 1,
-; CM-NEXT:     CNDE_INT T4.Z, PV.Y, 1, 0.0,
+; CM-NEXT:     SETGT_UINT T4.X, PV.W, literal.x,
+; CM-NEXT:     CNDE_INT T3.Y, PV.Y, 1, 0.0,
+; CM-NEXT:     LSHL T1.Z, KC0[8].W, 1,
 ; CM-NEXT:     SETGT_UINT * T0.W, PV.X, literal.x,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     LSHL T6.X, KC0[8].Z, 1,
-; CM-NEXT:     CNDE_INT T4.Y, PV.W, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T2.Z, PV.Y, literal.x,
-; CM-NEXT:     CNDE_INT * T3.W, PV.X, 1, 0.0,
+; CM-NEXT:     CNDE_INT T3.X, PV.W, 1, 0.0,
+; CM-NEXT:     LSHL T2.Y, KC0[8].Z, 1,
+; CM-NEXT:     SETGT_UINT T1.Z, PV.Z, literal.x,
+; CM-NEXT:     CNDE_INT * T2.W, PV.X, 1, 0.0,
 ; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
-; CM-NEXT:     CNDE_INT T4.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T5.Y, PV.X, literal.x,
-; CM-NEXT:     CNDE_INT T3.Z, T3.Y, 1, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 48(6.726233e-44)
-; CM-NEXT:     LSHR T5.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T3.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T2.Z, T3.X, literal.y,
-; CM-NEXT:     CNDE_INT * T2.W, T2.Y, 1, 0.0,
-; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
-; CM-NEXT:     CNDE_INT T3.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T1.Y, T1.Y, literal.x,
-; CM-NEXT:     CNDE_INT T2.Z, T2.X, 1, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 32(4.484155e-44)
-; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T2.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T1.Z, T1.Z, literal.y,
-; CM-NEXT:     CNDE_INT * T1.W, T1.X, 1, 0.0,
+; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
+; CM-NEXT:     LSHL T4.Y, KC0[8].Y, 1,
+; CM-NEXT:     CNDE_INT T2.Z, PV.Z, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, PV.Y, literal.y,
 ; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     ADD_INT T5.X, PV.X, literal.x,
+; CM-NEXT:     CNDE_INT T2.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT T1.Z, PV.Y, literal.y,
+; CM-NEXT:     CNDE_INT * T1.W, T2.X, 1, 0.0,
+; CM-NEXT:    12(1.681558e-44), -16777217(-1.701412e+38)
 ; CM-NEXT:     CNDE_INT T2.X, PV.Z, 1, 0.0,
-; CM-NEXT:     SETGT_UINT T0.Y, T0.Y, literal.x,
-; CM-NEXT:     CNDE_INT T1.Z, T0.X, 1, 0.0,
-; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
-; CM-NEXT:    -16777217(-1.701412e+38), 16(2.242078e-44)
-; CM-NEXT:     LSHR T0.X, PV.W, literal.x,
-; CM-NEXT:     CNDE_INT T1.Y, PV.Y, 1, 0.0,
-; CM-NEXT:     SETGT_UINT * T0.W, T0.Z, literal.y,
-; CM-NEXT:    2(2.802597e-45), -16777217(-1.701412e+38)
-; CM-NEXT:     CNDE_INT * T1.X, PV.W, 1, 0.0,
-; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
-; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT:     LSHL T4.Y, KC0[7].Y, 1,
+; CM-NEXT:     CNDE_INT T1.Z, T1.X, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T0.W, T1.Y, literal.x,
+; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T6.X, T4.X, literal.x,
+; CM-NEXT:     CNDE_INT T1.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT T4.Z, PV.Y, literal.y,
+; CM-NEXT:     CNDE_INT * T0.W, T0.X, 1, 0.0, BS:VEC_120/SCL_212
+; CM-NEXT:    8(1.121039e-44), -16777217(-1.701412e+38)
+; CM-NEXT:     CNDE_INT T1.X, PV.Z, 1, 0.0,
+; CM-NEXT:     LSHL T4.Y, KC0[6].Y, 1,
+; CM-NEXT:     CNDE_INT T0.Z, T0.Z, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T4.W, T0.Y, literal.x,
+; CM-NEXT:    -16777217(-1.701412e+38), 0(0.000000e+00)
+; CM-NEXT:     ADD_INT T7.X, T4.X, literal.x,
+; CM-NEXT:     CNDE_INT T0.Y, PV.W, 1, 0.0,
+; CM-NEXT:     SETGT_UINT * T4.W, PV.Y, literal.y,
+; CM-NEXT:    4(5.605194e-45), -16777217(-1.701412e+38)
+; CM-NEXT:     CNDE_INT * T0.X, PV.W, 1, 0.0,
   %result = call <16 x i1> @llvm.is.fpclass.v3f32(<16 x float> %x, i32 504)
   %zext = zext <16 x i1> %result to <16 x i32>
   store <16 x i32> %zext, ptr addrspace(1) %out, align 64
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 71c7797cbc68e..581312dd3e73f 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -312,25 +312,23 @@ entry:
 define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) {
 ; R600-LABEL: rotr_v8i32:
 ; R600:       ; %bb.0: ; %entry
-; R600-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
-; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X,
 ; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W,
 ; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z,
 ; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y,
-; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
-; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; R600-NEXT:     BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X,
-; R600-NEXT:     BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W,
-; R600-NEXT:     BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z,
-; R600-NEXT:     BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y,
-; R600-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; R600-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; R600-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[6].X, KC0[6].X, KC0[8].X,
+; R600-NEXT:     BIT_ALIGN_INT * T1.Z, KC0[5].W, KC0[5].W, KC0[7].W,
+; R600-NEXT:     BIT_ALIGN_INT * T1.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z,
+; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[5].Y, KC0[5].Y, KC0[7].Y,
+; R600-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; R600-NEXT:     ADD_INT * T3.X, PV.X, literal.x,
+; R600-NEXT:    4(5.605194e-45), 0(0.000000e+00)
 ;
 ; SI-LABEL: rotr_v8i32:
 ; SI:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index acf999e586a68..7278247d9b45f 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -956,67 +956,65 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @6
-; EG-NEXT:    ALU 48, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
+; EG-NEXT:    ALU 46, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
-; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
 ; EG-NEXT:    ALU clause starting at 14:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 15:
-; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
+; EG-NEXT:     LSHR T0.Y, T1.W, 1,
+; EG-NEXT:     NOT_INT T4.Z, T0.Z,
+; EG-NEXT:     BIT_ALIGN_INT T0.W, T1.W, T1.Z, 1,
+; EG-NEXT:     AND_INT * T1.W, T3.Z, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHL * T1.W, T0.Z, PV.W,
-; EG-NEXT:     AND_INT T4.X, T1.Z, literal.x,
-; EG-NEXT:     LSHR T1.Y, T3.W, 1,
-; EG-NEXT:     NOT_INT T4.Z, T2.Z, BS:VEC_201
-; EG-NEXT:     BIT_ALIGN_INT T2.W, T3.W, T3.Z, 1,
-; EG-NEXT:     AND_INT * T3.W, T2.Z, literal.y,
-; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
-; EG-NEXT:     LSHL T5.X, T3.Z, PS,
-; EG-NEXT:     AND_INT T2.Y, T2.Z, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.Y, PV.W, PV.Z,
-; EG-NEXT:     LSHR T2.W, T3.Y, 1,
-; EG-NEXT:     NOT_INT * T3.W, T2.X,
+; EG-NEXT:     LSHL T4.X, T2.Z, PS,
+; EG-NEXT:     AND_INT T3.Y, T3.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHR T5.Z, T2.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
+; EG-NEXT:     NOT_INT * T2.W, T3.Z,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T6.X, T3.Y, T3.X, 1,
-; EG-NEXT:     AND_INT T1.Y, T2.X, literal.x,
-; EG-NEXT:     LSHR T3.Z, T0.W, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
-; EG-NEXT:     NOT_INT * T4.W, T1.Z,
+; EG-NEXT:     BIT_ALIGN_INT T5.X, PV.Z, PV.W, PS,
+; EG-NEXT:     LSHR T4.Y, T2.Y, 1,
+; EG-NEXT:     NOT_INT T2.Z, T3.X,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, T2.Y, T2.X, 1,
+; EG-NEXT:     AND_INT * T2.W, T3.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T7.X, PV.Z, PV.W, PS,
-; EG-NEXT:     LSHL T1.Y, T3.X, PV.Y, BS:VEC_120/SCL_212
-; EG-NEXT:     AND_INT T0.Z, T2.X, literal.x, BS:VEC_201
-; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.W, PV.X, T3.W,
-; EG-NEXT:     CNDE_INT * T3.W, T2.Y, T2.Z, T5.X,
+; EG-NEXT:     AND_INT T6.X, T0.Z, literal.x,
+; EG-NEXT:     LSHL T2.Y, T2.X, PS, BS:VEC_120/SCL_212
+; EG-NEXT:     AND_INT T3.Z, T3.X, literal.y, BS:VEC_201
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PV.Y, PV.W, PV.Z,
+; EG-NEXT:     CNDE_INT * T4.W, T3.Y, PV.X, T4.X,
+; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
+; EG-NEXT:     CNDE_INT T4.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     LSHL T1.Z, T1.Z, PV.X,
+; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.W, T4.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T2.X, T0.Y, 1,
-; EG-NEXT:     CNDE_INT T3.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     NOT_INT T1.Z, T1.X,
-; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
-; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
+; EG-NEXT:     LSHR T2.X, T1.Y, 1,
+; EG-NEXT:     NOT_INT T0.Y, T0.X, BS:VEC_201
+; EG-NEXT:     CNDE_INT T4.Z, T3.Y, T4.X, 0.0, BS:VEC_102/SCL_221
+; EG-NEXT:     BIT_ALIGN_INT T2.W, T1.Y, T1.X, 1,
+; EG-NEXT:     AND_INT * T3.W, T0.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHL T0.X, T0.X, PS,
-; EG-NEXT:     AND_INT T0.Y, T1.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT T3.Z, T2.Y, T5.X, 0.0, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT * T0.W, PV.X, PV.W, PV.Z,
+; EG-NEXT:     CNDE_INT T4.X, T3.Z, T2.Y, 0.0, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHL T1.Y, T1.X, PS,
+; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     BIT_ALIGN_INT T2.W, PV.X, PV.W, PV.Y,
+; EG-NEXT:     CNDE_INT * T3.W, T1.W, T0.W, T1.Z,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T2.W, T4.X, T7.X, T1.W,
-; EG-NEXT:     CNDE_INT T3.X, T0.Z, T1.Y, 0.0,
-; EG-NEXT:     CNDE_INT T2.Y, T0.Y, T0.W, T0.X,
-; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     CNDE_INT T2.Z, T4.X, T1.W, 0.0,
-; EG-NEXT:     CNDE_INT * T2.X, T0.Y, T0.X, 0.0,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     CNDE_INT * T3.Y, PV.Z, PV.W, PV.Y,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     CNDE_INT T3.Z, T1.W, T1.Z, 0.0,
+; EG-NEXT:     CNDE_INT * T3.X, T0.Z, T1.Y, 0.0,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
   %a = load <4 x i64>, ptr addrspace(1) %in
   %b = load <4 x i64>, ptr addrspace(1) %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 4cf74a9ba96a8..c7e7927c37da3 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -648,9 +648,9 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @6
-; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT:    ALU 38, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
@@ -687,19 +687,18 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
-; EG-NEXT:     ASHR T6.Y, T3.W, literal.x,
-; EG-NEXT:     ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
-; EG-NEXT:     ADD_INT T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:     ASHR T3.Z, T3.W, literal.x,
+; EG-NEXT:     ASHR T3.W, T0.Y, literal.x, BS:VEC_201
 ; EG-NEXT:     CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
-; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.W,
 ; EG-NEXT:     ASHR T3.W, T3.Y, literal.y,
-; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
+; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Z,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT T3.X, PV.X, literal.x,
 ; EG-NEXT:     CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
   %a = load <4 x i64>, ptr addrspace(1) %in
   %b = load <4 x i64>, ptr addrspace(1) %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 089c0b2d46578..90345993de473 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -6691,2217 +6691,2213 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:    ALU 114, @35, KC0[], KC1[]
 ; EG-NEXT:    ALU 115, @150, KC0[], KC1[]
 ; EG-NEXT:    ALU 115, @266, KC0[], KC1[]
-; EG-NEXT:    ALU 111, @382, KC0[], KC1[]
+; EG-NEXT:    ALU 109, @382, KC0[], KC1[]
 ; EG-NEXT:    TEX 1 @30
-; EG-NEXT:    ALU 114, @494, KC0[], KC1[]
-; EG-NEXT:    ALU 113, @609, KC0[], KC1[]
-; EG-NEXT:    ALU 114, @723, KC0[], KC1[]
-; EG-NEXT:    ALU 113, @838, KC0[], KC1[]
-; EG-NEXT:    ALU 114, @952, KC0[], KC1[]
-; EG-NEXT:    ALU 113, @1067, KC0[], KC1[]
-; EG-NEXT:    ALU 114, @1181, KC0[], KC1[]
-; EG-NEXT:    ALU 113, @1296, KC0[], KC1[]
-; EG-NEXT:    ALU 114, @1410, KC0[], KC1[]
-; EG-NEXT:    ALU 114, @1525, KC0[], KC1[]
-; EG-NEXT:    ALU 114, @1640, KC0[], KC1[]
-; EG-NEXT:    ALU 115, @1755, KC0[], KC1[]
-; EG-NEXT:    ALU 113, @1871, KC0[], KC1[]
-; EG-NEXT:    ALU 112, @1985, KC0[], KC1[]
-; EG-NEXT:    ALU 99, @2098, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
+; EG-NEXT:    ALU 114, @492, KC0[], KC1[]
+; EG-NEXT:    ALU 112, @607, KC0[], KC1[]
+; EG-NEXT:    ALU 113, @720, KC0[], KC1[]
+; EG-NEXT:    ALU 114, @834, KC0[], KC1[]
+; EG-NEXT:    ALU 114, @949, KC0[], KC1[]
+; EG-NEXT:    ALU 113, @1064, KC0[], KC1[]
+; EG-NEXT:    ALU 114, @1178, KC0[], KC1[]
+; EG-NEXT:    ALU 113, @1293, KC0[], KC1[]
+; EG-NEXT:    ALU 114, @1407, KC0[], KC1[]
+; EG-NEXT:    ALU 113, @1522, KC0[], KC1[]
+; EG-NEXT:    ALU 114, @1636, KC0[], KC1[]
+; EG-NEXT:    ALU 113, @1751, KC0[], KC1[]
+; EG-NEXT:    ALU 112, @1865, KC0[], KC1[]
+; EG-NEXT:    ALU 114, @1978, KC0[], KC1[]
+; EG-NEXT:    ALU 100, @2093, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
 ; EG-NEXT:    Fetch clause starting at 26:
-; EG-NEXT:     VTX_READ_128 T1.XYZW, T2.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T0.XYZW, T2.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
 ; EG-NEXT:    Fetch clause starting at 30:
-; EG-NEXT:     VTX_READ_128 T9.XYZW, T2.X, 16, #1
-; EG-NEXT:     VTX_READ_128 T10.XYZW, T2.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T8.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T9.XYZW, T0.X, 48, #1
 ; EG-NEXT:    ALU clause starting at 34:
-; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 35:
-; EG-NEXT:     ASHR * T3.W, T1.Y, literal.x,
+; EG-NEXT:     ASHR * T3.W, T2.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T2.W, T1.X, PV.W,
-; EG-NEXT:     XOR_INT * T7.W, PV.W, T3.W,
-; EG-NEXT:     SUB_INT T2.Z, 0.0, PV.W,
-; EG-NEXT:     ASHR T2.W, T0.Y, literal.x,
-; EG-NEXT:     RECIP_UINT * T2.Y, PV.W,
+; EG-NEXT:     ADD_INT * T0.W, T2.Z, PV.W,
+; EG-NEXT:     XOR_INT * T6.W, PV.W, T3.W,
+; EG-NEXT:     SUB_INT T0.Z, 0.0, PV.W,
+; EG-NEXT:     ASHR T0.W, T1.W, literal.x,
+; EG-NEXT:     RECIP_UINT * T0.Y, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T3.Z, T0.Y, PV.W,
-; EG-NEXT:     ADDC_UINT T4.W, T0.X, PV.W,
-; EG-NEXT:     MULLO_INT * T0.Y, PV.Z, PS,
-; EG-NEXT:     ADD_INT T4.W, PV.Z, PV.W,
-; EG-NEXT:     MULHI * T0.Y, T2.Y, PS,
-; EG-NEXT:     ADD_INT T5.W, T2.Y, PS,
-; EG-NEXT:     XOR_INT * T4.W, PV.W, T2.W,
+; EG-NEXT:     ADD_INT T3.Z, T1.W, PV.W,
+; EG-NEXT:     ADDC_UINT T1.W, T1.Z, PV.W,
+; EG-NEXT:     MULLO_INT * T0.Z, PV.Z, PS,
+; EG-NEXT:     ADD_INT T1.W, PV.Z, PV.W,
+; EG-NEXT:     MULHI * T0.Z, T0.Y, PS,
+; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
+; EG-NEXT:     XOR_INT * T1.W, PV.W, T0.W,
 ; EG-NEXT:     MULHI * T0.Y, PS, PV.W,
-; EG-NEXT:     MULLO_INT * T0.Y, PS, T7.W,
-; EG-NEXT:     SUB_INT * T5.W, T4.W, PS,
-; EG-NEXT:     SETGE_UINT T6.W, PV.W, T7.W,
-; EG-NEXT:     SUB_INT * T8.W, PV.W, T7.W,
-; EG-NEXT:     CNDE_INT T2.Z, PV.W, T5.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     ADD_INT T5.W, T1.Y, T3.W,
-; EG-NEXT:     ADDC_UINT * T6.W, T1.X, T3.W,
-; EG-NEXT:     ADD_INT T3.Z, PV.W, PS,
-; EG-NEXT:     SETGE_UINT T5.W, PV.Z, T7.W,
-; EG-NEXT:     SUB_INT * T6.W, PV.Z, T7.W,
-; EG-NEXT:     ADD_INT T4.Z, T0.X, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T5.W, PV.W, T2.Z, PS,
-; EG-NEXT:     XOR_INT * T6.W, PV.Z, T3.W,
-; EG-NEXT:     CNDE_INT T3.W, PS, PV.W, T4.W,
-; EG-NEXT:     XOR_INT * T8.W, PV.Z, T2.W,
-; EG-NEXT:     BIT_ALIGN_INT T4.W, PV.W, PS, literal.x,
-; EG-NEXT:     LSHR * T3.W, PV.W, literal.x,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETE_INT T2.Z, PS, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.W, PS, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T7.W,
+; EG-NEXT:     MULLO_INT * T0.Y, PS, T6.W,
+; EG-NEXT:     SUB_INT * T4.W, T1.W, PS,
+; EG-NEXT:     SETGE_UINT T5.W, PV.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, PV.W, T6.W,
+; EG-NEXT:     CNDE_INT T0.Z, PV.W, T4.W, PS, BS:VEC_021/SCL_122
+; EG-NEXT:     ADD_INT T2.W, T2.W, T3.W,
+; EG-NEXT:     ADDC_UINT * T4.W, T2.Z, T3.W,
+; EG-NEXT:     ADD_INT T2.Z, PV.W, PS,
+; EG-NEXT:     SETGE_UINT T2.W, PV.Z, T6.W,
+; EG-NEXT:     SUB_INT * T4.W, PV.Z, T6.W,
+; EG-NEXT:     ADD_INT T1.Z, T1.Z, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T2.W, PV.W, T0.Z, PS,
+; EG-NEXT:     XOR_INT * T4.W, PV.Z, T3.W,
+; EG-NEXT:     CNDE_INT T1.W, PS, PV.W, T1.W,
+; EG-NEXT:     XOR_INT * T5.W, PV.Z, T0.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.W, PV.W, PS, literal.x,
+; EG-NEXT:     LSHR * T1.W, PV.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETE_INT T0.Z, PS, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T3.W, PS, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T6.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.Z, PV.W, PS,
-; EG-NEXT:     SUB_INT * T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T5.W, T3.W, T6.W,
-; EG-NEXT:     SUBB_UINT * T9.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT T5.W, T5.W, PV.W,
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT * T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T3.W, T1.W, T4.W,
+; EG-NEXT:     SUBB_UINT * T7.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT T3.W, T3.W, PV.W,
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    26(3.643376e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT * T2.Z, PS, T7.W,
+; EG-NEXT:     SETGE_UINT * T0.Z, PS, T6.W,
 ; EG-NEXT:    ALU clause starting at 150:
-; EG-NEXT:     SETE_INT T5.W, T3.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, T3.W, T6.W,
-; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, T2.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SETE_INT T3.W, T1.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, T1.W, T4.W,
+; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, T0.Z,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    25(3.503246e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    22(3.082857e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    21(2.942727e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T2.Z, T4.W, T7.W,
+; EG-NEXT:     SUB_INT * T0.Z, T2.W, T6.W,
 ; EG-NEXT:    ALU clause starting at 266:
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    18(2.522337e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    17(2.382207e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
 ; EG-NEXT:    ALU clause starting at 382:
-; EG-NEXT:     LSHL T2.Z, T4.W, 1,
-; EG-NEXT:     BFE_UINT * T9.W, T8.W, literal.x, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     LSHL T0.Z, T2.W, 1,
+; EG-NEXT:     BFE_UINT * T7.W, T5.W, literal.x, 1, BS:VEC_120/SCL_212
 ; EG-NEXT:    11(1.541428e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, T5.W,
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T4.W, T2.Z, T9.W,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PV.W, T2.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T2.W, T0.Z, T7.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T9.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T9.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:    ALU clause starting at 494:
-; EG-NEXT:     LSHL T2.Z, T4.W, 1,
-; EG-NEXT:     BFE_UINT * T11.W, T8.W, literal.x, 1, BS:VEC_120/SCL_212
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:    ALU clause starting at 492:
+; EG-NEXT:     SUB_INT T3.W, T7.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, T5.W,
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T4.W, T2.Z, T11.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T11.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T2.Z, PS, 1,
-; EG-NEXT:     BFE_UINT T11.W, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T3.W, T0.Y, T3.W, PV.W,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T0.Z, PS, 1,
+; EG-NEXT:     BFE_UINT T7.W, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T1.W, PV.W,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T3.W, PS, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T4.W, PV.Z, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T1.W, PS, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T2.W, PV.Z, PV.W,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.Z, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.W, PV.W, T6.W,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.W, T6.W,
+; EG-NEXT:     SETGE_UINT T0.Z, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.W, PV.W, T4.W,
+; EG-NEXT:     SETGE_UINT * T7.W, PV.W, T4.W,
 ; EG-NEXT:     CNDE_INT T0.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Z, T4.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T5.W, T4.W, T7.W,
-; EG-NEXT:     SUB_INT * T11.W, T3.W, T6.W,
-; EG-NEXT:     SUB_INT T5.W, PV.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T4.W, T0.Y, T4.W, T2.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1,
-; EG-NEXT:     CNDE_INT T3.W, T0.Y, T3.W, PV.W,
-; EG-NEXT:     ASHR * T11.W, T10.Y, literal.y,
+; EG-NEXT:     SUB_INT T0.Z, T2.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T3.W, T2.W, T6.W,
+; EG-NEXT:     SUB_INT * T7.W, T1.W, T4.W,
+; EG-NEXT:     SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T2.W, T0.Y, T2.W, T0.Z,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T0.Z, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T1.W, T0.Y, T1.W, PV.W,
+; EG-NEXT:     ASHR * T7.W, T9.W, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-; EG-NEXT:     ADD_INT T3.Z, T10.X, PS,
-; EG-NEXT:     BIT_ALIGN_INT T5.W, PV.W, T4.W, literal.x,
-; EG-NEXT:     OR_INT * T12.W, PV.Y, PV.Z,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T0.Y, PS, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T2.Z, PV.W, T6.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T3.W, PV.W, T6.W, BS:VEC_102/SCL_221
-; EG-NEXT:     XOR_INT * T4.W, PV.Z, T11.W,
-; EG-NEXT:     SUB_INT T13.W, 0.0, PS,
-; EG-NEXT:     CNDE_INT * T14.W, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T0.X, T12.W, T7.W,
-; EG-NEXT:     SUBB_UINT * T0.Y, T12.W, T7.W,
-; EG-NEXT:     SUB_INT T2.Z, T5.W, T6.W,
-; EG-NEXT:     ASHR T3.W, T9.Y, literal.x,
-; EG-NEXT:     RECIP_UINT * T1.X, T4.W,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T1.Y, T9.Y, PV.W,
-; EG-NEXT:     SUB_INT T2.Z, PV.Z, T0.Y,
-; EG-NEXT:     CNDE_INT T12.W, T14.W, T12.W, T0.X,
-; EG-NEXT:     MULLO_INT * T0.X, T13.W, PS,
-; EG-NEXT:     ADDC_UINT T2.X, T9.X, T3.W,
-; EG-NEXT:     LSHL T0.Y, PV.W, 1,
-; EG-NEXT:     BFE_UINT * T3.Z, T8.W, 1, 1,
-; EG-NEXT:     CNDE_INT T5.W, T14.W, T5.W, T2.Z,
-; EG-NEXT:     MULHI * T0.X, T1.X, T0.X,
-; EG-NEXT:     ADD_INT T2.Y, T1.X, PS,
-; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T12.W, literal.x,
-; EG-NEXT:     OR_INT T12.W, T0.Y, T3.Z,
-; EG-NEXT:     ADD_INT * T5.W, T1.Y, T2.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     XOR_INT T1.X, PS, T3.W,
-; EG-NEXT:     ASHR T0.Y, T10.W, literal.x,
-; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETE_INT T5.W, T4.Z, T6.W,
-; EG-NEXT:     SETGE_UINT * T13.W, T4.Z, T6.W,
-; EG-NEXT:     CNDE_INT T1.Y, PV.W, PS, T2.Z,
-; EG-NEXT:     SUB_INT T3.Z, T12.W, T7.W,
-; EG-NEXT:     ADD_INT T5.W, T10.Z, T0.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     MULHI * T0.X, T1.X, T2.Y,
-; EG-NEXT:     SUBB_UINT T2.X, T12.W, T7.W,
-; EG-NEXT:     SUB_INT T2.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     XOR_INT * T2.Z, PV.W, T0.Y,
-; EG-NEXT:     ASHR T5.W, T9.W, literal.x,
-; EG-NEXT:     MULLO_INT * T0.X, T0.X, T4.W,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T3.X, T9.W, PV.W,
-; EG-NEXT:     SUB_INT T3.Y, 0.0, T2.Z,
-; EG-NEXT:     SUB_INT T5.Z, T2.Y, T2.X,
-; EG-NEXT:     CNDE_INT T9.W, T1.Y, T12.W, T3.Z, BS:VEC_120/SCL_212
-; EG-NEXT:     RECIP_UINT * T2.X, T2.Z,
-; EG-NEXT:     ADDC_UINT T4.X, T9.Z, T5.W,
-; EG-NEXT:     LSHL T2.Y, PV.W, 1,
-; EG-NEXT:     AND_INT T3.Z, T8.W, 1,
-; EG-NEXT:     CNDE_INT T8.W, T1.Y, T4.Z, PV.Z,
-; EG-NEXT:     MULLO_INT * T1.Y, PV.Y, PS,
-; EG-NEXT:     BIT_ALIGN_INT T3.Y, PV.W, T9.W, literal.x,
-; EG-NEXT:     OR_INT T5.Z, PV.Y, PV.Z,
-; EG-NEXT:     ADD_INT T8.W, T3.X, PV.X,
-; EG-NEXT:     MULHI * T1.Y, T2.X, PS,
+; EG-NEXT:     ADD_INT T1.Z, T9.Z, PS,
+; EG-NEXT:     BIT_ALIGN_INT T3.W, PV.W, T2.W, literal.x,
+; EG-NEXT:     OR_INT * T10.W, PV.Y, PV.Z,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T2.X, T2.X, PS,
-; EG-NEXT:     XOR_INT T1.Y, PV.W, T5.W,
-; EG-NEXT:     SETGE_UINT T3.Z, PV.Z, T7.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Y, T6.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT * T9.W, T1.X, T0.X,
-; EG-NEXT:     SETGE_UINT T0.X, T3.Y, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUBB_UINT T2.Y, T5.Z, T7.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT T4.Z, T3.Y, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.W, PS, T4.W,
-; EG-NEXT:     SUB_INT * T12.W, PS, T4.W,
-; EG-NEXT:     CNDE_INT T3.X, PV.W, T9.W, PS,
-; EG-NEXT:     ADD_INT T4.Y, T10.Y, T11.W, BS:VEC_102/SCL_221
-; EG-NEXT:     ADDC_UINT T6.Z, T10.X, T11.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT T6.W, PV.Z, PV.Y,
-; EG-NEXT:     CNDE_INT * T8.W, T8.W, PV.X, T3.Z,
-; EG-NEXT:     CNDE_INT T0.X, PS, T3.Y, PV.W,
-; EG-NEXT:     ADD_INT * T2.Y, PV.Y, PV.Z,
-; EG-NEXT:    ALU clause starting at 609:
-; EG-NEXT:     SETGE_UINT T3.Z, T3.X, T4.W,
-; EG-NEXT:     SUB_INT T6.W, T3.X, T4.W,
-; EG-NEXT:     MULHI * T2.X, T1.Y, T2.X,
-; EG-NEXT:     ASHR T4.X, T1.W, literal.x,
-; EG-NEXT:     CNDE_INT T3.Y, PV.Z, T3.X, PV.W,
-; EG-NEXT:     XOR_INT T3.Z, T2.Y, T11.W,
-; EG-NEXT:     ADD_INT T6.W, T9.Z, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     MULLO_INT * T2.X, PS, T2.Z,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     XOR_INT T3.X, PV.W, T5.W,
-; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.Y, T1.X,
-; EG-NEXT:     ADD_INT T4.Z, T1.Z, PV.X,
-; EG-NEXT:     SUB_INT T9.W, T1.Y, PS,
-; EG-NEXT:     ASHR * T6.W, T0.W, literal.x,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT T1.X, T0.W, PS,
-; EG-NEXT:     ADDC_UINT T3.Y, T0.Z, PS,
-; EG-NEXT:     SETGE_UINT T6.Z, PV.W, T2.Z,
-; EG-NEXT:     SUB_INT T11.W, PV.W, T2.Z,
-; EG-NEXT:     XOR_INT * T0.W, PV.Z, T4.X,
-; EG-NEXT:     SUB_INT T2.X, 0.0, PS,
-; EG-NEXT:     CNDE_INT T4.Y, PV.Z, T9.W, PV.W,
-; EG-NEXT:     ADD_INT T4.Z, T10.W, T0.Y,
-; EG-NEXT:     ADDC_UINT T9.W, T10.Z, T0.Y,
-; EG-NEXT:     RECIP_UINT * T5.X, PS,
-; EG-NEXT:     ADD_INT T6.X, PV.Z, PV.W,
-; EG-NEXT:     SETGE_UINT T5.Y, PV.Y, T2.Z,
-; EG-NEXT:     SUB_INT T4.Z, PV.Y, T2.Z,
-; EG-NEXT:     ADD_INT T9.W, T9.X, T3.W,
-; EG-NEXT:     MULLO_INT * T6.Y, PV.X, PS,
-; EG-NEXT:     XOR_INT T2.X, PV.W, T3.W,
-; EG-NEXT:     CNDE_INT T4.Y, PV.Y, T4.Y, PV.Z,
-; EG-NEXT:     XOR_INT T4.Z, PV.X, T0.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     ADD_INT T9.W, T1.X, T3.Y, BS:VEC_102/SCL_221
-; EG-NEXT:     MULHI * T0.Y, T5.X, PS,
-; EG-NEXT:     ADD_INT T1.X, T5.X, PS,
-; EG-NEXT:     XOR_INT T0.Y, PV.W, T6.W,
-; EG-NEXT:     CNDE_INT T6.Z, PV.Z, PV.Y, T1.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T9.W, T2.Y, PV.X, literal.x,
-; EG-NEXT:     LSHR * T10.W, T2.Y, literal.x,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETE_INT T5.X, PS, T3.Z,
-; EG-NEXT:     SETGE_UINT T1.Y, PS, T3.Z,
-; EG-NEXT:     SETGE_UINT T7.Z, PV.W, T4.W,
-; EG-NEXT:     LSHR T11.W, PV.Z, literal.x,
-; EG-NEXT:     MULHI * T1.X, PV.Y, PV.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETE_INT T6.X, PV.W, T4.Z,
-; EG-NEXT:     CNDE_INT T1.Y, PV.X, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT T7.Z, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T12.W, T6.Z, T3.X, literal.x,
-; EG-NEXT:     MULLO_INT * T1.X, PS, T0.W,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T5.X, T11.W, T4.Z,
-; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T2.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT T6.Z, T0.Y, PS,
-; EG-NEXT:     CNDE_INT T13.W, PV.Y, T9.W, PV.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T7.W, T5.Z, T7.W,
-; EG-NEXT:     CNDE_INT T1.X, T8.W, T5.Z, PS,
-; EG-NEXT:     LSHL T3.Y, PV.W, 1,
-; EG-NEXT:     SETGE_UINT T5.Z, PV.Z, T0.W,
-; EG-NEXT:     SUB_INT T7.W, PV.Z, T0.W,
-; EG-NEXT:     CNDE_INT * T8.W, T6.X, PV.X, PV.Y,
-; EG-NEXT:     SUB_INT T5.X, T12.W, T2.Z,
-; EG-NEXT:     SUB_INT T2.Y, T11.W, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SUBB_UINT T7.Z, T12.W, T2.Z,
-; EG-NEXT:     ADD_INT T1.W, T1.W, T4.X, BS:VEC_201
-; EG-NEXT:     ADDC_UINT * T14.W, T1.Z, T4.X,
-; EG-NEXT:     BFE_UINT T6.X, T2.X, literal.x, 1,
-; EG-NEXT:     ADD_INT T4.Y, PV.W, PS,
-; EG-NEXT:     SUB_INT T1.Z, PV.Y, PV.Z,
-; EG-NEXT:     CNDE_INT T1.W, T8.W, T12.W, PV.X, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T7.W, T5.Z, T6.Z, T7.W,
+; EG-NEXT:     SETGE_UINT T0.Y, PS, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T0.Z, PV.W, T4.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT T1.W, PV.W, T4.W, BS:VEC_102/SCL_221
+; EG-NEXT:     XOR_INT * T2.W, PV.Z, T7.W,
+; EG-NEXT:     SUB_INT T11.W, 0.0, PS,
+; EG-NEXT:     CNDE_INT * T12.W, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T0.X, T10.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T0.Y, T10.W, T6.W,
+; EG-NEXT:     SUB_INT T0.Z, T3.W, T4.W,
+; EG-NEXT:     ASHR * T1.W, T8.W, literal.x, BS:VEC_201
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     RECIP_UINT * T1.Z, T2.W,
+; EG-NEXT:     ADD_INT T3.Y, T8.W, T1.W,
+; EG-NEXT:     SUB_INT * T0.Z, T0.Z, T0.Y,
+; EG-NEXT:     CNDE_INT T8.W, T12.W, T10.W, T0.X,
+; EG-NEXT:     MULLO_INT * T0.X, T11.W, T1.Z,
+; EG-NEXT:     ADDC_UINT T3.X, T8.Z, T1.W,
+; EG-NEXT:     LSHL T0.Y, PV.W, 1,
+; EG-NEXT:     BFE_UINT * T2.Z, T5.W, 1, 1,
+; EG-NEXT:     CNDE_INT T3.W, T12.W, T3.W, T0.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     MULHI * T0.X, T1.Z, T0.X,
+; EG-NEXT:     ADD_INT T0.X, T1.Z, PS,
+; EG-NEXT:     BIT_ALIGN_INT T4.Y, PV.W, T8.W, literal.x,
+; EG-NEXT:     OR_INT T0.Z, T0.Y, T2.Z,
+; EG-NEXT:     ASHR T8.W, T2.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T3.W, T3.Y, T3.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     XOR_INT T3.X, PS, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     ADD_INT T0.Y, T2.X, PV.W,
+; EG-NEXT:     SETGE_UINT T1.Z, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETE_INT T3.W, PV.Y, T4.W,
+; EG-NEXT:     SETGE_UINT * T10.W, PV.Y, T4.W,
+; EG-NEXT:     CNDE_INT T4.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T0.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUBB_UINT T1.Z, T0.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T10.W, T4.Y, T4.W, BS:VEC_102/SCL_221
+; EG-NEXT:     XOR_INT * T3.W, PV.Y, T8.W,
+; EG-NEXT:     SUB_INT T5.X, 0.0, PS,
+; EG-NEXT:     ASHR T5.Y, T9.Y, literal.x,
+; EG-NEXT:     SUB_INT T1.Z, PV.W, PV.Z,
+; EG-NEXT:     CNDE_INT T10.W, PV.X, T0.Z, PV.Y,
+; EG-NEXT:     RECIP_UINT * T2.Z, PS,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHL T6.X, PV.W, 1,
+; EG-NEXT:     AND_INT T3.Y, T5.W, 1,
+; EG-NEXT:     CNDE_INT T0.Z, T4.X, T4.Y, PV.Z,
+; EG-NEXT:     ADD_INT T5.W, T9.X, PV.Y, BS:VEC_120/SCL_212
+; EG-NEXT:     MULLO_INT * T1.Z, PV.X, PS,
+; EG-NEXT:     XOR_INT T0.Y, PV.W, T5.Y,
+; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Z, T10.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, PV.X, PV.Y,
+; EG-NEXT:     MULHI * T3.Y, T3.X, T0.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T4.Y, PV.W, T6.W,
+; EG-NEXT:     SETE_INT T3.Z, PV.Z, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T5.W, 0.0, PV.Y,
+; EG-NEXT:     RECIP_UINT * T4.X, PV.Y,
+; EG-NEXT:     SETGE_UINT T0.X, T0.Z, T4.W,
+; EG-NEXT:     SUBB_UINT T6.Y, T10.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T4.Z, T0.Z, T4.W,
+; EG-NEXT:     ASHR T4.W, T8.Y, literal.x,
+; EG-NEXT:     MULLO_INT * T5.X, PV.W, PS,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T6.X, T8.Y, PV.W,
+; EG-NEXT:     ADDC_UINT T7.Y, T8.X, PV.W,
+; EG-NEXT:     SUB_INT T4.Z, PV.Z, PV.Y,
+; EG-NEXT:     CNDE_INT T11.W, T3.Z, PV.X, T4.Y,
+; EG-NEXT:     MULHI * T3.Z, T4.X, PS,
+; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.Z, PV.Z,
+; EG-NEXT:     ADD_INT T4.Y, T4.X, PS,
+; EG-NEXT:     ASHR * T0.Z, T1.Y, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 607:
+; EG-NEXT:     ADD_INT T5.W, T6.X, T7.Y,
+; EG-NEXT:     MULLO_INT * T3.Y, T3.Y, T2.W,
+; EG-NEXT:     XOR_INT T4.X, PV.W, T4.W,
+; EG-NEXT:     ADD_INT T1.Y, T1.Y, T0.Z, BS:VEC_102/SCL_221
+; EG-NEXT:     ADDC_UINT T3.Z, T1.X, T0.Z, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT T5.W, T3.X, PS,
+; EG-NEXT:     MULHI * T1.Z, T2.Z, T1.Z,
+; EG-NEXT:     ADD_INT T5.X, T2.Z, PS,
+; EG-NEXT:     SETGE_UINT T3.Y, PV.W, T2.W,
+; EG-NEXT:     SUB_INT T1.Z, PV.W, T2.W,
+; EG-NEXT:     ADD_INT T12.W, PV.Y, PV.Z,
+; EG-NEXT:     MULHI * T1.Y, PV.X, T4.Y,
+; EG-NEXT:     XOR_INT T6.X, PV.W, T0.Z,
+; EG-NEXT:     CNDE_INT T3.Y, PV.Y, T5.W, PV.Z,
+; EG-NEXT:     ADD_INT T1.Z, T9.W, T7.W, BS:VEC_021/SCL_122
+; EG-NEXT:     ADDC_UINT T5.W, T9.Z, T7.W, BS:VEC_021/SCL_122
+; EG-NEXT:     MULLO_INT * T1.Y, PS, T0.Y,
+; EG-NEXT:     ADD_INT T7.X, T8.X, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T1.Y, T4.X, PS, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT T1.Z, PV.Z, PV.W,
+; EG-NEXT:     SETGE_UINT T5.W, PV.Y, T2.W,
+; EG-NEXT:     SUB_INT * T9.W, PV.Y, T2.W,
+; EG-NEXT:     ADD_INT T8.X, T8.Z, T1.W,
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, T3.Y, PS, BS:VEC_021/SCL_122
+; EG-NEXT:     XOR_INT T1.Z, PV.Z, T7.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.W, PV.Y, T0.Y,
+; EG-NEXT:     SUB_INT * T7.W, PV.Y, T0.Y,
+; EG-NEXT:     CNDE_INT T10.X, PV.W, T1.Y, PS,
+; EG-NEXT:     ADD_INT T1.Y, T9.Y, T5.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     ADDC_UINT T2.Z, T9.X, T5.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T5.W, PV.Z, PV.Y, T3.X,
+; EG-NEXT:     XOR_INT * T7.W, PV.X, T1.W,
+; EG-NEXT:     BIT_ALIGN_INT T8.X, PV.W, PS, literal.x,
+; EG-NEXT:     LSHR T3.Y, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T2.Z, PV.Y, PV.Z,
+; EG-NEXT:     SETGE_UINT T5.W, PV.X, T0.Y,
+; EG-NEXT:     SUB_INT * T9.W, PV.X, T0.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T3.X, PV.W, T10.X, PS,
+; EG-NEXT:     XOR_INT T1.Y, PV.Z, T5.Y,
+; EG-NEXT:     SETE_INT T2.Z, PV.Y, T1.Z,
+; EG-NEXT:     SETGE_UINT T5.W, PV.Y, T1.Z,
+; EG-NEXT:     SETGE_UINT * T9.W, PV.X, T2.W,
+; EG-NEXT:     CNDE_INT T9.X, PV.Z, PV.W, PS,
+; EG-NEXT:     SUB_INT T4.Y, T8.X, T2.W,
+; EG-NEXT:     CNDE_INT T2.Z, PV.Y, PV.X, T4.X,
+; EG-NEXT:     XOR_INT * T5.W, T7.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT:     SUB_INT * T6.W, T10.W, T6.W,
+; EG-NEXT:     CNDE_INT T3.X, T11.W, T10.W, PV.W,
+; EG-NEXT:     BIT_ALIGN_INT T5.Y, T2.Z, T5.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     LSHR T2.Z, T2.Z, literal.x,
+; EG-NEXT:     CNDE_INT * T9.W, T9.X, T8.X, T4.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MULHI * T3.Z, T6.X, T5.X,
+; EG-NEXT:     LSHL T4.X, T9.W, 1,
+; EG-NEXT:     SETE_INT T4.Y, T2.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT T4.Z, T2.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT T6.W, T5.Y, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     MULLO_INT * T3.Z, PS, T3.W,
+; EG-NEXT:     CNDE_INT T5.X, PV.Y, PV.Z, PV.W,
+; EG-NEXT:     SUB_INT T4.Y, T5.Y, T0.Y,
+; EG-NEXT:     SUB_INT T4.Z, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SUBB_UINT T6.W, T5.Y, T0.Y,
+; EG-NEXT:     SUB_INT * T10.W, T6.X, PS,
+; EG-NEXT:     BFE_UINT T7.X, T7.W, literal.x, 1,
+; EG-NEXT:     SETGE_UINT T6.Y, PS, T3.W,
+; EG-NEXT:     SUB_INT T3.Z, PS, T3.W,
+; EG-NEXT:     SUB_INT T6.W, PV.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T11.W, PV.X, T5.Y, PV.Y,
 ; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T5.X, PS, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT T2.Y, PS, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     LSHL T5.Z, PV.W, 1,
-; EG-NEXT:     BFE_UINT T12.W, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T8.W, T8.W, T11.W, PV.Z,
+; EG-NEXT:     LSHL T10.X, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Y, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Z, T5.X, T2.Z, PV.W,
+; EG-NEXT:     ADD_INT T6.W, T2.Y, T8.W,
+; EG-NEXT:     ADDC_UINT * T12.W, T2.X, T8.W,
 ; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T7.X, PS, T1.W, literal.x,
-; EG-NEXT:     OR_INT T5.Y, PV.Z, PV.W,
-; EG-NEXT:     ADD_INT T0.Z, T0.Z, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T7.W, PV.X, T7.W, PV.Y, BS:VEC_102/SCL_221
-; EG-NEXT:     XOR_INT * T1.W, T4.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, PS, PV.W, T0.Y,
-; EG-NEXT:     XOR_INT T0.Y, PV.Z, T6.W,
-; EG-NEXT:     SETGE_UINT T0.Z, PV.Y, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.X, T4.Z,
-; EG-NEXT:     SETGE_UINT * T8.W, PV.X, T4.Z,
-; EG-NEXT:     SUB_INT T5.X, T10.W, T3.Z,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T0.Z, T5.Y, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T7.W, PV.X, PV.Y, literal.x,
-; EG-NEXT:     LSHR * T8.W, PV.X, literal.x,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T5.Y, T2.Z,
-; EG-NEXT:     SUB_INT T4.Y, T7.X, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T1.Z, PS, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T11.W, PS, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT * T12.W, PV.W, T0.W,
-; EG-NEXT:     SUB_INT T8.X, T8.W, T1.W,
-; EG-NEXT:     CNDE_INT * T6.Y, PV.Z, PV.W, PS,
-; EG-NEXT:     SUB_INT T1.Z, T7.W, T0.W,
-; EG-NEXT:     SUB_INT T11.W, T4.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T12.W, T2.Y, T5.Y, T0.Z,
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T2.X, T3.Y, T1.Z,
+; EG-NEXT:     ADD_INT T2.Y, PV.W, PS,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.Z, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T11.W, PV.X, PV.Y,
+; EG-NEXT:     CNDE_INT * T6.W, T6.Y, T10.W, T3.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T5.X, PS, T3.W,
+; EG-NEXT:     SUB_INT T4.Y, PS, T3.W,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.Y,
+; EG-NEXT:     CNDE_INT T10.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Y, T11.W, T0.Y,
+; EG-NEXT:     ADD_INT T3.Z, T1.X, T0.Z,
+; EG-NEXT:     CNDE_INT T10.W, PV.X, T6.W, PV.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     XOR_INT * T6.W, T2.Y, T8.W,
+; EG-NEXT:     SUBB_UINT T1.X, T11.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Z, PS, PV.W, T6.X,
+; EG-NEXT:     XOR_INT T8.W, PV.Z, T0.Z,
+; EG-NEXT:     CNDE_INT * T10.W, PV.X, T11.W, PV.Y,
+; EG-NEXT:     SUBB_UINT T5.X, T8.X, T2.W,
 ; EG-NEXT:     LSHL T4.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T2.Y, T7.X, PV.W,
-; EG-NEXT:     CNDE_INT * T7.W, T6.Y, T7.W, PV.Z,
-; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT * T7.X, T9.W, T4.W,
-; EG-NEXT:    ALU clause starting at 723:
-; EG-NEXT:     LSHL T2.Y, T7.W, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, T11.W, T12.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     OR_INT T9.W, T4.Y, T0.Z,
-; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.Z, PV.W, literal.x,
+; EG-NEXT:     LSHR T11.W, PV.Z, literal.x,
+; EG-NEXT:     SUB_INT * T12.W, PV.Y, PV.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T4.Y, T6.Y, T8.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T4.Z,
-; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T5.Y, T9.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, T2.Y, PV.X,
-; EG-NEXT:     SUB_INT * T8.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T1.Y, T10.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT * T8.W, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T10.W, T8.X, T9.W, T5.Y,
-; EG-NEXT:     LSHL T5.X, PV.W, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T5.Z, T8.W, T1.Y,
-; EG-NEXT:     SUB_INT * T5.Z, T7.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T4.X, T13.W, literal.x,
-; EG-NEXT:     OR_INT * T11.W, T3.Y, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W,
-; EG-NEXT:     SUB_INT * T2.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T11.W, T4.W,
-; EG-NEXT:     SETE_INT T12.W, T8.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T13.W, T8.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T3.Z,
-; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T10.X, T2.Z, PS,
+; EG-NEXT:     SETE_INT T2.Z, PV.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T12.W, PV.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT * T13.W, PV.Z, T3.W,
 ; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T3.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
-; EG-NEXT:    28(3.923636e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T8.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.W,
-; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T9.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T5.Z, T7.W, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W,
-; EG-NEXT:     SUB_INT * T1.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T9.W, T4.W,
-; EG-NEXT:     SETE_INT T11.W, T8.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T12.W, T8.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T11.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T3.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T1.Z, T4.Z,
+; EG-NEXT:     CNDE_INT T6.X, PV.Z, PV.W, PS,
+; EG-NEXT:     SUB_INT T5.Y, T3.Z, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT * T2.Z, PV.Y, T10.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 720:
+; EG-NEXT:     OR_INT T10.W, T4.Y, T1.X, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT * T12.W, T2.X, T5.X,
+; EG-NEXT:     CNDE_INT T1.X, T9.X, T3.Y, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T4.Z, T2.Z, T1.Y, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT * T12.W, T2.Z, T1.Y, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T13.W, T6.X, T3.Z, T5.Y,
+; EG-NEXT:     LSHL T2.X, PV.W, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T4.Z, T12.W, T2.Y,
+; EG-NEXT:     SUB_INT T4.Z, T10.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T1.X, T9.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, T4.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T0.Y,
+; EG-NEXT:     SUB_INT T3.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Z,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T1.W,
+; EG-NEXT:     BFE_UINT T4.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
 ; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T1.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T1.Z, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T4.W,
-; EG-NEXT:     SUB_INT T2.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, T9.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT * T6.Z, T10.W, T2.Z,
-; EG-NEXT:    ALU clause starting at 838:
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T4.Z,
+; EG-NEXT:     BFE_UINT T5.X, T8.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T10.W, T3.Y, T10.W, T6.Z,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T4.X,
+; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T9.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Y,
+; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
+; EG-NEXT:     SUB_INT T4.X, T11.W, T6.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T4.Z, T10.W, T0.Y, BS:VEC_201
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T12.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T7.X, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Z,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T4.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
 ; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T0.Y, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T5.X,
-; EG-NEXT:    27(3.783506e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
+; EG-NEXT:     SUBB_UINT T8.X, T3.Z, T3.W,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
 ; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T3.Z,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Y,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.Y, T12.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T12.W, T4.X, T8.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T11.W, PS, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T2.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETE_INT T4.Z, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Z,
+; EG-NEXT:     CNDE_INT * T12.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T4.Z, T9.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T2.W,
+; EG-NEXT:     SUB_INT T3.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, T11.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T15.W, T11.W, T6.W,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, T3.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T6.W,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T14.W, T2.Y, T3.Z, PV.W,
 ; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T0.W,
-; EG-NEXT:     CNDE_INT * T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T4.W,
-; EG-NEXT:     SUB_INT T1.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T10.W, T7.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, T7.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T2.Z,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T2.Z,
-; EG-NEXT:     SUB_INT T10.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T4.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
+; EG-NEXT:     CNDE_INT * T13.W, T4.Y, T13.W, T5.Z,
+; EG-NEXT:     BFE_UINT T5.X, T5.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T14.W, T9.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT T9.W, T3.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T14.W, T1.X, T2.X,
+; EG-NEXT:    26(3.643376e-44), 31(4.344025e-44)
+; EG-NEXT:    ALU clause starting at 834:
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T3.Y, T4.Y, T11.W, T14.W,
+; EG-NEXT:    29(4.063766e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T4.Z, T9.W, T2.W,
+; EG-NEXT:     SETE_INT T11.W, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, T3.Z, T1.Z,
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T4.Z, T9.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T2.W,
+; EG-NEXT:     SUB_INT T2.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T13.W, T10.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T10.W, T6.W,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T13.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T6.W,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T13.W, T4.Y, T3.Z, PV.W,
 ; EG-NEXT:    26(3.643376e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T0.Z, T1.W,
+; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T5.Z,
+; EG-NEXT:     SUB_INT T8.X, T2.Z, T1.Y,
+; EG-NEXT:     LSHL T4.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T13.W, T9.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, T2.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T13.W, T1.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T13.W, PV.Z, T1.Z,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T2.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T2.Z, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T2.Z, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T3.W,
+; EG-NEXT:     SUB_INT T3.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T0.Y,
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Y,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T8.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T3.Z,
-; EG-NEXT:    26(3.643376e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T0.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T0.Z, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T10.W, PV.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T0.Z, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T2.Z,
-; EG-NEXT:     SUB_INT T2.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T13.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T4.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
+; EG-NEXT:    27(3.783506e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T5.X, T7.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T4.X,
+; EG-NEXT:    25(3.503246e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:    25(3.503246e-44), 0(0.000000e+00)
+; EG-NEXT:     SETE_INT T11.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T4.Z, T6.W,
+; EG-NEXT:     SUBB_UINT T4.X, T9.W, T2.W,
+; EG-NEXT:     CNDE_INT * T4.Y, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T2.Z, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T3.W,
+; EG-NEXT:     SUB_INT T2.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T12.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T7.X, T11.W, T0.Y,
 ; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T0.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T0.Y,
 ; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Y,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T1.Z, PV.W,
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T12.W, T4.Y, T4.Z, PV.W,
 ; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
-; EG-NEXT:    25(3.503246e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T2.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
-; EG-NEXT:    25(3.503246e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T4.Z,
 ; EG-NEXT:    26(3.643376e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T4.W,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T0.Z, T7.W, T2.Z, BS:VEC_201
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T2.Z,
-; EG-NEXT:     SUB_INT T1.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T11.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T0.W,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T0.W,
-; EG-NEXT:     SUB_INT * T11.W, PV.Y, PV.X,
-; EG-NEXT:    ALU clause starting at 952:
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T1.W,
-; EG-NEXT:     LSHL T1.Y, PV.W, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T1.Z, T11.W,
+; EG-NEXT:     SUB_INT T8.X, T3.Z, T1.Z,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T10.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T12.W, PV.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT * T1.X, T5.W, literal.x, 1,
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T6.Z,
-; EG-NEXT:     SUB_INT T10.X, T5.Z, T3.Z,
-; EG-NEXT:     LSHL T3.Y, PV.W, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, T11.W, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, T1.Y, T0.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T11.W, T4.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T4.Z,
-; EG-NEXT:    25(3.503246e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, T9.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T5.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T5.Z, T8.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T0.W,
-; EG-NEXT:     SUB_INT * T2.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T10.W, T4.W,
-; EG-NEXT:     SETE_INT T12.W, T9.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T10.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T3.Z,
+; EG-NEXT:    ALU clause starting at 949:
+; EG-NEXT:     CNDE_INT * T2.Y, T3.Y, T9.W, T12.W,
+; EG-NEXT:     SETGE_UINT T2.Z, T10.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T12.W, T4.Z, T6.W,
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, T2.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T9.W, T4.Y, T1.X, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T3.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y,
+; EG-NEXT:     SETE_INT T3.Z, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T3.Z, T9.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T0.Y,
+; EG-NEXT:     SUB_INT T3.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Z,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, PV.Y, T13.W, PV.Z,
+; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T5.X, T8.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T10.W, PV.Y, T10.W, PV.Z,
-; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T3.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T5.X,
-; EG-NEXT:    23(3.222986e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T9.W, literal.y,
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T2.X,
+; EG-NEXT:    25(3.503246e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Y,
 ; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T2.Z,
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T3.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T3.Z, T9.W, T0.Y, BS:VEC_201
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T13.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T2.W, BS:VEC_021/SCL_122
 ; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T5.Z, T8.W, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T0.W,
-; EG-NEXT:     SUB_INT * T1.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T9.W, T4.W,
-; EG-NEXT:     SETE_INT T10.W, T7.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T12.W, T7.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T10.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T3.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T1.Z, T4.Z,
-; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T8.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T1.W,
-; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T1.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T1.Z, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T10.W, PV.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T1.Z, T7.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 1067:
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T4.W,
-; EG-NEXT:     SUB_INT T2.Y, T5.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT * T6.Z, T11.W, T2.Z, BS:VEC_210
-; EG-NEXT:     SETE_INT T12.W, T9.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T2.Z,
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T13.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T1.Z,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T13.W, T4.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
 ; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
 ; EG-NEXT:    22(3.082857e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T0.Y, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
-; EG-NEXT:    22(3.082857e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T3.Z,
+; EG-NEXT:     SUB_INT T8.X, T4.Z, T6.W,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T9.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T13.W, PV.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T13.W, PV.Z, T1.Y,
+; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.Y, T11.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T4.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T4.Z, PV.Z, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T4.Z, T10.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T2.W,
+; EG-NEXT:     SUB_INT T3.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, T11.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T15.W, T11.W, T6.W,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, T3.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T6.W,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T14.W, T2.Y, T3.Z, PV.W,
 ; EG-NEXT:    22(3.082857e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T0.W,
-; EG-NEXT:     CNDE_INT * T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T1.Z, T7.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T4.W,
-; EG-NEXT:     SUB_INT T1.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T11.W, T8.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, T8.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T2.Z,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T2.Z,
-; EG-NEXT:     SUB_INT T11.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T4.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
+; EG-NEXT:    ALU clause starting at 1064:
+; EG-NEXT:     CNDE_INT * T12.W, T4.Y, T12.W, T5.Z,
+; EG-NEXT:     BFE_UINT T5.X, T5.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T14.W, T10.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT T10.W, T3.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T14.W, T1.X, T4.X,
+; EG-NEXT:    21(2.942727e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Z,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T4.X, T9.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T4.Z, T10.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T2.W,
+; EG-NEXT:     SUB_INT T2.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T12.W, T9.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T9.W, T6.W,
+; EG-NEXT:     SUBB_UINT T7.X, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T12.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T6.W,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T12.W, T4.Y, T3.Z, PV.W,
 ; EG-NEXT:    21(2.942727e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T0.Z, T1.W,
+; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T5.Z,
+; EG-NEXT:     SUB_INT T8.X, T2.Z, T1.Y,
+; EG-NEXT:     LSHL T4.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T12.W, T10.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, T2.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T12.W, T1.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.Z,
+; EG-NEXT:    23(3.222986e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T9.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T2.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T2.Z, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T12.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T2.Z, T9.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T3.W,
+; EG-NEXT:     SUB_INT T3.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T0.Y,
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Y,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T7.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T3.Z,
-; EG-NEXT:    21(2.942727e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T0.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T0.Z, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T0.Z, T8.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x,
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T2.Z,
-; EG-NEXT:     SUB_INT T2.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T13.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T10.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T4.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T13.W, PV.Y, T13.W, PV.Z,
+; EG-NEXT:    22(3.082857e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T5.X, T7.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T10.W, PV.Y, T10.W, PV.Z,
-; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T2.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T5.X,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T9.W, literal.y,
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T2.X,
 ; EG-NEXT:    20(2.802597e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T4.Z,
-; EG-NEXT:    21(2.942727e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T4.W,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T0.Z, T8.W, T2.Z, BS:VEC_201
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT * T4.X, T8.W, T2.Z,
-; EG-NEXT:    ALU clause starting at 1181:
-; EG-NEXT:     SUB_INT T1.Y, T1.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, T9.W, T0.W,
-; EG-NEXT:     SETE_INT T10.W, T7.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, T7.W, T1.W,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T10.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T5.Z, T3.Z,
-; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T8.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T4.Z,
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, T9.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T5.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T10.W, PV.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T5.Z, T7.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W,
-; EG-NEXT:     SUB_INT * T2.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T11.W, T4.W,
-; EG-NEXT:     SETE_INT T12.W, T9.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T4.Z, T6.W,
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T2.W,
+; EG-NEXT:     CNDE_INT * T4.Y, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T2.Z, T9.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T3.W,
+; EG-NEXT:     SUB_INT T2.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T13.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T0.Y,
 ; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T0.Y,
+; EG-NEXT:     SUB_INT T13.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T1.Y,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T13.W, T4.Y, T4.Z, PV.W,
 ; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
+; EG-NEXT:    21(2.942727e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 1178:
+; EG-NEXT:     SUB_INT T8.X, T3.Z, T1.Z,
+; EG-NEXT:     LSHL T4.Y, T11.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, T13.W, T9.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT T9.W, T2.Y, T2.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT * T13.W, T1.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T3.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
-; EG-NEXT:    18(2.522337e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.W,
-; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T5.Z, T7.W, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W,
-; EG-NEXT:     SUB_INT * T1.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T9.W, T4.W,
-; EG-NEXT:     SETE_INT T11.W, T8.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T12.W, T8.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T11.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T3.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    18(2.522337e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T1.Z, T4.Z,
+; EG-NEXT:     SETE_INT T10.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T13.W, T4.Z, T6.W,
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, T2.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, T4.Y, T1.X, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T3.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y,
+; EG-NEXT:     SETE_INT T3.Z, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T3.Z, T10.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T0.Y,
+; EG-NEXT:     SUB_INT T3.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Z,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T1.W,
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
 ; EG-NEXT:    18(2.522337e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T1.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T4.W,
-; EG-NEXT:     SETE_INT * T1.Z, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:    ALU clause starting at 1296:
-; EG-NEXT:     SETGE_UINT T9.W, T5.Z, T3.Z,
-; EG-NEXT:     CNDE_INT * T11.W, T9.X, T7.W, T2.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T1.Z, PV.W, T1.Y,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, T4.X, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T4.W,
-; EG-NEXT:     SUB_INT T2.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, T9.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T10.W, T2.Z,
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T4.Z,
+; EG-NEXT:     BFE_UINT T5.X, T8.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T10.W, PV.Y, T10.W, PV.Z,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T4.X,
+; EG-NEXT:    20(2.802597e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Y,
+; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T4.X, T9.W, T3.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T3.Z, T10.W, T0.Y, BS:VEC_201
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T12.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T7.X, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Z,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T12.W, T4.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
 ; EG-NEXT:    17(2.382207e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T0.Y, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T5.X,
-; EG-NEXT:    17(2.382207e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
+; EG-NEXT:     SUB_INT T8.X, T4.Z, T6.W,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T12.W, PV.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
 ; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T3.Z,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.Y,
+; EG-NEXT:    18(2.522337e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.Y, T11.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T4.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T4.Z, PV.Z, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T4.Z, T9.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T2.W,
+; EG-NEXT:     SUB_INT T3.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, T11.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T15.W, T11.W, T6.W,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T3.W,
+; EG-NEXT:     CNDE_INT * T4.Y, PV.W, PS, T5.Z,
+; EG-NEXT:    ALU clause starting at 1293:
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, T3.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T6.W,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T14.W, T2.Y, T3.Z, PV.W,
 ; EG-NEXT:    17(2.382207e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T0.W,
-; EG-NEXT:     CNDE_INT * T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T4.W,
-; EG-NEXT:     SUB_INT T1.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T10.W, T7.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, T7.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T2.Z,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T2.Z,
-; EG-NEXT:     SUB_INT T10.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T4.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
+; EG-NEXT:     CNDE_INT * T13.W, T4.Y, T13.W, T5.Z,
+; EG-NEXT:     BFE_UINT T5.X, T5.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T14.W, T9.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT T9.W, T3.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T14.W, T1.X, T2.X,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Z,
+; EG-NEXT:    19(2.662467e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T4.Z, T9.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T2.W,
+; EG-NEXT:     SUB_INT T2.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T13.W, T10.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T10.W, T6.W,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T13.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T6.W,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T13.W, T4.Y, T3.Z, PV.W,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T0.Z, T1.W,
+; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T5.Z,
+; EG-NEXT:     SUB_INT T8.X, T2.Z, T1.Y,
+; EG-NEXT:     LSHL T4.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T13.W, T9.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, T2.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T13.W, T1.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T13.W, PV.Z, T1.Z,
+; EG-NEXT:    18(2.522337e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T2.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T2.Z, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T2.Z, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T3.W,
+; EG-NEXT:     SUB_INT T3.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T0.Y,
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Y,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T8.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T3.Z,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T0.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T0.Z, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T10.W, PV.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T0.Z, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T2.Z,
-; EG-NEXT:     SUB_INT T2.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T13.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T4.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
+; EG-NEXT:    17(2.382207e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T5.X, T7.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
-; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T2.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T4.X,
 ; EG-NEXT:    15(2.101948e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z,
-; EG-NEXT:     SETE_INT * T9.W, PV.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 1410:
-; EG-NEXT:     SETGE_UINT * T12.W, T1.Z, T4.Z,
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T4.W,
-; EG-NEXT:     CNDE_INT * T3.Y, T9.W, PV.W, T0.Z, BS:VEC_201
-; EG-NEXT:     SUB_INT T0.Z, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T2.Z,
-; EG-NEXT:     SUB_INT T1.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T11.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T11.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T5.Z, T3.Z,
-; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T4.Z,
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    15(2.101948e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, T9.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T5.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T5.Z, T8.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T0.W,
-; EG-NEXT:     SUB_INT * T2.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T10.W, T4.W,
-; EG-NEXT:     SETE_INT T12.W, T9.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T4.Z, T6.W,
+; EG-NEXT:     SUBB_UINT T4.X, T9.W, T2.W,
+; EG-NEXT:     CNDE_INT * T4.Y, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T2.Z, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T3.W,
+; EG-NEXT:     SUB_INT T2.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T12.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T7.X, T11.W, T0.Y,
 ; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T10.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT * T5.Z, T11.W, T0.Y,
+; EG-NEXT:    ALU clause starting at 1407:
+; EG-NEXT:     SUB_INT T12.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Y,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T10.W, PV.Y, T10.W, PV.Z,
-; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T3.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T5.X,
-; EG-NEXT:    13(1.821688e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.W,
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T12.W, T4.Y, T4.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T5.Z,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     SUB_INT T8.X, T3.Z, T1.Z,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T10.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T12.W, PV.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T5.Z, T8.W, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T0.W,
-; EG-NEXT:     SUB_INT * T1.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T9.W, T4.W,
-; EG-NEXT:     SETE_INT T10.W, T7.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T12.W, T7.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T10.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T3.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T1.Z, T4.Z,
-; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T8.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
+; EG-NEXT:     SETE_INT T9.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T12.W, T4.Z, T6.W,
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, T2.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T9.W, T4.Y, T1.X, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T1.W,
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T3.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y,
+; EG-NEXT:     SETE_INT T3.Z, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T3.Z, T9.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T0.Y,
+; EG-NEXT:     SUB_INT T3.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Z,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, PV.Y, T13.W, PV.Z,
 ; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:    ALU clause starting at 1525:
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, T1.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, T3.Y, T4.X, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T1.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T1.Z, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T10.W, T8.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T1.Z, T7.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T4.W,
-; EG-NEXT:     SUB_INT T2.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, T9.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T2.Z,
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T4.Z,
+; EG-NEXT:     BFE_UINT T5.X, T8.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T9.W, literal.y,
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T2.X,
+; EG-NEXT:    15(2.101948e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Y,
+; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T3.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T3.Z, T9.W, T0.Y, BS:VEC_201
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T13.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T13.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T1.Z,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T13.W, T4.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
 ; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
 ; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T0.Y, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
-; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T3.Z,
+; EG-NEXT:     SUB_INT T8.X, T4.Z, T6.W,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T9.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T13.W, PV.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T13.W, PV.Z, T1.Y,
+; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.Y, T11.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T4.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T4.Z, PV.Z, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T4.Z, T10.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT * T11.W, PV.X, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 1522:
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T2.W,
+; EG-NEXT:     SUB_INT T3.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, T11.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T15.W, T11.W, T6.W,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, T3.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T6.W,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T14.W, T2.Y, T3.Z, PV.W,
 ; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T0.W,
-; EG-NEXT:     CNDE_INT * T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T1.Z, T7.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T4.W,
-; EG-NEXT:     SUB_INT T1.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T11.W, T8.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, T8.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T2.Z,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T2.Z,
-; EG-NEXT:     SUB_INT T11.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T4.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
+; EG-NEXT:     CNDE_INT * T12.W, T4.Y, T12.W, T5.Z,
+; EG-NEXT:     BFE_UINT T5.X, T5.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T14.W, T10.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT T10.W, T3.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T14.W, T1.X, T4.X,
+; EG-NEXT:    11(1.541428e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Z,
+; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T4.X, T9.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T4.Z, T10.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T2.W,
+; EG-NEXT:     SUB_INT T2.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T12.W, T9.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T9.W, T6.W,
+; EG-NEXT:     SUBB_UINT T7.X, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T12.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T6.W,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T12.W, T4.Y, T3.Z, PV.W,
 ; EG-NEXT:    11(1.541428e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T0.Z, T1.W,
+; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T5.Z,
+; EG-NEXT:     SUB_INT T8.X, T2.Z, T1.Y,
+; EG-NEXT:     LSHL T4.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T12.W, T10.W, literal.x,
+; EG-NEXT:     OR_INT T10.W, T2.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T12.W, T1.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.Z,
+; EG-NEXT:    13(1.821688e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T9.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T2.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T2.Z, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T12.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T2.Z, T9.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T3.W,
+; EG-NEXT:     SUB_INT T3.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T0.Y,
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Y,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T7.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T3.Z,
-; EG-NEXT:    11(1.541428e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T0.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T0.Z, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T0.Z, T8.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x,
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T2.Z,
-; EG-NEXT:     SUB_INT T2.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T13.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T10.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T4.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T13.W, PV.Y, T13.W, PV.Z,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T5.X, T7.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T10.W, PV.Y, T10.W, PV.Z,
-; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T2.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT * T1.Z, PV.W, T8.W, literal.y,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T9.W, literal.y,
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T2.X,
 ; EG-NEXT:    10(1.401298e-44), 31(4.344025e-44)
-; EG-NEXT:    ALU clause starting at 1640:
-; EG-NEXT:     OR_INT T8.W, T2.Y, T0.Z,
-; EG-NEXT:     SUB_INT * T12.W, T4.X, T5.X,
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T9.W, T1.Z, T4.Z, BS:VEC_210
-; EG-NEXT:     SETGE_UINT * T12.W, T1.Z, T4.Z,
-; EG-NEXT:    11(1.541428e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T4.W,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T0.Z, T8.W, T2.Z, BS:VEC_201
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T2.Z,
-; EG-NEXT:     SUB_INT T1.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T10.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T10.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T5.Z, T3.Z,
-; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T8.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T4.Z,
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    10(1.401298e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, T9.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T5.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T10.W, PV.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T5.Z, T7.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W,
-; EG-NEXT:     SUB_INT * T2.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T11.W, T4.W,
-; EG-NEXT:     SETE_INT T12.W, T9.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T4.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T4.Z, T6.W,
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T2.W,
+; EG-NEXT:     CNDE_INT * T4.Y, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T2.Z, T9.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 1636:
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T3.W,
+; EG-NEXT:     SUB_INT * T2.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, T11.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T13.W, T10.W, T1.Y, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT * T14.W, T10.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T0.Y,
 ; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T3.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T0.Y,
+; EG-NEXT:     SUB_INT T13.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T1.Y,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T13.W, T4.Y, T4.Z, PV.W,
 ; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
+; EG-NEXT:    11(1.541428e-44), 0(0.000000e+00)
+; EG-NEXT:     SUB_INT T8.X, T3.Z, T1.Z,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T9.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T13.W, PV.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T3.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T6.X,
-; EG-NEXT:    8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.W,
-; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T5.Z, T7.W, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T0.W,
-; EG-NEXT:     SUB_INT * T1.Y, T0.Z, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, T9.W, T4.W,
-; EG-NEXT:     SETE_INT T11.W, T8.W, T3.Z, BS:VEC_201
-; EG-NEXT:     SETGE_UINT * T12.W, T8.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T11.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T5.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T3.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T0.Z, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T1.Z, T4.Z,
+; EG-NEXT:     SETE_INT T10.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T13.W, T4.Z, T6.W,
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, T2.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, T4.Y, T1.X, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T3.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y,
+; EG-NEXT:     SETE_INT T3.Z, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T3.Z, T10.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T0.Y,
+; EG-NEXT:     SUB_INT T3.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Z,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 1755:
-; EG-NEXT:     BFE_UINT T4.X, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T1.Y, T2.Y, T8.W, T11.W,
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T5.Z, T7.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, T0.Z, T1.W,
-; EG-NEXT:     SETGE_UINT * T11.W, T0.Z, T1.W,
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, T1.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, T3.Y, T4.X, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T1.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T1.Z, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T4.W,
-; EG-NEXT:     SUB_INT T2.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, T9.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T13.W, T9.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T5.X, T10.W, T2.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T10.W, T2.Z,
-; EG-NEXT:     SUB_INT T12.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T4.Z,
+; EG-NEXT:     BFE_UINT T5.X, T8.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T10.W, PV.Y, T10.W, PV.Z,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T4.X,
+; EG-NEXT:    10(1.401298e-44), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Y,
+; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T4.X, T9.W, T3.W,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T3.Z, T10.W, T0.Y, BS:VEC_201
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T3.Y, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T2.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T2.W,
+; EG-NEXT:     SETE_INT T12.W, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T7.X, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T4.Y, T10.W, T3.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Z,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T3.Z, T5.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T12.W, T4.Y, T2.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T11.W, PV.Y, T11.W, PV.Z,
 ; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T7.X, T0.Y, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T12.W, PV.X, T5.X,
-; EG-NEXT:    7(9.809089e-45), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
+; EG-NEXT:     SUB_INT T8.X, T4.Z, T6.W,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T12.W, PV.X, T7.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T7.W, literal.x, 1,
 ; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T3.Z,
+; EG-NEXT:     SETGE_UINT T3.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T9.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T1.Y,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T7.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T10.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.Y, T11.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T4.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T6.X, T4.Z, PS,
+; EG-NEXT:     SETGE_UINT * T2.Y, PV.W, T2.W,
+; EG-NEXT:    ALU clause starting at 1751:
+; EG-NEXT:     SETE_INT T4.Z, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT T11.W, T3.Z, T1.Z,
+; EG-NEXT:     CNDE_INT * T12.W, T7.X, T10.W, T3.Y,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, T2.Y,
+; EG-NEXT:     SUB_INT T4.Z, T9.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, T1.X, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T13.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T2.W,
+; EG-NEXT:     SUB_INT T3.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, T11.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T15.W, T11.W, T6.W,
+; EG-NEXT:     SUBB_UINT T2.X, T13.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T13.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T14.W, T3.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T2.Y, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T6.W,
+; EG-NEXT:     LSHL T3.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T14.W, T2.Y, T3.Z, PV.W,
 ; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T5.X, T7.W, T0.W,
-; EG-NEXT:     CNDE_INT * T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T7.W, T2.Y, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T8.W, T4.W,
-; EG-NEXT:     SUB_INT T1.Y, T5.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T6.Z, PS, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T10.W, T7.W, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, T7.W, T4.Z,
-; EG-NEXT:     SUBB_UINT T8.X, T9.W, T2.Z,
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, T6.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T2.Z,
-; EG-NEXT:     SUB_INT T10.W, T1.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T1.Z,
-; EG-NEXT:     SUB_INT T4.X, T7.W, T4.Z,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T1.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T10.W, T3.Y, T5.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
+; EG-NEXT:     CNDE_INT * T13.W, T4.Y, T13.W, T5.Z,
+; EG-NEXT:     BFE_UINT T5.X, T5.W, literal.x, 1,
+; EG-NEXT:     LSHL T2.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T14.W, T9.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT T9.W, T3.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T14.W, T1.X, T2.X,
+; EG-NEXT:    6(8.407791e-45), 31(4.344025e-44)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T11.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T14.W, PV.Z, T1.Z,
+; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT * T4.Z, T9.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Y, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T11.W, T2.Y, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T9.W, T2.W,
+; EG-NEXT:     SUB_INT T2.Y, T3.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T5.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T13.W, T10.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T10.W, T6.W,
+; EG-NEXT:     SUBB_UINT T6.X, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T5.Z,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T13.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T9.W, T4.Y, T9.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T10.W, T6.W,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T7.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T13.W, T4.Y, T3.Z, PV.W,
 ; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T0.Z, T1.W,
+; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T5.Z,
+; EG-NEXT:     SUB_INT T8.X, T2.Z, T1.Y,
+; EG-NEXT:     LSHL T4.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T13.W, T9.W, literal.x,
+; EG-NEXT:     OR_INT T9.W, T2.Y, T4.Z,
+; EG-NEXT:     SUB_INT * T13.W, T1.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Z,
+; EG-NEXT:     SETGE_UINT * T13.W, PV.Z, T1.Z,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T3.Y, T9.W, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.Y, T11.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T10.W, T4.Y, PV.X,
+; EG-NEXT:     SUB_INT * T11.W, T8.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T7.X, T2.Z, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T2.Z, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT T11.W, PV.Z, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T13.W, PV.X, T9.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T2.Z, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, T4.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T10.W, T3.W,
+; EG-NEXT:     SUB_INT T3.Y, T4.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T14.W, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T4.X, T12.W, T0.Y,
+; EG-NEXT:     CNDE_INT T4.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Z, T12.W, T0.Y,
+; EG-NEXT:     SUB_INT T14.W, PV.Y, PV.X,
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T10.W, T2.Z,
+; EG-NEXT:     SUB_INT T1.X, T11.W, T1.Y,
 ; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T5.Z, PV.W, T8.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T10.W, PV.X, T8.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T7.W, PS,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.W, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.Z, T3.Z,
-; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T8.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T8.W, T4.W,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Y, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T7.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T5.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T9.X, T0.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T0.Z, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T4.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T10.W, PV.X, T8.W, PV.Y,
-; EG-NEXT:     LSHL T5.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T0.Z, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T11.W, T6.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T2.Z,
-; EG-NEXT:     SUB_INT T2.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T12.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T13.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T6.X, T11.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T11.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T12.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T1.W,
+; EG-NEXT:     BFE_UINT T2.Z, T8.W, literal.x, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T14.W, T2.Y, T4.Z, PV.W,
+; EG-NEXT:     CNDE_INT * T12.W, PV.Y, T12.W, PV.Z,
+; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T5.X, T7.W, literal.x, 1,
 ; EG-NEXT:     LSHL T2.Y, PS, 1,
-; EG-NEXT:     BFE_UINT * T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
-; EG-NEXT:    ALU clause starting at 1871:
-; EG-NEXT:     CNDE_INT T12.W, T1.Y, T1.Z, T12.W,
-; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T11.W, T6.Z,
-; EG-NEXT:     BFE_UINT T7.X, T2.X, literal.x, 1,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T7.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, T2.Y, T0.Z,
-; EG-NEXT:     SUB_INT * T12.W, T4.X, T6.X,
+; EG-NEXT:     BIT_ALIGN_INT T4.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T10.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T14.W, PV.X, T4.X,
 ; EG-NEXT:    5(7.006492e-45), 31(4.344025e-44)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T2.Y, T3.Y, T9.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.Z, T4.Z,
-; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T6.X, T8.W, T4.W,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T0.Z, T7.W, T2.Z, BS:VEC_201
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T2.Y, T11.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T1.Y, T4.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T4.X, T7.W, T2.Z,
-; EG-NEXT:     SUB_INT T1.Y, T1.Z, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T6.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T11.W, PV.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, PV.W, T1.W,
-; EG-NEXT:     SUBB_UINT T9.X, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T2.Y, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T6.Z, T9.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T11.W, PV.Y, PV.X,
-; EG-NEXT:     CNDE_INT * T7.W, T3.Y, T7.W, T0.Z,
-; EG-NEXT:     SUB_INT T4.X, T8.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T0.Z, T3.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T11.W, T3.Y, T1.Z, PV.W,
-; EG-NEXT:     CNDE_INT * T9.W, PV.Y, T9.W, PV.Z,
-; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T10.X, T5.Z, T3.Z,
-; EG-NEXT:     LSHL T3.Y, PS, 1,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.W, T7.W, literal.x,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T11.W, PV.X, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T2.Y, T8.W, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, PV.Z, T4.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T4.Z,
+; EG-NEXT:     BFE_UINT T1.X, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Y, T11.W, PS,
+; EG-NEXT:     SETGE_UINT * T2.Z, PV.W, T3.W, BS:VEC_021/SCL_122
 ; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T9.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, T9.W, literal.x,
-; EG-NEXT:     OR_INT T8.W, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T10.X, T6.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT T4.X, T8.X, T5.Z, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T1.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T11.W, PV.X, T7.W, PV.Y,
-; EG-NEXT:     LSHL T6.X, PS, 1,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T5.Z, T8.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T10.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T10.W, T5.X, T7.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T4.X, T3.X, literal.x, 1,
-; EG-NEXT:     SETGE_UINT T2.Y, PS, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T6.Z, PV.W, T3.Z,
-; EG-NEXT:     SETGE_UINT T12.W, PV.W, T3.Z,
-; EG-NEXT:     CNDE_INT * T13.W, PV.Y, T8.W, PV.Z,
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHL T5.X, PS, 1,
+; EG-NEXT:    ALU clause starting at 1865:
+; EG-NEXT:     SETE_INT T11.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T4.Z, T6.W,
+; EG-NEXT:     CNDE_INT T4.X, PV.W, PS, T2.Z,
+; EG-NEXT:     SUB_INT T4.Y, T10.W, T3.W,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, T3.Y, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT T11.W, T2.Y, T1.X, BS:VEC_102/SCL_221
+; EG-NEXT:     OR_INT * T12.W, T2.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T1.X, PS, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y,
+; EG-NEXT:     SETE_INT T5.Z, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T14.W, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T15.W, PV.X, T10.W, PV.Y,
+; EG-NEXT:     LSHL T2.X, PS, 1,
 ; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
-; EG-NEXT:     SUB_INT T5.Z, T10.W, T4.W,
-; EG-NEXT:     SUBB_UINT T12.W, T10.W, T4.W,
-; EG-NEXT:     SUB_INT * T14.W, T9.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T7.X, T7.W, T2.Z,
-; EG-NEXT:     SUBB_UINT * T3.Y, T8.W, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT:     SUB_INT T6.Z, T0.Z, T1.W,
-; EG-NEXT:     SUB_INT * T7.W, T14.W, T12.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T8.W, T2.Y, T10.W, T5.Z,
-; EG-NEXT:     SUB_INT T8.X, T1.Z, T4.Z,
-; EG-NEXT:     LSHL T4.Y, PV.W, 1,
-; EG-NEXT:     BFE_UINT T5.Z, T2.X, literal.x, 1,
-; EG-NEXT:     CNDE_INT T7.W, T2.Y, T9.W, T7.W,
-; EG-NEXT:     SUB_INT * T9.W, T6.Z, T3.Y,
+; EG-NEXT:     SUB_INT T5.Z, T11.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SUBB_UINT T14.W, T11.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT * T16.W, T2.Z, T1.Y,
+; EG-NEXT:     SUBB_UINT * T5.X, T10.W, T3.W,
+; EG-NEXT:     SUBB_UINT T3.Y, T9.W, T2.W,
+; EG-NEXT:     SUB_INT * T6.Z, T3.Z, T1.Z,
+; EG-NEXT:     SUB_INT T9.W, T16.W, T14.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T10.W, T2.Y, T11.W, T5.Z,
+; EG-NEXT:     SUB_INT T7.X, T4.Z, T6.W,
+; EG-NEXT:     LSHL T4.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T5.Z, T5.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T9.W, T2.Y, T2.Z, PV.W,
+; EG-NEXT:     SUB_INT * T11.W, T6.Z, T3.Y,
 ; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT:     BFE_UINT T10.X, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT T1.Y, T1.Y, T0.Z, PS, BS:VEC_120/SCL_212
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, PV.W, T8.W, literal.y,
-; EG-NEXT:     OR_INT T7.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT * T8.W, PV.X, T7.X,
-; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
-; EG-NEXT:     CNDE_INT T7.X, T9.X, T1.Z, PS,
-; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T4.W,
-; EG-NEXT:     SETE_INT T1.Z, PV.Z, T3.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T8.W, PV.Y, T13.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT * T9.W, T5.X, PV.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T5.X, T0.Z, T3.Z,
-; EG-NEXT:     SUBB_UINT T1.Y, T7.W, T4.W,
-; EG-NEXT:     SETGE_UINT * T5.Z, PS, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T10.W, T8.W, T1.W,
-; EG-NEXT:     SETGE_UINT * T12.W, T8.W, T1.W,
-; EG-NEXT:     SUB_INT T8.X, T0.Z, T3.Z,
-; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, T5.Z,
-; EG-NEXT:     SUB_INT T5.Z, T9.W, T0.W,
-; EG-NEXT:     SUBB_UINT * T10.W, T9.W, T0.W,
-; EG-NEXT:     SUB_INT * T12.W, T8.W, T1.W,
-; EG-NEXT:     SUB_INT T9.X, PV.W, T10.W,
-; EG-NEXT:     CNDE_INT * T4.Y, T3.Y, T9.W, T5.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T5.Z, T7.W, T4.W,
-; EG-NEXT:     SUB_INT T9.W, T8.X, T1.Y,
-; EG-NEXT:     CNDE_INT * T10.W, T1.Z, T5.X, T2.Y,
-; EG-NEXT:    ALU clause starting at 1985:
-; EG-NEXT:     CNDE_INT T5.X, T10.W, T0.Z, T9.W,
-; EG-NEXT:     CNDE_INT T1.Y, T10.W, T7.W, T5.Z,
-; EG-NEXT:     LSHL T0.Z, T4.Y, 1,
-; EG-NEXT:     BFE_UINT * T7.W, T0.Y, literal.x, 1, BS:VEC_120/SCL_212
-; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T8.W, T3.Y, T8.W, T9.X,
-; EG-NEXT:     BIT_ALIGN_INT T8.X, PV.W, T4.Y, literal.x,
-; EG-NEXT:     OR_INT T2.Y, T0.Z, T7.W,
-; EG-NEXT:     BIT_ALIGN_INT T0.Z, T5.X, T1.Y, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT * T7.W, T7.X, T11.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     OR_INT * T8.W, T6.X, T4.X,
-; EG-NEXT:     SETGE_UINT T4.X, PV.W, T2.Z,
-; EG-NEXT:     SETE_INT T3.Y, T7.W, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T1.Z, T7.W, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     BFE_UINT T9.W, T2.X, literal.x, 1,
-; EG-NEXT:     LSHL * T10.W, T1.Y, 1,
+; EG-NEXT:     BFE_UINT T8.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T6.X, T3.Z, PS,
+; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.W, T10.W, literal.y,
+; EG-NEXT:     OR_INT T9.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T10.W, PV.X, T5.X,
+; EG-NEXT:    6(8.407791e-45), 31(4.344025e-44)
+; EG-NEXT:     CNDE_INT T4.X, T4.X, T4.Z, PS,
+; EG-NEXT:     SETGE_UINT T3.Y, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T3.Z, PV.Z, T1.Y,
+; EG-NEXT:     BIT_ALIGN_INT T10.W, PV.Y, T13.W, literal.x,
+; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T1.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETE_INT T5.X, PV.W, T1.Z,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PS, PV.Y,
+; EG-NEXT:     SUB_INT T3.Z, T9.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, PV.X, T15.W, literal.x,
+; EG-NEXT:     OR_INT * T13.W, T2.X, T8.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T2.X, T10.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T3.Y, PS, T3.W,
+; EG-NEXT:     SETE_INT T4.Z, PV.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT * T14.W, PV.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T15.W, T2.Y, T9.W, T3.Z,
+; EG-NEXT:     LSHL T4.X, PV.W, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T4.Z, T14.W, T3.Y,
+; EG-NEXT:     SUB_INT T3.Z, T13.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T14.W, T5.X, T2.X, T1.X,
+; EG-NEXT:     SUB_INT * T16.W, T12.W, T2.W,
+; EG-NEXT:     SUBB_UINT T1.X, T12.W, T2.W,
+; EG-NEXT:     SUB_INT * T4.Y, T10.W, T1.Z, BS:VEC_201
+; EG-NEXT:     SUBB_UINT * T4.Z, T13.W, T3.W,
+; EG-NEXT:     SUB_INT * T17.W, T11.W, T6.W,
+; EG-NEXT:     CNDE_INT * T12.W, T14.W, T12.W, T16.W,
+; EG-NEXT:     LSHL T2.X, PV.W, 1,
+; EG-NEXT:     BFE_UINT T5.Y, T7.W, literal.x, 1,
+; EG-NEXT:     SUB_INT T4.Z, T17.W, T4.Z, BS:VEC_210
+; EG-NEXT:     SUB_INT T16.W, T4.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T13.W, T3.Y, T13.W, T3.Z,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LSHL T1.X, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Y, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T3.Z, T14.W, T10.W, PV.W, BS:VEC_120/SCL_212
+; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T10.W, T3.Y, T11.W, T4.Z,
+; EG-NEXT:     OR_INT * T11.W, T2.X, T5.Y,
+; EG-NEXT:     BFE_UINT T2.X, T5.W, literal.x, 1,
+; EG-NEXT:     SETGE_UINT T3.Y, PS, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT * T4.Z, PV.W, T13.W, literal.y, BS:VEC_021/SCL_122
+; EG-NEXT:    3(4.203895e-45), 31(4.344025e-44)
+; EG-NEXT:     BIT_ALIGN_INT T10.W, T3.Z, T12.W, literal.x,
+; EG-NEXT:     OR_INT * T12.W, T1.X, T4.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETE_INT T1.X, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T4.Y, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T3.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T13.W, T4.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T14.W, T4.Z, T6.W,
+; EG-NEXT:     CNDE_INT T5.X, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T5.Y, T12.W, T3.W,
+; EG-NEXT:     CNDE_INT * T3.Z, PV.X, PV.Y, T3.Y,
+; EG-NEXT:     SUB_INT T13.W, T11.W, T2.W,
+; EG-NEXT:     OR_INT * T14.W, T4.X, T2.X,
+; EG-NEXT:     SETGE_UINT T1.X, PS, T0.Y,
+; EG-NEXT:     SUBB_UINT T3.Y, T11.W, T2.W,
+; EG-NEXT:     SUB_INT T5.Z, T10.W, T1.Z, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T11.W, T3.Z, T11.W, PV.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT * T13.W, T5.X, T12.W, T5.Y,
+; EG-NEXT:     LSHL T2.X, PV.W, 1,
+; EG-NEXT:     LSHL T4.Y, T11.W, 1,
+; EG-NEXT:     SUBB_UINT * T6.Z, T12.W, T3.W, BS:VEC_120/SCL_212
+; EG-NEXT:     SUB_INT T12.W, T4.Z, T6.W,
+; EG-NEXT:     SUB_INT * T16.W, T5.Z, T3.Y,
+; EG-NEXT:     BFE_UINT T4.X, T7.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Y, T3.Z, T10.W, PS,
+; EG-NEXT:     SUBB_UINT T3.Z, T9.W, T0.Y, BS:VEC_201
+; EG-NEXT:     SUB_INT T9.W, T2.Z, T1.Y, BS:VEC_210
+; EG-NEXT:     SUB_INT * T10.W, PV.W, T6.Z,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     OR_INT T5.X, PS, PV.W,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Y, PV.Z, PV.X,
-; EG-NEXT:     SUB_INT T1.Z, T8.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUBB_UINT T9.W, T8.W, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T10.W, T7.W, T4.Z,
-; EG-NEXT:     SUB_INT T4.X, PS, PV.W,
-; EG-NEXT:     CNDE_INT T3.Y, PV.Y, T8.W, PV.Z,
-; EG-NEXT:     SETGE_UINT T1.Z, PV.X, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T8.W, T0.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T9.W, T0.Z, T3.Z,
-; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT T4.Y, T5.X, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     LSHL T1.Z, PV.Y, 1,
-; EG-NEXT:     BFE_UINT T8.W, T3.X, literal.x, 1, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT * T7.W, T1.Y, T7.W, PV.X,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     BIT_ALIGN_INT T4.X, PS, T3.Y, literal.x,
-; EG-NEXT:     OR_INT T1.Y, PV.Z, PV.W,
-; EG-NEXT:     CNDE_INT T1.Z, PV.X, T5.X, PV.Y,
-; EG-NEXT:     SUBB_UINT T7.W, T2.Y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T8.W, T8.X, T1.W,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T7.X, PS, PV.W,
-; EG-NEXT:     LSHL T3.Y, PV.Z, 1,
-; EG-NEXT:     SETGE_UINT T5.Z, PV.Y, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T7.W, PV.X, T4.Z,
-; EG-NEXT:     SETGE_UINT * T8.W, PV.X, T4.Z,
-; EG-NEXT:     BFE_UINT T9.X, T2.X, literal.x, 1,
-; EG-NEXT:     SETGE_UINT T4.Y, T2.Y, T0.W,
-; EG-NEXT:     SETE_INT T6.Z, T8.X, T1.W, BS:VEC_120/SCL_212
-; EG-NEXT:     CNDE_INT T7.W, PV.W, PS, PV.Z,
-; EG-NEXT:     SUB_INT * T8.W, T1.Y, T2.Z,
+; EG-NEXT:     BFE_UINT T6.X, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T5.Y, T5.X, T4.Z, PS,
+; EG-NEXT:     SUB_INT T3.Z, PV.W, PV.Z,
+; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.Y, T11.W, literal.y,
+; EG-NEXT:     OR_INT * T10.W, T4.Y, PV.X,
+; EG-NEXT:    4(5.605194e-45), 31(4.344025e-44)
+; EG-NEXT:    ALU clause starting at 1978:
+; EG-NEXT:     SETGE_UINT T4.X, T10.W, T2.W,
+; EG-NEXT:     SETE_INT T3.Y, T9.W, T1.Z, BS:VEC_201
+; EG-NEXT:     CNDE_INT * T2.Z, T2.Y, T2.Z, T3.Z,
+; EG-NEXT:     BIT_ALIGN_INT T11.W, T5.Y, T13.W, literal.x,
+; EG-NEXT:     OR_INT * T12.W, T2.X, T6.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T2.X, T9.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T2.Y, PS, T3.W,
+; EG-NEXT:     SETE_INT * T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     BIT_ALIGN_INT T13.W, T2.Z, T15.W, literal.x, BS:VEC_102/SCL_221
+; EG-NEXT:     SETGE_UINT * T15.W, T11.W, T6.W,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETE_INT T5.X, PV.W, T1.Y,
+; EG-NEXT:     CNDE_INT T2.Y, T3.Z, PS, T2.Y,
+; EG-NEXT:     SUB_INT T2.Z, T12.W, T3.W,
+; EG-NEXT:     CNDE_INT * T15.W, T3.Y, T2.X, T4.X,
+; EG-NEXT:     SUB_INT * T16.W, T10.W, T2.W,
+; EG-NEXT:     SUBB_UINT T2.X, T10.W, T2.W,
+; EG-NEXT:     SUB_INT * T3.Y, T9.W, T1.Z, BS:VEC_201
+; EG-NEXT:     SUBB_UINT * T3.Z, T12.W, T3.W,
+; EG-NEXT:     SUB_INT * T17.W, T11.W, T6.W,
+; EG-NEXT:     CNDE_INT * T10.W, T15.W, T10.W, T16.W,
+; EG-NEXT:     LSHL T4.X, PV.W, 1,
+; EG-NEXT:     BFE_UINT T4.Y, T7.W, literal.x, 1,
+; EG-NEXT:     SUB_INT T3.Z, T17.W, T3.Z, BS:VEC_210
+; EG-NEXT:     SUB_INT T16.W, T3.Y, T2.X,
+; EG-NEXT:     CNDE_INT * T12.W, T2.Y, T12.W, T2.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T10.X, T1.Y, T2.Z,
-; EG-NEXT:     SUB_INT T5.Y, T4.X, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUBB_UINT * T5.Z, T5.X, T4.W, BS:VEC_120/SCL_212
-; EG-NEXT:     SUB_INT T9.W, T0.Z, T3.Z,
-; EG-NEXT:     CNDE_INT * T8.W, T7.W, T1.Y, T8.W,
-; EG-NEXT:     SETGE_UINT T5.X, T8.X, T1.W,
-; EG-NEXT:     LSHL T1.Y, PS, 1,
-; EG-NEXT:     BFE_UINT T7.Z, T3.X, 1, 1, BS:VEC_201
-; EG-NEXT:     SUB_INT T9.W, PV.W, T5.Z,
-; EG-NEXT:     SUB_INT * T10.W, T5.Y, T10.X,
-; EG-NEXT:     CNDE_INT T4.X, T7.W, T4.X, PS,
-; EG-NEXT:     CNDE_INT T5.Y, T6.X, T0.Z, PV.W,
-; EG-NEXT:     OR_INT T0.Z, PV.Y, PV.Z,
-; EG-NEXT:     CNDE_INT T7.W, T6.Z, PV.X, T4.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T9.W, T2.Y, T0.W,
-; EG-NEXT:     CNDE_INT T5.X, PV.W, T2.Y, PS,
-; EG-NEXT:     SETGE_UINT T1.Y, PV.Z, T2.Z,
-; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Y, T1.Z, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T8.W, PV.X, T8.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T3.Y, T9.X,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETE_INT T4.X, PV.W, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PS, T4.W,
-; EG-NEXT:     SETE_INT T10.W, PV.Z, T3.Z,
-; EG-NEXT:     SETGE_UINT * T11.W, PV.Z, T3.Z,
+; EG-NEXT:     LSHL T2.X, PS, 1,
+; EG-NEXT:     BFE_UINT T3.Y, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT * T2.Z, T15.W, T9.W, PV.W, BS:VEC_120/SCL_212
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T9.W, T2.Y, T11.W, T3.Z,
+; EG-NEXT:     OR_INT * T11.W, T4.X, T4.Y,
+; EG-NEXT:     SETGE_UINT T4.X, T13.W, T1.Y,
+; EG-NEXT:     SETGE_UINT T2.Y, PS, T2.W,
+; EG-NEXT:     BIT_ALIGN_INT * T3.Z, PV.W, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T2.Z, T10.W, literal.x,
+; EG-NEXT:     OR_INT * T10.W, T2.X, T3.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETE_INT T2.X, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T3.Y, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T2.Z, PS, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T12.W, T3.Z, T6.W,
+; EG-NEXT:     SETGE_UINT * T15.W, T3.Z, T6.W,
 ; EG-NEXT:     CNDE_INT T6.X, PV.W, PS, PV.Z,
-; EG-NEXT:     CNDE_INT T1.Y, PV.X, PV.Y, T1.Y,
-; EG-NEXT:     LSHL T5.Z, T5.X, 1, BS:VEC_201
-; EG-NEXT:     BFE_UINT T10.W, T0.Y, literal.x, 1,
-; EG-NEXT:     CNDE_INT * T7.W, T7.W, T8.X, T7.X,
+; EG-NEXT:     CNDE_INT T2.Y, PV.X, PV.Y, T2.Y,
+; EG-NEXT:     SUB_INT T2.Z, T11.W, T2.W,
+; EG-NEXT:     SUBB_UINT T12.W, T11.W, T2.W,
+; EG-NEXT:     SUB_INT * T15.W, T9.W, T1.Z,
+; EG-NEXT:     SUB_INT T2.X, T10.W, T3.W,
+; EG-NEXT:     SUBB_UINT T3.Y, T10.W, T3.W,
+; EG-NEXT:     SUB_INT T4.Z, T3.Z, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT * T12.W, PS, PV.W,
+; EG-NEXT:     CNDE_INT * T11.W, T2.Y, T11.W, T2.Z,
+; EG-NEXT:     LSHL T7.X, PV.W, 1,
+; EG-NEXT:     BFE_UINT T4.Y, T7.W, 1, 1,
+; EG-NEXT:     CNDE_INT T2.Z, T2.Y, T9.W, T12.W,
+; EG-NEXT:     SUB_INT * T9.W, T4.Z, T3.Y,
+; EG-NEXT:     CNDE_INT * T10.W, T6.X, T10.W, T2.X,
+; EG-NEXT:     LSHL T2.X, PV.W, 1,
+; EG-NEXT:     BFE_UINT T2.Y, T8.W, literal.x, 1,
+; EG-NEXT:     CNDE_INT T3.Z, T6.X, T3.Z, T9.W,
+; EG-NEXT:     BIT_ALIGN_INT T9.W, T2.Z, T11.W, literal.y,
+; EG-NEXT:     OR_INT * T11.W, T7.X, T4.Y,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     SETGE_UINT T6.X, PS, T2.W,
+; EG-NEXT:     SETE_INT T3.Y, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T2.Z, PV.W, T1.Z,
+; EG-NEXT:     BIT_ALIGN_INT T10.W, PV.Z, T10.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, PV.X, PV.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T2.X, PS, T3.W,
+; EG-NEXT:     SUB_INT T2.Y, PV.W, T6.W, BS:VEC_102/SCL_221
+; EG-NEXT:     CNDE_INT T2.Z, PV.Y, PV.Z, PV.X,
+; EG-NEXT:     CNDE_INT T15.W, T5.X, T4.X, T1.X,
+; EG-NEXT:     SUB_INT * T16.W, T14.W, T0.Y,
+; EG-NEXT:     SUBB_UINT T1.X, T14.W, T0.Y,
+; EG-NEXT:     SUB_INT * T3.Y, T13.W, T1.Y, BS:VEC_120/SCL_212
+; EG-NEXT:     SETGE_UINT * T3.Z, T12.W, T3.W,
+; EG-NEXT:     SETE_INT T17.W, T10.W, T6.W,
+; EG-NEXT:     SETGE_UINT * T18.W, T10.W, T6.W,
+; EG-NEXT:     SUB_INT T4.X, T11.W, T2.W,
+; EG-NEXT:     CNDE_INT * T4.Y, PV.W, PS, T3.Z,
+; EG-NEXT:     SUB_INT T3.Z, T12.W, T3.W,
+; EG-NEXT:     SUB_INT * T17.W, T3.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T14.W, T15.W, T14.W, T16.W,
+; EG-NEXT:     LSHL T1.X, PV.W, 1,
+; EG-NEXT:     BFE_UINT * T3.Y, T5.W, literal.x, 1,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T4.X, T0.Z, T2.Z,
-; EG-NEXT:     SUBB_UINT T2.Y, T0.Z, T2.Z,
-; EG-NEXT:     SUB_INT T6.Z, T8.W, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T7.W, PS, T5.X, literal.x,
-; EG-NEXT:     OR_INT * T10.W, PV.Z, PV.W,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T5.X, PS, T0.W,
-; EG-NEXT:     SETE_INT T3.Y, PV.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T11.W, PV.Z, PV.Y,
-; EG-NEXT:     CNDE_INT * T12.W, T1.Y, T0.Z, PV.X,
-; EG-NEXT:     SUBB_UINT * T4.X, T10.W, T0.W,
-; EG-NEXT:     SUB_INT T2.Y, T7.W, T1.W,
-; EG-NEXT:     LSHL T0.Z, T12.W, 1, BS:VEC_201
-; EG-NEXT:     AND_INT * T13.W, T3.X, 1,
-; EG-NEXT:     CNDE_INT * T8.W, T1.Y, T8.W, T11.W,
-; EG-NEXT:     SUB_INT T3.X, T10.W, T0.W,
-; EG-NEXT:     BIT_ALIGN_INT * T1.Y, PV.W, T12.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     OR_INT T0.Z, T0.Z, T13.W,
-; EG-NEXT:     SUB_INT T8.W, T2.Y, T4.X,
-; EG-NEXT:     CNDE_INT * T11.W, T3.Y, T5.Z, T5.X,
-; EG-NEXT:     SUB_INT T4.X, T9.W, T4.W,
-; EG-NEXT:     CNDE_INT T2.Y, PS, T7.W, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T5.Z, PV.Z, T2.Z,
-; EG-NEXT:     SETE_INT * T7.W, T1.Y, T4.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT * T8.W, T11.W, T10.W, T3.X,
-; EG-NEXT:     SETGE_UINT T3.X, T1.Y, T4.Z,
-; EG-NEXT:     SUBB_UINT T3.Y, T0.Z, T2.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T4.Z, T1.Y, T4.Z,
-; EG-NEXT:    ALU clause starting at 2098:
-; EG-NEXT:     BFE_UINT T10.W, T0.Y, 1, 1,
-; EG-NEXT:     LSHL * T11.W, T8.W, 1,
-; EG-NEXT:     SUBB_UINT T5.X, T9.W, T4.W,
-; EG-NEXT:     SUB_INT T4.Y, T1.Z, T3.Z,
-; EG-NEXT:     OR_INT T6.Z, PS, PV.W,
-; EG-NEXT:     SUB_INT * T10.W, T4.Z, T3.Y, BS:VEC_201
-; EG-NEXT:     CNDE_INT * T7.W, T7.W, T3.X, T5.Z,
-; EG-NEXT:     CNDE_INT T3.X, PV.W, T1.Y, T10.W,
-; EG-NEXT:     SETGE_UINT T1.Y, T6.Z, T0.W,
-; EG-NEXT:     SUB_INT T4.Z, T4.Y, T5.X,
-; EG-NEXT:     BIT_ALIGN_INT * T8.W, T2.Y, T8.W, literal.x, BS:VEC_201
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     CNDE_INT * T9.W, T6.X, T9.W, T4.X,
-; EG-NEXT:     SETE_INT T4.X, T8.W, T1.W,
-; EG-NEXT:     SETGE_UINT T2.Y, T8.W, T1.W,
-; EG-NEXT:     LSHL T5.Z, PV.W, 1,
-; EG-NEXT:     BFE_UINT T10.W, T2.X, 1, 1,
-; EG-NEXT:     CNDE_INT * T11.W, T6.X, T1.Z, T4.Z,
-; EG-NEXT:     BIT_ALIGN_INT T5.X, PS, T9.W, literal.x, BS:VEC_021/SCL_122
-; EG-NEXT:     OR_INT T3.Y, PV.Z, PV.W,
-; EG-NEXT:     CNDE_INT T1.Z, PV.X, PV.Y, T1.Y,
-; EG-NEXT:     SUB_INT T9.W, T6.Z, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     XOR_INT * T10.W, T3.X, T5.W,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SUB_INT T3.X, PS, T5.W,
-; EG-NEXT:     CNDE_INT T1.Y, PV.Z, T6.Z, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT T4.Z, PV.Y, T4.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETE_INT T9.W, PV.X, T3.Z,
-; EG-NEXT:     SETGE_UINT * T10.W, PV.X, T3.Z,
-; EG-NEXT:     CNDE_INT T4.X, PV.W, PS, PV.Z,
-; EG-NEXT:     LSHL T2.Y, PV.Y, 1,
-; EG-NEXT:     AND_INT T4.Z, T0.Y, 1,
-; EG-NEXT:     SUBB_UINT T9.W, T6.Z, T0.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT * T10.W, T8.W, T1.W,
-; EG-NEXT:     SUB_INT T6.X, T3.Y, T4.W,
+; EG-NEXT:     CNDE_INT * T4.Z, T15.W, T13.W, T17.W,
+; EG-NEXT:     CNDE_INT T12.W, T4.Y, T12.W, T3.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T13.W, T2.Z, T11.W, T4.X,
+; EG-NEXT:     LSHL T4.X, PS, 1,
+; EG-NEXT:     LSHL T5.Y, PV.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, T4.Z, T14.W, literal.x,
+; EG-NEXT:     OR_INT T14.W, T1.X, T3.Y,
+; EG-NEXT:     SUB_INT * T15.W, T2.Y, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_UINT T1.X, T8.W, 1, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T4.Y, T10.W, PS,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T10.W, PV.Z, T1.Y,
+; EG-NEXT:     SETGE_UINT * T15.W, PV.Z, T1.Y,
+; EG-NEXT:     AND_INT T2.X, T7.W, 1,
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, PS, PV.Z,
+; EG-NEXT:     SUB_INT T4.Z, T14.W, T0.Y, BS:VEC_102/SCL_221
+; EG-NEXT:     BIT_ALIGN_INT T7.W, PV.Y, T12.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T10.W, T5.Y, PV.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, T14.W, T0.Y,
+; EG-NEXT:     SUB_INT T2.Y, T3.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T5.Z, PS, T3.W,
+; EG-NEXT:     SETE_INT * T12.W, PV.W, T6.W, BS:VEC_021/SCL_122
+; EG-NEXT:    ALU clause starting at 2093:
+; EG-NEXT:     SETGE_UINT * T15.W, T7.W, T6.W,
+; EG-NEXT:     SUBB_UINT T5.X, T11.W, T2.W,
+; EG-NEXT:     CNDE_INT * T4.Y, T12.W, PV.W, T5.Z, BS:VEC_201
+; EG-NEXT:     SUB_INT T5.Z, T10.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T11.W, T2.Y, T1.X,
+; EG-NEXT:     CNDE_INT * T12.W, T3.Y, T14.W, T4.Z,
+; EG-NEXT:     SUB_INT T1.X, T9.W, T1.Z,
+; EG-NEXT:     LSHL T2.Y, PS, 1,
+; EG-NEXT:     BFE_UINT T4.Z, T5.W, 1, 1, BS:VEC_201
+; EG-NEXT:     CNDE_INT T11.W, T3.Y, T3.Z, PV.W, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T14.W, T4.Y, T10.W, PV.Z,
+; EG-NEXT:     LSHL T6.X, PS, 1,
+; EG-NEXT:     AND_INT T3.Y, T8.W, 1,
+; EG-NEXT:     BIT_ALIGN_INT T3.Z, PV.W, T12.W, literal.x,
+; EG-NEXT:     OR_INT T8.W, PV.Y, PV.Z,
+; EG-NEXT:     SUB_INT * T11.W, PV.X, T5.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T1.X, T2.Z, T9.W, PS,
+; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T0.Y,
+; EG-NEXT:     SETE_INT T2.Z, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETGE_UINT T9.W, PV.Z, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T11.W, PV.X, PV.Y,
+; EG-NEXT:     SUB_INT T5.X, PS, T3.W,
+; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT:     SUB_INT T2.Z, T8.W, T0.Y,
+; EG-NEXT:     BIT_ALIGN_INT T9.W, PV.X, T13.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     OR_INT * T12.W, T4.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T1.X, PS, T2.W,
+; EG-NEXT:     SETE_INT T3.Y, PV.W, T1.Z,
+; EG-NEXT:     SETGE_UINT T4.Z, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T13.W, PS, T2.W,
+; EG-NEXT:     SUB_INT * T15.W, PV.W, T1.Z,
+; EG-NEXT:     SUBB_UINT T2.X, T8.W, T0.Y,
+; EG-NEXT:     SUB_INT T5.Y, T3.Z, T1.Y, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT T1.Z, PS, PV.W,
+; EG-NEXT:     CNDE_INT T13.W, PV.Y, PV.Z, PV.X,
+; EG-NEXT:     CNDE_INT * T8.W, T2.Y, T8.W, T2.Z,
+; EG-NEXT:     LSHL T1.X, PS, 1,
+; EG-NEXT:     CNDE_INT T3.Y, PV.W, T9.W, PV.Z,
+; EG-NEXT:     SUBB_UINT * T1.Z, T10.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T9.W, T7.W, T6.W,
+; EG-NEXT:     SUB_INT * T10.W, T5.Y, T2.X,
+; EG-NEXT:     AND_INT T2.X, T5.W, 1,
+; EG-NEXT:     CNDE_INT T2.Y, T2.Y, T3.Z, PS,
+; EG-NEXT:     SUB_INT T1.Z, PV.W, T1.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT * T2.W, T12.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     XOR_INT * T5.W, T3.Y, T1.W,
+; EG-NEXT:     SUB_INT * T4.X, PV.W, T1.W,
+; EG-NEXT:     CNDE_INT * T3.Y, T13.W, T12.W, T2.W,
+; EG-NEXT:     CNDE_INT T1.Z, T4.Y, T7.W, T1.Z,
+; EG-NEXT:     BIT_ALIGN_INT T2.W, T2.Y, T8.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     OR_INT * T5.W, T1.X, T2.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SETGE_UINT T1.X, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T2.Y, PV.W, T1.Y,
+; EG-NEXT:     SETGE_UINT T2.Z, PV.W, T1.Y,
+; EG-NEXT:     SUBB_UINT T7.W, PS, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT * T8.W, PV.W, T1.Y,
+; EG-NEXT:     SUB_INT T2.X, T5.W, T0.Y,
 ; EG-NEXT:     SUB_INT T0.Y, PS, PV.W,
-; EG-NEXT:     SUBB_UINT T5.Z, T3.Y, T4.W,
-; EG-NEXT:     SUB_INT T9.W, T5.X, T3.Z,
-; EG-NEXT:     OR_INT * T10.W, PV.Y, PV.Z,
-; EG-NEXT:     SUB_INT T7.X, PS, T0.W,
-; EG-NEXT:     SUB_INT T2.Y, PV.W, PV.Z,
-; EG-NEXT:     CNDE_INT T1.Z, T1.Z, T8.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T8.W, T4.X, T3.Y, PV.X,
-; EG-NEXT:     SUB_INT * T9.W, T0.Z, T2.Z,
-; EG-NEXT:     CNDE_INT T6.X, T7.W, T0.Z, PS,
-; EG-NEXT:     LSHL T0.Y, PV.W, 1,
-; EG-NEXT:     AND_INT T0.Z, T2.X, 1,
-; EG-NEXT:     BIT_ALIGN_INT T7.W, PV.Z, T1.Y, literal.x,
-; EG-NEXT:     CNDE_INT * T9.W, T4.X, T5.X, PV.Y,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T2.X, T10.W, T0.W,
-; EG-NEXT:     SETE_INT T1.Y, PV.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SETGE_UINT * T1.Z, PV.W, T1.W, BS:VEC_021/SCL_122
-; EG-NEXT:     BIT_ALIGN_INT T8.W, T9.W, T8.W, literal.x,
-; EG-NEXT:     OR_INT * T9.W, T0.Y, T0.Z,
-; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     SETGE_UINT T4.X, PS, T4.W,
-; EG-NEXT:     SETE_INT T0.Y, PV.W, T3.Z,
-; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T3.Z,
-; EG-NEXT:     SUBB_UINT T11.W, PS, T4.W,
-; EG-NEXT:     SUB_INT * T12.W, PV.W, T3.Z,
-; EG-NEXT:     SUBB_UINT * T5.X, T10.W, T0.W,
-; EG-NEXT:     SUB_INT * T2.Y, T7.W, T1.W,
-; EG-NEXT:     SUB_INT * T2.Z, T9.W, T4.W,
-; EG-NEXT:     SUB_INT T0.W, T12.W, T11.W,
-; EG-NEXT:     CNDE_INT * T1.W, T0.Y, T0.Z, T4.X,
-; EG-NEXT:     CNDE_INT T4.X, PS, T8.W, PV.W, BS:VEC_021/SCL_122
-; EG-NEXT:     CNDE_INT T0.Y, PS, T9.W, T2.Z, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT T0.Z, T2.Y, T5.X,
-; EG-NEXT:     CNDE_INT T0.W, T1.Y, T1.Z, T2.X, BS:VEC_210
-; EG-NEXT:     XOR_INT * T1.W, T6.X, T5.W,
-; EG-NEXT:     SUBB_UINT T2.X, PS, T5.W,
-; EG-NEXT:     CNDE_INT T1.Y, PV.W, T7.W, PV.Z, BS:VEC_021/SCL_122
-; EG-NEXT:     XOR_INT T0.Z, PV.Y, T3.W, BS:VEC_102/SCL_221
-; EG-NEXT:     XOR_INT * T4.W, PV.X, T3.W, BS:VEC_102/SCL_221
-; EG-NEXT:     CNDE_INT * T0.W, T0.W, T10.W, T7.X,
-; EG-NEXT:     XOR_INT T4.X, PV.W, T6.W,
-; EG-NEXT:     SUB_INT T0.Y, T4.W, T3.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUBB_UINT T1.Z, T0.Z, T3.W, BS:VEC_021/SCL_122
-; EG-NEXT:     XOR_INT T0.W, T1.Y, T6.W,
-; EG-NEXT:     SUB_INT * T7.W, T3.X, T2.X,
-; EG-NEXT:     SUB_INT T2.X, PV.W, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT T7.Y, PV.Y, PV.Z,
-; EG-NEXT:     SUBB_UINT T1.Z, PV.X, T6.W, BS:VEC_021/SCL_122
-; EG-NEXT:     XOR_INT T0.W, T1.X, T2.W,
-; EG-NEXT:     XOR_INT * T4.W, T0.X, T2.W,
-; EG-NEXT:     SUB_INT T0.Y, PS, T2.W,
-; EG-NEXT:     SUB_INT T7.Z, T1.W, T5.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUBB_UINT T1.W, PV.W, T2.W,
-; EG-NEXT:     SUB_INT * T4.W, PV.X, PV.Z,
-; EG-NEXT:     SUB_INT T7.X, T0.Z, T3.W,
-; EG-NEXT:     SUB_INT T4.Y, PV.Y, PV.W,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     SUB_INT T4.Z, T4.X, T6.W, BS:VEC_102/SCL_221
-; EG-NEXT:     SUB_INT * T4.X, T0.W, T2.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     CNDE_INT T2.Z, PV.Y, PV.Z, PV.X,
+; EG-NEXT:     BIT_ALIGN_INT T7.W, T1.Z, T14.W, literal.x, BS:VEC_021/SCL_122
+; EG-NEXT:     XOR_INT * T8.W, T3.Y, T1.W,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     SUBB_UINT T1.X, PS, T1.W,
+; EG-NEXT:     SETGE_UINT * T1.Y, T11.W, T3.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SETE_INT T1.Z, T7.W, T6.W,
+; EG-NEXT:     CNDE_INT * T2.W, T2.Z, T2.W, T0.Y, BS:VEC_021/SCL_122
+; EG-NEXT:     CNDE_INT * T5.W, T2.Z, T5.W, T2.X,
+; EG-NEXT:     SETGE_UINT * T2.X, T7.W, T6.W,
+; EG-NEXT:     SUBB_UINT * T0.Y, T11.W, T3.W,
+; EG-NEXT:     SUB_INT * T2.Z, T7.W, T6.W,
+; EG-NEXT:     XOR_INT T3.W, T5.W, T4.W,
+; EG-NEXT:     XOR_INT * T2.W, T2.W, T4.W,
+; EG-NEXT:     SUB_INT T6.X, PS, T4.W,
+; EG-NEXT:     SUBB_UINT T2.Y, PV.W, T4.W,
+; EG-NEXT:     SUB_INT T2.Z, T2.Z, T0.Y,
+; EG-NEXT:     CNDE_INT T2.W, T1.Z, T2.X, T1.Y, BS:VEC_102/SCL_221
+; EG-NEXT:     SUB_INT * T5.W, T4.X, T1.X,
+; EG-NEXT:     CNDE_INT T1.X, PV.W, T7.W, PV.Z, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT T5.Y, PV.X, PV.Y,
+; EG-NEXT:     CNDE_INT T1.Z, PV.W, T11.W, T5.X, BS:VEC_201
+; EG-NEXT:     XOR_INT T2.W, T3.X, T0.W,
+; EG-NEXT:     XOR_INT * T6.W, T0.X, T0.W,
+; EG-NEXT:     SUB_INT T0.X, PS, T0.W,
+; EG-NEXT:     SUBB_UINT T0.Y, PV.W, T0.W,
+; EG-NEXT:     SUB_INT T5.Z, T8.W, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     XOR_INT T1.W, PV.Z, T0.Z,
+; EG-NEXT:     XOR_INT * T6.W, PV.X, T0.Z,
+; EG-NEXT:     SUB_INT T5.X, T3.W, T4.W,
+; EG-NEXT:     SUB_INT T1.Z, PS, T0.Z,
+; EG-NEXT:     SUBB_UINT T3.W, PV.W, T0.Z,
+; EG-NEXT:     SUB_INT * T4.W, PV.X, PV.Y,
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     SUB_INT * T4.Y, PV.Z, PV.W,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T1.X, PV.X, literal.x,
+; EG-NEXT:     SUB_INT T4.Z, T2.W, T0.W,
+; EG-NEXT:     SUB_INT * T4.X, T1.W, T0.Z,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %den_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
   %num = load <4 x i64>, ptr addrspace(1) %in
   %den = load <4 x i64>, ptr addrspace(1) %den_ptr
@@ -9056,9 +9052,9 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @6
-; EG-NEXT:    ALU 48, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 1
+; EG-NEXT:    ALU 49, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
@@ -9068,53 +9064,54 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; EG-NEXT:    ALU clause starting at 11:
 ; EG-NEXT:     ASHR * T1.W, T1.W, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT:     ASHR T2.Z, T0.Y, literal.x,
-; EG-NEXT:     ASHR T2.W, T1.Y, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T1.W, PV.W, literal.y,
-; EG-NEXT:    31(4.344025e-44), 30(4.203895e-44)
-; EG-NEXT:     ADD_INT T0.Y, T1.Z, PS,
-; EG-NEXT:     ASHR T3.Z, T0.W, literal.x,
-; EG-NEXT:     LSHR T0.W, PV.W, literal.y,
-; EG-NEXT:     LSHR * T2.W, PV.Z, literal.y,
-; EG-NEXT:    31(4.344025e-44), 30(4.203895e-44)
-; EG-NEXT:     ADD_INT T2.X, T0.X, PS,
-; EG-NEXT:     ADD_INT T1.Y, T1.X, PV.W, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR T2.Z, PV.Z, literal.x,
-; EG-NEXT:     ADDC_UINT T1.W, T1.Z, T1.W,
-; EG-NEXT:     AND_INT * T3.W, PV.Y, literal.y,
+; EG-NEXT:     LSHR * T1.W, PV.W, literal.x,
+; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T2.W, T1.Z, PV.W,
+; EG-NEXT:     ASHR * T3.W, T1.Y, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T3.W, PS, literal.x,
+; EG-NEXT:     AND_INT * T2.W, PV.W, literal.y,
 ; EG-NEXT:    30(4.203895e-44), -4(nan)
-; EG-NEXT:     SUBB_UINT T3.X, T1.Z, PS,
-; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, 1,
-; EG-NEXT:     ADD_INT T3.Z, T0.Z, PV.Z, BS:VEC_120/SCL_212
-; EG-NEXT:     ADDC_UINT T0.W, T1.X, T0.W,
-; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT:    -4(nan), 0(0.000000e+00)
-; EG-NEXT:     ADDC_UINT T4.X, T0.Z, T2.Z,
-; EG-NEXT:     SUBB_UINT T1.Y, T1.X, PS,
-; EG-NEXT:     BFE_INT T2.Z, PV.W, 0.0, 1,
-; EG-NEXT:     AND_INT T0.W, PV.Z, literal.x,
-; EG-NEXT:     SUB_INT * T4.W, PV.Y, PV.X,
+; EG-NEXT:     SUBB_UINT T1.Y, T1.Z, PS,
+; EG-NEXT:     ASHR T2.Z, T0.W, literal.x,
+; EG-NEXT:     ADD_INT T0.W, T1.X, PV.W,
+; EG-NEXT:     ADDC_UINT * T1.W, T1.Z, T1.W,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T2.X, PS, 0.0, 1,
+; EG-NEXT:     ADDC_UINT T2.Y, T1.X, T3.W,
+; EG-NEXT:     AND_INT T3.Z, PV.W, literal.x,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.y,
+; EG-NEXT:     LSHR * T1.W, PV.Z, literal.z,
+; EG-NEXT:    -4(nan), 31(4.344025e-44)
+; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T3.X, T0.Z, PS,
+; EG-NEXT:     LSHR T0.Y, PV.W, literal.x,
+; EG-NEXT:     SUBB_UINT T2.Z, T1.X, PV.Z,
+; EG-NEXT:     BFE_INT T0.W, PV.Y, 0.0, 1,
+; EG-NEXT:     SUB_INT * T4.W, PV.X, T1.Y,
+; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT:     SUB_INT T4.Y, PV.W, PV.Z,
+; EG-NEXT:     ADD_INT T2.Z, T0.X, PV.Y,
+; EG-NEXT:     ADDC_UINT T0.W, T0.Z, T1.W,
+; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
 ; EG-NEXT:    -4(nan), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T3.X, T0.Z, PV.W,
-; EG-NEXT:     SUB_INT T4.Y, PV.Z, PV.Y,
-; EG-NEXT:     BFE_INT T2.Z, PV.X, 0.0, 1,
-; EG-NEXT:     ADDC_UINT T2.W, T0.X, T2.W,
-; EG-NEXT:     AND_INT * T5.W, T2.X, literal.x,
+; EG-NEXT:     SUBB_UINT T2.X, T0.Z, PS,
+; EG-NEXT:     BFE_INT T1.Y, PV.W, 0.0, 1,
+; EG-NEXT:     SUB_INT T4.Z, T1.Z, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT:     ADDC_UINT T0.W, T0.X, T0.Y,
+; EG-NEXT:     AND_INT * T2.W, PV.Z, literal.x,
 ; EG-NEXT:    -4(nan), 0(0.000000e+00)
-; EG-NEXT:     SUBB_UINT T0.Y, T0.X, PS,
-; EG-NEXT:     SUB_INT T4.Z, T1.Z, T3.W,
-; EG-NEXT:     BFE_INT T2.W, PV.W, 0.0, 1,
-; EG-NEXT:     SUB_INT * T3.W, PV.Z, PV.X,
-; EG-NEXT:     SUB_INT T4.X, T1.X, T1.W,
-; EG-NEXT:     SUB_INT T3.Y, PV.W, PV.Y,
-; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
-; EG-NEXT:     SUB_INT T3.Z, T0.Z, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT:     SUB_INT * T3.X, T0.X, T5.W,
-; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     SUB_INT T4.X, T1.X, T3.Z,
+; EG-NEXT:     SUBB_UINT T1.Z, T0.X, PS, BS:VEC_120/SCL_212
+; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, 1,
+; EG-NEXT:     SUB_INT * T3.W, PV.Y, PV.X,
+; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     SUB_INT * T3.Y, PV.W, PV.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT T2.X, PV.X, literal.x,
+; EG-NEXT:     SUB_INT T3.Z, T0.Z, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT:     SUB_INT * T3.X, T0.X, T2.W,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %num = load <4 x i64>, ptr addrspace(1) %in
   %result = srem <4 x i64> %num, 
   store <4 x i64> %result, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 571c0f04c06ca..094f114dd4fdc 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -311,9 +311,9 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 3 @6
-; EG-NEXT:    ALU 34, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; EG-NEXT:    ALU 33, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 6:
 ; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
@@ -349,15 +349,14 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     CNDE_INT T1.X, PS, PV.W, PV.Y,
-; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:     CNDE_INT * T2.W, T0.Z, T1.Y, 0.0,
-; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
-; EG-NEXT:     CNDE_INT T2.Y, T3.W, T3.Y, 0.0,
-; EG-NEXT:     CNDE_INT T1.W, T1.W, T4.Z, 0.0, BS:VEC_120/SCL_212
-; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:     CNDE_INT T2.W, T0.Z, T1.Y, 0.0,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     CNDE_INT T2.Y, T3.W, T3.Y, 0.0,
+; EG-NEXT:     CNDE_INT * T1.W, T1.W, T4.Z, 0.0, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT T3.X, T0.X, literal.x,
 ; EG-NEXT:     CNDE_INT * T1.Y, T4.W, T4.Y, 0.0,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
   %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
   %a = load <4 x i64>, ptr addrspace(1) %in
   %b = load <4 x i64>, ptr addrspace(1) %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f6922c75ff848..0e6bbd4e16e1b 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -48,16 +48,17 @@ bb:
 define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX8-LABEL: barrier_vscnt_global:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_barrier
@@ -67,17 +68,13 @@ define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX9-LABEL: barrier_vscnt_global:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-NEXT:    global_store_dword v[2:3], v1, off
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    global_store_dword v[2:3], v0, off offset:-4
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -100,39 +97,34 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
 ; GFX8-LABEL: barrier_vmcnt_vscnt_global:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -8, v1
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v2, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[3:4]
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vmcnt_vscnt_global:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:-8
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_store_dword v[2:3], v1, off
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1] offset:8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    global_store_dword v[2:3], v0, off offset:-4
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -201,16 +193,17 @@ bb:
 define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
 ; GFX8-LABEL: barrier_vscnt_flat:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
@@ -219,16 +212,15 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
 ;
 ; GFX9-LABEL: barrier_vscnt_flat:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -4, v2
-; GFX9-NEXT:    flat_store_dword v[2:3], v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 4, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_store_dword v[0:1], v2 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
@@ -255,42 +247,38 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
 ; GFX8-LABEL: barrier_vmcnt_vscnt_flat:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -8, v1
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v2, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[3:4]
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vmcnt_vscnt_flat:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, -8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
-; GFX9-NEXT:    flat_load_dword v4, v[4:5]
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -4, v2
-; GFX9-NEXT:    flat_store_dword v[2:3], v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_load_dword v2, v[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 4, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_store_dword v[0:1], v3 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    flat_store_dword v[0:1], v4
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -315,42 +303,38 @@ bb:
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
 ; GFX8-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GFX8:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[1:2], 30, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v3, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -8, v1
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v2, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[3:4]
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_barrier
-; GFX8-NEXT:    flat_store_dword v[0:1], v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GFX9:         s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 30, v[1:2]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, -8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
-; GFX9-NEXT:    flat_load_dword v4, v[4:5]
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -4, v2
-; GFX9-NEXT:    flat_store_dword v[2:3], v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_load_dword v2, v[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 4, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    flat_store_dword v[0:1], v3 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_barrier
-; GFX9-NEXT:    flat_store_dword v[0:1], v4
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

From 851ca38d1bcb6a8bbf46de46953f8d6a858a4d2e Mon Sep 17 00:00:00 2001
From: Krut Patel 
Date: Tue, 12 May 2026 09:46:31 -0700
Subject: [PATCH 479/538] [AsmParser] Use cantFail for FloatLiteral string
 conversion (#197064)

With assertions disabled but `LLVM_ABI_BREAKING_CHECKS=FORCE_ON`, the
`assert` was elided, the Expected stayed unchecked, and the subsequent
`*Except` tripped `fatalUncheckedError`. Fix this by switching to
`cantFail`.

Assisted-by: Claude Opus
---
 llvm/lib/AsmParser/LLParser.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 820f64bf30ba7..bcd18fd11f706 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4162,15 +4162,16 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
     if (!ExpectedTy->isFloatingPointTy())
       return error(ID.Loc, "floating-point constant invalid for type");
     ID.APFloatVal = APFloat(ExpectedTy->getFltSemantics());
-    auto Except = ID.APFloatVal.convertFromString(
-        Lex.getStrVal(), RoundingMode::NearestTiesToEven);
-    assert(Except && "Invalid float strings should be caught by the lexer");
+    APFloat::opStatus Except =
+        cantFail(ID.APFloatVal.convertFromString(
+                     Lex.getStrVal(), RoundingMode::NearestTiesToEven),
+                 "Invalid float strings should be caught by the lexer");
     // Forbid overflowing and underflowing literals, but permit inexact
     // literals. Underflow is thrown when the result is denormal, so to allow
     // denormals, only reject underflowing literals that resulted in a zero.
-    if (*Except & APFloat::opOverflow)
+    if (Except & APFloat::opOverflow)
       return error(ID.Loc, "floating-point constant overflowed type");
-    if ((*Except & APFloat::opUnderflow) && ID.APFloatVal.isZero())
+    if ((Except & APFloat::opUnderflow) && ID.APFloatVal.isZero())
       return error(ID.Loc, "floating-point constant underflowed type");
     ID.Kind = ValID::t_APFloat;
     break;

From bc39082d02f798ed94317ea1a8f93f1e0adbc398 Mon Sep 17 00:00:00 2001
From: Dave Lee 
Date: Tue, 12 May 2026 09:50:35 -0700
Subject: [PATCH 480/538] [clang-tidy] Add `llvm-formatv-string` (#195974)

Adds a clang-tidy check to perform some validation on `llvm::formatv`
calls. Similar to the built in support Clang has for checking printf
calls.

The validations are:
- The number of unique format indices matches the number of arguments.
- Every argument is used by the format string.
- Automatic and explicit indices are not mixed.

This includes a config option (`AdditionalFunctions`) to perform the
same validation checks on other functions which take formatv inputs.

Assisted-by: claude

---------

Co-authored-by: EugeneZelenko 
Co-authored-by: Victor Chernyakin 
Co-authored-by: Zeyi Xu 
---
 .../clang-tidy/llvm/CMakeLists.txt            |   1 +
 .../clang-tidy/llvm/FormatvStringCheck.cpp    | 171 ++++++++++++++++++
 .../clang-tidy/llvm/FormatvStringCheck.h      |  47 +++++
 .../clang-tidy/llvm/LLVMTidyModule.cpp        |   2 +
 clang-tools-extra/docs/ReleaseNotes.rst       |   6 +
 .../docs/clang-tidy/checks/list.rst           |   1 +
 .../clang-tidy/checks/llvm/formatv-string.rst |  50 +++++
 .../llvm/formatv-string-additional.cpp        |  28 +++
 .../llvm/formatv-string-autodetect.cpp        |  24 +++
 .../checkers/llvm/formatv-string.cpp          |  84 +++++++++
 10 files changed, 414 insertions(+)
 create mode 100644 clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.cpp
 create mode 100644 clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.h
 create mode 100644 clang-tools-extra/docs/clang-tidy/checks/llvm/formatv-string.rst
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-additional.cpp
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-autodetect.cpp
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string.cpp

diff --git a/clang-tools-extra/clang-tidy/llvm/CMakeLists.txt b/clang-tools-extra/clang-tidy/llvm/CMakeLists.txt
index c81882e0e2024..bec3ba50c81c5 100644
--- a/clang-tools-extra/clang-tidy/llvm/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/llvm/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_clang_library(clangTidyLLVMModule STATIC
+  FormatvStringCheck.cpp
   HeaderGuardCheck.cpp
   IncludeOrderCheck.cpp
   LLVMTidyModule.cpp
diff --git a/clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.cpp b/clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.cpp
new file mode 100644
index 0000000000000..0b439e9f72c8f
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.cpp
@@ -0,0 +1,171 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FormatvStringCheck.h"
+#include "../utils/OptionsUtils.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Error.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::llvm_check {
+
+namespace {
+
+struct ParseResult {
+  SmallVector Indices;
+  unsigned MaxIndex = 0;
+};
+
+} // namespace
+
+static Expected parseFormatvString(StringRef Fmt) {
+  ParseResult Result;
+  unsigned NextAutoIndex = 0;
+  bool HasAutomatic = false;
+  bool HasExplicit = false;
+
+  while (!Fmt.empty()) {
+    const size_t OpenBrace = Fmt.find('{');
+    if (OpenBrace == StringRef::npos)
+      break;
+
+    Fmt = Fmt.drop_front(OpenBrace);
+
+    // Handle escaped braces '{{'.
+    if (Fmt.consume_front("{{"))
+      continue;
+
+    // Find the closing '}'.
+    const size_t CloseBrace = Fmt.find('}');
+    if (CloseBrace == StringRef::npos)
+      return llvm::createStringError("unterminated brace in format string");
+
+    // Extract the content between braces.
+    const StringRef Content = Fmt.substr(1, CloseBrace - 1);
+    Fmt = Fmt.drop_front(CloseBrace + 1);
+
+    // Parse the replacement field: [index] ["," layout] [":" format]
+    StringRef IndexStr = Content.substr(0, Content.find_first_of(",:"));
+
+    IndexStr = IndexStr.trim();
+
+    unsigned Index = 0;
+    if (IndexStr.empty()) {
+      Index = NextAutoIndex++;
+      HasAutomatic = true;
+    } else {
+      if (IndexStr.getAsInteger(10, Index))
+        return llvm::createStringError(
+            "invalid replacement index in format string");
+      HasExplicit = true;
+    }
+
+    Result.Indices.push_back(Index);
+    Result.MaxIndex = std::max(Result.MaxIndex, Index);
+  }
+
+  if (HasAutomatic && HasExplicit)
+    return llvm::createStringError(
+        "format string mixes automatic and explicit indices");
+
+  return Result;
+}
+
+FormatvStringCheck::FormatvStringCheck(StringRef Name,
+                                       ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      AdditionalFunctions(Options.get("AdditionalFunctions", "")) {
+  Functions = utils::options::parseStringList(AdditionalFunctions);
+  Functions.emplace_back("::llvm::formatv");
+  Functions.emplace_back("::llvm::createStringErrorV");
+}
+
+void FormatvStringCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "AdditionalFunctions", AdditionalFunctions);
+}
+
+void FormatvStringCheck::registerMatchers(MatchFinder *Finder) {
+  // Build a matcher for all configured function names.
+  Finder->addMatcher(
+      callExpr(callee(functionDecl(hasAnyName(Functions),
+                                   ast_matchers::isTemplateInstantiation())),
+               argumentCountAtLeast(1))
+          .bind("call"),
+      this);
+}
+
+void FormatvStringCheck::check(const MatchFinder::MatchResult &Result) {
+  const auto *Call = Result.Nodes.getNodeAs("call");
+  assert(Call && Call->getNumArgs() > 0);
+
+  const auto *FD = Call->getDirectCallee();
+  assert(FD);
+
+  // Find the format string index from the template signature: it's the
+  // parameter immediately before the trailing parameter pack.
+  const FunctionDecl *TemplateDecl = FD;
+  if (const FunctionTemplateDecl *Primary = FD->getPrimaryTemplate())
+    TemplateDecl = Primary->getTemplatedDecl();
+
+  const unsigned NumDeclParams = TemplateDecl->getNumParams();
+  if (NumDeclParams < 2)
+    return;
+
+  const unsigned PackParamIndex = NumDeclParams - 1;
+  if (!TemplateDecl->getParamDecl(PackParamIndex)->isParameterPack())
+    return;
+
+  const unsigned FmtStringIndex = PackParamIndex - 1;
+
+  if (Call->getNumArgs() <= FmtStringIndex)
+    return;
+
+  // Extract the format string literal.
+  const Expr *FmtArg = Call->getArg(FmtStringIndex)->IgnoreParenImpCasts();
+  const auto *FmtLiteral = dyn_cast(FmtArg);
+  if (!FmtLiteral)
+    return;
+
+  const StringRef FmtString = FmtLiteral->getString();
+  const int NumFmtArgs = Call->getNumArgs() - PackParamIndex;
+
+  auto ParsedOrErr = parseFormatvString(FmtString);
+  if (!ParsedOrErr) {
+    diag(FmtLiteral->getBeginLoc(), toString(ParsedOrErr.takeError()));
+    return;
+  }
+
+  const ParseResult &Parsed = *ParsedOrErr;
+  const int NumRequiredArgs = Parsed.Indices.empty() ? 0 : Parsed.MaxIndex + 1;
+
+  if (NumRequiredArgs > NumFmtArgs) {
+    diag(FmtLiteral->getBeginLoc(),
+         "format string requires %0 argument%s0, but %1 argument%s1 "
+         "%plural{1:was|:were}1 provided")
+        << NumRequiredArgs << NumFmtArgs;
+    return;
+  }
+
+  // Check for unused arguments: both indices not referenced by the format
+  // string, and trailing arguments beyond what the format string requires.
+  llvm::SmallBitVector UnusedIndices(NumFmtArgs, true);
+  for (const unsigned Index : Parsed.Indices)
+    UnusedIndices.reset(Index);
+
+  for (const auto UnusedIndex : UnusedIndices.set_bits()) {
+    const Expr *UnusedArg = Call->getArg(PackParamIndex + UnusedIndex);
+    diag(UnusedArg->getBeginLoc(), "argument unused in format string");
+  }
+}
+
+} // namespace clang::tidy::llvm_check
diff --git a/clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.h b/clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.h
new file mode 100644
index 0000000000000..d1a3db4e77f4a
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/llvm/FormatvStringCheck.h
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVM_FORMATVSTRINGCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVM_FORMATVSTRINGCHECK_H
+
+#include "../ClangTidyCheck.h"
+#include <optional>
+#include <vector>
+
+namespace clang::tidy::llvm_check {
+
+/// Validates llvm::formatv format strings against the provided arguments.
+///
+/// Checks that:
+/// - The number of format indices matches the number of arguments.
+/// - Every argument is used by the format string.
+/// - Automatic and explicit indices are not mixed.
+///
+/// For the user-facing documentation see:
+/// https://clang.llvm.org/extra/clang-tidy/checks/llvm/formatv-string.html
+class FormatvStringCheck : public ClangTidyCheck {
+public:
+  FormatvStringCheck(StringRef Name, ClangTidyContext *Context);
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+    return LangOpts.CPlusPlus;
+  }
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+
+  std::optional<TraversalKind> getCheckTraversalKind() const override {
+    return TK_IgnoreUnlessSpelledInSource;
+  }
+
+private:
+  std::vector<StringRef> Functions;
+  const StringRef AdditionalFunctions;
+};
+
+} // namespace clang::tidy::llvm_check
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVM_FORMATVSTRINGCHECK_H
diff --git a/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp b/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp
index 104fcf63712f7..918af88c979e0 100644
--- a/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp
@@ -11,6 +11,7 @@
 #include "../readability/ElseAfterReturnCheck.h"
 #include "../readability/NamespaceCommentCheck.h"
 #include "../readability/QualifiedAutoCheck.h"
+#include "FormatvStringCheck.h"
 #include "HeaderGuardCheck.h"
 #include "IncludeOrderCheck.h"
 #include "PreferIsaOrDynCastInConditionalsCheck.h"
@@ -32,6 +33,7 @@ class LLVMModule : public ClangTidyModule {
   void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override {
    CheckFactories.registerCheck<readability::ElseAfterReturnCheck>(
        "llvm-else-after-return");
+    CheckFactories.registerCheck<FormatvStringCheck>("llvm-formatv-string");
    CheckFactories.registerCheck<LLVMHeaderGuardCheck>("llvm-header-guard");
    CheckFactories.registerCheck<IncludeOrderCheck>("llvm-include-order");
    CheckFactories.registerCheck<readability::NamespaceCommentCheck>(
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 75ad9050787a5..3f000d7051614 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -204,6 +204,12 @@ New checks
   Finds functions where throwing exceptions is unsafe but the function is still
   marked as potentially throwing.
 
+- New :doc:`llvm-formatv-string
+  <clang-tidy/checks/llvm/formatv-string>` check.
+
+  Validates ``llvm::formatv`` format strings against the provided arguments,
+  diagnosing mismatched argument counts, unused arguments, and mixed index styles.
+
- New :doc:`llvm-redundant-casting
  <clang-tidy/checks/llvm/redundant-casting>` check.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 96b8231d2b618..f86d0fb643801 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -243,6 +243,7 @@ Clang-Tidy Checks
   :doc:`google-runtime-operator <google/runtime-operator>`,
   :doc:`google-upgrade-googletest-case <google/upgrade-googletest-case>`, "Yes"
   :doc:`linuxkernel-must-check-errs <linuxkernel/must-check-errs>`,
+   :doc:`llvm-formatv-string <llvm/formatv-string>`,
   :doc:`llvm-header-guard <llvm/header-guard>`,
   :doc:`llvm-include-order <llvm/include-order>`, "Yes"
   :doc:`llvm-namespace-comment <llvm/namespace-comment>`,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/formatv-string.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/formatv-string.rst
new file mode 100644
index 0000000000000..1a545906a795b
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/formatv-string.rst
@@ -0,0 +1,50 @@
+.. title:: clang-tidy - llvm-formatv-string
+
+llvm-formatv-string
+===================
+
+Validates ``llvm::formatv`` format strings against the provided arguments,
+diagnosing mismatched argument counts, unused arguments, and mixed index
+styles.
+
+This check diagnoses the following issues:
+
+- The number of replacement indices in the format string does not match the
+  number of arguments provided.
+- A format string does not use one of the given arguments.
+- Mixing of automatic and explicit indices (e.g. ``{} {1}``).
+
+.. code-block:: c++
+
+  // warning: format string requires 2 arguments, but 1 argument was provided
+  llvm::formatv("{0} {1}", x);
+
+  // warning: format string mixes automatic and explicit indices
+  llvm::formatv("{} {1}", x, y);
+
+  // warning: argument unused in format string
+  llvm::formatv("{0} {2}", x, y, z);
+
+  // OK.
+  llvm::formatv("{0} {1}", x, y);
+  llvm::formatv("{} {}", x, y);
+  llvm::formatv("{0} {0}", x);
+
+The check only operates on calls where the format string is a string literal.
+Dynamic format strings are not diagnosed.
+
+Options
+-------
+
+.. option:: AdditionalFunctions
+
+  A semicolon-separated list of additional fully qualified function names to
+  check, beyond ``llvm::formatv`` and ``llvm::createStringErrorV``. Each
+  function must be a variadic template whose last parameter is a parameter
+  pack. The format string is assumed to be the parameter immediately preceding
+  the pack.
+
+  For example, to check ``::mylib::log(Level, const char *Fmt, Ts&&...)`` set
+  this option to `::mylib::log`.
+
+  Default is the empty string.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-additional.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-additional.cpp
new file mode 100644
index 0000000000000..ee1a4dd4a43f3
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-additional.cpp
@@ -0,0 +1,28 @@
+// RUN: %check_clang_tidy %s llvm-formatv-string %t -- \
+// RUN:   -config='{CheckOptions: {llvm-formatv-string.AdditionalFunctions: "mylib::log"}}'
+
+namespace llvm {
+
+template <typename... Ts>
+void formatv(const char *Fmt, Ts &&...Vals) {}
+
+} // namespace llvm
+
+namespace mylib {
+
+enum Level { Info, Error };
+
+template <typename... Ts>
+void log(Level L, const char *Fmt, Ts &&...Vals) {}
+
+} // namespace mylib
+
+void correct() {
+  mylib::log(mylib::Info, "{0} {1}", 1, 2);
+  mylib::log(mylib::Error, "{0}", 42);
+}
+
+void wrong_count() {
+  mylib::log(mylib::Info, "{0} {1}", 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: format string requires 2 arguments, but 1 argument was provided
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-autodetect.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-autodetect.cpp
new file mode 100644
index 0000000000000..9b4288ba9a65e
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string-autodetect.cpp
@@ -0,0 +1,24 @@
+// RUN: %check_clang_tidy %s llvm-formatv-string %t
+
+namespace llvm {
+
+template <typename... Ts>
+void createStringErrorV(int EC, const char *Fmt, Ts &&...Vals) {}
+
+template <typename... Ts>
+void createStringErrorV(const char *Fmt, Ts &&...Vals) {}
+
+} // namespace llvm
+
+void correct() {
+  llvm::createStringErrorV(0, "{0} {1}", 1, 2);
+  llvm::createStringErrorV("{0}", 42);
+}
+
+void wrong_count() {
+  llvm::createStringErrorV(0, "{0} {1}", 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: format string requires 2 arguments, but 1 argument was provided
+
+  llvm::createStringErrorV("{0} {1}", 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: format string requires 2 arguments, but 1 argument was provided
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string.cpp
new file mode 100644
index 0000000000000..36df9da3dff58
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/formatv-string.cpp
@@ -0,0 +1,84 @@
+// RUN: %check_clang_tidy %s llvm-formatv-string %t
+
+namespace llvm {
+
+template <typename... Ts>
+void formatv(const char *Fmt, Ts &&...Vals) {}
+
+template <typename... Ts>
+void formatv(bool Validate, const char *Fmt, Ts &&...Vals) {}
+
+} // namespace llvm
+
+void correct() {
+  llvm::formatv("{0}", 1);
+  llvm::formatv("{0} {1}", 1, 2);
+  llvm::formatv("{0} {0}", 1);
+  llvm::formatv("{1} {0}", 1, 2);
+  llvm::formatv("{0,10}", 1);
+  llvm::formatv("{0,-10}", 1);
+  llvm::formatv("{0:x}", 1);
+  llvm::formatv("{0,10:x}", 1);
+  llvm::formatv("{0:$[,]}", 1);
+  llvm::formatv("{ 0 }", 1);
+  llvm::formatv("{  0  :x}", 1);
+  llvm::formatv("{ 0 } { 1 }", 1, 2);
+  llvm::formatv("no replacements");
+  llvm::formatv("escaped {{ braces }}");
+  llvm::formatv("{}", 1);
+  llvm::formatv("{} {}", 1, 2);
+  llvm::formatv(false, "{0}", 1);
+}
+
+void too_few_args() {
+  llvm::formatv("{0}");
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: format string requires 1 argument, but 0 arguments were provided
+
+  llvm::formatv("{0} {1}", 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: format string requires 2 arguments, but 1 argument was provided
+
+  llvm::formatv("{0} {1} {2}", 1, 2);
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: format string requires 3 arguments, but 2 arguments were provided
+}
+
+void too_many_args() {
+  llvm::formatv("{0}", 1, 2);
+  // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: argument unused in format string
+
+  llvm::formatv("no replacements", 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:36: warning: argument unused in format string
+}
+
+void mixed_indices() {
+  llvm::formatv("{} {1}", 1, 2);
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: format string mixes automatic and explicit indices
+}
+
+void holes_in_indices() {
+  llvm::formatv("{0} {2}", 1, 2, 3);
+  // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: argument unused in format string
+
+  llvm::formatv("{2}", 1, 2, 3);
+  // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: argument unused in format string
+  // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: argument unused in format string
+
+  llvm::formatv("{1}", 10, 20, 30);
+  // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: argument unused in format string
+  // CHECK-MESSAGES: :[[@LINE-2]]:32: warning: argument unused in format string
+}
+
+void non_literal_format_string(const char *fmt) {
+  // No warning for non-literal format strings.
+  llvm::formatv(fmt, 1, 2);
+}
+
+void bool_overload() {
+  llvm::formatv(false, "{0} {1}", 1, 2);
+  llvm::formatv(true, "{0}", 1, 2);
+  // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: argument unused in format string
+}
+
+void invalid_index() {
+  llvm::formatv("{abc}", 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: invalid replacement index in format string
+}

From 6121089f4cf8be6ece817b8e9fd194935fdde9c9 Mon Sep 17 00:00:00 2001
From: vporpo 
Date: Tue, 12 May 2026 09:53:02 -0700
Subject: [PATCH 481/538] [AMDGPU][MIRFormatter] Human-readable mask for
 S_WAITCNT_soft (#197075)

This patch reuses the S_WAITCNT mask printer and parser for
S_WAITCNT_soft. It prints the mask in a human-readable format, showing
the counter values like `Vmcnt__Expcnt__Lgkmcnt_`.
---
 llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp |   2 +
 .../memory-legalizer-atomic-fence.ll          | 268 +++++++++---------
 .../memory-legalizer-atomic-insert-end.mir    |   6 +-
 ...egalizer-multiple-mem-operands-atomics.mir |   4 +-
 ...-legalizer-single-wave-workgroup-memops.ll |  94 +++---
 .../CodeGen/MIR/AMDGPU/s_waitcnt_soft.mir     | 229 +++++++++++++++
 6 files changed, 417 insertions(+), 186 deletions(-)
 create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/s_waitcnt_soft.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
index b967013201062..e262d4e715fea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -91,6 +91,7 @@ void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
 
   switch (MI.getOpcode()) {
   case AMDGPU::S_WAITCNT:
+  case AMDGPU::S_WAITCNT_soft:
     printSWaitcntImm(Imm, OS);
     break;
   case AMDGPU::S_WAITCNT_DEPCTR:
@@ -116,6 +117,7 @@ bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode,
 
   switch (OpCode) {
   case AMDGPU::S_WAITCNT:
+  case AMDGPU::S_WAITCNT_soft:
     return parseSWaitcntImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
   case AMDGPU::S_WAITCNT_DEPCTR:
     return parseSWaitAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 37b5422be7e2f..6d23ee00c8190 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -13,19 +13,19 @@
 define amdgpu_kernel void @system_one_as_acquire() #0 {
   ; GFX6-LABEL: name: system_one_as_acquire
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_one_as_acquire
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_one_as_acquire
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -33,7 +33,7 @@ define amdgpu_kernel void @system_one_as_acquire() #0 {
   ;
   ; GFX10CU-LABEL: name: system_one_as_acquire
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -41,7 +41,7 @@ define amdgpu_kernel void @system_one_as_acquire() #0 {
   ;
   ; GFX11WGP-LABEL: name: system_one_as_acquire
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -49,7 +49,7 @@ define amdgpu_kernel void @system_one_as_acquire() #0 {
   ;
   ; GFX11CU-LABEL: name: system_one_as_acquire
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -62,35 +62,35 @@ entry:
 define amdgpu_kernel void @system_one_as_release() #0 {
   ; GFX6-LABEL: name: system_one_as_release
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_one_as_release
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_one_as_release
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: system_one_as_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: system_one_as_release
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: system_one_as_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -101,19 +101,19 @@ entry:
 define amdgpu_kernel void @system_one_as_acq_rel() #0 {
   ; GFX6-LABEL: name: system_one_as_acq_rel
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_one_as_acq_rel
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_one_as_acq_rel
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -121,7 +121,7 @@ define amdgpu_kernel void @system_one_as_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: system_one_as_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -129,7 +129,7 @@ define amdgpu_kernel void @system_one_as_acq_rel() #0 {
   ;
   ; GFX11WGP-LABEL: name: system_one_as_acq_rel
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -137,7 +137,7 @@ define amdgpu_kernel void @system_one_as_acq_rel() #0 {
   ;
   ; GFX11CU-LABEL: name: system_one_as_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -150,19 +150,19 @@ entry:
 define amdgpu_kernel void @system_one_as_seq_cst() #0 {
   ; GFX6-LABEL: name: system_one_as_seq_cst
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_one_as_seq_cst
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_one_as_seq_cst
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -170,7 +170,7 @@ define amdgpu_kernel void @system_one_as_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: system_one_as_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -178,7 +178,7 @@ define amdgpu_kernel void @system_one_as_seq_cst() #0 {
   ;
   ; GFX11WGP-LABEL: name: system_one_as_seq_cst
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -186,7 +186,7 @@ define amdgpu_kernel void @system_one_as_seq_cst() #0 {
   ;
   ; GFX11CU-LABEL: name: system_one_as_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -315,19 +315,19 @@ entry:
 define amdgpu_kernel void @agent_one_as_acquire() #0 {
   ; GFX6-LABEL: name: agent_one_as_acquire
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_one_as_acquire
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_one_as_acquire
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -335,7 +335,7 @@ define amdgpu_kernel void @agent_one_as_acquire() #0 {
   ;
   ; GFX10CU-LABEL: name: agent_one_as_acquire
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -343,7 +343,7 @@ define amdgpu_kernel void @agent_one_as_acquire() #0 {
   ;
   ; GFX11WGP-LABEL: name: agent_one_as_acquire
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -351,7 +351,7 @@ define amdgpu_kernel void @agent_one_as_acquire() #0 {
   ;
   ; GFX11CU-LABEL: name: agent_one_as_acquire
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -364,35 +364,35 @@ entry:
 define amdgpu_kernel void @agent_one_as_release() #0 {
   ; GFX6-LABEL: name: agent_one_as_release
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_one_as_release
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_one_as_release
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: agent_one_as_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: agent_one_as_release
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: agent_one_as_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -403,19 +403,19 @@ entry:
 define amdgpu_kernel void @agent_one_as_acq_rel() #0 {
   ; GFX6-LABEL: name: agent_one_as_acq_rel
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_one_as_acq_rel
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_one_as_acq_rel
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -423,7 +423,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: agent_one_as_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -431,7 +431,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel() #0 {
   ;
   ; GFX11WGP-LABEL: name: agent_one_as_acq_rel
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -439,7 +439,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel() #0 {
   ;
   ; GFX11CU-LABEL: name: agent_one_as_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -452,19 +452,19 @@ entry:
 define amdgpu_kernel void @agent_one_as_seq_cst() #0 {
   ; GFX6-LABEL: name: agent_one_as_seq_cst
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 3952
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_one_as_seq_cst
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 3952
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_one_as_seq_cst
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -472,7 +472,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: agent_one_as_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -480,7 +480,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst() #0 {
   ;
   ; GFX11WGP-LABEL: name: agent_one_as_seq_cst
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -488,7 +488,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst() #0 {
   ;
   ; GFX11CU-LABEL: name: agent_one_as_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -509,7 +509,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire() #0 {
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_acquire
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10WGP-NEXT:   S_ENDPGM 0
@@ -520,7 +520,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire() #0 {
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_acquire
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
@@ -544,27 +544,27 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_release
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_lds_direct
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_release
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -583,7 +583,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_lds_direct
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -591,21 +591,21 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -624,7 +624,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_lds_direct
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -632,21 +632,21 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -773,19 +773,19 @@ entry:
 define amdgpu_kernel void @system_acquire() #0 {
   ; GFX6-LABEL: name: system_acquire
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_acquire
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_acquire
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -793,7 +793,7 @@ define amdgpu_kernel void @system_acquire() #0 {
   ;
   ; GFX10CU-LABEL: name: system_acquire
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -801,7 +801,7 @@ define amdgpu_kernel void @system_acquire() #0 {
   ;
   ; GFX11WGP-LABEL: name: system_acquire
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -809,7 +809,7 @@ define amdgpu_kernel void @system_acquire() #0 {
   ;
   ; GFX11CU-LABEL: name: system_acquire
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -822,35 +822,35 @@ entry:
 define amdgpu_kernel void @system_release() #0 {
   ; GFX6-LABEL: name: system_release
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_release
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_release
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: system_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: system_release
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: system_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -861,19 +861,19 @@ entry:
 define amdgpu_kernel void @system_acq_rel() #0 {
   ; GFX6-LABEL: name: system_acq_rel
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_acq_rel
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_acq_rel
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -881,7 +881,7 @@ define amdgpu_kernel void @system_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: system_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -889,7 +889,7 @@ define amdgpu_kernel void @system_acq_rel() #0 {
   ;
   ; GFX11WGP-LABEL: name: system_acq_rel
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -897,7 +897,7 @@ define amdgpu_kernel void @system_acq_rel() #0 {
   ;
   ; GFX11CU-LABEL: name: system_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -910,19 +910,19 @@ entry:
 define amdgpu_kernel void @system_seq_cst() #0 {
   ; GFX6-LABEL: name: system_seq_cst
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: system_seq_cst
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: system_seq_cst
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -930,7 +930,7 @@ define amdgpu_kernel void @system_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: system_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -938,7 +938,7 @@ define amdgpu_kernel void @system_seq_cst() #0 {
   ;
   ; GFX11WGP-LABEL: name: system_seq_cst
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -946,7 +946,7 @@ define amdgpu_kernel void @system_seq_cst() #0 {
   ;
   ; GFX11CU-LABEL: name: system_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1075,19 +1075,19 @@ entry:
 define amdgpu_kernel void @agent_acquire() #0 {
   ; GFX6-LABEL: name: agent_acquire
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_acquire
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_acquire
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1095,7 +1095,7 @@ define amdgpu_kernel void @agent_acquire() #0 {
   ;
   ; GFX10CU-LABEL: name: agent_acquire
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1103,7 +1103,7 @@ define amdgpu_kernel void @agent_acquire() #0 {
   ;
   ; GFX11WGP-LABEL: name: agent_acquire
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1111,7 +1111,7 @@ define amdgpu_kernel void @agent_acquire() #0 {
   ;
   ; GFX11CU-LABEL: name: agent_acquire
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1124,35 +1124,35 @@ entry:
 define amdgpu_kernel void @agent_release() #0 {
   ; GFX6-LABEL: name: agent_release
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_release
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_release
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: agent_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: agent_release
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: agent_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -1163,19 +1163,19 @@ entry:
 define amdgpu_kernel void @agent_acq_rel() #0 {
   ; GFX6-LABEL: name: agent_acq_rel
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_acq_rel
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_acq_rel
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1183,7 +1183,7 @@ define amdgpu_kernel void @agent_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: agent_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1191,7 +1191,7 @@ define amdgpu_kernel void @agent_acq_rel() #0 {
   ;
   ; GFX11WGP-LABEL: name: agent_acq_rel
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1199,7 +1199,7 @@ define amdgpu_kernel void @agent_acq_rel() #0 {
   ;
   ; GFX11CU-LABEL: name: agent_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1212,19 +1212,19 @@ entry:
 define amdgpu_kernel void @agent_seq_cst() #0 {
   ; GFX6-LABEL: name: agent_seq_cst
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 112
+  ; GFX6-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX6-NEXT:   BUFFER_WBINVL1 implicit $exec
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: agent_seq_cst
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 112
+  ; GFX8-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX8-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: agent_seq_cst
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1232,7 +1232,7 @@ define amdgpu_kernel void @agent_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: agent_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX10CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1240,7 +1240,7 @@ define amdgpu_kernel void @agent_seq_cst() #0 {
   ;
   ; GFX11WGP-LABEL: name: agent_seq_cst
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1248,7 +1248,7 @@ define amdgpu_kernel void @agent_seq_cst() #0 {
   ;
   ; GFX11CU-LABEL: name: agent_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   BUFFER_GL1_INV implicit $exec
   ; GFX11CU-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1261,36 +1261,36 @@ entry:
 define amdgpu_kernel void @workgroup_acquire() #0 {
   ; GFX6-LABEL: name: workgroup_acquire
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_acquire
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_acquire
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_acquire
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_acquire
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_acquire
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") acquire
@@ -1300,37 +1300,37 @@ entry:
 define amdgpu_kernel void @workgroup_release() #0 {
   ; GFX6-LABEL: name: workgroup_release
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_release
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_release
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_lds_direct
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_release
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_release
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_release
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -1341,17 +1341,17 @@ entry:
 define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ; GFX6-LABEL: name: workgroup_acq_rel
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_acq_rel
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_acq_rel
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_lds_direct
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1359,21 +1359,21 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_acq_rel
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_acq_rel
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_acq_rel
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
@@ -1384,17 +1384,17 @@ entry:
 define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ; GFX6-LABEL: name: workgroup_seq_cst
   ; GFX6: bb.0.entry:
-  ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_seq_cst
   ; GFX8: bb.0.entry:
-  ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_seq_cst
   ; GFX10WGP: bb.0.entry:
-  ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10WGP-NEXT:   S_WAITCNT_lds_direct
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -1402,21 +1402,21 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ;
   ; GFX10CU-LABEL: name: workgroup_seq_cst
   ; GFX10CU: bb.0.entry:
-  ; GFX10CU-NEXT:   S_WAITCNT_soft 112
+  ; GFX10CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10CU-NEXT:   S_WAITCNT_lds_direct
   ; GFX10CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_seq_cst
   ; GFX11WGP: bb.0.entry:
-  ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_seq_cst
   ; GFX11CU: bb.0.entry:
-  ; GFX11CU-NEXT:   S_WAITCNT_soft 7
+  ; GFX11CU-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX11CU-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
index 739219d019605..5633884baccf6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -79,7 +79,7 @@ body:             |
   ; CHECK-NEXT:   $sgpr6 = S_MOV_B32 0
   ; CHECK-NEXT:   S_WAITCNT .Lgkmcnt_0
   ; CHECK-NEXT:   $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 1, 0, implicit $exec :: (volatile load (s64) from %ir.tid.gep, addrspace 1)
-  ; CHECK-NEXT:   S_WAITCNT_soft 3952
+  ; CHECK-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; CHECK-NEXT:   $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec
   ; CHECK-NEXT:   V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
   ; CHECK-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -97,9 +97,9 @@ body:             |
   ; CHECK-NEXT:   S_WAITCNT .Lgkmcnt_0
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
   ; CHECK-NEXT:   S_WAITCNT .Vmcnt_0
-  ; CHECK-NEXT:   S_WAITCNT_soft 3952
+  ; CHECK-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; CHECK-NEXT:   BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from %ir.gep, addrspace 1)
-  ; CHECK-NEXT:   S_WAITCNT_soft 3952
+  ; CHECK-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; CHECK-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.exit:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
index a146c93716dbe..3941da208f825 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -51,9 +51,9 @@ body:             |
   ; GCN-NEXT:   S_WAITCNT .Lgkmcnt_0
   ; GCN-NEXT:   $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
   ; GCN-NEXT:   $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-  ; GCN-NEXT:   S_WAITCNT_soft 3952
+  ; GCN-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 0, implicit $exec :: (load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(1) poison`, addrspace 1), (load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(5) poison`, addrspace 5)
-  ; GCN-NEXT:   S_WAITCNT_soft 3952
+  ; GCN-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GCN-NEXT:   BUFFER_WBINVL1_VOL implicit $exec
   ; GCN-NEXT:   $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
   ; GCN-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
index aaa295992c361..85ccf0c6007eb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
@@ -44,7 +44,7 @@ define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 {
   ;
   ; GFX10-W32-LABEL: name: wg_fence_acq_rel_single64
   ; GFX10-W32: bb.0 (%ir-block.0):
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -82,19 +82,19 @@ define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 {
 define amdgpu_kernel void @wg_fence_acq_rel_multi() #2 {
   ; GFX9-LABEL: name: wg_fence_acq_rel_multi
   ; GFX9: bb.0 (%ir-block.0):
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   S_ENDPGM 0
   ;
   ; GFX942-LABEL: name: wg_fence_acq_rel_multi
   ; GFX942: bb.0 (%ir-block.0):
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   S_ENDPGM 0
   ;
   ; GFX10-LABEL: name: wg_fence_acq_rel_multi
   ; GFX10: bb.0 (%ir-block.0):
-  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -132,7 +132,7 @@ define amdgpu_kernel void @wg_fence_acquire_single64() #1 {
   ;
   ; GFX10-W32-LABEL: name: wg_fence_acquire_single64
   ; GFX10-W32: bb.0 (%ir-block.0):
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
@@ -175,7 +175,7 @@ define amdgpu_kernel void @wg_fence_release_single64() #1 {
   ;
   ; GFX10-W32-LABEL: name: wg_fence_release_single64
   ; GFX10-W32: bb.0 (%ir-block.0):
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   S_ENDPGM 0
@@ -219,7 +219,7 @@ define amdgpu_kernel void @wg_fence_seq_cst_single64() #1 {
   ;
   ; GFX10-W32-LABEL: name: wg_fence_seq_cst_single64
   ; GFX10-W32: bb.0 (%ir-block.0):
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
@@ -329,11 +329,11 @@ define amdgpu_kernel void @wg_ld_seq_cst_single64(ptr addrspace(1) %p) #1 {
   ; GFX10-W32-NEXT: {{  $}}
   ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
   ;
@@ -395,7 +395,7 @@ define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
   ; GFX9-NEXT:   S_ENDPGM 0
@@ -406,7 +406,7 @@ define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX942-NEXT: {{  $}}
   ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
   ; GFX942-NEXT:   S_ENDPGM 0
@@ -417,11 +417,11 @@ define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
-  ; GFX10-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-NEXT:   S_ENDPGM 0
   ;
@@ -484,7 +484,7 @@ define amdgpu_kernel void @wg_ld_acquire_single64(ptr addrspace(1) %p) #1 {
   ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
   ;
@@ -679,7 +679,7 @@ define amdgpu_kernel void @wg_st_seq_cst_single64(ptr addrspace(1) %p, i32 %x) #
   ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
@@ -746,7 +746,7 @@ define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
   ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
   ; GFX9-NEXT:   S_ENDPGM 0
@@ -759,7 +759,7 @@ define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
   ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 1, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
   ; GFX942-NEXT:   S_ENDPGM 0
@@ -772,7 +772,7 @@ define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
   ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
@@ -855,7 +855,7 @@ define amdgpu_kernel void @wg_st_release_single64(ptr addrspace(1) %p, i32 %x) #
   ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
@@ -1169,7 +1169,7 @@ define amdgpu_kernel void @wg_rmw_add_seq_cst_single64(ptr addrspace(1) %p) #1 {
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
@@ -1317,7 +1317,7 @@ define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
   ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
   ; GFX9-NEXT: {{  $}}
@@ -1345,7 +1345,7 @@ define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
   ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
   ; GFX942-NEXT: {{  $}}
@@ -1372,7 +1372,7 @@ define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
@@ -1403,7 +1403,7 @@ define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
   ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
   ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX10-W64-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W64-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W64-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W64-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
@@ -1541,7 +1541,7 @@ define amdgpu_kernel void @wg_rmw_xchg_acq_rel_single64(ptr addrspace(1) %p, i32
   ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
@@ -1635,7 +1635,7 @@ define amdgpu_kernel void @wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(1
   ; GFX10-W32-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
@@ -1710,7 +1710,7 @@ define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p,
   ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
   ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
   ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
   ; GFX9-NEXT:   S_ENDPGM 0
@@ -1723,7 +1723,7 @@ define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p,
   ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
   ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
   ; GFX942-NEXT:   S_ENDPGM 0
@@ -1736,7 +1736,7 @@ define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p,
   ; GFX10-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
   ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
   ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
@@ -1959,11 +1959,11 @@ define amdgpu_kernel void @lds_wg_ld_seq_cst_single64(ptr addrspace(3) %p) #1 {
   ; GFX10-W32-NEXT: {{  $}}
   ; GFX10-W32-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
   ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
@@ -2026,10 +2026,10 @@ define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
   ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   S_ENDPGM 0
   ;
@@ -2039,10 +2039,10 @@ define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
   ; GFX942-NEXT: {{  $}}
   ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
   ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   S_ENDPGM 0
   ;
@@ -2052,11 +2052,11 @@ define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
   ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
-  ; GFX10-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-NEXT:   S_ENDPGM 0
@@ -2122,7 +2122,7 @@ define amdgpu_kernel void @lds_wg_st_release_single64(ptr addrspace(3) %p, i32 %
   ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
   ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
@@ -2251,11 +2251,11 @@ define amdgpu_kernel void @lds_wg_rmw_add_acq_rel_single64(ptr addrspace(3) %p)
   ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT: {{  $}}
@@ -2408,11 +2408,11 @@ define amdgpu_kernel void @lds_wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspa
   ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-W32-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
@@ -2531,7 +2531,7 @@ define amdgpu_kernel void @lds_wg_cmpxchg_monotonic_acquire_single64(ptr addrspa
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
   ; GFX10-W32-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
   ;
@@ -2612,7 +2612,7 @@ define amdgpu_kernel void @flat_wg_ld_acquire_single64(ptr addrspace(0) %p) #1 {
   ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
   ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
   ; GFX10-W32-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 1, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
-  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10-W32-NEXT:   S_ENDPGM 0
   ;
@@ -2673,7 +2673,7 @@ define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x)
   ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
   ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
   ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX9-NEXT:   S_WAITCNT_lds_direct
   ; GFX9-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
   ; GFX9-NEXT:   S_ENDPGM 0
@@ -2686,7 +2686,7 @@ define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x)
   ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
   ; GFX942-NEXT:   $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
   ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_soft .Lgkmcnt_0
   ; GFX942-NEXT:   S_WAITCNT_lds_direct
   ; GFX942-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 1, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
   ; GFX942-NEXT:   S_ENDPGM 0
@@ -2700,7 +2700,7 @@ define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x)
   ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
   ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
   ; GFX10-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
   ; GFX10-NEXT:   S_WAITCNT_lds_direct
   ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/s_waitcnt_soft.mir b/llvm/test/CodeGen/MIR/AMDGPU/s_waitcnt_soft.mir
new file mode 100644
index 0000000000000..1067c872f4def
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/s_waitcnt_soft.mir
@@ -0,0 +1,229 @@
+# RUN: split-file %s %t
+
+# Common tests that work on all targets
+;--- common.mir
+# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=none %t/common.mir -o - | FileCheck %t/common.mir --check-prefix=CHECK
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=none %t/common.mir -o - | FileCheck %t/common.mir --check-prefix=CHECK
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=none %t/common.mir -o - | FileCheck %t/common.mir --check-prefix=CHECK
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=none %t/common.mir -o - | FileCheck %t/common.mir --check-prefix=CHECK
+---
+name: vmcnt_0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_0
+    ; CHECK: S_WAITCNT_soft .Vmcnt_0
+    S_WAITCNT_soft .Vmcnt_0
+...
+---
+name: vmcnt_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_1
+    ; CHECK: S_WAITCNT_soft .Vmcnt_1
+    S_WAITCNT_soft .Vmcnt_1
+...
+---
+name: expcnt_0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: expcnt_0
+    ; CHECK: S_WAITCNT_soft .Expcnt_0
+    S_WAITCNT_soft .Expcnt_0
+...
+---
+name: expcnt_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: expcnt_1
+    ; CHECK: S_WAITCNT_soft .Expcnt_1
+    S_WAITCNT_soft .Expcnt_1
+...
+---
+name: expcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: expcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Expcnt_6
+    S_WAITCNT_soft .Expcnt_6
+...
+---
+name: lgkmcnt_0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: lgkmcnt_0
+    ; CHECK: S_WAITCNT_soft .Lgkmcnt_0
+    S_WAITCNT_soft .Lgkmcnt_0
+...
+---
+name: lgkmcnt_1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: lgkmcnt_1
+    ; CHECK: S_WAITCNT_soft .Lgkmcnt_1
+    S_WAITCNT_soft .Lgkmcnt_1
+...
+---
+name: vmcnt_expcnt
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_expcnt
+    ; CHECK: S_WAITCNT_soft .Vmcnt_1_Expcnt_2
+    S_WAITCNT_soft .Vmcnt_1_Expcnt_2
+...
+---
+name: vmcnt_lgkmcnt
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_lgkmcnt
+    ; CHECK: S_WAITCNT_soft .Vmcnt_3_Lgkmcnt_5
+    S_WAITCNT_soft .Vmcnt_3_Lgkmcnt_5
+...
+---
+name: expcnt_lgkmcnt
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: expcnt_lgkmcnt
+    ; CHECK: S_WAITCNT_soft .Expcnt_4_Lgkmcnt_6
+    S_WAITCNT_soft .Expcnt_4_Lgkmcnt_6
+...
+---
+name: all-zero
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: all-zero
+    ; CHECK: S_WAITCNT_soft .Vmcnt_0_Expcnt_0_Lgkmcnt_0
+    S_WAITCNT_soft .Vmcnt_0_Expcnt_0_Lgkmcnt_0
+...
+---
+name: all-off
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: all-off
+    ; CHECK: S_WAITCNT_soft .AllOff
+    S_WAITCNT_soft .AllOff
+...
+---
+name: zero-number
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: zero-number
+    ; CHECK: S_WAITCNT_soft .Vmcnt_0_Expcnt_0_Lgkmcnt_0
+    S_WAITCNT_soft 0
+...
+
+# GFX8-specific: vmcnt max=15, lgkmcnt max=15
+;--- gfx8.mir
+# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=none %t/gfx8.mir -o - | FileCheck %t/gfx8.mir
+---
+name: vmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Vmcnt_14
+    S_WAITCNT_soft .Vmcnt_14
+...
+---
+name: lgkmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: lgkmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Lgkmcnt_14
+    S_WAITCNT_soft .Lgkmcnt_14
+...
+---
+name: all-off-number
+body: |
+  bb.0:
+    ; vmcnt=15, expcnt=7, lgkmcnt=15 -> 0xF7F = 3967
+    ; CHECK-LABEL: name: all-off-number
+    ; CHECK: S_WAITCNT_soft .AllOff
+    S_WAITCNT_soft 3967
+...
+
+# GFX9-specific: vmcnt max=63, lgkmcnt max=15
+;--- gfx9.mir
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=none %t/gfx9.mir -o - | FileCheck %t/gfx9.mir
+---
+name: vmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Vmcnt_62
+    S_WAITCNT_soft .Vmcnt_62
+...
+---
+name: lgkmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: lgkmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Lgkmcnt_14
+    S_WAITCNT_soft .Lgkmcnt_14
+...
+---
+name: all-off-number
+body: |
+  bb.0:
+    ; vmcnt=63, expcnt=7, lgkmcnt=15 -> 0xCF7F = 53119
+    ; CHECK-LABEL: name: all-off-number
+    ; CHECK: S_WAITCNT_soft .AllOff
+    S_WAITCNT_soft 53119
+...
+
+# GFX10-specific: vmcnt max=63, lgkmcnt max=63
+;--- gfx10.mir
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=none %t/gfx10.mir -o - | FileCheck %t/gfx10.mir
+---
+name: vmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Vmcnt_62
+    S_WAITCNT_soft .Vmcnt_62
+...
+---
+name: lgkmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: lgkmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Lgkmcnt_62
+    S_WAITCNT_soft .Lgkmcnt_62
+...
+---
+name: all-off-number
+body: |
+  bb.0:
+    ; vmcnt=63, expcnt=7, lgkmcnt=63 -> 0xFF7F = 65407
+    ; CHECK-LABEL: name: all-off-number
+    ; CHECK: S_WAITCNT_soft .AllOff
+    S_WAITCNT_soft 65407
+...
+
+# GFX11-specific: vmcnt max=63, lgkmcnt max=63 (different encoding than GFX10)
+;--- gfx11.mir
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=none %t/gfx11.mir -o - | FileCheck %t/gfx11.mir
+---
+name: vmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Vmcnt_62
+    S_WAITCNT_soft .Vmcnt_62
+...
+---
+name: lgkmcnt_max-1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: lgkmcnt_max-1
+    ; CHECK: S_WAITCNT_soft .Lgkmcnt_62
+    S_WAITCNT_soft .Lgkmcnt_62
+...
+---
+name: all-off-number
+body: |
+  bb.0:
+    ; GFX11 encoding: expcnt[2:0]=bits[2:0], lgkmcnt[5:0]=bits[9:4], vmcnt[5:0]=bits[15:10]
+    ; vmcnt=63, expcnt=7, lgkmcnt=63 -> 0xFFF7 = 65527
+    ; CHECK-LABEL: name: all-off-number
+    ; CHECK: S_WAITCNT_soft .AllOff
+    S_WAITCNT_soft 65527
+...

From df41d79a8531e0595b695878c38f51e0c60a0bcb Mon Sep 17 00:00:00 2001
From: walkerkd 
Date: Tue, 12 May 2026 18:10:14 +0100
Subject: [PATCH 482/538] Delete top level mops-instructions.s file (#197244)

The top-level mops-instructions.s file was accidentally added with
the AArch64 C1-Nano scheduling model and is deleted.

The correct file is located in
llvm/test/tools/llvm-mca/AArch64/Inputs/mops-instructions.s
---
 mops-instructions.s | 138 --------------------------------------------
 1 file changed, 138 deletions(-)
 delete mode 100644 mops-instructions.s

diff --git a/mops-instructions.s b/mops-instructions.s
deleted file mode 100644
index 89f271f55ab7d..0000000000000
--- a/mops-instructions.s
+++ /dev/null
@@ -1,138 +0,0 @@
-cpyfp [x0]!, [x1]!, x2!
-cpyfpwn [x0]!, [x1]!, x2!
-cpyfprn [x0]!, [x1]!, x2!
-cpyfpn [x0]!, [x1]!, x2!
-cpyfpwt [x0]!, [x1]!, x2!
-cpyfpwtwn [x0]!, [x1]!, x2!
-cpyfpwtrn [x0]!, [x1]!, x2!
-cpyfpwtn [x0]!, [x1]!, x2!
-cpyfprt [x0]!, [x1]!, x2!
-cpyfprtwn [x0]!, [x1]!, x2!
-cpyfprtrn [x0]!, [x1]!, x2!
-cpyfprtn [x0]!, [x1]!, x2!
-cpyfpt [x0]!, [x1]!, x2!
-cpyfptwn [x0]!, [x1]!, x2!
-cpyfptrn [x0]!, [x1]!, x2!
-cpyfptn [x0]!, [x1]!, x2!
-cpyfm [x0]!, [x1]!, x2!
-cpyfmwn [x0]!, [x1]!, x2!
-cpyfmrn [x0]!, [x1]!, x2!
-cpyfmn [x0]!, [x1]!, x2!
-cpyfmwt [x0]!, [x1]!, x2!
-cpyfmwtwn [x0]!, [x1]!, x2!
-cpyfmwtrn [x0]!, [x1]!, x2!
-cpyfmwtn [x0]!, [x1]!, x2!
-cpyfmrt [x0]!, [x1]!, x2!
-cpyfmrtwn [x0]!, [x1]!, x2!
-cpyfmrtrn [x0]!, [x1]!, x2!
-cpyfmrtn [x0]!, [x1]!, x2!
-cpyfmt [x0]!, [x1]!, x2!
-cpyfmtwn [x0]!, [x1]!, x2!
-cpyfmtrn [x0]!, [x1]!, x2!
-cpyfmtn [x0]!, [x1]!, x2!
-cpyfe [x0]!, [x1]!, x2!
-cpyfewn [x0]!, [x1]!, x2!
-cpyfern [x0]!, [x1]!, x2!
-cpyfen [x0]!, [x1]!, x2!
-cpyfewt [x0]!, [x1]!, x2!
-cpyfewtwn [x0]!, [x1]!, x2!
-cpyfewtrn [x0]!, [x1]!, x2!
-cpyfewtn [x0]!, [x1]!, x2!
-cpyfert [x0]!, [x1]!, x2!
-cpyfertwn [x0]!, [x1]!, x2!
-cpyfertrn [x0]!, [x1]!, x2!
-cpyfertn [x0]!, [x1]!, x2!
-cpyfet [x0]!, [x1]!, x2!
-cpyfetwn [x0]!, [x1]!, x2!
-cpyfetrn [x0]!, [x1]!, x2!
-cpyfetn [x0]!, [x1]!, x2!
-cpyp [x0]!, [x1]!, x2!
-cpypwn [x0]!, [x1]!, x2!
-cpyprn [x0]!, [x1]!, x2!
-cpypn [x0]!, [x1]!, x2!
-cpypwt [x0]!, [x1]!, x2!
-cpypwtwn [x0]!, [x1]!, x2!
-cpypwtrn [x0]!, [x1]!, x2!
-cpypwtn [x0]!, [x1]!, x2!
-cpyprt [x0]!, [x1]!, x2!
-cpyprtwn [x0]!, [x1]!, x2!
-cpyprtrn [x0]!, [x1]!, x2!
-cpyprtn [x0]!, [x1]!, x2!
-cpypt [x0]!, [x1]!, x2!
-cpyptwn [x0]!, [x1]!, x2!
-cpyptrn [x0]!, [x1]!, x2!
-cpyptn [x0]!, [x1]!, x2!
-cpym [x0]!, [x1]!, x2!
-cpymwn [x0]!, [x1]!, x2!
-cpymrn [x0]!, [x1]!, x2!
-cpymn [x0]!, [x1]!, x2!
-cpymwt [x0]!, [x1]!, x2!
-cpymwtwn [x0]!, [x1]!, x2!
-cpymwtrn [x0]!, [x1]!, x2!
-cpymwtn [x0]!, [x1]!, x2!
-cpymrt [x0]!, [x1]!, x2!
-cpymrtwn [x0]!, [x1]!, x2!
-cpymrtrn [x0]!, [x1]!, x2!
-cpymrtn [x0]!, [x1]!, x2!
-cpymt [x0]!, [x1]!, x2!
-cpymtwn [x0]!, [x1]!, x2!
-cpymtrn [x0]!, [x1]!, x2!
-cpymtn [x0]!, [x1]!, x2!
-cpye [x0]!, [x1]!, x2!
-cpyewn [x0]!, [x1]!, x2!
-cpyern [x0]!, [x1]!, x2!
-cpyen [x0]!, [x1]!, x2!
-cpyewt [x0]!, [x1]!, x2!
-cpyewtwn [x0]!, [x1]!, x2!
-cpyewtrn [x0]!, [x1]!, x2!
-cpyewtn [x0]!, [x1]!, x2!
-cpyert [x0]!, [x1]!, x2!
-cpyertwn [x0]!, [x1]!, x2!
-cpyertrn [x0]!, [x1]!, x2!
-cpyertn [x0]!, [x1]!, x2!
-cpyet [x0]!, [x1]!, x2!
-cpyetwn [x0]!, [x1]!, x2!
-cpyetrn [x0]!, [x1]!, x2!
-cpyetn [x0]!, [x1]!, x2!
-setp [x0]!, x1!, x2
-setpt [x0]!, x1!, x2
-setpn [x0]!, x1!, x2
-setptn [x0]!, x1!, x2
-setm [x0]!, x1!, x2
-setmt [x0]!, x1!, x2
-setmn [x0]!, x1!, x2
-setmtn [x0]!, x1!, x2
-sete [x0]!, x1!, x2
-setet [x0]!, x1!, x2
-seten [x0]!, x1!, x2
-setetn [x0]!, x1!, x2
-setgp [x0]!, x1!, x2
-setgpt [x0]!, x1!, x2
-setgpn [x0]!, x1!, x2
-setgptn [x0]!, x1!, x2
-setgm [x0]!, x1!, x2
-setgmt [x0]!, x1!, x2
-setgmn [x0]!, x1!, x2
-setgmtn [x0]!, x1!, x2
-setge [x0]!, x1!, x2
-setget [x0]!, x1!, x2
-setgen [x0]!, x1!, x2
-setgetn [x0]!, x1!, x2
-cpyfp [x0]!, [x1]!, xzr!
-cpyfm [x0]!, [x1]!, xzr!
-cpyfe [x0]!, [x1]!, xzr!
-cpyp [x0]!, [x1]!, xzr!
-cpym [x0]!, [x1]!, xzr!
-cpye [x0]!, [x1]!, xzr!
-setp [x0]!, xzr!, x2
-setp [x0]!, x1!, xzr
-setm [x0]!, xzr!, x2
-setm [x0]!, x1!, xzr
-sete [x0]!, xzr!, x2
-sete [x0]!, x1!, xzr
-setgp [x0]!, xzr!, x2
-setgp [x0]!, x1!, xzr
-setgm [x0]!, xzr!, x2
-setgm [x0]!, x1!, xzr
-setge [x0]!, xzr!, x2
-setge [x0]!, x1!, xzr

From eeca78019189a5559a694bb4774fec3e6ae07096 Mon Sep 17 00:00:00 2001
From: Aiden Grossman 
Date: Tue, 12 May 2026 10:16:15 -0700
Subject: [PATCH 483/538] [llvm][tools] Use temp dir for offload-binary
 unbundling test (#197234)

Certain environments will leave some of the test dirs read-only for
immutability purposes. Create a new temporary directory so that
llvm-offload-binary has a writable directory to unbundle the image into.
With this method we can also delete the temporary directory, preventing
failing tests from spuriously passing due to leftover files.
---
 llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll b/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll
index 023194dfab60c..dd1e839ddf905 100644
--- a/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll
+++ b/llvm/test/tools/llvm-offload-binary/llvm-offload-binary.ll
@@ -17,6 +17,8 @@
 ; RUN: diff %s %t4
 
 ; Test extracting all images without specifying --image filters.
+; RUN: rm -rf %t.dir && mkdir %t.dir
+; RUN: cd %t.dir
 ; RUN: llvm-offload-binary %t | FileCheck --check-prefix=EXTRACT %s
 
 ; EXTRACT: Extracted: llvm-offload-binary.{{.*}}-x-y-z-abc.0.

From 9a39c60406638ee8545d432e7f22a5ecd253c24d Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Tue, 12 May 2026 12:20:50 -0500
Subject: [PATCH 484/538] [libclc] Consolidate `amdgpu` and `amdgcn`
 architectures consistently (#197233)

Summary:
Currently we do not pass all checks with the amdgpu triple as we do with
amdgcn. SPIR-V set this pattern, so let's make it consistent.
---
 libclc/CMakeLists.txt | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 334faa1a02667..e6fdff67028ea 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -23,9 +23,10 @@ option(
 )
 
 # List of all supported architectures.
-set( LIBCLC_ARCHS_ALL amdgpu amdgcn nvptx64 )
+set( LIBCLC_ARCHS_NVPTX nvptx64 )
+set( LIBCLC_ARCHS_AMDGPU amdgpu amdgcn )
 set( LIBCLC_ARCHS_SPIRV spirv spirv32 spirv64 )
-list( APPEND LIBCLC_ARCHS_ALL ${LIBCLC_ARCHS_SPIRV} )
+list( APPEND LIBCLC_ARCHS_ALL ${LIBCLC_ARCHS_AMDGPU} ${LIBCLC_ARCHS_NVPTX} ${LIBCLC_ARCHS_SPIRV} )
 
 set(LIBCLC_TARGET ${LLVM_DEFAULT_TARGET_TRIPLE})
 
@@ -102,9 +103,9 @@ endif()
 message(STATUS "libclc target '${LIBCLC_TARGET}' is enabled")
 
 # Map the LLVM target architecture to the standard directory name.
-if(LIBCLC_TARGET_ARCH STREQUAL amdgcn OR LIBCLC_TARGET_ARCH STREQUAL amdgpu)
+if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_AMDGPU)
   set(LIBCLC_ARCH_DIR amdgpu)
-elseif(LIBCLC_TARGET_ARCH STREQUAL nvptx64)
+elseif(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_NVPTX)
   set(LIBCLC_ARCH_DIR nvptx)
 elseif(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV)
   set(LIBCLC_ARCH_DIR spirv)
@@ -138,7 +139,7 @@ endif()
 # Address space values.
 set(private_addrspace_val 0)
 set(generic_addrspace_val 0)
-if(LIBCLC_TARGET_ARCH STREQUAL amdgcn)
+if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_AMDGPU)
   set(private_addrspace_val 5)
 endif()
 if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV AND NOT LIBCLC_TARGET_OS STREQUAL vulkan)
@@ -158,7 +159,7 @@ if(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_SPIRV)
     list(APPEND target_extra_defines CLC_SPIRV)
     set(opt_flags)
   endif()
-elseif(LIBCLC_TARGET_ARCH STREQUAL amdgcn)
+elseif(LIBCLC_TARGET_ARCH IN_LIST LIBCLC_ARCHS_AMDGPU)
   list(APPEND target_compile_flags "SHELL:-Xclang -mcode-object-version=none")
 endif()
 

From d04ea84c5c5671be8786b41e8eea9bdd5e42d38c Mon Sep 17 00:00:00 2001
From: Nico Weber 
Date: Tue, 12 May 2026 13:23:55 -0400
Subject: [PATCH 485/538] [gn build] Port bc39082d02f7 (#197248)

---
 .../gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn
index c6893bcccd46b..1861c28dca107 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/llvm/BUILD.gn
@@ -13,6 +13,7 @@ static_library("llvm") {
     "//llvm/lib/Support",
   ]
   sources = [
+    "FormatvStringCheck.cpp",
     "HeaderGuardCheck.cpp",
     "IncludeOrderCheck.cpp",
     "LLVMTidyModule.cpp",

From c8fe5e818ae8f32584fe808d2aa3a0ba2814f9d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= 
Date: Tue, 12 May 2026 18:26:13 +0100
Subject: [PATCH 486/538] Fix "author" handling in GitHub PR Greeter (#197140)

This is a follow-up to #194307 and fixes the issue reported in:
* https://github.com/llvm/llvm-project/pull/194307#issuecomment-4426270256

Use the same author-detection logic in `PRGreeter` as in
`PRBuildbotInformation`, so both components handle PR authors
consistently.
---
 llvm/utils/git/github-automation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py
index bbd029e59cfbc..d1e5f7b778eaf 100755
--- a/llvm/utils/git/github-automation.py
+++ b/llvm/utils/git/github-automation.py
@@ -245,10 +245,10 @@ def get_user_values_str(values: list) -> str:
 class PRGreeter:
     COMMENT_TAG = "\n"
 
-    def __init__(self, token: str, repo: str, pr_number: int):
+    def __init__(self, token: str, repo: str, pr_number: int, author: str):
         repo = github.Github(auth=github.Auth.Token(token)).get_repo(repo)
         self.pr = repo.get_issue(pr_number).as_pull_request()
-        self.author = self.pr.user
+        self.author = author
 
     def run(self) -> bool:
         # We assume that this is only called for a PR that has just been opened
@@ -963,7 +963,7 @@ def request_release_note(token: str, repo_name: str, pr_number: int):
     )
     pr_subscriber.run()
 elif args.command == "pr-greeter":
-    pr_greeter = PRGreeter(args.token, args.repo, args.issue_number)
+    pr_greeter = PRGreeter(args.token, args.repo, args.issue_number, args.author)
     pr_greeter.run()
 elif args.command == "commit-request-greeter":
     commit_greeter = CommitRequestGreeter(args.token, args.repo, args.issue_number)

From 31d25c75ec03fca6c7ff684c15087314538cba07 Mon Sep 17 00:00:00 2001
From: Erich Keane 
Date: Tue, 12 May 2026 10:32:09 -0700
Subject: [PATCH 487/538] [CIR] Global-TLS variable 'call' rewriting- (#197026)

This is a followup to my previous patch to handle global/namespace
thread local variables. This patch handles the
re-writing/lowering-prepare of the `get-global` for these variables.
Each call to one of these is required to go to a 'wrapper' function,
which optionally calls the initializer. This patch does not handle the
initializer call (so each wrapper call is a very simple 'return the
variable'), as that will be handled in a followup.

Also, variables without initialization don't use a wrapper in Classic
Codegen; this patch, however, does use one. The followup patch that will call the
initializer will skip the call to the initializer, but leave the wrapper
in place. This is a necessity due to how we handle global ops/get-global
ops: we won't know whether there is a required ctor/dtor that needs an
initializer at the time of wrapper-write-replacement.
---
 .../Dialect/Transforms/LoweringPrepare.cpp    | 159 +++++++++++++-
 .../test/CIR/CodeGen/global-tls-dyn-init.cpp  | 117 +++++++++++
 .../CIR/CodeGen/global-tls-simple-init.cpp    | 195 ++++++++++++++++++
 .../test/CIR/CodeGen/global-tls-templates.cpp |  55 +++++
 4 files changed, 523 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 5ada3248d3185..dc1a872bd8c8a 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -90,6 +90,7 @@ struct LoweringPreparePass
   void lowerComplexDivOp(cir::ComplexDivOp op);
   void lowerComplexMulOp(cir::ComplexMulOp op);
   void lowerUnaryOp(cir::UnaryOpInterface op);
+  void lowerGetGlobalOp(cir::GetGlobalOp op);
   void lowerGlobalOp(cir::GlobalOp op);
   void lowerThreeWayCmpOp(cir::CmpThreeWayOp op);
   void lowerArrayDtor(cir::ArrayDtor op);
@@ -119,6 +120,13 @@ struct LoweringPreparePass
   /// Build the function that initializes the specified global
   cir::FuncOp buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op);
 
+  /// When looking at the 'global' op, create the wrapper function.
+  void defineGlobalThreadLocalWrapper(cir::GlobalOp op, cir::FuncOp initAlias,
+                                      bool isVarDefinition);
+  /// Get the declaration for the 'wrapper' function for a global-TLS variable.
+  cir::FuncOp getOrCreateThreadLocalWrapper(CIRBaseBuilderTy &builder,
+                                            cir::GlobalOp op);
+
   /// Handle the dtor region by registering destructor with __cxa_atexit
   cir::FuncOp getOrCreateDtorFunc(CIRBaseBuilderTy &builder, cir::GlobalOp op,
                                   mlir::Region &dtorRegion,
@@ -256,6 +264,7 @@ struct LoweringPreparePass
   /// Tracks existing dynamic initializers.
   llvm::StringMap dynamicInitializerNames;
   llvm::SmallVector dynamicInitializers;
+  llvm::StringMap threadLocalWrappers;
 
   /// Tracks guard variables for static locals (keyed by global symbol name).
   llvm::StringMap staticLocalDeclGuardMap;
@@ -1361,6 +1370,88 @@ void LoweringPreparePass::lowerLocalInitOp(cir::LocalInitOp initOp) {
   // Remove the init local op, now that we've done everything we need with it.
   initOp.erase();
 }
+static bool isThreadWrapperReplaceable(cir::TLS_Model tls,
+                                       clang::ASTContext &astCtx) {
+  return tls == cir::TLS_Model::GeneralDynamic &&
+         astCtx.getTargetInfo().getTriple().isOSDarwin();
+}
+
+static cir::GlobalLinkageKind
+getThreadLocalWrapperLinkage(GlobalOp op, clang::ASTContext &astCtx) {
+  if (isLocalLinkage(op.getLinkage()))
+    return op.getLinkage();
+
+  if (isThreadWrapperReplaceable(*op.getTlsModel(), astCtx))
+    if (!isLinkOnceLinkage(op.getLinkage()) &&
+        !isWeakODRLinkage(op.getLinkage()))
+      return op.getLinkage();
+
+  // If this isn't a TU in which this variable is defined, the thread wrapper is
+  // discardable.
+  if (op.isDeclaration())
+    return cir::GlobalLinkageKind::LinkOnceODRLinkage;
+  return cir::GlobalLinkageKind::WeakODRLinkage;
+}
+
+cir::FuncOp
+LoweringPreparePass::getOrCreateThreadLocalWrapper(CIRBaseBuilderTy &builder,
+                                                   GlobalOp op) {
+  mlir::OpBuilder::InsertionGuard insertGuard(builder);
+  builder.setInsertionPointToStart(&mlirModule.getBodyRegion().front());
+
+  mlir::StringAttr wrapperName = op.getDynTlsRefs()->getWrapperName();
+
+  auto existingWrapperIter = threadLocalWrappers.find(wrapperName.getValue());
+  if (existingWrapperIter != threadLocalWrappers.end())
+    return existingWrapperIter->second;
+
+  // type is ptr-to-global-type(void);
+  auto funcType = cir::FuncType::get({}, builder.getPointerTo(op.getSymType()));
+  cir::FuncOp func =
+      cir::FuncOp::create(builder, op.getLoc(), wrapperName, funcType);
+
+  cir::GlobalLinkageKind linkageKind =
+      getThreadLocalWrapperLinkage(op, *astCtx);
+  func.setLinkageAttr(
+      cir::GlobalLinkageKindAttr::get(&getContext(), linkageKind));
+
+  // TODO(cir): This is supposed to refer to the comdat of the global symbol,
+  // but that isn't in CIR yet.
+  if (astCtx->getTargetInfo().getTriple().supportsCOMDAT() &&
+      func.isWeakForLinker())
+    func.setComdat(true);
+
+  mlir::SymbolTable::setSymbolVisibility(
+      func, mlir::SymbolTable::Visibility::Private);
+
+  if (!isLocalLinkage(linkageKind)) {
+    if (!isThreadWrapperReplaceable(*op.getTlsModel(), *astCtx) ||
+        isLinkOnceLinkage(linkageKind) || isWeakODRLinkage(linkageKind) ||
+        op.getGlobalVisibility() == cir::VisibilityKind::Hidden)
+      func.setGlobalVisibility(cir::VisibilityKind::Hidden);
+  }
+  if (isThreadWrapperReplaceable(*op.getTlsModel(), *astCtx))
+    op->emitError("Unhandled thread wrapper attributes for CC and Nounwind");
+
+  threadLocalWrappers.insert({wrapperName.getValue(), func});
+  return func;
+}
+
+void LoweringPreparePass::defineGlobalThreadLocalWrapper(cir::GlobalOp op,
+                                                         cir::FuncOp initAlias,
+                                                         bool isVarDefinition) {
+  CIRBaseBuilderTy builder(getContext());
+  cir::FuncOp wrapper = getOrCreateThreadLocalWrapper(builder, op);
+  mlir::Block *entryBB = wrapper.addEntryBlock();
+  builder.setInsertionPointToStart(entryBB);
+  // If we are a situation where we have/need one, emit a call to the init
+  // function.
+  if (initAlias) {
+    op->emitError("not yet implemented, wrapper with an init alias");
+  }
+  auto get = builder.createGetGlobal(op, /*tls=*/true);
+  cir::ReturnOp::create(builder, op.getLoc(), {get});
+}
 
 void LoweringPreparePass::lowerGlobalOp(GlobalOp op) {
   // Static locals are handled separately via guard variables.
@@ -1369,6 +1460,8 @@ void LoweringPreparePass::lowerGlobalOp(GlobalOp op) {
 
   mlir::Region &ctorRegion = op.getCtorRegion();
   mlir::Region &dtorRegion = op.getDtorRegion();
+  // TODO(cir): Implement the initialization of this.
+  cir::FuncOp initAlias;
 
   if (!ctorRegion.empty() || !dtorRegion.empty()) {
     // Build a variable initialization function and move the initialzation code
@@ -1383,9 +1476,67 @@ void LoweringPreparePass::lowerGlobalOp(GlobalOp op) {
     dynamicInitializers.push_back(f);
   }
 
+  // We need a wrapper for TLS globals that MIGHT have a non-constant
+  // initialization. The FE will have generated the DynTlsRefs for any with
+  // known dynamic init, or unknown (extern) init.
+  if (op.getTlsModel() == TLS_Model::GeneralDynamic && op.getDynTlsRefs())
+    defineGlobalThreadLocalWrapper(op, initAlias, !op.isDeclaration());
+
   assert(!cir::MissingFeatures::opGlobalAnnotations());
 }
 
+void LoweringPreparePass::lowerGetGlobalOp(GetGlobalOp op) {
+  if (!op.getTls())
+    return;
+  auto globalOp = mlir::cast(
+      symbolTables.lookupNearestSymbolFrom(op, op.getNameAttr()));
+
+  // Only global/namespace scope thread local variables need to have their
+  // get-global operations rewritten to be calls to a wrapper function.  If
+  // we're not in a dynamic TLS (or one without the TLS markers), we can leave
+  // this one as a get-global and return early.
+  if (globalOp.getTlsModel() != TLS_Model::GeneralDynamic ||
+      !globalOp.getDynTlsRefs())
+    return;
+
+  // If this is a global TLS, we need to replace the call to 'get_global' with a
+  // call to the wrapper function.  Classic codegen figures out some cases where
+  // we can omit this, but for now we're going to always put it in, as it is
+  // effectively a no-op.
+
+  // The first 'GetGlobalOp' at the beginning of a ctor/dtor region on one of
+  // these is for the purpose of creating/destroying.  We want to skip replacing
+  // THAT one, but leave all other get-global-ops in place, else
+  // self-referential ops won't work right.
+
+  // Note that ctors/dtors are removed during this pass. We get away with these
+  // checks because the only time that these situations can actually be true
+  // (that is, the ctor/dtor region exist) is if we're in the process of
+  // converting the ctor/dtor for this. If we're NOT doing that, the ctor/dtor
+  // will have already disappeared.
+  mlir::Operation *parentOp = op->getParentOp();
+  if (parentOp == globalOp) {
+    mlir::Region *ctorRegion = &globalOp.getCtorRegion();
+    mlir::Region *dtorRegion = &globalOp.getDtorRegion();
+
+    if (!ctorRegion->empty() && &*ctorRegion->op_begin() == op.getOperation())
+      return;
+    if (!dtorRegion->empty() && &*dtorRegion->op_begin() == op.getOperation())
+      return;
+  }
+
+  CIRBaseBuilderTy builder(getContext());
+  cir::FuncOp wrapperFunc = getOrCreateThreadLocalWrapper(builder, globalOp);
+
+  builder.setInsertionPoint(op);
+  cir::CallOp call = builder.createCallOp(
+      wrapperFunc.getLoc(),
+      mlir::FlatSymbolRefAttr::get(wrapperFunc.getSymNameAttr()),
+      wrapperFunc.getFunctionType().getReturnType(), {});
+  op->replaceAllUsesWith(call);
+  op.erase();
+}
+
 void LoweringPreparePass::lowerThreeWayCmpOp(CmpThreeWayOp op) {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointAfter(op);
@@ -1878,6 +2029,8 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
     lowerComplexMulOp(complexMul);
   } else if (auto glob = mlir::dyn_cast(op)) {
     lowerGlobalOp(glob);
+  } else if (auto getGlob = mlir::dyn_cast(op)) {
+    lowerGetGlobalOp(getGlob);
   } else if (auto unaryOp = mlir::dyn_cast(op)) {
     lowerUnaryOp(unaryOp);
   } else if (auto callOp = dyn_cast(op)) {
@@ -2421,9 +2574,9 @@ void LoweringPreparePass::runOnOperation() {
   op->walk([&](mlir::Operation *op) {
     if (mlir::isa(op))
+                  cir::FuncOp, cir::CallOp, cir::GetGlobalOp, cir::GlobalOp,
+                  cir::StoreOp, cir::CmpThreeWayOp, cir::IncOp, cir::DecOp,
+                  cir::MinusOp, cir::NotOp, cir::LocalInitOp>(op))
       opsToTransform.push_back(op);
   });
 
diff --git a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp
index ef3c1e306f62d..b974f61131353 100644
--- a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp
+++ b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2>&1 | FileCheck %s --check-prefix=CIR-BEFORE-LPP
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - | FileCheck %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM-BOTH,LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM-BOTH,OGCG
 
 int get_i();
 struct CtorDtor {
@@ -7,11 +10,62 @@ struct CtorDtor {
     int i;
 };
 
+// Wrappers:
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW19tls_cd_dyn_not_used() -> !cir.ptr {
+// CIR-NOT: cir.call @_ZTH19tls_cd_dyn_not_used() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW10tls_cd_ref() -> !cir.ptr> {
+// CIR-NOT: cir.call @_ZTH10tls_cd_ref() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_ref : !cir.ptr>
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr>
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW10tls_cd_dyn() -> !cir.ptr {
+// CIR-NOT: cir.call @_ZTH10tls_cd_dyn() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr 
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW6tls_cd() -> !cir.ptr {
+// CIR-NOT: cir.call @_ZTH6tls_cd() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr
+
+// LLVM: define weak_odr hidden ptr @_ZTW19tls_cd_dyn_not_used() {
+// LLVM-NOT:   call void @_ZTH19tls_cd_dyn_not_used()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_dyn_not_used)
+// LLVM:   ret ptr %[[GET_GLOB]]
+// LLVM: }
+//
+// LLVM: define weak_odr hidden ptr @_ZTW10tls_cd_ref() {
+// LLVM-NOT:   call void @_ZTH10tls_cd_ref()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_ref)
+// LLVM:   ret ptr %[[GET_GLOB]]
+// LLVM: }
+//
+// LLVM: define weak_odr hidden ptr @_ZTW10tls_cd_dyn() {
+// LLVM-NOT:   call void @_ZTH10tls_cd_dyn()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_dyn)
+// LLVM:   ret ptr %[[GET_GLOB]]
+// LLVM: }
+//
+// LLVM: define weak_odr hidden ptr @_ZTW6tls_cd() {
+// LLVM-NOT:   call void @_ZTH6tls_cd()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd)
+// LLVM:   ret ptr %[[GET_GLOB]]
+// LLVM: }
+//
+
 thread_local CtorDtor tls_cd = 5;
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : !rec_CtorDtor dtor {
 // CIR-BEFORE-LPP:   %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd : !cir.ptr
 // CIR-BEFORE-LPP:   cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> ()
 // CIR-BEFORE-LPP: }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : !rec_CtorDtor
+
+// OGCG: define internal void @[[TLS_CD_INIT:.*]]() {{.*}}{
+// OGCG:   call i32 @__cxa_thread_atexit(ptr @_ZN8CtorDtorD1Ev, ptr @tls_cd, ptr @__dso_handle)
+// OGCG:   ret void
 
 thread_local CtorDtor tls_cd_dyn = get_i();
 // CIR-BEFORE-LPP:  cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_dyn", "_ZTH10tls_cd_dyn"> @tls_cd_dyn = ctor : !rec_CtorDtor {
@@ -22,6 +76,13 @@ thread_local CtorDtor tls_cd_dyn = get_i();
 // CIR-BEFORE-LPP:    %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr
 // CIR-BEFORE-LPP:    cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> ()
 // CIR-BEFORE-LPP:  }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_dyn", "_ZTH10tls_cd_dyn"> @tls_cd_dyn = #cir.zero : !rec_CtorDtor
+
+// OGCG: define internal void @[[TLS_CD_DYN_INIT:.*]]() {{.*}} {
+// OGCG:   %[[CALL:.*]] = call noundef i32 @_Z5get_iv()
+// OGCG:   call void @_ZN8CtorDtorC1Ei(ptr {{.*}}@tls_cd_dyn, i32 {{.*}}%[[CALL]])
+// OGCG:   call i32 @__cxa_thread_atexit(ptr @_ZN8CtorDtorD1Ev, ptr @tls_cd_dyn, ptr @__dso_handle)
+// OGCG:   ret void
 
 thread_local CtorDtor &tls_cd_ref = tls_cd_dyn;
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_ref", "_ZTH10tls_cd_ref"> @tls_cd_ref = ctor : !cir.ptr {
@@ -29,6 +90,19 @@ thread_local CtorDtor &tls_cd_ref = tls_cd_dyn;
 // CIR-BEFORE-LPP:   %[[CALL:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr
 // CIR-BEFORE-LPP:   cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr>
 // CIR-BEFORE-LPP: }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_ref", "_ZTH10tls_cd_ref"> @tls_cd_ref = #cir.ptr : !cir.ptr
+
+// OGCG: define internal void @[[TLS_CD_REF_INIT:.*]]() {{.*}} {
+// OGCG:   %[[CALL:.*]] = call ptr @_ZTW10tls_cd_dyn()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_cd_ref)
+// OGCG:   store ptr %[[CALL]], ptr %[[GET_GLOB]], align 8
+// OGCG:   ret void
+
+// OGCG: define weak_odr hidden noundef ptr @_ZTW10tls_cd_dyn() {{.*}} comdat {
+// OGCG:   call void @_ZTH10tls_cd_dyn()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_cd_dyn)
+// OGCG:   ret ptr %[[GET_GLOB]]
+// OGCG: }
 
 thread_local CtorDtor tls_cd_dyn_not_used = get_i();
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW19tls_cd_dyn_not_used", "_ZTH19tls_cd_dyn_not_used"> @tls_cd_dyn_not_used = ctor : !rec_CtorDtor {
@@ -39,12 +113,55 @@ thread_local CtorDtor tls_cd_dyn_not_used = get_i();
 // CIR-BEFORE-LPP:   %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr
 // CIR-BEFORE-LPP:   cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> ()
 // CIR-BEFORE-LPP: }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW19tls_cd_dyn_not_used", "_ZTH19tls_cd_dyn_not_used"> @tls_cd_dyn_not_used = #cir.zero : !rec_CtorDtor
+
+// OGCG: define internal void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() {{.*}} {
+// OGCG:   %[[CALL:.*]] = call noundef i32 @_Z5get_iv()
+// OGCG:   call void @_ZN8CtorDtorC1Ei(ptr {{.*}}@tls_cd_dyn_not_used, i32 {{.*}}%[[CALL]])
+// OGCG:   call i32 @__cxa_thread_atexit(ptr @_ZN8CtorDtorD1Ev, ptr @tls_cd_dyn_not_used, ptr @__dso_handle)
+// OGCG:   ret void
 
 void uses() {
   auto a = tls_cd;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_cd : !cir.ptr
+// CIR: cir.call @_ZTW6tls_cd() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW6tls_cd()
   auto b = tls_cd_dyn;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_cd_dyn : !cir.ptr
+// CIR: cir.call @_ZTW10tls_cd_dyn() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW10tls_cd_dyn()
   auto c = tls_cd_ref;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_cd_ref : !cir.ptr>
+// CIR: cir.call @_ZTW10tls_cd_ref() : () -> !cir.ptr>
+// LLVM-BOTH: call ptr @_ZTW10tls_cd_ref()
 }
+
+// OGCG: define weak_odr hidden noundef ptr @_ZTW6tls_cd() {{.*}} comdat {
+// OGCG:   call void @_ZTH6tls_cd()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_cd)
+// OGCG:   ret ptr %[[GET_GLOB]]
+// OGCG: }
+//
+// OGCG: define weak_odr hidden noundef ptr @_ZTW10tls_cd_ref() {{.*}} comdat {
+// OGCG:   call void @_ZTH10tls_cd_ref()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_cd_ref)
+// OGCG:   %[[LOAD_GLOB:.*]] = load ptr, ptr %[[GET_GLOB]], align 8
+// OGCG:   ret ptr %[[LOAD_GLOB]]
+// OGCG: }
+//
+// OGCG: define internal void @__tls_init() {{.*}} {
+// OGCG:   %[[GET_GUARD:.*]] = load i8, ptr @__tls_guard, align 1
+// OGCG:   %[[IS_UNINIT:.*]] = icmp eq i8 %[[GET_GUARD]], 0
+// OGCG:   br i1 %[[IS_UNINIT]]
+// OGCG
+// OGCG:   store i8 1, ptr @__tls_guard, align 1
+// OGCG:   call void @[[TLS_CD_INIT]]()
+// OGCG:   call void @[[TLS_CD_DYN_INIT]]()
+// OGCG:   call void @[[TLS_CD_REF_INIT]]()
+// OGCG:   call void @[[TLS_CD_DYN_NOT_USED_INIT]]()
+//
+// OGCG: define weak_odr hidden noundef ptr @_ZTW19tls_cd_dyn_not_used() {{.*}} comdat {
+// OGCG:   call void @_ZTH19tls_cd_dyn_not_used()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}} ptr @llvm.threadlocal.address.p0(ptr {{.*}} @tls_cd_dyn_not_used)
+// OGCG:   ret ptr %[[GET_GLOB]]
+// OGCG: }
diff --git a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp
index fef55b0298c33..b9030d6518222 100644
--- a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp
+++ b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2>&1 | FileCheck %s --check-prefix=CIR-BEFORE-LPP
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - | FileCheck %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM-BOTH,LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM-BOTH,OGCG
 
 int get_i();
 struct CtorDtor {
@@ -7,8 +10,88 @@ struct CtorDtor {
     int i;
 };
 
+// Wrappers:
+// CIR-LABEL: cir.func comdat linkonce_odr private hidden @_ZTW12maybe_inited() -> !cir.ptr {
+// CIR-NOT: %[[GET_INIT_FUNC:.*]] = cir.get_global @_ZTH12maybe_inited : !cir.ptr>
+// Note: The following intentionally disabled since they have matchers in them.
+// CIRX-NOT: %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr>
+// CIRX-NOT: %[[IS_VALID:.*]] = cir.cmp ne %[[GET_INIT_FUNC]], %[[NULL]] : !cir.ptr>
+// CIRX-NOT: cir.if %[[IS_VALID]] {
+// CIR-NOT: cir.call @_ZTH12maybe_inited() : () -> ()
+// CIRX-NOT: }
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @maybe_inited : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW21definitely_inited_dyn() -> !cir.ptr {
+// CIR-NOT: cir.call @_ZTH21definitely_inited_dyn() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @definitely_inited_dyn : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr
+
+// CIR: cir.func comdat weak_odr private hidden @_ZTW17definitely_inited() -> !cir.ptr {
+// CIR-NEXT:   %[[GET_GLOB:.*]] = cir.get_global thread_local @definitely_inited : !cir.ptr
+// CIR:   cir.return %[[GET_GLOB]] : !cir.ptr
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW17tls_int_self_init() -> !cir.ptr {
+// CIR-NOT: cir.call @_ZTH17tls_int_self_init() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_self_init : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW11tls_int_ref() -> !cir.ptr> {
+// CIR-NOT: cir.call @_ZTH11tls_int_ref() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_ref : !cir.ptr>
+// CIR: cir.return %0 : !cir.ptr>
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW11tls_int_dyn() -> !cir.ptr {
+// CIR-NOT: cir.call @_ZTH11tls_int_dyn() : () -> ()
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_dyn : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]] : !cir.ptr
+
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW7tls_int() -> !cir.ptr {
+// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int : !cir.ptr
+// CIR: cir.return %[[GET_GLOB]]
+
+// Wrappers: 
+// LLVM: define linkonce_odr hidden ptr @_ZTW12maybe_inited() {
+// Intentionally disabled until we implement this.
+// LLVMX:   %[[HAS_INIT_FUNC:.*]] = icmp ne ptr @_ZTH12maybe_inited, null
+// LLVMX:   br i1 %[[HAS_INIT_FUNC]]
+// LLVM-NOT:   call void @_ZTH12maybe_inited()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @maybe_inited)
+// LLVM:   ret ptr %[[GET_GLOB]]
+//
+// LLVM: define weak_odr hidden ptr @_ZTW21definitely_inited_dyn() {
+// LLVM-NOT:   call void @_ZTH21definitely_inited_dyn()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @definitely_inited_dyn)
+// LLVM:   ret ptr %[[GET_GLOB]]
+//
+// LLVM: define weak_odr hidden ptr @_ZTW17definitely_inited() {
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @definitely_inited)
+// LLVM:   ret ptr %[[GET_GLOB]]
+// LLVM: }
+//
+// LLVM: define weak_odr hidden ptr @_ZTW17tls_int_self_init() {
+// LLVM-NOT:   call void @_ZTH17tls_int_self_init()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_int_self_init)
+// LLVM:   ret ptr %[[GET_GLOB]]
+//
+// LLVM: define weak_odr hidden ptr @_ZTW11tls_int_ref() {
+// LLVM-NOT:   call void @_ZTH11tls_int_ref()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_int_ref)
+// LLVM:   ret ptr %[[GET_GLOB]]
+//
+// LLVM: define weak_odr hidden ptr @_ZTW11tls_int_dyn() {
+// LLVM-NOT:   call void @_ZTH11tls_int_dyn()
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_int_dyn)
+// LLVM:   ret ptr %[[GET_GLOB]]
+
+// LLVM: define weak_odr hidden ptr @_ZTW7tls_int() {
+// LLVM:   %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_int)
+// LLVM:   ret ptr %[[GET_GLOB]]
+// LLVM: }
+
 thread_local int tls_int = 5;
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW7tls_int", "_ZTH7tls_int"> @tls_int = #cir.int<5> : !s32i
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW7tls_int", "_ZTH7tls_int"> @tls_int = #cir.int<5> : !s32i
 
 thread_local int tls_int_dyn = get_i();
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_dyn", "_ZTH11tls_int_dyn"> @tls_int_dyn = ctor : !s32i {
@@ -16,6 +99,13 @@ thread_local int tls_int_dyn = get_i();
 // CIR-BEFORE-LPP:   %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef})
 // CIR-BEFORE-LPP:   cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr
 // CIR-BEFORE-LPP: }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_dyn", "_ZTH11tls_int_dyn"> @tls_int_dyn = #cir.int<0> : !s32i 
+
+// OGCG: define internal void @[[TLS_INT_DYN_INIT:.*]]()
+// OGCG:   %[[CALL:.*]] = call noundef i32 @_Z5get_iv()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_dyn)
+// OGCG:   store i32 %[[CALL]], ptr %[[GET_GLOB]], align 4
+// OGCG:   ret void
 
 thread_local int &tls_int_ref = tls_int_dyn;
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_ref", "_ZTH11tls_int_ref"> @tls_int_ref = ctor : !cir.ptr {
@@ -23,6 +113,18 @@ thread_local int &tls_int_ref = tls_int_dyn;
 // CIR-BEFORE-LPP:   %[[GET_OTHER:.*]] = cir.get_global thread_local @tls_int_dyn : !cir.ptr
 // CIR-BEFORE-LPP:   cir.store {{.*}}%[[GET_OTHER]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr>
 // CIR-BEFORE-LPP: }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_ref", "_ZTH11tls_int_ref"> @tls_int_ref = #cir.ptr : !cir.ptr
+
+// OGCG: define internal void @[[TLS_INT_REF_INIT:.*]]()
+// OGCG:   %[[GET_REF:.*]] = call ptr @_ZTW11tls_int_dyn()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_ref)
+// OGCG:   store ptr %[[GET_REF]], ptr %[[GET_GLOB]], align 8
+// OGCG:   ret void
+
+// OGCG: define weak_odr hidden noundef ptr @_ZTW11tls_int_dyn() {{.*}} comdat {
+// OGCG:   call void @_ZTH11tls_int_dyn()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_dyn)
+// OGCG:   ret ptr %[[GET_GLOB]]
 
 thread_local int tls_int_self_init = tls_int_self_init + get_i();
 // CIR-BEFORE-LPP:  cir.global external tls_dyn dyn_tls_refs = <"_ZTW17tls_int_self_init", "_ZTH17tls_int_self_init"> @tls_int_self_init = ctor : !s32i {
@@ -33,9 +135,31 @@ thread_local int tls_int_self_init = tls_int_self_init + get_i();
 // CIR-BEFORE-LPP:    %[[ADD:.*]] = cir.add nsw %[[LOAD_SELF]], %[[CALL]] : !s32i
 // CIR-BEFORE-LPP:    cir.store {{.*}}%[[ADD]], %[[GET_GLOB]] : !s32i, !cir.ptr
 // CIR-BEFORE-LPP:  }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17tls_int_self_init", "_ZTH17tls_int_self_init"> @tls_int_self_init = #cir.int<0> : !s32i
+
+// OGCG: define internal void @[[TLS_INT_SELF_REF_INIT:.*]]()
+// OGCG:   %[[GET_SELF_FROM_WRAPPER:.*]] = call ptr @_ZTW17tls_int_self_init()
+// OGCG:   %[[SELF_LOAD:.*]] = load i32, ptr %[[GET_SELF_FROM_WRAPPER]], align 4
+// OGCG:   %[[CALL:.*]] = call noundef i32 @_Z5get_iv()
+// OGCG:   %[[ADD:.*]] = add nsw i32 %[[SELF_LOAD]], %[[CALL]]
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_self_init)
+// OGCG:   store i32 %[[ADD]], ptr %[[GET_GLOB]], align 4
+// OGCG:   ret void
+
+// OGCG: define weak_odr hidden noundef ptr @_ZTW17tls_int_self_init() {{.*}} comdat {
+// OGCG:   call void @_ZTH17tls_int_self_init()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_self_init)
+// OGCG:   ret ptr %[[GET_GLOB]]
 
 extern thread_local int definitely_inited = 5;
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17definitely_inited", "_ZTH17definitely_inited"> @definitely_inited = #cir.int<5> : !s32i
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17definitely_inited", "_ZTH17definitely_inited"> @definitely_inited = #cir.int<5> : !s32i
+
+// OGCG: define internal void @[[DEF_INITED_DYN:.*]]()
+// OGCG:   %[[CALL:.*]] = call noundef i32 @_Z5get_iv()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@definitely_inited_dyn)
+// OGCG:   store i32 %[[CALL]], ptr %[[GET_GLOB]], align 4
+// OGCG:   ret void
 
 extern thread_local int definitely_inited_dyn = get_i();
 // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW21definitely_inited_dyn", "_ZTH21definitely_inited_dyn"> @definitely_inited_dyn = ctor : !s32i {
@@ -43,6 +167,7 @@ extern thread_local int definitely_inited_dyn = get_i();
 // CIR-BEFORE-LPP:   %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef})
 // CIR-BEFORE-LPP:   cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr
 // CIR-BEFORE-LPP: }
+// CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW21definitely_inited_dyn", "_ZTH21definitely_inited_dyn"> @definitely_inited_dyn = #cir.int<0> : !s32i
 
 extern thread_local int maybe_inited;
 // CIR-BEFORE-LPP: cir.global "private" external tls_dyn dyn_tls_refs = <"_ZTW12maybe_inited", "_ZTH12maybe_inited"> @maybe_inited : !s32i
@@ -50,16 +175,86 @@ extern thread_local int maybe_inited;
 void uses() {
   auto a = tls_int;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_int : !cir.ptr
+// CIR: cir.call @_ZTW7tls_int() : () -> !cir.ptr
+// Note: CIR is currently ALWAYS using the wrapper here even though it doesn't
+// need to, however this is a 'no-op' anyway, so we'd expect this to be
+// optimized away.
+// LLVM: call ptr @_ZTW7tls_int()
+// OGCG: call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int)
   auto b = tls_int_dyn;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_int_dyn : !cir.ptr
+// CIR: cir.call @_ZTW11tls_int_dyn() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW11tls_int_dyn()
   auto c = tls_int_ref;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_int_ref : !cir.ptr>
+// CIR: cir.call @_ZTW11tls_int_ref() : () -> !cir.ptr>
+// LLVM-BOTH: call ptr @_ZTW11tls_int_ref()
+
   auto d = tls_int_self_init;
 // CIR-BEFORE-LPP: cir.get_global thread_local @tls_int_self_init : !cir.ptr
+// CIR: cir.call @_ZTW17tls_int_self_init() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW17tls_int_self_init()
   auto e = maybe_inited;
 // CIR-BEFORE-LPP: cir.get_global thread_local @maybe_inited : !cir.ptr
+// CIR: cir.call @_ZTW12maybe_inited() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW12maybe_inited()
+
   auto f = definitely_inited;
 // CIR-BEFORE-LPP: cir.get_global thread_local @definitely_inited : !cir.ptr
+// CIR: cir.call @_ZTW17definitely_inited() : () -> !cir.ptr
+// Note: CIR is currently ALWAYS using the wrapper here even though it doesn't
+// need to, however this is a 'no-op' anyway, so we'd expect this to be
+// optimized away.
+// LLVM: call ptr @_ZTW17definitely_inited()
+// OGCG: call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@definitely_inited)
   auto g = definitely_inited_dyn;
 // CIR-BEFORE-LPP: cir.get_global thread_local @definitely_inited_dyn : !cir.ptr
+// CIR: cir.call @_ZTW21definitely_inited_dyn() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW21definitely_inited_dyn()
 }
+// OGCG wrappers: OGCG emits these at the end of the module, whereas CIR emits
+// them earlier; otherwise they are essentially identical to the LLVM versions
+// (OGCG additionally marks the return value noundef). Some wrappers are
+// checked above because they are referenced there.
+// Also: these have 'comdat' but the above LLVM versions don't, because we
+// haven't yet lowered comdat on functions.
+// OGCG: define weak_odr hidden noundef ptr @_ZTW11tls_int_ref() {{.*}} comdat {
+// OGCG:   call void @_ZTH11tls_int_ref()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_ref)
+// OGCG:   %[[GET_PTR:.*]] = load ptr, ptr %[[GET_GLOB]]
+// OGCG:   ret ptr %[[GET_PTR]]
+//
+// OGCG: define linkonce_odr hidden noundef ptr @_ZTW12maybe_inited() {{.*}} comdat {
+// OGCG:   %[[HAS_INIT_FUNC:.*]] = icmp ne ptr @_ZTH12maybe_inited, null
+// OGCG:   br i1 %[[HAS_INIT_FUNC]]
+// OGCG:   call void @_ZTH12maybe_inited()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@maybe_inited)
+// OGCG:   ret ptr %[[GET_GLOB]]
+//
+// OGCG: define weak_odr hidden noundef ptr @_ZTW21definitely_inited_dyn() {{.*}} comdat {
+// OGCG:   call void @_ZTH21definitely_inited_dyn()
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@definitely_inited_dyn)
+// OGCG:   ret ptr %[[GET_GLOB]]
+//
+// The __tls_init function is emitted in the middle of the wrappers here.
+// OGCG: define internal void @__tls_init()
+// OGCG:   %[[GET_GUARD:.*]] = load i8, ptr @__tls_guard, align 1
+// OGCG:   %[[IS_UNINIT:.*]] = icmp eq i8 %[[GET_GUARD]], 0
+// OGCG:   br i1 %[[IS_UNINIT]]
+//
+// OGCG:   store i8 1, ptr @__tls_guard, align 1
+// OGCG:   call void @[[TLS_INT_DYN_INIT]]()
+// OGCG:   call void @[[TLS_INT_REF_INIT]]()
+// OGCG:   call void @[[TLS_INT_SELF_REF_INIT]]()
+// OGCG:   call void @[[DEF_INITED_DYN]]()
+// OGCG:   br label 
+// OGCG:   ret void
+//
+// OGCG: define weak_odr hidden noundef ptr @_ZTW7tls_int() {{.*}} comdat {
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int)
+// OGCG:   ret ptr %[[GET_GLOB]]
+// OGCG: }
+//
+// OGCG: define weak_odr hidden noundef ptr @_ZTW17definitely_inited() {{.*}} comdat {
+// OGCG:   %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@definitely_inited)
+// OGCG:   ret ptr %[[GET_GLOB]]
+// OGCG: }
diff --git a/clang/test/CIR/CodeGen/global-tls-templates.cpp b/clang/test/CIR/CodeGen/global-tls-templates.cpp
index bad1f1440dde5..36086f45e6543 100644
--- a/clang/test/CIR/CodeGen/global-tls-templates.cpp
+++ b/clang/test/CIR/CodeGen/global-tls-templates.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2>&1 | FileCheck %s --check-prefix=CIR-BEFORE-LPP
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - | FileCheck %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM,LLVM-BOTH
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=OGCG,LLVM-BOTH
 
 int get_i();
 struct CtorDtor {
@@ -25,10 +28,62 @@ thread_local T tls_templ = {get_i()};
 // CIR-BEFORE-LPP:    cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> ()
 // CIR-BEFORE-LPP:  }
 
+// Wrapper: Ctor/Dtor
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW9tls_templI8CtorDtorE() -> !cir.ptr {
+// CIR-NOT:  cir.call @_ZTH9tls_templI8CtorDtorE() : () -> ()
+// CIR:  %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr
+// CIR:  cir.return %[[GET_GLOB]] : !cir.ptr
+// CIR:}
+
+// Wrapper: int
+// CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW9tls_templIiE() -> !cir.ptr
+// CIR-NOT:   cir.call @_ZTH9tls_templIiE() : () -> () 
+// CIR:   %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templIiE : !cir.ptr
+// CIR:   cir.return %[[GET_GLOB]] : !cir.ptr
+// CIR: }
+
+// Global: int
+// CIR: cir.global linkonce_odr comdat tls_dyn dyn_tls_refs = <"_ZTW9tls_templIiE", "_ZTH9tls_templIiE", "_ZGV9tls_templIiE"> @_Z9tls_templIiE = #cir.int<0> : !s32i
+// Global: Ctor/Dtor:
+// CIR: cir.global linkonce_odr comdat tls_dyn dyn_tls_refs = <"_ZTW9tls_templI8CtorDtorE", "_ZTH9tls_templI8CtorDtorE", "_ZGV9tls_templI8CtorDtorE"> @_Z9tls_templI8CtorDtorE = #cir.zero : !rec_CtorDtor
+
+// Globals:
+// LLVM-BOTH-DAG: @_Z9tls_templIiE = linkonce_odr thread_local global i32 0, comdat, align 4
+// LLVM-BOTH-DAG: @_Z9tls_templI8CtorDtorE = linkonce_odr thread_local global %struct.CtorDtor zeroinitializer, comdat, align 4
+
+// Wrappers: Just opposite ordering, same check lines as LLVM.
+// FIXME: OGCG has these set as 'comdat'. However, CIR doesn't lower comdat to
+// LLVM, so it doesn't show up in the IR here.
+// LLVM-LABEL: define weak_odr hidden {{.*}}ptr @_ZTW9tls_templI8CtorDtorE() {
+// LLVM-NOT:   call void @_ZTH9tls_templI8CtorDtorE()
+// LLVM:   call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templI8CtorDtorE)
+// LLVM: }
+
+// LLVM-LABEL: define weak_odr hidden {{.*}}ptr @_ZTW9tls_templIiE() {
+// LLVM-NOT:   call void @_ZTH9tls_templIiE()
+// LLVM:   call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templIiE)
+// LLVM: }
+
 // CIR-BEFORE-LPP-LABEL: cir.func{{.*}}@_Z4usesv
+// CIR-LABEL: cir.func{{.*}}@_Z4usesv
+// LLVM-BOTH-LABEL: define dso_local void @_Z4usesv()
 void uses() {
   auto x = tls_templ;
 // CIR-BEFORE-LPP: cir.get_global thread_local @_Z9tls_templIiE : !cir.ptr
+// CIR: cir.call @_ZTW9tls_templIiE() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW9tls_templIiE()
   auto y = tls_templ;
 // CIR-BEFORE-LPP: cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr
+// CIR: cir.call @_ZTW9tls_templI8CtorDtorE() : () -> !cir.ptr
+// LLVM-BOTH: call ptr @_ZTW9tls_templI8CtorDtorE()
 }
+
+// OGCG-LABEL: define weak_odr hidden {{.*}}ptr @_ZTW9tls_templIiE() {{.*}} comdat {
+// OGCG:   call void @_ZTH9tls_templIiE()
+// OGCG:   call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templIiE)
+// OGCG: }
+
+// OGCG-LABEL: define weak_odr hidden {{.*}}ptr @_ZTW9tls_templI8CtorDtorE(){{.*}} comdat {
+// OGCG:   call void @_ZTH9tls_templI8CtorDtorE()
+// OGCG:   call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templI8CtorDtorE)
+// OGCG: }

From fe5a48c427b47bf7298589e9e527f2a3ad70aa96 Mon Sep 17 00:00:00 2001
From: NeKon69 
Date: Tue, 12 May 2026 20:35:11 +0300
Subject: [PATCH 488/538] [LifetimeSafety] Update user documentation (#196790)

Updates LifetimeSafety user documentation for

* `new`/`delete` lifetime checks (#193776)
* `std::unique_ptr::reset` invalidation (#194907)
* Explicit destructor calls and `std::destroy_at` (#195010)
---
 clang/docs/LifetimeSafety.rst | 78 +++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 12 deletions(-)

diff --git a/clang/docs/LifetimeSafety.rst b/clang/docs/LifetimeSafety.rst
index db166db0637ca..324ee51d85242 100644
--- a/clang/docs/LifetimeSafety.rst
+++ b/clang/docs/LifetimeSafety.rst
@@ -13,10 +13,11 @@ potential dangling pointer defects in code. The analysis aims to detect
 when a pointer, reference or view type (such as ``std::string_view``) refers to an object
 that is no longer alive, a condition that leads to use-after-free bugs and
 security vulnerabilities. Common examples include pointers to stack variables
-that have gone out of scope, fields holding views to stack-allocated objects
-(dangling-field), returning pointers/references to stack variables 
-(return stack address) or iterators into container elements invalidated by
-container operations (e.g., ``std::vector::push_back``)
+that have gone out of scope, pointers to heap objects that have been
+freed, fields holding views to stack-allocated objects (dangling-field),
+returning pointers/references to stack variables (return stack address) or
+iterators into container elements invalidated by container operations (e.g.,
+``std::vector::push_back``)
 
 The analysis design is inspired by `Polonius, the Rust borrow checker `_,
 but adapted to C++ idioms and constraints, such as the lack of exclusivity enforcement (alias-xor-mutability). 
@@ -265,6 +266,36 @@ it refers to has gone out of scope.
              p = &i; // OK!
            }
            (void)*p;
+          }
+
+Use after free
+--------------
+
+This check warns when a pointer or reference is used after the object it refers
+to has been freed.
+
+.. list-table::
+   :widths: 50 50
+   :header-rows: 1
+   :class: colored-code-table
+
+   * - Use after free
+     - Correct
+   * -
+       .. code-block:: c++
+
+         void foo() {
+           int *p = new int(0); // warning: allocated object does not live long enough
+           delete p;            // note: freed here
+           (void)*p;            // note: later used here
+         }
+     -
+       .. code-block:: c++
+
+         void foo() {
+           int *p = new int(0);
+           (void)*p;
+           delete p; // OK!
          }
 
 Return of stack address
@@ -356,15 +387,15 @@ stack-allocated variable or temporary to a field of the class.
 Use after invalidation (experimental)
 -------------------------------------
 
-This check warns when a reference to a container element (such as an iterator,
-pointer or reference) is used after a container operation that may have
-invalidated it. For example, adding elements to ``std::vector`` may cause
-reallocation, invalidating all existing iterators, pointers and references to
-its elements.
+This check warns when a pointer, reference or view is used after an operation
+that may have invalidated it. This includes references to container elements
+used after a container operation, and pointers to objects managed by owners such
+as ``std::unique_ptr`` after operations like ``reset``. For example, adding
+elements to ``std::vector`` may cause reallocation, invalidating all existing
+iterators, pointers and references to its elements.
 
 .. note::
-  Container invalidation checking is highly experimental and may produce false
-  positives.
+  Invalidation checking is highly experimental and may produce false positives.
 
 .. list-table::
    :widths: 50 50
@@ -395,6 +426,28 @@ its elements.
           *p = 10;
         }
 
+The analysis also treats explicit destruction as invalidation. Explicit
+destructor calls and ``std::destroy_at`` invalidate pointers, references and
+views into the destroyed object.
+
+.. code-block:: c++
+
+  #include 
+  #include 
+
+  void explicit_destruction() {
+    std::string s = "hello";
+    const char *p = s.data(); // warning: object whose reference is captured is later invalidated
+    std::destroy_at(&s);      // note: invalidated here
+    (void)*p;                 // note: later used here
+  }
+
+  void unique_ptr_reset() {
+    std::unique_ptr u(new int(0));
+    int *p = u.get(); // warning: object whose reference is captured is later invalidated
+    u.reset();        // note: invalidated here
+    (void)*p;         // note: later used here
+  }
 
 Annotation Inference and Suggestions
 ====================================
@@ -449,6 +502,7 @@ enables only the high-confidence subset of these checks.
   * ``-Wlifetime-safety-permissive``: Enables high-confidence checks for dangling pointers. **Recommended for initial adoption.**
 
     * ``-Wlifetime-safety-use-after-scope``: Warns when a pointer to a stack variable is used after the variable's lifetime has ended.
+    * ``-Wlifetime-safety-use-after-free``: Warns when a pointer to an object is used after it's been freed.
     * ``-Wlifetime-safety-return-stack-addr``: Warns when a function returns a pointer or reference to one of its local stack variables.
     * ``-Wlifetime-safety-dangling-field``: Warns when a class field is assigned a pointer to a temporary or stack variable whose lifetime is shorter than the class instance.
   
@@ -457,7 +511,7 @@ enables only the high-confidence subset of these checks.
     *   ``-Wlifetime-safety-use-after-scope-moved``: Same as ``-Wlifetime-safety-use-after-scope`` but for cases where the variable may have been moved from before its destruction.
     *   ``-Wlifetime-safety-return-stack-addr-moved``: Same as ``-Wlifetime-safety-return-stack-addr`` but for cases where the variable may have been moved from.
     *   ``-Wlifetime-safety-dangling-field-moved``: Same as ``-Wlifetime-safety-dangling-field`` but for cases where the variable may have been moved from.
-    *   ``-Wlifetime-safety-invalidation``: Warns when a container iterator or reference to an element is used after an operation that may invalidate it (Experimental).
+    *   ``-Wlifetime-safety-invalidation``: Warns when a pointer, reference, iterator or view is used after an operation that may invalidate it, such as container mutation or explicit destruction (e.g., ``std::unique_ptr::reset``, ``std::destroy_at``) (Experimental).
 
 *   ``-Wlifetime-safety-suggestions``: Enables suggestions to add ``[[clang::lifetimebound]]`` to function parameters and ``this`` parameters.
 

From fb5ea45c0783656bc89b6fda430556d0529d28a4 Mon Sep 17 00:00:00 2001
From: NeKon69 
Date: Tue, 12 May 2026 20:40:28 +0300
Subject: [PATCH 489/538] [LifetimeSafety] Warn on implicit this lifetimebound
 violations (#196926)

With this change we report `[[clang::lifetimebound]]` violations on the
implicit `this` parameter.

It also adds a helper to retrieve the `[[clang::lifetimebound]]`
attribute on method declarations, so diagnostics can point directly at
the attribute location.
---
 .../LifetimeSafety/LifetimeAnnotations.h      |  5 +++
 .../Analyses/LifetimeSafety/LifetimeSafety.h  |  5 +++
 .../clang/Basic/DiagnosticSemaKinds.td        |  4 +-
 clang/lib/Analysis/LifetimeSafety/Checker.cpp | 13 ++++--
 .../LifetimeSafety/LifetimeAnnotations.cpp    | 27 ++++++-----
 clang/lib/Sema/SemaLifetimeSafety.h           | 15 ++++++-
 .../warn-lifetime-safety-lifetimebound.cpp    | 45 +++++++++++++++++++
 7 files changed, 96 insertions(+), 18 deletions(-)

diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h
index 0db10f8a58cea..f418f8a5132ec 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h
@@ -41,6 +41,11 @@ bool isNormalAssignmentOperator(const FunctionDecl *FD);
 /// has the lifetimebound attribute.
 bool isAssignmentOperatorLifetimeBound(const CXXMethodDecl *CMD);
 
+/// Returns the lifetimebound attribute for the implicit this parameter, if it
+/// exists on any redeclaration.
+const LifetimeBoundAttr *
+getImplicitObjectParamLifetimeBoundAttr(const FunctionDecl *FD);
+
 /// Returns true if the implicit object parameter (this) should be considered
 /// lifetimebound, either due to an explicit lifetimebound attribute on the
 /// method or because it's a normal assignment operator.
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
index 37ffa36fbe865..203db2695f838 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
@@ -119,6 +119,11 @@ class LifetimeSafetySemaHelper {
   virtual void
   reportLifetimeboundViolation(const ParmVarDecl *ParmWithLifetimebound) {}
 
+  // Reports misuse of [[clang::lifetimebound]] when implicit this parameter
+  // doesn't escape through return.
+  virtual void
+  reportLifetimeboundViolation(const CXXMethodDecl *MDWithLifetimebound) {}
+
   // Suggests lifetime bound annotations for implicit this.
   virtual void suggestLifetimeboundToImplicitThis(SuggestionScope Scope,
                                                   const CXXMethodDecl *MD,
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 8c549f121e032..80f9d7cc1d9d8 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11011,8 +11011,8 @@ def warn_lifetime_safety_dangling_global_moved
       InGroup,
       DefaultIgnore;
 
-def warn_lifetime_safety_param_lifetimebound_violation
-    : Warning<"could not verify that the return value can be lifetime bound to %select{an unnamed parameter|'%1'}0">,
+def warn_lifetime_safety_lifetimebound_violation
+    : Warning<"could not verify that the return value can be lifetime bound to %select{an unnamed parameter|'%1'|the implicit this parameter}0">,
       InGroup,
       DefaultIgnore;
 
diff --git a/clang/lib/Analysis/LifetimeSafety/Checker.cpp b/clang/lib/Analysis/LifetimeSafety/Checker.cpp
index bad17e88f0b9b..03585804ad579 100644
--- a/clang/lib/Analysis/LifetimeSafety/Checker.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/Checker.cpp
@@ -60,7 +60,7 @@ class LifetimeChecker {
   llvm::DenseMap FinalWarningsMap;
   llvm::DenseMap AnnotationWarningsMap;
   llvm::DenseMap NoescapeWarningsMap;
-  llvm::DenseSet VerifiedLiftimeboundEscapes;
+  llvm::DenseSet VerifiedLiftimeboundEscapes;
   const LoanPropagationAnalysis &LoanPropagation;
   const MovedLoansAnalysis &MovedLoans;
   const LiveOriginsAnalysis &LiveOrigins;
@@ -147,9 +147,10 @@ class LifetimeChecker {
       // field!
     };
     auto CheckImplicitThis = [&](const CXXMethodDecl *MD) {
-      if (!implicitObjectParamIsLifetimeBound(MD))
-        if (auto *ReturnEsc = dyn_cast(OEF))
-          AnnotationWarningsMap.try_emplace(MD, ReturnEsc->getReturnExpr());
+      if (implicitObjectParamIsLifetimeBound(MD))
+        VerifiedLiftimeboundEscapes.insert(MD);
+      else if (auto *ReturnEsc = dyn_cast(OEF))
+        AnnotationWarningsMap.try_emplace(MD, ReturnEsc->getReturnExpr());
     };
     auto MovedAtEscape = MovedLoans.getMovedLoans(OEF);
     for (LoanID LID : EscapedLoans) {
@@ -385,6 +386,10 @@ class LifetimeChecker {
   void reportLifetimeboundViolations() {
     if (!isa(FD))
       return;
+    if (const auto *MD = dyn_cast(FD);
+        MD && implicitObjectParamIsLifetimeBound(MD) &&
+        !VerifiedLiftimeboundEscapes.contains(MD))
+      SemaHelper->reportLifetimeboundViolation(MD);
     for (const ParmVarDecl *PVD : cast(FD)->parameters()) {
       if (!PVD->hasAttr())
         continue;
diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeAnnotations.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeAnnotations.cpp
index 559188fddc9fa..393e558fd39c3 100644
--- a/clang/lib/Analysis/LifetimeSafety/LifetimeAnnotations.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/LifetimeAnnotations.cpp
@@ -72,21 +72,28 @@ getLifetimeBoundAttrFromFunctionType(const TypeSourceInfo &TSI) {
   return nullptr;
 }
 
-bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) {
+const LifetimeBoundAttr *
+getImplicitObjectParamLifetimeBoundAttr(const FunctionDecl *FD) {
   FD = getDeclWithMergedLifetimeBoundAttrs(FD);
   // Attribute merging doesn't work well with attributes on function types (like
   // 'this' param). We need to check all redeclarations.
-  auto CheckRedecls = [](const FunctionDecl *F) {
-    return llvm::any_of(F->redecls(), [](const FunctionDecl *Redecl) {
-      const TypeSourceInfo *TSI = Redecl->getTypeSourceInfo();
-      return TSI && getLifetimeBoundAttrFromFunctionType(*TSI);
-    });
+  auto CheckRedecls = [](const FunctionDecl *F) -> const LifetimeBoundAttr * {
+    for (const FunctionDecl *Redecl : F->redecls())
+      if (const TypeSourceInfo *TSI = Redecl->getTypeSourceInfo())
+        if (const auto *Attr = getLifetimeBoundAttrFromFunctionType(*TSI))
+          return Attr;
+    return nullptr;
   };
 
-  if (CheckRedecls(FD))
-    return true;
-  if (const FunctionDecl *Pattern = FD->getTemplateInstantiationPattern();
-      Pattern && CheckRedecls(Pattern))
+  if (const auto *Attr = CheckRedecls(FD))
+    return Attr;
+  if (const FunctionDecl *Pattern = FD->getTemplateInstantiationPattern())
+    return CheckRedecls(Pattern);
+  return nullptr;
+}
+
+bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) {
+  if (getImplicitObjectParamLifetimeBoundAttr(FD))
     return true;
   return isNormalAssignmentOperator(FD);
 }
diff --git a/clang/lib/Sema/SemaLifetimeSafety.h b/clang/lib/Sema/SemaLifetimeSafety.h
index 4d20c4c337b0f..dabdcf81f6f8b 100644
--- a/clang/lib/Sema/SemaLifetimeSafety.h
+++ b/clang/lib/Sema/SemaLifetimeSafety.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CLANG_LIB_SEMA_SEMALIFETIMESAFETY_H
 #define LLVM_CLANG_LIB_SEMA_SEMALIFETIMESAFETY_H
 
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h"
 #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Lex/Lexer.h"
@@ -36,7 +37,7 @@ inline bool IsLifetimeSafetyDiagnosticEnabled(Sema &S, const Decl *D) {
       diag::warn_lifetime_safety_dangling_global,
       diag::warn_lifetime_safety_dangling_global_moved,
       diag::warn_lifetime_safety_noescape_escapes,
-      diag::warn_lifetime_safety_param_lifetimebound_violation,
+      diag::warn_lifetime_safety_lifetimebound_violation,
   };
   for (unsigned DiagID : DiagIDs)
     if (!Diags.isIgnored(DiagID, D->getBeginLoc()))
@@ -216,10 +217,20 @@ class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper {
     StringRef ParamName = ParmWithLifetimebound->getName();
     bool HasName = ParamName.size() > 0;
     S.Diag(Attr->getLocation(),
-           diag::warn_lifetime_safety_param_lifetimebound_violation)
+           diag::warn_lifetime_safety_lifetimebound_violation)
         << HasName << ParamName << Attr->getRange();
   }
 
+  void reportLifetimeboundViolation(
+      const CXXMethodDecl *MDWithLifetimebound) override {
+    const auto *Attr =
+        getImplicitObjectParamLifetimeBoundAttr(MDWithLifetimebound);
+    assert(Attr && "Expected lifetimebound attribute");
+    S.Diag(Attr->getLocation(),
+           diag::warn_lifetime_safety_lifetimebound_violation)
+        << 2 << "" << Attr->getRange();
+  }
+
   void suggestLifetimeboundToImplicitThis(SuggestionScope Scope,
                                           const CXXMethodDecl *MD,
                                           const Expr *EscapeExpr) override {
diff --git a/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp b/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp
index 5764647ca62e0..a4841086c07db 100644
--- a/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp
+++ b/clang/test/Sema/warn-lifetime-safety-lifetimebound.cpp
@@ -86,3 +86,48 @@ View annotated_decl_but_not_def_not_returned(const MyObj &obj [[clang::lifetimeb
 View annotated_decl_but_not_def_not_returned(const MyObj &obj) {
   return not_lb(obj);
 }
+
+struct BadThisReturn {
+  MyObj data;
+
+  View get() const [[clang::lifetimebound]] { // expected-warning {{could not verify that the return value can be lifetime bound to the implicit this parameter}}
+    return not_lb(data);
+  }
+};
+
+struct GoodThisReturn {
+  MyObj data;
+
+  View get() const [[clang::lifetimebound]] {
+    return data;
+  }
+};
+
+struct RedeclaredThis {
+  MyObj data;
+  View get() const [[clang::lifetimebound]]; // expected-warning {{could not verify that the return value can be lifetime bound to the implicit this parameter}}
+};
+
+View RedeclaredThis::get() const {
+  return not_lb(data);
+}
+
+struct ThisAndParam {
+  MyObj data;
+
+  View get(const MyObj &obj [[clang::lifetimebound]]) const [[clang::lifetimebound]] { // expected-warning {{could not verify that the return value can be lifetime bound to the implicit this parameter}}
+    return lb(obj);
+  }
+};
+
+struct ThisAndMixedParams {
+  MyObj data;
+
+  View get(
+      const MyObj &a [[clang::lifetimebound]],
+      const MyObj &b,
+      const MyObj &c [[clang::lifetimebound]]) const // expected-warning {{could not verify that the return value can be lifetime bound to 'c'}}
+      [[clang::lifetimebound]] {                     // expected-warning {{could not verify that the return value can be lifetime bound to the implicit this parameter}}
+    return cond() ? lb(a) : not_lb(b);
+  }
+};

From db436f826bb97a1a36d300eaa8f8f5317fb66686 Mon Sep 17 00:00:00 2001
From: Jake Egan 
Date: Tue, 12 May 2026 13:41:35 -0400
Subject: [PATCH 490/538] [sanitizer_common] Implement address sanitizer on
 AIX: platform specific support (#131866)

Add recognition of AIX and some platform specific changes. This lays the
groundwork to implement AIX in sanitizer_common/asan.

Issue: https://github.com/llvm/llvm-project/issues/138916
---
 .../lib/sanitizer_common/sanitizer_errno.h    |  2 +
 .../lib/sanitizer_common/sanitizer_platform.h | 10 ++-
 .../sanitizer_platform_limits_posix.cpp       | 70 +++++++++++++------
 .../sanitizer_platform_limits_posix.h         | 61 +++++++++++++---
 .../lib/sanitizer_common/sanitizer_posix.cpp  |  9 +--
 5 files changed, 114 insertions(+), 38 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_errno.h b/compiler-rt/lib/sanitizer_common/sanitizer_errno.h
index 76919da57d942..0d2a9307cde6f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_errno.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_errno.h
@@ -31,6 +31,8 @@
 #  define __errno_location _errno
 #elif SANITIZER_HAIKU
 #  define __errno_location _errnop
+#elif SANITIZER_AIX
+#  define __errno_location _Errno
 #endif
 
 extern "C" int *__errno_location();
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 2c70fb7712597..4e0349161e2cf 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -16,7 +16,7 @@
     !defined(__APPLE__) && !defined(_WIN32) && !defined(__Fuchsia__) &&     \
     !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__) &&      \
     !defined(__wasi__) && !defined(__NVPTX__) && !defined(__AMDGPU__) &&    \
-    !defined(__SPIRV__)
+    !defined(__SPIRV__) && !defined(_AIX)
 #  error "This operating system is not supported"
 #endif
 
@@ -33,6 +33,12 @@
 #  define SANITIZER_LINUX 0
 #endif
 
+#if defined(_AIX)
+#  define SANITIZER_AIX 1
+#else
+#  define SANITIZER_AIX 0
+#endif
+
 #if defined(__GLIBC__)
 #  define SANITIZER_GLIBC 1
 #else
@@ -152,7 +158,7 @@
 
 #define SANITIZER_POSIX                                       \
   (SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_APPLE || \
-   SANITIZER_NETBSD || SANITIZER_SOLARIS || SANITIZER_HAIKU)
+   SANITIZER_NETBSD || SANITIZER_SOLARIS || SANITIZER_HAIKU || SANITIZER_AIX)
 
 #if __LP64__ || defined(_WIN64)
 #  define SANITIZER_WORDSIZE 64
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
index 3ea9002f5f19e..5c6330b3dfe9a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
@@ -24,7 +24,7 @@
 // Must go after undef _FILE_OFFSET_BITS.
 #include "sanitizer_platform.h"
 
-#if SANITIZER_LINUX || SANITIZER_APPLE || SANITIZER_HAIKU
+#if SANITIZER_LINUX || SANITIZER_APPLE || SANITIZER_HAIKU || SANITIZER_AIX
 // Must go after undef _FILE_OFFSET_BITS.
 #include "sanitizer_glibc_version.h"
 
@@ -61,11 +61,11 @@
 #endif
 
 #if !SANITIZER_ANDROID
-#if !SANITIZER_HAIKU
-#include 
-#endif
-#include 
-#include 
+#    if !SANITIZER_HAIKU && !SANITIZER_AIX
+#      include 
+#    endif
+#    include 
+#    include 
 #endif
 
 #if SANITIZER_LINUX
@@ -113,11 +113,15 @@ typedef struct user_fpregs elf_fpregset_t;
 #endif
 
 #if !SANITIZER_ANDROID
-#include 
-#if !SANITIZER_HAIKU
-#include 
-#include 
-#endif
+#    if !SANITIZER_AIX
+#      include 
+#    else
+#      include 
+#    endif
+#    if !SANITIZER_HAIKU
+#      include 
+#      include 
+#    endif
 #endif
 
 #if SANITIZER_LINUX
@@ -182,6 +186,17 @@ typedef struct user_fpregs elf_fpregset_t;
 #include 
 #endif
 
+#  if SANITIZER_AIX
+#    include 
+#    include 
+#    include 
+#    include 
+#    include 
+#    if HAVE_RPC_XDR_H
+#      include 
+#    endif
+#  endif
+
 // Include these after system headers to avoid name clashes and ambiguities.
 #  include "sanitizer_common.h"
 #  include "sanitizer_internal_defs.h"
@@ -559,13 +574,13 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
   const unsigned IOCTL_NOT_PRESENT = 0;
 
   unsigned IOCTL_FIONBIO = FIONBIO;
-#if !SANITIZER_HAIKU
+#  if !SANITIZER_HAIKU
   unsigned IOCTL_FIOASYNC = FIOASYNC;
   unsigned IOCTL_FIOCLEX = FIOCLEX;
   unsigned IOCTL_FIOGETOWN = FIOGETOWN;
   unsigned IOCTL_FIONCLEX = FIONCLEX;
   unsigned IOCTL_FIOSETOWN = FIOSETOWN;
-#endif
+#  endif
   unsigned IOCTL_SIOCADDMULTI = SIOCADDMULTI;
   unsigned IOCTL_SIOCATMARK = SIOCATMARK;
   unsigned IOCTL_SIOCDELMULTI = SIOCDELMULTI;
@@ -587,14 +602,14 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
   unsigned IOCTL_SIOCSIFNETMASK = SIOCSIFNETMASK;
   unsigned IOCTL_SIOCSPGRP = SIOCSPGRP;
 
-#if !SANITIZER_HAIKU
+#  if !SANITIZER_HAIKU
   unsigned IOCTL_TIOCCONS = TIOCCONS;
   unsigned IOCTL_TIOCGETD = TIOCGETD;
   unsigned IOCTL_TIOCNOTTY = TIOCNOTTY;
   unsigned IOCTL_TIOCPKT = TIOCPKT;
   unsigned IOCTL_TIOCSETD = TIOCSETD;
   unsigned IOCTL_TIOCSTI = TIOCSTI;
-#endif
+#  endif
 
   unsigned IOCTL_TIOCEXCL = TIOCEXCL;
   unsigned IOCTL_TIOCGPGRP = TIOCGPGRP;
@@ -605,10 +620,12 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
   unsigned IOCTL_TIOCMSET = TIOCMSET;
   unsigned IOCTL_TIOCNXCL = TIOCNXCL;
   unsigned IOCTL_TIOCOUTQ = TIOCOUTQ;
+#  if !SANITIZER_AIX
   unsigned IOCTL_TIOCSCTTY = TIOCSCTTY;
+#  endif
   unsigned IOCTL_TIOCSPGRP = TIOCSPGRP;
   unsigned IOCTL_TIOCSWINSZ = TIOCSWINSZ;
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
+#  if SANITIZER_LINUX && !SANITIZER_ANDROID
   unsigned IOCTL_SIOCGETSGCNT = SIOCGETSGCNT;
   unsigned IOCTL_SIOCGETVIFCNT = SIOCGETVIFCNT;
 #endif
@@ -1070,6 +1087,9 @@ CHECK_SIZE_AND_OFFSET(addrinfo, ai_protocol);
 CHECK_SIZE_AND_OFFSET(addrinfo, ai_addrlen);
 CHECK_SIZE_AND_OFFSET(addrinfo, ai_canonname);
 CHECK_SIZE_AND_OFFSET(addrinfo, ai_addr);
+#  if SANITIZER_AIX
+CHECK_SIZE_AND_OFFSET(addrinfo, ai_eflags);
+#  endif
 
 CHECK_TYPE_SIZE(hostent);
 CHECK_SIZE_AND_OFFSET(hostent, h_name);
@@ -1116,11 +1136,13 @@ COMPILER_CHECK(sizeof(__sanitizer_dirent) <= sizeof(dirent));
 CHECK_SIZE_AND_OFFSET(dirent, d_ino);
 #if SANITIZER_APPLE
 CHECK_SIZE_AND_OFFSET(dirent, d_seekoff);
-#elif SANITIZER_FREEBSD || SANITIZER_HAIKU
+#  elif SANITIZER_AIX
+CHECK_SIZE_AND_OFFSET(dirent, d_offset);
+#  elif SANITIZER_FREEBSD || SANITIZER_HAIKU
 // There is no 'd_off' field on FreeBSD.
-#else
+#  else
 CHECK_SIZE_AND_OFFSET(dirent, d_off);
-#endif
+#  endif
 CHECK_SIZE_AND_OFFSET(dirent, d_reclen);
 
 #if SANITIZER_GLIBC
@@ -1195,6 +1217,10 @@ CHECK_SIZE_AND_OFFSET(wordexp_t, we_wordc);
 CHECK_SIZE_AND_OFFSET(wordexp_t, we_wordv);
 CHECK_SIZE_AND_OFFSET(wordexp_t, we_offs);
 #endif
+#  if SANITIZER_AIX
+CHECK_SIZE_AND_OFFSET(wordexp_t, we_sflags);
+CHECK_SIZE_AND_OFFSET(wordexp_t, we_soffs);
+#  endif
 
 CHECK_TYPE_SIZE(tm);
 CHECK_SIZE_AND_OFFSET(tm, tm_sec);
@@ -1206,10 +1232,12 @@ CHECK_SIZE_AND_OFFSET(tm, tm_year);
 CHECK_SIZE_AND_OFFSET(tm, tm_wday);
 CHECK_SIZE_AND_OFFSET(tm, tm_yday);
 CHECK_SIZE_AND_OFFSET(tm, tm_isdst);
+#  if !SANITIZER_AIX
 CHECK_SIZE_AND_OFFSET(tm, tm_gmtoff);
 CHECK_SIZE_AND_OFFSET(tm, tm_zone);
+#  endif
 
-#if SANITIZER_LINUX
+#  if SANITIZER_LINUX
 CHECK_TYPE_SIZE(mntent);
 CHECK_SIZE_AND_OFFSET(mntent, mnt_fsname);
 CHECK_SIZE_AND_OFFSET(mntent, mnt_dir);
@@ -1259,7 +1287,7 @@ CHECK_TYPE_SIZE(clock_t);
 CHECK_TYPE_SIZE(clockid_t);
 #endif
 
-#if !SANITIZER_ANDROID && !SANITIZER_HAIKU
+#  if !SANITIZER_ANDROID && !SANITIZER_HAIKU && !SANITIZER_AIX
 CHECK_TYPE_SIZE(ifaddrs);
 CHECK_SIZE_AND_OFFSET(ifaddrs, ifa_next);
 CHECK_SIZE_AND_OFFSET(ifaddrs, ifa_name);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index 05ebee49f2ab4..419380bcd84ea 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -14,7 +14,7 @@
 #ifndef SANITIZER_PLATFORM_LIMITS_POSIX_H
 #define SANITIZER_PLATFORM_LIMITS_POSIX_H
 
-#if SANITIZER_LINUX || SANITIZER_APPLE || SANITIZER_HAIKU
+#if SANITIZER_LINUX || SANITIZER_APPLE || SANITIZER_HAIKU || SANITIZER_AIX
 
 #  include "sanitizer_internal_defs.h"
 #  include "sanitizer_mallinfo.h"
@@ -29,7 +29,7 @@
 #      define SANITIZER_HAS_STAT64 0
 #      define SANITIZER_HAS_STATFS64 0
 #    endif
-#  elif SANITIZER_GLIBC || SANITIZER_ANDROID
+#  elif SANITIZER_GLIBC || SANITIZER_ANDROID || SANITIZER_AIX
 #    define SANITIZER_HAS_STAT64 1
 #    define SANITIZER_HAS_STATFS64 1
 #  elif SANITIZER_HAIKU
@@ -323,7 +323,7 @@ struct __sanitizer_iovec {
   usize iov_len;
 };
 
-#  if !SANITIZER_ANDROID
+#  if !SANITIZER_ANDROID && !SANITIZER_AIX
 struct __sanitizer_ifaddrs {
   struct __sanitizer_ifaddrs *ifa_next;
   char *ifa_name;
@@ -337,7 +337,7 @@ struct __sanitizer_ifaddrs {
   void *ifa_dstaddr;  // (struct sockaddr *)
   void *ifa_data;
 };
-#  endif  // !SANITIZER_ANDROID
+#  endif  // !SANITIZER_ANDROID && !SANITIZER_AIX
 
 #  if SANITIZER_APPLE
 typedef unsigned long __sanitizer_pthread_key_t;
@@ -345,7 +345,7 @@ typedef unsigned long __sanitizer_pthread_key_t;
 typedef unsigned __sanitizer_pthread_key_t;
 #  endif
 
-#  if SANITIZER_LINUX && !SANITIZER_ANDROID
+#  if (SANITIZER_LINUX && !SANITIZER_ANDROID) || SANITIZER_AIX
 
 struct __sanitizer_XDR {
   int x_op;
@@ -440,12 +440,14 @@ struct __sanitizer_tm {
   int tm_wday;
   int tm_yday;
   int tm_isdst;
-#  if SANITIZER_HAIKU
+#  if !SANITIZER_AIX
+#    if SANITIZER_HAIKU
   int tm_gmtoff;
 #  else
   long int tm_gmtoff;
 #  endif
   const char *tm_zone;
+#  endif
 };
 
 #  if SANITIZER_LINUX
@@ -513,11 +515,19 @@ struct __sanitizer_msghdr {
   struct __sanitizer_iovec *msg_iov;
   uptr msg_iovlen;
   void *msg_control;
+#    if !SANITIZER_AIX
   uptr msg_controllen;
+#    else
+  unsigned msg_controllen;
+#    endif
   int msg_flags;
 };
 struct __sanitizer_cmsghdr {
+#    if !SANITIZER_AIX
   uptr cmsg_len;
+#    else
+  unsigned cmsg_len;
+#    endif
   int cmsg_level;
   int cmsg_type;
 };
@@ -556,8 +566,13 @@ struct __sanitizer_dirent {
 };
 #  else
 struct __sanitizer_dirent {
+#    if SANITIZER_AIX
+  uptr d_offset;
+  uptr d_ino;
+#    else
   uptr d_ino;
   uptr d_off;
+#    endif
   unsigned short d_reclen;
   // more fields that we don't care about
 };
@@ -573,7 +588,7 @@ struct __sanitizer_dirent64 {
 extern unsigned struct_sock_fprog_sz;
 #  endif
 
-#  if SANITIZER_HAIKU
+#  if SANITIZER_HAIKU || SANITIZER_AIX
 typedef int __sanitizer_clock_t;
 #  elif defined(__x86_64__) && !defined(_LP64)
 typedef long long __sanitizer_clock_t;
@@ -581,8 +596,10 @@ typedef long long __sanitizer_clock_t;
 typedef long __sanitizer_clock_t;
 #  endif
 
-#  if SANITIZER_LINUX || SANITIZER_HAIKU
+#  if SANITIZER_LINUX || SANITIZER_HAIKU || SANITIZER_AIX
 typedef int __sanitizer_clockid_t;
+#  endif
+#  if SANITIZER_LINUX || SANITIZER_HAIKU
 typedef unsigned long long __sanitizer_eventfd_t;
 #  endif
 
@@ -637,6 +654,14 @@ struct __sanitizer_sigset_t {
   // The size is determined by looking at sizeof of real sigset_t on linux.
   uptr val[128 / sizeof(uptr)];
 };
+#  elif SANITIZER_AIX
+struct __sanitizer_sigset_t {
+#    if SANITIZER_WORDSIZE == 64
+  uptr val[4];
+#    else
+  uptr val[2];
+#    endif
+};
 #  endif
 
 struct __sanitizer_siginfo_pad {
@@ -828,8 +853,12 @@ struct __sanitizer_addrinfo {
   int ai_family;
   int ai_socktype;
   int ai_protocol;
-#  if SANITIZER_ANDROID || SANITIZER_APPLE || SANITIZER_HAIKU
+#  if SANITIZER_ANDROID || SANITIZER_APPLE || SANITIZER_HAIKU || SANITIZER_AIX
+#    if SANITIZER_AIX  // AIX ai_addrlen type is size_t
+  uptr ai_addrlen;
+#    else
   unsigned ai_addrlen;
+#    endif
   char *ai_canonname;
   void *ai_addr;
 #  else  // LINUX
@@ -838,6 +867,9 @@ struct __sanitizer_addrinfo {
   char *ai_canonname;
 #  endif
   struct __sanitizer_addrinfo *ai_next;
+#  if SANITIZER_AIX
+  int ai_eflags;
+#  endif
 };
 
 struct __sanitizer_hostent {
@@ -854,7 +886,7 @@ struct __sanitizer_pollfd {
   short revents;
 };
 
-#  if SANITIZER_ANDROID || SANITIZER_APPLE
+#  if SANITIZER_ANDROID || SANITIZER_APPLE || SANITIZER_AIX
 typedef unsigned __sanitizer_nfds_t;
 #  else
 typedef unsigned long __sanitizer_nfds_t;
@@ -892,6 +924,10 @@ struct __sanitizer_wordexp_t {
   uptr we_wordc;
   char **we_wordv;
   uptr we_offs;
+#  if SANITIZER_AIX
+  int we_sflags;
+  uptr we_soffs;
+#  endif
 };
 
 #  if SANITIZER_LINUX && !SANITIZER_ANDROID
@@ -1193,7 +1229,9 @@ extern unsigned IOCTL_TIOCMGET;
 extern unsigned IOCTL_TIOCMSET;
 extern unsigned IOCTL_TIOCNXCL;
 extern unsigned IOCTL_TIOCOUTQ;
+#  if !SANITIZER_AIX
 extern unsigned IOCTL_TIOCSCTTY;
+#  endif
 extern unsigned IOCTL_TIOCSPGRP;
 extern unsigned IOCTL_TIOCSWINSZ;
 #  if SANITIZER_LINUX && !SANITIZER_ANDROID
@@ -1593,6 +1631,7 @@ extern const int si_SEGV_ACCERR;
 typedef void *__sanitizer_timer_t;
 #  endif
 
-#endif  // SANITIZER_LINUX || SANITIZER_APPLE || SANITIZER_HAIKU
+#endif  // SANITIZER_LINUX || SANITIZER_APPLE || SANITIZER_HAIKU ||
+        // SANITIZER_AIX
 
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
index 33e4ac22c09c2..87e2878c6ce22 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
@@ -27,12 +27,13 @@
 #include 
 #include 
 
-#if SANITIZER_FREEBSD
+#  if SANITIZER_FREEBSD || SANITIZER_AIX
 // The MAP_NORESERVE define has been removed in FreeBSD 11.x, and even before
 // that, it was never implemented.  So just define it to zero.
-#undef  MAP_NORESERVE
-#define MAP_NORESERVE 0
-#endif
+// Similarly, AIX does not define MAP_NORESERVE.
+#    undef MAP_NORESERVE
+#    define MAP_NORESERVE 0
+#  endif
 
 namespace __sanitizer {
 

From 857dad2b877a045dac634e0d11a5bdfb97fbe210 Mon Sep 17 00:00:00 2001
From: Andy Kaylor 
Date: Tue, 12 May 2026 11:01:09 -0700
Subject: [PATCH 491/538] [CIR] Fix function signature mismatch on redirected
 calls (#196665)

We were running into CIR verification errors ("error: 'cir.call' op
operand type mismatch") when compiling with some older versions of the
GLIBC headers that used a macro to redirect system library calls to a
function that used different, but compatible, arguments.

This change fixes the problem by detecting the mismatch at the callsite
and bitcasting the arguments.

Assisted-by: Cursor / claude-opus-4.7-thinking-xhigh
---
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 44 +++++++++---
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 41 +++++++++++-
 .../CIR/CodeGen/asm-label-redirect-inline.c   | 67 +++++++++++++++++++
 clang/test/CIR/CodeGen/asm-label-redirect.c   | 59 ++++++++++++++++
 4 files changed, 199 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/asm-label-redirect-inline.c
 create mode 100644 clang/test/CIR/CodeGen/asm-label-redirect.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 56519d4592aac..228168f708a1e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -1266,19 +1266,43 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
                              attrs, argAttrs, retAttrs, callingConv, sideEffect,
                              /*attrOnCallSite=*/true, /*isThunk=*/false);
 
+  auto resolvedFuncOpFromGlobal = [&](mlir::Operation *op) -> cir::FuncOp {
+    if (auto fnOp = dyn_cast<cir::FuncOp>(op))
+      return fnOp;
+    if (auto getGlobalOp = dyn_cast<cir::GetGlobalOp>(op)) {
+      // FIXME(cir): This peephole optimization avoids indirect calls for
+      // builtins. This should be fixed in the builtin declaration instead by
+      // not emitting an unnecessary get_global in the first place. However,
+      // this is also used for no-prototype functions.
+      mlir::Operation *globalOp = cgm.getGlobalValue(getGlobalOp.getName());
+      assert(globalOp && "undefined global function");
+      return cast<cir::FuncOp>(globalOp);
+    }
+    return nullptr;
+  };
+
   cir::FuncType indirectFuncTy;
   mlir::Value indirectFuncVal;
   cir::FuncOp directFuncOp;
-  if (auto fnOp = dyn_cast<cir::FuncOp>(calleePtr)) {
-    directFuncOp = fnOp;
-  } else if (auto getGlobalOp = mlir::dyn_cast<cir::GetGlobalOp>(calleePtr)) {
-    // FIXME(cir): This peephole optimization avoids indirect calls for
-    // builtins. This should be fixed in the builtin declaration instead by
-    // not emitting an unecessary get_global in the first place.
-    // However, this is also used for no-prototype functions.
-    mlir::Operation *globalOp = cgm.getGlobalValue(getGlobalOp.getName());
-    assert(globalOp && "undefined global function");
-    directFuncOp = mlir::cast<cir::FuncOp>(globalOp);
+
+  // If the callee resolves to a FuncOp whose stored signature differs from
+  // this call site's expected signature, the CIR verifier would reject the
+  // mismatched types. This happens, for example, when two declarations share a
+  // mangled name via __asm__ renaming (glibc's __REDIRECT_NTH pattern) but
+  // disagree about a struct argument type. If that happens, we demote the
+  // direct call to an indirect call through a function-pointer bitcast typed
+  // at the call site.
+  if (cir::FuncOp candidate = resolvedFuncOpFromGlobal(calleePtr)) {
+    if (candidate.getFunctionType() == cirFuncTy) {
+      directFuncOp = candidate;
+    } else {
+      mlir::Value addr = cir::GetGlobalOp::create(
+          builder, loc, cir::PointerType::get(candidate.getFunctionType()),
+          candidate.getSymName());
+      indirectFuncTy = cirFuncTy;
+      indirectFuncVal =
+          builder.createBitcast(addr, cir::PointerType::get(cirFuncTy));
+    }
   } else {
     [[maybe_unused]] mlir::ValueTypeRange<mlir::ResultRange> resultTypes =
         calleePtr->getResultTypes();
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 8c39d94a6b2ec..9ebdac56006a4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1935,8 +1935,45 @@ void CIRGenModule::replaceUsesOfNonProtoTypeWithRealFunction(
       builder.setInsertionPoint(noProtoCallOp);
 
       // Patch call type with the real function type.
-      cir::CallOp realCallOp = builder.createCallOp(
-          noProtoCallOp.getLoc(), newFn, noProtoCallOp.getOperands());
+      cir::FuncType newFnType = newFn.getFunctionType();
+      mlir::OperandRange callOperands = noProtoCallOp.getOperands();
+      bool returnTypeMatches =
+          newFnType.hasVoidReturn()
+              ? noProtoCallOp.getNumResults() == 0
+              : noProtoCallOp.getNumResults() == 1 &&
+                    noProtoCallOp.getResultTypes().front() ==
+                        newFnType.getReturnType();
+      bool typesMatch = !newFn.getNoProto() && returnTypeMatches &&
+                        callOperands.size() == newFnType.getNumInputs();
+      for (unsigned i = 0, e = newFnType.getNumInputs(); typesMatch && i != e;
+           ++i) {
+        if (callOperands[i].getType() != newFnType.getInput(i))
+          typesMatch = false;
+      }
+
+      cir::CallOp realCallOp;
+      if (typesMatch) {
+        // Patch call type with the real function type.
+        realCallOp =
+            builder.createCallOp(noProtoCallOp.getLoc(), newFn, callOperands);
+      } else {
+        // Build an indirect call whose function-pointer signature matches
+        // the existing call site.
+        cir::FuncType origFnType = oldFn.getFunctionType();
+        cir::FuncType callFnType =
+            origFnType.isVarArg()
+                ? cir::FuncType::get(origFnType.getInputs(),
+                                     origFnType.getReturnType(),
+                                     /*isVarArg=*/false)
+                : origFnType;
+        mlir::Value addr = cir::GetGlobalOp::create(
+            builder, noProtoCallOp.getLoc(), cir::PointerType::get(newFnType),
+            newFn.getSymName());
+        mlir::Value casted =
+            builder.createBitcast(addr, cir::PointerType::get(callFnType));
+        realCallOp = builder.createIndirectCallOp(
+            noProtoCallOp.getLoc(), casted, callFnType, callOperands);
+      }
 
       // Replace old no proto call with fixed call.
       noProtoCallOp.replaceAllUsesWith(realCallOp);
diff --git a/clang/test/CIR/CodeGen/asm-label-redirect-inline.c b/clang/test/CIR/CodeGen/asm-label-redirect-inline.c
new file mode 100644
index 0000000000000..3b97fc8ddb6de
--- /dev/null
+++ b/clang/test/CIR/CodeGen/asm-label-redirect-inline.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -disable-llvm-passes -o %t.cir %s
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -disable-llvm-passes -o %t-cir.ll %s
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -disable-llvm-passes -o %t.ll %s
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+// This test mirrors what older glibc headers (e.g. GCC 8.5 / glibc 2.27 era)
+// expand to with -D_FILE_OFFSET_BITS=64. There are two declarations sharing
+// the same mangled symbol "real_impl":
+//
+//   1. `my_stat` is declared with `struct my_stat *` and asm-renamed to
+//      "real_impl" - this is the GNU __REDIRECT_NTH pattern.
+//   2. `real_impl` is declared with `struct my_stat64 *` and additionally
+//      provided as an `extern __inline __gnu_inline__` definition with a
+//      body - this is the FORTIFY/__extern_inline wrapper pattern.
+//
+// CIR materializes the FuncOp for "real_impl" lazily on first use. When
+// `test` calls `my_stat(p, &s)`, the FuncOp gets created with the
+// `(ptr, ptr) -> i32` signature. Later, `real_impl`'s inline body
+// is materialized: this triggers `replaceUsesOfNonProtoTypeWithRealFunction`
+// which retroactively rewires the existing call site onto the new FuncOp,
+// whose signature is `(ptr, ptr) -> i32`. Without special
+// handling, the rewired direct call would carry mismatching operand types.
+// The rewrite must instead fall back to an indirect call through a
+// function-pointer bitcast, the same shape used at initial call emission
+// time.
+
+struct my_stat   { int  legacy_field; };
+struct my_stat64 { long modern_field; };
+
+extern int xreal_impl(const char *path, struct my_stat64 *buf);
+
+extern __inline __attribute__((__always_inline__))
+__attribute__((__gnu_inline__)) int
+real_impl(const char *path, struct my_stat64 *buf) {
+  return xreal_impl(path, buf);
+}
+
+extern int my_stat(const char *path, struct my_stat *buf) __asm__("real_impl");
+
+int test(const char *p) {
+  struct my_stat s;
+  return my_stat(p, &s);
+}
+
+// CIR-LABEL: cir.func {{.*}} @test(
+//
+// After `real_impl`'s body is materialized, the FuncOp's signature is
+// updated to `(ptr, ptr) -> i32`. The pre-existing call site
+// in `test` (which still has `(ptr, ptr)` operand types) gets
+// rewritten to an indirect call through a function-pointer bitcast.
+// CIR:         %[[GLOBAL:.+]] = cir.get_global @real_impl : !cir.ptr, !cir.ptr) -> !s32i>>
+// CIR-NEXT:    %[[PTR:.+]] = cir.cast bitcast %[[GLOBAL]] : !cir.ptr, !cir.ptr) -> !s32i>> -> !cir.ptr, !cir.ptr) -> !s32i>>
+// CIR-NEXT:    cir.call %[[PTR]](%{{.+}}, %{{.+}}) : (!cir.ptr, !cir.ptr) -> !s32i>>, !cir.ptr, !cir.ptr) -> !s32i
+
+// The inline definition of `real_impl` is emitted with `available_externally`
+// linkage, taking `struct my_stat64 *`.
+// CIR:       cir.func {{.*}} available_externally @real_impl(%{{.+}}: !cir.ptr{{.*}}, %{{.+}}: !cir.ptr{{.*}}) -> !s32i
+
+// LLVM-LABEL: define dso_local i32 @test(
+// LLVM:         %{{.+}} = call i32 @real_impl(ptr {{.*}}%{{.+}}, ptr {{.*}}%{{.+}})
+// LLVM:       define {{.*}}available_externally i32 @real_impl(
+
+// OGCG-LABEL: define dso_local i32 @test(
+// OGCG:         %{{.+}} = call i32 @real_impl(ptr {{.*}}%{{.+}}, ptr {{.*}}%{{.+}})
+// OGCG:       define {{.*}}available_externally i32 @real_impl(
diff --git a/clang/test/CIR/CodeGen/asm-label-redirect.c b/clang/test/CIR/CodeGen/asm-label-redirect.c
new file mode 100644
index 0000000000000..3e40976df04a9
--- /dev/null
+++ b/clang/test/CIR/CodeGen/asm-label-redirect.c
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -disable-llvm-passes -o %t.cir %s
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -disable-llvm-passes -o %t-cir.ll %s
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -disable-llvm-passes -o %t.ll %s
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+// This test simulates the GNU __REDIRECT_NTH pattern from older glibc headers.
+
+struct my_stat   { int  legacy_field; };
+struct my_stat64 { long modern_field; };
+
+extern int my_stat(const char *path, struct my_stat *buf) __asm__("real_impl");
+extern int real_impl(const char *path, struct my_stat64 *buf);
+
+int test(const char *p) {
+  struct my_stat   s_old;
+  struct my_stat64 s_new;
+  int r1 = my_stat(p, &s_old);
+  int r2 = real_impl(p, &s_new);
+  return r1 + r2;
+}
+
+// Both declarations are mangled to the same symbol "real_impl". CIR
+// materializes a single FuncOp whose signature is whichever declaration
+// it sees first - here, the my_stat declaration.
+//
+// CIR-LABEL: cir.func private @real_impl(
+// CIR-SAME:    !cir.ptr {{.*}},
+// CIR-SAME:    !cir.ptr {{.*}}) -> !s32i
+
+// CIR-LABEL: cir.func {{.*}} @test(
+//
+// The first call site uses `struct my_stat *`, which matches the FuncOp's
+// stored signature, so it lowers to a direct call.
+// CIR:         %[[R1:.*]] = cir.call @real_impl(%{{.+}}, %{{.+}}) :
+// CIR-SAME:      (!cir.ptr {{.*}}, !cir.ptr {{.*}}) -> !s32i
+//
+// The second call site uses `struct my_stat64 *`. The FuncOp's stored
+// signature does not match, so the function pointer is bitcast to the
+// call site's expected signature and the call becomes indirect.
+// CIR:         %[[GLOBAL:.*]] = cir.get_global @real_impl :
+// CIR-SAME:      !cir.ptr, !cir.ptr) -> !s32i>>
+// CIR-NEXT:    %[[PTR:.*]] = cir.cast bitcast %[[GLOBAL]] :
+// CIR-SAME:      !cir.ptr, !cir.ptr) -> !s32i>>
+// CIR-SAME:      -> !cir.ptr, !cir.ptr) -> !s32i>>
+// CIR-NEXT:    %[[R2:.*]] = cir.call %[[PTR]](%{{.+}}, %{{.+}}) :
+// CIR-SAME:      (!cir.ptr, !cir.ptr) -> !s32i>>,
+// CIR-SAME:       !cir.ptr {{.*}}, !cir.ptr {{.*}}) -> !s32i
+
+// LLVM:        declare i32 @real_impl(ptr {{[^,]*}}, ptr {{.*}})
+// LLVM-LABEL:  define dso_local i32 @test(
+// LLVM:          %{{.+}} = call i32 @real_impl(ptr {{.*}}%{{.+}}, ptr {{.*}}%{{.+}})
+// LLVM:          %{{.+}} = call i32 @real_impl(ptr {{.*}}%{{.+}}, ptr {{.*}}%{{.+}})
+
+// OGCG-LABEL:  define dso_local i32 @test(
+// OGCG:          %{{.+}} = call i32 @real_impl(ptr {{.*}}%{{.+}}, ptr {{.*}}%{{.+}})
+// OGCG:          %{{.+}} = call i32 @real_impl(ptr {{.*}}%{{.+}}, ptr {{.*}}%{{.+}})
+// OGCG:        declare i32 @real_impl(ptr {{[^,]*}}, ptr {{.*}})

From 033c9e65cdef46a374c3622633777fbf0f7ad35b Mon Sep 17 00:00:00 2001
From: Alexey Bataev 
Date: Tue, 12 May 2026 14:06:33 -0400
Subject: [PATCH 492/538] [SLP][NFC]Add a test with the overestimation for
 block with noreturn function call

Reviewers:

Pull Request: https://github.com/llvm/llvm-project/pull/197264
---
 .../AArch64/spillcost-noreturn-block.ll       | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-noreturn-block.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-noreturn-block.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-noreturn-block.ll
new file mode 100644
index 0000000000000..2514ebc11010a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-noreturn-block.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t %s --check-prefix=YAML
+
+declare void @external_call()
+declare void @abort() noreturn
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            StoresVectorized
+; YAML-NEXT: Function:        test_noreturn_in_loop_body
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '-99'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '6'
+; YAML-NEXT: ...
+define void @test_noreturn_in_loop_body(ptr noalias %res, ptr noalias %in, double %x, double %y, i1 %err) {
+; CHECK-LABEL: define void @test_noreturn_in_loop_body(
+; CHECK-SAME: ptr noalias [[RES:%.*]], ptr noalias [[IN:%.*]], double [[X:%.*]], double [[Y:%.*]], i1 [[ERR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[X]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[Y]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <2 x double> splat (double 1.000000e+00), [[TMP1]]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY:.*]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[IV]], 100
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_BODY:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_BODY]]:
+; CHECK-NEXT:    [[GEP_IN_0:%.*]] = getelementptr double, ptr [[IN]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP_OUT_0:%.*]] = getelementptr double, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[GEP_IN_0]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[GEP_OUT_0]], align 8
+; CHECK-NEXT:    br i1 [[ERR]], label %[[ERROR:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[ERROR]]:
+; CHECK-NEXT:    call void @external_call()
+; CHECK-NEXT:    call void @abort()
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = fdiv double 1.000000e+00, %x
+  %b = fdiv double 1.000000e+00, %y
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %cmp = icmp slt i64 %iv, 100
+  br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+  %gep.in.0 = getelementptr double, ptr %in, i64 %iv
+  %gep.in.1 = getelementptr inbounds double, ptr %gep.in.0, i64 1
+  %v0 = load double, ptr %gep.in.0, align 8
+  %v1 = load double, ptr %gep.in.1, align 8
+  %r1 = fsub double %v0, %a
+  %r2 = fsub double %v1, %b
+  %gep.out.0 = getelementptr double, ptr %res, i64 %iv
+  %gep.out.1 = getelementptr inbounds double, ptr %gep.out.0, i64 1
+  store double %r1, ptr %gep.out.0, align 8
+  store double %r2, ptr %gep.out.1, align 8
+  br i1 %err, label %error, label %loop.latch
+
+error:
+  call void @external_call()
+  call void @abort()
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  br label %loop.header
+
+exit:
+  ret void
+}

From 1537c32236103455236e89b22fa9c52805888167 Mon Sep 17 00:00:00 2001
From: Diksha 
Date: Tue, 12 May 2026 11:27:28 -0700
Subject: [PATCH 493/538] [mlir] Use add_tablegen() for mlir-src-sharder to fix
 aarch64 cross-compile (#196202)

`add_tablegen()` already sets `MLIR_SRC_SHARDER_TABLEGEN_EXE` to the
native host-tool path during cross-compilation (via
`build_native_tool`). The leftover manual
`set(MLIR_SRC_SHARDER_TABLEGEN_EXE mlir-src-sharder PARENT_SCOPE)`
clobbered that path with the bare binary name, causing aarch64
cross-builds to fail with:

```
/bin/sh: 1: mlir-src-sharder: not found
```

when sharding `TestOps`. Switching `mlir-src-sharder` from
`add_llvm_executable` to `add_tablegen` (and dropping the redundant
`set(... PARENT_SCOPE)`) lets the existing cross-compile machinery point
consumers at the host build of the tool.
---
 mlir/tools/mlir-src-sharder/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mlir/tools/mlir-src-sharder/CMakeLists.txt b/mlir/tools/mlir-src-sharder/CMakeLists.txt
index ba78dad1c87b5..bc4a11bd4c0cb 100644
--- a/mlir/tools/mlir-src-sharder/CMakeLists.txt
+++ b/mlir/tools/mlir-src-sharder/CMakeLists.txt
@@ -1,11 +1,10 @@
 set(LLVM_LINK_COMPONENTS Support)
 set(LIBS MLIRSupport)
 
-add_llvm_executable(mlir-src-sharder mlir-src-sharder.cpp)
+add_tablegen(mlir-src-sharder MLIR_SRC_SHARDER
+  mlir-src-sharder.cpp)
 
 set_target_properties(mlir-src-sharder PROPERTIES FOLDER "MLIR/Tablegenning")
 target_link_libraries(mlir-src-sharder PRIVATE ${LIBS})
 
-set(MLIR_SRC_SHARDER_TABLEGEN_EXE mlir-src-sharder PARENT_SCOPE)
-
 mlir_check_all_link_libraries(mlir-src-sharder)

From e249fddc70948f9dac2507c84a6114f695d8cc3f Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak 
Date: Tue, 12 May 2026 13:54:04 -0500
Subject: [PATCH 494/538] [mlir][memref] Drop inaccurate bounds on
 collapse_shape delinearize (#197041)

If (as with something like `vector.load`) we know that the indices that
are indexing a `collapse_shape`'d memref are not necessarily within the
bounds of the product of those collapsed dimensions, we shouldn't be
setting the outermost bound on the
`affine.delinearize_index` we use to split up those indices, as that
would incorrectly assert in-bounds-ness per the semantics of
`affine.delinearize_index`.

Resolve this by giving `resolveSourceIndicesCollapseShape` a
`startsInbounds` parameter by analogy to the one for expand shape, and
conservatively set that to false for AMDGPU ops for now.

AI: An LLM spotted this issue while I was implementing the indexed
access ops for AMDGPU operations, but I made the changes by hand.
---
 .../mlir/Dialect/MemRef/Utils/MemRefUtils.h   |  7 +++-
 .../AMDGPU/Transforms/FoldMemRefsOps.cpp      |  5 ++-
 .../Affine/Transforms/FoldMemRefAliasOps.cpp  |  6 ++-
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  | 12 ++++--
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 14 +++++--
 .../Dialect/AMDGPU/amdgpu-fold-memrefs.mlir   | 10 ++---
 .../Dialect/MemRef/fold-memref-alias-ops.mlir | 38 +++++++++++++++++--
 7 files changed, 71 insertions(+), 21 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index f58b776138def..613d567de2457 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -169,10 +169,15 @@ void resolveSourceIndicesExpandShape(Location loc, PatternRewriter &rewriter,
 ///
 /// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
 ///          memref<2x6x42xf32>
+///
+/// If `startsInbounds` is true, optimizations that rely on all indices being
+/// non-negative and less than the corresponding memref dimension may be
+/// performed.
 void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
                                        memref::CollapseShapeOp collapseShapeOp,
                                        ValueRange indices,
-                                       SmallVectorImpl &sourceIndices);
+                                       SmallVectorImpl &sourceIndices,
+                                       bool startsInbounds);
 
 /// Given the 'indices' of a load/store operation where the memref is a result
 /// of a rank-reducing full subview op, returns the indices w.r.t to the source
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 53cb673ced999..24c30525957c7 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -46,14 +46,17 @@ static LogicalResult foldMemrefViewOp(PatternRewriter &rewriter, Location loc,
         return success();
       })
       .Case([&](memref::ExpandShapeOp expandShapeOp) {
+        // The lack of inbounds is conservative and will be fixed.
         mlir::memref::resolveSourceIndicesExpandShape(
             loc, rewriter, expandShapeOp, indices, resolvedIndices, false);
         memrefBase = expandShapeOp.getViewSource();
         return success();
       })
       .Case([&](memref::CollapseShapeOp collapseShapeOp) {
+        // The collapse shape in-bounds-ness is defaulted to false
+        // conservatively.
         mlir::memref::resolveSourceIndicesCollapseShape(
-            loc, rewriter, collapseShapeOp, indices, resolvedIndices);
+            loc, rewriter, collapseShapeOp, indices, resolvedIndices, false);
         memrefBase = collapseShapeOp.getViewSource();
         return success();
       })
diff --git a/mlir/lib/Dialect/Affine/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/Affine/Transforms/FoldMemRefAliasOps.cpp
index 6f6e40f586fc8..bb7b231ea02c2 100644
--- a/mlir/lib/Dialect/Affine/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/FoldMemRefAliasOps.cpp
@@ -128,7 +128,8 @@ struct AffineLoadOpOfCollapseShapeOpFolder final
 
     SmallVector sourceIndices;
     memref::resolveSourceIndicesCollapseShape(
-        loadOp.getLoc(), rewriter, collapseShapeOp, indices, sourceIndices);
+        loadOp.getLoc(), rewriter, collapseShapeOp, indices, sourceIndices,
+        /*startsInbounds=*/true);
 
     rewriter.replaceOpWithNewOp(
         loadOp, collapseShapeOp.getViewSource(), sourceIndices);
@@ -212,7 +213,8 @@ struct AffineStoreOpOfCollapseShapeOpFolder final
 
     SmallVector sourceIndices;
     memref::resolveSourceIndicesCollapseShape(
-        storeOp.getLoc(), rewriter, collapseShapeOp, indices, sourceIndices);
+        storeOp.getLoc(), rewriter, collapseShapeOp, indices, sourceIndices,
+        /*startsInbounds=*/true);
 
     rewriter.replaceOpWithNewOp(
         storeOp, storeOp.getValueToStore(), collapseShapeOp.getViewSource(),
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index de7662753d142..f5c5a48e7f543 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -316,7 +316,8 @@ LogicalResult AccessOpOfCollapseShapeOpFolder::matchAndRewrite(
 
   SmallVector sourceIndices;
   memref::resolveSourceIndicesCollapseShape(op.getLoc(), rewriter, collapse,
-                                            op.getIndices(), sourceIndices);
+                                            op.getIndices(), sourceIndices,
+                                            op.hasInboundsIndices());
 
   std::optional> newValues = op.updateMemrefAndIndices(
       rewriter, collapse.getViewSource(), sourceIndices);
@@ -405,13 +406,15 @@ LogicalResult IndexedMemCopyOpOfCollapseShapeOpFolder::matchAndRewrite(
     newSrc = srcCollapse.getViewSource();
     newSrcIndices.clear();
     memref::resolveSourceIndicesCollapseShape(
-        op.getLoc(), rewriter, srcCollapse, op.getSrcIndices(), newSrcIndices);
+        op.getLoc(), rewriter, srcCollapse, op.getSrcIndices(), newSrcIndices,
+        /*startsInbounds=*/true);
   }
   if (dstCollapse) {
     newDst = dstCollapse.getViewSource();
     newDstIndices.clear();
     memref::resolveSourceIndicesCollapseShape(
-        op.getLoc(), rewriter, dstCollapse, op.getDstIndices(), newDstIndices);
+        op.getLoc(), rewriter, dstCollapse, op.getDstIndices(), newDstIndices,
+        /*startsInbounds=*/true);
   }
   op.setMemrefsAndIndices(rewriter, newSrc, newSrcIndices, newDst,
                           newDstIndices);
@@ -550,7 +553,8 @@ LogicalResult TransferOpOfCollapseShapeOpFolder::matchAndRewrite(
 
   SmallVector newIndices;
   memref::resolveSourceIndicesCollapseShape(op.getLoc(), rewriter, collapse,
-                                            op.getIndices(), newIndices);
+                                            op.getIndices(), newIndices,
+                                            /*startsInbounds=*/!op.getMask());
 
   op.updateStartingPosition(rewriter, collapse.getViewSource(), newIndices,
                             AffineMapAttr::get(newPerm));
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 0899b1a9faeb4..bc5a327df9140 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -258,7 +258,8 @@ void resolveSourceIndicesExpandShape(Location loc, PatternRewriter &rewriter,
 void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
                                        memref::CollapseShapeOp collapseShapeOp,
                                        ValueRange indices,
-                                       SmallVectorImpl &sourceIndices) {
+                                       SmallVectorImpl &sourceIndices,
+                                       bool startsInbounds) {
   // Note: collapse_shape requires a strided memref, we can do this.
   auto metadata = memref::ExtractStridedMetadataOp::create(
       rewriter, loc, collapseShapeOp.getSrc());
@@ -273,10 +274,15 @@ void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
       continue;
     }
 
-    SmallVector basis =
-        llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; });
+    // If we don't know that this value is in-bounds, the largest return value
+    // of the delinearization may exceed `sourceSizes[d]`, so we drop that first
+    // group entry in order to maintain soundness.
+    auto trimmedGroup =
+        ArrayRef(group).drop_front(startsInbounds ? 0 : 1);
+    SmallVector basis = llvm::map_to_vector(
+        trimmedGroup, [&](int64_t d) { return sourceSizes[d]; });
     auto delinearize = affine::AffineDelinearizeIndexOp::create(
-        rewriter, loc, index, basis, /*hasOuterBound=*/true);
+        rewriter, loc, index, basis, /*hasOuterBound=*/startsInbounds);
     llvm::append_range(sourceIndices, delinearize.getResults());
   }
   if (collapseShapeOp.getReassociationIndices().empty()) {
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 1274fe59f8be5..82f76953e2522 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -82,8 +82,8 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
-  // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
+  // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (128) : index, index
+  // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64) : index, index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space>
 
@@ -256,7 +256,7 @@ func.func @test_transpose_load_expand_shape(%offset_i: index, %offset_j: index)
 // CHECK-SAME: %[[ARG0:.*]]: index
 func.func @test_transpose_load_collapse_shape(%offset_i: index) -> vector<4xf16> {
   // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space>
-  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (32, 128) : index, index
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (128) : index, index
   // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[INDICES]]#0, %[[INDICES]]#1]
   // CHECK-SAME: memref<32x128xf16, #gpu.address_space> -> vector<4xf16>
 
@@ -347,7 +347,7 @@ func.func @test_make_gather_dma_base_subview(%mem: memref<64x128xf16, #gpu_globa
 // CHECK: func @test_make_gather_dma_base_collapse_shape
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_IDX:.*]]: index
 func.func @test_make_gather_dma_base_collapse_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_idx: index) {
-  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[LDS_IDX]] into (64, 64) : index, index
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[LDS_IDX]] into (64) : index, index
   // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space>, memref<64x64xf16, #gpu.address_space> -> !amdgpu.tdm_gather_base
 
@@ -456,7 +456,7 @@ func.func @test_global_load_async_to_lds_both_fold_masked(%src: memref<64x128xf3
 // CHECK: func @test_global_load_async_to_lds_no_mask_dst_collapse
 // CHECK-SAME: %[[SRC:.*]]: memref<8192xi32, #gpu.address_space>, %[[LDS:.*]]: memref<64x64xi32, #gpu.address_space>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index
 func.func @test_global_load_async_to_lds_no_mask_dst_collapse(%src: memref<8192xi32, #gpu_global_addrspace>, %lds: memref<64x64xi32, #gpu_lds_addrspace>, %src_idx: index, %dst_idx: index) {
-  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[DST_IDX]] into (64, 64) : index, index
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[DST_IDX]] into (64) : index, index
   // CHECK: amdgpu.global_load_async_to_lds %[[SRC]][%[[SRC_IDX]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1] :
   // CHECK-SAME: i32, memref<8192xi32, #gpu.address_space>, memref<64x64xi32, #gpu.address_space>
 
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 6e2702d936ee0..48241a3e5d9df 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -988,6 +988,21 @@ func.func @fold_vector_transfer_read_rank_mismatch(
 
 // -----
 
+func.func @fold_memref_load_collapse_shape(
+  %arg0 : memref<4x8xf32>, %arg1 : index) -> f32 {
+  %0 = memref.collapse_shape %arg0 [[0, 1]] : memref<4x8xf32> into memref<32xf32>
+  %1 = memref.load %0[%arg1] {nontemporal = true} : memref<32xf32>
+  return %1 : f32
+}
+
+// CHECK-LABEL: func @fold_memref_load_collapse_shape
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x8xf32>
+//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
+//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (4, 8)
+//       CHECK:   memref.load %[[ARG0]][%[[IDXS]]#0, %[[IDXS]]#1] {nontemporal = true}
+
+// -----
+
 func.func @fold_vector_load_collapse_shape(
   %arg0 : memref<4x8xf32>, %arg1 : index) -> vector<8xf32> {
   %0 = memref.collapse_shape %arg0 [[0, 1]] : memref<4x8xf32> into memref<32xf32>
@@ -998,7 +1013,7 @@ func.func @fold_vector_load_collapse_shape(
 // CHECK-LABEL: func @fold_vector_load_collapse_shape
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x8xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
-//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (4, 8)
+//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (8)
 //       CHECK:   vector.load %[[ARG0]][%[[IDXS]]#0, %[[IDXS]]#1] {nontemporal = true}
 
 // -----
@@ -1015,11 +1030,26 @@ func.func @fold_vector_maskedload_collapse_shape(
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
 //  CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: vector<8xi1>
 //  CHECK-SAME:   %[[ARG4:[a-zA-Z0-9_]+]]: vector<8xf32>
-//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (4, 8)
+//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (8)
 //       CHECK:   vector.maskedload %[[ARG0]][%[[IDXS]]#0, %[[IDXS]]#1], %[[ARG3]], %[[ARG4]]
 
 // -----
 
+func.func @fold_memref_store_collapse_shape(
+  %arg0 : memref<4x8xf32>, %arg1 : index, %val : f32) {
+  %0 = memref.collapse_shape %arg0 [[0, 1]] : memref<4x8xf32> into memref<32xf32>
+  memref.store %val, %0[%arg1] {nontemporal = true} : memref<32xf32>
+  return
+}
+
+// CHECK-LABEL: func @fold_memref_store_collapse_shape
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x8xf32>
+//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
+//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (4, 8)
+//       CHECK:   memref.store %{{.*}}, %[[ARG0]][%[[IDXS]]#0, %[[IDXS]]#1] {nontemporal = true}
+
+// -----
+
 func.func @fold_vector_store_collapse_shape(
   %arg0 : memref<4x8xf32>, %arg1 : index, %val : vector<8xf32>) {
   %0 = memref.collapse_shape %arg0 [[0, 1]] : memref<4x8xf32> into memref<32xf32>
@@ -1030,7 +1060,7 @@ func.func @fold_vector_store_collapse_shape(
 // CHECK-LABEL: func @fold_vector_store_collapse_shape
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x8xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
-//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (4, 8)
+//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (8)
 //       CHECK:   vector.store %{{.*}}, %[[ARG0]][%[[IDXS]]#0, %[[IDXS]]#1] {nontemporal = true}
 
 // -----
@@ -1047,7 +1077,7 @@ func.func @fold_vector_maskedstore_collapse_shape(
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
 //  CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: vector<8xi1>
 //  CHECK-SAME:   %[[ARG4:[a-zA-Z0-9_]+]]: vector<8xf32>
-//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (4, 8)
+//       CHECK:   %[[IDXS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (8)
 //       CHECK:   vector.maskedstore %[[ARG0]][%[[IDXS]]#0, %[[IDXS]]#1], %[[ARG3]], %[[ARG4]]
 
 // -----

From e365ea8d03f28e5a6b727584a1dc880756beaa1e Mon Sep 17 00:00:00 2001
From: Harald van Dijk 
Date: Tue, 12 May 2026 20:04:57 +0100
Subject: [PATCH 495/538] [DebugInfo] Robustify DISubprogram upgrade (#190611)

The bitcode upgrade process for DISubprogram was fragile and depended on
the exact order in which DICompileUnit, DISubprogram, and the list of
DISubprograms were emitted. This would work out okay with bitcode
written by standard LLVM, but was not a rule that the old format had.
This commit makes the upgrade process work when these metadata nodes are
emitted in any order, and updates the existing upgrade-subprogram.ll
test to cover all orders. The corresponding .bc file was written by a
patched version of LLVM 3.7 that provides a hook to force nodes to be
emitted in a specific order and can be inspected with llvm-bcanalyzer
--dump.
---
 llvm/lib/Bitcode/Reader/MetadataLoader.cpp |  14 ++--
 llvm/test/Bitcode/upgrade-subprogram.ll    |  74 ++++++++++++++++++---
 llvm/test/Bitcode/upgrade-subprogram.ll.bc | Bin 784 -> 1080 bytes
 3 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index a11e5609d5294..769bf473e4ec3 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -446,7 +446,7 @@ class MetadataLoader::MetadataLoaderImpl {
 
   // Keep mapping of seens pair of old-style CU <-> SP, and update pointers to
   // point from SP to CU after a block is completly parsed.
-  std::vector> CUSubprograms;
+  std::vector> CUSubprograms;
 
   /// Functions that need to be matched with subprograms when upgrading old
   /// metadata.
@@ -485,7 +485,8 @@ class MetadataLoader::MetadataLoaderImpl {
   /// Upgrade old-style CU <-> SP pointers to point from SP to CU.
   void upgradeCUSubprograms() {
     for (auto CU_SP : CUSubprograms)
-      if (auto *SPs = dyn_cast_or_null(CU_SP.second))
+      if (auto *SPs =
+              dyn_cast_or_null(MetadataList.lookup(CU_SP.second - 1)))
         for (auto &Op : SPs->operands())
           if (auto *SP = dyn_cast_or_null(Op))
             SP->replaceUnit(CU_SP.first);
@@ -1333,11 +1334,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       return getMD(ID - 1);
     return nullptr;
   };
-  auto getMDOrNullWithoutPlaceholders = [&](unsigned ID) -> Metadata * {
-    if (ID)
-      return MetadataList.getMetadataFwdRef(ID - 1);
-    return nullptr;
-  };
   auto getMDString = [&](unsigned ID) -> MDString * {
     // This requires that the ID is not really a forward reference.  In
     // particular, the MDString must already have been resolved.
@@ -1975,8 +1971,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     NextMetadataNo++;
 
     // Move the Upgrade the list of subprograms.
-    if (Metadata *SPs = getMDOrNullWithoutPlaceholders(Record[11]))
-      CUSubprograms.push_back({CU, SPs});
+    if (Record[11])
+      CUSubprograms.push_back({CU, Record[11]});
     break;
   }
   case bitc::METADATA_SUBPROGRAM: {
diff --git a/llvm/test/Bitcode/upgrade-subprogram.ll b/llvm/test/Bitcode/upgrade-subprogram.ll
index fb807ed6daaf0..a9a5399c7fa97 100644
--- a/llvm/test/Bitcode/upgrade-subprogram.ll
+++ b/llvm/test/Bitcode/upgrade-subprogram.ll
@@ -1,17 +1,75 @@
 ; RUN: llvm-dis < %s.bc | FileCheck %s
 ; RUN: verify-uselistorder < %s.bc
 
-; CHECK: define void @foo() !dbg [[SP:![0-9]+]]
-define void @foo() {
+; CHECK: define void @fooCLS() !dbg [[CLSSP:![0-9]+]]
+define void @fooCLS() {
+  ret void
+}
+
+; CHECK: define void @fooCSL() !dbg [[CSLSP:![0-9]+]]
+define void @fooCSL() {
+  ret void
+}
+
+; CHECK: define void @fooLCS() !dbg [[LCSSP:![0-9]+]]
+define void @fooLCS() {
+  ret void
+}
+
+; CHECK: define void @fooLSC() !dbg [[LSCSP:![0-9]+]]
+define void @fooLSC() {
+  ret void
+}
+
+; CHECK: define void @fooSCL() !dbg [[SCLSP:![0-9]+]]
+define void @fooSCL() {
+  ret void
+}
+
+; CHECK: define void @fooSLC() !dbg [[SLCSP:![0-9]+]]
+define void @fooSLC() {
   ret void
 }
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 
-!llvm.dbg.cu = !{!1}
-!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, emissionKind: FullDebug)
-!2 = !DIFile(filename: "foo.c", directory: "/path/to/dir")
-; CHECK: [[SP]] = distinct !DISubprogram
-!3 = distinct !DISubprogram(file: !2, scope: !2, line: 51, name: "foo", function: void ()* @foo, type: !4)
-!4 = !DISubroutineType(types: !{})
+!llvm.dbg.cu = !{!3, !6, !10, !14, !16, !20}
+!1 = !DIFile(filename: "foo.c", directory: "/path/to/dir")
+!2 = !DISubroutineType(types: !{})
+
+; CHECK-DAG: [[CLSSP]] = distinct !DISubprogram{{.*}}, unit: [[CLSCU:![0-9]+]]
+; CHECK-DAG: [[CLSCU]] = distinct !DICompileUnit
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, subprograms: !4, emissionKind: 1)
+!4 = !{!5}
+!5 = !DISubprogram(file: !1, scope: !2, line: 51, name: "fooCLS", function: void ()* @fooCLS, type: !2)
+
+; CHECK-DAG: [[CSLSP]] = distinct !DISubprogram{{.*}}, unit: [[CSLCU:![0-9]+]]
+; CHECK-DAG: [[CSLCU]] = distinct !DICompileUnit
+!6 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, subprograms: !8, emissionKind: 1)
+!7 = !DISubprogram(file: !1, scope: !2, line: 51, name: "fooCSL", function: void ()* @fooCSL, type: !2)
+!8 = !{!7}
+
+; CHECK-DAG: [[LCSSP]] = distinct !DISubprogram{{.*}}, unit: [[LCSCU:![0-9]+]]
+; CHECK-DAG: [[LCSCU]] = distinct !DICompileUnit
+!9 = !{!11}
+!10 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, subprograms: !9, emissionKind: 1)
+!11 = !DISubprogram(file: !1, scope: !2, line: 51, name: "fooLCS", function: void ()* @fooLCS, type: !2)
+
+; CHECK-DAG: [[LSCSP]] = distinct !DISubprogram{{.*}}, unit: [[LSCCU:![0-9]+]]
+; CHECK-DAG: [[LSCCU]] = distinct !DICompileUnit
+!12 = !{!13}
+!13 = !DISubprogram(file: !1, scope: !2, line: 51, name: "fooLSC", function: void ()* @fooLSC, type: !2)
+!14 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, subprograms: !12, emissionKind: 1)
+
+; CHECK-DAG: [[SCLSP]] = distinct !DISubprogram{{.*}}, unit: [[SCLCU:![0-9]+]]
+; CHECK-DAG: [[SCLCU]] = distinct !DICompileUnit
+!15 = !DISubprogram(file: !1, scope: !2, line: 51, name: "fooSCL", function: void ()* @fooSCL, type: !2)
+!16 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, subprograms: !17, emissionKind: 1)
+!17 = !{!15}
+
+; CHECK-DAG: [[SLCSP]] = distinct !DISubprogram{{.*}}, unit: [[SLCCU:![0-9]+]]
+; CHECK-DAG: [[SLCCU]] = distinct !DICompileUnit
+!18 = !DISubprogram(file: !1, scope: !2, line: 51, name: "fooSLC", function: void ()* @fooSLC, type: !2)
+!19 = !{!18}
+!20 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, subprograms: !19, emissionKind: 1)
diff --git a/llvm/test/Bitcode/upgrade-subprogram.ll.bc b/llvm/test/Bitcode/upgrade-subprogram.ll.bc
index cfab5a2c76a9814c8324d6cece8bf7172fc1ea65..814296e7e2799be7890d84265d7d34bd4f5e053e 100644
GIT binary patch
delta 681
zcmbQhwu6Jq$@#!NMIHtQZpMjfv+LVz7#ZMzr;-WIW{~*Ag5VfD;Y4r}P6#774q_ZY
z4l58F$;}Xu6!e+U!>Q(ejKw)ga0(+sr33>*I#5oE)8_(9qk)*mk^sI(0sPMy*j7H^
zd-j2^J|ThosRIA!34HG#@Cma#x?#X_Xu|@7_VNmLbBT7=Gtvi~EiN!y?P0Xtb69$U
zv(<^i3m7$ecn>UNWS9`>e?SIkv;)v93=Gmt>5Yke??3P@KESovz)_$fRCt2H0Yi39
z0iY5F2WEx?E7TMCfJ_C322O?s);7yMhpl@K)$43H#Nx;i=Ig-3cw9yWXciC%f{e>_
zZs@IIdIsV%F)UzUh)9@m*m@7McF!Rl9|4xct1K#F1!o+Yz+qIt$gn~2ftUo)L+?S>
za|yE~Mwl{ovdI)rU;u}24g*6E+_V!g(+=y@_b_P#jY|R=CkHp~10#dME6y3rK&v?z
z9Dq@EL!jXx(_xu=4`tIDMcopc6hv05DM&Iun9fKtIlYj}ZDQ*_vYCL{v&fV?Wq
L!pKAk8yuhjsDPw=

delta 382
zcmdnNF@a6Z$@#!NQxOIRRt5%!Bpw45pU8dB-hQ^z4LvKs1QcRWs3GpGo&Fh-cNvNOpPFJNGF1i40sfgy)uMxg%z86d}jfq|ESq4&m(H@9ML-8=E>
z#;KSyHx8Y9wv5GD+{|pZnYhtL%(zhmJ46u|dn0^ef?{)Y;DZvyy?6*Oe;9F#lcD0fLw=1!yBod+^c
z9?HIAluMHcwAkQmHK*Ac=#WGO1_nkT4RkM4A&|`uWC!sHD5W$lkx_6t@x;T@p<}{A
PraNprt&cf?a$o=ec;a
Date: Tue, 12 May 2026 12:14:44 -0700
Subject: [PATCH 496/538] [clang] Don't warn on __COUNTER__ in system macros
 (#196689)

The introduction of extension and compatibility warnings means that
`__COUNTER__` has started causing warnings (and -Werror= build failures)
due to use of system APIs.

This PR simply ensures that these diagnostics don't get reported for
system macro expansions as well.
---
 .../include/clang/Basic/DiagnosticLexKinds.td  |  4 ++--
 .../Lexer/Inputs/__counter__-system-header.h   |  7 +++++++
 clang/test/Lexer/__counter__-system-include.c  | 18 ++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Lexer/Inputs/__counter__-system-header.h
 create mode 100644 clang/test/Lexer/__counter__-system-include.c

diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 85fa290de6fd9..0ac7ac27a0271 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -93,10 +93,10 @@ def err_conflict_marker : Error<"version control conflict marker in file">;
 def err_counter_overflow : Error<
   "'__COUNTER__' value cannot exceed 2'147'483'647">;
 def ext_counter : Extension<
-  "'__COUNTER__' is a C2y extension">, InGroup;
+  "'__COUNTER__' is a C2y extension">, InGroup, SuppressInSystemMacro;
 def warn_counter : Warning<
   "'__COUNTER__' is incompatible with standards before C2y">,
-  InGroup, DefaultIgnore;
+  InGroup, DefaultIgnore, SuppressInSystemMacro;
 
 def err_raw_delim_too_long : Error<
   "raw string delimiter longer than 16 characters"
diff --git a/clang/test/Lexer/Inputs/__counter__-system-header.h b/clang/test/Lexer/Inputs/__counter__-system-header.h
new file mode 100644
index 0000000000000..60619215abd27
--- /dev/null
+++ b/clang/test/Lexer/Inputs/__counter__-system-header.h
@@ -0,0 +1,7 @@
+#define COUNTER_ALIAS __COUNTER__
+#define COUNTER_MACRO() __COUNTER__
+
+int header_counter_value = __COUNTER__;
+int header_counter_alias = COUNTER_ALIAS;
+int header_counter_macro = COUNTER_MACRO();
+
diff --git a/clang/test/Lexer/__counter__-system-include.c b/clang/test/Lexer/__counter__-system-include.c
new file mode 100644
index 0000000000000..5bf4b49f5ff2f
--- /dev/null
+++ b/clang/test/Lexer/__counter__-system-include.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1                  -Wpedantic %s -fsyntax-only -isystem %S/Inputs -verify=ext
+// RUN: %clang_cc1 -std=c2y         -Wpedantic %s -fsyntax-only -isystem %S/Inputs -verify
+// RUN: %clang_cc1 -std=c2y -Wpre-c2y-compat   %s -fsyntax-only -isystem %S/Inputs -verify=pre
+// RUN: %clang_cc1                            -pedantic %s -fsyntax-only -isystem %S/Inputs -verify=ext
+// RUN: %clang_cc1 -std=c2y -Wpre-c2y-compat  -pedantic %s -fsyntax-only -isystem %S/Inputs -verify=pre
+// RUN: %clang_cc1                            -pedantic-errors %s -fsyntax-only -isystem %S/Inputs -verify=pedant
+// RUN: %clang_cc1 -std=c2y -Wpre-c2y-compat  -pedantic-errors %s -fsyntax-only -isystem %S/Inputs -verify=pre
+
+#include <__counter__-system-header.h>
+
+// expected-no-diagnostics
+
+int tu_direct_reference = __COUNTER__; // #errorline
+// ext-warning@#errorline {{'__COUNTER__' is a C2y extension}}
+// pre-warning@#errorline {{'__COUNTER__' is incompatible with standards before C2y}}
+// pedant-error@#errorline {{'__COUNTER__' is a C2y extension}}
+int tu_counter_alias = COUNTER_ALIAS;
+int tu_counter_macro = COUNTER_MACRO();

From 40da6920a0d71d49dfa2392b09153600b0759f5e Mon Sep 17 00:00:00 2001
From: Igor Kudrin 
Date: Tue, 12 May 2026 12:18:14 -0700
Subject: [PATCH 497/538] Reland "[Clang][CodeGen] Report when an alias points
 to an incompatible target" (#195550)

This relands #192397, which was reverted in #194106. The new version
includes the following fixes:
- Set an explicit triple in the `attr-alias.m` test because aliases are
not supported on Darwin.
- Relax the check to only diagnose mismatches in return types and
parameter lists, while ignoring exception specifications and other
attributes.

Original description follows:

Add checks to ensure that an alias and its target have compatible types:
- Generate an error if a function alias points to a variable or vice
versa.
- Issue a warning for mismatches in function types.
- Ignore type discrepancies for variables.

This behavior aligns with similar diagnostics in GCC.

Resolves: #47301
---
 clang/docs/ReleaseNotes.rst                   |  4 ++
 .../clang/Basic/DiagnosticFrontendKinds.td    |  6 ++
 clang/lib/CodeGen/CodeGenModule.cpp           | 66 +++++++++++++++++++
 clang/test/CodeGen/alias.c                    |  2 +-
 clang/test/Sema/attr-alias-elf.c              | 55 ++++++++++++++++
 clang/test/SemaCXX/attr-alias.cpp             | 13 ++++
 clang/test/SemaObjC/attr-alias.m              |  6 ++
 compiler-rt/lib/dfsan/dfsan_custom.cpp        |  7 +-
 8 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/SemaCXX/attr-alias.cpp
 create mode 100644 clang/test/SemaObjC/attr-alias.m

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 896a4b4867ed4..db48f0860896f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -500,6 +500,10 @@ Improvements to Clang's diagnostics
 - Fixed false positive host-device mismatch errors in discarded `if constexpr` branches for CUDA/HIP;
   such calls are now correctly skipped.
 
+- Clang now errors when a function declaration aliases a variable or vice versa. (#GH195550)
+
+- Added ``-Wattribute-alias`` to diagnose type mismatches between an alias and its aliased function. (#GH195550)
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index f384a97b6825e..3d5da95de99ee 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -359,6 +359,12 @@ def err_non_default_visibility_dllimport : Error<
   "non-default visibility cannot be applied to 'dllimport' declaration">;
 def err_ifunc_resolver_return : Error<
   "ifunc resolver function must return a pointer">;
+def err_alias_between_function_and_variable : Error<
+  "cannot alias a %select{function|variable}0 with a %select{variable|function}0">;
+def note_aliasee_declaration: Note<"aliasee is declared here">;
+def warn_alias_type_mismatch : Warning<
+  "alias and aliasee have different types %0 and %1">,
+  InGroup>;
 
 def warn_atomic_op_misaligned : Warning<
   "misaligned atomic operation may incur "
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 72a1f771962b1..c0d9bc933ceee 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -830,6 +830,72 @@ void CodeGenModule::checkAliases() {
       continue;
     }
 
+    if (!IsIFunc) {
+      GlobalDecl AliaseeGD;
+      if (!lookupRepresentativeDecl(GV->getName(), AliaseeGD) ||
+          !isa(AliaseeGD.getDecl())) {
+        Diags.Report(Location, diag::err_alias_to_undefined)
+            << IsIFunc << IsIFunc;
+        Error = true;
+        continue;
+      }
+
+      bool AliasIsFuncDecl = isa(D);
+      bool AliaseeIsFunc = isa(GV);
+      // Function declarations can only alias functions (including IFUNCs).
+      // Similarly, variable declarations can only alias variables.
+      if (AliasIsFuncDecl != AliaseeIsFunc) {
+        Diags.Report(Location, diag::err_alias_between_function_and_variable)
+            << AliasIsFuncDecl;
+        Diags.Report(AliaseeGD.getDecl()->getLocation(),
+                     diag::note_aliasee_declaration);
+        Error = true;
+        continue;
+      }
+
+      // Only report functions.
+      // Type mismatches for variables can be intentional.
+      if (AliasIsFuncDecl && AliaseeIsFunc) {
+        QualType AliasTy = D->getType();
+        QualType AliaseeTy = cast(AliaseeGD.getDecl())->getType();
+        auto shouldReportTypeMismatch = [&]() {
+          const auto *AliasFTy =
+              AliasTy.getCanonicalType()->getAs();
+          const auto *AliaseeFTy =
+              AliaseeTy.getCanonicalType()->getAs();
+          assert(AliasFTy && AliaseeFTy);
+          if (!Context.typesAreCompatible(AliasFTy->getReturnType(),
+                                          AliaseeFTy->getReturnType()))
+            return true;
+          const auto *AliasFPTy = dyn_cast(AliasFTy);
+          const auto *AliaseeFPTy = dyn_cast(AliaseeFTy);
+          // Report variadic vs no-prototype.
+          if ((AliasFPTy && AliasFPTy->isVariadic() && !AliaseeFPTy) ||
+              (AliaseeFPTy && AliaseeFPTy->isVariadic() && !AliasFPTy))
+            return true;
+          // Do not report aliases with unspecified parameter lists.
+          if (!AliasFPTy || !AliaseeFPTy)
+            return false;
+          // Report if the parameter lists are different. Any other mismatches,
+          // such as in exception specifications, are ignored.
+          if (AliasFPTy->getNumParams() != AliaseeFPTy->getNumParams() ||
+              AliasFPTy->isVariadic() != AliaseeFPTy->isVariadic())
+            return true;
+          for (unsigned i = 0; i < AliasFPTy->getNumParams(); ++i)
+            if (!Context.typesAreCompatible(AliasFPTy->getParamType(i),
+                                            AliaseeFPTy->getParamType(i)))
+              return true;
+          return false;
+        };
+        if (shouldReportTypeMismatch()) {
+          Diags.Report(Location, diag::warn_alias_type_mismatch)
+              << AliasTy << AliaseeTy;
+          Diags.Report(AliaseeGD.getDecl()->getLocation(),
+                       diag::note_aliasee_declaration);
+        }
+      }
+    }
+
     if (getContext().getTargetInfo().getTriple().isOSAIX())
       if (const llvm::GlobalVariable *GVar =
               dyn_cast(GV))
diff --git a/clang/test/CodeGen/alias.c b/clang/test/CodeGen/alias.c
index 9403c55beae0b..f4bc9668e343c 100644
--- a/clang/test/CodeGen/alias.c
+++ b/clang/test/CodeGen/alias.c
@@ -59,7 +59,7 @@ extern void f1(void) __attribute((alias("f0")));
 static inline int foo1() { return 0; }
 // CHECKBASIC-LABEL: define internal i32 @foo1()
 int foo() __attribute__((alias("foo1")));
-int bar() __attribute__((alias("bar1")));
+extern int bar __attribute__((alias("bar1")));
 
 extern int test6();
 void test7() { test6(); }  // test6 is emitted as extern.
diff --git a/clang/test/Sema/attr-alias-elf.c b/clang/test/Sema/attr-alias-elf.c
index d2674d1db0312..e2d4d41f459ff 100644
--- a/clang/test/Sema/attr-alias-elf.c
+++ b/clang/test/Sema/attr-alias-elf.c
@@ -71,3 +71,58 @@ void test4_foo() __attribute__((alias("test4_bar")));
 
 int test5_bar = 0;
 extern struct incomplete_type test5_foo __attribute__((alias("test5_bar")));
+
+int test6 = 0;
+// expected-note@-1 {{aliasee is declared here}}
+void test6_alias() __attribute__((alias("test6")));
+// expected-error@-1 {{cannot alias a variable with a function}}
+
+extern int test7_alias __attribute__((alias("test7")));
+// expected-error@-1 {{cannot alias a function with a variable}}
+int test7(int x) { return x * 2; }
+// expected-note@-1 {{aliasee is declared here}}
+
+void *test8_ifunc() { return 0; }
+void test8(void) __attribute__((ifunc("test8_ifunc")));
+// expected-note@-1 {{aliasee is declared here}}
+extern int test8_alias __attribute__((alias("test8")));
+// expected-error@-1 {{cannot alias a function with a variable}}
+
+void test9() {}
+// expected-note@-1 {{aliasee is declared here}}
+int test9_alias() __attribute__((alias("test9")));
+// expected-warning@-1 {{alias and aliasee have different types 'int ()' and 'void ()'}}
+
+// No warning for an alias with unspecified parameters if the return types match.
+int test10(int x, int y) { return x + y; }
+int test10_alias() __attribute__((alias("test10")));
+
+// No warning for an alias target with unspecified parameters if the return types match.
+int test11() { return 7; }
+int test11_alias(int x) __attribute__((alias("test11")));
+
+int test12(int x, int y) { return x + y; }
+// expected-note@-1 {{aliasee is declared here}}
+int test12_alias(int x, ...) __attribute__((alias("test12")));
+// expected-warning@-1 {{alias and aliasee have different types 'int (int, ...)' and 'int (int, int)'}}
+
+// No warning when using typedef equivalents.
+typedef int Integer;
+Integer test13(int x) { return x; }
+int test13_alias(Integer) __attribute__((alias("test13")));
+
+// Compiler-generated variables are not valid alias targets.
+char *test14 = "asdf";
+extern char test14_alias[5] __attribute__((alias(".str")));
+// expected-error@-1 {{alias must point to a defined variable or function}}
+
+// Unprototyped functions should not alias variadic functions and vice versa.
+int test15() { return 9; }
+// expected-note@-1 {{aliasee is declared here}}
+int test15_alias(int x, ...) __attribute__((alias("test15")));
+// expected-warning@-1 {{alias and aliasee have different types 'int (int, ...)' and 'int ()'}}
+
+void test16(int x, ...) { }
+// expected-note@-1 {{aliasee is declared here}}
+void test16_alias() __attribute__((alias("test16")));
+// expected-warning@-1 {{alias and aliasee have different types 'void ()' and 'void (int, ...)'}}
diff --git a/clang/test/SemaCXX/attr-alias.cpp b/clang/test/SemaCXX/attr-alias.cpp
new file mode 100644
index 0000000000000..bf76e56f7eca4
--- /dev/null
+++ b/clang/test/SemaCXX/attr-alias.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux -std=c++17 -emit-llvm-only -verify %s
+// expected-no-diagnostics
+
+// Note: this mimics how interceptor functions are defined in the compiler-rt
+// libraries. Despite a declaration in the system header having an exception
+// specification, redeclaring it in the user code does not produce the "missing
+// exception specification" error. Consequently, there should be no warnings
+// about type mismatches for the alias and its aliasee.
+# 1 "attr-alias.h" 1 3
+extern "C" void test1() noexcept(true);
+# 12 "attr-alias.cpp" 2
+extern "C" void test1() __attribute__((alias("test1_aliasee")));
+extern "C" void test1_aliasee() { }
diff --git a/clang/test/SemaObjC/attr-alias.m b/clang/test/SemaObjC/attr-alias.m
new file mode 100644
index 0000000000000..49c92c4436b1b
--- /dev/null
+++ b/clang/test/SemaObjC/attr-alias.m
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fblocks -verify -emit-llvm-only %s
+
+// Compiler-generated functions are not valid alias targets.
+void foo() { void(^myBlock)(void) = ^{ }; }
+void bar() __attribute__((alias("__foo_block_invoke")));
+// expected-error@-1 {{alias must point to a defined variable or function}}
diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp
index b060e5c56edbe..9bb9b07037921 100644
--- a/compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -55,9 +55,14 @@ using namespace __dfsan;
 #define DECLARE_WEAK_INTERCEPTOR_HOOK(f, ...) \
 SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void f(__VA_ARGS__);
 
+#define PRAGMA(x) _Pragma(#x)
 #define WRAPPER_ALIAS(fun, real)                                          \
+  PRAGMA(clang diagnostic push)                                           \
+  PRAGMA(clang diagnostic ignored "-Wunknown-warning-option")             \
+  PRAGMA(clang diagnostic ignored "-Wattribute-alias")                    \
   SANITIZER_INTERFACE_ATTRIBUTE void __dfsw_##fun() ALIAS(__dfsw_##real); \
-  SANITIZER_INTERFACE_ATTRIBUTE void __dfso_##fun() ALIAS(__dfso_##real);
+  SANITIZER_INTERFACE_ATTRIBUTE void __dfso_##fun() ALIAS(__dfso_##real); \
+  PRAGMA(clang diagnostic pop)
 
 // Async-safe, non-reentrant spin lock.
 namespace {

From 22a056e85d79930f510b50464dfcc1c0980d72de Mon Sep 17 00:00:00 2001
From: Jan Svoboda 
Date: Tue, 12 May 2026 12:25:42 -0700
Subject: [PATCH 498/538] [clang][deps] Use single controller instance for
 by-name scans (#197266)

This avoids creating redundant instances of `DependencyActionController`
in the by-name scanning APIs. NFCI
---
 .../clang/Tooling/DependencyScanningTool.h       | 12 ++++++------
 clang/lib/Tooling/DependencyScanningTool.cpp     | 16 +++++-----------
 clang/tools/clang-scan-deps/ClangScanDeps.cpp    |  8 +++++---
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/Tooling/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanningTool.h
index c368e93fa6286..e06fd616a31b9 100644
--- a/clang/include/clang/Tooling/DependencyScanningTool.h
+++ b/clang/include/clang/Tooling/DependencyScanningTool.h
@@ -110,7 +110,7 @@ class DependencyScanningTool {
   llvm::Expected getModuleDependencies(
       StringRef ModuleName, ArrayRef CommandLine, StringRef CWD,
       const llvm::DenseSet &AlreadySeen,
-      dependencies::LookupModuleOutputCallback LookupModuleOutput);
+      dependencies::DependencyActionController &Controller);
 
   llvm::vfs::FileSystem &getWorkerVFS() const { return Worker.getVFS(); }
 
@@ -201,10 +201,10 @@ class CompilerInstanceWithContext {
   /// @param CWD The current working directory used during the scan.
   /// @param CommandLine The commandline used for the scan.
   /// @return Error if the initializaiton fails.
-  static llvm::Expected initializeOrError(
-      DependencyScanningTool &Tool, StringRef CWD,
-      ArrayRef CommandLine,
-      dependencies::LookupModuleOutputCallback LookupModuleOutput);
+  static llvm::Expected
+  initializeOrError(DependencyScanningTool &Tool, StringRef CWD,
+                    ArrayRef CommandLine,
+                    dependencies::DependencyActionController &Controller);
 
   bool
   computeDependencies(StringRef ModuleName,
@@ -229,7 +229,7 @@ class CompilerInstanceWithContext {
   computeDependenciesByNameOrError(
       StringRef ModuleName,
       const llvm::DenseSet &AlreadySeen,
-      dependencies::LookupModuleOutputCallback LookupModuleOutput);
+      dependencies::DependencyActionController &Controller);
 
   // MaxNumOfQueries is the upper limit of the number of names the by-name
   // scanning API (computeDependencies) can support after a
diff --git a/clang/lib/Tooling/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanningTool.cpp
index d9d368c6ffde3..45e517f95aa18 100644
--- a/clang/lib/Tooling/DependencyScanningTool.cpp
+++ b/clang/lib/Tooling/DependencyScanningTool.cpp
@@ -357,14 +357,14 @@ llvm::Expected
 DependencyScanningTool::getModuleDependencies(
     StringRef ModuleName, ArrayRef CommandLine, StringRef CWD,
     const llvm::DenseSet &AlreadySeen,
-    LookupModuleOutputCallback LookupModuleOutput) {
+    DependencyActionController &Controller) {
   auto MaybeCIWithContext = CompilerInstanceWithContext::initializeOrError(
-      *this, CWD, CommandLine, LookupModuleOutput);
+      *this, CWD, CommandLine, Controller);
   if (auto Error = MaybeCIWithContext.takeError())
     return Error;
 
   return MaybeCIWithContext->computeDependenciesByNameOrError(
-      ModuleName, AlreadySeen, LookupModuleOutput);
+      ModuleName, AlreadySeen, Controller);
 }
 
 static std::optional> getFirstCC1CommandLine(
@@ -431,12 +431,7 @@ CompilerInstanceWithContext::initializeFromCommandline(
 llvm::Expected
 CompilerInstanceWithContext::initializeOrError(
     DependencyScanningTool &Tool, StringRef CWD,
-    ArrayRef CommandLine,
-    LookupModuleOutputCallback LookupModuleOutput) {
-  // It might seem wasteful to create fresh controller just for initializing the
-  // compiler instance, but repeated calls to computeDependenciesByNameOrError()
-  // do that as well, so this gets amortized.
-  CallbackActionController Controller(LookupModuleOutput);
+    ArrayRef CommandLine, DependencyActionController &Controller) {
   auto DiagPrinterWithOS =
       std::make_unique(CommandLine);
 
@@ -452,9 +447,8 @@ CompilerInstanceWithContext::initializeOrError(
 llvm::Expected
 CompilerInstanceWithContext::computeDependenciesByNameOrError(
     StringRef ModuleName, const llvm::DenseSet &AlreadySeen,
-    LookupModuleOutputCallback LookupModuleOutput) {
+    DependencyActionController &Controller) {
   FullDependencyConsumer Consumer(AlreadySeen);
-  CallbackActionController Controller(LookupModuleOutput);
   // We need to clear the DiagnosticOutput so that each by-name lookup
   // has a clean diagnostics buffer.
   DiagPrinterWithOS->DiagnosticOutput.clear();
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 3417dc2a07d39..969428a6fe621 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -1102,16 +1102,18 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
         SmallVector Names;
         ModuleNameRef.split(Names, ',');
 
+        CallbackActionController Controller(LookupOutput);
+
         if (Names.size() == 1) {
           auto MaybeModuleDepsGraph = WorkerTool.getModuleDependencies(
               Names[0], Input->CommandLine, CWD, AlreadySeenModules,
-              LookupOutput);
+              Controller);
           if (handleModuleResult(Names[0], MaybeModuleDepsGraph, *FD,
                                  LocalIndex, DependencyOS, Errs))
             HadErrors = true;
         } else {
           auto CIWithCtx = CompilerInstanceWithContext::initializeOrError(
-              WorkerTool, CWD, Input->CommandLine, LookupOutput);
+              WorkerTool, CWD, Input->CommandLine, Controller);
           if (llvm::Error Err = CIWithCtx.takeError()) {
             handleErrorWithInfoString(
                 "Compiler instance with context setup error", std::move(Err),
@@ -1123,7 +1125,7 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
           for (auto N : Names) {
             auto MaybeModuleDepsGraph =
                 CIWithCtx->computeDependenciesByNameOrError(
-                    N, AlreadySeenModules, LookupOutput);
+                    N, AlreadySeenModules, Controller);
             if (handleModuleResult(N, MaybeModuleDepsGraph, *FD, LocalIndex,
                                    DependencyOS, Errs)) {
               HadErrors = true;

From db20a981b9e7149e2ed2dc015c09020cbc2ab960 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov 
Date: Tue, 12 May 2026 12:28:53 -0700
Subject: [PATCH 499/538] [libc] Implement pthread_getthreadid_np and
 pthread_getunique_np. (#197027)

This is an alternative to
https://github.com/llvm/llvm-project/pull/195202 which uses @petrhosek
's suggestion to use an existing `pthread_getthreadid_np()` instead.

We define two functions:
* pthread_getthreadid_np - to return a thread-id for the current thread.
**NOTE:** We're using the IBM variant of this function
(https://www.ibm.com/docs/en/i/7.6.0?topic=ssw_ibm_i_76/apis/users_22.html)
instead of FreeBSD one
(https://man.freebsd.org/cgi/man.cgi?pthread_getthreadid_np), so that we
can return a more-opaque integral type (in our case - `uintptr_t` rather
than `int`).
* pthread_getunique_np - it's an even less standard one. We're also using
the IBM variant here
(https://www.ibm.com/docs/en/i/7.6.0?topic=ssw_ibm_i_76/apis/users_23.html).
This one is needed because we need to have the capability to convert
`pthread_t` into a numeric ID.
---
 libc/config/linux/aarch64/entrypoints.txt     |  2 ++
 libc/config/linux/riscv/entrypoints.txt       |  2 ++
 libc/config/linux/x86_64/entrypoints.txt      |  2 ++
 libc/include/CMakeLists.txt                   |  1 +
 libc/include/llvm-libc-types/CMakeLists.txt   |  1 +
 .../include/llvm-libc-types/pthread_id_np_t.h | 21 +++++++++++
 libc/include/pthread.yaml                     | 16 +++++++++
 libc/src/pthread/CMakeLists.txt               | 23 ++++++++++++
 libc/src/pthread/pthread_getthreadid_np.cpp   | 27 ++++++++++++++
 libc/src/pthread/pthread_getthreadid_np.h     | 26 ++++++++++++++
 libc/src/pthread/pthread_getunique_np.cpp     | 35 +++++++++++++++++++
 libc/src/pthread/pthread_getunique_np.h       | 28 +++++++++++++++
 .../integration/src/pthread/CMakeLists.txt    |  3 ++
 .../src/pthread/pthread_create_test.cpp       |  4 +++
 .../src/pthread/pthread_equal_test.cpp        |  9 +++++
 15 files changed, 200 insertions(+)
 create mode 100644 libc/include/llvm-libc-types/pthread_id_np_t.h
 create mode 100644 libc/src/pthread/pthread_getthreadid_np.cpp
 create mode 100644 libc/src/pthread/pthread_getthreadid_np.h
 create mode 100644 libc/src/pthread/pthread_getunique_np.cpp
 create mode 100644 libc/src/pthread/pthread_getunique_np.h

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 0c8fb3c8dbc15..e62bc67e2d5ca 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -1038,6 +1038,8 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_exit
     libc.src.pthread.pthread_getname_np
     libc.src.pthread.pthread_getspecific
+    libc.src.pthread.pthread_getthreadid_np
+    libc.src.pthread.pthread_getunique_np
     libc.src.pthread.pthread_join
     libc.src.pthread.pthread_key_create
     libc.src.pthread.pthread_key_delete
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 99a5c820159f8..d1c52dffdb6e7 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -1171,6 +1171,8 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_exit
     libc.src.pthread.pthread_getname_np
     libc.src.pthread.pthread_getspecific
+    libc.src.pthread.pthread_getthreadid_np
+    libc.src.pthread.pthread_getunique_np
     libc.src.pthread.pthread_join
     libc.src.pthread.pthread_key_create
     libc.src.pthread.pthread_key_delete
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 45fdde6454880..73b4b3fcd191f 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1228,6 +1228,8 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.pthread.pthread_exit
     libc.src.pthread.pthread_getname_np
     libc.src.pthread.pthread_getspecific
+    libc.src.pthread.pthread_getthreadid_np
+    libc.src.pthread.pthread_getunique_np
     libc.src.pthread.pthread_join
     libc.src.pthread.pthread_key_create
     libc.src.pthread.pthread_key_delete
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 5f836e0f480e1..90055b41a37cf 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -473,6 +473,7 @@ add_header_macro(
     .llvm-libc-types.pthread_rwlockattr_t
     .llvm-libc-types.pthread_spinlock_t
     .llvm-libc-types.pthread_t
+    .llvm-libc-types.pthread_id_np_t
     .llvm-libc-types.struct_timespec
     .llvm_libc_common_h
 )
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt
index aad6650d90428..207834072ede9 100644
--- a/libc/include/llvm-libc-types/CMakeLists.txt
+++ b/libc/include/llvm-libc-types/CMakeLists.txt
@@ -85,6 +85,7 @@ add_header(pthread_rwlock_t HDR pthread_rwlock_t.h DEPENDS .__futex_word .pid_t)
 add_header(pthread_rwlockattr_t HDR pthread_rwlockattr_t.h)
 add_header(pthread_spinlock_t HDR pthread_spinlock_t.h DEPENDS .pid_t)
 add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type)
+add_header(pthread_id_np_t HDR pthread_id_np_t.h DEPENDS libc.include.llvm-libc-macros.stdint_macros)
 add_header(rlim_t HDR rlim_t.h)
 if(LIBC_TYPES_TIME_T_IS_32_BIT)
   add_header(time_t HDR time_t_32.h DEST_HDR time_t.h)
diff --git a/libc/include/llvm-libc-types/pthread_id_np_t.h b/libc/include/llvm-libc-types/pthread_id_np_t.h
new file mode 100644
index 0000000000000..92c24b0d35452
--- /dev/null
+++ b/libc/include/llvm-libc-types/pthread_id_np_t.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Definition of pthread_id_np_t type.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_PTHREAD_ID_NP_T_H
+#define LLVM_LIBC_TYPES_PTHREAD_ID_NP_T_H
+
+#include "../llvm-libc-macros/stdint-macros.h"
+
+typedef uintptr_t pthread_id_np_t;
+
+#endif // LLVM_LIBC_TYPES_PTHREAD_ID_NP_T_H
diff --git a/libc/include/pthread.yaml b/libc/include/pthread.yaml
index bb735780ab555..2d5f5f6d1b659 100644
--- a/libc/include/pthread.yaml
+++ b/libc/include/pthread.yaml
@@ -64,6 +64,9 @@ types:
   - type_name: pthread_rwlockattr_t
   - type_name: pthread_attr_t
   - type_name: pthread_spinlock_t
+  - type_name: pthread_id_np_t
+    standards:
+      - llvm_libc_ext
 functions:
   - name: pthread_atfork
     return_type: int
@@ -444,3 +447,16 @@ functions:
     return_type: int
     arguments:
       - type: pthread_spinlock_t *
+  - name: pthread_getthreadid_np
+    standards:
+      - llvm_libc_ext
+    return_type: pthread_id_np_t
+    arguments:
+      - type: void
+  - name: pthread_getunique_np
+    standards:
+      - llvm_libc_ext
+    return_type: int
+    arguments:
+      - type: const pthread_t *__restrict
+      - type: pthread_id_np_t *__restrict
diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt
index 9b08c4c7394ff..bd344800ddc54 100644
--- a/libc/src/pthread/CMakeLists.txt
+++ b/libc/src/pthread/CMakeLists.txt
@@ -591,6 +591,29 @@ add_entrypoint_object(
     libc.src.__support.threads.thread
 )
 
+add_entrypoint_object(
+  pthread_getthreadid_np
+  SRCS
+    pthread_getthreadid_np.cpp
+  HDRS
+    pthread_getthreadid_np.h
+  DEPENDS
+    libc.include.llvm-libc-types.pthread_id_np_t
+    libc.src.__support.threads.thread
+)
+
+add_entrypoint_object(
+  pthread_getunique_np
+  SRCS
+    pthread_getunique_np.cpp
+  HDRS
+    pthread_getunique_np.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.include.llvm-libc-types.pthread_id_np_t
+    libc.include.llvm-libc-types.pthread_t
+)
+
 add_entrypoint_object(
   pthread_key_create
   SRCS
diff --git a/libc/src/pthread/pthread_getthreadid_np.cpp b/libc/src/pthread/pthread_getthreadid_np.cpp
new file mode 100644
index 0000000000000..4fa9b7cc1166f
--- /dev/null
+++ b/libc/src/pthread/pthread_getthreadid_np.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Implementation of the pthread_getthreadid_np function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "pthread_getthreadid_np.h"
+
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/thread.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(pthread_id_np_t, pthread_getthreadid_np, ()) {
+  // We assume that unique thread ID is an integer value of a pointer to TCB.
+  return reinterpret_cast(self.attrib);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_getthreadid_np.h b/libc/src/pthread/pthread_getthreadid_np.h
new file mode 100644
index 0000000000000..ae2fa4150e28e
--- /dev/null
+++ b/libc/src/pthread/pthread_getthreadid_np.h
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Implementation header for pthread_getthreadid_np.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_GETTHREADID_NP_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_GETTHREADID_NP_H
+
+#include "include/llvm-libc-types/pthread_id_np_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+pthread_id_np_t pthread_getthreadid_np();
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_GETTHREADID_NP_H
diff --git a/libc/src/pthread/pthread_getunique_np.cpp b/libc/src/pthread/pthread_getunique_np.cpp
new file mode 100644
index 0000000000000..15aed2bd96b65
--- /dev/null
+++ b/libc/src/pthread/pthread_getunique_np.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Implementation of the pthread_getunique_np function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "pthread_getunique_np.h"
+
+#include "hdr/errno_macros.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, pthread_getunique_np,
+                   (const pthread_t *__restrict thread,
+                    pthread_id_np_t *__restrict id)) {
+  if (id == nullptr) {
+    return EINVAL;
+  }
+  // We assume that unique thread ID is an integer value of a pointer to TCB.
+  *id = (thread == nullptr)
+            ? 0
+            : reinterpret_cast(thread->__attrib);
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_getunique_np.h b/libc/src/pthread/pthread_getunique_np.h
new file mode 100644
index 0000000000000..3770c6a388144
--- /dev/null
+++ b/libc/src/pthread/pthread_getunique_np.h
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Implementation header for pthread_getunique_np function.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_GETUNIQUE_NP_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_GETUNIQUE_NP_H
+
+#include "include/llvm-libc-types/pthread_id_np_t.h"
+#include "include/llvm-libc-types/pthread_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_getunique_np(const pthread_t *__restrict thread,
+                         pthread_id_np_t *__restrict id);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_GETUNIQUE_NP_H
diff --git a/libc/test/integration/src/pthread/CMakeLists.txt b/libc/test/integration/src/pthread/CMakeLists.txt
index 98e5dbebbfcd1..250cc9114f13d 100644
--- a/libc/test/integration/src/pthread/CMakeLists.txt
+++ b/libc/test/integration/src/pthread/CMakeLists.txt
@@ -165,6 +165,8 @@ add_integration_test(
     libc.src.pthread.pthread_mutex_unlock
     libc.src.pthread.pthread_create
     libc.src.pthread.pthread_equal
+    libc.src.pthread.pthread_getthreadid_np
+    libc.src.pthread.pthread_getunique_np
     libc.src.pthread.pthread_join
     libc.src.pthread.pthread_self
 )
@@ -278,6 +280,7 @@ add_integration_test(
     libc.src.pthread.pthread_attr_setstacksize
     libc.src.pthread.pthread_attr_init
     libc.src.pthread.pthread_attr_destroy
+    libc.src.pthread.pthread_getunique_np
     libc.src.pthread.pthread_self
     libc.src.sys.mman.mmap
     libc.src.sys.mman.munmap
diff --git a/libc/test/integration/src/pthread/pthread_create_test.cpp b/libc/test/integration/src/pthread/pthread_create_test.cpp
index a067a829e039b..dd8d3ef1c8e35 100644
--- a/libc/test/integration/src/pthread/pthread_create_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_create_test.cpp
@@ -18,6 +18,7 @@
 #include "src/pthread/pthread_attr_setstack.h"
 #include "src/pthread/pthread_attr_setstacksize.h"
 #include "src/pthread/pthread_create.h"
+#include "src/pthread/pthread_getunique_np.h"
 #include "src/pthread/pthread_join.h"
 #include "src/pthread/pthread_self.h"
 
@@ -180,6 +181,9 @@ static void run_success_config(int detachstate, size_t guardsize,
                                            reinterpret_cast(th_arg)),
             0);
   ASSERT_ERRNO_SUCCESS();
+  pthread_id_np_t id;
+  ASSERT_EQ(LIBC_NAMESPACE::pthread_getunique_np(&tid, &id), 0);
+  ASSERT_NE(id, 0);
 
   if (detachstate == PTHREAD_CREATE_JOINABLE) {
     void *th_ret;
diff --git a/libc/test/integration/src/pthread/pthread_equal_test.cpp b/libc/test/integration/src/pthread/pthread_equal_test.cpp
index 01569798537e3..435065e92d698 100644
--- a/libc/test/integration/src/pthread/pthread_equal_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_equal_test.cpp
@@ -9,6 +9,8 @@
 #include "hdr/stdint_proxy.h" // uintptr_t
 #include "src/pthread/pthread_create.h"
 #include "src/pthread/pthread_equal.h"
+#include "src/pthread/pthread_getthreadid_np.h"
+#include "src/pthread/pthread_getunique_np.h"
 #include "src/pthread/pthread_join.h"
 #include "src/pthread/pthread_mutex_destroy.h"
 #include "src/pthread/pthread_mutex_init.h"
@@ -20,6 +22,7 @@
 #include <pthread.h>
 
 pthread_t child_thread;
+pthread_id_np_t child_self_thread_id;
 pthread_mutex_t mutex;
 
 static void *child_func(void *arg) {
@@ -27,6 +30,7 @@ static void *child_func(void *arg) {
   int *ret = reinterpret_cast<int *>(arg);
   auto self = LIBC_NAMESPACE::pthread_self();
   *ret = LIBC_NAMESPACE::pthread_equal(child_thread, self);
+  child_self_thread_id = LIBC_NAMESPACE::pthread_getthreadid_np();
   LIBC_NAMESPACE::pthread_mutex_unlock(&mutex);
   return nullptr;
 }
@@ -38,6 +42,7 @@ TEST_MAIN() {
   ASSERT_EQ(LIBC_NAMESPACE::pthread_mutex_lock(&mutex), 0);
 
   auto main_thread = LIBC_NAMESPACE::pthread_self();
+  pthread_id_np_t main_thread_id = LIBC_NAMESPACE::pthread_getthreadid_np();
 
   // The idea here is that, we start a child thread which will immediately
   // wait on |mutex|. The main thread will update the global |child_thread| var
@@ -46,10 +51,13 @@ TEST_MAIN() {
   // comparison is returned in the thread arg.
   int result = 0;
   pthread_t th;
+  pthread_id_np_t th_id;
   ASSERT_EQ(LIBC_NAMESPACE::pthread_create(&th, nullptr, child_func, &result),
             0);
   // This new thread should of course not be equal to the main thread.
   ASSERT_EQ(LIBC_NAMESPACE::pthread_equal(th, main_thread), 0);
+  ASSERT_EQ(LIBC_NAMESPACE::pthread_getunique_np(&th, &th_id), 0);
+  ASSERT_NE(th_id, main_thread_id);
 
   // Set the |child_thread| global var and unlock to allow the child to perform
   // the comparison.
@@ -62,6 +70,7 @@ TEST_MAIN() {
   // The child thread should see that pthread_self return value is the same as
   // |child_thread|.
   ASSERT_NE(result, 0);
+  ASSERT_EQ(th_id, child_self_thread_id);
 
   LIBC_NAMESPACE::pthread_mutex_destroy(&mutex);
   return 0;

From 0f37c48d459d6bba3ab6866cceb57539dce002fa Mon Sep 17 00:00:00 2001
From: Aaron <58265908+aaronsms@users.noreply.github.com>
Date: Wed, 13 May 2026 03:31:31 +0800
Subject: [PATCH 500/538] [clang-format] Add AlignConsecutiveEnums (#194154)

Address #52983. Add ability to align enums similar to how bit fields are
done.

---------

Co-authored-by: Aaron Saw Min Sern 
---
 clang/docs/ClangFormatStyleOptions.rst     | 35 +++++++++++
 clang/docs/ReleaseNotes.rst                |  2 +
 clang/include/clang/Format/Format.h        |  6 ++
 clang/lib/Format/Format.cpp                | 16 +++--
 clang/lib/Format/FormatToken.h             |  1 +
 clang/lib/Format/TokenAnnotator.cpp        | 17 ++++--
 clang/lib/Format/TokenAnnotator.h          |  2 +
 clang/lib/Format/WhitespaceManager.cpp     | 13 +++-
 clang/unittests/Format/AlignmentTest.cpp   | 23 ++++++-
 clang/unittests/Format/ConfigParseTest.cpp | 71 +++++++++++-----------
 10 files changed, 139 insertions(+), 47 deletions(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 61f27bcf9dbbc..f852f76f5038c 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -402,6 +402,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
@@ -560,6 +565,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
@@ -718,6 +728,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
@@ -877,6 +892,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
@@ -1155,6 +1175,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
@@ -1311,6 +1336,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
@@ -1467,6 +1497,11 @@ the configuration (without a prefix: ``Auto``).
       int     *p;
       int (*f)();
 
+  * ``bool EnumAssignments`` Only for ``AlignConsecutiveAssignments``.
+    Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    setting this to ``true`` forces alignment for enum assignments only.
+    If ``Enabled`` is ``true``, enum assignments are always aligned.
+
   * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     operators are left-padded to the same length as long ones in order to
     put all assignment operators to the right of the left hand side.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index db48f0860896f..4b0eb5b9d8505 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -763,6 +763,8 @@ clang-format
 - Add ``AllowShortRecordOnASingleLine`` option and set it to ``EmptyAndAttached`` for LLVM style.
 - Add ``BreakFunctionDeclarationParameters`` option to always break before function
   declaration parameters.
+- Add ``EnumAssignments`` option to ``AlignConsecutiveAssignments`` for aligning
+  enum assignments without affecting other assignments.
 
 libclang
 --------
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 0e883837ac0e9..eca3cc44c41b6 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -236,6 +236,11 @@ struct FormatStyle {
     ///   int (*f)();
     /// \endcode
     bool AlignFunctionPointers;
+    /// Only for ``AlignConsecutiveAssignments``.
+    /// Whether enum assignments are aligned. If ``Enabled`` is ``false``,
+    /// setting this to ``true`` forces alignment for enum assignments only.
+    /// If ``Enabled`` is ``true``, enum assignments are always aligned.
+    bool EnumAssignments;
     /// Only for ``AlignConsecutiveAssignments``.  Whether short assignment
     /// operators are left-padded to the same length as long ones in order to
     /// put all assignment operators to the right of the left hand side.
@@ -261,6 +266,7 @@ struct FormatStyle {
              AlignCompound == R.AlignCompound &&
              AlignFunctionDeclarations == R.AlignFunctionDeclarations &&
              AlignFunctionPointers == R.AlignFunctionPointers &&
+             EnumAssignments == R.EnumAssignments &&
              PadOperators == R.PadOperators;
     }
     bool operator!=(const AlignConsecutiveStyle &R) const {
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 74b31810843fc..ec0ad98f37753 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -63,25 +63,29 @@ template <> struct MappingTraits<FormatStyle::AlignConsecutiveStyle> {
                     {/*Enabled=*/true, /*AcrossEmptyLines=*/false,
                      /*AcrossComments=*/false, /*AlignCompound=*/false,
                      /*AlignFunctionDeclarations=*/true,
-                     /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));
+                     /*AlignFunctionPointers=*/false,
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));
     IO.enumCase(Value, "AcrossEmptyLines",
                 FormatStyle::AlignConsecutiveStyle(
                     {/*Enabled=*/true, /*AcrossEmptyLines=*/true,
                      /*AcrossComments=*/false, /*AlignCompound=*/false,
                      /*AlignFunctionDeclarations=*/true,
-                     /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));
+                     /*AlignFunctionPointers=*/false,
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));
     IO.enumCase(Value, "AcrossComments",
                 FormatStyle::AlignConsecutiveStyle(
                     {/*Enabled=*/true, /*AcrossEmptyLines=*/false,
                      /*AcrossComments=*/true, /*AlignCompound=*/false,
                      /*AlignFunctionDeclarations=*/true,
-                     /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));
+                     /*AlignFunctionPointers=*/false,
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));
     IO.enumCase(Value, "AcrossEmptyLinesAndComments",
                 FormatStyle::AlignConsecutiveStyle(
                     {/*Enabled=*/true, /*AcrossEmptyLines=*/true,
                      /*AcrossComments=*/true, /*AlignCompound=*/false,
                      /*AlignFunctionDeclarations=*/true,
-                     /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));
+                     /*AlignFunctionPointers=*/false,
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));
 
     // For backward compatibility.
     IO.enumCase(Value, "true",
@@ -89,7 +93,8 @@ template <> struct MappingTraits<FormatStyle::AlignConsecutiveStyle> {
                     {/*Enabled=*/true, /*AcrossEmptyLines=*/false,
                      /*AcrossComments=*/false, /*AlignCompound=*/false,
                      /*AlignFunctionDeclarations=*/true,
-                     /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));
+                     /*AlignFunctionPointers=*/false,
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));
     IO.enumCase(Value, "false", FormatStyle::AlignConsecutiveStyle{});
   }
 
@@ -101,6 +106,7 @@ template <> struct MappingTraits<FormatStyle::AlignConsecutiveStyle> {
     IO.mapOptional("AlignFunctionDeclarations",
                    Value.AlignFunctionDeclarations);
     IO.mapOptional("AlignFunctionPointers", Value.AlignFunctionPointers);
+    IO.mapOptional("EnumAssignments", Value.EnumAssignments);
     IO.mapOptional("PadOperators", Value.PadOperators);
   }
 };
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index 68d94b087136d..1d8f0f1cfe412 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -76,6 +76,7 @@ namespace format {
   TYPE(DoWhile)                                                                \
   TYPE(ElseLBrace)                                                             \
   TYPE(ElseRBrace)                                                             \
+  TYPE(EnumEqual)                                                              \
   TYPE(EnumLBrace)                                                             \
   TYPE(EnumRBrace)                                                             \
   TYPE(EnumUnderlyingTypeColon)                                                \
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 640f03a4ac130..afdb59617fb2a 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -139,6 +139,8 @@ class AnnotatingParser {
     case TT_StructLBrace:
     case TT_UnionLBrace:
       return ST_Class;
+    case TT_EnumLBrace:
+      return ST_Enum;
     case TT_CompoundRequirementLBrace:
       return ST_CompoundRequirement;
     default:
@@ -1212,8 +1214,8 @@ class AnnotatingParser {
 
     unsigned CommaCount = 0;
     while (CurrentToken) {
+      assert(!Scopes.empty());
       if (CurrentToken->is(tok::r_brace)) {
-        assert(!Scopes.empty());
         assert(Scopes.back() == getScopeType(OpeningBrace));
         Scopes.pop_back();
         assert(OpeningBrace.Optional == CurrentToken->Optional);
@@ -1239,6 +1241,7 @@ class AnnotatingParser {
              (!Contexts.back().ColonIsDictLiteral || !IsCpp)) ||
             Style.isProto()) {
           OpeningBrace.setType(TT_DictLiteral);
+          Scopes.back() = getScopeType(OpeningBrace);
           if (Previous->Tok.getIdentifierInfo() ||
               Previous->is(tok::string_literal)) {
             Previous->setType(TT_SelectorName);
@@ -1247,16 +1250,20 @@ class AnnotatingParser {
         if (CurrentToken->is(tok::colon) && OpeningBrace.is(TT_Unknown) &&
             !Style.isTableGen()) {
           OpeningBrace.setType(TT_DictLiteral);
+          Scopes.back() = getScopeType(OpeningBrace);
         } else if (Style.isJavaScript()) {
           OpeningBrace.overwriteFixedType(TT_DictLiteral);
+          Scopes.back() = getScopeType(OpeningBrace);
         }
       }
       bool IsBracedListComma = false;
       if (CurrentToken->is(tok::comma)) {
-        if (Style.isJavaScript())
+        if (Style.isJavaScript()) {
           OpeningBrace.overwriteFixedType(TT_DictLiteral);
-        else
+          Scopes.back() = getScopeType(OpeningBrace);
+        } else {
           IsBracedListComma = OpeningBrace.is(BK_BracedInit);
+        }
         ++CommaCount;
       }
       if (!consumeToken())
@@ -1835,6 +1842,8 @@ class AnnotatingParser {
       // In TableGen, there must be a value after "=";
       if (Style.isTableGen() && !parseTableGenValue())
         return false;
+      if (!Scopes.empty() && Scopes.back() == ST_Enum)
+        Tok->setFinalizedType(TT_EnumEqual);
       break;
     default:
       break;
@@ -5705,7 +5714,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
     return getTokenReferenceAlignment(Right) != FormatStyle::PAS_Left;
   }
   if ((Right.is(TT_BinaryOperator) && Left.isNot(tok::l_paren)) ||
-      (Left.isOneOf(TT_BinaryOperator, TT_ConditionalExpr) &&
+      (Left.isOneOf(TT_BinaryOperator, TT_EnumEqual, TT_ConditionalExpr) &&
        Right.isNot(tok::r_paren))) {
     return true;
   }
diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h
index 597dd890ee990..33c7df9d0f949 100644
--- a/clang/lib/Format/TokenAnnotator.h
+++ b/clang/lib/Format/TokenAnnotator.h
@@ -40,6 +40,8 @@ enum LineType {
 enum ScopeType {
   // Contained in class declaration/definition.
   ST_Class,
+  // Contained in enum declaration/definition.
+  ST_Enum,
   // Contained in compound requirement.
   ST_CompoundRequirement,
   // Contained in other blocks (function, lambda, loop, if/else, child, etc).
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 74aa8a2795150..4a72abbfa9ec3 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -813,8 +813,10 @@ void WhitespaceManager::alignConsecutiveMacros() {
 }
 
 void WhitespaceManager::alignConsecutiveAssignments() {
-  if (!Style.AlignConsecutiveAssignments.Enabled)
+  if (!Style.AlignConsecutiveAssignments.Enabled &&
+      !Style.AlignConsecutiveAssignments.EnumAssignments) {
     return;
+  }
 
   AlignTokens(
       Style,
@@ -827,6 +829,15 @@ void WhitespaceManager::alignConsecutiveAssignments() {
         if (&C != &Changes.back() && (&C + 1)->NewlinesBefore > 0)
           return false;
 
+        // Align enum '=' when EnumAssignments is enabled.
+        if (Style.AlignConsecutiveAssignments.EnumAssignments &&
+            C.Tok->is(TT_EnumEqual)) {
+          return true;
+        }
+
+        if (!Style.AlignConsecutiveAssignments.Enabled)
+          return false;
+
         // Do not align operator= overloads.
         FormatToken *Previous = C.Tok->getPreviousNonComment();
         if (Previous && Previous->is(tok::kw_operator))
diff --git a/clang/unittests/Format/AlignmentTest.cpp b/clang/unittests/Format/AlignmentTest.cpp
index 9421a4c933b9e..fbc0cb4d825ea 100644
--- a/clang/unittests/Format/AlignmentTest.cpp
+++ b/clang/unittests/Format/AlignmentTest.cpp
@@ -1135,6 +1135,25 @@ TEST_F(AlignmentTest, ConsecutiveAssignmentsAcrossEmptyLinesAndComments) {
                Alignment);
 }
 
+TEST_F(AlignmentTest, ConsecutiveEnumAssignments) {
+  FormatStyle Alignment = getLLVMStyle();
+  Alignment.AlignConsecutiveAssignments.EnumAssignments = true;
+  verifyFormat("enum ValueKind {\n"
+               "  VK_Argument   = 1,\n"
+               "  VK_BasicBlock = 2,\n"
+               "  VK_Segment    = 8,\n"
+               "};",
+               Alignment);
+  Alignment.AlignConsecutiveAssignments.Enabled = true;
+  Alignment.AlignConsecutiveAssignments.EnumAssignments = false;
+  verifyFormat("enum ValueKind {\n"
+               "  VK_Argument   = 1,\n"
+               "  VK_BasicBlock = 2,\n"
+               "  VK_Segment    = 8,\n"
+               "};",
+               Alignment);
+}
+
 TEST_F(AlignmentTest, ConsecutiveCompoundAssignments) {
   FormatStyle Alignment = getLLVMStyle();
   Alignment.AlignConsecutiveAssignments.Enabled = true;
@@ -2370,14 +2389,14 @@ TEST_F(AlignmentTest, AlignWithLineBreaks) {
                  /*AcrossComments=*/false, /*AlignCompound=*/false,
                  /*AlignFunctionDeclarations=*/false,
                  /*AlignFunctionPointers=*/false,
-                 /*PadOperators=*/true}));
+                 /*EnumAssignments=*/false, /*PadOperators=*/true}));
   EXPECT_EQ(Style.AlignConsecutiveDeclarations,
             FormatStyle::AlignConsecutiveStyle(
                 {/*Enabled=*/false, /*AcrossEmptyLines=*/false,
                  /*AcrossComments=*/false, /*AlignCompound=*/false,
                  /*AlignFunctionDeclarations=*/true,
                  /*AlignFunctionPointers=*/false,
-                 /*PadOperators=*/false}));
+                 /*EnumAssignments=*/false, /*PadOperators=*/false}));
   verifyFormat("void foo() {\n"
                "  int myVar = 5;\n"
                "  double x = 3.14;\n"
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index ccb9c837d8362..f731922030777 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -339,44 +339,44 @@ TEST(ConfigParseTest, ParsesConfiguration) {
   do {                                                                         \
     Style.FIELD.Enabled = true;                                                \
     CHECK_PARSE(#FIELD ": None", FIELD, FormatStyle::AlignConsecutiveStyle{}); \
-    CHECK_PARSE(                                                               \
-        #FIELD ": Consecutive", FIELD,                                         \
-        FormatStyle::AlignConsecutiveStyle(                                    \
-            {/*Enabled=*/true, /*AcrossEmptyLines=*/false,                     \
-             /*AcrossComments=*/false, /*AlignCompound=*/false,                \
-             /*AlignFunctionDeclarations=*/true,                               \
-             /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));        \
-    CHECK_PARSE(                                                               \
-        #FIELD ": AcrossEmptyLines", FIELD,                                    \
-        FormatStyle::AlignConsecutiveStyle(                                    \
-            {/*Enabled=*/true, /*AcrossEmptyLines=*/true,                      \
-             /*AcrossComments=*/false, /*AlignCompound=*/false,                \
-             /*AlignFunctionDeclarations=*/true,                               \
-             /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));        \
-    CHECK_PARSE(                                                               \
-        #FIELD ": AcrossComments", FIELD,                                      \
-        FormatStyle::AlignConsecutiveStyle(                                    \
-            {/*Enabled=*/true, /*AcrossEmptyLines=*/false,                     \
-             /*AcrossComments=*/true, /*AlignCompound=*/false,                 \
-             /*AlignFunctionDeclarations=*/true,                               \
-             /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));        \
-    CHECK_PARSE(                                                               \
-        #FIELD ": AcrossEmptyLinesAndComments", FIELD,                         \
-        FormatStyle::AlignConsecutiveStyle(                                    \
-            {/*Enabled=*/true, /*AcrossEmptyLines=*/true,                      \
-             /*AcrossComments=*/true, /*AlignCompound=*/false,                 \
-             /*AlignFunctionDeclarations=*/true,                               \
-             /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));        \
+    CHECK_PARSE(#FIELD ": Consecutive", FIELD,                                 \
+                FormatStyle::AlignConsecutiveStyle(                            \
+                    {/*Enabled=*/true, /*AcrossEmptyLines=*/false,             \
+                     /*AcrossComments=*/false, /*AlignCompound=*/false,        \
+                     /*AlignFunctionDeclarations=*/true,                       \
+                     /*AlignFunctionPointers=*/false,                          \
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));       \
+    CHECK_PARSE(#FIELD ": AcrossEmptyLines", FIELD,                            \
+                FormatStyle::AlignConsecutiveStyle(                            \
+                    {/*Enabled=*/true, /*AcrossEmptyLines=*/true,              \
+                     /*AcrossComments=*/false, /*AlignCompound=*/false,        \
+                     /*AlignFunctionDeclarations=*/true,                       \
+                     /*AlignFunctionPointers=*/false,                          \
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));       \
+    CHECK_PARSE(#FIELD ": AcrossComments", FIELD,                              \
+                FormatStyle::AlignConsecutiveStyle(                            \
+                    {/*Enabled=*/true, /*AcrossEmptyLines=*/false,             \
+                     /*AcrossComments=*/true, /*AlignCompound=*/false,         \
+                     /*AlignFunctionDeclarations=*/true,                       \
+                     /*AlignFunctionPointers=*/false,                          \
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));       \
+    CHECK_PARSE(#FIELD ": AcrossEmptyLinesAndComments", FIELD,                 \
+                FormatStyle::AlignConsecutiveStyle(                            \
+                    {/*Enabled=*/true, /*AcrossEmptyLines=*/true,              \
+                     /*AcrossComments=*/true, /*AlignCompound=*/false,         \
+                     /*AlignFunctionDeclarations=*/true,                       \
+                     /*AlignFunctionPointers=*/false,                          \
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));       \
     /* For backwards compability, false / true should still parse */           \
     CHECK_PARSE(#FIELD ": false", FIELD,                                       \
                 FormatStyle::AlignConsecutiveStyle{});                         \
-    CHECK_PARSE(                                                               \
-        #FIELD ": true", FIELD,                                                \
-        FormatStyle::AlignConsecutiveStyle(                                    \
-            {/*Enabled=*/true, /*AcrossEmptyLines=*/false,                     \
-             /*AcrossComments=*/false, /*AlignCompound=*/false,                \
-             /*AlignFunctionDeclarations=*/true,                               \
-             /*AlignFunctionPointers=*/false, /*PadOperators=*/true}));        \
+    CHECK_PARSE(#FIELD ": true", FIELD,                                        \
+                FormatStyle::AlignConsecutiveStyle(                            \
+                    {/*Enabled=*/true, /*AcrossEmptyLines=*/false,             \
+                     /*AcrossComments=*/false, /*AlignCompound=*/false,        \
+                     /*AlignFunctionDeclarations=*/true,                       \
+                     /*AlignFunctionPointers=*/false,                          \
+                     /*EnumAssignments=*/true, /*PadOperators=*/true}));       \
                                                                                \
     CHECK_PARSE_NESTED_BOOL(FIELD, Enabled);                                   \
     CHECK_PARSE_NESTED_BOOL(FIELD, AcrossEmptyLines);                          \
@@ -384,6 +384,7 @@ TEST(ConfigParseTest, ParsesConfiguration) {
     CHECK_PARSE_NESTED_BOOL(FIELD, AlignCompound);                             \
     CHECK_PARSE_NESTED_BOOL(FIELD, AlignFunctionDeclarations);                 \
     CHECK_PARSE_NESTED_BOOL(FIELD, AlignFunctionPointers);                     \
+    CHECK_PARSE_NESTED_BOOL(FIELD, EnumAssignments);                           \
     CHECK_PARSE_NESTED_BOOL(FIELD, PadOperators);                              \
   } while (false)
 

From ac46db6dc49eb9fd8e35298e6c35aa6a5ac02f43 Mon Sep 17 00:00:00 2001
From: Henrich Lauko 
Date: Tue, 12 May 2026 21:49:32 +0200
Subject: [PATCH 501/538] [CIR] Use HasAncestor trait in place of hand-written
 verifiers (#197271)

Replace the verify() functions on BreakOp, ContinueOp, LocalInitOp, and
CoReturnOp - each of which just checked for a specific ancestor op -
with
the declarative MLIR HasAncestor / AncestorOneOf traits.
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td | 23 ++++++++------
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp      | 33 --------------------
 clang/test/CIR/IR/invalid-co-return.cir      |  2 +-
 clang/test/CIR/IR/invalid-static-local.cir   |  2 +-
 4 files changed, 15 insertions(+), 45 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 9d9aaec1b275a..7c4abd794be68 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1023,7 +1023,12 @@ def CIR_YieldOp : CIR_Op<"yield", [
 // BreakOp
 //===----------------------------------------------------------------------===//
 
-def CIR_BreakOp : CIR_Op<"break", [Terminator]> {
+defvar CIR_LoopScopes = ["WhileOp", "DoWhileOp", "ForOp"];
+defvar CIR_BreakableScopes = !listconcat(CIR_LoopScopes, ["SwitchOp"]);
+
+def CIR_BreakOp : CIR_Op<"break", [
+  Terminator, AncestorOneOf<CIR_BreakableScopes>
+]> {
   let summary = "C/C++ `break` statement equivalent";
   let description = [{
     The `cir.break` operation is used to cease the execution of the current loop
@@ -1031,7 +1036,6 @@ def CIR_BreakOp : CIR_Op<"break", [Terminator]> {
     allowed within a breakable operations (loops and switches).
   }];
   let assemblyFormat = "attr-dict";
-  let hasVerifier = 1;
   let hasLLVMLowering = false;
 }
 
@@ -1039,7 +1043,9 @@ def CIR_BreakOp : CIR_Op<"break", [Terminator]> {
 // ContinueOp
 //===----------------------------------------------------------------------===//
 
-def CIR_ContinueOp : CIR_Op<"continue", [Terminator]> {
+def CIR_ContinueOp : CIR_Op<"continue", [
+  Terminator, AncestorOneOf<CIR_LoopScopes>
+]> {
   let summary = "C/C++ `continue` statement equivalent";
   let description = [{
     The `cir.continue` operation is used to end execution of the current
@@ -1047,7 +1053,6 @@ def CIR_ContinueOp : CIR_Op<"continue", [Terminator]> {
     It is only allowed within loop regions.
   }];
   let assemblyFormat = "attr-dict";
-  let hasVerifier = 1;
   let hasLLVMLowering = false;
 }
 
@@ -3846,7 +3851,8 @@ def CIR_FuncOp : CIR_Op<"func", [
 //===----------------------------------------------------------------------===//
 
 def CIR_LocalInitOp : CIR_Op<"local_init", [
-  DeclareOpInterfaceMethods<SymbolUserOpInterface>, NoRegionArguments
+  DeclareOpInterfaceMethods<SymbolUserOpInterface>, NoRegionArguments,
+  HasAncestor<"FuncOp">
 ]> {
   let summary = "initialize a static or thread local object";
   let description = [{
@@ -3901,7 +3907,6 @@ def CIR_LocalInitOp : CIR_Op<"local_init", [
 }
 }];
 
-  let hasVerifier = 1;
   let hasLLVMLowering = false;
 }
 
@@ -4317,7 +4322,7 @@ def CIR_CoroBodyOp : CIR_Op<"coro.body", [
 //===----------------------------------------------------------------------===//
 
 def CIR_CoReturnOp : CIR_Op<"co_return", [
-  ReturnLike, Pure, Terminator
+  ReturnLike, Pure, Terminator, HasAncestor<"CoroBodyOp">
 ]> {
   let summary = "Coroutine return operation";
   let description = [{
@@ -4331,8 +4336,6 @@ def CIR_CoReturnOp : CIR_Op<"co_return", [
     attr-dict
   }];
 
-  let hasVerifier = 1;
-
   let hasLLVMLowering = false;
 }
 
@@ -7209,7 +7212,7 @@ def CIR_TryOp : CIR_Op<"try",[
 // CatchParamOp
 //===----------------------------------------------------------------------===//
 
-def CIR_CatchParamOp : CIR_Op<"catch_param", [HasParent<"cir::TryOp">]> {
+def CIR_CatchParamOp : CIR_Op<"catch_param", [HasParent<"TryOp">]> {
   let summary = "Represents the catch clause formal parameter";
   let description = [{
     The `cir.catch_param` is used to retrieve the exception object inside
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 74ef856c5a067..99b38be47121b 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -372,27 +372,10 @@ LogicalResult cir::DeleteArrayOp::verify() {
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// BreakOp
-//===----------------------------------------------------------------------===//
-
-LogicalResult cir::BreakOp::verify() {
-  if (!getOperation()->getParentOfType() &&
-      !getOperation()->getParentOfType())
-    return emitOpError("must be within a loop");
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // LocalInitOp
 //===----------------------------------------------------------------------===//
 
-LogicalResult cir::LocalInitOp::verify() {
-  if (!getOperation()->getParentOfType())
-    return emitOpError("must be inside of a 'cir.func'");
-  return success();
-}
-
 LogicalResult
 cir::LocalInitOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   cir::GlobalOp global = getReferencedGlobal(symbolTable);
@@ -525,16 +508,6 @@ OpFoldResult cir::ConstantOp::fold(FoldAdaptor /*adaptor*/) {
   return getValue();
 }
 
-//===----------------------------------------------------------------------===//
-// ContinueOp
-//===----------------------------------------------------------------------===//
-
-LogicalResult cir::ContinueOp::verify() {
-  if (!getOperation()->getParentOfType())
-    return emitOpError("must be within a loop");
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // CastOp
 //===----------------------------------------------------------------------===//
@@ -3102,12 +3075,6 @@ LogicalResult cir::AwaitOp::verify() {
   return success();
 }
 
-LogicalResult cir::CoReturnOp::verify() {
-  if (!getOperation()->getParentOfType())
-    return emitOpError("must be inside a cir.coro.body");
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // CoroBody
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CIR/IR/invalid-co-return.cir b/clang/test/CIR/IR/invalid-co-return.cir
index 6085bcfe3a408..115bfbac26d42 100644
--- a/clang/test/CIR/IR/invalid-co-return.cir
+++ b/clang/test/CIR/IR/invalid-co-return.cir
@@ -1,5 +1,5 @@
 // RUN: cir-opt %s -verify-diagnostics -split-input-file
 
 cir.func @must_be_inside_coro_body() { 
-  cir.co_return // expected-error {{must be inside a cir.coro.body}}
+  cir.co_return // expected-error {{expects ancestor op 'cir.coro.body'}}
 }
diff --git a/clang/test/CIR/IR/invalid-static-local.cir b/clang/test/CIR/IR/invalid-static-local.cir
index 033d80e071ed9..2a4d22d9ceae0 100644
--- a/clang/test/CIR/IR/invalid-static-local.cir
+++ b/clang/test/CIR/IR/invalid-static-local.cir
@@ -65,7 +65,7 @@ module {
 cir.global "private" internal static_local_guard<"_ZGVZ1fvE1y"> @_ZZ1fvE1y : !s32i
 
 cir.global "private" internal @_AnotherGlobal = ctor : !s32i {
-  // expected-error @below {{'cir.local_init' op must be inside of a 'cir.func'}}
+  // expected-error @below {{'cir.local_init' op expects ancestor op 'cir.func'}}
   cir.local_init static_local @_ZZ1fvE1y ctor {
     cir.yield
   } dtor {

From 8c187665e883e7c37ddff733ea50304d093dc9f4 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan 
Date: Tue, 12 May 2026 15:53:39 -0400
Subject: [PATCH 502/538] [libc][math] fix rounding issue in tanhf (#197260)

---
 libc/src/__support/math/tanhf.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/math/tanhf.h b/libc/src/__support/math/tanhf.h
index 15c86c6b25f5f..28d1c60a6912d 100644
--- a/libc/src/__support/math/tanhf.h
+++ b/libc/src/__support/math/tanhf.h
@@ -12,6 +12,7 @@
 #include "exp10f_utils.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/macros/config.h"
@@ -30,8 +31,6 @@ LIBC_INLINE float tanhf(float x) {
   FPBits xbits(x);
   uint32_t x_abs = xbits.abs().uintval();
 
-  const int sign_index = xbits.is_neg() ? 1 : 0;
-
   // When |x| >= 15, or x is inf or nan, or |x| <= 0.078125
   if (LIBC_UNLIKELY((x_abs >= 0x4170'0000U) || (x_abs <= 0x3da0'0000U))) {
     if (x_abs <= 0x3da0'0000U) {
@@ -62,12 +61,13 @@ LIBC_INLINE float tanhf(float x) {
     if (LIBC_UNLIKELY(xbits.is_nan()))
       return x + 1.0f; // sNaN to qNaN + signal
 
-    constexpr float SIGNS[2][2] = {{1.0f, -0x1.0p-25f}, {-1.0f, 0x1.0p-25f}};
-
     if (LIBC_UNLIKELY(xbits.is_inf()))
-      return SIGNS[sign_index][0];
+      return xbits.is_neg() ? -1.0f : 1.0f;
+
+    if (xbits.is_pos())
+      return fputil::round_result_slightly_down(1.0f);
 
-    return SIGNS[sign_index][0] + SIGNS[sign_index][1];
+    return fputil::round_result_slightly_up(-1.0f);
   }
 
   // Range reduction: e^(2x) = 2^(hi + mid) * e^lo
@@ -84,10 +84,9 @@ LIBC_INLINE float tanhf(float x) {
   k = fputil::nearest_integer(xd * LOG2_E_EXP2_6);
   mk = -static_cast(k);
 #else
-  constexpr double HALF_WAY[2] = {-0.5, 0.5};
+  const double half_way = xbits.is_neg() ? 0.5 : -0.5;
 
-  mk = static_cast(
-      fputil::multiply_add(xd, -LOG2_E_EXP2_6, HALF_WAY[sign_index]));
+  mk = static_cast(fputil::multiply_add(xd, -LOG2_E_EXP2_6, half_way));
   k = static_cast(-mk);
 #endif // LIBC_TARGET_CPU_HAS_NEAREST_INT
   // -hi = floor(-k * 2^(-MID_BITS))

From e18775051a02e368fa9163bbb84c4aa68d4f2b97 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra 
Date: Tue, 12 May 2026 13:03:51 -0700
Subject: [PATCH 503/538] [VPlan] Extend licm to hoist replicate loads
 (#179506)

The patch eliminates the need for a separate hoistInvariantLoads
transform. In the consolidation, it was discovered that
hoistInvariantLoads actually has a major bug, and doesn't even collect
stores to check loads against, due to an early continue. A planned
follow-up is to get licm to sink replicate stores: #191026.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  64 ++-----
 .../Transforms/Vectorize/VPlanTransforms.h    |   5 -
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    |   1 +
 .../AArch64/conditional-branches-cost.ll      |  10 +-
 .../LoopVectorize/AArch64/predicated-costs.ll |   2 +-
 .../LoopVectorize/AArch64/reduction-cost.ll   |   6 +-
 .../LoopVectorize/AArch64/store-costs-sve.ll  |   2 +-
 .../VPlan/vplan-print-after-all.ll            |   1 -
 .../LoopVectorize/X86/cost-model.ll           |   2 +-
 .../LoopVectorize/X86/induction-step.ll       |  25 ++-
 ...nd-sink-mem-ops-with-invariant-pointers.ll | 175 +++++++++++++-----
 .../LoopVectorize/if-pred-stores.ll           |   4 +-
 .../interleaved-accesses-metadata.ll          |   4 +-
 .../pointer-select-runtime-checks.ll          |   6 +-
 llvm/test/Transforms/LoopVectorize/pr50686.ll |   6 +-
 ...ive-path-inner-loop-with-runtime-checks.ll |   2 +-
 16 files changed, 177 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c836a280eac19..4050f9edd5b32 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2449,6 +2449,18 @@ void VPlanTransforms::cse(VPlan &Plan) {
   }
 }
 
+/// Return true if we do not know how to (mechanically) hoist or sink a
+/// non-memory or memory recipe \p R out of a loop region.
+static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB,
+                                    VPBasicBlock *LastBB) {
+  if (!isa(R) || !R.mayReadFromMemory())
+    return vputils::cannotHoistOrSinkRecipe(R);
+
+  // Check that the load doesn't alias with stores between FirstBB and LastBB.
+  auto MemLoc = vputils::getMemoryLocation(R);
+  return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
+}
+
 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -2464,7 +2476,8 @@ static void licm(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (vputils::cannotHoistOrSinkRecipe(R))
+      if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
+                                  LoopRegion->getExitingBasicBlock()))
         continue;
       if (any_of(R.operands(), [](VPValue *Op) {
             return !Op->isDefinedOutsideLoopRegions();
@@ -2715,7 +2728,6 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   RUN_VPLAN_PASS(removeDeadRecipes, Plan);
 
   RUN_VPLAN_PASS(createAndOptimizeReplicateRegions, Plan);
-  RUN_VPLAN_PASS(hoistInvariantLoads, Plan);
   RUN_VPLAN_PASS(mergeBlocksIntoPredecessors, Plan);
   RUN_VPLAN_PASS(licm, Plan);
 }
@@ -4554,54 +4566,6 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   }
 }
 
-void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
-  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
-
-  // Collect candidate loads with invariant addresses and noalias scopes
-  // metadata and memory-writing recipes with noalias metadata.
-  SmallVector> CandidateLoads;
-  SmallVector Stores;
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(
-           vp_depth_first_shallow(LoopRegion->getEntry()))) {
-    for (VPRecipeBase &R : *VPBB) {
-      // Only handle single-scalar replicated loads with invariant addresses.
-      if (auto *RepR = dyn_cast(&R)) {
-        if (RepR->isPredicated() || !RepR->isSingleScalar() ||
-            RepR->getOpcode() != Instruction::Load)
-          continue;
-
-        VPValue *Addr = RepR->getOperand(0);
-        if (Addr->isDefinedOutsideLoopRegions()) {
-          MemoryLocation Loc = *vputils::getMemoryLocation(*RepR);
-          if (!Loc.AATags.Scope)
-            continue;
-          CandidateLoads.push_back({RepR, Loc});
-        }
-      }
-      if (R.mayWriteToMemory()) {
-        auto Loc = vputils::getMemoryLocation(R);
-        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
-          return;
-        Stores.push_back(*Loc);
-      }
-    }
-  }
-
-  VPBasicBlock *Preheader = Plan.getVectorPreheader();
-  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
-    // Hoist the load to the preheader if it doesn't alias with any stores
-    // according to the noalias metadata. Other loads should have been hoisted
-    // by other passes
-    const AAMDNodes &LoadAA = LoadLoc.AATags;
-    if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
-          return !ScopedNoAliasAAResult::mayAliasInScopes(
-              LoadAA.Scope, StoreLoc.AATags.NoAlias);
-        })) {
-      LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
-    }
-  }
-}
-
 // Collect common metadata from a group of replicate recipes by intersecting
 // metadata from all recipes in the group.
 static VPIRMetadata getCommonMetadata(ArrayRef Recipes) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 75fc549167e03..c66d83d3177d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -385,11 +385,6 @@ struct VPlanTransforms {
   /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
   static void materializeBroadcasts(VPlan &Plan);
 
-  /// Hoist single-scalar loads with invariant addresses out of the vector loop
-  /// to the preheader, if they are proven not to alias with any stores in the
-  /// plan using noalias metadata.
-  static void hoistInvariantLoads(VPlan &Plan);
-
   /// Hoist predicated loads from the same address to the loop entry block, if
   /// they are guaranteed to execute on both paths (i.e., in replicate regions
   /// with complementary masks P and NOT P).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 2a4b8566d8475..21da1864d5d6a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -74,6 +74,7 @@ unsigned getVFScaleFactor(VPRecipeBase *R);
 
 /// Return true if we do not know how to (mechanically) hoist or sink \p R.
 /// When sinking, passing \p Sinking = true ensures that assumes aren't sunk.
+/// Returns true for recipes that access memory.
 bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking = false);
 
 /// Returns the VPValue representing the uncountable exit comparison used by
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index abee7f8a8d9c2..690a61e3e05c2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -393,10 +393,10 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
 ; DEFAULT:       [[VECTOR_PH]]:
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META8:![0-9]+]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
 ; DEFAULT-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
-; DEFAULT-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META13:![0-9]+]]
 ; DEFAULT-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
 ; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]]
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
@@ -477,13 +477,13 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 2
-; PRED-NEXT:    [[L_C:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META3:![0-9]+]]
-; PRED-NEXT:    [[L_B:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META6:![0-9]+]]
-; PRED-NEXT:    [[L_A:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[L_A:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META3:![0-9]+]]
+; PRED-NEXT:    [[L_B:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META6:![0-9]+]]
 ; PRED-NEXT:    [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT30:%.*]] = insertelement  poison, i32 [[OR]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT31:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT30]],  poison,  zeroinitializer
+; PRED-NEXT:    [[L_C:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META8:![0-9]+]]
 ; PRED-NEXT:    [[C_0:%.*]] = icmp ugt i32 [[L_C]], [[OR]]
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement  poison, i1 [[C_0]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT28]],  poison,  zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index a57cdee91d9c2..6808eddd5a1c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -66,10 +66,10 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
 ; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
 ; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll
index 92e75f26fffcb..8c8348cf5700f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll
@@ -98,11 +98,11 @@ define i32 @or_reduction_with_freeze(ptr %dst, ptr %src) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[DST]], align 8, !alias.scope [[META4:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[DST]], align 8, !alias.scope [[META4:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP24]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index 1dc82491f944d..fdd23d30025a2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -207,10 +207,10 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; PRED-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement  poison, i16 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT]],  poison,  zeroinitializer
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 1000)
 ; PRED-NEXT:    [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META3:![0-9]+]]
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement  poison, i64 [[TMP4]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector  [[BROADCAST_SPLATINSERT2]],  poison,  zeroinitializer
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call  @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 1000)
 ; PRED-NEXT:    [[TMP3:%.*]] = trunc  [[BROADCAST_SPLAT3]] to 
 ; PRED-NEXT:    [[TMP2:%.*]] = trunc  [[BROADCAST_SPLAT]] to 
 ; PRED-NEXT:    [[TMP5:%.*]] = and  [[TMP3]], [[TMP2]]
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index 061588317eba7..fcaf5f4867ef7 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -43,7 +43,6 @@
 ; CHECK: VPlan for loop in 'foo' after removeBranchOnConst
 ; CHECK: VPlan for loop in 'foo' after removeDeadRecipes
 ; CHECK: VPlan for loop in 'foo' after createAndOptimizeReplicateRegions
-; CHECK: VPlan for loop in 'foo' after hoistInvariantLoads
 ; CHECK: VPlan for loop in 'foo' after mergeBlocksIntoPredecessors
 ; CHECK: VPlan for loop in 'foo' after licm
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimize
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 476bbe8fba51e..71a3c5bccb30a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -355,8 +355,8 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i64 4, i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META6:![0-9]+]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META6:![0-9]+]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP21]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[TMP23]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
index 124431fa5d311..7f3b22ed5df09 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -179,23 +179,30 @@ define void @canonical_iv_matches_user_iv(i64 %n, ptr %A, ptr %B) {
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load double, ptr [[A]], align 8, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fdiv double 1.000000e+00, [[TMP11]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP15]], 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr [16 x i8], ptr [[B]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr [16 x i8], ptr [[B]], i64 [[TMP14]]
 ; CHECK-NEXT:    store <2 x double> [[BROADCAST_SPLAT]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    store <2 x double> [[BROADCAST_SPLAT]], ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
index 4079d8f364cbe..c1815226a3883 100644
--- a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
@@ -1,10 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
 
 define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invariant_ptr, i32 %n) {
 ; CHECK-LABEL: define void @hoist_invariant_load_noalias_due_to_memchecks(
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], i32 [[N:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -34,20 +34,8 @@ define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invari
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[INV_VAL]], ptr [[GEP]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -69,7 +57,7 @@ exit:
 define void @dont_hoist_variant_address(ptr %dst, ptr %src, i32 %n) {
 ; CHECK-LABEL: define void @dont_hoist_variant_address(
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoaddr ptr [[SRC]] to i64
 ; CHECK-NEXT:    [[A1:%.*]] = ptrtoaddr ptr [[DST]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
@@ -93,21 +81,8 @@ define void @dont_hoist_variant_address(ptr %dst, ptr %src, i32 %n) {
 ; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP_SRC]], align 4
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP_DST]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -130,7 +105,7 @@ exit:
 define void @dont_hoist_predicated_load(ptr %dst, ptr %invariant_ptr, ptr %cond_ptr, i32 %n) {
 ; CHECK-LABEL: define void @dont_hoist_predicated_load(
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], ptr [[COND_PTR:%.*]], i32 [[N:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -198,27 +173,8 @@ define void @dont_hoist_predicated_load(ptr %dst, ptr %invariant_ptr, ptr %cond_
 ; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[IV]]
-; CHECK-NEXT:    [[COND:%.*]] = load i32, ptr [[GEP_COND]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[COND]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[INV_VAL]], ptr [[GEP]], align 4
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -244,3 +200,120 @@ loop.latch:
 exit:
   ret void
 }
+
+; TODO: Here, the load of q can be hoisted past the preceding store,
+; as it doesn't alias via TBAA. However, we do not hoist it at the
+; moment because there is no scoped noalias metadata.
+define void @load_store_noalias_via_tbaa(ptr %p, ptr %q, ptr %n) {
+; CHECK-LABEL: define void @load_store_noalias_via_tbaa(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[N5:%.*]] = ptrtoint ptr [[N]] to i64
+; CHECK-NEXT:    [[P2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[N1:%.*]] = ptrtoint ptr [[N]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[N5]], 3074457345618258603
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[P6]], 3074457345618258603
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[N1]] to i2
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[P2]] to i2
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i2 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i2 [[TMP7]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[N1]], 3074457345618258603
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[P2]], 3074457345618258603
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP11]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P]], i64 4
+; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 12, i64 [[TMP12]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ult ptr [[TMP13]], [[SCEVGEP]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[IDENT_CHECK]], [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[P]], i64 4
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[Q]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[P]], [[SCEVGEP4]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[Q]], [[SCEVGEP3]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 12
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP18]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP20]], 12
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[TMP20]], 24
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[TMP20]], 36
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP21]]
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP22]]
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP]], align 4, !tbaa [[FLOAT_TBAA21:![0-9]+]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP7]], align 4, !tbaa [[FLOAT_TBAA21]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP8]], align 4, !tbaa [[FLOAT_TBAA21]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP9]], align 4, !tbaa [[FLOAT_TBAA21]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load float, ptr [[Q]], align 4, !tbaa [[FLOAT_TBAA27:![0-9]+]], !alias.scope [[META28:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP39]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 4
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4
+; CHECK-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[FLOAT_TBAA27]], !alias.scope [[META31:![0-9]+]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP25]], align 4, !tbaa [[FLOAT_TBAA27]], !alias.scope [[META31]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[FLOAT_TBAA27]], !alias.scope [[META31]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP27]], align 4, !tbaa [[FLOAT_TBAA27]], !alias.scope [[META31]]
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP28]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP29]], i32 1
+; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP30]], i32 2
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i32 3
+; CHECK-NEXT:    [[TMP36:%.*]] = fadd <4 x float> [[BROADCAST_SPLAT]], [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x float> [[TMP36]], i64 3
+; CHECK-NEXT:    store float [[TMP37]], ptr [[P]], align 4, !tbaa [[FLOAT_TBAA27]], !alias.scope [[META33:![0-9]+]], !noalias [[META28]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %p, %entry ], [ %ptr.iv.next, %loop ]
+  store float 0.000000e+00, ptr %ptr.iv, align 4, !tbaa !0
+  %ld.q = load float, ptr %q, align 4, !tbaa !6
+  %gep.iv.4 = getelementptr i8, ptr %ptr.iv, i64 4
+  %ld.iv = load float, ptr %gep.iv.4, align 4, !tbaa !6
+  %res = fadd float %ld.q, %ld.iv
+  store float %res, ptr %p, align 4, !tbaa !6
+  %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 12
+  %ec = icmp eq ptr %ptr.iv, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+!0 = !{!1, !2, i64 0}
+!1 = !{!"foo", !2, i64 0, !2, i64 4, !3, i64 8, !5, i64 9}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
+!5 = !{!"bool", !3, i64 0}
+!6 = !{!1, !2, i64 4}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index f5ab855e56efb..f9886bb3c0033 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -1161,11 +1161,11 @@ define void @hoistable_predicated_store(ptr %A, ptr %B, ptr %C, ptr %D) {
 ; VEC-NEXT:    [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]]
 ; VEC-NEXT:    br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 8, !alias.scope [[META16:![0-9]+]]
 ; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VEC:       [[VECTOR_BODY]]:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VEC-NEXT:    store i32 0, ptr [[C]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META19:![0-9]+]]
-; VEC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 8, !alias.scope [[META23:![0-9]+]]
+; VEC-NEXT:    store i32 0, ptr [[C]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META21:![0-9]+]]
 ; VEC-NEXT:    store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META24:![0-9]+]], !noalias [[META25:![0-9]+]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll
index a8ad9121475a1..e9dfebff0d809 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll
@@ -101,11 +101,11 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META10:![0-9]+]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll
index 66ab7939c3cac..9b383bcb88a69 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll
@@ -23,8 +23,8 @@ define void @test1_select_invariant(ptr %src.1, ptr %src.2, ptr %dst, i1 %c, i8
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[PTR_SEL]], align 8, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[PTR_SEL]], align 8, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -32,8 +32,8 @@ define void @test1_select_invariant(ptr %src.1, ptr %src.2, ptr %dst, i1 %c, i8
 ; CHECK-NEXT:    [[INDUCTION2:%.*]] = add i8 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i8 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i8 [[INDUCTION2]]
-; CHECK-NEXT:    store i8 [[TMP6]], ptr [[TMP11]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT:    store i8 [[TMP6]], ptr [[TMP8]], align 2, !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT:    store i8 [[TMP7]], ptr [[TMP11]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    store i8 [[TMP7]], ptr [[TMP8]], align 2, !alias.scope [[META3]], !noalias [[META0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll
index 5e1f1b3cbed7e..2d13256c9757c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr50686.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll
@@ -15,11 +15,11 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw i32 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
index 14d82834bdfda..9b62a6391a663 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
@@ -50,8 +50,8 @@ define void @expand(ptr %src, ptr %dst, i64 %0) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], 

From 2ac27bf5e81c164a5c7406370ed212aac418e107 Mon Sep 17 00:00:00 2001
From: Andy Kaylor 
Date: Tue, 12 May 2026 13:21:47 -0700
Subject: [PATCH 504/538] [CIR] Implement ARM-specific lowering for method
 pointers (#196592)

This implements the ARM-specific CXXABI lowering details for pointers to
member functions, including comparison and cast to bool. This includes
updates to several places where we had neglected to insert diagnostics
saying that ARM-specific handling was needed.
---
 .../TargetLowering/LowerItaniumCXXABI.cpp     | 133 +++++++---
 .../CodeGen/pointer-to-data-member-cmp.cpp    |   4 +
 .../CodeGen/pointer-to-member-func-cast.cpp   | 190 +++++++++------
 .../CodeGen/pointer-to-member-func-cmp.cpp    | 212 +++++++++-------
 .../CIR/CodeGen/pointer-to-member-func.cpp    | 227 +++++++++++-------
 5 files changed, 490 insertions(+), 276 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
index 8769975bdc948..3ed5ba4750a0c 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
@@ -33,10 +33,13 @@ namespace {
 class LowerItaniumCXXABI : public CIRCXXABI {
 protected:
   bool useARMMethodPtrABI;
+  bool use32BitVTableOffsetABI;
 
 public:
-  LowerItaniumCXXABI(LowerModule &lm, bool useARMMethodPtrABI = false)
-      : CIRCXXABI(lm), useARMMethodPtrABI(useARMMethodPtrABI) {}
+  LowerItaniumCXXABI(LowerModule &lm, bool useARMMethodPtrABI = false,
+                     bool use32BitVTableOffsetABI = false)
+      : CIRCXXABI(lm), useARMMethodPtrABI(useARMMethodPtrABI),
+        use32BitVTableOffsetABI(use32BitVTableOffsetABI) {}
 
   /// Lower the given data member pointer type to its ABI type. The returned
   /// type is also a CIR type.
@@ -128,12 +131,18 @@ std::unique_ptr createItaniumCXXABI(LowerModule &lm) {
   // include the other 32-bit ARM oddities: constructor/destructor return values
   // and array cookies.
   case clang::TargetCXXABI::GenericAArch64:
+    return std::make_unique(
+        lm,
+        /*useARMMethodPtrABI=*/true,
+        /*use32BitVTableOffsetABI=*/false);
   case clang::TargetCXXABI::AppleARM64:
     // TODO: this isn't quite right, clang uses AppleARM64CXXABI which inherits
     // from ARMCXXABI. We'll have to follow suit.
     assert(!cir::MissingFeatures::appleArm64CXXABI());
-    return std::make_unique(lm,
-                                                /*useARMMethodPtrABI=*/true);
+    return std::make_unique(
+        lm,
+        /*useARMMethodPtrABI=*/true,
+        /*use32BitVTableOffsetABI=*/true);
 
   case clang::TargetCXXABI::GenericItanium:
     return std::make_unique(lm);
@@ -247,7 +256,12 @@ mlir::TypedAttr LowerItaniumCXXABI::lowerMethodConstant(
       //   least significant bit of adj then makes exactly the same
       //   discrimination as the least significant bit of ptr does for
       //   Itanium.
-      llvm_unreachable("ARM method ptr abi NYI");
+      assert(!cir::MissingFeatures::pointerAuthentication());
+      auto ptr =
+          cir::IntAttr::get(ptrdiffCIRTy, attr.getVtableOffset().value());
+      auto one = cir::IntAttr::get(ptrdiffCIRTy, 1);
+      return cir::ConstRecordAttr::get(
+          loweredMethodTy, mlir::ArrayAttr::get(attr.getContext(), {ptr, one}));
     }
 
     // Itanium C++ ABI 2.3.2:
@@ -321,12 +335,12 @@ void LowerItaniumCXXABI::lowerGetMethod(
   mlir::Value ptrdiffOne =
       cir::ConstantOp::create(locBuilder, cir::IntAttr::get(ptrdiffCIRTy, 1));
 
-  mlir::Value adj =
+  mlir::Value rawAdj =
       cir::ExtractMemberOp::create(locBuilder, ptrdiffCIRTy, loweredMethod, 1);
-  if (useARMMethodPtrABI) {
-    op.emitError("ARM method ptr abi NYI");
-    return;
-  }
+  mlir::Value adj = rawAdj;
+  if (useARMMethodPtrABI)
+    adj = cir::ShiftOp::create(locBuilder, ptrdiffCIRTy, adj, ptrdiffOne,
+                               /*isLeftShift=*/false);
 
   // Apply the adjustment to the 'this' pointer.
   mlir::Type thisVoidPtrTy =
@@ -341,14 +355,13 @@ void LowerItaniumCXXABI::lowerGetMethod(
   // points to a virtual function.
   mlir::Value methodPtrField =
       cir::ExtractMemberOp::create(locBuilder, ptrdiffCIRTy, loweredMethod, 0);
-  mlir::Value virtualBit =
-      cir::AndOp::create(rewriter, op.getLoc(), methodPtrField, ptrdiffOne);
-  mlir::Value isVirtual;
+  mlir::Value virtualBit;
   if (useARMMethodPtrABI)
-    llvm_unreachable("ARM method ptr abi NYI");
+    virtualBit = cir::AndOp::create(locBuilder, rawAdj, ptrdiffOne);
   else
-    isVirtual = cir::CmpOp::create(locBuilder, cir::CmpOpKind::eq, virtualBit,
-                                   ptrdiffOne);
+    virtualBit = cir::AndOp::create(locBuilder, methodPtrField, ptrdiffOne);
+  mlir::Value isVirtual = cir::CmpOp::create(locBuilder, cir::CmpOpKind::eq,
+                                             virtualBit, ptrdiffOne);
 
   assert(!cir::MissingFeatures::emitCFICheck());
   assert(!cir::MissingFeatures::emitVFEInfo());
@@ -371,11 +384,15 @@ void LowerItaniumCXXABI::lowerGetMethod(
                             /*sync_scope=*/cir::SyncScopeKindAttr{},
                             /*mem_order=*/cir::MemOrderAttr());
 
-    // Get the vtable offset.
+    // Apply the offset.
+    // On ARM64, to reserve extra space in virtual member function pointers,
+    // we only pay attention to the low 32 bits of the offset.
     mlir::Value vtableOffset = methodPtrField;
-    assert(!useARMMethodPtrABI && "ARM method ptr abi NYI");
-    vtableOffset = cir::SubOp::create(b, loc, vtableOffset.getType(),
-                                      vtableOffset, ptrdiffOne);
+    if (!useARMMethodPtrABI)
+      vtableOffset = cir::SubOp::create(b, loc, vtableOffset.getType(),
+                                        vtableOffset, ptrdiffOne);
+    if (use32BitVTableOffsetABI)
+      llvm_unreachable("AppleARM64 method ptr abi NYI");
 
     assert(!cir::MissingFeatures::emitCFICheck());
     assert(!cir::MissingFeatures::emitVFEInfo());
@@ -462,11 +479,17 @@ LowerItaniumCXXABI::lowerDerivedDataMember(cir::DerivedDataMemberOp op,
 
 static mlir::Value lowerMethodCast(mlir::Operation *op, mlir::Value loweredSrc,
                                    std::int64_t offset, bool isDerivedToBase,
+                                   bool useARMMethodPtrABI,
                                    LowerModule &lowerMod,
                                    mlir::OpBuilder &builder) {
   if (offset == 0)
     return loweredSrc;
 
+  // The this-adjustment is left-shifted by 1 on ARM, since the low bit of the
+  // adjustment field is used to encode whether the member function is virtual.
+  if (useARMMethodPtrABI)
+    offset <<= 1;
+
   cir::IntType ptrdiffCIRTy = getPtrDiffCIRTy(lowerMod);
   auto adjField = cir::ExtractMemberOp::create(builder, op->getLoc(),
                                                ptrdiffCIRTy, loweredSrc, 1);
@@ -495,7 +518,8 @@ LowerItaniumCXXABI::lowerBaseMethod(cir::BaseMethodOp op,
                                     mlir::Value loweredSrc,
                                     mlir::OpBuilder &builder) const {
   return lowerMethodCast(op, loweredSrc, op.getOffset().getSExtValue(),
-                         /*isDerivedToBase=*/true, lm, builder);
+                         /*isDerivedToBase=*/true, useARMMethodPtrABI, lm,
+                         builder);
 }
 
 mlir::Value
@@ -503,7 +527,8 @@ LowerItaniumCXXABI::lowerDerivedMethod(cir::DerivedMethodOp op,
                                        mlir::Value loweredSrc,
                                        mlir::OpBuilder &builder) const {
   return lowerMethodCast(op, loweredSrc, op.getOffset().getSExtValue(),
-                         /*isDerivedToBase=*/false, lm, builder);
+                         /*isDerivedToBase=*/false, useARMMethodPtrABI, lm,
+                         builder);
 }
 
 mlir::Value
@@ -549,12 +574,43 @@ mlir::Value LowerItaniumCXXABI::lowerMethodCmp(cir::CmpOp op,
     return cir::OrOp::create(locBuilder, lhs.getType(), lhs, rhs);
   };
 
+  // Null member function pointers on ARM clear the low bit of Adj,
+  // so the zero condition has to check that neither low bit is set.
+  if (useARMMethodPtrABI) {
+    mlir::Value one =
+        cir::ConstantOp::create(locBuilder, cir::IntAttr::get(ptrdiffCIRTy, 1));
+
+    // The low bit of the adjustment field is used to encode whether the member
+    // function is virtual, but the ARM ABI specifies that for null pointers
+    // this bit must be clear. Therefore, to test whether the member pointer is
+    // null, we need to check that bit.
+    //
+    // If we are performing an equality check, ptrCmpToNull indicates that both
+    // pointers are null (if they are equal -- we only actually test lhs).
+    // If we are performing an inequality check, ptrCmpToNull indicates that
+    // one of the pointers is not null.
+    //
+    // To apply the ARM-specific logic, if either virtual bit is set, they
+    // cannot both be null (equality case -- ptrCmpToNull &= orAdjAnd1CmpZero),
+    // and if either virtual bit is set, one of the pointers is not null
+    // (inequality case -- ptrCmpToNull |= orAdjAnd1CmpZero).
+    mlir::Value orAdj = create_or(lhsAdjField, rhsAdjField);
+    mlir::Value orAdjAnd1 = create_and(orAdj, one);
+    mlir::Value orAdjAnd1CmpZero =
+        cir::CmpOp::create(locBuilder, op.getKind(), orAdjAnd1, ptrdiffZero);
+
+    if (op.getKind() == cir::CmpOpKind::eq)
+      ptrCmpToNull = create_and(ptrCmpToNull, orAdjAnd1CmpZero);
+    else
+      ptrCmpToNull = create_or(ptrCmpToNull, orAdjAnd1CmpZero);
+  }
+
   mlir::Value result;
   if (op.getKind() == cir::CmpOpKind::eq) {
     // (lhs.ptr == null || lhs.adj == rhs.adj) && lhs.ptr == rhs.ptr
     result = create_and(ptrCmp, create_or(ptrCmpToNull, adjCmp));
   } else {
-    // (lhs.ptr != null && lhs.adj != rhs.adj) || lhs.ptr != rhs.ptr
+    // lhs.ptr != rhs.ptr || (lhs.ptr != null && lhs.adj != rhs.adj)
     result = create_or(ptrCmp, create_and(ptrCmpToNull, adjCmp));
   }
 
@@ -593,18 +649,37 @@ LowerItaniumCXXABI::lowerMethodBitcast(cir::CastOp op, mlir::Type loweredDstTy,
 
 mlir::Value LowerItaniumCXXABI::lowerMethodToBoolCast(
     cir::CastOp op, mlir::Value loweredSrc, mlir::OpBuilder &builder) const {
+  mlir::ImplicitLocOpBuilder locBuilder(op.getLoc(), builder);
+
   // Itanium C++ ABI 2.3.2:
   //
   //   In the standard representation, a null member function pointer is
   //   represented with ptr set to a null pointer. The value of adj is
   //   unspecified for null member function pointers.
   cir::IntType ptrdiffCIRTy = getPtrDiffCIRTy(lm);
-  mlir::Value ptrdiffZero = cir::ConstantOp::create(
-      builder, op.getLoc(), cir::IntAttr::get(ptrdiffCIRTy, 0));
-  mlir::Value ptrField = cir::ExtractMemberOp::create(
-      builder, op.getLoc(), ptrdiffCIRTy, loweredSrc, 0);
-  return cir::CmpOp::create(builder, op.getLoc(), cir::CmpOpKind::ne, ptrField,
-                            ptrdiffZero);
+  mlir::Value ptrdiffZero =
+      cir::ConstantOp::create(locBuilder, cir::IntAttr::get(ptrdiffCIRTy, 0));
+  mlir::Value ptrField =
+      cir::ExtractMemberOp::create(locBuilder, ptrdiffCIRTy, loweredSrc, 0);
+
+  mlir::Value result =
+      cir::CmpOp::create(locBuilder, cir::CmpOpKind::ne, ptrField, ptrdiffZero);
+
+  // On ARM, a member function pointer is also non-null if the low bit of 'adj'
+  // (the virtual bit) is set.
+  if (useARMMethodPtrABI) {
+    mlir::Value one =
+        cir::ConstantOp::create(locBuilder, cir::IntAttr::get(ptrdiffCIRTy, 1));
+    mlir::Value adj =
+        cir::ExtractMemberOp::create(locBuilder, ptrdiffCIRTy, loweredSrc, 1);
+    mlir::Value virtualBit =
+        cir::AndOp::create(locBuilder, ptrdiffCIRTy, adj, one);
+    mlir::Value isVirtual = cir::CmpOp::create(locBuilder, cir::CmpOpKind::ne,
+                                               virtualBit, ptrdiffZero);
+    result = cir::OrOp::create(locBuilder, result, isVirtual);
+  }
+
+  return result;
 }
 
 static void buildBadCastCall(mlir::OpBuilder &builder, mlir::Location loc,
diff --git a/clang/test/CIR/CodeGen/pointer-to-data-member-cmp.cpp b/clang/test/CIR/CodeGen/pointer-to-data-member-cmp.cpp
index dbc05e13c0733..fd1e615c351a8 100644
--- a/clang/test/CIR/CodeGen/pointer-to-data-member-cmp.cpp
+++ b/clang/test/CIR/CodeGen/pointer-to-data-member-cmp.cpp
@@ -6,6 +6,10 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t.ll
 // RUN: FileCheck --input-file=%t.ll --check-prefix=OGCG %s
 
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t-arm.cir 2> %t-arm-before.cir
+// RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-arm-before.cir %s
+// RUN: FileCheck --check-prefix=CIR-AFTER --input-file=%t-arm.cir %s
+
 struct Foo {
   int a;
 };
diff --git a/clang/test/CIR/CodeGen/pointer-to-member-func-cast.cpp b/clang/test/CIR/CodeGen/pointer-to-member-func-cast.cpp
index 1da7fcb557903..d4fdc10fc601a 100644
--- a/clang/test/CIR/CodeGen/pointer-to-member-func-cast.cpp
+++ b/clang/test/CIR/CodeGen/pointer-to-member-func-cast.cpp
@@ -1,10 +1,18 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t.cir 2> %t-before.cir
 // RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-before.cir %s
-// RUN: FileCheck --check-prefix=CIR-AFTER --input-file=%t.cir %s
+// RUN: FileCheck --check-prefixes=CIR-AFTER,CIR-AFTER-X86 --input-file=%t.cir %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -emit-llvm %s -o %t-cir.ll
-// RUN: FileCheck --input-file=%t-cir.ll --check-prefix=LLVM %s
+// RUN: FileCheck --input-file=%t-cir.ll --check-prefixes=LLVM,LLVM-X86 %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t.ll
-// RUN: FileCheck --input-file=%t.ll --check-prefix=OGCG %s
+// RUN: FileCheck --input-file=%t.ll --check-prefixes=OGCG,OGCG-X86 %s
+
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t-arm.cir 2> %t-arm-before.cir
+// RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-before.cir %s
+// RUN: FileCheck --check-prefixes=CIR-AFTER,CIR-AFTER-ARM --input-file=%t-arm.cir %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-llvm %s -o %t-arm-cir.ll
+// RUN: FileCheck --input-file=%t-arm-cir.ll --check-prefixes=LLVM,LLVM-ARM %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t-arm.ll
+// RUN: FileCheck --input-file=%t-arm.ll --check-prefixes=OGCG,OGCG-ARM %s
 
 struct Foo {
   void m1(int);
@@ -23,16 +31,34 @@ bool memfunc_to_bool(void (Foo::*func)(int)) {
 // CIR-BEFORE: cir.func {{.*}} @_Z15memfunc_to_boolM3FooFviE
 // CIR-BEFORE:   %{{.*}} = cir.cast member_ptr_to_bool %{{.*}} : !cir.method, !s32i)> in !rec_Foo> -> !cir.bool
 
-// CIR-AFTER: cir.func {{.*}} @_Z15memfunc_to_boolM3FooFviE
-// CIR-AFTER:   %[[FUNC:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[NULL_VAL:.*]] = cir.const #cir.int<0> : !s64i
-// CIR-AFTER:   %[[FUNC_PTR:.*]] = cir.extract_member %[[FUNC]][0] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[BOOL_VAL:.*]] = cir.cmp ne %[[FUNC_PTR]], %[[NULL_VAL]] : !s64i
+// CIR-AFTER:     cir.func {{.*}} @_Z15memfunc_to_boolM3FooFviE
+// CIR-AFTER:       %[[FUNC:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[NULL_VAL:.*]] = cir.const #cir.int<0> : !s64i
+// CIR-AFTER:       %[[FUNC_PTR:.*]] = cir.extract_member %[[FUNC]][0] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[BOOL_VAL:.*]] = cir.cmp ne %[[FUNC_PTR]], %[[NULL_VAL]] : !s64i
+// CIR-AFTER-ARM:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR-AFTER-ARM:   %[[ADJ:.*]] = cir.extract_member %[[FUNC]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER-ARM:   %[[AND:.*]] = cir.and %[[ADJ]], %[[ONE]] : !s64i
+// CIR-AFTER-ARM:   %[[NOT_VIRTUAL:.*]] = cir.cmp ne %[[AND]], %[[NULL_VAL]] : !s64i
+// CIR-AFTER-ARM:   %[[TMP:.*]] = cir.or %[[BOOL_VAL]], %[[NOT_VIRTUAL]] : !cir.bool
+// CIR-AFTER-X86-NOT: cir.extract_member
+// CIR-AFTER-X86-NOT: cir.and
+// CIR-AFTER-X86-NOT: cir.cmp
+// CIR-AFTER-X86-NOT: cir.or
+
+// LLVM:     define {{.*}} i1 @_Z15memfunc_to_boolM3FooFviE
+// LLVM:       %[[FUNC:.*]] = load { i64, i64 }, ptr %{{.*}}
+// LLVM:       %[[FUNC_PTR:.*]] = extractvalue { i64, i64 } %[[FUNC]], 0
+// LLVM:       %[[BOOL_VAL:.*]] = icmp ne i64 %[[FUNC_PTR]], 0
+// LLVM-ARM:   %[[ADJ:.*]] = extractvalue { i64, i64 } %[[FUNC]], 1
+// LLVM-ARM:   %[[AND:.*]] = and i64 %[[ADJ]], 1
+// LLVM-ARM:   %[[NOT_VIRTUAL:.*]] = icmp ne i64 %[[AND]], 0
+// LLVM-ARM:   %[[TMP:.*]] = or i1 %[[BOOL_VAL]], %[[NOT_VIRTUAL]]
+// LLVM-X86-NOT: extractvalue
+// LLVM-X86-NOT: and
+// LLVM-X86-NOT: icmp
+// LLVM-X86-NOT: or i1
 
-// LLVM: define {{.*}} i1 @_Z15memfunc_to_boolM3FooFviE
-// LLVM:   %[[FUNC:.*]] = load { i64, i64 }, ptr %{{.*}}
-// LLVM:   %[[FUNC_PTR:.*]] = extractvalue { i64, i64 } %[[FUNC]], 0
-// LLVM:   %{{.*}} = icmp ne i64 %[[FUNC_PTR]], 0
 
 // Note: OGCG uses an extra temporary for the function argument because it
 //       composes it from coerced arguments. We'll do that in CIR too after
@@ -43,7 +69,15 @@ bool memfunc_to_bool(void (Foo::*func)(int)) {
 // OGCG:   store { i64, i64 } %[[FUNC_TMP]], ptr %[[FUNC_ADDR:.*]]
 // OGCG:   %[[FUNC:.*]] = load { i64, i64 }, ptr %[[FUNC_ADDR]]
 // OGCG:   %[[FUNC_PTR:.*]] = extractvalue { i64, i64 } %[[FUNC]], 0
-// OGCG:   %{{.*}} = icmp ne i64 %[[FUNC_PTR]], 0
+// OGCG:   %[[BOOL_VAL:.*]] = icmp ne i64 %[[FUNC_PTR]], 0
+// OGCG-ARM:   %[[ADJ:.*]] = extractvalue { i64, i64 } %[[FUNC]], 1
+// OGCG-ARM:   %[[AND:.*]] = and i64 %[[ADJ]], 1
+// OGCG-ARM:   %[[NOT_VIRTUAL:.*]] = icmp ne i64 %[[AND]], 0
+// OGCG-ARM:   %[[TMP:.*]] = or i1 %[[BOOL_VAL]], %[[NOT_VIRTUAL]]
+// OGCG-X86-NOT: extractvalue
+// OGCG-X86-NOT: and
+// OGCG-X86-NOT: icmp
+// OGCG-X86-NOT: or i1
 
 auto memfunc_reinterpret(void (Foo::*func)(int)) -> void (Bar::*)() {
   return reinterpret_cast(func);
@@ -64,11 +98,15 @@ auto memfunc_reinterpret(void (Foo::*func)(int)) -> void (Bar::*)() {
 // LLVM:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RET_ADDR]]
 // LLVM:   ret { i64, i64 } %[[RET]]
 
-// OGCG: define {{.*}} { i64, i64 } @_Z19memfunc_reinterpretM3FooFviE
-// OGCG:   %[[FUNC:.*]] = load { i64, i64 }, ptr %{{.*}}
-// OGCG:   store { i64, i64 } %[[FUNC]], ptr %[[RET_ADDR:.*]]
-// OGCG:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RET_ADDR]]
-// OGCG:   ret { i64, i64 } %[[RET]]
+// OGCG-X86: define {{.*}} { i64, i64 } @_Z19memfunc_reinterpretM3FooFviE
+// OGCG-ARM: define {{.*}} [2 x i64] @_Z19memfunc_reinterpretM3FooFviE
+// OGCG:       %[[FUNC:.*]] = load { i64, i64 }, ptr %{{.*}}
+// OGCG:       store { i64, i64 } %[[FUNC]], ptr %[[FUNC_ADDR:[^,]+]]
+// OGCG-X86:   %[[RET:.*]] = load { i64, i64 }, ptr %[[FUNC_ADDR]]
+// OGCG-ARM:   %[[TMP:.*]] = load { i64, i64 }, ptr %[[FUNC_ADDR]]
+// OGCG-ARM:   store { i64, i64 } %[[TMP]], ptr %[[RET_ADDR:[^,]+]]
+// OGCG-ARM:   %[[RET:.*]] = load [2 x i64], ptr %[[RET_ADDR]]
+// OGCG:       ret {{.*}} %[[RET]]
 
 struct Base1 {
   int x;
@@ -114,11 +152,13 @@ DerivedMemFunc base_to_derived_zero_offset(Base1MemFunc ptr) {
 // LLVM:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RET_ADDR]]
 // LLVM:   ret { i64, i64 } %[[RET]]
 
-// OGCG: define {{.*}} { i64, i64 } @_Z27base_to_derived_zero_offsetM5Base1FviE
-// OGCG:   %[[ARG_ADDR:.*]] = alloca { i64, i64 }
-// OGCG:   store { i64, i64 } %{{.*}}, ptr %[[ARG_ADDR]]
-// OGCG:   %[[RET:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
-// OGCG:   ret { i64, i64 } %[[RET]]
+// OGCG-X86: define {{.*}} { i64, i64 } @_Z27base_to_derived_zero_offsetM5Base1FviE
+// OGCG-ARM: define {{.*}} [2 x i64] @_Z27base_to_derived_zero_offsetM5Base1FviE
+// OGCG:       %[[ARG_ADDR:.*]] = alloca { i64, i64 }
+// OGCG:       store { i64, i64 } %{{.*}}, ptr %[[ARG_ADDR]]
+// OGCG-X86:   %[[RET:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
+// OGCG-ARM:   %[[RET:.*]] = load [2 x i64], ptr %[[ARG_ADDR]]
+// OGCG:       ret {{.*}} %[[RET]]
 
 DerivedMemFunc base_to_derived(Base2MemFunc ptr) {
   return static_cast(ptr);
@@ -128,26 +168,30 @@ DerivedMemFunc base_to_derived(Base2MemFunc ptr) {
 // CIR-BEFORE:   %[[PTR:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i)> in !rec_Base2>>, !cir.method, !s32i)> in !rec_Base2>
 // CIR-BEFORE:   %{{.*}} = cir.derived_method %[[PTR]][16] : !cir.method, !s32i)> in !rec_Base2> -> !cir.method, !s32i)> in !rec_Derived>
 
-// CIR-AFTER: cir.func {{.*}} @_Z15base_to_derivedM5Base2FviE
-// CIR-AFTER:   %[[PTR:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[OFFSET:.*]] = cir.extract_member %[[PTR]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[OFFSET_ADJ:.*]] = cir.const #cir.int<16> : !s64i
-// CIR-AFTER:   %[[BINOP_KIND:.*]] = cir.add nsw %[[OFFSET]], %[[OFFSET_ADJ]] : !s64i
-// CIR-AFTER:   %{{.*}} = cir.insert_member %[[PTR]][1], %[[BINOP_KIND]] : !rec_anon_struct, !s64i
-
-// LLVM: define {{.*}} { i64, i64 } @_Z15base_to_derivedM5Base2FviE
-// LLVM:   %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
-// LLVM:   %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG]], 1
-// LLVM:   %[[ADJ_ADJ:.*]] = add nsw i64 %[[ADJ]], 16
-// LLVM:   %{{.*}} = insertvalue { i64, i64 } %[[ARG]], i64 %[[ADJ_ADJ]], 1
-
-// OGCG: define {{.*}} { i64, i64 } @_Z15base_to_derivedM5Base2FviE
-// OGCG:   %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
-// OGCG:   store { i64, i64 } %[[ARG]], ptr %[[ARG_ADDR:.*]]
-// OGCG:   %[[ARG1:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
-// OGCG:   %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG1]], 1
-// OGCG:   %[[ADJ_ADJ:.*]] = add nsw i64 %[[ADJ]], 16
-// OGCG:   %{{.*}} = insertvalue { i64, i64 } %[[ARG1]], i64 %[[ADJ_ADJ]], 1
+// CIR-AFTER:     cir.func {{.*}} @_Z15base_to_derivedM5Base2FviE
+// CIR-AFTER:       %[[PTR:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[OFFSET:.*]] = cir.extract_member %[[PTR]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER-X86:   %[[OFFSET_ADJ:.*]] = cir.const #cir.int<16> : !s64i
+// CIR-AFTER-ARM:   %[[OFFSET_ADJ:.*]] = cir.const #cir.int<32> : !s64i
+// CIR-AFTER:       %[[BINOP_KIND:.*]] = cir.add nsw %[[OFFSET]], %[[OFFSET_ADJ]] : !s64i
+// CIR-AFTER:       %{{.*}} = cir.insert_member %[[PTR]][1], %[[BINOP_KIND]] : !rec_anon_struct, !s64i
+
+// LLVM:     define {{.*}} { i64, i64 } @_Z15base_to_derivedM5Base2FviE
+// LLVM:       %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
+// LLVM:       %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG]], 1
+// LLVM-X86:   %[[ADJ_ADJ:.*]] = add nsw i64 %[[ADJ]], 16
+// LLVM-ARM:   %[[ADJ_ADJ:.*]] = add nsw i64 %[[ADJ]], 32
+// LLVM:       %{{.*}} = insertvalue { i64, i64 } %[[ARG]], i64 %[[ADJ_ADJ]], 1
+
+// OGCG-X86: define {{.*}} { i64, i64 } @_Z15base_to_derivedM5Base2FviE
+// OGCG-ARM: define {{.*}} [2 x i64] @_Z15base_to_derivedM5Base2FviE
+// OGCG:       %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
+// OGCG:       store { i64, i64 } %[[ARG]], ptr %[[ARG_ADDR:.*]]
+// OGCG:       %[[ARG1:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
+// OGCG:       %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG1]], 1
+// OGCG-X86:   %[[ADJ_ADJ:.*]] = add nsw i64 %[[ADJ]], 16
+// OGCG-ARM:   %[[ADJ_ADJ:.*]] = add nsw i64 %[[ADJ]], 32
+// OGCG:       %{{.*}} = insertvalue { i64, i64 } %[[ARG1]], i64 %[[ADJ_ADJ]], 1
 
 Base1MemFunc derived_to_base_zero_offset(DerivedMemFunc ptr) {
   return static_cast(ptr);
@@ -175,11 +219,17 @@ Base1MemFunc derived_to_base_zero_offset(DerivedMemFunc ptr) {
 // LLVM:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RET_ADDR]]
 // LLVM:   ret { i64, i64 } %[[RET]]
 
-// OGCG: define {{.*}} { i64, i64 } @_Z27derived_to_base_zero_offsetM7DerivedFviE
-// OGCG:   %[[ARG_ADDR:.*]] = alloca { i64, i64 }
-// OGCG:   store { i64, i64 } %{{.*}}, ptr %[[ARG_ADDR]]
-// OGCG:   %[[RET:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
-// OGCG:   ret { i64, i64 } %[[RET]]
+// OGCG-X86: define {{.*}} { i64, i64 } @_Z27derived_to_base_zero_offsetM7DerivedFviE
+// OGCG-ARM: define {{.*}} [2 x i64] @_Z27derived_to_base_zero_offsetM7DerivedFviE
+// OGCG-ARM:   %[[RETVAL:.*]] = alloca { i64, i64 }
+// OGCG:       %[[ARG_ADDR:.*]] = alloca { i64, i64 }
+// OGCG-ARM:   %[[ARG_COERCE:.*]] = alloca { i64, i64 }
+// OGCG:       store { i64, i64 } %{{.*}}, ptr %[[ARG_ADDR]]
+// OGCG-X86:   %[[RET:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
+// OGCG-ARM:   %[[TMP:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
+// OGCG-ARM:   store { i64, i64 } %[[TMP]], ptr %[[RETVAL]]
+// OGCG-ARM:   %[[RET:.*]] = load [2 x i64], ptr %[[RETVAL]]
+// OGCG:       ret {{.*}} %[[RET]]
 
 Base2MemFunc derived_to_base(DerivedMemFunc ptr) {
   return static_cast(ptr);
@@ -189,23 +239,27 @@ Base2MemFunc derived_to_base(DerivedMemFunc ptr) {
 // CIR-BEFORE:   %[[PTR:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i)> in !rec_Derived>>, !cir.method, !s32i)> in !rec_Derived>
 // CIR-BEFORE:   %{{.*}} = cir.base_method %[[PTR]][16] : !cir.method, !s32i)> in !rec_Derived> -> !cir.method, !s32i)> in !rec_Base2>
 
-// CIR-AFTER: cir.func {{.*}} @_Z15derived_to_baseM7DerivedFviE
-// CIR-AFTER:   %[[PTR:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[OFFSET:.*]] = cir.extract_member %[[PTR]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[OFFSET_ADJ:.*]] = cir.const #cir.int<16> : !s64i
-// CIR-AFTER:   %[[BINOP_KIND:.*]] = cir.sub nsw %[[OFFSET]], %[[OFFSET_ADJ]] : !s64i
-// CIR-AFTER:   %{{.*}} = cir.insert_member %[[PTR]][1], %[[BINOP_KIND]] : !rec_anon_struct, !s64i
-
-// LLVM: define {{.*}} { i64, i64 } @_Z15derived_to_baseM7DerivedFviE
-// LLVM:   %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
-// LLVM:   %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG]], 1
-// LLVM:   %[[ADJ_ADJ:.*]] = sub nsw i64 %[[ADJ]], 16
-// LLVM:   %{{.*}} = insertvalue { i64, i64 } %[[ARG]], i64 %[[ADJ_ADJ]], 1
-
-// OGCG: define {{.*}} { i64, i64 } @_Z15derived_to_baseM7DerivedFviE
-// OGCG:   %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
-// OGCG:   store { i64, i64 } %[[ARG]], ptr %[[ARG_ADDR:.*]]
-// OGCG:   %[[ARG1:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
-// OGCG:   %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG1]], 1
-// OGCG:   %[[ADJ_ADJ:.*]] = sub nsw i64 %[[ADJ]], 16
-// OGCG:   %{{.*}} = insertvalue { i64, i64 } %[[ARG1]], i64 %[[ADJ_ADJ]], 1
+// CIR-AFTER:     cir.func {{.*}} @_Z15derived_to_baseM7DerivedFviE
+// CIR-AFTER:       %[[PTR:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[OFFSET:.*]] = cir.extract_member %[[PTR]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER-X86:   %[[OFFSET_ADJ:.*]] = cir.const #cir.int<16> : !s64i
+// CIR-AFTER-ARM:   %[[OFFSET_ADJ:.*]] = cir.const #cir.int<32> : !s64i
+// CIR-AFTER:       %[[BINOP_KIND:.*]] = cir.sub nsw %[[OFFSET]], %[[OFFSET_ADJ]] : !s64i
+// CIR-AFTER:       %{{.*}} = cir.insert_member %[[PTR]][1], %[[BINOP_KIND]] : !rec_anon_struct, !s64i
+
+// LLVM:     define {{.*}} { i64, i64 } @_Z15derived_to_baseM7DerivedFviE
+// LLVM:       %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
+// LLVM:       %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG]], 1
+// LLVM-X86:   %[[ADJ_ADJ:.*]] = sub nsw i64 %[[ADJ]], 16
+// LLVM-ARM:   %[[ADJ_ADJ:.*]] = sub nsw i64 %[[ADJ]], 32
+// LLVM:       %{{.*}} = insertvalue { i64, i64 } %[[ARG]], i64 %[[ADJ_ADJ]], 1
+
+// OGCG-X86: define {{.*}} { i64, i64 } @_Z15derived_to_baseM7DerivedFviE
+// OGCG-ARM: define {{.*}} [2 x i64] @_Z15derived_to_baseM7DerivedFviE
+// OGCG:       %[[ARG:.*]] = load { i64, i64 }, ptr %{{.*}}
+// OGCG:       store { i64, i64 } %[[ARG]], ptr %[[ARG_ADDR:.*]]
+// OGCG:       %[[ARG1:.*]] = load { i64, i64 }, ptr %[[ARG_ADDR]]
+// OGCG:       %[[ADJ:.*]] = extractvalue { i64, i64 } %[[ARG1]], 1
+// OGCG-X86:   %[[ADJ_ADJ:.*]] = sub nsw i64 %[[ADJ]], 16
+// OGCG-ARM:   %[[ADJ_ADJ:.*]] = sub nsw i64 %[[ADJ]], 32
+// OGCG:       %{{.*}} = insertvalue { i64, i64 } %[[ARG1]], i64 %[[ADJ_ADJ]], 1
diff --git a/clang/test/CIR/CodeGen/pointer-to-member-func-cmp.cpp b/clang/test/CIR/CodeGen/pointer-to-member-func-cmp.cpp
index 3707f03da3f7d..b7bcc31573c05 100644
--- a/clang/test/CIR/CodeGen/pointer-to-member-func-cmp.cpp
+++ b/clang/test/CIR/CodeGen/pointer-to-member-func-cmp.cpp
@@ -1,10 +1,18 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t.cir 2> %t-before.cir
 // RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-before.cir %s
-// RUN: FileCheck --check-prefix=CIR-AFTER --input-file=%t.cir %s
+// RUN: FileCheck --check-prefixes=CIR-AFTER,CIR-AFTER-X86 --input-file=%t.cir %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -emit-llvm %s -o %t-cir.ll
-// RUN: FileCheck --input-file=%t-cir.ll --check-prefix=LLVM %s
+// RUN: FileCheck --input-file=%t-cir.ll --check-prefixes=LLVM,LLVM-X86 %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t.ll
-// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+// RUN: FileCheck --check-prefixes=OGCG,OGCG-X86 --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t-arm.cir 2> %t-arm-before.cir
+// RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-arm-before.cir %s
+// RUN: FileCheck --check-prefixes=CIR-AFTER,CIR-AFTER-ARM --input-file=%t-arm.cir %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-llvm %s -o %t-arm-cir.ll
+// RUN: FileCheck --input-file=%t-arm-cir.ll --check-prefixes=LLVM,LLVM-ARM %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t-arm.ll
+// RUN: FileCheck --check-prefixes=OGCG,OGCG-ARM --input-file=%t-arm.ll %s
 
 struct Foo {
   void m1(int);
@@ -15,56 +23,72 @@ struct Foo {
 bool cmp_eq(void (Foo::*lhs)(int), void (Foo::*rhs)(int)) {
   return lhs == rhs;
 }
-  
+
 // CIR-BEFORE: cir.func {{.*}} @_Z6cmp_eqM3FooFviES1_
 // CIR-BEFORE:   %[[LHS:.*]] = cir.load{{.*}} %0 : !cir.ptr, !s32i)> in !rec_Foo>>
 // CIR-BEFORE:   %[[RHS:.*]] = cir.load{{.*}} %1 : !cir.ptr, !s32i)> in !rec_Foo>>
 // CIR-BEFORE:   %[[CMP:.*]] = cir.cmp eq %[[LHS]], %[[RHS]] : !cir.method, !s32i)> in !rec_Foo>
 // CIR-BEFORE:   cir.store %[[CMP]], %{{.*}} : !cir.bool, !cir.ptr
 
-// CIR-AFTER: @_Z6cmp_eqM3FooFviES1_
-// CIR-AFTER:   %[[LHS:.*]] = cir.load{{.*}} %0 : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[RHS:.*]] = cir.load{{.*}} %1 : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[NULL:.*]] = cir.const #cir.int<0> : !s64i
-// CIR-AFTER:   %[[LHS_PTR:.*]] = cir.extract_member %[[LHS]][0] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[RHS_PTR:.*]] = cir.extract_member %[[RHS]][0] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[PTR_CMP:.*]] = cir.cmp eq %[[LHS_PTR]], %[[RHS_PTR]] : !s64i
-// CIR-AFTER:   %[[PTR_NULL:.*]] = cir.cmp eq %[[LHS_PTR]], %[[NULL]] : !s64i
-// CIR-AFTER:   %[[LHS_ADJ:.*]] = cir.extract_member %[[LHS]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[RHS_ADJ:.*]] = cir.extract_member %[[RHS]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[ADJ_CMP:.*]] = cir.cmp eq %[[LHS_ADJ]], %[[RHS_ADJ]] : !s64i
-// CIR-AFTER:   %[[TMP:.*]] = cir.or %[[PTR_NULL]], %[[ADJ_CMP]] : !cir.bool
-// CIR-AFTER:   %[[RESULT:.*]] = cir.and %[[PTR_CMP]], %[[TMP]] : !cir.bool
+// CIR-AFTER:     @_Z6cmp_eqM3FooFviES1_
+// CIR-AFTER:       %[[LHS:.*]] = cir.load{{.*}} %0 : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[RHS:.*]] = cir.load{{.*}} %1 : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[NULL:.*]] = cir.const #cir.int<0> : !s64i
+// CIR-AFTER:       %[[LHS_PTR:.*]] = cir.extract_member %[[LHS]][0] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[RHS_PTR:.*]] = cir.extract_member %[[RHS]][0] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[PTR_CMP:.*]] = cir.cmp eq %[[LHS_PTR]], %[[RHS_PTR]] : !s64i
+// CIR-AFTER:       %[[PTR_NULL:.*]] = cir.cmp eq %[[LHS_PTR]], %[[NULL]] : !s64i
+// CIR-AFTER:       %[[LHS_ADJ:.*]] = cir.extract_member %[[LHS]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[RHS_ADJ:.*]] = cir.extract_member %[[RHS]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[ADJ_CMP:.*]] = cir.cmp eq %[[LHS_ADJ]], %[[RHS_ADJ]] : !s64i
+// CIR-AFTER-X86:   %[[TMP:.*]] = cir.or %[[PTR_NULL]], %[[ADJ_CMP]] : !cir.bool
+// CIR-AFTER-ARM:   %[[ONE:.*]] = cir.const #cir.int<1>
+// CIR-AFTER-ARM:   %[[OR_ADJ:.*]] = cir.or %[[LHS_ADJ]], %[[RHS_ADJ]] : !s64i
+// CIR-AFTER-ARM:   %[[AND_ADJ:.*]] = cir.and %[[OR_ADJ]], %[[ONE]] : !s64i
+// CIR-AFTER-ARM:   %[[ADJ_CMP2:.*]] = cir.cmp eq %[[AND_ADJ]], %[[NULL]] : !s64i
+// CIR-AFTER-ARM:   %[[AND_PTR_NULL:.*]] = cir.and %[[PTR_NULL]], %[[ADJ_CMP2]] : !cir.bool
+// CIR-AFTER-ARM:   %[[TMP:.*]] = cir.or %[[AND_PTR_NULL]], %[[ADJ_CMP]] : !cir.bool
+// CIR-AFTER:       %[[RESULT:.*]] = cir.and %[[PTR_CMP]], %[[TMP]] : !cir.bool
 
-// LLVM: define {{.*}} i1 @_Z6cmp_eqM3FooFviES1_
-// LLVM:   %[[LHS:.*]] = load { i64, i64 }, ptr %{{.+}}
-// LLVM:   %[[RHS:.*]] = load { i64, i64 }, ptr %{{.+}}
-// LLVM:   %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
-// LLVM:   %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
-// LLVM:   %[[PTR_CMP:.*]] = icmp eq i64 %[[LHS_PTR]], %[[RHS_PTR]]
-// LLVM:   %[[PTR_NULL:.*]] = icmp eq i64 %[[LHS_PTR]], 0
-// LLVM:   %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
-// LLVM:   %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
-// LLVM:   %[[ADJ_CMP:.*]] = icmp eq i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
-// LLVM:   %[[TMP:.*]] = or i1 %[[PTR_NULL]], %[[ADJ_CMP]]
-// LLVM:   %[[RESULT:.*]] = and i1 %[[PTR_CMP]], %[[TMP]]
+// LLVM:     define {{.*}} i1 @_Z6cmp_eqM3FooFviES1_
+// LLVM:       %[[LHS:.*]] = load { i64, i64 }, ptr %{{.+}}
+// LLVM:       %[[RHS:.*]] = load { i64, i64 }, ptr %{{.+}}
+// LLVM:       %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
+// LLVM:       %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
+// LLVM:       %[[PTR_CMP:.*]] = icmp eq i64 %[[LHS_PTR]], %[[RHS_PTR]]
+// LLVM:       %[[PTR_NULL:.*]] = icmp eq i64 %[[LHS_PTR]], 0
+// LLVM:       %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
+// LLVM:       %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
+// LLVM:       %[[ADJ_CMP:.*]] = icmp eq i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// LLVM-X86:   %[[TMP:.*]] = or i1 %[[PTR_NULL]], %[[ADJ_CMP]]
+// LLVM-ARM:   %[[OR_ADJ:.*]] = or i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// LLVM-ARM:   %[[AND_ADJ:.*]] = and i64 %[[OR_ADJ]], 1
+// LLVM-ARM:   %[[ADJ_CMP2:.*]] = icmp eq i64 %[[AND_ADJ]], 0
+// LLVM-ARM:   %[[AND_PTR_NULL:.*]] = and i1 %[[PTR_NULL]], %[[ADJ_CMP2]]
+// LLVM-ARM:   %[[TMP:.*]] = or i1 %[[AND_PTR_NULL]], %[[ADJ_CMP]]
+// LLVM:       %[[RESULT:.*]] = and i1 %[[PTR_CMP]], %[[TMP]]
 
-// OGCG: define {{.*}} i1 @_Z6cmp_eqM3FooFviES1_
-// OGCG:   %[[LHS_TMP:.*]] = alloca { i64, i64 }
-// OGCG:   %[[RHS_TMP:.*]] = alloca { i64, i64 }
-// OGCG:   %[[LHS_ADDR:.*]] = alloca { i64, i64 }
-// OGCG:   %[[RHS_ADDR:.*]] = alloca { i64, i64 }
-// OGCG:   %[[LHS:.*]] = load { i64, i64 }, ptr %[[LHS_ADDR]]
-// OGCG:   %[[RHS:.*]] = load { i64, i64 }, ptr %[[RHS_ADDR]]
-// OGCG:   %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
-// OGCG:   %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
-// OGCG:   %[[PTR_CMP:.*]] = icmp eq i64 %[[LHS_PTR]], %[[RHS_PTR]]
-// OGCG:   %[[PTR_NULL:.*]] = icmp eq i64 %[[LHS_PTR]], 0
-// OGCG:   %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
-// OGCG:   %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
-// OGCG:   %[[ADJ_CMP:.*]] = icmp eq i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
-// OGCG:   %[[TMP:.*]] = or i1 %[[PTR_NULL]], %[[ADJ_CMP]]
-// OGCG:   %[[RESULT:.*]] = and i1 %[[PTR_CMP]], %[[TMP]]
+// OGCG:     define {{.*}} i1 @_Z6cmp_eqM3FooFviES1_
+// OGCG:       %[[LHS_TMP:.*]] = alloca { i64, i64 }
+// OGCG:       %[[RHS_TMP:.*]] = alloca { i64, i64 }
+// OGCG:       %[[LHS_ADDR:.*]] = alloca { i64, i64 }
+// OGCG:       %[[RHS_ADDR:.*]] = alloca { i64, i64 }
+// OGCG:       %[[LHS:.*]] = load { i64, i64 }, ptr %[[LHS_ADDR]]
+// OGCG:       %[[RHS:.*]] = load { i64, i64 }, ptr %[[RHS_ADDR]]
+// OGCG:       %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
+// OGCG:       %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
+// OGCG:       %[[PTR_CMP:.*]] = icmp eq i64 %[[LHS_PTR]], %[[RHS_PTR]]
+// OGCG:       %[[PTR_NULL:.*]] = icmp eq i64 %[[LHS_PTR]], 0
+// OGCG:       %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
+// OGCG:       %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
+// OGCG:       %[[ADJ_CMP:.*]] = icmp eq i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// OGCG-X86:   %[[TMP:.*]] = or i1 %[[PTR_NULL]], %[[ADJ_CMP]]
+// OGCG-ARM:   %[[OR_ADJ:.*]] = or i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// OGCG-ARM:   %[[AND_ADJ:.*]] = and i64 %[[OR_ADJ]], 1
+// OGCG-ARM:   %[[ADJ_CMP2:.*]] = icmp eq i64 %[[AND_ADJ]], 0
+// OGCG-ARM:   %[[AND_PTR_NULL:.*]] = and i1 %[[PTR_NULL]], %[[ADJ_CMP2]]
+// OGCG-ARM:   %[[TMP:.*]] = or i1 %[[AND_PTR_NULL]], %[[ADJ_CMP]]
+// OGCG:       %[[RESULT:.*]] = and i1 %[[PTR_CMP]], %[[TMP]]
 
 bool cmp_ne(void (Foo::*lhs)(int), void (Foo::*rhs)(int)) {
   return lhs != rhs;
@@ -76,46 +100,62 @@ bool cmp_ne(void (Foo::*lhs)(int), void (Foo::*rhs)(int)) {
 // CIR-BEFORE:   %[[CMP:.*]] = cir.cmp ne %[[LHS]], %[[RHS]] : !cir.method, !s32i)> in !rec_Foo>
 // CIR-BEFORE:   cir.store %[[CMP]], %{{.*}} : !cir.bool, !cir.ptr
 
-// CIR-AFTER: cir.func {{.*}} @_Z6cmp_neM3FooFviES1_
-// CIR-AFTER:   %[[LHS:.*]] = cir.load{{.*}} %0 : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[RHS:.*]] = cir.load{{.*}} %1 : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[NULL:.*]] = cir.const #cir.int<0> : !s64i
-// CIR-AFTER:   %[[LHS_PTR:.*]] = cir.extract_member %[[LHS]][0] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[RHS_PTR:.*]] = cir.extract_member %[[RHS]][0] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[PTR_CMP:.*]] = cir.cmp ne %[[LHS_PTR]], %[[RHS_PTR]] : !s64i
-// CIR-AFTER:   %[[PTR_NULL:.*]] = cir.cmp ne %[[LHS_PTR]], %[[NULL]] : !s64i
-// CIR-AFTER:   %[[LHS_ADJ:.*]] = cir.extract_member %[[LHS]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[RHS_ADJ:.*]] = cir.extract_member %[[RHS]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[ADJ_CMP:.*]] = cir.cmp ne %[[LHS_ADJ]], %[[RHS_ADJ]] : !s64i
-// CIR-AFTER:   %[[TMP:.*]] = cir.and %[[PTR_NULL]], %[[ADJ_CMP]] : !cir.bool
-// CIR-AFTER:   %[[RESULT:.*]] = cir.or %[[PTR_CMP]], %[[TMP]] : !cir.bool
+// CIR-AFTER:     cir.func {{.*}} @_Z6cmp_neM3FooFviES1_
+// CIR-AFTER:       %[[LHS:.*]] = cir.load{{.*}} %0 : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[RHS:.*]] = cir.load{{.*}} %1 : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:       %[[NULL:.*]] = cir.const #cir.int<0> : !s64i
+// CIR-AFTER:       %[[LHS_PTR:.*]] = cir.extract_member %[[LHS]][0] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[RHS_PTR:.*]] = cir.extract_member %[[RHS]][0] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[PTR_CMP:.*]] = cir.cmp ne %[[LHS_PTR]], %[[RHS_PTR]] : !s64i
+// CIR-AFTER:       %[[PTR_NULL:.*]] = cir.cmp ne %[[LHS_PTR]], %[[NULL]] : !s64i
+// CIR-AFTER:       %[[LHS_ADJ:.*]] = cir.extract_member %[[LHS]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[RHS_ADJ:.*]] = cir.extract_member %[[RHS]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER:       %[[ADJ_CMP:.*]] = cir.cmp ne %[[LHS_ADJ]], %[[RHS_ADJ]] : !s64i
+// CIR-AFTER-X86:   %[[TMP:.*]] = cir.and %[[PTR_NULL]], %[[ADJ_CMP]] : !cir.bool
+// CIR-AFTER-ARM:   %[[ONE:.*]] = cir.const #cir.int<1>
+// CIR-AFTER-ARM:   %[[OR_ADJ:.*]] = cir.or %[[LHS_ADJ]], %[[RHS_ADJ]] : !s64i
+// CIR-AFTER-ARM:   %[[AND_ADJ:.*]] = cir.and %[[OR_ADJ]], %[[ONE]] : !s64i
+// CIR-AFTER-ARM:   %[[ADJ_CMP2:.*]] = cir.cmp ne %[[AND_ADJ]], %[[NULL]] : !s64i
+// CIR-AFTER-ARM:   %[[OR_PTR_NULL:.*]] = cir.or %[[PTR_NULL]], %[[ADJ_CMP2]] : !cir.bool
+// CIR-AFTER-ARM:   %[[TMP:.*]] = cir.and %[[OR_PTR_NULL]], %[[ADJ_CMP]] : !cir.bool
+// CIR-AFTER:       %[[RESULT:.*]] = cir.or %[[PTR_CMP]], %[[TMP]] : !cir.bool
 
-// LLVM: define {{.*}} i1 @_Z6cmp_neM3FooFviES1_
-// LLVM:   %[[LHS:.*]] = load { i64, i64 }, ptr %{{.*}}
-// LLVM:   %[[RHS:.*]] = load { i64, i64 }, ptr %{{.*}}
-// LLVM:   %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
-// LLVM:   %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
-// LLVM:   %[[PTR_CMP:.*]] = icmp ne i64 %[[LHS_PTR]], %[[RHS_PTR]]
-// LLVM:   %[[PTR_NULL:.*]] = icmp ne i64 %[[LHS_PTR]], 0
-// LLVM:   %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
-// LLVM:   %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
-// LLVM:   %[[ADJ_CMP:.*]] = icmp ne i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
-// LLVM:   %[[TMP:.*]] = and i1 %[[PTR_NULL]], %[[ADJ_CMP]]
-// LLVM:   %[[RESULT:.*]] = or i1 %[[PTR_CMP]], %[[TMP]]
+// LLVM:     define {{.*}} i1 @_Z6cmp_neM3FooFviES1_
+// LLVM:       %[[LHS:.*]] = load { i64, i64 }, ptr %{{.*}}
+// LLVM:       %[[RHS:.*]] = load { i64, i64 }, ptr %{{.*}}
+// LLVM:       %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
+// LLVM:       %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
+// LLVM:       %[[PTR_CMP:.*]] = icmp ne i64 %[[LHS_PTR]], %[[RHS_PTR]]
+// LLVM:       %[[PTR_NULL:.*]] = icmp ne i64 %[[LHS_PTR]], 0
+// LLVM:       %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
+// LLVM:       %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
+// LLVM:       %[[ADJ_CMP:.*]] = icmp ne i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// LLVM-X86:   %[[TMP:.*]] = and i1 %[[PTR_NULL]], %[[ADJ_CMP]]
+// LLVM-ARM:   %[[OR_ADJ:.*]] = or i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// LLVM-ARM:   %[[AND_ADJ:.*]] = and i64 %[[OR_ADJ]], 1
+// LLVM-ARM:   %[[ADJ_CMP2:.*]] = icmp ne i64 %[[AND_ADJ]], 0
+// LLVM-ARM:   %[[OR_PTR_NULL:.*]] = or i1 %[[PTR_NULL]], %[[ADJ_CMP2]]
+// LLVM-ARM:   %[[TMP:.*]] = and i1 %[[OR_PTR_NULL]], %[[ADJ_CMP]]
+// LLVM:       %[[RESULT:.*]] = or i1 %[[PTR_CMP]], %[[TMP]]
 
-// OGCG: define {{.*}} i1 @_Z6cmp_neM3FooFviES1_
-// OGCG:   %[[LHS_TMP:.*]] = alloca { i64, i64 }
-// OGCG:   %[[RHS_TMP:.*]] = alloca { i64, i64 }
-// OGCG:   %[[LHS_ADDR:.*]] = alloca { i64, i64 }
-// OGCG:   %[[RHS_ADDR:.*]] = alloca { i64, i64 }
-// OGCG:   %[[LHS:.*]] = load { i64, i64 }, ptr %[[LHS_ADDR]]
-// OGCG:   %[[RHS:.*]] = load { i64, i64 }, ptr %[[RHS_ADDR]]
-// OGCG:   %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
-// OGCG:   %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
-// OGCG:   %[[PTR_CMP:.*]] = icmp ne i64 %[[LHS_PTR]], %[[RHS_PTR]]
-// OGCG:   %[[PTR_NULL:.*]] = icmp ne i64 %[[LHS_PTR]], 0
-// OGCG:   %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
-// OGCG:   %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
-// OGCG:   %[[ADJ_CMP:.*]] = icmp ne i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
-// OGCG:   %[[TMP:.*]] = and i1 %[[PTR_NULL]], %[[ADJ_CMP]]
-// OGCG:   %[[RESULT:.*]] = or i1 %[[PTR_CMP]], %[[TMP]]
+// OGCG:     define {{.*}} i1 @_Z6cmp_neM3FooFviES1_
+// OGCG:       %[[LHS_TMP:.*]] = alloca { i64, i64 }
+// OGCG:       %[[RHS_TMP:.*]] = alloca { i64, i64 }
+// OGCG:       %[[LHS_ADDR:.*]] = alloca { i64, i64 }
+// OGCG:       %[[RHS_ADDR:.*]] = alloca { i64, i64 }
+// OGCG:       %[[LHS:.*]] = load { i64, i64 }, ptr %[[LHS_ADDR]]
+// OGCG:       %[[RHS:.*]] = load { i64, i64 }, ptr %[[RHS_ADDR]]
+// OGCG:       %[[LHS_PTR:.*]] = extractvalue { i64, i64 } %[[LHS]], 0
+// OGCG:       %[[RHS_PTR:.*]] = extractvalue { i64, i64 } %[[RHS]], 0
+// OGCG:       %[[PTR_CMP:.*]] = icmp ne i64 %[[LHS_PTR]], %[[RHS_PTR]]
+// OGCG:       %[[PTR_NULL:.*]] = icmp ne i64 %[[LHS_PTR]], 0
+// OGCG:       %[[LHS_ADJ:.*]] = extractvalue { i64, i64 } %[[LHS]], 1
+// OGCG:       %[[RHS_ADJ:.*]] = extractvalue { i64, i64 } %[[RHS]], 1
+// OGCG:       %[[ADJ_CMP:.*]] = icmp ne i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// OGCG-X86:   %[[TMP:.*]] = and i1 %[[PTR_NULL]], %[[ADJ_CMP]]
+// OGCG-ARM:   %[[OR_ADJ:.*]] = or i64 %[[LHS_ADJ]], %[[RHS_ADJ]]
+// OGCG-ARM:   %[[AND_ADJ:.*]] = and i64 %[[OR_ADJ]], 1
+// OGCG-ARM:   %[[ADJ_CMP2:.*]] = icmp ne i64 %[[AND_ADJ]], 0
+// OGCG-ARM:   %[[OR_PTR_NULL:.*]] = or i1 %[[PTR_NULL]], %[[ADJ_CMP2]]
+// OGCG-ARM:   %[[TMP:.*]] = and i1 %[[OR_PTR_NULL]], %[[ADJ_CMP]]
+// OGCG:       %[[RESULT:.*]] = or i1 %[[PTR_CMP]], %[[TMP]]
diff --git a/clang/test/CIR/CodeGen/pointer-to-member-func.cpp b/clang/test/CIR/CodeGen/pointer-to-member-func.cpp
index 1388e38882617..def822350a220 100644
--- a/clang/test/CIR/CodeGen/pointer-to-member-func.cpp
+++ b/clang/test/CIR/CodeGen/pointer-to-member-func.cpp
@@ -1,10 +1,21 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t.cir 2> %t-before.cir
 // RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-before.cir %s
-// RUN: FileCheck --check-prefix=CIR-AFTER --input-file=%t.cir %s
+// RUN: FileCheck --check-prefixes=CIR-AFTER,CIR-AFTER-X86 --input-file=%t.cir %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -emit-llvm %s -o %t-cir.ll
-// RUN: FileCheck --input-file=%t-cir.ll --check-prefix=LLVM %s
+// RUN: FileCheck --input-file=%t-cir.ll --check-prefixes=LLVM,LLVM-X86 %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t.ll
-// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+// RUN: FileCheck --check-prefixes=OGCG,OGCG-X86 --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-cir -mmlir -mlir-print-ir-before=cir-cxxabi-lowering %s -o %t-arm.cir 2> %t-arm-before.cir
+// RUN: FileCheck --check-prefix=CIR-BEFORE --input-file=%t-arm-before.cir %s
+// RUN: FileCheck --check-prefixes=CIR-AFTER,CIR-AFTER-ARM --input-file=%t-arm.cir %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -fclangir -emit-llvm %s -o %t-arm-cir.ll
+// RUN: FileCheck --input-file=%t-arm-cir.ll --check-prefixes=LLVM,LLVM-ARM %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -std=c++17 -emit-llvm %s -o %t-arm.ll
+// RUN: FileCheck --check-prefixes=OGCG,OGCG-ARM --input-file=%t-arm.ll %s
+
+// FIXME: Some of the differences between LLVM (via CIR) and OGCG below are
+//        due to calling convention lowering being missing in the CIR path.
 
 struct Foo {
   void m1(int);
@@ -16,23 +27,28 @@ struct Foo {
 void (Foo::*m1_ptr)(int) = &Foo::m1;
 
 // CIR-BEFORE: cir.global external @m1_ptr = #cir.method<@_ZN3Foo2m1Ei> : !cir.method, !s32i)> in !rec_Foo>
-// CIR-AFTER-DAG: cir.global "private" constant cir_private @[[NONVIRT_RET:.*]] = #cir.const_record<{#cir.global_view<@_ZN3Foo2m1Ei> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
-// CIR-AFTER-DAG: cir.global "private" constant cir_private @[[VIRT_RET:.*]] = #cir.const_record<{#cir.int<9> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
-// CIR-AFTER-DAG: cir.global "private" constant cir_private @[[NULL_RET:.*]] = #cir.const_record<{#cir.int<0> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
-// CIR-AFTER: cir.global external @m1_ptr = #cir.const_record<{#cir.global_view<@_ZN3Foo2m1Ei> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
-// LLVM-DAG: @m1_ptr = global { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
-// LLVM-DAG: @[[NONVIRT_RET:.*]] = private constant { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
-// LLVM-DAG: @[[VIRT_RET:.*]] = private constant { i64, i64 } { i64 9, i64 0 }
-// LLVM-DAG: @[[NULL_RET:.*]] = private constant { i64, i64 } zeroinitializer
+// CIR-AFTER-DAG:     cir.global "private" constant cir_private @[[NONVIRT_RET:.*]] = #cir.const_record<{#cir.global_view<@_ZN3Foo2m1Ei> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
+// CIR-AFTER-X86-DAG: cir.global "private" constant cir_private @[[VIRT_RET:.*]] = #cir.const_record<{#cir.int<9> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
+// CIR-AFTER-ARM-DAG: cir.global "private" constant cir_private @[[VIRT_RET:.*]] = #cir.const_record<{#cir.int<8> : !s64i, #cir.int<1> : !s64i}> : !rec_anon_struct
+// CIR-AFTER-DAG:     cir.global "private" constant cir_private @[[NULL_RET:.*]] = #cir.const_record<{#cir.int<0> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
+// CIR-AFTER:         cir.global external @m1_ptr = #cir.const_record<{#cir.global_view<@_ZN3Foo2m1Ei> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
+// LLVM-DAG:     @m1_ptr = global { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
+// LLVM-DAG:     @[[NONVIRT_RET:.*]] = private constant { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
+// LLVM-X86-DAG: @[[VIRT_RET:.*]] = private constant { i64, i64 } { i64 9, i64 0 }
+// LLVM-ARM-DAG: @[[VIRT_RET:.*]] = private constant { i64, i64 } { i64 8, i64 1 }
+// LLVM-DAG:     @[[NULL_RET:.*]] = private constant { i64, i64 } zeroinitializer
 // OGCG: @m1_ptr = global { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
 
 // Global pointer to virtual method
 void (Foo::*m2_ptr)(int) = &Foo::m2;
 
 // CIR-BEFORE: cir.global external @m2_ptr = #cir.method : !cir.method, !s32i)> in !rec_Foo>
-// CIR-AFTER: cir.global external @m2_ptr = #cir.const_record<{#cir.int<1> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
-// LLVM-DAG: @m2_ptr = global { i64, i64 } { i64 1, i64 0 }
-// OGCG: @m2_ptr = global { i64, i64 } { i64 1, i64 0 }
+// CIR-AFTER-X86: cir.global external @m2_ptr = #cir.const_record<{#cir.int<1> : !s64i, #cir.int<0> : !s64i}> : !rec_anon_struct
+// CIR-AFTER-ARM: cir.global external @m2_ptr = #cir.const_record<{#cir.int<0> : !s64i, #cir.int<1> : !s64i}> : !rec_anon_struct
+// LLVM-X86-DAG: @m2_ptr = global { i64, i64 } { i64 1, i64 0 }
+// LLVM-ARM-DAG: @m2_ptr = global { i64, i64 } { i64 0, i64 1 }
+// OGCG-X86: @m2_ptr = global { i64, i64 } { i64 1, i64 0 }
+// OGCG-ARM: @m2_ptr = global { i64, i64 } { i64 0, i64 1 }
 
 // Self-referencing PMF causes a null method.
 long (Foo::*pmf1)(int) = pmf1;
@@ -65,8 +81,14 @@ auto make_non_virtual() -> void (Foo::*)(int) {
 // LLVM:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RETVAL]]
 // LLVM:   ret { i64, i64 } %[[RET]]
 
-// OGCG: define {{.*}} { i64, i64 } @_Z16make_non_virtualv()
-// OGCG:   ret { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
+// OGCG-X86: define {{.*}} { i64, i64 } @_Z16make_non_virtualv()
+// OGCG-X86:   ret { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }
+
+// OGCG-ARM: define {{.*}} [2 x i64] @_Z16make_non_virtualv()
+// OGCG-ARM:   %[[RETVAL:.*]] = alloca { i64, i64 }
+// OGCG-ARM:   store { i64, i64 } { i64 ptrtoint (ptr @_ZN3Foo2m1Ei to i64), i64 0 }, ptr %[[RETVAL]]
+// OGCG-ARM:   %[[RET:.*]] = load [2 x i64], ptr %[[RETVAL]]
+// OGCG-ARM:   ret [2 x i64] %[[RET]]
 
 auto make_virtual() -> void (Foo::*)(int) {
   return &Foo::m3;
@@ -92,8 +114,12 @@ auto make_virtual() -> void (Foo::*)(int) {
 // LLVM:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RETVAL]]
 // LLVM:   ret { i64, i64 } %[[RET]]
 
-// OGCG: define {{.*}} @_Z12make_virtualv()
-// OGCG:   ret { i64, i64 } { i64 9, i64 0 }
+// OGCG:     define {{.*}} @_Z12make_virtualv()
+// OGCG-X86:   ret { i64, i64 } { i64 9, i64 0 }
+// OGCG-ARM:   %[[RETVAL:.*]] = alloca { i64, i64 }
+// OGCG-ARM:   store { i64, i64 } { i64 8, i64 1 }, ptr %[[RETVAL]]
+// OGCG-ARM:   %[[RET:.*]] = load [2 x i64], ptr %[[RETVAL]]
+// OGCG-ARM:   ret [2 x i64] %[[RET]]
 
 auto make_null() -> void (Foo::*)(int) {
   return nullptr;
@@ -119,8 +145,12 @@ auto make_null() -> void (Foo::*)(int) {
 // LLVM:   %[[RET:.*]] = load { i64, i64 }, ptr %[[RETVAL]]
 // LLVM:   ret { i64, i64 } %[[RET]]
 
-// OGCG: define {{.*}} @_Z9make_nullv()
-// OGCG:   ret { i64, i64 } zeroinitializer
+// OGCG:     define {{.*}} @_Z9make_nullv()
+// OGCG-X86:   ret { i64, i64 } zeroinitializer
+// OGCG-ARM:   %[[RETVAL:.*]] = alloca { i64, i64 }
+// OGCG-ARM:   store { i64, i64 } zeroinitializer, ptr %[[RETVAL]]
+// OGCG-ARM:   %[[RET:.*]] = load [2 x i64], ptr %[[RETVAL]]
+// OGCG-ARM:   ret [2 x i64] %[[RET]]
 
 void call(Foo *obj, void (Foo::*func)(int), int arg) {
   (obj->*func)(arg);
@@ -133,76 +163,87 @@ void call(Foo *obj, void (Foo::*func)(int), int arg) {
 // CIR-BEFORE:   %[[ARG:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
 // CIR-BEFORE:   cir.call %[[CALLEE]](%[[THIS]], %[[ARG]]) : (!cir.ptr, !cir.ptr, !s32i)>>, !cir.ptr {{.*}}, !s32i {{.*}}) -> ()
 
-// CIR-AFTER: cir.func {{.*}} @_Z4callP3FooMS_FviEi
-// CIR-AFTER:   %[[OBJ:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr>, !cir.ptr
-// CIR-AFTER:   %[[FUNC:.*]] = cir.load{{.*}} : !cir.ptr, !rec_anon_struct
-// CIR-AFTER:   %[[VIRT_BIT:.*]] = cir.const #cir.int<1> : !s64i
-// CIR-AFTER:   %[[ADJ:.*]] = cir.extract_member %[[FUNC]][1] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[THIS:.*]] = cir.cast bitcast %[[OBJ]] : !cir.ptr -> !cir.ptr
-// CIR-AFTER:   %[[ADJUSTED_THIS:.*]] = cir.ptr_stride %[[THIS]], %[[ADJ]] : (!cir.ptr, !s64i) -> !cir.ptr
-// CIR-AFTER:   %[[METHOD_PTR:.*]] = cir.extract_member %[[FUNC]][0] : !rec_anon_struct -> !s64i
-// CIR-AFTER:   %[[VIRT_BIT_TEST:.*]] = cir.and %[[METHOD_PTR]], %[[VIRT_BIT]] : !s64i
-// CIR-AFTER:   %[[IS_VIRTUAL:.*]] = cir.cmp eq %[[VIRT_BIT_TEST]], %[[VIRT_BIT]] : !s64i
-// CIR-AFTER:   %[[CALLEE:.*]] = cir.ternary(%[[IS_VIRTUAL]], true {
-// CIR-AFTER:     %[[VTABLE_PTR:.*]] = cir.cast bitcast %[[ADJUSTED_THIS]] : !cir.ptr -> !cir.ptr>
-// CIR-AFTER:     %[[VTABLE:.*]] = cir.load %[[VTABLE_PTR]] : !cir.ptr>, !cir.ptr
-// CIR-AFTER:     %[[OFFSET:.*]] = cir.sub %[[METHOD_PTR]], %[[VIRT_BIT]] : !s64i
-// CIR-AFTER:     %[[VTABLE_SLOT:.*]] = cir.ptr_stride %[[VTABLE]], %[[OFFSET]] : (!cir.ptr, !s64i) -> !cir.ptr
-// CIR-AFTER:     %[[VIRTUAL_FN_PTR:.*]] = cir.cast bitcast %[[VTABLE_SLOT]] : !cir.ptr -> !cir.ptr, !cir.ptr, !s32i)>>>
-// CIR-AFTER:     %[[VIRTUAL_FN_PTR_LOAD:.*]] = cir.load %[[VIRTUAL_FN_PTR]] : !cir.ptr, !cir.ptr, !s32i)>>>, !cir.ptr, !cir.ptr, !s32i)>>
-// CIR-AFTER:     cir.yield %[[VIRTUAL_FN_PTR_LOAD]] : !cir.ptr, !cir.ptr, !s32i)>>
-// CIR-AFTER:   }, false {
-// CIR-AFTER:     %[[CALLEE_PTR:.*]] = cir.cast int_to_ptr %[[METHOD_PTR]] : !s64i -> !cir.ptr, !cir.ptr, !s32i)>>
-// CIR-AFTER:     cir.yield %[[CALLEE_PTR]] : !cir.ptr, !cir.ptr, !s32i)>>
-// CIR-AFTER:   }) : (!cir.bool) -> !cir.ptr, !cir.ptr, !s32i)>>
-// CIR-AFTER:   %[[ARG:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
-// CIR-AFTER:   cir.call %[[CALLEE]](%[[ADJUSTED_THIS]], %[[ARG]]) : (!cir.ptr, !cir.ptr, !s32i)>>, !cir.ptr {{.*}}, !s32i {{.*}}) -> ()
-
-// LLVM: define {{.*}} @_Z4callP3FooMS_FviEi
-// LLVM:   %[[OBJ:.*]] = load ptr, ptr %{{.*}}
-// LLVM:   %[[MEMFN_PTR:.*]] = load { i64, i64 }, ptr %{{.*}}
-// LLVM:   %[[THIS_ADJ:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 1
-// LLVM:   %[[ADJUSTED_THIS:.*]] = getelementptr i8, ptr %[[OBJ]], i64 %[[THIS_ADJ]]
-// LLVM:   %[[PTR_FIELD:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 0
-// LLVM:   %[[VIRT_BIT:.*]] = and i64 %[[PTR_FIELD]], 1
-// LLVM:   %[[IS_VIRTUAL:.*]] = icmp eq i64 %[[VIRT_BIT]], 1
-// LLVM:   br i1 %[[IS_VIRTUAL]], label %[[HANDLE_VIRTUAL:.*]], label %[[HANDLE_NON_VIRTUAL:.*]]
-// LLVM: [[HANDLE_VIRTUAL]]:
-// LLVM:   %[[VTABLE:.*]] = load ptr, ptr %[[ADJUSTED_THIS]]
-// LLVM:   %[[OFFSET:.*]] = sub i64 %[[PTR_FIELD]], 1
-// LLVM:   %[[VTABLE_SLOT:.*]] = getelementptr i8, ptr %[[VTABLE]], i64 %[[OFFSET]]
-// LLVM:   %[[VIRTUAL_FN_PTR:.*]] = load ptr, ptr %[[VTABLE_SLOT]]
-// LLVM:   br label %[[CONTINUE:.*]]
-// LLVM: [[HANDLE_NON_VIRTUAL]]:
-// LLVM:   %[[FUNC_PTR:.*]] = inttoptr i64 %[[PTR_FIELD]] to ptr
-// LLVM:   br label %[[CONTINUE]]
-// LLVM: [[CONTINUE]]:
-// LLVM:   %[[CALLEE_PTR:.*]] = phi ptr [ %[[FUNC_PTR]], %[[HANDLE_NON_VIRTUAL]] ], [ %[[VIRTUAL_FN_PTR]], %[[HANDLE_VIRTUAL]] ]
-// LLVM:   %[[ARG:.*]] = load i32, ptr %{{.+}}
-// LLVM:   call void %[[CALLEE_PTR]](ptr {{.*}} %[[ADJUSTED_THIS]], i32 {{.*}} %[[ARG]])
-// LLVM: }
-
-// OGCG: define {{.*}} @_Z4callP3FooMS_FviEi
-// OGCG:   %[[OBJ:.*]] = load ptr, ptr %{{.*}}
-// OGCG:   %[[MEMFN_PTR:.*]] = load { i64, i64 }, ptr %{{.*}}
-// OGCG:   %[[THIS_ADJ:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 1
-// OGCG:   %[[ADJUSTED_THIS:.*]] = getelementptr inbounds i8, ptr %[[OBJ]], i64 %[[THIS_ADJ]]
-// OGCG:   %[[PTR_FIELD:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 0
-// OGCG:   %[[VIRT_BIT:.*]] = and i64 %[[PTR_FIELD]], 1
-// OGCG:   %[[IS_VIRTUAL:.*]] = icmp ne i64 %[[VIRT_BIT]], 0
-// OGCG:   br i1 %[[IS_VIRTUAL]], label %[[HANDLE_VIRTUAL:.*]], label %[[HANDLE_NON_VIRTUAL:.*]]
-// OGCG: [[HANDLE_VIRTUAL]]:
-// OGCG:   %[[VTABLE:.*]] = load ptr, ptr %[[ADJUSTED_THIS]]
-// OGCG:   %[[OFFSET:.*]] = sub i64 %[[PTR_FIELD]], 1
-// OGCG:   %[[VTABLE_SLOT:.*]] = getelementptr i8, ptr %[[VTABLE]], i64 %[[OFFSET]]
-// OGCG:   %[[VIRTUAL_FN_PTR:.*]] = load ptr, ptr %[[VTABLE_SLOT]]
-// OGCG:   br label %[[CONTINUE:.*]]
-// OGCG: [[HANDLE_NON_VIRTUAL]]:
-// OGCG:   %[[FUNC_PTR:.*]] = inttoptr i64 %[[PTR_FIELD]] to ptr
-// OGCG:   br label %[[CONTINUE]]
-// OGCG: [[CONTINUE]]:
-// OGCG:   %[[CALLEE_PTR:.*]] = phi ptr [ %[[VIRTUAL_FN_PTR]], %[[HANDLE_VIRTUAL]] ], [ %[[FUNC_PTR]], %[[HANDLE_NON_VIRTUAL]] ]
-// OGCG:   %[[ARG:.*]] = load i32, ptr %{{.+}}
-// OGCG:   call void %[[CALLEE_PTR]](ptr {{.*}} %[[ADJUSTED_THIS]], i32 {{.*}} %[[ARG]])
-// OGCG: }
-
+// CIR-AFTER:    cir.func {{.*}} @_Z4callP3FooMS_FviEi
+// CIR-AFTER:      %[[OBJ:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr>, !cir.ptr
+// CIR-AFTER:      %[[FUNC:.*]] = cir.load{{.*}} : !cir.ptr, !rec_anon_struct
+// CIR-AFTER:      %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR-AFTER:      %[[ADJ:.*]] = cir.extract_member %[[FUNC]][1] : !rec_anon_struct -> !s64i
+// CIR-AFTER-ARM:  %[[ADJ_SHIFT:.*]] = cir.shift(right, %[[ADJ]] : !s64i, %[[ONE:.*]] : !s64i) -> !s64i
+// CIR-AFTER:      %[[THIS:.*]] = cir.cast bitcast %[[OBJ]] : !cir.ptr -> !cir.ptr
+// CIR-AFTER-X86:  %[[ADJUSTED_THIS:.*]] = cir.ptr_stride %[[THIS]], %[[ADJ]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CIR-AFTER-ARM:  %[[ADJUSTED_THIS:.*]] = cir.ptr_stride %[[THIS]], %[[ADJ_SHIFT]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CIR-AFTER:      %[[METHOD_PTR:.*]] = cir.extract_member %[[FUNC]][0] : !rec_anon_struct -> !s64i
+// CIR-AFTER-X86:  %[[VIRT_BIT_TEST:.*]] = cir.and %[[METHOD_PTR]], %[[ONE]] : !s64i
+// CIR-AFTER-ARM:  %[[VIRT_BIT_TEST:.*]] = cir.and %[[ADJ]], %[[ONE]] : !s64i
+// CIR-AFTER:      %[[IS_VIRTUAL:.*]] = cir.cmp eq %[[VIRT_BIT_TEST]], %[[ONE]] : !s64i
+// CIR-AFTER:      %[[CALLEE:.*]] = cir.ternary(%[[IS_VIRTUAL]], true {
+// CIR-AFTER:        %[[VTABLE_PTR:.*]] = cir.cast bitcast %[[ADJUSTED_THIS]] : !cir.ptr -> !cir.ptr>
+// CIR-AFTER:        %[[VTABLE:.*]] = cir.load %[[VTABLE_PTR]] : !cir.ptr>, !cir.ptr
+// CIR-AFTER-X86:    %[[OFFSET:.*]] = cir.sub %[[METHOD_PTR]], %[[ONE]] : !s64i
+// CIR-AFTER-X86:    %[[VTABLE_SLOT:.*]] = cir.ptr_stride %[[VTABLE]], %[[OFFSET]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CIR-AFTER-ARM:    %[[VTABLE_SLOT:.*]] = cir.ptr_stride %[[VTABLE]], %[[METHOD_PTR]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CIR-AFTER:        %[[VIRTUAL_FN_PTR:.*]] = cir.cast bitcast %[[VTABLE_SLOT]] : !cir.ptr -> !cir.ptr, !cir.ptr, !s32i)>>>
+// CIR-AFTER:        %[[VIRTUAL_FN_PTR_LOAD:.*]] = cir.load %[[VIRTUAL_FN_PTR]] : !cir.ptr, !cir.ptr, !s32i)>>>, !cir.ptr, !cir.ptr, !s32i)>>
+// CIR-AFTER:        cir.yield %[[VIRTUAL_FN_PTR_LOAD]] : !cir.ptr, !cir.ptr, !s32i)>>
+// CIR-AFTER:      }, false {
+// CIR-AFTER:        %[[CALLEE_PTR:.*]] = cir.cast int_to_ptr %[[METHOD_PTR]] : !s64i -> !cir.ptr, !cir.ptr, !s32i)>>
+// CIR-AFTER:        cir.yield %[[CALLEE_PTR]] : !cir.ptr, !cir.ptr, !s32i)>>
+// CIR-AFTER:      }) : (!cir.bool) -> !cir.ptr, !cir.ptr, !s32i)>>
+// CIR-AFTER:      %[[ARG:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
+// CIR-AFTER:      cir.call %[[CALLEE]](%[[ADJUSTED_THIS]], %[[ARG]]) : (!cir.ptr, !cir.ptr, !s32i)>>, !cir.ptr {{.*}}, !s32i {{.*}}) -> ()
+
+// LLVM:     define {{.*}} @_Z4callP3FooMS_FviEi
+// LLVM:       %[[OBJ:.*]] = load ptr, ptr %{{.*}}
+// LLVM:       %[[MEMFN_PTR:.*]] = load { i64, i64 }, ptr %{{.*}}
+// LLVM:       %[[THIS_ADJ:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 1
+// LLVM-X86:   %[[ADJUSTED_THIS:.*]] = getelementptr i8, ptr %[[OBJ]], i64 %[[THIS_ADJ]]
+// LLVM-ARM:   %[[THIS_ADJ_SHIFT:.*]] = ashr i64 %[[THIS_ADJ]], 1
+// LLVM-ARM:   %[[ADJUSTED_THIS:.*]] = getelementptr i8, ptr %[[OBJ]], i64 %[[THIS_ADJ_SHIFT]]
+// LLVM:       %[[PTR_FIELD:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 0
+// LLVM-ARM:   %[[VIRT_BIT:.*]] = and i64 %[[THIS_ADJ]], 1
+// LLVM-X86:   %[[VIRT_BIT:.*]] = and i64 %[[PTR_FIELD]], 1
+// LLVM:       %[[IS_VIRTUAL:.*]] = icmp eq i64 %[[VIRT_BIT]], 1
+// LLVM:       br i1 %[[IS_VIRTUAL]], label %[[HANDLE_VIRTUAL:.*]], label %[[HANDLE_NON_VIRTUAL:.*]]
+// LLVM:     [[HANDLE_VIRTUAL]]:
+// LLVM:       %[[VTABLE:.*]] = load ptr, ptr %[[ADJUSTED_THIS]]
+// LLVM-X86:   %[[OFFSET:.*]] = sub i64 %[[PTR_FIELD]], 1
+// LLVM-X86:   %[[VTABLE_SLOT:.*]] = getelementptr i8, ptr %[[VTABLE]], i64 %[[OFFSET]]
+// LLVM-ARM:   %[[VTABLE_SLOT:.*]] = getelementptr i8, ptr %[[VTABLE]], i64 %[[PTR_FIELD]]
+// LLVM:       %[[VIRTUAL_FN_PTR:.*]] = load ptr, ptr %[[VTABLE_SLOT]]
+// LLVM:       br label %[[CONTINUE:.*]]
+// LLVM:     [[HANDLE_NON_VIRTUAL]]:
+// LLVM:       %[[FUNC_PTR:.*]] = inttoptr i64 %[[PTR_FIELD]] to ptr
+// LLVM:       br label %[[CONTINUE]]
+// LLVM:     [[CONTINUE]]:
+// LLVM:       %[[CALLEE_PTR:.*]] = phi ptr [ %[[FUNC_PTR]], %[[HANDLE_NON_VIRTUAL]] ], [ %[[VIRTUAL_FN_PTR]], %[[HANDLE_VIRTUAL]] ]
+// LLVM:       %[[ARG:.*]] = load i32, ptr %{{.+}}
+// LLVM:       call void %[[CALLEE_PTR]](ptr {{.*}} %[[ADJUSTED_THIS]], i32 {{.*}} %[[ARG]])
+// LLVM:     }
+
+// OGCG:     define {{.*}} @_Z4callP3FooMS_FviEi
+// OGCG:       %[[OBJ:.*]] = load ptr, ptr %{{.*}}
+// OGCG:       %[[MEMFN_PTR:.*]] = load { i64, i64 }, ptr %{{.*}}
+// OGCG:       %[[THIS_ADJ:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 1
+// OGCG-X86:   %[[ADJUSTED_THIS:.*]] = getelementptr inbounds i8, ptr %[[OBJ]], i64 %[[THIS_ADJ]]
+// OGCG-ARM:   %[[THIS_ADJ_SHIFT:.*]] = ashr i64 %[[THIS_ADJ]], 1
+// OGCG-ARM:   %[[ADJUSTED_THIS:.*]] = getelementptr inbounds i8, ptr %[[OBJ]], i64 %[[THIS_ADJ_SHIFT]]
+// OGCG:       %[[PTR_FIELD:.*]] = extractvalue { i64, i64 } %[[MEMFN_PTR]], 0
+// OGCG-X86:   %[[VIRT_BIT:.*]] = and i64 %[[PTR_FIELD]], 1
+// OGCG-ARM:   %[[VIRT_BIT:.*]] = and i64 %[[THIS_ADJ]], 1
+// OGCG:       %[[IS_VIRTUAL:.*]] = icmp ne i64 %[[VIRT_BIT]], 0
+// OGCG:       br i1 %[[IS_VIRTUAL]], label %[[HANDLE_VIRTUAL:.*]], label %[[HANDLE_NON_VIRTUAL:.*]]
+// OGCG:     [[HANDLE_VIRTUAL]]:
+// OGCG:       %[[VTABLE:.*]] = load ptr, ptr %[[ADJUSTED_THIS]]
+// OGCG-X86:   %[[OFFSET:.*]] = sub i64 %[[PTR_FIELD]], 1
+// OGCG-X86:   %[[VTABLE_SLOT:.*]] = getelementptr i8, ptr %[[VTABLE]], i64 %[[OFFSET]]
+// OGCG-ARM:   %[[VTABLE_SLOT:.*]] = getelementptr i8, ptr %[[VTABLE]], i64 %[[PTR_FIELD]]
+// OGCG:       %[[VIRTUAL_FN_PTR:.*]] = load ptr, ptr %[[VTABLE_SLOT]]
+// OGCG:       br label %[[CONTINUE:.*]]
+// OGCG:     [[HANDLE_NON_VIRTUAL]]:
+// OGCG:       %[[FUNC_PTR:.*]] = inttoptr i64 %[[PTR_FIELD]] to ptr
+// OGCG:       br label %[[CONTINUE]]
+// OGCG:     [[CONTINUE]]:
+// OGCG:       %[[CALLEE_PTR:.*]] = phi ptr [ %[[VIRTUAL_FN_PTR]], %[[HANDLE_VIRTUAL]] ], [ %[[FUNC_PTR]], %[[HANDLE_NON_VIRTUAL]] ]
+// OGCG:       %[[ARG:.*]] = load i32, ptr %{{.+}}
+// OGCG:       call void %[[CALLEE_PTR]](ptr {{.*}} %[[ADJUSTED_THIS]], i32 {{.*}} %[[ARG]])
+// OGCG:     }

From 8e6cb677dec7d36de339a4e215555b96449c2b0c Mon Sep 17 00:00:00 2001
From: LumioseSil 
Date: Tue, 12 May 2026 16:28:54 -0400
Subject: [PATCH 505/538] Replace m_SpecificInt(1) with m_One in places where
 poison sensitivity is not a problem (NFC) (#196838)

These are from places where I know poison sensitivity is not an issue.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp          |  4 ++--
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 10 +++++-----
 llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp   |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 48400dadbe352..91c3542eba90d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12314,8 +12314,8 @@ SDValue DAGCombiner::foldCTLZToCTLS(SDValue Src, const SDLoc &DL) {
   bool NeedAdd = true;
 
   SDValue X;
-  if (sd_match(Src, m_OneUse(m_Or(m_OneUse(m_Shl(m_Value(X), m_SpecificInt(1))),
-                                  m_SpecificInt(1))))) {
+  if (sd_match(Src,
+               m_OneUse(m_Or(m_OneUse(m_Shl(m_Value(X), m_One())), m_One())))) {
     NeedAdd = false;
     Src = X;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 2e3e7b73ba390..42cb253209495 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2713,8 +2713,8 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
   Value *IndexBase;
-  if (match(Index, m_Intrinsic(
-                       m_Value(IndexBase), m_SpecificInt(1)))) {
+  if (match(Index, m_Intrinsic(m_Value(IndexBase),
+                                                             m_One()))) {
     Align Alignment =
         BasePtr->getPointerAlignment(II.getDataLayout());
 
@@ -2741,8 +2741,8 @@ instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
   // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
   // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
   Value *IndexBase;
-  if (match(Index, m_Intrinsic(
-                       m_Value(IndexBase), m_SpecificInt(1)))) {
+  if (match(Index, m_Intrinsic(m_Value(IndexBase),
+                                                             m_One()))) {
     Align Alignment =
         BasePtr->getPointerAlignment(II.getDataLayout());
 
@@ -3358,7 +3358,7 @@ bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
   // m_ZExtOrSExt matched.
   Instruction *Ex1, *Ex2;
   if (!(match(Add, m_c_Add(m_Instruction(Ex1),
-                           m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
+                           m_c_Add(m_Instruction(Ex2), m_One())))))
     return false;
 
   // Ensure both extends are of the same type
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index b920caf737df0..9a61e165a7d73 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1603,7 +1603,7 @@ class LowerMatrixIntrinsics {
                   m_Load(m_Value()),
                   m_CombineOr(m_Intrinsic(),
                               m_Intrinsic(
-                                  m_Value(), m_SpecificInt(1))))));
+                                  m_Value(), m_One())))));
     };
     // Returns the cost benefit of using \p Op with the dot product lowering. If
     // the returned cost is < 0, the argument is cheaper to use in the

From 2efd5301e49ac61de908c04be5b609b92b01131b Mon Sep 17 00:00:00 2001
From: Ryan Buchner 
Date: Tue, 12 May 2026 13:29:53 -0700
Subject: [PATCH 506/538] [Support] Always scale InstructionCost::Value
 (#178962)

Allows fractional InstructionCosts up to a fixed granularity with little overhead.
This will allow for more accurate division results and will support finer granularity
of TTI costing.

Before:
InstructionCost(2) / 4 = 0

After (with ScalingFactor 4):
InstructionCost(2) / 4 = 1 / 2

Also, there is a decrease in the maximum value of InstructionCost, as
the largest representable value is now `std::numeric_limits<CostType>::max() /
ScalingFactor`.

Addresses #174429
---
 llvm/include/llvm/Support/InstructionCost.h   |  24 +++-
 llvm/lib/Support/InstructionCost.cpp          |  20 +++-
 .../PowerPC/LoopnestFixedSize.ll              |   2 +-
 .../interchange-refcost-overflow.ll           |   6 +-
 .../LoopVectorize/AArch64/call-costs.ll       |  79 +++++++++---
 .../LoopVectorize/ARM/scalar-block-cost.ll    |   2 +-
 .../LoopVectorize/RISCV/predicated-costs.ll   |  32 +++--
 .../SystemZ/branch-for-predicated-block.ll    |   2 +-
 .../X86/CostModel/gather-i16-with-i8-index.ll |   2 +-
 .../X86/CostModel/gather-i32-with-i8-index.ll |   2 +-
 .../X86/CostModel/gather-i64-with-i8-index.ll |   2 +-
 .../X86/CostModel/gather-i8-with-i8-index.ll  |   2 +-
 ...dle-iptr-with-data-layout-to-not-assert.ll |   2 +-
 .../interleaved-load-f32-stride-2.ll          |   2 +-
 .../interleaved-load-f32-stride-3.ll          |   4 +-
 .../interleaved-load-f32-stride-4.ll          |   2 +-
 .../interleaved-load-f32-stride-5.ll          |   4 +-
 .../interleaved-load-f32-stride-6.ll          |   2 +-
 .../interleaved-load-f32-stride-7.ll          |   6 +-
 .../interleaved-load-f32-stride-8.ll          |   2 +-
 .../interleaved-load-f64-stride-2.ll          |   2 +-
 .../interleaved-load-f64-stride-3.ll          |   4 +-
 .../interleaved-load-f64-stride-4.ll          |   2 +-
 .../interleaved-load-f64-stride-5.ll          |   4 +-
 .../interleaved-load-f64-stride-6.ll          |   2 +-
 .../interleaved-load-f64-stride-7.ll          |   6 +-
 .../interleaved-load-f64-stride-8.ll          |   2 +-
 .../X86/CostModel/interleaved-load-half.ll    |   2 +-
 .../interleaved-load-i16-stride-2.ll          |   2 +-
 .../interleaved-load-i16-stride-3.ll          |   4 +-
 .../interleaved-load-i16-stride-4.ll          |   2 +-
 .../interleaved-load-i16-stride-5.ll          |   4 +-
 .../interleaved-load-i16-stride-6.ll          |   2 +-
 .../interleaved-load-i16-stride-7.ll          |   6 +-
 .../interleaved-load-i16-stride-8.ll          |   2 +-
 ...nterleaved-load-i32-stride-2-indices-0u.ll |   2 +-
 .../interleaved-load-i32-stride-2.ll          |   2 +-
 ...terleaved-load-i32-stride-3-indices-01u.ll |   2 +-
 ...terleaved-load-i32-stride-3-indices-0uu.ll |   2 +-
 .../interleaved-load-i32-stride-3.ll          |   4 +-
 ...erleaved-load-i32-stride-4-indices-012u.ll |   6 +-
 ...erleaved-load-i32-stride-4-indices-01uu.ll |   2 +-
 ...erleaved-load-i32-stride-4-indices-0uuu.ll |   2 +-
 .../interleaved-load-i32-stride-4.ll          |   2 +-
 .../interleaved-load-i32-stride-5.ll          |   4 +-
 .../interleaved-load-i32-stride-6.ll          |   2 +-
 .../interleaved-load-i32-stride-7.ll          |   6 +-
 .../interleaved-load-i32-stride-8.ll          |   2 +-
 .../interleaved-load-i64-stride-2.ll          |   2 +-
 .../interleaved-load-i64-stride-3.ll          |   4 +-
 .../interleaved-load-i64-stride-4.ll          |   2 +-
 .../interleaved-load-i64-stride-5.ll          |   4 +-
 .../interleaved-load-i64-stride-6.ll          |   2 +-
 .../interleaved-load-i64-stride-7.ll          |   6 +-
 .../interleaved-load-i64-stride-8.ll          |   2 +-
 .../CostModel/interleaved-load-i8-stride-2.ll |   2 +-
 .../CostModel/interleaved-load-i8-stride-3.ll |   2 +-
 .../CostModel/interleaved-load-i8-stride-4.ll |   2 +-
 .../CostModel/interleaved-load-i8-stride-5.ll |   4 +-
 .../CostModel/interleaved-load-i8-stride-6.ll |   2 +-
 .../CostModel/interleaved-load-i8-stride-7.ll |   6 +-
 .../CostModel/interleaved-load-i8-stride-8.ll |   2 +-
 .../interleaved-store-f32-stride-2.ll         |   2 +-
 .../interleaved-store-f32-stride-3.ll         |   2 +-
 .../interleaved-store-f32-stride-4.ll         |   2 +-
 .../interleaved-store-f32-stride-5.ll         |   2 +-
 .../interleaved-store-f32-stride-6.ll         |   2 +-
 .../interleaved-store-f32-stride-7.ll         |   2 +-
 .../interleaved-store-f32-stride-8.ll         |   2 +-
 .../interleaved-store-f64-stride-2.ll         |   2 +-
 .../interleaved-store-f64-stride-3.ll         |   2 +-
 .../interleaved-store-f64-stride-4.ll         |   2 +-
 .../interleaved-store-f64-stride-5.ll         |   2 +-
 .../interleaved-store-f64-stride-6.ll         |   2 +-
 .../interleaved-store-f64-stride-7.ll         |   2 +-
 .../interleaved-store-f64-stride-8.ll         |   2 +-
 .../interleaved-store-i16-stride-2.ll         |   2 +-
 .../interleaved-store-i16-stride-3.ll         |   2 +-
 .../interleaved-store-i16-stride-4.ll         |   2 +-
 .../interleaved-store-i16-stride-5.ll         |   2 +-
 .../interleaved-store-i16-stride-6.ll         |   2 +-
 .../interleaved-store-i16-stride-7.ll         |   2 +-
 .../interleaved-store-i16-stride-8.ll         |   2 +-
 .../interleaved-store-i32-stride-2.ll         |   2 +-
 .../interleaved-store-i32-stride-3.ll         |   2 +-
 .../interleaved-store-i32-stride-4.ll         |   2 +-
 .../interleaved-store-i32-stride-5.ll         |   2 +-
 .../interleaved-store-i32-stride-6.ll         |   2 +-
 .../interleaved-store-i32-stride-7.ll         |   2 +-
 .../interleaved-store-i32-stride-8.ll         |   2 +-
 .../interleaved-store-i64-stride-2.ll         |   2 +-
 .../interleaved-store-i64-stride-3.ll         |   2 +-
 .../interleaved-store-i64-stride-4.ll         |   2 +-
 .../interleaved-store-i64-stride-5.ll         |   2 +-
 .../interleaved-store-i64-stride-6.ll         |   2 +-
 .../interleaved-store-i64-stride-7.ll         |   2 +-
 .../interleaved-store-i64-stride-8.ll         |   2 +-
 .../interleaved-store-i8-stride-2.ll          |   2 +-
 .../interleaved-store-i8-stride-3.ll          |   2 +-
 .../interleaved-store-i8-stride-4.ll          |   2 +-
 .../interleaved-store-i8-stride-5.ll          |   2 +-
 .../interleaved-store-i8-stride-6.ll          |   2 +-
 .../interleaved-store-i8-stride-7.ll          |   2 +-
 .../interleaved-store-i8-stride-8.ll          |   2 +-
 .../masked-gather-i32-with-i8-index.ll        |   2 +-
 .../masked-gather-i64-with-i8-index.ll        |   2 +-
 .../CostModel/masked-interleaved-load-i16.ll  |   2 +-
 .../CostModel/masked-interleaved-store-i16.ll |  20 +---
 .../X86/CostModel/masked-load-i16.ll          |   2 +-
 .../X86/CostModel/masked-load-i32.ll          |   2 +-
 .../X86/CostModel/masked-load-i64.ll          |   2 +-
 .../X86/CostModel/masked-load-i8.ll           |   2 +-
 .../masked-scatter-i32-with-i8-index.ll       |  12 +-
 .../masked-scatter-i64-with-i8-index.ll       |   8 +-
 .../X86/CostModel/masked-store-i16.ll         |   6 +-
 .../X86/CostModel/masked-store-i32.ll         |   6 +-
 .../X86/CostModel/masked-store-i64.ll         |   4 +-
 .../X86/CostModel/masked-store-i8.ll          |  14 +--
 .../CostModel/scatter-i16-with-i8-index.ll    |   2 +-
 .../CostModel/scatter-i32-with-i8-index.ll    |   2 +-
 .../CostModel/scatter-i64-with-i8-index.ll    |   2 +-
 .../X86/CostModel/scatter-i8-with-i8-index.ll |   2 +-
 .../LoopVectorize/X86/cost-model-i386.ll      |  49 +++++++-
 .../LoopVectorize/X86/masked_load_store.ll    |  65 ++++++----
 .../LoopVectorize/X86/predicate-switch.ll     | 112 +++++++++++++++---
 .../unittests/Support/InstructionCostTest.cpp |  19 +++
 126 files changed, 486 insertions(+), 260 deletions(-)

diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h
index 507c16666b958..5f8c64b39057a 100644
--- a/llvm/include/llvm/Support/InstructionCost.h
+++ b/llvm/include/llvm/Support/InstructionCost.h
@@ -59,6 +59,9 @@ class InstructionCost {
       State = Invalid;
   }
 
+  // Matches GCC, can use shift rather than multiply/divide to scale
+  static constexpr CostType CostGranularity = 4;
+
   static constexpr CostType MaxValue = std::numeric_limits::max();
   static constexpr CostType MinValue = std::numeric_limits::min();
 
@@ -67,7 +70,12 @@ class InstructionCost {
   InstructionCost() = default;
 
   InstructionCost(CostState) = delete;
-  InstructionCost(CostType Val) : Value(Val), State(Valid) {}
+  InstructionCost(CostType Val) : Value(), State(Valid) {
+    InstructionCost::CostType Result;
+    if (MulOverflow(Val, CostGranularity, Result))
+      Result = Val > 0 ? MaxValue : MinValue;
+    Value = Result;
+  }
 
   static InstructionCost getMax() { return MaxValue; }
   static InstructionCost getMin() { return MinValue; }
@@ -87,7 +95,7 @@ class InstructionCost {
   /// and comparisons.
   CostType getValue() const {
     assert(isValid());
-    return Value;
+    return Value / CostGranularity;
   }
 
   /// For all of the arithmetic operators provided here any invalid state is
@@ -141,6 +149,8 @@ class InstructionCost {
         Result = MaxValue;
       else
         Result = MinValue;
+    } else {
+      Result /= CostGranularity;
     }
 
     Value = Result;
@@ -155,13 +165,17 @@ class InstructionCost {
 
   InstructionCost &operator/=(const InstructionCost &RHS) {
     propagateState(RHS);
-    Value /= RHS.Value;
+    // Saturating multiply.
+    InstructionCost::CostType Result;
+    if (MulOverflow(Value, CostGranularity, Result))
+      Result = Value > 0 ? MaxValue : MinValue;
+    Result /= RHS.Value;
+    Value = Result;
     return *this;
   }
 
   InstructionCost &operator/=(const CostType RHS) {
-    InstructionCost RHS2(RHS);
-    *this /= RHS2;
+    Value /= RHS;
     return *this;
   }
 
diff --git a/llvm/lib/Support/InstructionCost.cpp b/llvm/lib/Support/InstructionCost.cpp
index c485ce9107af9..3a12e0bac497f 100644
--- a/llvm/lib/Support/InstructionCost.cpp
+++ b/llvm/lib/Support/InstructionCost.cpp
@@ -12,13 +12,27 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/InstructionCost.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
 void InstructionCost::print(raw_ostream &OS) const {
-  if (isValid())
-    OS << Value;
-  else
+  using UnsignedCostType = std::make_unsigned_t;
+  if (isValid()) {
+    UnsignedCostType AbsValue =
+        (Value < 0) ? -((UnsignedCostType)Value) : ((UnsignedCostType)Value);
+    UnsignedCostType WholeNumber = AbsValue / CostGranularity;
+    UnsignedCostType Remainder = AbsValue % CostGranularity;
+    if (Value < 0)
+      OS << "-";
+    UnsignedCostType RemainderHundreds = (Remainder * 100) / CostGranularity;
+    while (RemainderHundreds % 10 == 0 && RemainderHundreds)
+      RemainderHundreds /= 10;
+    OS << WholeNumber;
+    if (RemainderHundreds)
+      OS << "." << RemainderHundreds;
+  } else {
     OS << "Invalid";
+  }
 }
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
index 5209d290c83da..02b09c0b3c684 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -83,7 +83,7 @@ for.end13:                                        ; preds = %for.inc11
 
 declare ptr @func_with_returned_arg(ptr returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 2112128815104000000
+; CHECK: Loop 'for.body' has cost = 2305843009213693951.75
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
index 52a530b2feebb..90dd96d322e92 100644
--- a/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/interchange-refcost-overflow.ll
@@ -10,9 +10,9 @@
 ;         A[c][d][e] = 0;
 ; }
 
-; CHECK: Loop 'outer.loop' has cost = 9223372036854775807
-; CHECK: Loop 'middle.loop' has cost = 9223372036854775807
-; CHECK: Loop 'inner.loop' has cost = 9223372036854775807
+; CHECK: Loop 'outer.loop' has cost = 2305843009213693951.75
+; CHECK: Loop 'middle.loop' has cost = 2305843009213693951.75
+; CHECK: Loop 'inner.loop' has cost = 2305843009213693951.75
 
 @A = local_unnamed_addr global [11 x [11 x [11 x i32]]] zeroinitializer, align 16
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index d01e860fbc04f..1f6c7ec5c317c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -104,26 +104,77 @@ exit:
 define void @call_scalarized(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-LABEL: define void @call_scalarized(
 ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 100, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE8:.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = sub i64 100, [[INDEX]]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[GEP_SRC]], align 8
-; CHECK-NEXT:    [[CMP295:%.*]] = fcmp une double [[L]], 4.000000e+00
-; CHECK-NEXT:    [[CMP299:%.*]] = fcmp ugt double [[L]], 0.000000e+00
-; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[CMP295]], [[CMP299]]
-; CHECK-NEXT:    br i1 [[OR_COND]], label %[[LOOP_LATCH]], label %[[THEN:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[L]])
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr double, ptr [[GEP_SRC]], i64 -1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[GEP_SRC]], i64 -3
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <2 x double> [[WIDE_LOAD]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <2 x double> [[WIDE_LOAD1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp une <2 x double> [[REVERSE]], splat (double 4.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp une <2 x double> [[REVERSE2]], splat (double 4.000000e+00)
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ugt <2 x double> [[REVERSE]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp ugt <2 x double> [[REVERSE2]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i1> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i1> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[TMP10]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true)
+; CHECK-NEXT:    [[OR_COND:%.*]] = extractelement <2 x i1> [[TMP12]], i64 0
+; CHECK-NEXT:    br i1 [[OR_COND]], label %[[LOOP_LATCH:.*]], label %[[THEN:.*]]
+; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L:%.*]] = extractelement <2 x double> [[REVERSE]], i64 0
+; CHECK-NEXT:    [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[L]])
 ; CHECK-NEXT:    store double [[SQRT]], ptr [[GEP_DST]], align 8
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK-NEXT:    br label %[[THEN]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP12]], i64 1
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK:       [[PRED_STORE_IF3]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[IV]], -1
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], -1
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[REVERSE]], i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = call double @llvm.sqrt.f64(double [[TMP24]])
+; CHECK-NEXT:    store double [[TMP25]], ptr [[TMP23]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; CHECK:       [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i1> [[TMP13]], i64 0
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[EXIT:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[IV]], -2
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], -1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x double> [[REVERSE2]], i64 0
+; CHECK-NEXT:    [[TMP31:%.*]] = call double @llvm.sqrt.f64(double [[TMP30]])
+; CHECK-NEXT:    store double [[TMP31]], ptr [[TMP29]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; CHECK:       [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i1> [[TMP13]], i64 1
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8]]
+; CHECK:       [[PRED_STORE_IF7]]:
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[IV]], -3
+; CHECK-NEXT:    [[TMP34:%.*]] = add i64 [[TMP33]], -1
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <2 x double> [[REVERSE2]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = call double @llvm.sqrt.f64(double [[TMP36]])
+; CHECK-NEXT:    store double [[TMP37]], ptr [[TMP35]], align 8
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; CHECK:       [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT1:.*]]
+; CHECK:       [[EXIT1]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index b0c6a48ec6668..8ebb04dd671dc 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -57,7 +57,7 @@ define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %inc = add nsw i32 %i.032, 1
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %end
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK-COST-2-NEXT: LV: Scalar loop costs: 8.
+; CHECK-COST-2-NEXT: LV: Scalar loop costs: 8.5.
 
 entry:
   %cmp31 = icmp slt i32 %start, %end
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll
index e0ab30b0ae5cc..2316a478becc5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/predicated-costs.ll
@@ -8,23 +8,29 @@
 define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) {
 ; CHECK-LABEL: define void @nested(
 ; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; CHECK-NEXT:    br i1 [[C0]], label %[[THEN_0:.*]], label %[[LATCH]]
-; CHECK:       [[THEN_0]]:
-; CHECK-NEXT:    br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[THEN_1:.*]]
 ; CHECK:       [[THEN_1]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[LOOP]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[THEN_1]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i32 [ 1024, %[[LOOP]] ], [ [[AVL_NEXT:%.*]], %[[THEN_1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[P0]], i32 [[IV1]]
-; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i32 [[X]]
-; CHECK-NEXT:    store i32 0, ptr [[GEP1]], align 4
-; CHECK-NEXT:    br label %[[LATCH]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[GEP2]], <vscale x 4 x i1> [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[P1]], <vscale x 4 x i32> [[VP_OP_LOAD]]
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> align 4 [[TMP3]], <vscale x 4 x i1> [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[IV1]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LATCH:.*]], label %[[THEN_1]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -80,7 +86,7 @@ define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) {
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
index 7678c06a04759..8d6bbf2c8369b 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
@@ -33,7 +33,7 @@ for.end.loopexit:
   ret void
 
 ; CHECK: Cost of 1 for VF 2: profitable to scalarize   store i32 %sub, ptr %arrayidx, align 4
-; CHECK: Cost of 2 for VF 2: profitable to scalarize   %sub = sub nsw i32 0, %l
+; CHECK: Cost of 2.5 for VF 2: profitable to scalarize   %sub = sub nsw i32 0, %l
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%cmp55> = icmp sgt ir<%l>, ir<0>
 ; CHECK: Cost of 0 for VF 2: vector loop backedge
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i16-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i16-with-i8-index.ll
index 93d25a7ae9469..8cf210ad4f2c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i16-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i16-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB = load i16, ptr %inB, align 2"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB = load i16, ptr %inB, align 2"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i32-with-i8-index.ll
index a23d57cfd4448..a44654e157a49 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i32-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB = load i32, ptr %inB, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB = load i32, ptr %inB, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i64-with-i8-index.ll
index 5e52483aa93b3..7040087e4b400 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i64-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB = load i64, ptr %inB, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB = load i64, ptr %inB, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i8-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i8-with-i8-index.ll
index 81d646715160a..d51a6b341b8cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i8-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/gather-i8-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB = load i8, ptr %inB, align 1"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB = load i8, ptr %inB, align 1"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/handle-iptr-with-data-layout-to-not-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/handle-iptr-with-data-layout-to-not-assert.ll
index eb87b420ff654..c0e70d2033cf4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/handle-iptr-with-data-layout-to-not-assert.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/handle-iptr-with-data-layout-to-not-assert.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store ptr" --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at" --version 5
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store ptr" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at" --version 5
 ; REQUIRES: asserts
 ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll
index 5c43db13f12ca..b97ddcd649af3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
index a90795399544f..930e47a60c230 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -60,7 +60,7 @@ define void @test() {
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
-; AVX512:  Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
+; AVX512:  Cost of 6.5 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll
index 9fb4c01abe813..3b6d6e1f33b0f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
index 0be2dc2c43504..a8e5ee9f4a33f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:    ir<%v2> = load from index 2
 ; AVX512:    ir<%v3> = load from index 3
 ; AVX512:    ir<%v4> = load from index 4
-; AVX512:  Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
+; AVX512:  Cost of 9.5 for VF 4: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll
index 6b64df947f103..1df2bb2d6880f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
index 9f044950e99f0..1494459b99fc5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -45,7 +45,7 @@ define void @test() {
 ; AVX512:    ir<%v4> = load from index 4
 ; AVX512:    ir<%v5> = load from index 5
 ; AVX512:    ir<%v6> = load from index 6
-; AVX512:  Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 12.5 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
@@ -53,7 +53,7 @@ define void @test() {
 ; AVX512:    ir<%v4> = load from index 4
 ; AVX512:    ir<%v5> = load from index 5
 ; AVX512:    ir<%v6> = load from index 6
-; AVX512:  Cost of 35 for VF 8: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 35.5 for VF 8: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll
index 9c0cbfd45d415..663f22aff36ea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-2.ll
index b3cc8811542f5..d4346359fa249 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
index c847f70e41b15..530cd8e95bc8b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -53,7 +53,7 @@ define void @test() {
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
-; AVX512:  Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
+; AVX512:  Cost of 6.5 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll
index d88c58571c773..bc69c76def03e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
index f0d72a118d2f2..969bda6306bc4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -37,7 +37,7 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512:  Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
+; AVX512:  Cost of 9.5 for VF 2: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll
index d43a3b3746783..3f023fe47d657 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
index 0d3f3b708ce08..21597b800dcd6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -37,7 +37,7 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512:  Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 12.5 for VF 2: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
@@ -45,7 +45,7 @@ define void @test() {
 ; AVX512:    ir<%v4> = load from index 4
 ; AVX512:    ir<%v5> = load from index 5
 ; AVX512:    ir<%v6> = load from index 6
-; AVX512:  Cost of 35 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 35.5 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll
index f739aadf7e138..68d0c17a3b936 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll
index 268fc58795135..53d770aa7c8af 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF 32: INTERLEAVE-GROUP with factor [0-9]+ at %0," --version 5
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(.[0-9]+)? for VF 32: INTERLEAVE-GROUP with factor [0-9]+ at %0," --version 5
 ; RUN: opt -S -passes=loop-vectorize -debug-only=loop-vectorize -mattr=avx512fp16 %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll
index d589c446f5330..e107164f7276f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
index 222dac04dc019..97f7874b05c01 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -92,7 +92,7 @@ define void @test() {
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
-; AVX512BW:  Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
+; AVX512BW:  Cost of 9.5 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll
index 5d822e5824e0f..2af8a1ff785c6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
index 3cdc803c73098..da12b39bb9fb1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -89,7 +89,7 @@ define void @test() {
 ; AVX512BW:    ir<%v2> = load from index 2
 ; AVX512BW:    ir<%v3> = load from index 3
 ; AVX512BW:    ir<%v4> = load from index 4
-; AVX512BW:  Cost of 14 for VF 8: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
+; AVX512BW:  Cost of 14.5 for VF 8: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll
index 1bfe9f6d31769..bada7e1f26570 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
index 3b313a83c05ce..44b62529a7f95 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -105,7 +105,7 @@ define void @test() {
 ; AVX512BW:    ir<%v4> = load from index 4
 ; AVX512BW:    ir<%v5> = load from index 5
 ; AVX512BW:    ir<%v6> = load from index 6
-; AVX512BW:  Cost of 19 for VF 8: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512BW:  Cost of 19.5 for VF 8: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
@@ -113,7 +113,7 @@ define void @test() {
 ; AVX512BW:    ir<%v4> = load from index 4
 ; AVX512BW:    ir<%v5> = load from index 5
 ; AVX512BW:    ir<%v6> = load from index 6
-; AVX512BW:  Cost of 56 for VF 16: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512BW:  Cost of 56.5 for VF 16: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll
index 281639de7a6ab..5f946718cad8b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll
index ce92fccd3772e..cd3955ae036ef 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll
index 7c11092830f9f..253f28bba0398 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll
index 90915502dba2f..38be5cd74ce92 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll
index f036e897ad20c..9c9b0f141650f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
index 6d02b6504333d..9d3688d256bc9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -60,7 +60,7 @@ define void @test() {
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
-; AVX512:  Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
+; AVX512:  Cost of 6.5 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
index bfbd85555db00..f84d0e9124838 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -60,11 +60,11 @@ define void @test() {
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
-; AVX512:  Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0, ir<%in0>
+; AVX512:  Cost of 6.5 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
-; AVX512:  Cost of 17 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0, ir<%in0>
+; AVX512:  Cost of 17.5 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-01uu.ll
index 96e9a7a710fae..c9180c3762de0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll
index 0bdefbcb269bf..9ee4d9c869bcf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll
index 8fb210b0b2dbf..7e7f7642208da 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
index 5b2f2d522563a..d7388a34ff057 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -43,7 +43,7 @@ define void @test() {
 ; AVX512:    ir<%v2> = load from index 2
 ; AVX512:    ir<%v3> = load from index 3
 ; AVX512:    ir<%v4> = load from index 4
-; AVX512:  Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
+; AVX512:  Cost of 9.5 for VF 4: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll
index e0674c038ecf4..181b0209ce3fe 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
index 754be0e80299f..62bcfeee2c646 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -45,7 +45,7 @@ define void @test() {
 ; AVX512:    ir<%v4> = load from index 4
 ; AVX512:    ir<%v5> = load from index 5
 ; AVX512:    ir<%v6> = load from index 6
-; AVX512:  Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 12.5 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
@@ -53,7 +53,7 @@ define void @test() {
 ; AVX512:    ir<%v4> = load from index 4
 ; AVX512:    ir<%v5> = load from index 5
 ; AVX512:    ir<%v6> = load from index 6
-; AVX512:  Cost of 35 for VF 8: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 35.5 for VF 8: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll
index 2ab06d6f82b9c..69dcdc59c60a0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-2.ll
index 52276bce225eb..1eec2e78c3f8a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
index e842981174205..bca565fb1b5ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -53,7 +53,7 @@ define void @test() {
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
-; AVX512:  Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
+; AVX512:  Cost of 6.5 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll
index 328d0d6f8cef8..893dfbbbeb58e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
index e534038b2e795..a4f4443c725e9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -37,7 +37,7 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512:  Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
+; AVX512:  Cost of 9.5 for VF 2: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-6.ll
index 8647841feeaab..a869d5c7aa2b8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
index 972ebc51fdeec..56d07c5e518a9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -37,7 +37,7 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512:  Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 12.5 for VF 2: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
@@ -45,7 +45,7 @@ define void @test() {
 ; AVX512:    ir<%v4> = load from index 4
 ; AVX512:    ir<%v5> = load from index 5
 ; AVX512:    ir<%v6> = load from index 6
-; AVX512:  Cost of 35 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512:  Cost of 35.5 for VF 4: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512:    ir<%v0> = load from index 0
 ; AVX512:    ir<%v1> = load from index 1
 ; AVX512:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll
index e166fd5296d4b..d46d66e397b01 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load|WIDEN ir<%v[0-9]> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll
index eafd91b4bf950..ad7fd15a9f11f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll
index 061f27cd8caee..d6ef47ad00dcb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll
index 3e15690abe167..0135476488847 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
index 2778841dbd3a1..dca54e574d3ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -95,7 +95,7 @@ define void @test() {
 ; AVX512BW:    ir<%v2> = load from index 2
 ; AVX512BW:    ir<%v3> = load from index 3
 ; AVX512BW:    ir<%v4> = load from index 4
-; AVX512BW:  Cost of 99 for VF 16: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
+; AVX512BW:  Cost of 99.5 for VF 16: INTERLEAVE-GROUP with factor 5 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-6.ll
index 281628dd5966d..3992a2ad5cfeb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
index efc6704e4785a..088a29d257417 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -113,7 +113,7 @@ define void @test() {
 ; AVX512BW:    ir<%v4> = load from index 4
 ; AVX512BW:    ir<%v5> = load from index 5
 ; AVX512BW:    ir<%v6> = load from index 6
-; AVX512BW:  Cost of 138 for VF 16: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512BW:  Cost of 138.5 for VF 16: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
@@ -121,7 +121,7 @@ define void @test() {
 ; AVX512BW:    ir<%v4> = load from index 4
 ; AVX512BW:    ir<%v5> = load from index 5
 ; AVX512BW:    ir<%v6> = load from index 6
-; AVX512BW:  Cost of 413 for VF 32: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
+; AVX512BW:  Cost of 413.5 for VF 32: INTERLEAVE-GROUP with factor 7 at %v0, ir<%in0>
 ; AVX512BW:    ir<%v0> = load from index 0
 ; AVX512BW:    ir<%v1> = load from index 1
 ; AVX512BW:    ir<%v2> = load from index 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-8.ll
index dc06af6263551..178b99d03ef4b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*%v0 = load" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at %v0,|REPLICATE ir<%v0> = load)" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll
index d73dc5e0134d6..2b11cb3c968db 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll
index d372ab153784a..df8c6d5c59f09 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll
index f2f7107cc1cc7..e8044d4d2294b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
index 5e29f47acaf35..8dbd4969b0027 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
index c003c1314575e..fe3d14db34d84 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll
index d48a3409f9b3e..8eb1e4a7aea44 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-8.ll
index 117966d4dbc39..9c33e6932b95c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll
index c0e32fb5dee91..bf5ef2f633aa6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll
index 2c24bd2095c89..3619f856ed3ec 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll
index 8b7ec565f1dde..cc0ff1494723d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll
index 14216fca2fc90..a88edd0ba8d73 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll
index f15e3ffc88103..5016958fa79f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll
index 763c95910162a..60f83d3e54ae9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll
index 90c4333ee7597..5be0d9a386c42 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store double %v\., ptr %out" --filter "Cost of [0-9]+ for VF [0-9]+: (WIDEN store|REPLICATE store ir<%v\.>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*store double %v\., ptr %out" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (WIDEN store|REPLICATE store ir<%v\.>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll
index ff1e2e9d786e1..8969c89ff485f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll
index 9d7bab77ae8f2..fb3baa4115401 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll
index 323f0ea8b3007..5e04cad079cff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll
index 1ca528a9ab95f..915082bbf9eb0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll
index 04feb5c3e0252..a9ea73d9c3ad7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll
index e6dd98c0a5326..3b07b7cdf58ca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll
index e1e267b2595c8..de2c7eb6f8021 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll
index 95523a3dce3fc..9623d7f5a8b5c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll
index 07fe59c0f36c3..0b2bd26618fb2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll
index 0949b11043e22..b5ee0289d2282 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll
index f2a50db468e5b..db0251770ce42 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll
index 36b06e06a26b6..5e9e5782451e3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll
index 49aa508224965..e62c117c34dad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll
index 55cad6f0afd75..8070076aa929e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll
index 9610349875d56..533bbfbd833cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll
index 2b1d991b2a9ec..55aff61dcceff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll
index 953f7a94396b4..4b8dcf20aa040 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll
index 8fc4d18cc706f..b8c49529cbfa2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll
index 4647cbab6b60d..12e83ad16a361 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll
index b0a8727383234..a5c145e39a7e8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll
index 4c6710e763c9e..065dc17187131 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i64 %v\., ptr %out" --filter "Cost of [0-9]+ for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at ,|WIDEN store|REPLICATE store ir<%v\.>)" --filter "^  store ir<%v.?> to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(\.[0-9]+)? for VF 1 For instruction:\s*store i64 %v\., ptr %out" --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: (INTERLEAVE-GROUP with factor [0-9]+ at ,|WIDEN store|REPLICATE store ir<%v\.>)" --filter "^  store ir<%v.?> to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll
index 91c82cc8fd5b7..58f9536fa5fb1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll
index 779134134fe39..8e5a6f14b08cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll
index 38b86e812f2dc..4ad386562997d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(\.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll
index d2245df5aa9b0..823926d05f873 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll
index 8c603581aa08e..ae1bf232278a2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll
index d10cab37a697d..2fd883c0b152c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll
index 2f8d2ff1cc8d2..e024a8e0459fb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+ for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: INTERLEAVE-GROUP with factor [0-9]+ at .*, ir<%out" --filter "^  store ir<.* to index"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=AVX2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i32-with-i8-index.ll
index caf8f10d1169c..1433c2ad35dcc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i32-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB.loaded = load i32, ptr %inB, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB.loaded = load i32, ptr %inB, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i64-with-i8-index.ll
index c70ab9962785b..be49e2198052e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-gather-i64-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB.loaded = load i64, ptr %inB, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB.loaded = load i64, ptr %inB, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-load-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-load-i16.ll
index 2ee610ff5b79a..b020bf1363c82 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-load-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-load-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*%i[2,4] = load i16, ptr %[a-zA-Z0-7]+, align 2" --filter "Cost of [0-9]+ for VF [0-9]+: (REPLICATE ir<%i[24]> = load|INTERLEAVE-GROUP with factor [0-9]+ at %i[24])" --filter "^  ir<.* = load from index"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*%i[2,4] = load i16, ptr %[a-zA-Z0-7]+, align 2" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (REPLICATE ir<%i[24]> = load|INTERLEAVE-GROUP with factor [0-9]+ at %i[24])" --filter "^  ir<.* = load from index"
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -tail-folding-policy=must-fold-tail -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=DISABLED_MASKED_STRIDED
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -tail-folding-policy=must-fold-tail -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=ENABLED_MASKED_STRIDED
 ; REQUIRES: asserts
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
index 398f553059297..865942265146d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i16 %[0,2], ptr %[a-zA-Z0-7]+, align 2" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i16 %[02]|REPLICATE store ir<%[02]>|INTERLEAVE-GROUP with factor [0-9]+ at )"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i16 %[0,2], ptr %[a-zA-Z0-7]+, align 2" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i16 %[02]|REPLICATE store ir<%[02]>|INTERLEAVE-GROUP with factor [0-9]+ at )"
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -tail-folding-policy=must-fold-tail -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=DISABLED_MASKED_STRIDED
 ; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -tail-folding-policy=must-fold-tail -S -mcpu=skx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefix=ENABLED_MASKED_STRIDED
 ; REQUIRES: asserts
@@ -37,14 +37,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED:  Cost of 6 for VF 2: REPLICATE store ir<%0>, ir<%arrayidx2>
 ; ENABLED_MASKED_STRIDED:  Cost of 6 for VF 2: REPLICATE store ir<%2>, ir<%arrayidx7>
 ; ENABLED_MASKED_STRIDED:  Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ; ENABLED_MASKED_STRIDED:  Cost of 14 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ; ENABLED_MASKED_STRIDED:  Cost of 27 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ;
 entry:
   br label %for.body
@@ -95,17 +89,9 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  Cost of 13 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>, vp<[[VP8:%[0-9]+]]>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ; ENABLED_MASKED_STRIDED:  Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>, vp<[[VP8]]>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ; ENABLED_MASKED_STRIDED:  Cost of 14 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>, vp<[[VP8]]>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ; ENABLED_MASKED_STRIDED:  Cost of 27 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%arrayidx2>, vp<[[VP8]]>
-; ENABLED_MASKED_STRIDED:    store ir<%0> to index 0
-; ENABLED_MASKED_STRIDED:    store ir<%2> to index 1
 ;
 entry:
   %cmp15 = icmp sgt i32 %numPoints, 0
@@ -155,14 +141,14 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED:  Cost of 2 for VF 2: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  Cost of 4 for VF 4: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED:  Cost of 8 for VF 8: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED:  Cost of 16 for VF 16: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED:  Cost of 16.5 for VF 16: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test'
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  Cost of 2 for VF 2: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  Cost of 4 for VF 4: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED:  Cost of 8 for VF 8: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED:  Cost of 16 for VF 16: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED:  Cost of 16.5 for VF 16: profitable to scalarize store i16 %0, ptr %arrayidx6, align 2
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i16.ll
index 4f3f141de3adb..f6f11624ad7e9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB.loaded = load i16, ptr %inB, align 2"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB.loaded = load i16, ptr %inB, align 2"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i32.ll
index 6d613871639bb..6295588ebb07f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i32.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB.loaded = load i32, ptr %inB, align 4"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB.loaded = load i32, ptr %inB, align 4"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i64.ll
index c0d1144fab800..85953f2b3733e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i64.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i64.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB.loaded = load i64, ptr %inB, align 8"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB.loaded = load i64, ptr %inB, align 8"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i8.ll
index 65075cd094527..333be63f50c06 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-load-i8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%valB.loaded = load i8, ptr %inB, align 1"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF [0-9]+ For instruction:\s*%valB.loaded = load i8, ptr %inB, align 1"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
index 986d7b7104d88..6959fea2d512b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i32 %valB, ptr %out" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i32 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i32 %valB, ptr %out" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i32 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,8 +18,8 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  Cost of 2 for VF 2: profitable to scalarize store i32 %valB, ptr %out, align 4
-; SSE2:  Cost of 5 for VF 4: profitable to scalarize store i32 %valB, ptr %out, align 4
+; SSE2:  Cost of 2.5 for VF 2: profitable to scalarize store i32 %valB, ptr %out, align 4
+; SSE2:  Cost of 5.5 for VF 4: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; SSE2:  Cost of 11 for VF 8: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; SSE2:  Cost of 22 for VF 16: profitable to scalarize store i32 %valB, ptr %out, align 4
 ;
@@ -34,7 +34,7 @@ define void @test() {
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX1:  Cost of 2 for VF 2: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; AVX1:  Cost of 4 for VF 4: profitable to scalarize store i32 %valB, ptr %out, align 4
-; AVX1:  Cost of 8 for VF 8: profitable to scalarize store i32 %valB, ptr %out, align 4
+; AVX1:  Cost of 8.5 for VF 8: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; AVX1:  Cost of 17 for VF 16: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; AVX1:  Cost of 34 for VF 32: profitable to scalarize store i32 %valB, ptr %out, align 4
 ;
@@ -42,14 +42,14 @@ define void @test() {
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX2:  Cost of 2 for VF 2: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; AVX2:  Cost of 4 for VF 4: profitable to scalarize store i32 %valB, ptr %out, align 4
-; AVX2:  Cost of 8 for VF 8: profitable to scalarize store i32 %valB, ptr %out, align 4
+; AVX2:  Cost of 8.5 for VF 8: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; AVX2:  Cost of 17 for VF 16: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; AVX2:  Cost of 34 for VF 32: profitable to scalarize store i32 %valB, ptr %out, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  Cost of 5 for VF 2: REPLICATE store ir<%valB>, ir<%out>
-; AVX512:  Cost of 10 for VF 4: REPLICATE store ir<%valB>, ir<%out>
+; AVX512:  Cost of 10.5 for VF 4: REPLICATE store ir<%valB>, ir<%out>
 ; AVX512:  Cost of 10 for VF 8: WIDEN store ir<%out>, ir<%valB>, ir<%canStore>
 ; AVX512:  Cost of 18 for VF 16: WIDEN store ir<%out>, ir<%valB>, ir<%canStore>
 ; AVX512:  Cost of 36 for VF 32: WIDEN store ir<%out>, ir<%valB>, ir<%canStore>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
index 56d3f973b3177..41ae89933204e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i64 %valB, ptr %out" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i64 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i64 %valB, ptr %out" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i64 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  Cost of 2 for VF 2: profitable to scalarize store i64 %valB, ptr %out, align 8
+; SSE2:  Cost of 2.5 for VF 2: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; SSE2:  Cost of 5 for VF 4: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; SSE2:  Cost of 10 for VF 8: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; SSE2:  Cost of 20 for VF 16: profitable to scalarize store i64 %valB, ptr %out, align 8
@@ -33,7 +33,7 @@ define void @test() {
 ; AVX1-LABEL: 'test'
 ; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX1:  Cost of 2 for VF 2: profitable to scalarize store i64 %valB, ptr %out, align 8
-; AVX1:  Cost of 4 for VF 4: profitable to scalarize store i64 %valB, ptr %out, align 8
+; AVX1:  Cost of 4.5 for VF 4: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; AVX1:  Cost of 9 for VF 8: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; AVX1:  Cost of 18 for VF 16: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; AVX1:  Cost of 36 for VF 32: profitable to scalarize store i64 %valB, ptr %out, align 8
@@ -41,7 +41,7 @@ define void @test() {
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX2:  Cost of 2 for VF 2: profitable to scalarize store i64 %valB, ptr %out, align 8
-; AVX2:  Cost of 4 for VF 4: profitable to scalarize store i64 %valB, ptr %out, align 8
+; AVX2:  Cost of 4.5 for VF 4: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; AVX2:  Cost of 9 for VF 8: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; AVX2:  Cost of 18 for VF 16: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; AVX2:  Cost of 36 for VF 32: profitable to scalarize store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
index 89df198fc74a9..5b3d83a13e919 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i16 %valB, ptr %out" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i16 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i16 %valB, ptr %out" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i16 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -27,7 +27,7 @@ define void @test(ptr %C) {
 ; AVX1:  Cost of 2 for VF 2: profitable to scalarize store i16 %valB, ptr %out, align 2
 ; AVX1:  Cost of 4 for VF 4: profitable to scalarize store i16 %valB, ptr %out, align 2
 ; AVX1:  Cost of 8 for VF 8: profitable to scalarize store i16 %valB, ptr %out, align 2
-; AVX1:  Cost of 16 for VF 16: profitable to scalarize store i16 %valB, ptr %out, align 2
+; AVX1:  Cost of 16.5 for VF 16: profitable to scalarize store i16 %valB, ptr %out, align 2
 ; AVX1:  Cost of 33 for VF 32: profitable to scalarize store i16 %valB, ptr %out, align 2
 ;
 ; AVX2-LABEL: 'test'
@@ -35,7 +35,7 @@ define void @test(ptr %C) {
 ; AVX2:  Cost of 2 for VF 2: profitable to scalarize store i16 %valB, ptr %out, align 2
 ; AVX2:  Cost of 4 for VF 4: profitable to scalarize store i16 %valB, ptr %out, align 2
 ; AVX2:  Cost of 8 for VF 8: profitable to scalarize store i16 %valB, ptr %out, align 2
-; AVX2:  Cost of 16 for VF 16: profitable to scalarize store i16 %valB, ptr %out, align 2
+; AVX2:  Cost of 16.5 for VF 16: profitable to scalarize store i16 %valB, ptr %out, align 2
 ; AVX2:  Cost of 33 for VF 32: profitable to scalarize store i16 %valB, ptr %out, align 2
 ;
 ; AVX512-LABEL: 'test'
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
index 0111cc162b4de..c659b0a53364c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i32 %valB, ptr %out" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i32 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i32 %valB, ptr %out" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i32 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -17,8 +17,8 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
-; SSE2:  Cost of 2 for VF 2: profitable to scalarize store i32 %valB, ptr %out, align 4
-; SSE2:  Cost of 5 for VF 4: profitable to scalarize store i32 %valB, ptr %out, align 4
+; SSE2:  Cost of 2.5 for VF 2: profitable to scalarize store i32 %valB, ptr %out, align 4
+; SSE2:  Cost of 5.5 for VF 4: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; SSE2:  Cost of 11 for VF 8: profitable to scalarize store i32 %valB, ptr %out, align 4
 ; SSE2:  Cost of 22 for VF 16: profitable to scalarize store i32 %valB, ptr %out, align 4
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
index afa821586bb1c..1c05a4c25d973 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i64 %valB, ptr %out" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i64 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i64 %valB, ptr %out" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i64 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -17,7 +17,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
-; SSE2:  Cost of 2 for VF 2: profitable to scalarize store i64 %valB, ptr %out, align 8
+; SSE2:  Cost of 2.5 for VF 2: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; SSE2:  Cost of 5 for VF 4: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; SSE2:  Cost of 10 for VF 8: profitable to scalarize store i64 %valB, ptr %out, align 8
 ; SSE2:  Cost of 20 for VF 16: profitable to scalarize store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
index 862a572643895..16a7768bbc302 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i8 %valB, ptr %out" --filter "Cost of [1-9][0-9]* for VF [0-9]+: (profitable to scalarize\s+store i8 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i8 %valB, ptr %out" --filter "Cost of [1-9][0-9]*(.[0-9]+)? for VF [0-9]+: (profitable to scalarize\s+store i8 %valB|WIDEN store .*, ir<%valB>|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
@@ -17,10 +17,10 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test(ptr %C) {
 ; SSE2-LABEL: 'test'
 ; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
-; SSE2:  Cost of 2 for VF 2: profitable to scalarize store i8 %valB, ptr %out, align 1
-; SSE2:  Cost of 5 for VF 4: profitable to scalarize store i8 %valB, ptr %out, align 1
-; SSE2:  Cost of 11 for VF 8: profitable to scalarize store i8 %valB, ptr %out, align 1
-; SSE2:  Cost of 23 for VF 16: profitable to scalarize store i8 %valB, ptr %out, align 1
+; SSE2:  Cost of 2.5 for VF 2: profitable to scalarize store i8 %valB, ptr %out, align 1
+; SSE2:  Cost of 5.5 for VF 4: profitable to scalarize store i8 %valB, ptr %out, align 1
+; SSE2:  Cost of 11.5 for VF 8: profitable to scalarize store i8 %valB, ptr %out, align 1
+; SSE2:  Cost of 23.5 for VF 16: profitable to scalarize store i8 %valB, ptr %out, align 1
 ;
 ; SSE42-LABEL: 'test'
 ; SSE42:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -35,7 +35,7 @@ define void @test(ptr %C) {
 ; AVX1:  Cost of 4 for VF 4: profitable to scalarize store i8 %valB, ptr %out, align 1
 ; AVX1:  Cost of 8 for VF 8: profitable to scalarize store i8 %valB, ptr %out, align 1
 ; AVX1:  Cost of 16 for VF 16: profitable to scalarize store i8 %valB, ptr %out, align 1
-; AVX1:  Cost of 32 for VF 32: profitable to scalarize store i8 %valB, ptr %out, align 1
+; AVX1:  Cost of 32.5 for VF 32: profitable to scalarize store i8 %valB, ptr %out, align 1
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
@@ -43,7 +43,7 @@ define void @test(ptr %C) {
 ; AVX2:  Cost of 4 for VF 4: profitable to scalarize store i8 %valB, ptr %out, align 1
 ; AVX2:  Cost of 8 for VF 8: profitable to scalarize store i8 %valB, ptr %out, align 1
 ; AVX2:  Cost of 16 for VF 16: profitable to scalarize store i8 %valB, ptr %out, align 1
-; AVX2:  Cost of 32 for VF 32: profitable to scalarize store i8 %valB, ptr %out, align 1
+; AVX2:  Cost of 32.5 for VF 32: profitable to scalarize store i8 %valB, ptr %out, align 1
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, ptr %out, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i16-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i16-with-i8-index.ll
index aa89dbbcd72ac..6782bbfeb53b3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i16-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i16-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i16 %valB, ptr %out" --filter "Cost of [0-9]+ for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i16 %valB, ptr %out" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i32-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i32-with-i8-index.ll
index 816ddfbadc4d1..5e0b3277dd5e9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i32-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i32-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i32 %valB, ptr %out" --filter "Cost of [0-9]+ for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i32 %valB, ptr %out" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i64-with-i8-index.ll
index 64c41c2a31311..fcbf6042dec14 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i64-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i64 %valB, ptr %out" --filter "Cost of [0-9]+ for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i64 %valB, ptr %out" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i8-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i8-with-i8-index.ll
index 1be3a62186e16..2946cd291d7fa 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i8-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/scatter-i8-with-i8-index.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF 1 For instruction:\s*store i8 %valB, ptr %out" --filter "Cost of [0-9]+ for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+(.[0-9]+)? for VF 1 For instruction:\s*store i8 %valB, ptr %out" --filter "Cost of [0-9]+(.[0-9]+)? for VF [0-9]+: (WIDEN store|REPLICATE store ir<%valB>)"
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll
index 14f20464093cf..f790ca8d48cf0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-i386.ll
@@ -6,26 +6,63 @@ target triple = "i386-unknow-linux"
 define void @icmp_predicate_and_branch_cost(i32 %size, ptr %dst, i64 %conv5.i) #0 {
 ; CHECK-LABEL: @icmp_predicate_and_branch_cost(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[N_VEC]], 3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[CONV5_I:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[SIZE]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i32> [[VEC_IND]] to <16 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i64> [[TMP4]], splat (i64 8)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt <16 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp uge <16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[BROADCAST_SPLAT2]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[TMP6]], <16 x i1> [[TMP7]], <16 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> zeroinitializer, <16 x i8> [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI3:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[PREDPHI]], <16 x i8> splat (i8 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[PREDPHI3]], i64 15
+; CHECK-NEXT:    store i8 [[TMP11]], ptr [[DST:%.*]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 128)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER1:%.*]]
 ; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; CHECK-NEXT:    [[EXT_IV:%.*]] = zext i32 [[IV]] to i64
 ; CHECK-NEXT:    [[ADD_IV:%.*]] = add i64 [[EXT_IV]], 8
-; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i64 [[ADD_IV]], [[CONV5_I:%.*]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i64 [[ADD_IV]], [[CONV5_I]]
 ; CHECK-NEXT:    br i1 [[C_1]], label [[THEN_1:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       then.1:
-; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[IV]], [[SIZE:%.*]]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[IV]], [[SIZE]]
 ; CHECK-NEXT:    br i1 [[C_2]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       then.2:
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SIZE]], [[IV]]
 ; CHECK-NEXT:    [[TRUNC_OR:%.*]] = trunc i32 [[OR]] to i8
 ; CHECK-NEXT:    br label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[SINK:%.*]] = phi i8 [ [[TRUNC_OR]], [[THEN_2]] ], [ 1, [[LOOP_HEADER]] ], [ 0, [[THEN_1]] ]
-; CHECK-NEXT:    store i8 [[SINK]], ptr [[DST:%.*]], align 1
+; CHECK-NEXT:    [[SINK:%.*]] = phi i8 [ [[TRUNC_OR]], [[THEN_2]] ], [ 1, [[LOOP_HEADER1]] ], [ 0, [[THEN_1]] ]
+; CHECK-NEXT:    store i8 [[SINK]], ptr [[DST]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = trunc i64 [[ADD_IV]] to i32
 ; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i32 [[SIZE]], [[IV]]
-; CHECK-NEXT:    br i1 [[C_3]], label [[LOOP_HEADER]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[C_3]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 381e8b6978342..6f64b80095c7e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1073,27 +1073,46 @@ for.end:
 define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) {
 ; AVX1-LABEL: define void @foo6(
 ; AVX1-SAME: ptr readonly captures(none) [[IN:%.*]], ptr captures(none) [[OUT:%.*]], i32 [[SIZE:%.*]], ptr readonly captures(none) [[TRIGGER:%.*]]) #[[ATTR0]] {
-; AVX1-NEXT:  [[ENTRY:.*]]:
+; AVX1-NEXT:  [[ENTRY:.*:]]
 ; AVX1-NEXT:    br label %[[FOR_BODY:.*]]
 ; AVX1:       [[FOR_BODY]]:
-; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 4095, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; AVX1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
-; AVX1-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]]
-; AVX1:       [[IF_THEN]]:
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
-; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store double [[ADD]], ptr [[ARRAYIDX5]], align 8
-; AVX1-NEXT:    br label %[[FOR_INC]]
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 32768
+; AVX1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER]], i64 16384
+; AVX1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN]], i64 32768
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]]
+; AVX1-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
+; AVX1-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX1-NEXT:    [[CMP1:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; AVX1-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC:.*]]
 ; AVX1:       [[FOR_INC]]:
-; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
-; AVX1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX1-NEXT:    br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
+; AVX1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; AVX1:       [[VECTOR_BODY]]:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_INC]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; AVX1-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 -3
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META18:![0-9]+]]
+; AVX1-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> 
+; AVX1-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[IN]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[TMP4]], i64 -3
+; AVX1-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <4 x i32> 
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP5]], <4 x i1> [[REVERSE6]], <4 x double> poison), !alias.scope [[META21:![0-9]+]]
+; AVX1-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> 
+; AVX1-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[REVERSE7]], splat (double 5.000000e-01)
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr double, ptr [[OUT]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr double, ptr [[TMP8]], i64 -3
+; AVX1-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> 
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE8]], ptr align 8 [[TMP9]], <4 x i1> [[REVERSE6]]), !alias.scope [[META23:![0-9]+]], !noalias [[META25:![0-9]+]]
+; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX1-NEXT:    br i1 [[TMP11]], label %[[FOR_END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; AVX1:       [[FOR_END]]:
-; AVX1-NEXT:    ret void
+; AVX1-NEXT:    br [[FOR_END1:label %.*]]
+; AVX1:       [[IF_THEN]]:
 ;
 ; AVX2-LABEL: define void @foo6(
 ; AVX2-SAME: ptr readonly captures(none) [[IN:%.*]], ptr captures(none) [[OUT:%.*]], i32 [[SIZE:%.*]], ptr readonly captures(none) [[TRIGGER:%.*]]) #[[ATTR0]] {
@@ -1349,13 +1368,13 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; AVX1:       [[MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; AVX1:       [[VEC_EPILOG_ITER_CHECK]]:
 ; AVX1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19:![0-9]+]]
+; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29:![0-9]+]]
 ; AVX1:       [[VEC_EPILOG_PH]]:
 ; AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX1-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
@@ -1375,7 +1394,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]])
 ; AVX1-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
 ; AVX1-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
-; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX1:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
 ; AVX1-NEXT:    br i1 [[CMP_N14]], [[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
@@ -1670,13 +1689,13 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; AVX1:       [[MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; AVX1:       [[VEC_EPILOG_ITER_CHECK]]:
 ; AVX1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19]]
+; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29]]
 ; AVX1:       [[VEC_EPILOG_PH]]:
 ; AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX1-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
@@ -1696,7 +1715,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]])
 ; AVX1-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
 ; AVX1-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
-; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; AVX1-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
 ; AVX1:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; AVX1-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
 ; AVX1-NEXT:    br i1 [[CMP_N14]], [[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index c61ef5762f894..983dcf5310773 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -287,10 +287,10 @@ exit:
 define void @switch_all_dests_distinct(ptr %start, ptr %end) {
 ; COST-LABEL: define void @switch_all_dests_distinct(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
-; COST-NEXT:  [[ENTRY:.*]]:
-; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
-; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:  [[SCALAR_PH:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -312,7 +312,7 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER1]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -638,10 +638,10 @@ exit:
 define void @switch_multiple_common_dests(ptr %start, ptr %end) {
 ; COST-LABEL: define void @switch_multiple_common_dests(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
-; COST-NEXT:  [[ENTRY:.*]]:
-; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
-; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:  [[SCALAR_PH:.*]]:
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -662,7 +662,7 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) {
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER1]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -790,9 +790,43 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; COST-LABEL: define void @switch4_default_common_dest_with_case(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
 ; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; COST-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; COST-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; COST-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; COST-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; COST-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; COST-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST:       [[VECTOR_PH]]:
+; COST-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; COST-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; COST-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
+; COST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
 ; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER]] ]
+; COST-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
+; COST-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; COST-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
+; COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
+; COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
+; COST-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
+; COST-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP7]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP6]])
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP9]])
+; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; COST-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; COST-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
+; COST:       [[MIDDLE_BLOCK]]:
+; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; COST:       [[SCALAR_PH]]:
+; COST-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -811,7 +845,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP9:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -957,7 +991,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]])
 ; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; COST-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; COST:       [[MIDDLE_BLOCK]]:
 ; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -987,7 +1021,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -1116,9 +1150,51 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST-LABEL: define void @br_under_switch_default_common_dest_with_case(
 ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; COST-NEXT:  [[ENTRY:.*]]:
+; COST-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; COST-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; COST-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; COST-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; COST-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; COST-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; COST-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; COST:       [[VECTOR_PH]]:
+; COST-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; COST-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; COST-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
+; COST-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; COST-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
+; COST-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; COST-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; COST:       [[LOOP_HEADER]]:
-; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COST-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER]] ]
+; COST-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
+; COST-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; COST-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
+; COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
+; COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
+; COST-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
+; COST-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; COST-NEXT:    [[TMP10:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; COST-NEXT:    [[TMP11:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
+; COST-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
+; COST-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP12]], [[TMP7]]
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP13]])
+; COST-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP10]], <4 x i1> zeroinitializer
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]])
+; COST-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP9]]
+; COST-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP15]])
+; COST-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; COST-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; COST-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
+; COST:       [[MIDDLE_BLOCK]]:
+; COST-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; COST-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; COST:       [[SCALAR_PH]]:
+; COST-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; COST-NEXT:    br label %[[LOOP_HEADER1:.*]]
+; COST:       [[LOOP_HEADER1]]:
+; COST-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; COST-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; COST-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; COST-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -1140,7 +1216,7 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end,
 ; COST:       [[LOOP_LATCH]]:
 ; COST-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; COST-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; COST-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COST-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; COST:       [[EXIT]]:
 ; COST-NEXT:    ret void
 ;
@@ -1433,6 +1509,10 @@ exit:
 ; COST: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ; COST: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
 ; COST: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; COST: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; COST: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; COST: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; COST: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
 ;.
 ; FORCED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; FORCED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/unittests/Support/InstructionCostTest.cpp b/llvm/unittests/Support/InstructionCostTest.cpp
index 5392689131071..91f9458439e34 100644
--- a/llvm/unittests/Support/InstructionCostTest.cpp
+++ b/llvm/unittests/Support/InstructionCostTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/InstructionCost.h"
+#include "llvm/Support/raw_ostream.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -91,3 +92,21 @@ TEST_F(CostTest, Operators) {
   EXPECT_EQ(Max * MinusTwo, Min);
   EXPECT_EQ(Min * MinusTwo, Max);
 }
+
+TEST_F(CostTest, StreamOperator) {
+  auto ToString = [](InstructionCost C) {
+    std::string Printed;
+    raw_string_ostream OS(Printed);
+    OS << C;
+    return Printed;
+  };
+
+  EXPECT_EQ(ToString(InstructionCost(3)), "3");
+  EXPECT_EQ(ToString(InstructionCost(-2)), "-2");
+  EXPECT_EQ(ToString(InstructionCost(-3) / 4), "-0.75");
+  EXPECT_EQ(ToString(InstructionCost(-5) / 3), "-1.5");
+  EXPECT_EQ(ToString(InstructionCost(5) / 3), "1.5");
+  EXPECT_EQ(ToString(InstructionCost::getMax()), "2305843009213693951.75");
+  EXPECT_EQ(ToString(InstructionCost::getMin()), "-2305843009213693952");
+  EXPECT_EQ(ToString(InstructionCost::getInvalid(-7)), "Invalid");
+}

From 939912e66b8e36f1564ecef21ed9fa4b9440fe67 Mon Sep 17 00:00:00 2001
From: Sadaf Ebrahimi 
Date: Tue, 12 May 2026 13:40:36 -0700
Subject: [PATCH 507/538] [scudo] Move MemMap tests from common_test.cpp to map_test.cpp

The tests VerifyGetResidentPages, VerifyReleasePagesToOS, and Zeros test
MemMapT functionality and fit better in map_test.cpp where other MemMapT
tests reside.
---
 .../scudo/standalone/tests/common_test.cpp    | 98 -------------------
 .../lib/scudo/standalone/tests/map_test.cpp   | 98 +++++++++++++++++++
 2 files changed, 98 insertions(+), 98 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp
index d1d978212e06d..564cc15d52e3b 100644
--- a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp
@@ -16,106 +16,8 @@
 #include 
 #include 
 
-#include 
-#include 
-
 namespace scudo {
 
-TEST(ScudoCommonTest, VerifyGetResidentPages) {
-  if (!SCUDO_LINUX)
-    TEST_SKIP("Only valid on linux systems.");
-
-  constexpr uptr NumPages = 512;
-  const uptr SizeBytes = NumPages * getPageSizeCached();
-
-  MemMapT MemMap;
-  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize"));
-  ASSERT_NE(MemMap.getBase(), 0U);
-
-  // Only android seems to properly detect when single pages are touched.
-#if SCUDO_ANDROID
-  // Verify nothing should be mapped in right after the map is created.
-  EXPECT_EQ(0U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-
-  // Touch a page.
-  u8 *Data = reinterpret_cast(MemMap.getBase());
-  Data[0] = 1;
-  EXPECT_EQ(1U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-
-  // Touch a non-consective page.
-  Data[getPageSizeCached() * 2] = 1;
-  EXPECT_EQ(2U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-
-  // Touch a page far enough that the function has to make multiple calls
-  // to mincore.
-  Data[getPageSizeCached() * 300] = 1;
-  EXPECT_EQ(3U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-
-  // Touch another page in the same range to make sure the second
-  // read is working.
-  Data[getPageSizeCached() * 400] = 1;
-  EXPECT_EQ(4U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-#endif
-
-  // Now write the whole thing.
-  memset(reinterpret_cast(MemMap.getBase()), 1, SizeBytes);
-  s64 ResidentPages = MemMap.getResidentPages(MemMap.getBase(), SizeBytes);
-  EXPECT_EQ(NumPages, static_cast(ResidentPages));
-
-  MemMap.unmap();
-}
-
-TEST(ScudoCommonTest, VerifyReleasePagesToOS) {
-  if (!SCUDO_LINUX)
-    TEST_SKIP("Only valid on linux systems.");
-
-  constexpr uptr NumPages = 1000;
-  const uptr SizeBytes = NumPages * getPageSizeCached();
-
-  MemMapT MemMap;
-  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize"));
-  ASSERT_NE(MemMap.getBase(), 0U);
-
-  void *P = reinterpret_cast(MemMap.getBase());
-  EXPECT_EQ(0U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-
-  // Make the entire map resident.
-  memset(P, 1, SizeBytes);
-  s64 ResidentPages = MemMap.getResidentPages(MemMap.getBase(), SizeBytes);
-  if (ResidentPages >= 0)
-    EXPECT_EQ(NumPages, static_cast(ResidentPages));
-
-  // Should release the memory to the kernel immediately.
-  MemMap.releasePagesToOS(MemMap.getBase(), SizeBytes);
-  EXPECT_EQ(0U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
-
-  // Make the entire map resident again.
-  memset(P, 1, SizeBytes);
-  ResidentPages = MemMap.getResidentPages(MemMap.getBase(), SizeBytes);
-  EXPECT_EQ(NumPages, static_cast(ResidentPages));
-
-  MemMap.unmap();
-}
-
-TEST(ScudoCommonTest, Zeros) {
-  const uptr Size = 1ull << 20;
-
-  MemMapT MemMap;
-  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "Zeros"));
-  ASSERT_NE(MemMap.getBase(), 0U);
-  uptr *P = reinterpret_cast(MemMap.getBase());
-  const ptrdiff_t N = Size / sizeof(uptr);
-  EXPECT_EQ(std::count(P, P + N, 0), N);
-
-  memset(P, 1, Size);
-  EXPECT_EQ(std::count(P, P + N, 0), 0);
-
-  MemMap.releasePagesToOS(MemMap.getBase(), Size);
-  EXPECT_EQ(std::count(P, P + N, 0), N);
-
-  MemMap.unmap();
-}
-
 TEST(ScudoCommonTest, IsPowerOfTwo) {
   EXPECT_FALSE(isPowerOfTwo(0));
   EXPECT_TRUE(isPowerOfTwo(1));
diff --git a/compiler-rt/lib/scudo/standalone/tests/map_test.cpp b/compiler-rt/lib/scudo/standalone/tests/map_test.cpp
index bfc31094996db..620dffca33b83 100644
--- a/compiler-rt/lib/scudo/standalone/tests/map_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/map_test.cpp
@@ -11,6 +11,7 @@
 #include "common.h"
 #include "mem_map.h"
 
+#include 
 #include 
 #include 
 
@@ -25,6 +26,103 @@ TEST(ScudoMapTest, PageSize) {
             static_cast(sysconf(_SC_PAGESIZE)));
 }
 
+TEST(ScudoMapTest, VerifyGetResidentPages) {
+  if (!SCUDO_LINUX)
+    TEST_SKIP("Only valid on linux systems.");
+
+  constexpr scudo::uptr NumPages = 512;
+  const scudo::uptr SizeBytes = NumPages * scudo::getPageSizeCached();
+
+  scudo::MemMapT MemMap;
+  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize"));
+  ASSERT_NE(MemMap.getBase(), 0U);
+
+  // Only android seems to properly detect when single pages are touched.
+#if SCUDO_ANDROID
+  // Verify nothing should be mapped in right after the map is created.
+  EXPECT_EQ(0U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+
+  // Touch a page.
+  scudo::u8 *Data = reinterpret_cast(MemMap.getBase());
+  Data[0] = 1;
+  EXPECT_EQ(1U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+
+  // Touch a non-consecutive page.
+  Data[scudo::getPageSizeCached() * 2] = 1;
+  EXPECT_EQ(2U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+
+  // Touch a page far enough that the function has to make multiple calls
+  // to mincore.
+  Data[scudo::getPageSizeCached() * 300] = 1;
+  EXPECT_EQ(3U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+
+  // Touch another page in the same range to make sure the second
+  // read is working.
+  Data[scudo::getPageSizeCached() * 400] = 1;
+  EXPECT_EQ(4U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+#endif
+
+  // Now write the whole thing.
+  memset(reinterpret_cast(MemMap.getBase()), 1, SizeBytes);
+  scudo::s64 ResidentPages =
+      MemMap.getResidentPages(MemMap.getBase(), SizeBytes);
+  EXPECT_EQ(NumPages, static_cast(ResidentPages));
+
+  MemMap.unmap();
+}
+
+TEST(ScudoMapTest, VerifyReleasePagesToOS) {
+  if (!SCUDO_LINUX)
+    TEST_SKIP("Only valid on linux systems.");
+
+  constexpr scudo::uptr NumPages = 1000;
+  const scudo::uptr SizeBytes = NumPages * scudo::getPageSizeCached();
+
+  scudo::MemMapT MemMap;
+  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize"));
+  ASSERT_NE(MemMap.getBase(), 0U);
+
+  void *P = reinterpret_cast(MemMap.getBase());
+  EXPECT_EQ(0U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+
+  // Make the entire map resident.
+  memset(P, 1, SizeBytes);
+  scudo::s64 ResidentPages =
+      MemMap.getResidentPages(MemMap.getBase(), SizeBytes);
+  if (ResidentPages >= 0)
+    EXPECT_EQ(NumPages, static_cast(ResidentPages));
+
+  // Should release the memory to the kernel immediately.
+  MemMap.releasePagesToOS(MemMap.getBase(), SizeBytes);
+  EXPECT_EQ(0U, MemMap.getResidentPages(MemMap.getBase(), SizeBytes));
+
+  // Make the entire map resident again.
+  memset(P, 1, SizeBytes);
+  ResidentPages = MemMap.getResidentPages(MemMap.getBase(), SizeBytes);
+  EXPECT_EQ(NumPages, static_cast(ResidentPages));
+
+  MemMap.unmap();
+}
+
+TEST(ScudoMapTest, Zeros) {
+  const scudo::uptr Size = 1ull << 20;
+
+  scudo::MemMapT MemMap;
+  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "Zeros"));
+  ASSERT_NE(MemMap.getBase(), 0U);
+  scudo::uptr *P = reinterpret_cast(MemMap.getBase());
+  const ptrdiff_t N = Size / sizeof(scudo::uptr);
+  EXPECT_EQ(std::count(P, P + N, 0), N);
+
+  memset(P, 1, Size);
+  EXPECT_EQ(std::count(P, P + N, 0), 0);
+
+  MemMap.releasePagesToOS(MemMap.getBase(), Size);
+  EXPECT_EQ(std::count(P, P + N, 0), N);
+
+  MemMap.unmap();
+}
+
 TEST(ScudoMapDeathTest, MapNoAccessUnmap) {
   const scudo::uptr Size = 4 * scudo::getPageSizeCached();
   scudo::ReservedMemoryT ReservedMemory;

From f1c30af5f80fa7d2f7ac4fbf4fbbb36ec387413d Mon Sep 17 00:00:00 2001
From: Jameson Nash 
Date: Tue, 12 May 2026 16:42:15 -0400
Subject: [PATCH 508/538] [SimplifyCFG] correct and move debug info for
 mergeConditionalStoreToAddress (#180789)

Previously, a combination of TryToSimplifyUncondBranchFromEmptyBlock
and SpeculatedStoreValue was changing the separate conditional stores
into a store of one value, which was then being hoisted to a
non-conditional store of that one value (and a DCE of the other). This
makes all linked stores use the new value, which is still
unconditionally correct. It isn't easy for
TryToSimplifyUncondBranchFromEmptyBlock to otherwise guess why the
value is different and try to recover which one is correct when doing
the conditional update. The end result being that the debug info might
have the wrong value. Now instead this updates the debug info at the
same time to reflect that the merged store will be equivalent, hoping
to turn these into the same info. This ensures that later passes don't
need to reverse how the different stores connected back to the new IR,
since either debug info now contains correct information for either
branch taken.

And additionally, without `combineMetadataForCSE`, it was dropping the
debug assignment for the merged store.

Note that this means the SSA value in the debug info does not maintain
the dominance property. OCHyams confirmed that this is both correct and
intended behavior for debug info tracking.

Assisted-by: Claude Sonnet 4.5 via Claude Code (just the test though,
since it wanted to just delete all associated debug code when deleting
an instruction, so it was pretty useless at writing the fix)
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 16 +++-
 .../merge-cond-stores-debuginfo.ll            | 78 +++++++++++++++++++
 2 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/merge-cond-stores-debuginfo.ll

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 3a98e86038904..f1d47abe79365 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3343,7 +3343,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(CondBrInst *BI,
     SpeculatedStore->applyMergedLocation(BI->getDebugLoc(),
                                          SpeculatedStore->getDebugLoc());
     // The value stored is still conditional, but the store itself is now
-    // unconditonally executed, so we must be sure that any linked dbg.assign
+    // unconditionally executed, so we must be sure that any linked dbg.assign
     // intrinsics are tracking the new stored value (the result of the
     // select). If we don't, and the store were to be removed by another pass
     // (e.g. DSE), then we'd eventually end up emitting a location describing
@@ -4406,7 +4406,19 @@ static bool mergeConditionalStoreToAddress(
 
   QB.SetInsertPoint(T);
   StoreInst *SI = cast(QB.CreateStore(QPHI, Address));
-  SI->setAAMetadata(PStore->getAAMetadata().merge(QStore->getAAMetadata()));
+  combineMetadataForCSE(QStore, PStore, true);
+  SI->copyMetadata(*QStore);
+  // Update any dbg.assign intrinsics to track the merged value (QPHI) instead
+  // of the original constant values, likely making these identical.
+  for (auto *DbgAssign : at::getDVRAssignmentMarkers(SI)) {
+    if (llvm::is_contained(DbgAssign->location_ops(),
+                           PStore->getValueOperand()))
+      DbgAssign->replaceVariableLocationOp(PStore->getValueOperand(), QPHI);
+    if (llvm::is_contained(DbgAssign->location_ops(),
+                           QStore->getValueOperand()))
+      DbgAssign->replaceVariableLocationOp(QStore->getValueOperand(), QPHI);
+  }
+
   // Choose the minimum alignment. If we could prove both stores execute, we
   // could use biggest one.  In this case, though, we only know that one of the
   // stores executes.  And we don't know it's safe to take the alignment from a
diff --git a/llvm/test/Transforms/SimplifyCFG/merge-cond-stores-debuginfo.ll b/llvm/test/Transforms/SimplifyCFG/merge-cond-stores-debuginfo.ll
new file mode 100644
index 0000000000000..4eb8028a9e19e
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/merge-cond-stores-debuginfo.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=simplifycfg < %s -S | FileCheck %s
+
+; Test that SimplifyCFG properly preserves debug info when merging conditional stores.
+; This test verifies that:
+; 1. DIAssignID metadata is properly merged.
+; 2. The hoisted dbg_assign value correctly represents the merged store value.
+
+define void @merge_stores_different_values(ptr %p, i32 %a, i32 %b) !dbg !10 {
+; CHECK-LABEL: @merge_stores_different_values(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0, !dbg [[DBG15:![0-9]+]]
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[B:%.*]], 0, !dbg [[DBG16:![0-9]+]]
+; CHECK-NEXT:      #dbg_assign(i32 [[SPEC_SELECT:%.*]], [[META17:![0-9]+]], !DIExpression(), [[META18:![0-9]+]], ptr [[P:%.*]], !DIExpression(), [[META19:![0-9]+]])
+; CHECK-NEXT:    [[SPEC_SELECT]] = select i1 [[X2]], i32 200, i32 100, !dbg [[DBG20:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[X1]], true, !dbg [[DBG21:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[X2]], true, !dbg [[DBG21]]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]], !dbg [[DBG21]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]], !dbg [[DBG21]]
+; CHECK:       3:
+; CHECK-NEXT:    store i32 [[SPEC_SELECT]], ptr [[P]], align 4, !dbg [[META19]], !DIAssignID [[META18]]
+; CHECK-NEXT:    br label [[TMP4]], !dbg [[DBG21]]
+; CHECK:       4:
+; CHECK-NEXT:    ret void, !dbg [[DBG21]]
+;
+entry:
+  %x1 = icmp eq i32 %a, 0, !dbg !15
+  br i1 %x1, label %fallthrough, label %yes1, !dbg !16
+
+yes1:                                             ; preds = %entry
+  store i32 200, ptr %p, align 4, !dbg !17, !DIAssignID !18
+    #dbg_assign(i32 200, !19, !DIExpression(), !18, ptr %p, !DIExpression(), !17)
+  br label %fallthrough, !dbg !20
+
+fallthrough:                                      ; preds = %yes1, %entry
+  %x2 = icmp eq i32 %b, 0, !dbg !21
+  br i1 %x2, label %end, label %yes2, !dbg !22
+
+yes2:                                             ; preds = %fallthrough
+  store i32 100, ptr %p, align 4, !dbg !23, !DIAssignID !24
+    #dbg_assign(i32 100, !19, !DIExpression(), !24, ptr %p, !DIExpression(), !23)
+  br label %end, !dbg !25
+
+end:                                              ; preds = %yes2, %fallthrough
+  ret void, !dbg !26
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!9 = !{!"clang version 19.0.0"}
+!10 = distinct !DISubprogram(name: "merge_stores_different_values", scope: !1, file: !1, line: 10, type: !11, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !14)
+!11 = !DISubroutineType(types: !12)
+!12 = !{null, !13, !13, !13}
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !{}
+!15 = !DILocation(line: 12, column: 7, scope: !10)
+!16 = !DILocation(line: 12, column: 5, scope: !10)
+!17 = !DILocation(line: 13, column: 12, scope: !10)
+!18 = distinct !DIAssignID()
+!19 = !DILocalVariable(name: "val", scope: !10, file: !1, line: 11, type: !13)
+!20 = !DILocation(line: 13, column: 5, scope: !10)
+!21 = !DILocation(line: 14, column: 7, scope: !10)
+!22 = !DILocation(line: 14, column: 5, scope: !10)
+!23 = !DILocation(line: 15, column: 12, scope: !10)
+!24 = distinct !DIAssignID()
+!25 = !DILocation(line: 15, column: 5, scope: !10)
+!26 = !DILocation(line: 16, column: 1, scope: !10)

From 483ecf89f9fa64c2138c6f5c7b6688e4c95c330e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= 
Date: Tue, 12 May 2026 13:44:33 -0700
Subject: [PATCH 509/538] [flang][cuda] Fix CUDA generic matching with omitted
 optional args (#197275)

Skip omitted optional arguments when computing CUDA address-space
matching distances, so -gpu=unified overload resolution does not compare
expanded dummy-argument lists of different sizes. Adds a regression test
covering a unified-memory overload with optional extras.
---
 flang/lib/Semantics/expression.cpp |  4 ++++
 flang/test/Semantics/cuf14.cuf     | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 066ead7fc28e8..dad401f0baa74 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -2965,6 +2965,10 @@ static CudaMatchingDistance ComputeCudaMatchingDistance(
   for (std::size_t i{0}; i < dummies.size(); ++i) {
     const characteristics::DummyArgument &dummy{dummies[i]};
     const std::optional &actual{actuals[i]};
+    if (!actual) {
+      // Omitted optional arguments do not affect CUDA matching distances.
+      continue;
+    }
     int d{GetMatchingDistance(features, dummy, actual)};
     if (d == cudaInfMatchingValue) {
       distance.isInfinite = true;
diff --git a/flang/test/Semantics/cuf14.cuf b/flang/test/Semantics/cuf14.cuf
index 29c9ecf90677f..28bd410ad0034 100644
--- a/flang/test/Semantics/cuf14.cuf
+++ b/flang/test/Semantics/cuf14.cuf
@@ -19,6 +19,11 @@ module matching
     module procedure sub_managed
   end interface
 
+  interface optional_extra_args
+    module procedure sub_host_3args
+    module procedure sub_unified_5args
+  end interface
+
 contains
   subroutine sub_host(a)
     integer :: a(:)
@@ -35,21 +40,35 @@ contains
   subroutine sub_unified(a)
     integer, unified :: a(:)
   end
+
+  subroutine sub_host_3args(a, b, c)
+    integer :: a(:)
+    integer :: b, c
+  end
+
+  subroutine sub_unified_5args(a, b, c, d, e)
+    integer, unified :: a(:)
+    integer :: b, c
+    integer, optional :: d, e
+  end
 end module
 
 program m
   use matching
 
   integer, allocatable :: actual_host(:)
+  integer :: i
 
   allocate(actual_host(10))
 
   call host_and_device(actual_host)     ! Should resolve to sub_device
   call all(actual_host)                 ! Should resolved to unified
   call all_without_unified(actual_host) ! Should resolved to managed
+  call optional_extra_args(actual_host, i, i) ! Should resolve to sub_unified_5args
 end
 
 ! CHECK: fir.call @_QMmatchingPsub_device
 ! CHECK: fir.call @_QMmatchingPsub_unified
 ! CHECK: fir.call @_QMmatchingPsub_managed
+! CHECK: fir.call @_QMmatchingPsub_unified_5args
 

From a2a3554b68d3a3d972b1ee76a3428ed455a621f8 Mon Sep 17 00:00:00 2001
From: Jameson Nash 
Date: Tue, 12 May 2026 16:46:51 -0400
Subject: [PATCH 510/538] [clang] use QualType addrspace when making an alloca
 (#181390)

Instead of assuming that QualType is in default addrspace (or
compatible with it), actually use the addrspace declared by the
frontend. That removes needless dueling addrspacecast calls and
associated IR noise. Any callers that intend to discard the attributes
of the type (e.g. because they are casting an rvalue through memory)
need to now be explicit about that (e.g. by calling getUnqualifiedType).

This is part of a commit sequence trying to help the WASM be able to
have distinguished pointer types between stack memory and local memory
(attempting to emit an addrspacecast between the two is invalid).

Assisted-By: Claude Sonnet 4.5 
---
 clang/lib/CIR/CodeGen/CIRGenAtomic.cpp        |   11 +-
 clang/lib/CodeGen/CGAtomic.cpp                |   12 +-
 clang/lib/CodeGen/CGCUDANV.cpp                |    9 +-
 clang/lib/CodeGen/CGCleanup.cpp               |    4 +-
 clang/lib/CodeGen/CGExpr.cpp                  |    7 +-
 clang/lib/CodeGen/CodeGenFunction.h           |    7 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |    1 -
 clang/lib/CodeGen/CodeGenTypeCache.h          |    4 -
 clang/lib/CodeGen/TargetInfo.h                |    5 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |    8 +-
 clang/lib/CodeGen/Targets/SPIR.cpp            |    8 +-
 clang/test/CodeGen/scoped-atomic-ops.c        |  837 +++++------
 clang/test/CodeGen/scoped-fence-ops.c         |  192 ++-
 .../CodeGenOpenCL/addr-space-struct-arg.cl    |  174 +--
 .../amdgcn-automatic-variable.cl              |    5 +-
 .../amdgpu-abi-struct-arg-byref.cl            |  236 ++--
 .../test/CodeGenOpenCL/amdgpu-cluster-dims.cl |   10 +-
 .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl    |  222 ++-
 clang/test/CodeGenOpenCL/atomic-ops.cl        |    5 +-
 clang/test/CodeGenOpenCL/builtins-alloca.cl   |   80 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl    |  144 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 1257 +++++++----------
 .../builtins-amdgcn-global-load-lds.cl        |   42 +-
 .../builtins-amdgcn-load-to-lds.cl            |   42 +-
 .../CodeGenOpenCL/check-atomic-alignment.cl   |   14 +-
 ...plicit-addrspacecast-function-parameter.cl |   21 +-
 clang/test/Index/pipe-size.cl                 |    4 +-
 27 files changed, 1541 insertions(+), 1820 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
index e9bf89c91627f..3a02fa0888f82 100644
--- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
@@ -225,10 +225,13 @@ RValue AtomicInfo::emitAtomicLoad(AggValueSlot resultSlot, SourceLocation loc,
 }
 
 Address AtomicInfo::createTempAlloca() const {
-  Address tempAlloca = cgf.createMemTemp(
-      (lvalue.isBitField() && valueSizeInBits > atomicSizeInBits) ? valueTy
-                                                                  : atomicTy,
-      getAtomicAlignment(), loc, "atomic-temp");
+  // Remove addrspace info from the atomic pointer element when making the
+  // alloca pointer element.
+  QualType tmpTy = (lvalue.isBitField() && valueSizeInBits > atomicSizeInBits)
+                       ? valueTy
+                       : atomicTy.getUnqualifiedType();
+  Address tempAlloca =
+      cgf.createMemTemp(tmpTy, getAtomicAlignment(), loc, "atomic-temp");
 
   // Cast to pointer to value type for bitfields.
   if (lvalue.isBitField()) {
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index 859ab20bb6740..b4fd0fdb795aa 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -298,11 +298,13 @@ namespace {
 }
 
 Address AtomicInfo::CreateTempAlloca() const {
-  Address TempAlloca = CGF.CreateMemTemp(
-      (LVal.isBitField() && ValueSizeInBits > AtomicSizeInBits) ? ValueTy
-                                                                : AtomicTy,
-      getAtomicAlignment(),
-      "atomic-temp");
+  // Remove addrspace info from the atomic pointer element when making the
+  // alloca pointer element.
+  QualType TmpTy = (LVal.isBitField() && ValueSizeInBits > AtomicSizeInBits)
+                       ? ValueTy
+                       : AtomicTy.getUnqualifiedType();
+  Address TempAlloca =
+      CGF.CreateMemTemp(TmpTy, getAtomicAlignment(), "atomic-temp");
   // Cast to pointer to value type for bitfields.
   if (LVal.isBitField())
     return CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 9f38aeed5b6d3..3eda4237b0549 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -372,7 +372,7 @@ Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
   // args, allocate a single pointer so we still have a valid pointer to the
   // argument array that we can pass to runtime, even if it will be unused.
   Address KernelArgs = CGF.CreateTempAlloca(
-      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
+      PtrTy, LangAS::Default, CharUnits::fromQuantity(16), "kernel_args",
       llvm::ConstantInt::get(SizeTy, std::max(1, Args.size())));
   // Store pointers to the arguments in a locally allocated launch_args.
   for (unsigned i = 0; i < Args.size(); ++i) {
@@ -437,9 +437,10 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
   Address BlockDim =
       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
-  Address ShmemSize =
-      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
-  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
+  Address ShmemSize = CGF.CreateTempAlloca(SizeTy, LangAS::Default,
+                                           CGM.getSizeAlign(), "shmem_size");
+  Address Stream = CGF.CreateTempAlloca(PtrTy, LangAS::Default,
+                                        CGM.getPointerAlign(), "stream");
   llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(IntTy,
                               {/*gridDim=*/GridDim.getType(),
diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp
index 12042a292d8fe..977f81a641ff3 100644
--- a/clang/lib/CodeGen/CGCleanup.cpp
+++ b/clang/lib/CodeGen/CGCleanup.cpp
@@ -1258,8 +1258,8 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
   Address var = Scope.getActiveFlag();
   if (!var.isValid()) {
     CodeGenFunction::AllocaTrackerRAII AllocaTracker(CGF);
-    var = CGF.CreateTempAlloca(CGF.Builder.getInt1Ty(), CharUnits::One(),
-                               "cleanup.isactive");
+    var = CGF.CreateTempAlloca(CGF.Builder.getInt1Ty(), LangAS::Default,
+                               CharUnits::One(), "cleanup.isactive");
     Scope.setActiveFlag(var);
     Scope.AddAuxAllocas(AllocaTracker.Take());
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 5764b59e538ae..8ca4ee64136c8 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -182,7 +182,7 @@ RawAddress CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,
                                                          const Twine &Name) {
   CharUnits Align =
       CharUnits::fromQuantity(CGM.getDataLayout().getPrefTypeAlign(Ty));
-  return CreateTempAlloca(Ty, Align, Name);
+  return CreateTempAlloca(Ty, LangAS::Default, Align, Name);
 }
 
 RawAddress CodeGenFunction::CreateIRTempWithoutCast(QualType Ty,
@@ -200,8 +200,9 @@ RawAddress CodeGenFunction::CreateMemTemp(QualType Ty, const Twine &Name,
 RawAddress CodeGenFunction::CreateMemTemp(QualType Ty, CharUnits Align,
                                           const Twine &Name,
                                           RawAddress *Alloca) {
-  RawAddress Result = CreateTempAlloca(ConvertTypeForMem(Ty), Align, Name,
-                                       /*ArraySize=*/nullptr, Alloca);
+  RawAddress Result =
+      CreateTempAlloca(ConvertTypeForMem(Ty), Ty.getAddressSpace(), Align, Name,
+                       /*ArraySize=*/nullptr, Alloca);
 
   if (Ty->isConstantMatrixType()) {
     auto *ArrayTy = cast(Result.getElementType());
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 464bdeb801a29..e0f8e62fb53af 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -2868,8 +2868,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// CreateTempAlloca - This creates an alloca and inserts it into the entry
   /// block if \p ArraySize is nullptr, otherwise inserts it at the current
   /// insertion point of the builder. The caller is responsible for setting an
-  /// appropriate alignment on
-  /// the alloca.
+  /// appropriate alignment on the alloca.
   ///
   /// \p ArraySize is the number of array elements to be allocated if it
   ///    is not nullptr.
@@ -2889,7 +2888,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// various ways, this function will perform the cast. The original alloca
   /// instruction is returned through \p Alloca if it is not nullptr.
   ///
-  /// The cast is not performaed in CreateTempAllocaWithoutCast. This is
+  /// The cast is not performed in CreateTempAllocaWithoutCast. This is
   /// more efficient if the caller knows that the address will not be exposed.
   llvm::AllocaInst *CreateTempAlloca(llvm::Type *Ty, const Twine &Name = "tmp",
                                      llvm::Value *ArraySize = nullptr);
@@ -2962,7 +2961,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   AggValueSlot CreateAggTemp(QualType T, const Twine &Name = "tmp",
                              RawAddress *Alloca = nullptr) {
     return AggValueSlot::forAddr(
-        CreateMemTemp(T, Name, Alloca), T.getQualifiers(),
+        CreateMemTemp(T.getUnqualifiedType(), Name, Alloca), T.getQualifiers(),
         AggValueSlot::IsNotDestructed, AggValueSlot::DoesNotNeedGCBarriers,
         AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap);
   }
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c0d9bc933ceee..416a5beca0de7 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -482,7 +482,6 @@ CodeGenModule::CodeGenModule(ASTContext &C,
       llvm::PointerType::get(LLVMContext, DL.getProgramAddressSpace());
   ConstGlobalsPtrTy = llvm::PointerType::get(
       LLVMContext, C.getTargetAddressSpace(GetGlobalConstantAddressSpace()));
-  ASTAllocaAddressSpace = getTargetCodeGenInfo().getASTAllocaAddressSpace();
 
   // Build C++20 Module initializers.
   // TODO: Add Microsoft here once we know the mangling required for the
diff --git a/clang/lib/CodeGen/CodeGenTypeCache.h b/clang/lib/CodeGen/CodeGenTypeCache.h
index 39ea8a681dc42..17eca207d7c80 100644
--- a/clang/lib/CodeGen/CodeGenTypeCache.h
+++ b/clang/lib/CodeGen/CodeGenTypeCache.h
@@ -106,8 +106,6 @@ struct CodeGenTypeCache {
     unsigned char SizeAlignInBytes;
   };
 
-  LangAS ASTAllocaAddressSpace;
-
   CharUnits getSizeSize() const {
     return CharUnits::fromQuantity(SizeSizeInBytes);
   }
@@ -123,8 +121,6 @@ struct CodeGenTypeCache {
 
   llvm::CallingConv::ID RuntimeCC;
   llvm::CallingConv::ID getRuntimeCC() const { return RuntimeCC; }
-
-  LangAS getASTAllocaAddressSpace() const { return ASTAllocaAddressSpace; }
 };
 
 }  // end namespace CodeGen
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 93997d881d5ad..89ea27b748aa9 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -319,13 +319,10 @@ class TargetCodeGenInfo {
   virtual LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                           const VarDecl *D) const;
 
-  /// Get the AST address space for alloca.
-  virtual LangAS getASTAllocaAddressSpace() const { return LangAS::Default; }
-
   /// Get the address space for an indirect (sret) return of the given type.
   /// The default falls back to the alloca AS.
   virtual LangAS getSRetAddrSpace(const CXXRecordDecl *RD) const {
-    return getASTAllocaAddressSpace();
+    return LangAS::Default;
   }
 
   /// Get address space of pointer parameter for __cxa_atexit.
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index a4b6a5fa5d35b..0d36f166328c7 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -378,11 +378,6 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
   llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
       llvm::PointerType *T, QualType QT) const override;
 
-  LangAS getASTAllocaAddressSpace() const override {
-    return getLangASFromTargetAS(
-        getABIInfo().getDataLayout().getAllocaAddrSpace());
-  }
-
   LangAS getSRetAddrSpace(const CXXRecordDecl *RD) const override;
 
   LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
@@ -550,7 +545,8 @@ AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
   // default AS so the sret pointer matches the "this" convention.
   if (RD && !RD->canPassInRegisters())
     return LangAS::Default;
-  return getASTAllocaAddressSpace();
+  return getLangASFromTargetAS(
+      getABIInfo().getDataLayout().getAllocaAddrSpace());
 }
 
 LangAS
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 708921b3d5d43..0a96d612c8a87 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -92,11 +92,6 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo {
   CommonSPIRTargetCodeGenInfo(std::unique_ptr ABIInfo)
       : TargetCodeGenInfo(std::move(ABIInfo)) {}
 
-  LangAS getASTAllocaAddressSpace() const override {
-    return getLangASFromTargetAS(
-        getABIInfo().getDataLayout().getAllocaAddrSpace());
-  }
-
   unsigned getDeviceKernelCallingConv() const override;
   llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override;
   llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *Ty,
@@ -453,7 +448,8 @@ LangAS SPIRVTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
   // default AS so the sret pointer matches the "this" convention.
   if (RD && !RD->canPassInRegisters())
     return LangAS::Default;
-  return getASTAllocaAddressSpace();
+  return getLangASFromTargetAS(
+      getABIInfo().getDataLayout().getAllocaAddrSpace());
 }
 
 void SPIRVTargetCodeGenInfo::setCUDAKernelCallingConvention(
diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c
index e7145b0e5d89b..16b2b459e2cb2 100644
--- a/clang/test/CodeGen/scoped-atomic-ops.c
+++ b/clang/test/CodeGen/scoped-atomic-ops.c
@@ -42,24 +42,23 @@
 // AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN_CL_20-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[V:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[V]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr addrspace(5) [[V]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP5]], ptr addrspace(5) [[V]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr addrspace(5) [[V]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP9]], ptr addrspace(5) [[V]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP11]], ptr addrspace(5) [[V]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[V]], align 4
@@ -130,63 +129,120 @@ int fi1a(int *i) {
   return v;
 }
 
-// AMDGCN-LABEL: define hidden i32 @fi1b(
-// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// AMDGCN-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN-NEXT:    [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
-// AMDGCN-NEXT:    [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
-// AMDGCN-NEXT:    [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
-// AMDGCN-NEXT:    [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
-// AMDGCN-NEXT:    [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
-// AMDGCN-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
-// AMDGCN-NEXT:    store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
-// AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
-// AMDGCN-NEXT:    store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
-// AMDGCN-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
-// AMDGCN-NEXT:    store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 [[TMP10]], ptr [[TMP11]], align 4
-// AMDGCN-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
-// AMDGCN-NEXT:    store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 [[TMP14]], ptr [[TMP15]], align 4
-// AMDGCN-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
-// AMDGCN-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
-// AMDGCN-NEXT:    store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 [[TMP22]], ptr [[TMP23]], align 4
-// AMDGCN-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
-// AMDGCN-NEXT:    ret i32 [[TMP25]]
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi1b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP10]], ptr [[TMP11]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP14]], ptr [[TMP15]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 [[TMP22]], ptr [[TMP23]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT:    ret i32 [[TMP25]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi1b(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP10]], ptr [[TMP11]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP14]], ptr [[TMP15]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[TMP23]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT:    ret i32 [[TMP25]]
 //
 // NVPTX-LABEL: define hidden i32 @fi1b(
 // NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
@@ -334,30 +390,29 @@ int fi1b(int *i) {
 // AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN_CL_20-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[V:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[V]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP2]], ptr [[TMP0]] monotonic, align 4
-// AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
 // AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP5]], ptr [[TMP3]] syncscope("agent") monotonic, align 4
-// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
 // AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP8]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
 // AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP11]], ptr [[TMP9]] syncscope("cluster") monotonic, align 4
-// AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
 // AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP14]], ptr [[TMP12]] syncscope("wavefront") monotonic, align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
 // AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP17]], ptr [[TMP15]] syncscope("singlethread") monotonic, align 4
@@ -427,49 +482,92 @@ void fi2a(int *i) {
   __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
 }
 
-// AMDGCN-LABEL: define hidden void @fi2b(
-// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[DOTATOMICTMP2:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// AMDGCN-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
-// AMDGCN-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
-// AMDGCN-NEXT:    [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
-// AMDGCN-NEXT:    [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
-// AMDGCN-NEXT:    [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
-// AMDGCN-NEXT:    [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
-// AMDGCN-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
-// AMDGCN-NEXT:    store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
-// AMDGCN-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
-// AMDGCN-NEXT:    store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
-// AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
-// AMDGCN-NEXT:    store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
-// AMDGCN-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
-// AMDGCN-NEXT:    store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
-// AMDGCN-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
-// AMDGCN-NEXT:    store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
-// AMDGCN-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
-// AMDGCN-NEXT:    store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
-// AMDGCN-NEXT:    ret void
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi2b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT:    [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT:    ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi2b(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT:    [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
+// AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT:    store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT:    ret void
 //
 // NVPTX-LABEL: define hidden void @fi2b(
 // NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
@@ -706,14 +804,6 @@ void fi2b(int *i) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
@@ -730,77 +820,77 @@ void fi2b(int *i) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr addrspace(5) [[E_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr addrspace(5) [[F_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[TMP14]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP28]], ptr [[TMP29]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP38]], ptr [[TMP39]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -1169,14 +1259,6 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
@@ -1193,77 +1275,77 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr addrspace(5) [[E_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr addrspace(5) [[F_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[TMP14]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP28]], ptr [[TMP29]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP38]], ptr [[TMP39]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -1632,14 +1714,6 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
@@ -1656,77 +1730,77 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr addrspace(5) [[E_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr addrspace(5) [[F_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[TMP14]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP28]], ptr [[TMP29]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP38]], ptr [[TMP39]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -2095,14 +2169,6 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
@@ -2119,77 +2185,77 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr addrspace(5) [[E_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr addrspace(5) [[F_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[TMP14]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP28]], ptr [[TMP29]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP38]], ptr [[TMP39]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -2558,14 +2624,6 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
@@ -2582,77 +2640,77 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr addrspace(5) [[E_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr addrspace(5) [[F_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[TMP14]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP28]], ptr [[TMP29]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP38]], ptr [[TMP39]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -3021,14 +3079,6 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
@@ -3045,77 +3095,77 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[E]], ptr addrspace(5) [[E_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[F]], ptr addrspace(5) [[F_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP13]], ptr [[TMP14]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP18]], ptr [[TMP19]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP28]], ptr [[TMP29]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP38]], ptr [[TMP39]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -3370,12 +3420,11 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[DESIRED]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -3493,12 +3542,11 @@ _Bool fi4a(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[DESIRED]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -3616,12 +3664,11 @@ _Bool fi4b(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[DESIRED]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -3739,12 +3786,11 @@ _Bool fi4c(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[DESIRED]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -3862,12 +3908,11 @@ _Bool fi4_clustr(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[DESIRED]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -3985,12 +4030,11 @@ _Bool fi4d(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr addrspace(5) [[DESIRED]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -4108,12 +4152,11 @@ _Bool fi4e(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
@@ -4230,12 +4273,11 @@ _Bool fi5a(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
@@ -4352,12 +4394,11 @@ _Bool fi5b(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
@@ -4473,12 +4514,11 @@ _Bool fi5c(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
@@ -4594,12 +4634,11 @@ _Bool fi5_clustr(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
@@ -4715,12 +4754,11 @@ _Bool fi5d(int *i) {
 // AMDGCN_CL_20-NEXT:    [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 0, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
@@ -4823,12 +4861,10 @@ _Bool fi5e(int *i) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[RET:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -4899,12 +4935,10 @@ int fi6a(int *c, int *d) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[RET:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -4975,12 +5009,10 @@ int fi6b(int *c, int *d) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[RET:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5051,12 +5083,10 @@ int fi6c(int *c, int *d) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[RET:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5127,12 +5157,10 @@ int fi6_clustr(int *c, int *d) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[RET:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5203,12 +5231,10 @@ int fi6d(int *c, int *d) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[RET:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5279,11 +5305,10 @@ int fi6e(int *c, int *d) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5354,11 +5379,10 @@ _Bool fi7a(_Bool *c) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5429,11 +5453,10 @@ _Bool fi7b(_Bool *c) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5504,11 +5527,10 @@ _Bool fi7c(_Bool *c) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5579,11 +5601,10 @@ _Bool fi7_clustr(_Bool *c) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5654,11 +5675,10 @@ _Bool fi7d(_Bool *c) {
 // AMDGCN_CL_20-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
@@ -5748,29 +5768,27 @@ _Bool fi7e(_Bool *c) {
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN_CL_20-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// AMDGCN_CL_20-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
 // AMDGCN_CL_20-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
 // AMDGCN_CL_20-NEXT:    [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
-// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP3]], ptr [[TMP4]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
 // AMDGCN_CL_20-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
-// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN_CL_20-NEXT:    store i32 [[TMP8]], ptr [[TMP9]], align 4
 // AMDGCN_CL_20-NEXT:    ret void
 //
@@ -5836,3 +5854,10 @@ void fi8a(unsigned int *a, unsigned int *b) {
   *b = __scoped_atomic_fetch_uinc(b, ~0U, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
   *a = __scoped_atomic_fetch_udec(a, ~0U, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
 }
+//.
+// AMDGCN_CL_DEF: [[META2]] = !{}
+//.
+// AMDGCN_CL_20: [[META3]] = !{}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// AMDGCN: {{.*}}
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c
index 259e8d333e4c8..eb49a668c7945 100644
--- a/clang/test/CodeGen/scoped-fence-ops.c
+++ b/clang/test/CodeGen/scoped-fence-ops.c
@@ -30,34 +30,62 @@ void fe1a() {
   __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WRKGRP);
 }
 
-// AMDGCN-LABEL: define hidden void @fe1b(
-// AMDGCN-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr
-// AMDGCN-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// AMDGCN-NEXT:      i32 1, label %[[ACQUIRE:.*]]
-// AMDGCN-NEXT:      i32 2, label %[[ACQUIRE]]
-// AMDGCN-NEXT:      i32 3, label %[[RELEASE:.*]]
-// AMDGCN-NEXT:      i32 4, label %[[ACQREL:.*]]
-// AMDGCN-NEXT:      i32 5, label %[[SEQCST:.*]]
-// AMDGCN-NEXT:    ]
-// AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
-// AMDGCN-NEXT:    ret void
-// AMDGCN:       [[ACQUIRE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") acquire
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[RELEASE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[ACQREL]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") acq_rel
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[SEQCST]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") seq_cst
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF-LABEL: define hidden void @fe1b(
+// AMDGCN_CL_DEF-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN_CL_DEF-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 2, label %[[ACQUIRE]]
+// AMDGCN_CL_DEF-NEXT:      i32 3, label %[[RELEASE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 4, label %[[ACQREL:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 5, label %[[SEQCST:.*]]
+// AMDGCN_CL_DEF-NEXT:    ]
+// AMDGCN_CL_DEF:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT:    ret void
+// AMDGCN_CL_DEF:       [[ACQUIRE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("workgroup") acquire
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[RELEASE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("workgroup") release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[ACQREL]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("workgroup") acq_rel
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[SEQCST]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("workgroup") seq_cst
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fe1b(
+// AMDGCN_CL_20-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    store i32 [[ORD]], ptr addrspace(5) [[ORD_ADDR]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[ORD_ADDR]], align 4
+// AMDGCN_CL_20-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN_CL_20-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 2, label %[[ACQUIRE]]
+// AMDGCN_CL_20-NEXT:      i32 3, label %[[RELEASE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 4, label %[[ACQREL:.*]]
+// AMDGCN_CL_20-NEXT:      i32 5, label %[[SEQCST:.*]]
+// AMDGCN_CL_20-NEXT:    ]
+// AMDGCN_CL_20:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN_CL_20-NEXT:    ret void
+// AMDGCN_CL_20:       [[ACQUIRE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("workgroup") acquire
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[RELEASE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("workgroup") release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[ACQREL]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("workgroup") acq_rel
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[SEQCST]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("workgroup") seq_cst
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 //
 // SPIRV-LABEL: define hidden spir_func void @fe1b(
 // SPIRV-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
@@ -119,41 +147,76 @@ void fe1b(int ord) {
   __scoped_atomic_thread_fence(ord, __MEMORY_SCOPE_WRKGRP);
 }
 
-// AMDGCN-LABEL: define hidden void @fe1c(
-// AMDGCN-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
-// AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// AMDGCN-NEXT:    [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr
-// AMDGCN-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4
-// AMDGCN-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// AMDGCN-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 5, label %[[CLUSTER_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
-// AMDGCN-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
-// AMDGCN-NEXT:    ]
-// AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
-// AMDGCN-NEXT:    ret void
-// AMDGCN:       [[SYSTEM_SCOPE]]:
-// AMDGCN-NEXT:    fence release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[DEVICE_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("agent") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[WORKGROUP_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[CLUSTER_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("cluster") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[WAVEFRONT_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("wavefront") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
-// AMDGCN:       [[SINGLE_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("singlethread") release
-// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF-LABEL: define hidden void @fe1c(
+// AMDGCN_CL_DEF-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT:    [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN_CL_DEF-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 5, label %[[CLUSTER_SCOPE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// AMDGCN_CL_DEF-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// AMDGCN_CL_DEF-NEXT:    ]
+// AMDGCN_CL_DEF:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT:    ret void
+// AMDGCN_CL_DEF:       [[SYSTEM_SCOPE]]:
+// AMDGCN_CL_DEF-NEXT:    fence release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[DEVICE_SCOPE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("agent") release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[WORKGROUP_SCOPE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("workgroup") release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[CLUSTER_SCOPE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("cluster") release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[WAVEFRONT_SCOPE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("wavefront") release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_DEF:       [[SINGLE_SCOPE]]:
+// AMDGCN_CL_DEF-NEXT:    fence syncscope("singlethread") release
+// AMDGCN_CL_DEF-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fe1c(
+// AMDGCN_CL_20-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT:  [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT:    store i32 [[SCOPE]], ptr addrspace(5) [[SCOPE_ADDR]], align 4
+// AMDGCN_CL_20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SCOPE_ADDR]], align 4
+// AMDGCN_CL_20-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN_CL_20-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 5, label %[[CLUSTER_SCOPE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// AMDGCN_CL_20-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// AMDGCN_CL_20-NEXT:    ]
+// AMDGCN_CL_20:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN_CL_20-NEXT:    ret void
+// AMDGCN_CL_20:       [[SYSTEM_SCOPE]]:
+// AMDGCN_CL_20-NEXT:    fence release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[DEVICE_SCOPE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("agent") release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[WORKGROUP_SCOPE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("workgroup") release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[CLUSTER_SCOPE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("cluster") release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[WAVEFRONT_SCOPE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("wavefront") release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN_CL_20:       [[SINGLE_SCOPE]]:
+// AMDGCN_CL_20-NEXT:    fence syncscope("singlethread") release
+// AMDGCN_CL_20-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 //
 // SPIRV-LABEL: define hidden spir_func void @fe1c(
 // SPIRV-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
@@ -269,6 +332,3 @@ void fe2a() {
 void fe2b() {
   __scoped_atomic_thread_fence(__ATOMIC_RELEASE, 999);
 }
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-// AMDGCN_CL_20: {{.*}}
-// AMDGCN_CL_DEF: {{.*}}
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 6000ba07978cb..32ab1372ae591 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -645,10 +645,8 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
-// AMDGCN20-NEXT:    [[IN1_ASCAST:%.*]] = addrspacecast ptr [[IN1]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN20-NEXT:    store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
 // AMDGCN20-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
 // AMDGCN20-NEXT:    ret [[STRUCT_MAT4X4]] [[TMP0]]
 //
@@ -658,12 +656,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -674,13 +670,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
@@ -695,10 +689,8 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local void @foo_large(
 // AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
-// AMDGCN20-NEXT:    [[IN_ASCAST:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -707,12 +699,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -724,13 +714,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
 // AMDGCN20-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
@@ -743,14 +731,12 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
 // AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
 // AMDGCN20-NEXT:    ret void
 //
@@ -758,15 +744,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember(
 // AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
 // AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN20-NEXT:    ret void
@@ -809,12 +793,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
 // AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -823,12 +805,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
 // AMDGCN20-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -837,9 +817,8 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
 // AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
@@ -848,9 +827,8 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
-// AMDGCN20-NEXT:    store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
 // AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
 // AMDGCN20-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
@@ -860,22 +838,18 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
 // AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 //
 // AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember(
 // AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -884,16 +858,14 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
 // AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN20-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
 // AMDGCN20-NEXT:    ret void
 //
@@ -901,15 +873,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember(
 // AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
 // AMDGCN20-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN20-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
 // AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN20-NEXT:    ret void
@@ -918,13 +888,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
 // AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
-// AMDGCN20-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN20-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
 // AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
@@ -934,15 +902,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN20-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
 // AMDGCN20-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
@@ -951,22 +917,18 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
 // AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 //
 // AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember(
 // AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    ret void
 //
 //
diff --git a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
index af50928d8ecf0..a17abbc02b216 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
@@ -15,9 +15,8 @@
 // CL20-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CL20-NEXT:  [[ENTRY:.*:]]
 // CL20-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CL20-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CL20-NEXT:    store ptr [[X]], ptr [[X_ADDR_ASCAST]], align 8
-// CL20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
+// CL20-NEXT:    store ptr [[X]], ptr addrspace(5) [[X_ADDR]], align 8
+// CL20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[X_ADDR]], align 8
 // CL20-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CL20-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index bef560c6f119f..ffeb942b6e0a3 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -119,42 +119,36 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
-// AMDGCN-NEXT:    store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
-// AMDGCN-NEXT:    [[IN1_ASCAST:%.*]] = addrspacecast ptr [[IN1]] to ptr addrspace(5)
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN-NEXT:    store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
 // AMDGCN-NEXT:    ret [[STRUCT_MAT4X4]] [[TMP0]]
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
@@ -169,42 +163,36 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN-LABEL: define dso_local void @foo_large(
 // AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
-// AMDGCN-NEXT:    [[IN_ASCAST:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(5)
+// AMDGCN-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
-// AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
 // AMDGCN-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
@@ -217,14 +205,12 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
 // AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
+// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
 // AMDGCN-NEXT:    ret void
 //
@@ -232,15 +218,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN-LABEL: define dso_local void @FuncOneLargeMember(
 // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 0
+// AMDGCN-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
 // AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN-NEXT:    ret void
@@ -256,14 +240,14 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
-// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local(
-// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
@@ -280,51 +264,45 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
-// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember(
-// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
 // AMDGCN-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir(
-// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
-// AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
 // AMDGCN-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
@@ -332,24 +310,20 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember(
-// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -358,16 +332,14 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
 // AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 1
+// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
 // AMDGCN-NEXT:    ret void
 //
@@ -375,88 +347,78 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 // AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember(
 // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
 // AMDGCN-NEXT:    [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN-NEXT:    store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
-// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 1
+// AMDGCN-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
 // AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 0
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
-// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_ASCAST]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember(
-// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U1_ASCAST]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
 // AMDGCN-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember(
-// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
-// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
-// AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR3]]
+// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT:    ret void
 //
 //.
-// AMDGCN: [[META4]] = !{i32 1, i32 1}
-// AMDGCN: [[META5]] = !{!"none", !"none"}
-// AMDGCN: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
-// AMDGCN: [[META7]] = !{!"", !""}
-// AMDGCN: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
-// AMDGCN: [[META9]] = !{}
-// AMDGCN: [[META10]] = !{i32 0}
-// AMDGCN: [[META11]] = !{!"none"}
-// AMDGCN: [[META12]] = !{!"struct StructOneMember"}
-// AMDGCN: [[META13]] = !{!""}
-// AMDGCN: [[META14]] = !{i32 1}
-// AMDGCN: [[META15]] = !{!"struct StructOneMember*"}
-// AMDGCN: [[META16]] = !{!"struct LargeStructOneMember"}
-// AMDGCN: [[META17]] = !{!"struct StructTwoMember"}
-// AMDGCN: [[META18]] = !{!"struct LargeStructTwoMember"}
+// AMDGCN: [[META3]] = !{i32 1, i32 1}
+// AMDGCN: [[META4]] = !{!"none", !"none"}
+// AMDGCN: [[META5]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN: [[META6]] = !{!"", !""}
+// AMDGCN: [[META7]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN: [[META8]] = !{}
+// AMDGCN: [[META9]] = !{i32 0}
+// AMDGCN: [[META10]] = !{!"none"}
+// AMDGCN: [[META11]] = !{!"struct StructOneMember"}
+// AMDGCN: [[META12]] = !{!""}
+// AMDGCN: [[META13]] = !{i32 1}
+// AMDGCN: [[META14]] = !{!"struct StructOneMember*"}
+// AMDGCN: [[META15]] = !{!"struct LargeStructOneMember"}
+// AMDGCN: [[META16]] = !{!"struct StructTwoMember"}
+// AMDGCN: [[META17]] = !{!"struct LargeStructTwoMember"}
 //.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
index c095001ce898c..c435fad4897cd 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
@@ -7,9 +7,8 @@ kernel void foo(global int *p) { *p = 1; }
 // CHECK-SAME: ptr addrspace(1) noundef align 4 [[P:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META6:![0-9]+]] !kernel_arg_access_qual [[META7:![0-9]+]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META9:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[P]], ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA10:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA10]]
+// CHECK-NEXT:    store ptr addrspace(1) [[P]], ptr addrspace(5) [[P_ADDR]], align 8, !tbaa [[INTPTR_TBAA10:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[P_ADDR]], align 8, !tbaa [[INTPTR_TBAA10]]
 // CHECK-NEXT:    call void @__clang_ocl_kern_imp_foo(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR2:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -19,9 +18,8 @@ kernel void foo(global int *p) { *p = 1; }
 // CHECK-SAME: ptr addrspace(1) noundef align 4 [[P:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META6]] !kernel_arg_access_qual [[META7]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META9]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[P]], ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA10]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA10]]
+// CHECK-NEXT:    store ptr addrspace(1) [[P]], ptr addrspace(5) [[P_ADDR]], align 8, !tbaa [[INTPTR_TBAA10]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[P_ADDR]], align 8, !tbaa [[INTPTR_TBAA10]]
 // CHECK-NEXT:    store i32 1, ptr addrspace(1) [[TMP0]], align 4, !tbaa [[INT_TBAA2:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 7be672ebe2af2..6b94d5b868cec 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -75,13 +75,11 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
 // NOCPU-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// NOCPU-NEXT:    [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr
-// NOCPU-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// NOCPU-NEXT:    store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8
 // NOCPU-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
 // NOCPU-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8
 // NOCPU-NEXT:    ret void
@@ -95,18 +93,14 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
 // NOCPU-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // NOCPU-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
-// NOCPU-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// NOCPU-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// NOCPU-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// NOCPU-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP1:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP3:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
 // NOCPU-NEXT:    call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]]
 // NOCPU-NEXT:    ret void
 //
@@ -132,27 +126,19 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
 // NOCPU-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// NOCPU-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// NOCPU-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// NOCPU-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// NOCPU-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// NOCPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
 // NOCPU-NEXT:    [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// NOCPU-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP2]] to ptr
 // NOCPU-NEXT:    [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// NOCPU-NEXT:    [[TMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP11]] to ptr
 // NOCPU-NEXT:    [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
 // NOCPU-NEXT:    [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr
 // NOCPU-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// NOCPU-NEXT:    [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr
-// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1
-// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// NOCPU-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1
+// NOCPU-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8
+// NOCPU-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8
 // NOCPU-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
 // NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
 // NOCPU-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
 // NOCPU-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0
 // NOCPU-NEXT:    store i32 25, ptr [[BLOCK_SIZE]], align 8
 // NOCPU-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1
@@ -160,15 +146,15 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 2
 // NOCPU-NEXT:    store ptr @__test_block_invoke, ptr [[BLOCK_INVOKE]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
 // NOCPU-NEXT:    store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
+// NOCPU-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
 // NOCPU-NEXT:    store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8
 // NOCPU-NEXT:    [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]])
 // NOCPU-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
 // NOCPU-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
 // NOCPU-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0
 // NOCPU-NEXT:    store i32 41, ptr [[BLOCK_SIZE4]], align 8
 // NOCPU-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1
@@ -176,21 +162,21 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 2
 // NOCPU-NEXT:    store ptr @__test_block_invoke_2, ptr [[BLOCK_INVOKE6]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
 // NOCPU-NEXT:    store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 6
-// NOCPU-NEXT:    [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
+// NOCPU-NEXT:    [[TMP8:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
 // NOCPU-NEXT:    store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
 // NOCPU-NEXT:    store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
-// NOCPU-NEXT:    [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP10:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
 // NOCPU-NEXT:    store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8
 // NOCPU-NEXT:    [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]])
 // NOCPU-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
 // NOCPU-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
 // NOCPU-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0
 // NOCPU-NEXT:    store i32 41, ptr [[BLOCK_SIZE13]], align 8
 // NOCPU-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1
@@ -198,16 +184,16 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 2
 // NOCPU-NEXT:    store ptr @__test_block_invoke_3, ptr [[BLOCK_INVOKE15]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
 // NOCPU-NEXT:    store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 6
-// NOCPU-NEXT:    [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
+// NOCPU-NEXT:    [[TMP15:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
 // NOCPU-NEXT:    store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
 // NOCPU-NEXT:    store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
-// NOCPU-NEXT:    [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP17:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
 // NOCPU-NEXT:    store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8
 // NOCPU-NEXT:    [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
 // NOCPU-NEXT:    store i64 100, ptr addrspace(5) [[TMP18]], align 8
@@ -219,15 +205,15 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 2
 // NOCPU-NEXT:    store ptr @__test_block_invoke_4, ptr [[BLOCK_INVOKE24]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 3
-// NOCPU-NEXT:    [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP20:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
 // NOCPU-NEXT:    store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 4
-// NOCPU-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
 // NOCPU-NEXT:    store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8
 // NOCPU-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8
 // NOCPU-NEXT:    [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
 // NOCPU-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
 // NOCPU-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8
 // NOCPU-NEXT:    [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]])
 // NOCPU-NEXT:    ret void
@@ -238,9 +224,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META6:![0-9]+]] !kernel_arg_access_qual [[META7:![0-9]+]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META9:![0-9]+]] {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// NOCPU-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
+// NOCPU-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[I_ADDR]], align 8
 // NOCPU-NEXT:    call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR8]]
 // NOCPU-NEXT:    ret void
 //
@@ -254,14 +239,12 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
 // NOCPU-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
 // NOCPU-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// NOCPU-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// NOCPU-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
 // NOCPU-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4
 // NOCPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
 // NOCPU-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
 // NOCPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
-// NOCPU-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
+// NOCPU-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
 // NOCPU-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
 // NOCPU-NEXT:    ret void
 //
@@ -272,9 +255,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
 // NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
@@ -302,9 +284,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
 // NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
@@ -339,11 +320,9 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// NOCPU-NEXT:    [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr
 // NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// NOCPU-NEXT:    store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// NOCPU-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4
 // NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
@@ -357,7 +336,7 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
 // NOCPU-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
 // NOCPU-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
-// NOCPU-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4
+// NOCPU-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4
 // NOCPU-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
 // NOCPU-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4
 // NOCPU-NEXT:    ret void
@@ -380,9 +359,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
 // NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
 // NOCPU-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
 // NOCPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8
@@ -409,9 +387,8 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// NOCPU-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
 // NOCPU-NEXT:    [[BLOCK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_ADDR]] to ptr
-// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // NOCPU-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR_ASCAST]], align 8
 // NOCPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
 // NOCPU-NEXT:    ret void
@@ -449,13 +426,11 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
 // GFX900-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// GFX900-NEXT:    [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr
-// GFX900-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// GFX900-NEXT:    store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6:![0-9]+]]
-// GFX900-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8:![0-9]+]]
-// GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
-// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
-// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[LONG_TBAA6:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8:![0-9]+]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(5) [[ID_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
 // GFX900-NEXT:    store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    ret void
@@ -469,18 +444,14 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
 // GFX900-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // GFX900-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
-// GFX900-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// GFX900-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// GFX900-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// GFX900-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA15:![0-9]+]]
-// GFX900-NEXT:    store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA17:![0-9]+]]
-// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
-// GFX900-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
-// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA15]]
-// GFX900-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA17]]
-// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
-// GFX900-NEXT:    [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15:![0-9]+]]
+// GFX900-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17:![0-9]+]]
+// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    [[TMP3:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR7:[0-9]+]]
 // GFX900-NEXT:    ret void
 //
@@ -506,30 +477,22 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
 // GFX900-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
 // GFX900-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// GFX900-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// GFX900-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// GFX900-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// GFX900-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// GFX900-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
 // GFX900-NEXT:    [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// GFX900-NEXT:    [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP2]] to ptr
 // GFX900-NEXT:    [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// GFX900-NEXT:    [[TMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP11]] to ptr
 // GFX900-NEXT:    [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
 // GFX900-NEXT:    [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr
 // GFX900-NEXT:    [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// GFX900-NEXT:    [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA15]]
-// GFX900-NEXT:    store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA17]]
-// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
-// GFX900-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]]
+// GFX900-NEXT:    store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]]
+// GFX900-NEXT:    store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8:[0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
 // GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2:![0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
 // GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA18:![0-9]+]]
 // GFX900-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20:![0-9]+]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20:![0-9]+]]
 // GFX900-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0
 // GFX900-NEXT:    store i32 25, ptr [[BLOCK_SIZE]], align 8
 // GFX900-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1
@@ -537,15 +500,15 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 2
 // GFX900-NEXT:    store ptr @__test_block_invoke, ptr [[BLOCK_INVOKE]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA15]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8, !tbaa [[CHARPTR_TBAA15]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA17]]
+// GFX900-NEXT:    [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]])
 // GFX900-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA18]]
 // GFX900-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
 // GFX900-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0
 // GFX900-NEXT:    store i32 41, ptr [[BLOCK_SIZE4]], align 8
 // GFX900-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1
@@ -553,21 +516,21 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 2
 // GFX900-NEXT:    store ptr @__test_block_invoke_2, ptr [[BLOCK_INVOKE6]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA15]]
+// GFX900-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8, !tbaa [[CHARPTR_TBAA15]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 6
-// GFX900-NEXT:    [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA17]]
+// GFX900-NEXT:    [[TMP8:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
-// GFX900-NEXT:    [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    [[TMP10:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]])
 // GFX900-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA18]]
 // GFX900-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
 // GFX900-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0
 // GFX900-NEXT:    store i32 41, ptr [[BLOCK_SIZE13]], align 8
 // GFX900-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1
@@ -575,16 +538,16 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 2
 // GFX900-NEXT:    store ptr @__test_block_invoke_3, ptr [[BLOCK_INVOKE15]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA15]]
+// GFX900-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8, !tbaa [[CHARPTR_TBAA15]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 6
-// GFX900-NEXT:    [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA17]]
+// GFX900-NEXT:    [[TMP15:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
-// GFX900-NEXT:    [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    [[TMP17:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
 // GFX900-NEXT:    [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
@@ -599,15 +562,15 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 2
 // GFX900-NEXT:    store ptr @__test_block_invoke_4, ptr [[BLOCK_INVOKE24]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 3
-// GFX900-NEXT:    [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA6]]
+// GFX900-NEXT:    [[TMP20:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 4
-// GFX900-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA8]]
+// GFX900-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA18]]
 // GFX900-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
 // GFX900-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]])
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
@@ -622,9 +585,8 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META21:![0-9]+]] !kernel_arg_access_qual [[META22:![0-9]+]] !kernel_arg_type [[META23:![0-9]+]] !kernel_arg_base_type [[META23]] !kernel_arg_type_qual [[META24:![0-9]+]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// GFX900-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA25:![0-9]+]]
-// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA25]]
+// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[INTPTR_TBAA25:![0-9]+]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[INTPTR_TBAA25]]
 // GFX900-NEXT:    call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR7]]
 // GFX900-NEXT:    ret void
 //
@@ -638,9 +600,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
 // GFX900-NEXT:    [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
 // GFX900-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-// GFX900-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
-// GFX900-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA25]]
+// GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[INTPTR_TBAA25]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
 // GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2]]
@@ -648,7 +608,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
 // GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA18]]
 // GFX900-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA2]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
+// GFX900-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT20]]
 // GFX900-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
@@ -661,8 +621,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
 // GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -688,8 +647,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -722,10 +680,8 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // GFX900-NEXT:    [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
-// GFX900-NEXT:    store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[ANYPTR_TBAA31:![0-9]+]]
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
+// GFX900-NEXT:    store ptr addrspace(3) [[LP]], ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[ANYPTR_TBAA31:![0-9]+]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6
 // GFX900-NEXT:    [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA17]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
@@ -738,7 +694,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA8]]
 // GFX900-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
 // GFX900-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA6]]
-// GFX900-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[ANYPTR_TBAA31]]
+// GFX900-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[ANYPTR_TBAA31]]
 // GFX900-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
 // GFX900-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[INT_TBAA2]]
 // GFX900-NEXT:    ret void
@@ -760,8 +716,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
 // GFX900-NEXT:    [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[LONG_TBAA6]]
 // GFX900-NEXT:    [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
@@ -786,8 +741,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// GFX900-NEXT:    [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr
-// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8
+// GFX900-NEXT:    store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
 // GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
 // GFX900-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/atomic-ops.cl b/clang/test/CodeGenOpenCL/atomic-ops.cl
index db2cb571b0d8f..28d1f572421f6 100644
--- a/clang/test/CodeGenOpenCL/atomic-ops.cl
+++ b/clang/test/CodeGenOpenCL/atomic-ops.cl
@@ -344,10 +344,9 @@ int test_volatile(volatile atomic_int *i) {
   // CHECK-LABEL: @test_volatile
   // CHECK:      %[[i_addr:.*]] = alloca ptr
   // CHECK-NEXT: %[[atomicdst:.*]] = alloca i32
-  // CHECK-NEXT: %[[i_addr_ascast:.*]] = addrspacecast ptr addrspace(5) %[[i_addr]] to ptr
   // CHECK-NEXT: %[[atomicdst_ascast:.*]] = addrspacecast ptr addrspace(5) %[[atomicdst]] to ptr
-  // CHECK-NEXT: store ptr %i, ptr %[[i_addr_ascast]]
-  // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr %[[i_addr_ascast]]
+  // CHECK-NEXT: store ptr %i, ptr addrspace(5) %[[i_addr]]
+  // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr addrspace(5) %[[i_addr]]
   // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4{{$}}
   // CHECK-NEXT: store i32 %[[res]], ptr %[[atomicdst_ascast]]
   // CHECK-NEXT: %[[retval:.*]] = load i32, ptr %[[atomicdst_ascast]]
diff --git a/clang/test/CodeGenOpenCL/builtins-alloca.cl b/clang/test/CodeGenOpenCL/builtins-alloca.cl
index 51da8e3b3badb..01f12788342d9 100644
--- a/clang/test/CodeGenOpenCL/builtins-alloca.cl
+++ b/clang/test/CodeGenOpenCL/builtins-alloca.cl
@@ -38,9 +38,8 @@
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
@@ -65,9 +64,8 @@
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
@@ -108,9 +106,8 @@ void test1_builtin_alloca(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
@@ -135,9 +132,8 @@ void test1_builtin_alloca(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
@@ -178,9 +174,8 @@ void test1_builtin_alloca_uninitialized(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
@@ -205,9 +200,8 @@ void test1_builtin_alloca_uninitialized(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
@@ -248,9 +242,8 @@ void test1_builtin_alloca_with_align(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
@@ -275,9 +268,8 @@ void test1_builtin_alloca_with_align(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
@@ -316,9 +308,8 @@ void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
 // OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
@@ -341,9 +332,8 @@ void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
 // OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
@@ -381,9 +371,8 @@ void test2_builtin_alloca(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
 // OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
@@ -406,9 +395,8 @@ void test2_builtin_alloca(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
 // OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
@@ -446,9 +434,8 @@ void test2_builtin_alloca_uninitialized(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
 // OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
@@ -471,9 +458,8 @@ void test2_builtin_alloca_uninitialized(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
 // OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
@@ -511,9 +497,8 @@ void test2_builtin_alloca_with_align(unsigned n) {
 // OPENCL20-NEXT:  [[ENTRY:.*:]]
 // OPENCL20-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL20-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL20-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL20-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL20-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL20-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL20-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL20-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
 // OPENCL20-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
@@ -536,9 +521,8 @@ void test2_builtin_alloca_with_align(unsigned n) {
 // OPENCL30GAS-NEXT:  [[ENTRY:.*:]]
 // OPENCL30GAS-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // OPENCL30GAS-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// OPENCL30GAS-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
-// OPENCL30GAS-NEXT:    store i32 [[N]], ptr [[N_ADDR_ASCAST]], align 4
-// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
+// OPENCL30GAS-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL30GAS-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
 // OPENCL30GAS-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
 // OPENCL30GAS-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
 // OPENCL30GAS-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 14d7e7a365989..332a2fa94ee92 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -8,9 +8,8 @@ typedef unsigned int uint;
 // CHECK-LABEL: @test_s_sleep_var(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[TMP0]])
 // CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 15)
 // CHECK-NEXT:    ret void
@@ -27,19 +26,15 @@ void test_s_sleep_var(int d)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -53,19 +48,15 @@ void test_permlane16_var(global uint* out, uint a, uint b, uint c) {
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlanex16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -89,13 +80,11 @@ void test_s_barrier_signal()
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
 // CHECK-NEXT:    ret void
 //
@@ -109,21 +98,18 @@ void test_s_barrier_signal_var(void *bar, int a)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
 // CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 // CHECK:       if.then:
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[TMP1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
 // CHECK-NEXT:    br label [[IF_END:%.*]]
 // CHECK:       if.else:
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
 // CHECK-NEXT:    br label [[IF_END]]
 // CHECK:       if.end:
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
@@ -143,13 +129,11 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
 // CHECK-NEXT:    ret void
 //
@@ -161,9 +145,8 @@ void test_s_barrier_init(void *bar, int a)
 // CHECK-LABEL: @test_s_barrier_join(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
 // CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
 // CHECK-NEXT:    ret void
@@ -187,9 +170,8 @@ void test_s_barrier_leave()
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
 // CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
@@ -205,9 +187,8 @@ unsigned test_s_get_barrier_state(int a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4
@@ -246,20 +227,16 @@ void test_s_ttracedata_imm()
 // CHECK-NEXT:    [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
 // CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[FP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FP_ADDR]] to ptr
-// CHECK-NEXT:    [[GP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GP_ADDR]] to ptr
-// CHECK-NEXT:    [[CP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CP_ADDR]] to ptr
-// CHECK-NEXT:    [[LEN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LEN_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[FP:%.*]], ptr [[FP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[GP:%.*]], ptr [[GP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(4) [[CP:%.*]], ptr [[CP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr [[LEN_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[FP_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
 // CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GP_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[LEN_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr [[CP_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8
 // CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
 // CHECK-NEXT:    ret void
 //
@@ -274,14 +251,12 @@ void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
 // CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[RSRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RSRC_ADDR]] to ptr
-// CHECK-NEXT:    [[LEN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LEN_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(8) [[RSRC:%.*]], ptr [[RSRC_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr [[LEN_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[LEN_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(8), ptr [[RSRC_ADDR_ASCAST]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
 // CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
 // CHECK-NEXT:    ret void
 //
@@ -296,16 +271,13 @@ void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.ds.bpermute.fi.b32(i32 [[TMP0]], i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 0b4cdd0c2c28f..c40172a0d7fcd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -65,19 +65,15 @@ void test_s_wait_tensorcnt() {
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -91,19 +87,15 @@ void test_bitop3_b32(global uint* out, uint a, uint b, uint c) {
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    store i16 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    store i16 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[C_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    store i16 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 2
+// CHECK-NEXT:    store i16 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(5) [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(5) [[C_ADDR]], align 2
 // CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -115,13 +107,11 @@ void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -133,13 +123,11 @@ void test_prng_b32(global uint* out, uint a) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -152,14 +140,12 @@ void test_tanh_f32(global float* out, float a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2
 // CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -172,13 +158,11 @@ void test_tanh_f16(global half* out, global half* a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.tanh.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -191,13 +175,11 @@ void test_tanh_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rcp.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -210,13 +192,11 @@ void test_rcp_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -229,13 +209,11 @@ void test_sqrt_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -248,13 +226,11 @@ void test_rsq_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -267,13 +243,11 @@ void test_log_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -286,13 +260,11 @@ void test_exp2_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sin.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -305,13 +277,11 @@ void test_sin_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store bfloat [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load bfloat, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call bfloat @llvm.amdgcn.cos.bf16(bfloat [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -326,19 +296,15 @@ void test_cos_bf16(global __bf16* out, __bf16 a)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
 // CHECK-NEXT:    [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store float [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SR:%.*]], ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float [[TMP0]], float [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x bfloat> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -353,19 +319,15 @@ void test_cvt_sr_pk_bf16_f32(global bfloat2* out, float a, float b, uint sr)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
 // CHECK-NEXT:    [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store float [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SR:%.*]], ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float [[TMP0]], float [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -378,28 +340,26 @@ void test_cvt_sr_pk_f16_f32(global half2* out, float a, float b, uint sr)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP0]], i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0
 // CHECK-NEXT:    store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP3]], i32 1)
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1
 // CHECK-NEXT:    store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP6]], i32 2)
-// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2
 // CHECK-NEXT:    store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP9]], i32 3)
-// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3
 // CHECK-NEXT:    store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2
 // CHECK-NEXT:    ret void
@@ -416,28 +376,26 @@ void test_cvt_f16_fp8(global half* out, int a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP0]], i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0
 // CHECK-NEXT:    store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP3]], i32 1)
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1
 // CHECK-NEXT:    store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP6]], i32 2)
-// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2
 // CHECK-NEXT:    store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP9]], i32 3)
-// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3
 // CHECK-NEXT:    store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2
 // CHECK-NEXT:    ret void
@@ -454,13 +412,11 @@ void test_cvt_f16_bf8(global half* out, int a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0
 // CHECK-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4
 // CHECK-NEXT:    ret void
@@ -474,13 +430,11 @@ void test_cvt_pk_f16_fp8(global half2* out, short a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2
 // CHECK-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0
 // CHECK-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4
 // CHECK-NEXT:    ret void
@@ -494,13 +448,11 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x half> [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -513,13 +465,11 @@ void test_cvt_pk_bf8_f16(global short* out, half2 a)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x half> [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -534,37 +484,33 @@ void test_cvt_pk_fp8_f16(global short* out, half2 a)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
 // CHECK-NEXT:    [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
-// CHECK-NEXT:    [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store half [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    store i32 [[SR:%.*]], ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1)
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2)
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
-// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3)
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -582,41 +528,37 @@ void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
 // CHECK-NEXT:    [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
 // CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
-// CHECK-NEXT:    [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store half [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    store i16 [[SR:%.*]], ptr addrspace(5) [[SR_ADDR]], align 2
+// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(5) [[SR_ADDR]], align 2
 // CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i32
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr addrspace(5) [[SR_ADDR]], align 2
 // CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[TMP6]] to i32
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1)
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr addrspace(5) [[SR_ADDR]], align 2
 // CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
-// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2)
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
-// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
-// CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr addrspace(5) [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr addrspace(5) [[SR_ADDR]], align 2
 // CHECK-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP16]] to i32
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3)
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -641,102 +583,91 @@ void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
 // CHECK-NEXT:    [[SRC3_ADDR:%.*]] = alloca <3 x i32>, align 16, addrspace(5)
 // CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUTH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH8_ADDR]] to ptr
-// CHECK-NEXT:    [[OUTY8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY8_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
-// CHECK-NEXT:    [[OUTF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF32_ADDR]] to ptr
-// CHECK-NEXT:    [[OUTF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF8_ADDR]] to ptr
-// CHECK-NEXT:    [[OUTH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH16_ADDR]] to ptr
-// CHECK-NEXT:    [[OUTY16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY16_ADDR]] to ptr
-// CHECK-NEXT:    [[OUTF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF16_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC3_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
-// CHECK-NEXT:    [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTH8:%.*]], ptr [[OUTH8_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTY8:%.*]], ptr [[OUTY8_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x i32> [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTF32:%.*]], ptr [[OUTF32_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTF8:%.*]], ptr [[OUTF8_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTH16:%.*]], ptr [[OUTH16_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTY16:%.*]], ptr [[OUTY16_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[OUTF16:%.*]], ptr [[OUTF16_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <3 x i32> [[SRC3:%.*]], ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTH8:%.*]], ptr addrspace(5) [[OUTH8_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTY8:%.*]], ptr addrspace(5) [[OUTY8_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i32> [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTF32:%.*]], ptr addrspace(5) [[OUTF32_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTF8:%.*]], ptr addrspace(5) [[OUTF8_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTH16:%.*]], ptr addrspace(5) [[OUTH16_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTY16:%.*]], ptr addrspace(5) [[OUTY16_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUTF16:%.*]], ptr addrspace(5) [[OUTF16_ADDR]], align 8
+// CHECK-NEXT:    store <3 x i32> [[SRC3:%.*]], ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> [[TMP0]], i32 [[TMP1]], i32 4)
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTH8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> [[TMP4]], i32 [[TMP5]], i32 5)
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTY8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> [[TMP8]], i32 [[TMP9]], i32 6)
-// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTH8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i32>, ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP14:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> [[TMP12]], i32 [[TMP13]], i32 7)
-// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTY8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 [[TMP16]], i32 [[TMP17]], i32 1)
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTH8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x half> [[TMP18]], ptr addrspace(1) [[TMP19]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP22:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 [[TMP20]], i32 [[TMP21]], i32 2)
-// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTY8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x bfloat> [[TMP22]], ptr addrspace(1) [[TMP23]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP24:%.*]] = load <2 x i32>, ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP26:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> [[TMP24]], i32 [[TMP25]], i32 5)
-// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTF8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x float> [[TMP26]], ptr addrspace(1) [[TMP27]], align 32
-// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP28:%.*]] = load <2 x i32>, ptr addrspace(5) [[SRC2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> [[TMP28]], i32 [[TMP29]], i32 6)
-// CHECK-NEXT:    [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTF8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x float> [[TMP30]], ptr addrspace(1) [[TMP31]], align 32
-// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7)
-// CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTF8_ADDR]], align 8
 // CHECK-NEXT:    store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32
-// CHECK-NEXT:    [[TMP36:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP36:%.*]] = load <3 x i32>, ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP38:%.*]] = call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> [[TMP36]], i32 [[TMP37]], i32 0)
-// CHECK-NEXT:    [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUTH16_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP39:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTH16_ADDR]], align 8
 // CHECK-NEXT:    store <16 x half> [[TMP38]], ptr addrspace(1) [[TMP39]], align 32
-// CHECK-NEXT:    [[TMP40:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP40:%.*]] = load <3 x i32>, ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP42:%.*]] = call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> [[TMP40]], i32 [[TMP41]], i32 1)
-// CHECK-NEXT:    [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUTY16_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTY16_ADDR]], align 8
 // CHECK-NEXT:    store <16 x bfloat> [[TMP42]], ptr addrspace(1) [[TMP43]], align 32
-// CHECK-NEXT:    [[TMP44:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP44:%.*]] = load <3 x i32>, ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP46:%.*]] = call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> [[TMP44]], i32 [[TMP45]], i32 2)
-// CHECK-NEXT:    [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUTH16_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTH16_ADDR]], align 8
 // CHECK-NEXT:    store <16 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 32
-// CHECK-NEXT:    [[TMP48:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP49:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP48:%.*]] = load <3 x i32>, ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> [[TMP48]], i32 [[TMP49]], i32 3)
-// CHECK-NEXT:    [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUTY16_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP51:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTY16_ADDR]], align 8
 // CHECK-NEXT:    store <16 x bfloat> [[TMP50]], ptr addrspace(1) [[TMP51]], align 32
-// CHECK-NEXT:    [[TMP52:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP53:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP52:%.*]] = load <3 x i32>, ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    [[TMP53:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP54:%.*]] = call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> [[TMP52]], i32 [[TMP53]], i32 3)
-// CHECK-NEXT:    [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUTF16_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP55:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTF16_ADDR]], align 8
 // CHECK-NEXT:    store <16 x float> [[TMP54]], ptr addrspace(1) [[TMP55]], align 64
-// CHECK-NEXT:    [[TMP56:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP56:%.*]] = load <3 x i32>, ptr addrspace(5) [[SRC3_ADDR]], align 16
+// CHECK-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP58:%.*]] = call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> [[TMP56]], i32 [[TMP57]], i32 4)
-// CHECK-NEXT:    [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUTF16_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP59:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUTF16_ADDR]], align 8
 // CHECK-NEXT:    store <16 x float> [[TMP58]], ptr addrspace(1) [[TMP59]], align 64
 // CHECK-NEXT:    ret void
 //
@@ -775,100 +706,90 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
 // CHECK-NEXT:    [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
 // CHECK-NEXT:    [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr
-// CHECK-NEXT:    [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(5) [[OUT2_ADDR]], align 8
+// CHECK-NEXT:    store <8 x bfloat> [[SRCBF8:%.*]], ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[SRCH8:%.*]], ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    store <8 x float> [[SRCF8:%.*]], ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT3:%.*]], ptr addrspace(5) [[OUT3_ADDR]], align 8
+// CHECK-NEXT:    store <16 x bfloat> [[SRCBF16:%.*]], ptr addrspace(5) [[SRCBF16_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[SRCH16:%.*]], ptr addrspace(5) [[SRCH16_ADDR]], align 32
+// CHECK-NEXT:    store <16 x float> [[SRCF16:%.*]], ptr addrspace(5) [[SRCF16_ADDR]], align 64
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(5) [[OUT1_ADDR]], align 8
+// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], float [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x bfloat>, ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> [[TMP4]], float [[TMP5]])
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> [[TMP8]], float [[TMP9]])
-// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP14:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> [[TMP12]], float [[TMP13]])
-// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load <8 x float>, ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> [[TMP16]], float [[TMP17]])
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x float>, ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> [[TMP20]], float [[TMP21]])
-// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
-// CHECK-NEXT:    [[TMP24:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP24:%.*]] = load <8 x float>, ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> [[TMP24]], float [[TMP25]])
-// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT1_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
-// CHECK-NEXT:    [[TMP28:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP28:%.*]] = load <8 x half>, ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> [[TMP28]], float [[TMP29]])
-// CHECK-NEXT:    [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT1_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP30]], ptr addrspace(1) [[TMP31]], align 4
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x bfloat>, ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]])
-// CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT1_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
-// CHECK-NEXT:    [[TMP36:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP36:%.*]] = load <16 x bfloat>, ptr addrspace(5) [[SRCBF16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP38:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> [[TMP36]], float [[TMP37]])
-// CHECK-NEXT:    [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP39:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP41:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP40:%.*]] = load <16 x half>, ptr addrspace(5) [[SRCH16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP41:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP42:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> [[TMP40]], float [[TMP41]])
-// CHECK-NEXT:    [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP42]], ptr addrspace(1) [[TMP43]], align 16
-// CHECK-NEXT:    [[TMP44:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP45:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP44:%.*]] = load <16 x bfloat>, ptr addrspace(5) [[SRCBF16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP46:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> [[TMP44]], float [[TMP45]])
-// CHECK-NEXT:    [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP46]], ptr addrspace(1) [[TMP47]], align 16
-// CHECK-NEXT:    [[TMP48:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP49:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP48:%.*]] = load <16 x half>, ptr addrspace(5) [[SRCH16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP49:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> [[TMP48]], float [[TMP49]])
-// CHECK-NEXT:    [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP51:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP50]], ptr addrspace(1) [[TMP51]], align 16
-// CHECK-NEXT:    [[TMP52:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
-// CHECK-NEXT:    [[TMP53:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP52:%.*]] = load <16 x float>, ptr addrspace(5) [[SRCF16_ADDR]], align 64
+// CHECK-NEXT:    [[TMP53:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP54:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> [[TMP52]], float [[TMP53]])
-// CHECK-NEXT:    [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP55:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP54]], ptr addrspace(1) [[TMP55]], align 16
-// CHECK-NEXT:    [[TMP56:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
-// CHECK-NEXT:    [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP56:%.*]] = load <16 x float>, ptr addrspace(5) [[SRCF16_ADDR]], align 64
+// CHECK-NEXT:    [[TMP57:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> [[TMP56]], float [[TMP57]])
-// CHECK-NEXT:    [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP59:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
 // CHECK-NEXT:    ret void
 //
@@ -906,117 +827,106 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
 // CHECK-NEXT:    [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr
-// CHECK-NEXT:    [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr
-// CHECK-NEXT:    [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
-// CHECK-NEXT:    [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(5) [[OUT2_ADDR]], align 8
+// CHECK-NEXT:    store <8 x bfloat> [[SRCBF8:%.*]], ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[SRCH8:%.*]], ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    store <8 x float> [[SRCF8:%.*]], ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT3:%.*]], ptr addrspace(5) [[OUT3_ADDR]], align 8
+// CHECK-NEXT:    store <16 x bfloat> [[SRCBF16:%.*]], ptr addrspace(5) [[SRCBF16_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[SRCH16:%.*]], ptr addrspace(5) [[SRCH16_ADDR]], align 32
+// CHECK-NEXT:    store <16 x float> [[SRCF16:%.*]], ptr addrspace(5) [[SRCF16_ADDR]], align 64
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(5) [[OUT1_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SR:%.*]], ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x bfloat>, ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> [[TMP5]], i32 [[TMP6]], float [[TMP7]])
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP13:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> [[TMP10]], i32 [[TMP11]], float [[TMP12]])
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 8
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x half>, ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> [[TMP15]], i32 [[TMP16]], float [[TMP17]])
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP20:%.*]] = load <8 x float>, ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP23:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> [[TMP20]], i32 [[TMP21]], float [[TMP22]])
-// CHECK-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x float>, ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP28:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]])
-// CHECK-NEXT:    [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP32:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP30:%.*]] = load <8 x float>, ptr addrspace(5) [[SRCF8_ADDR]], align 32
+// CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP32:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> [[TMP30]], i32 [[TMP31]], float [[TMP32]])
-// CHECK-NEXT:    [[TMP34:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT1_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP33]], ptr addrspace(1) [[TMP34]], align 4
-// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP35:%.*]] = load <8 x half>, ptr addrspace(5) [[SRCH8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> [[TMP35]], i32 [[TMP36]], float [[TMP37]])
-// CHECK-NEXT:    [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP39:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT1_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP38]], ptr addrspace(1) [[TMP39]], align 4
-// CHECK-NEXT:    [[TMP40:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP42:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP40:%.*]] = load <8 x bfloat>, ptr addrspace(5) [[SRCBF8_ADDR]], align 16
+// CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP42:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
-// CHECK-NEXT:    [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP44:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT1_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
-// CHECK-NEXT:    [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP45:%.*]] = load <16 x bfloat>, ptr addrspace(5) [[SRCBF16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP46:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP47:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]])
-// CHECK-NEXT:    [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP49:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP50:%.*]] = load <16 x half>, ptr addrspace(5) [[SRCH16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP51:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP52:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]])
-// CHECK-NEXT:    [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP54:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16
-// CHECK-NEXT:    [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP55:%.*]] = load <16 x bfloat>, ptr addrspace(5) [[SRCBF16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP56:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP57:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]])
-// CHECK-NEXT:    [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP59:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
-// CHECK-NEXT:    [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
-// CHECK-NEXT:    [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP60:%.*]] = load <16 x half>, ptr addrspace(5) [[SRCH16_ADDR]], align 32
+// CHECK-NEXT:    [[TMP61:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP62:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]])
-// CHECK-NEXT:    [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP64:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16
-// CHECK-NEXT:    [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
-// CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP65:%.*]] = load <16 x float>, ptr addrspace(5) [[SRCF16_ADDR]], align 64
+// CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP67:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]])
-// CHECK-NEXT:    [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP69:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16
-// CHECK-NEXT:    [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
-// CHECK-NEXT:    [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP70:%.*]] = load <16 x float>, ptr addrspace(5) [[SRCF16_ADDR]], align 64
+// CHECK-NEXT:    [[TMP71:%.*]] = load i32, ptr addrspace(5) [[SR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP72:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
 // CHECK-NEXT:    [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]])
-// CHECK-NEXT:    [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP74:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16
 // CHECK-NEXT:    ret void
 //
@@ -1045,17 +955,15 @@ void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, fl
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i16 [[TMP1]], ptr [[TMP2]], align 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]])
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i16 [[TMP4]], ptr [[TMP5]], align 2
 // CHECK-NEXT:    ret void
 //
@@ -1069,11 +977,9 @@ void test_sat_pk4_i4_i8(ushort *out, uint src)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
 // CHECK-NEXT:    switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [
 // CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
 // CHECK-NEXT:      i32 1, label [[SW_BB1:%.*]]
@@ -1081,21 +987,21 @@ void test_sat_pk4_i4_i8(ushort *out, uint src)
 // CHECK-NEXT:    ]
 // CHECK:       sw.bb:
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.id.x()
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
 // CHECK:       sw.bb1:
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.id.y()
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.bb2:
 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.id.z()
-// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.default:
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 0, ptr addrspace(1) [[TMP7]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.epilog:
@@ -1115,11 +1021,9 @@ void test_get_cluster_id(int d, global int *out)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
 // CHECK-NEXT:    switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [
 // CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
 // CHECK-NEXT:      i32 1, label [[SW_BB1:%.*]]
@@ -1127,21 +1031,21 @@ void test_get_cluster_id(int d, global int *out)
 // CHECK-NEXT:    ]
 // CHECK:       sw.bb:
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.x()
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
 // CHECK:       sw.bb1:
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.y()
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.bb2:
 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.z()
-// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.default:
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 0, ptr addrspace(1) [[TMP7]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.epilog:
@@ -1160,10 +1064,9 @@ void test_get_cluster_group_id(int d, global int *out)
 // CHECK-LABEL: @test_cluster_workgroup_flat_id(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.flat.id()
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1176,11 +1079,9 @@ void test_cluster_workgroup_flat_id(global uint *out)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
 // CHECK-NEXT:    switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [
 // CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
 // CHECK-NEXT:      i32 1, label [[SW_BB1:%.*]]
@@ -1188,21 +1089,21 @@ void test_cluster_workgroup_flat_id(global uint *out)
 // CHECK-NEXT:    ]
 // CHECK:       sw.bb:
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x()
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
 // CHECK:       sw.bb1:
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y()
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.bb2:
 // CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z()
-// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.default:
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 0, ptr addrspace(1) [[TMP7]], align 4
 // CHECK-NEXT:    br label [[SW_EPILOG]]
 // CHECK:       sw.epilog:
@@ -1221,10 +1122,9 @@ void test_get_cluster_workgroups_max_id(int d, global int *out)
 // CHECK-LABEL: @test_get_cluster_workgroup_max_flat_id(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.flat.id()
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1238,38 +1138,35 @@ void test_get_cluster_workgroup_max_flat_id(global int *out)
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
 // CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
 // CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
 // CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
 // CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
-// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
 // CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
 // CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
 // CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
 // CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
-// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
 // CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
 // CHECK-NEXT:    [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
 // CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
 // CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
-// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
 // CHECK-NEXT:    ret void
 //
@@ -1285,19 +1182,15 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
 // CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1311,19 +1204,15 @@ void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) {
 // CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1337,19 +1226,15 @@ void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) {
 // CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1363,19 +1248,15 @@ void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) {
 // CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1388,16 +1269,13 @@ void test_permlane_xor(global uint* out, uint src0, uint src1, uint src2) {
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
-// CHECK-NEXT:    [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.permlane.idx.gen(i32 [[TMP0]], i32 [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1415,42 +1293,34 @@ void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
 // CHECK-NEXT:    [[OUT2_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[OUT3_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[OUT4_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[A32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A32_ADDR]] to ptr
-// CHECK-NEXT:    [[A64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A64_ADDR]] to ptr
-// CHECK-NEXT:    [[B32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B32_ADDR]] to ptr
-// CHECK-NEXT:    [[B64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B64_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
-// CHECK-NEXT:    [[OUT4_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT4_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[A32:%.*]], ptr [[A32_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[A64:%.*]], ptr [[A64_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B32:%.*]], ptr [[B32_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B64:%.*]], ptr [[B64_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i32> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr [[OUT4:%.*]], ptr [[OUT4_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B32_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[A32:%.*]], ptr addrspace(5) [[A32_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[A64:%.*]], ptr addrspace(5) [[A64_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B32:%.*]], ptr addrspace(5) [[B32_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B64:%.*]], ptr addrspace(5) [[B64_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i32> [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[OUT2:%.*]], ptr addrspace(5) [[OUT2_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[OUT3:%.*]], ptr addrspace(5) [[OUT3_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[OUT4:%.*]], ptr addrspace(5) [[OUT4_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A32_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B32_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 [[TMP0]], i32 [[TMP1]], <2 x i32> [[TMP2]])
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[OUT2_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(5) [[A32_ADDR]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[B64_ADDR]], align 4
 // CHECK-NEXT:    [[CONV:%.*]] = zext i32 [[TMP6]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP8:%.*]] = call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 [[TMP5]], i64 [[CONV]], <2 x i32> [[TMP7]])
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[OUT3_ADDR]], align 8
 // CHECK-NEXT:    store <3 x i32> [[TMP8]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A64_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[A64_ADDR]], align 4
 // CHECK-NEXT:    [[CONV1:%.*]] = zext i32 [[TMP10]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(5) [[B64_ADDR]], align 4
 // CHECK-NEXT:    [[CONV2:%.*]] = zext i32 [[TMP11]] to i64
-// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i32>, ptr addrspace(5) [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 [[CONV1]], i64 [[CONV2]], <2 x i32> [[TMP12]])
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[OUT4_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[OUT4_ADDR]], align 8
 // CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP14]], align 16
 // CHECK-NEXT:    ret void
 //
@@ -1464,13 +1334,11 @@ void test_perm_pk(uint a32, uint a64, uint b32, uint b64, uint2 c, uint2 *out2,
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// CHECK-NEXT:    [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
-// CHECK-NEXT:    [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[FPTR:%.*]], ptr addrspace(5) [[FPTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[GPTR:%.*]], ptr addrspace(5) [[GPTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FPTR_ADDR]], align 8
 // CHECK-NEXT:    call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GPTR_ADDR]], align 8
 // CHECK-NEXT:    call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
 // CHECK-NEXT:    ret void
 //
@@ -1492,9 +1360,8 @@ void test_s_cluster_barrier()
 // CHECK-LABEL: @test_s_wakeup_barrier(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
 // CHECK-NEXT:    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
 // CHECK-NEXT:    ret void
@@ -1508,13 +1375,11 @@ void test_s_wakeup_barrier(void *bar)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca float, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
+// CHECK-NEXT:    store ptr addrspace(1) [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    store float [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
 // CHECK-NEXT:    ret float [[TMP2]]
 //
 float test_global_add_f32(global float *addr, float x) {
@@ -1525,13 +1390,11 @@ float test_global_add_f32(global float *addr, float x) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT:    store ptr addrspace(1) [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    store <2 x half> [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
 // CHECK-NEXT:    ret <2 x half> [[TMP2]]
 //
 half2 test_global_add_half2(global half2 *addr, half2 x) {
@@ -1542,13 +1405,11 @@ half2 test_global_add_half2(global half2 *addr, half2 x) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT:    store ptr [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    store <2 x half> [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
 // CHECK-NEXT:    ret <2 x half> [[TMP2]]
 //
 half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
@@ -1559,14 +1420,12 @@ half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[X_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
-// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
 // CHECK-NEXT:    ret <2 x i16> [[TMP4]]
 //
@@ -1578,14 +1437,12 @@ short2 test_flat_add_2bf16(generic short2 *addr, short2 x) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[X_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
-// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
 // CHECK-NEXT:    ret <2 x i16> [[TMP4]]
 //
@@ -1597,12 +1454,10 @@ short2 test_global_add_2bf16(global short2 *addr, short2 x) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(3) [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[ADDR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[X_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
 // CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
@@ -1616,12 +1471,10 @@ short2 test_local_add_2f16(local short2 *addr, short2 x) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(3) [[ADDR:%.*]], ptr addrspace(5) [[ADDR_ADDR]], align 4
+// CHECK-NEXT:    store <2 x half> [[X:%.*]], ptr addrspace(5) [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[ADDR_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(5) [[X_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT:    ret <2 x half> [[TMP2]]
 //
@@ -1635,19 +1488,15 @@ half2 test_local_add_2bf16(local half2 *addr, half2 x) {
 // CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    store float [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store float [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 true)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1662,19 +1511,15 @@ void test_cvt_pk_fp8_f32_e5m3(global int* out, int old, float a, float b)
 // CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT:    store float [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 3)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1687,29 +1532,27 @@ void test_cvt_sr_fp8_f32_e5m3(global int* out, int old, float a, int b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP0]], i32 0)
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP1]] to i32
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[TMP2]], align 4
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP3]], i32 1)
 // CHECK-NEXT:    [[CONV1:%.*]] = fptosi float [[TMP4]] to i32
-// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[CONV1]], ptr addrspace(1) [[TMP5]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP6]], i32 2)
 // CHECK-NEXT:    [[CONV2:%.*]] = fptosi float [[TMP7]] to i32
-// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[CONV2]], ptr addrspace(1) [[TMP8]], align 4
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP9]], i32 3)
 // CHECK-NEXT:    [[CONV3:%.*]] = fptosi float [[TMP10]] to i32
-// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[CONV3]], ptr addrspace(1) [[TMP11]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1727,37 +1570,33 @@ void test_cvt_f32_fp8_e5m3(global int* out, int a)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.add.max.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.add.max.u32(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true)
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.add.min.i32(i32 [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i1 false)
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
-// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.add.min.u32(i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i1 true)
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
 // CHECK-NEXT:    ret void
 //
@@ -1779,45 +1618,37 @@ void test_add_min_max(global int *out, int a, int b, int c)
 // CHECK-NEXT:    [[UA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
 // CHECK-NEXT:    [[UB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
 // CHECK-NEXT:    [[UC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
-// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT:    [[UOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UOUT_ADDR]] to ptr
-// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT:    [[UA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UA_ADDR]] to ptr
-// CHECK-NEXT:    [[UB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UB_ADDR]] to ptr
-// CHECK-NEXT:    [[UC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UC_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(1) [[UOUT:%.*]], ptr [[UOUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x i16> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i16> [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i16> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i16> [[UA:%.*]], ptr [[UA_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i16> [[UB:%.*]], ptr [[UB_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    store <2 x i16> [[UC:%.*]], ptr [[UC_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(1) [[UOUT:%.*]], ptr addrspace(5) [[UOUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[UA:%.*]], ptr addrspace(5) [[UA_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[UB:%.*]], ptr addrspace(5) [[UB_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[UC:%.*]], ptr addrspace(5) [[UC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]], i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i16> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
-// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(5) [[UA_ADDR]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(5) [[UB_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(5) [[UC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], <2 x i16> [[TMP7]], i1 true)
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[UOUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i16> [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i16>, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i16>, ptr addrspace(5) [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr addrspace(5) [[C_ADDR]], align 4
 // CHECK-NEXT:    [[TMP13:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> [[TMP10]], <2 x i16> [[TMP11]], <2 x i16> [[TMP12]], i1 false)
-// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i16> [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i16>, ptr addrspace(5) [[UA_ADDR]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i16>, ptr addrspace(5) [[UB_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(5) [[UC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP18:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> [[TMP15]], <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], i1 true)
-// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[UOUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i16> [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl
index e7c81b000a8f0..7dec7b24d5d28 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl
@@ -12,15 +12,13 @@ typedef unsigned char u8;
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 4, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
@@ -33,15 +31,13 @@ void test_global_load_lds_u32(global u32* src, local u32 *dst) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 2, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
@@ -54,15 +50,13 @@ void test_global_load_lds_u16(global u16* src, local u16 *dst) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 1, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
index cc944204446ae..d6b79fc2533e1 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -12,15 +12,13 @@ typedef unsigned char u8;
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.load.async.to.lds.p1(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 4, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
@@ -33,15 +31,13 @@ void test_load_to_lds_u32(global u32* src, local u32 *dst) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.load.async.to.lds.p1(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 2, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
@@ -54,15 +50,13 @@ void test_load_to_lds_u16(global u16* src, local u16 *dst) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
-// CHECK-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
-// CHECK-NEXT:    [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
-// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
 // CHECK-NEXT:    call void @llvm.amdgcn.load.async.to.lds.p1(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 1, i32 0, i32 0)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/check-atomic-alignment.cl b/clang/test/CodeGenOpenCL/check-atomic-alignment.cl
index 1f7e4ce78ea55..66753052cba26 100644
--- a/clang/test/CodeGenOpenCL/check-atomic-alignment.cl
+++ b/clang/test/CodeGenOpenCL/check-atomic-alignment.cl
@@ -26,18 +26,16 @@ struct __half2 {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
-// CHECK-NEXT:    [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
-// CHECK-NEXT:    [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VAL_ADDR]] to ptr
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store <2 x half> [[VAL]], ptr [[VAL_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[VAL_ADDR_ASCAST]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]]
+// CHECK-NEXT:    store ptr [[ADDR]], ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    store <2 x half> [[VAL]], ptr addrspace(5) [[VAL_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(5) [[VAL_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]]
 // CHECK-NEXT:    ret <2 x half> [[TMP2]]
 //
 half2 test_flat_add_2f16(short2 *addr, half2 val) {
   return __builtin_amdgcn_flat_atomic_fadd_v2f16((struct __half2*)addr, val);
 }
 //.
-// CHECK: [[META4]] = !{}
+// CHECK: [[META3]] = !{}
 //.
diff --git a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl
index 4f2a75a76abbb..6b08d366dc539 100644
--- a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl
+++ b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl
@@ -33,7 +33,7 @@ __kernel void use_of_local_var()
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[X]]) #[[ATTR5:[0-9]+]]
-// CHECK-NEXT:    store i32 0, ptr addrspace(5) [[X]], align 4, !tbaa [[INT_TBAA4:![0-9]+]]
+// CHECK-NEXT:    store i32 0, ptr addrspace(5) [[X]], align 4, !tbaa [[INT_TBAA3:![0-9]+]]
 // CHECK-NEXT:    call void @private_ptr(ptr addrspace(5) noundef [[X]]) #[[ATTR6:[0-9]+]]
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
 // CHECK-NEXT:    call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR6]]
@@ -45,32 +45,31 @@ __kernel void use_of_local_var()
 // CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[X]], ptr addrspace(5) [[X_ADDR]], align 4, !tbaa [[INT_TBAA3]]
+// CHECK-NEXT:    call void @private_ptr(ptr addrspace(5) noundef [[X_ADDR]]) #[[ATTR6]]
 // CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT:    store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4, !tbaa [[INT_TBAA4]]
-// CHECK-NEXT:    [[X_ADDR_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5)
-// CHECK-NEXT:    call void @private_ptr(ptr addrspace(5) noundef [[X_ADDR_ASCAST_ASCAST]]) #[[ATTR6]]
 // CHECK-NEXT:    call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 //
 //
 // CHECK-LABEL: define dso_local amdgpu_kernel void @use_of_local_var(
-// CHECK-SAME: ) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// CHECK-SAME: ) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META7]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    call void @__clang_ocl_kern_imp_use_of_local_var() #[[ATTR6]]
 // CHECK-NEXT:    ret void
 //
 //
 // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_use_of_local_var(
-// CHECK-SAME: ) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// CHECK-SAME: ) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META7]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META7]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    call void @local_ptr(ptr addrspace(3) noundef @use_of_local_var.x) #[[ATTR6]]
 // CHECK-NEXT:    call void @generic_ptr(ptr noundef addrspacecast (ptr addrspace(3) @use_of_local_var.x to ptr)) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 //
 //.
-// CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0}
-// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0}
-// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
-// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
-// CHECK: [[META8]] = !{}
+// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META7]] = !{}
 //.
diff --git a/clang/test/Index/pipe-size.cl b/clang/test/Index/pipe-size.cl
index f15bbefb68e7f..08b936f1a9b07 100644
--- a/clang/test/Index/pipe-size.cl
+++ b/clang/test/Index/pipe-size.cl
@@ -11,6 +11,6 @@ __kernel void testPipe( pipe int test )
     // SPIR: store i32 4, ptr %s, align 4
     // SPIR64: store target("spirv.Pipe", 0) %test, ptr %test.addr, align 8
     // SPIR64: store i32 8, ptr %s, align 4
-    // AMDGCN: store ptr addrspace(1) %test, ptr %test{{.*}}, align 8
-    // AMDGCN: store i32 8, ptr addrspace(5) %s{{.*}}, align 4
+    // AMDGCN: store ptr addrspace(1) %test, ptr addrspace(5) %test.addr, align 8
+    // AMDGCN: store i32 8, ptr addrspace(5) %s, align 4
 }

From bbc3c0873d7d53137e4f5bc650043a02214154ac Mon Sep 17 00:00:00 2001
From: Alexey Bataev 
Date: Tue, 12 May 2026 16:54:15 -0400
Subject: [PATCH 511/538] [SLP]Disable reused reductions in revec mode for
 vector scalars

Reused reductions may require some special processing, but currently it
crashes the compiler. Disable reused reductions for vector scalars in
revec mode to fix a crash.

Fixes #196914

Reviewers:

Pull Request: https://github.com/llvm/llvm-project/pull/197291
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  3 +-
 .../AArch64/reused-reduction-revec.ll         | 68 +++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/reused-reduction-revec.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f1a6eb2d7e8af..cb2901418cef2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -28661,7 +28661,8 @@ class HorizontalReduction {
       // original scalar identity operations on matched horizontal reductions).
       IsSupportedHorRdxIdentityOp =
           RK == ReductionOrdering::Unordered && RdxKind != RecurKind::Mul &&
-          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
+          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd &&
+          (!SLPReVec || !S.getMainOp()->getType()->isVectorTy());
       // Gather same values.
       SmallMapVector SameValuesCounter;
       if (IsSupportedHorRdxIdentityOp)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-reduction-revec.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-reduction-revec.ll
new file mode 100644
index 0000000000000..61c8084f5d3e2
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-reduction-revec.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt --passes=slp-vectorizer -S -slp-revec -mtriple=aarch64-pc-windows-gnu < %s | FileCheck %s
+
+define <8 x i64> @test(ptr %0, <8 x i32> %1) {
+; CHECK-LABEL: define <8 x i64> @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], <8 x i32> [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[VECTOR_PH:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 52
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 68
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 36
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <8 x i32> [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <8 x i32> [[TMP7]] to <8 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <8 x i32> [[TMP9]] to <8 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = add <8 x i64> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 20
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <8 x i32> [[TMP1]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <8 x i32> [[TMP14]] to <8 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = add <8 x i64> [[TMP15]], [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add <8 x i64> [[TMP16]], [[TMP10]]
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <8 x i16> [[WIDE_LOAD7]] to <8 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <8 x i32> [[TMP1]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = zext <8 x i32> [[TMP19]] to <8 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = add <8 x i64> [[TMP10]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add <8 x i64> [[TMP11]], [[TMP20]]
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i64> [[TMP17]], [[TMP8]]
+; CHECK-NEXT:    [[BIN_RDX12:%.*]] = add <8 x i64> [[TMP22]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX13:%.*]] = add <8 x i64> [[TMP21]], [[BIN_RDX12]]
+; CHECK-NEXT:    ret <8 x i64> [[BIN_RDX13]]
+;
+vector.ph:
+  %2 = getelementptr i8, ptr %0, i64 52
+  %wide.load3 = load <8 x i16>, ptr %2, align 2
+  %3 = zext <8 x i16> %wide.load3 to <8 x i32>
+  %4 = getelementptr i8, ptr %0, i64 68
+  %wide.load7 = load <8 x i16>, ptr %4, align 2
+  %5 = getelementptr i8, ptr %0, i64 36
+  %wide.load2 = load <8 x i16>, ptr %5, align 2
+  %6 = zext <8 x i16> %wide.load2 to <8 x i32>
+  %7 = mul <8 x i32> %1, %6
+  %8 = zext <8 x i32> %7 to <8 x i64>
+  %9 = mul <8 x i32> %1, %3
+  %10 = zext <8 x i32> %9 to <8 x i64>
+  %11 = add <8 x i64> %8, %10
+  %12 = getelementptr i8, ptr %0, i64 20
+  %wide.load1 = load <8 x i16>, ptr %12, align 2
+  %13 = zext <8 x i16> %wide.load1 to <8 x i32>
+  %14 = mul <8 x i32> %1, %13
+  %15 = zext <8 x i32> %14 to <8 x i64>
+  %16 = add <8 x i64> %15, %8
+  %17 = add <8 x i64> %16, %10
+  %18 = zext <8 x i16> %wide.load7 to <8 x i32>
+  %19 = mul <8 x i32> %1, %18
+  %20 = zext <8 x i32> %19 to <8 x i64>
+  %21 = add <8 x i64> %10, %20
+  %22 = add <8 x i64> %11, %20
+  %bin.rdx = add <8 x i64> %17, %8
+  %bin.rdx12 = add <8 x i64> %22, %bin.rdx
+  %bin.rdx13 = add <8 x i64> %21, %bin.rdx12
+  ret <8 x i64> %bin.rdx13
+}

From a9d11f940560f02a609ba3f766933946c2da3842 Mon Sep 17 00:00:00 2001
From: Victor Chernyakin 
Date: Tue, 12 May 2026 13:56:28 -0700
Subject: [PATCH 512/538] [clang][NFC] Mark CWG730 as implemented and add a
 test (#197186)

[CWG730](https://wg21.link/cwg730) clarifies that it's allowed to
specialize templates that are members of a non-template class. Clang
implements this since 2.7: https://godbolt.org/z/bWzb766rz
---
 clang/test/CXX/drs/cwg7xx.cpp | 10 ++++++++++
 clang/www/cxx_dr_status.html  |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/clang/test/CXX/drs/cwg7xx.cpp b/clang/test/CXX/drs/cwg7xx.cpp
index 09869128f6054..03a1cf1e73469 100644
--- a/clang/test/CXX/drs/cwg7xx.cpp
+++ b/clang/test/CXX/drs/cwg7xx.cpp
@@ -335,6 +335,16 @@ namespace cwg727 { // cwg727: partial
   Collision c; // #cwg727-Collision-int-int
 } // namespace cwg727
 
+namespace cwg730 { // cwg730: 2.7
+struct A {
+  template  struct S {};
+  template  void f() {}
+};
+
+template <> struct A::S {};
+template <> void A::f() {}
+} // namespace cwg730
+
 namespace cwg743 { // cwg743: 3.1
 #if __cplusplus >= 201103L
 struct S {
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 842c91a346c3c..8663e224ee6ce 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -5139,7 +5139,7 @@ 

C++ defect report implementation status

[temp.expl.spec] CD2 Explicit specializations of members of non-template classes - Unknown + Clang 2.7 731 From d48651fd1a7a65747601debd3046e4664ed1d64f Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 12 May 2026 15:57:51 -0500 Subject: [PATCH 513/538] [mlir][AMDGPU] Canonicalize masks on global_load_async_to_lds (#197280) If the mask is always true, remove the mask operand (there are patterns that key off the presence of the lack of a mask operand to know when they can be more aggressive). If the mask is always false, just go ahead and delete the op as it won't write anythig. AI: I described the patterns, Codex 5.5 wrote them --- .../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 1 + mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 25 ++++++++++++++++ mlir/test/Dialect/AMDGPU/canonicalize.mlir | 29 +++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td index 0ec788e21f0bf..7d33ca163fb2f 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td @@ -1413,6 +1413,7 @@ def AMDGPU_GlobalLoadAsyncToLDSOp : attr-dict `:` $transferType `,` type($src) `,` type($dst) }]; let hasVerifier = 1; + let hasCanonicalizer = 1; } def AMDGPU_TransposeLoadOp : diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp index fd9a153ada2b8..209d52ec7a1c8 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp @@ -1065,6 +1065,31 @@ LogicalResult GlobalLoadAsyncToLDSOp::verify() { return success(); } +static LogicalResult +foldGlobalLoadAsyncToLDSConstantMask(GlobalLoadAsyncToLDSOp op, + PatternRewriter &rewriter) { + Value mask = op.getMask(); + if (!mask) + return failure(); + + APInt maskValue; + if (!matchPattern(mask, m_ConstantInt(&maskValue))) + return failure(); + + if (maskValue.isZero()) { + rewriter.eraseOp(op); + return 
success(); + } + + rewriter.modifyOpInPlace(op, [&]() { op.getMaskMutable().clear(); }); + return success(); +} + +void GlobalLoadAsyncToLDSOp::getCanonicalizationPatterns( + RewritePatternSet &results, MLIRContext *context) { + results.add(foldGlobalLoadAsyncToLDSConstantMask); +} + //===----------------------------------------------------------------------===// // TransposeLoadOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir index f51d0d0d12ed0..f5c117101493e 100644 --- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir +++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir @@ -162,6 +162,35 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds: // ----- +// CHECK-LABEL: func @global_load_async_to_lds_true_mask +// CHECK-SAME: %[[SRC:.*]]: memref<16xf32, #gpu.address_space>, %[[DST:.*]]: memref<16xf32, #gpu.address_space> +func.func @global_load_async_to_lds_true_mask(%src: memref<16xf32, #gpu.address_space>, %dst: memref<16xf32, #gpu.address_space>) { + // CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index + // CHECK-NEXT: amdgpu.global_load_async_to_lds %[[SRC]][%[[C0]]], %[[DST]][%[[C0]]] : f32, memref<16xf32, #gpu.address_space>, memref<16xf32, #gpu.address_space> + // CHECK-NEXT: return + %c0 = arith.constant 0 : index + %true = arith.constant true + amdgpu.global_load_async_to_lds %src[%c0], %dst[%c0], %true + : f32, memref<16xf32, #gpu.address_space>, + memref<16xf32, #gpu.address_space> + func.return +} + +// ----- + +// CHECK-LABEL: func @global_load_async_to_lds_false_mask +func.func @global_load_async_to_lds_false_mask(%src: memref<16xf32, #gpu.address_space>, %dst: memref<16xf32, #gpu.address_space>) { + // CHECK-NEXT: return + %c0 = arith.constant 0 : index + %false = arith.constant false + amdgpu.global_load_async_to_lds %src[%c0], %dst[%c0], %false + : f32, memref<16xf32, #gpu.address_space>, + 
memref<16xf32, #gpu.address_space> + func.return +} + +// ----- + // CHECK-LABEL: func @scaled_mfma // CHECK: %[[SCALE_1:.*]] = vector.extract_strided_slice %0 {offsets = [0], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> // CHECK: %[[SCALE_2:.*]] = vector.extract_strided_slice %2 {offsets = [4], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU> From c924ecb1fea9cc205a6e35ee3af55acfd6612ea3 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 12 May 2026 17:00:31 -0400 Subject: [PATCH 514/538] [CodeGen] Use byte offsets and ptradd in ShadowStackGCLowering (#178436) Replace typed struct GEPs with byte array allocation and ptradd operations: 1. Track root offsets as byte offsets instead of building typed struct. 2. Use `ComputeFrameLayout` to compute byte offsets based on DataLayout, properly accounting for each root's size and alignment. 3. Allocate frame as `[FrameSize x i8]` byte array instead of typed struct. 4. Replace all CreateGEP operations with CreatePtrAdd using computed offsets. 5. Frame layout unchanged: `[Next ptr | Map ptr | Root 0 | Root 1 | ... | Root N]` where each root is placed at its computed aligned offset. 6. Zero out padding between roots with memset for deterministic frame contents for GC. Benefits: - Removes dependency on `getAllocatedType` for building frame struct - Properly handles root alignment requirements (implements a TODO in code) Apparently this is an upside (or downside) of leaving a TODO behind in the code, since AI goes and implements it when the refactoring happened to make it much more trivial to do. 
Co-authored-by: Claude Sonnet 4.5 --- llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 187 +++++++++++---------- 1 file changed, 101 insertions(+), 86 deletions(-) diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 000d6d842c6be..deb5d0e9c1555 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -38,7 +39,9 @@ #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" #include #include @@ -56,14 +59,16 @@ class ShadowStackGCLoweringImpl { /// roots. GlobalVariable *Head = nullptr; - /// StackEntryTy - Abstract type of a link in the shadow stack. - StructType *StackEntryTy = nullptr; StructType *FrameMapTy = nullptr; /// Roots - GC roots in the current function. Each is a pair of the /// intrinsic call and its corresponding alloca. std::vector> Roots; + /// RootOffsets - Byte offsets and sizes of each root within the frame. + /// Each element is a pair of (offset, size). 
+ std::vector> RootOffsets; + public: ShadowStackGCLoweringImpl() = default; @@ -72,16 +77,9 @@ class ShadowStackGCLoweringImpl { private: bool IsNullValue(Value *V); - Constant *GetFrameMap(Function &F); - Type *GetConcreteStackEntryType(Function &F); + Constant *GetFrameMap(Function &F, uint64_t FrameSizeInPtrs); + std::pair ComputeFrameLayout(Function &F); void CollectRoots(Function &F); - - static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B, - Type *Ty, Value *BasePtr, int Idx1, - const char *Name); - static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B, - Type *Ty, Value *BasePtr, int Idx1, int Idx2, - const char *Name); }; class ShadowStackGCLowering : public FunctionPass { @@ -143,7 +141,8 @@ FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGC ShadowStackGCLowering::ShadowStackGCLowering() : FunctionPass(ID) {} -Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F) { +Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F, + uint64_t FrameSizeInPtrs) { // doInitialization creates the abstract type of this value. Type *VoidPtr = PointerType::getUnqual(F.getContext()); @@ -161,7 +160,7 @@ Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F) { Type *Int32Ty = Type::getInt32Ty(F.getContext()); Constant *BaseElts[] = { - ConstantInt::get(Int32Ty, Roots.size(), false), + ConstantInt::get(Int32Ty, FrameSizeInPtrs, false), ConstantInt::get(Int32Ty, NumMeta, false), }; @@ -192,14 +191,44 @@ Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F) { "__gc_" + F.getName()); } -Type *ShadowStackGCLoweringImpl::GetConcreteStackEntryType(Function &F) { - // doInitialization creates the generic version of this type. 
- std::vector EltTys; - EltTys.push_back(StackEntryTy); - for (const std::pair &Root : Roots) - EltTys.push_back(Root.second->getAllocatedType()); +std::pair +ShadowStackGCLoweringImpl::ComputeFrameLayout(Function &F) { + // Compute the layout of the shadow stack frame using byte offsets. + // Layout: [Next ptr | Map ptr | Root 0 | Root 1 | ... | Root N] + + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t PtrSize = DL.getPointerSize(0); + Align PtrAlign = DL.getPointerABIAlignment(0); + + RootOffsets.clear(); + Align MaxAlign = PtrAlign; + + // Offset 0: Next pointer + // Offset PtrSize: Map pointer + uint64_t Offset = 2 * PtrSize; + + // Compute offsets and sizes for each root + for (const std::pair &Root : Roots) { + AllocaInst *AI = Root.second; + std::optional RootSize = AI->getAllocationSize(DL); + if (!RootSize || !RootSize->isFixed()) + reportFatalUsageError( + "Intrinsic::gcroot requires a fixed size stack object"); + uint64_t Size = RootSize->getFixedValue(); + Align RootAlign = AI->getAlign(); + MaxAlign = std::max(MaxAlign, RootAlign); + + // Align the offset for this root + uint64_t AlignedOffset = alignTo(Offset, RootAlign); + + // Store both offset and size as a pair + RootOffsets.push_back({AlignedOffset, Size}); + Offset = AlignedOffset + Size; + } - return StructType::create(EltTys, ("gc_stackentry." + F.getName()).str()); + // Final frame size, aligned to maximum alignment + uint64_t FrameSize = alignTo(Offset, MaxAlign); + return {FrameSize, MaxAlign}; } /// doInitialization - If this module uses the GC intrinsics, find them now. If @@ -226,21 +255,11 @@ bool ShadowStackGCLoweringImpl::doInitialization(Module &M) { // Specifies length of variable length array. EltTys.push_back(Type::getInt32Ty(M.getContext())); FrameMapTy = StructType::create(EltTys, "gc_map"); - PointerType *FrameMapPtrTy = PointerType::getUnqual(M.getContext()); - - // struct StackEntry { - // ShadowStackEntry *Next; // Caller's stack entry. 
- // FrameMap *Map; // Pointer to constant FrameMap. - // void *Roots[]; // Stack roots (in-place array, so we pretend). - // }; + // The shadow stack linked list uses opaque pointers. + // Each frame is a byte array with: [Next ptr | Map ptr | Roots...] PointerType *StackEntryPtrTy = PointerType::getUnqual(M.getContext()); - EltTys.clear(); - EltTys.push_back(StackEntryPtrTy); - EltTys.push_back(FrameMapPtrTy); - StackEntryTy = StructType::create(EltTys, "gc_stackentry"); - // Get the root chain if it already exists. Head = M.getGlobalVariable("llvm_gc_root_chain"); if (!Head) { @@ -264,10 +283,6 @@ bool ShadowStackGCLoweringImpl::IsNullValue(Value *V) { } void ShadowStackGCLoweringImpl::CollectRoots(Function &F) { - // FIXME: Account for original alignment. Could fragment the root array. - // Approach 1: Null initialize empty slots at runtime. Yuck. - // Approach 2: Emit a map of the array instead of just a count. - assert(Roots.empty() && "Not cleaned up?"); SmallVector, 16> MetaRoots; @@ -291,34 +306,6 @@ void ShadowStackGCLoweringImpl::CollectRoots(Function &F) { Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end()); } -GetElementPtrInst * -ShadowStackGCLoweringImpl::CreateGEP(LLVMContext &Context, IRBuilder<> &B, - Type *Ty, Value *BasePtr, int Idx, - int Idx2, const char *Name) { - Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0), - ConstantInt::get(Type::getInt32Ty(Context), Idx), - ConstantInt::get(Type::getInt32Ty(Context), Idx2)}; - Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name); - - assert(isa(Val) && "Unexpected folded constant"); - - return dyn_cast(Val); -} - -GetElementPtrInst *ShadowStackGCLoweringImpl::CreateGEP(LLVMContext &Context, - IRBuilder<> &B, - Type *Ty, - Value *BasePtr, int Idx, - const char *Name) { - Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0), - ConstantInt::get(Type::getInt32Ty(Context), Idx)}; - Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name); - - assert(isa(Val) && 
"Unexpected folded constant"); - - return dyn_cast(Val); -} - /// runOnFunction - Insert code to maintain the shadow stack. bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, DomTreeUpdater *DTU) { @@ -327,6 +314,7 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, return false; LLVMContext &Context = F.getContext(); + const DataLayout &DL = F.getParent()->getDataLayout(); // Find calls to llvm.gcroot. CollectRoots(F); @@ -336,16 +324,20 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, if (Roots.empty()) return false; - // Build the constant map and figure the type of the shadow stack entry. - Value *FrameMap = GetFrameMap(F); - Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F); + // Compute frame layout using byte offsets first. + auto [FrameSize, FrameAlign] = ComputeFrameLayout(F); + + // Build the constant map with frame size in pointer-sized units. + uint64_t PtrSize = DL.getPointerSize(); + Value *FrameMap = GetFrameMap(F, FrameSize / PtrSize); // Build the shadow stack entry at the very start of the function. BasicBlock::iterator IP = F.getEntryBlock().begin(); IRBuilder<> AtEntry(IP->getParent(), IP); - - Instruction *StackEntry = - AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, "gc_frame"); + Type *Int8Ty = Type::getInt8Ty(Context); + AllocaInst *StackEntry = AtEntry.CreateAlloca( + ArrayType::get(Int8Ty, FrameSize), nullptr, "gc_frame"); + StackEntry->setAlignment(FrameAlign); AtEntry.SetInsertPointPastAllocas(&F); IP = AtEntry.GetInsertPoint(); @@ -353,20 +345,45 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, // Initialize the map pointer and load the current head of the shadow stack. 
Instruction *CurrentHead = AtEntry.CreateLoad(AtEntry.getPtrTy(), Head, "gc_currhead"); - Instruction *EntryMapPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, - StackEntry, 0, 1, "gc_frame.map"); + + // Map pointer is at offset PtrSize (after the Next pointer) + Value *EntryMapPtr = AtEntry.CreatePtrAdd( + StackEntry, AtEntry.getInt64(PtrSize), "gc_frame.map"); AtEntry.CreateStore(FrameMap, EntryMapPtr); - // After all the allocas... + // Zero out any padding between roots to ensure deterministic frame contents. + // This includes the region after the map pointer up to the first root. + uint64_t LastEnd = 2 * PtrSize; // End of Map pointer field + assert(RootOffsets.size() == Roots.size()); for (unsigned I = 0, E = Roots.size(); I != E; ++I) { - // For each root, find the corresponding slot in the aggregate... - Value *SlotPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, - StackEntry, 1 + I, "gc_root"); + auto [RootOffset, RootSize] = RootOffsets[I]; + + // Zero any padding before this root + if (RootOffset > LastEnd) { + Value *PaddingPtr = + AtEntry.CreatePtrAdd(StackEntry, AtEntry.getInt64(LastEnd)); + AtEntry.CreateMemSet(PaddingPtr, AtEntry.getInt8(0), RootOffset - LastEnd, + Align(1)); + } + + // For each root, compute pointer using precomputed offset + Value *SlotPtr = AtEntry.CreatePtrAdd( + StackEntry, AtEntry.getInt64(RootOffset), "gc_root"); // And use it in lieu of the alloca. AllocaInst *OriginalAlloca = Roots[I].second; SlotPtr->takeName(OriginalAlloca); OriginalAlloca->replaceAllUsesWith(SlotPtr); + + LastEnd = RootOffset + RootSize; + } + + // Zero any padding at the end of the frame + if (FrameSize > LastEnd) { + Value *PaddingPtr = + AtEntry.CreatePtrAdd(StackEntry, AtEntry.getInt64(LastEnd)); + AtEntry.CreateMemSet(PaddingPtr, AtEntry.getInt8(0), FrameSize - LastEnd, + Align(1)); } // Move past the original stores inserted by GCStrategy::InitRoots. 
This isn't @@ -378,23 +395,20 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, AtEntry.SetInsertPoint(IP->getParent(), IP); // Push the entry onto the shadow stack. - Instruction *EntryNextPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, - StackEntry, 0, 0, "gc_frame.next"); - Instruction *NewHeadVal = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, - StackEntry, 0, "gc_newhead"); - AtEntry.CreateStore(CurrentHead, EntryNextPtr); - AtEntry.CreateStore(NewHeadVal, Head); + // Next pointer is at offset 0, so it's just the frame pointer + AtEntry.CreateStore(CurrentHead, StackEntry); + // The new head value is also the frame pointer (the linked list links to + // frame base) + AtEntry.CreateStore(StackEntry, Head); // For each instruction that escapes... EscapeEnumerator EE(F, "gc_cleanup", /*HandleExceptions=*/true, DTU); while (IRBuilder<> *AtExit = EE.Next()) { // Pop the entry from the shadow stack. Don't reuse CurrentHead from // AtEntry, since that would make the value live for the entire function. - Instruction *EntryNextPtr2 = - CreateGEP(Context, *AtExit, ConcreteStackEntryTy, StackEntry, 0, 0, - "gc_frame.next"); + // Next pointer is at offset 0, so load from the frame base Value *SavedHead = - AtExit->CreateLoad(AtExit->getPtrTy(), EntryNextPtr2, "gc_savedhead"); + AtExit->CreateLoad(AtExit->getPtrTy(), StackEntry, "gc_savedhead"); AtExit->CreateStore(SavedHead, Head); } @@ -407,5 +421,6 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, } Roots.clear(); + RootOffsets.clear(); return true; } From 5e4a21a1b9464ac483daf47deb2d6e4edb8228fe Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 12 May 2026 14:05:07 -0700 Subject: [PATCH 515/538] [RISCV][P-ext] Add initial 64-bit support for RV32. (#197093) Most operations are set to expand. A few operations that were easy to support using isel patterns have been added. 
concat_vectors and extract_subvector are supported in order to allow type legalization to split 64-bit vectors into 32-bit vectors around the supported operations. Loads and stores are custom split into two i32 scalars or two v4i8/v2i16 vectors. I've added new opcodes to build and split vectors into 2 GPRs at function arguments and returns. These are similar to BuildPairF64 and SplitF64 nodes we use for RV32D soft float. Long term we might want to use concat_vectors/build_vector and extract_subvector/extract_vectorelt. --- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 18 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 72 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 256 +- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 84 + llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 12 +- llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 18 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 1 + .../RISCV/calling-conv-p-ext-vector.ll | 86 +- .../RISCV/rvp-narrowing-shift-trunc.ll | 24 +- llvm/test/CodeGen/RISCV/rvp-simd-64.ll | 3973 +++++++++++++---- .../CodeGen/RISCV/rvp-unaligned-load-store.ll | 144 +- 11 files changed, 3564 insertions(+), 1124 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index 872a879dcf9c4..9853644080161 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -405,6 +405,10 @@ static bool CC_RISCV_Impl(unsigned ValNo, MVT ValVT, MVT LocVT, ValNo > 1) return true; + // Double wide packed types require 2 GPRs so we can only return 1 of them. + if (Subtarget.isPExtPackedDoubleType(LocVT) && IsRet && ValNo > 0) + return true; + // AllowFPRForF16_F32 if targeting an FLEN>=32 ABI and the argument isn't // variadic. 
bool AllowFPRForF16_F32 = false; @@ -524,9 +528,12 @@ static bool CC_RISCV_Impl(unsigned ValNo, MVT ValVT, MVT LocVT, "PendingLocs and PendingArgFlags out of sync"); // Handle passing f64 on RV32D with a soft float ABI or when floating point - // registers are exhausted. - if (XLen == 32 && LocVT == MVT::f64) { - assert(PendingLocs.empty() && "Can't lower f64 if it is split"); + // registers are exhausted. Or 64-bit P extension vectors on RV32. + if (XLen == 32 && + (LocVT == MVT::f64 || (Subtarget.isPExtPackedDoubleType(LocVT) && + !ArgFlags.isSplit() && PendingLocs.empty()))) { + assert(PendingLocs.empty() && + "Can't lower f64 or P extension vector if it is split"); // Depending on available argument GPRS, f64 may be passed in a pair of // GPRs, split between a GPR and the stack, or passed completely on the // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these @@ -554,7 +561,8 @@ static bool CC_RISCV_Impl(unsigned ValNo, MVT ValVT, MVT LocVT, // If the split argument only had two elements, it should be passed directly // in registers or on the stack. - if ((LocVT.isScalarInteger() || Subtarget.isPExtPackedType(LocVT)) && + if ((LocVT.isScalarInteger() || + (Subtarget.isPExtPackedType(LocVT) && LocVT.getSizeInBits() == XLen)) && ArgFlags.isSplitEnd() && PendingLocs.size() <= 1) { assert(PendingLocs.size() == 1 && "Unexpected PendingLocs.size()"); // Apply the normal calling convention rules to the first half of the @@ -630,7 +638,7 @@ static bool CC_RISCV_Impl(unsigned ValNo, MVT ValVT, MVT LocVT, // end of a split argument that must be passed indirectly. 
if (!PendingLocs.empty()) { assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); - assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); + assert(PendingLocs.size() > 1 && "Unexpected PendingLocs.size()"); for (auto &It : PendingLocs) { if (Reg) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 359ba913589a6..777a833ddd3ba 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1237,11 +1237,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } case RISCVISD::BuildGPRPair: - case RISCVISD::BuildPairF64: { + case RISCVISD::BuildPairF64: + case RISCVISD::BuildPairGPRVec: { if (Opcode == RISCVISD::BuildPairF64 && !Subtarget->hasStdExtZdinx()) break; - assert((!Subtarget->is64Bit() || Opcode == RISCVISD::BuildGPRPair) && + assert((!Subtarget->is64Bit() || Opcode != RISCVISD::BuildPairF64) && "BuildPairF64 only handled here on rv32i_zdinx"); SDValue N = @@ -1250,9 +1251,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } case RISCVISD::SplitGPRPair: - case RISCVISD::SplitF64: { + case RISCVISD::SplitF64: + case RISCVISD::SplitGPRVec: { if (Subtarget->hasStdExtZdinx() || Opcode != RISCVISD::SplitF64) { - assert((!Subtarget->is64Bit() || Opcode == RISCVISD::SplitGPRPair) && + assert((!Subtarget->is64Bit() || Opcode != RISCVISD::SplitF64) && "SplitF64 only handled here on rv32i_zdinx"); if (!SDValue(Node, 0).use_empty()) { @@ -1272,9 +1274,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } - assert(Opcode != RISCVISD::SplitGPRPair && - "SplitGPRPair should already be handled"); - if (!Subtarget->hasStdExtZfa()) break; assert(Subtarget->hasStdExtD() && !Subtarget->is64Bit() && @@ -3009,6 +3008,49 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; } + case ISD::BUILD_VECTOR: { + if (Subtarget->hasStdExtP() && !Subtarget->is64Bit() && VT == MVT::v2i32) { + SDValue Pair = buildGPRPair(CurDAG, DL, VT, 
Node->getOperand(0), + Node->getOperand(1)); + ReplaceNode(Node, Pair.getNode()); + return; + } + break; + } + case ISD::CONCAT_VECTORS: { + if (Subtarget->hasStdExtP() && !Subtarget->is64Bit() && + (VT == MVT::v4i16 || VT == MVT::v8i8)) { + assert(Node->getNumOperands() == 2); + SDValue Lo = Node->getOperand(0); + SDValue Hi = Node->getOperand(1); + SDValue Pair = buildGPRPair(CurDAG, DL, VT, Lo, Hi); + ReplaceNode(Node, Pair.getNode()); + return; + } + break; + } + case ISD::EXTRACT_VECTOR_ELT: { + if (Subtarget->hasStdExtP() && !Subtarget->is64Bit()) { + MVT SrcVT = Node->getOperand(0).getSimpleValueType(); + if (VT == MVT::i32 && SrcVT == MVT::v2i32) { + auto *IdxC = dyn_cast(Node->getOperand(1)); + if (!IdxC) + break; + unsigned Idx = IdxC->getZExtValue(); + if (Idx > 1) + break; + + unsigned SubRegIdx = + Idx == 0 ? RISCV::sub_gpr_even : RISCV::sub_gpr_odd; + SDValue Src = Node->getOperand(0); + SDValue Extract = + CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Src); + ReplaceNode(Node, Extract.getNode()); + return; + } + } + break; + } case ISD::SCALAR_TO_VECTOR: if (Subtarget->hasStdExtP()) { MVT SrcVT = Node->getOperand(0).getSimpleValueType(); @@ -3092,6 +3134,22 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SDValue V = Node->getOperand(0); auto Idx = Node->getConstantOperandVal(1); MVT InVT = V.getSimpleValueType(); + + // Handle P-extension extract_subvector for v2i16 from v4i16 and v4i8 from + // v8i8 + if (Subtarget->hasStdExtP() && !Subtarget->is64Bit() && + ((InVT == MVT::v4i16 && VT == MVT::v2i16) || + (InVT == MVT::v8i8 && VT == MVT::v4i8))) { + unsigned NumElts = VT.getVectorNumElements(); + if (Idx != 0 && Idx != NumElts) + break; + + unsigned SubRegIdx = Idx == 0 ? 
RISCV::sub_gpr_even : RISCV::sub_gpr_odd; + SDValue Extract = CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, V); + ReplaceNode(Node, Extract.getNode()); + return; + } + SDLoc DL(V); const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bdee637794756..ac1ef084d1d24 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -296,6 +296,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } else { addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); + + addRegisterClass(MVT::v2i32, &RISCV::GPRPairRegClass); + addRegisterClass(MVT::v4i16, &RISCV::GPRPairRegClass); + addRegisterClass(MVT::v8i8, &RISCV::GPRPairRegClass); } } @@ -540,11 +544,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FCANONICALIZE}; if (Subtarget.hasStdExtP()) { - static const MVT RV32VTs[] = {MVT::v2i16, MVT::v4i8}; - static const MVT RV64VTs[] = {MVT::v2i32, MVT::v4i16, MVT::v8i8}; + static const MVT P32VecVTs[] = {MVT::v2i16, MVT::v4i8}; + static const MVT P64VecVTs[] = {MVT::v2i32, MVT::v4i16, MVT::v8i8}; ArrayRef VTs; if (Subtarget.is64Bit()) { - VTs = RV64VTs; + VTs = P64VecVTs; // There's no instruction for vector shamt in P extension so we unroll to // scalar instructions. Vector VTs that are 32-bit are widened to 64-bit // vector, e.g. v2i16 -> v4i16, before getting unrolled, so we need custom @@ -552,7 +556,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::SHL, ISD::SRL, ISD::SRA}, {MVT::v2i16, MVT::v4i8}, Custom); } else { - VTs = RV32VTs; + VTs = P32VecVTs; } // By default everything must be expanded. 
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) @@ -603,6 +607,34 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // P extension vector comparisons produce all 1s for true, all 0s for false setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + if (!Subtarget.is64Bit()) { + // By default everything must be expanded. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, P64VecVTs, Expand); + + for (MVT VT : P64VecVTs) { + for (MVT OtherVT : MVT::integer_fixedlen_vector_valuetypes()) { + setTruncStoreAction(VT, OtherVT, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, VT, + OtherVT, Expand); + } + } + + setOperationAction({ISD::LOAD, ISD::STORE}, P64VecVTs, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, P64VecVTs, Legal); + setOperationAction( + {ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, ISD::SSUBSAT}, P64VecVTs, + Legal); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, P64VecVTs, Legal); + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, + P64VecVTs, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, {MVT::v4i16, MVT::v8i8}, Legal); + setOperationAction(ISD::EXTRACT_SUBVECTOR, {MVT::v2i16, MVT::v4i8}, + Legal); + } } if (Subtarget.hasStdExtZfbfmin()) { @@ -2685,6 +2717,9 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, // TODO: This is very conservative. bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { + if (!Subtarget.hasVInstructions()) + return false; + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; @@ -3047,6 +3082,10 @@ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( // Permit combining of mask vectors as BUILD_VECTOR never expands to scalar // stores for those types. 
bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const { + if (Subtarget.hasStdExtP() && !Subtarget.is64Bit() && + (VT == MVT::i32 || VT == MVT::v2i16 || VT == MVT::v4i8)) + return false; + return !Subtarget.useRVVForFixedLengthVectors() || (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1); } @@ -8501,6 +8540,50 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getMergeValues({Pair, Chain}, DL); } + if (Subtarget.hasStdExtP() && !Subtarget.is64Bit() && + (VT == MVT::v2i32 || VT == MVT::v4i16 || VT == MVT::v8i8)) { + assert(!Subtarget.is64Bit() && "Unexpected custom legalisation"); + + // Determine the half-size type + MVT HalfVT; + if (VT == MVT::v2i32) + HalfVT = MVT::i32; + else if (VT == MVT::v4i16) + HalfVT = MVT::v2i16; + else // VT == MVT::v8i8 + HalfVT = MVT::v4i8; + + SDLoc DL(Op); + SDValue BasePtr = Load->getBasePtr(); + SDValue Chain = Load->getChain(); + + // Create two loads for the lower and upper halves + SDValue Lo = + DAG.getLoad(HalfVT, DL, Chain, BasePtr, Load->getPointerInfo(), + Load->getBaseAlign(), Load->getMemOperand()->getFlags()); + unsigned HalfSize = HalfVT.getStoreSize(); + BasePtr = + DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(HalfSize)); + SDValue Hi = + DAG.getLoad(HalfVT, DL, Chain, BasePtr, + Load->getPointerInfo().getWithOffset(HalfSize), + Load->getBaseAlign(), Load->getMemOperand()->getFlags()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Combine the two halves into the result vector + SDValue Result; + if (VT == MVT::v2i32) { + // For v2i32, build vector from two i32 scalars + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Lo, Hi); + } else { + // For v4i16 and v8i8, use CONCAT_VECTORS + Result = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + } + + return DAG.getMergeValues({Result, Chain}, DL); + } + if (VT == MVT::bf16) return lowerXAndesBfHCvtBFloat16Load(Op, DAG); @@ -8596,6 +8679,58 @@ SDValue 
RISCVTargetLowering::LowerOperation(SDValue Op, Store->getMemOperand()); } + if (Subtarget.hasStdExtP() && !Subtarget.is64Bit() && + (VT == MVT::v2i32 || VT == MVT::v4i16 || VT == MVT::v8i8)) { + assert(!Subtarget.is64Bit() && "Unexpected custom legalisation"); + + auto *Store = cast(Op); + SDValue Val = Store->getValue(); + + // Determine the half-size type + MVT HalfVT; + if (VT == MVT::v2i32) + HalfVT = MVT::i32; + else if (VT == MVT::v4i16) + HalfVT = MVT::v2i16; + else // VT == MVT::v8i8 + HalfVT = MVT::v4i8; + + SDLoc DL(Op); + SDValue BasePtr = Store->getBasePtr(); + SDValue Chain = Store->getChain(); + + // Extract the two halves from the vector + SDValue Lo, Hi; + if (VT == MVT::v2i32) { + // For v2i32, extract two i32 scalars + Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Val, + DAG.getVectorIdxConstant(0, DL)); + Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Val, + DAG.getVectorIdxConstant(1, DL)); + } else { + // For v4i16 and v8i8, extract two vector halves + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Val, + DAG.getVectorIdxConstant(0, DL)); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Val, + DAG.getVectorIdxConstant(HalfNumElts, DL)); + } + + // Create two stores for the lower and upper halves + SDValue LoStore = DAG.getStore( + Chain, DL, Lo, BasePtr, Store->getPointerInfo(), + Store->getBaseAlign(), Store->getMemOperand()->getFlags()); + unsigned HalfSize = HalfVT.getStoreSize(); + BasePtr = + DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(HalfSize)); + SDValue HiStore = DAG.getStore( + Chain, DL, Hi, BasePtr, + Store->getPointerInfo().getWithOffset(HalfSize), + Store->getBaseAlign(), Store->getMemOperand()->getFlags()); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoStore, HiStore); + } + if (VT == MVT::bf16) return lowerXAndesBfHCvtBFloat16Store(Op, DAG); @@ -16404,10 +16539,6 @@ static SDValue combinePExtTruncate(SDNode *N, 
SelectionDAG &DAG, VT != MVT::v4i8 && VT != MVT::v2i32) return SDValue(); - // We only support XLen or smaller vectors. - if (VT.getSizeInBits() > Subtarget.getXLen()) - return SDValue(); - // Check if shift amount is a splat constant SDValue ShAmt = N0.getOperand(1); if (ShAmt.getOpcode() != ISD::BUILD_VECTOR) @@ -16484,6 +16615,8 @@ static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG, // MULH*/MULHR*: shift amount must be element size, only for i16/i32 if (ShAmtVal != EltBits || (EltBits != 16 && EltBits != 32)) return SDValue(); + if (!Subtarget.is64Bit() && (VT == MVT::v2i32 || VT == MVT::v4i16)) + return SDValue(); if (IsRounding) { if (LHSIsSExt && RHSIsSExt) { Opc = RISCVISD::MULHR; @@ -23978,6 +24111,36 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } +static SDValue unpackGPRVecOnRV32(SelectionDAG &DAG, SDValue Chain, + const CCValAssign &VA, + const CCValAssign &HiVA, const SDLoc &DL) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + assert(VA.isRegLoc() && "Expected register VA assignment"); + + Register LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + RegInfo.addLiveIn(VA.getLocReg(), LoVReg); + SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); + SDValue Hi; + if (HiVA.isMemLoc()) { + // Second half of f64 is passed on the stack. + int FI = MFI.CreateFixedObject(4, HiVA.getLocMemOffset(), + /*IsImmutable=*/true); + SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(MF, FI)); + } else { + // Second half of f64 is passed in another GPR. 
+ Register HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + RegInfo.addLiveIn(HiVA.getLocReg(), HiVReg); + Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); + } + + return DAG.getNode(RISCVISD::BuildPairGPRVec, DL, VA.getValVT(), Lo, Hi); +} + // Transform physical registers into virtual registers. SDValue RISCVTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -24076,6 +24239,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments( if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { assert(VA.needsCustom()); ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, ArgLocs[++i], DL); + } else if (VA.getLocVT() == MVT::i32 && + Subtarget.isPExtPackedDoubleType(VA.getValVT()) && + VA.getLocInfo() != CCValAssign::Indirect) { + assert(VA.needsCustom()); + ArgValue = unpackGPRVecOnRV32(DAG, Chain, VA, ArgLocs[++i], DL); } else if (VA.isRegLoc()) ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, Ins[InsIdx], *this); else @@ -24369,6 +24537,43 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, continue; } + // Handle passing 64-bit vector on RV32 as a special case. + if (VA.getLocVT() == MVT::i32 && + Subtarget.isPExtPackedDoubleType(VA.getValVT()) && + VA.getLocInfo() != CCValAssign::Indirect) { + assert(VA.isRegLoc() && "Expected register VA assignment"); + assert(VA.needsCustom()); + SDValue SplitGPRVec = + DAG.getNode(RISCVISD::SplitGPRVec, DL, + DAG.getVTList(MVT::i32, MVT::i32), ArgValue); + SDValue Lo = SplitGPRVec.getValue(0); + SDValue Hi = SplitGPRVec.getValue(1); + + Register RegLo = VA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegLo, Lo)); + + // Get the CCValAssign for the Hi part. + CCValAssign &HiVA = ArgLocs[++i]; + + if (HiVA.isMemLoc()) { + // Second half of vector is passed on the stack. 
+ if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT); + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, + DAG.getIntPtrConstant(HiVA.getLocMemOffset(), DL)); + // Emit the store. + MemOpChains.push_back(DAG.getStore( + Chain, DL, Hi, Address, + MachinePointerInfo::getStack(MF, HiVA.getLocMemOffset()))); + } else { + // Second half of vector is passed in another GPR. + Register RegHigh = HiVA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegHigh, Hi)); + } + continue; + } + // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { @@ -24579,6 +24784,16 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, std::swap(Lo, Hi); RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); + } else if (VA.getLocVT() == MVT::i32 && + Subtarget.isPExtPackedDoubleType(VA.getValVT())) { + assert(VA.needsCustom()); + SDValue RetValue2 = DAG.getCopyFromReg(Chain, DL, RVLocs[++i].getLocReg(), + MVT::i32, Glue); + Chain = RetValue2.getValue(1); + Glue = RetValue2.getValue(2); + + RetValue = DAG.getNode(RISCVISD::BuildPairGPRVec, DL, VA.getValVT(), + RetValue, RetValue2); } else RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); @@ -24642,6 +24857,31 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Register RegLo = VA.getLocReg(); Register RegHi = RVLocs[++i].getLocReg(); + if (Subtarget.isRegisterReservedByUser(RegLo) || + Subtarget.isRegisterReservedByUser(RegHi)) + MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{ + MF.getFunction(), + "Return value register required, but has been reserved."}); + + Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(RegLo, MVT::i32)); + Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue); + Glue = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(RegHi, 
MVT::i32)); + } else if (VA.getLocVT() == MVT::i32 && + Subtarget.isPExtPackedDoubleType(VA.getValVT())) { + // Handle returning 64-bit vector on RV32. + assert(VA.isRegLoc() && "Expected return via registers"); + assert(VA.needsCustom()); + SDValue SplitGPRVec = DAG.getNode(RISCVISD::SplitGPRVec, DL, + DAG.getVTList(MVT::i32, MVT::i32), Val); + SDValue Lo = SplitGPRVec.getValue(0); + SDValue Hi = SplitGPRVec.getValue(1); + + Register RegLo = VA.getLocReg(); + Register RegHi = RVLocs[++i].getLocReg(); + if (Subtarget.isRegisterReservedByUser(RegLo) || Subtarget.isRegisterReservedByUser(RegHi)) MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{ diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index eb943e8e9569d..3f5508f5df4e8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1840,6 +1840,11 @@ let Predicates = [HasStdExtP, IsRV32] in { // Codegen patterns //===----------------------------------------------------------------------===// +// Pattern class for GPRPair operations +class PatGprPairGprPair + : Pat<(vt (OpNode (vt GPRPair:$rs1), (vt GPRPair:$rs2))), + (Inst GPRPair:$rs1, GPRPair:$rs2)>; + def riscv_absw : RVSDNode<"ABSW", SDT_RISCVIntUnaryOpW>; def riscv_clsw : RVSDNode<"CLSW", SDT_RISCVIntUnaryOpW>; @@ -1937,6 +1942,23 @@ def IncImm : SDNodeXFormgetValueType(0)); }]>; +def SDT_RISCVBuildPairGPRVec : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; +def SDT_RISCVSplitGPRVec : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, + SDTCisSameAs<0, 1>, + SDTCisVec<2>]>; + +// These nodes are used to handle 64-bit vector arguments and returns on RV32. +// One side is a pair of i32 GPRs, the other is a 64-bit vector. +// TOD: We might be able to use 32-bit vectors with concat_vectors/build_vector +// and extract_subvector/extract_vectorelt instead. 
Using custom nodes prevents +// 64-bit vector aithmetic operations in our lit tests from being split until +// we can tune some generic combines. +def RISCVBuildPairGPRVec : RVSDNode<"BuildPairGPRVec", + SDT_RISCVBuildPairGPRVec>; +def RISCVSplitGPRVec : RVSDNode<"SplitGPRVec", SDT_RISCVSplitGPRVec>; + let Predicates = [HasStdExtP] in { def : PatGpr; @@ -2187,6 +2209,68 @@ let Predicates = [HasStdExtP, IsRV32] in { // Build vector patterns def : Pat<(v2i16 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b))), (PACK GPR:$a, GPR:$b)>; + + // Basic 8-bit arithmetic patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // Basic 16-bit arithmetic patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // Basic 32-bit arithmetic patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // 8-bit saturating add/sub patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // 16-bit saturating add/sub patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // 32-bit saturating add/sub patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // 8-bit averaging patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // 16-bit averaging patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // 32-bit averaging patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + + // [s|u]min/[s|u]max patterns + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : PatGprPairGprPair; + def : 
PatGprPairGprPair; + def : PatGprPairGprPair; } // Predicates = [HasStdExtP, IsRV32] let Predicates = [HasStdExtP, IsRV64] in { diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index d3a387d57b338..58af1f863f7bf 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -273,6 +273,12 @@ def XLenFVT : ValueTypeByHwMode<[RV64], [f64]>; def XLenPairFVT : ValueTypeByHwMode<[RV32], [f64]>; +def XLenPairVecI8VT : ValueTypeByHwMode<[RV32], + [v8i8]>; +def XLenPairVecI16VT : ValueTypeByHwMode<[RV32], + [v4i16]>; +def XLenPairVecI32VT : ValueTypeByHwMode<[RV32], + [v2i32]>; // P extension def XLenVecI8VT : ValueTypeByHwMode<[RV32, RV64], @@ -436,7 +442,7 @@ let RegAltNameIndices = [ABIRegAltName] in { } let RegInfos = XLenPairRI, CopyCost = 2 in { -def GPRPair : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (add +def GPRPair : RISCVRegisterClass<[XLenPairVT, XLenPairFVT, XLenPairVecI8VT, XLenPairVecI16VT, XLenPairVecI32VT], 64, (add X10_X11, X12_X13, X14_X15, X16_X17, X6_X7, X28_X29, X30_X31, @@ -445,9 +451,9 @@ def GPRPair : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (add X0_Pair, X2_X3, X4_X5 )>; -def GPRPairNoX0 : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (sub GPRPair, X0_Pair)>; +def GPRPairNoX0 : RISCVRegisterClass<[XLenPairVT, XLenPairFVT, XLenPairVecI8VT, XLenPairVecI16VT, XLenPairVecI32VT], 64, (sub GPRPair, X0_Pair)>; -def GPRPairC : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (add +def GPRPairC : RISCVRegisterClass<[XLenPairVT, XLenPairFVT, XLenPairVecI8VT, XLenPairVecI16VT, XLenPairVecI32VT], 64, (add X10_X11, X12_X13, X14_X15, X8_X9 )>; } // let RegInfos = XLenPairRI, CopyCost = 2 diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index ab342cb5215e3..9f68c8d354d6b 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -166,14 +166,24 @@ bool 
RISCVSubtarget::useConstantPoolForLargeInts() const { return !RISCVDisableUsingConstantPoolForLargeInts; } -// Returns true if VT is a P extension packed SIMD type that fits in XLen. +// Returns true if VT is a P extension packed SIMD type. bool RISCVSubtarget::isPExtPackedType(MVT VT) const { if (!HasStdExtP) return false; - if (is64Bit()) - return VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32; - return VT == MVT::v4i8 || VT == MVT::v2i16; + // RV32 supports 32-bit and 64-bit vectors. RV64 only support 64-bit vectors. + if (!is64Bit() && (VT == MVT::v4i8 || VT == MVT::v2i16)) + return true; + + return VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32; +} + +// Returns true if VT is a P extension packed double-wide SIMD type. +bool RISCVSubtarget::isPExtPackedDoubleType(MVT VT) const { + if (!HasStdExtP || is64Bit()) + return false; + + return VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32; } unsigned RISCVSubtarget::getMaxBuildIntsCost() const { diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 0eaa3315222e8..7947f938f3a9a 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -337,6 +337,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } bool isPExtPackedType(MVT VT) const; + bool isPExtPackedDoubleType(MVT VT) const; // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the // vector hardware implementation which may be less than VLEN. 
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-p-ext-vector.ll b/llvm/test/CodeGen/RISCV/calling-conv-p-ext-vector.ll index 9455a285924f2..a0d41077ee535 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-p-ext-vector.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-p-ext-vector.ll @@ -27,8 +27,7 @@ define <2 x i16> @test_cc_v2i16(<2 x i16> %a, <2 x i16> %b) { define <8 x i8> @test_cc_v8i8(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_cc_v8i8: ; RV32: # %bb.0: -; RV32-NEXT: padd.b a0, a0, a2 -; RV32-NEXT: padd.b a1, a1, a3 +; RV32-NEXT: padd.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_cc_v8i8: @@ -42,8 +41,7 @@ define <8 x i8> @test_cc_v8i8(<8 x i8> %a, <8 x i8> %b) { define <4 x i16> @test_cc_v4i16(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_cc_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: padd.h a0, a0, a2 -; RV32-NEXT: padd.h a1, a1, a3 +; RV32-NEXT: padd.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_cc_v4i16: @@ -57,8 +55,7 @@ define <4 x i16> @test_cc_v4i16(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_cc_v2i32(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_cc_v2i32: ; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: padd.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_cc_v2i32: @@ -77,18 +74,18 @@ define <16 x i8> @test_cc_v16i8(<16 x i8> %a, <16 x i8> %b) { ; RV32-NEXT: lw a4, 4(a2) ; RV32-NEXT: lw a5, 8(a2) ; RV32-NEXT: lw a2, 12(a2) -; RV32-NEXT: lw a6, 0(a1) -; RV32-NEXT: lw a7, 4(a1) -; RV32-NEXT: lw t0, 8(a1) -; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: padd.b a3, a6, a3 -; RV32-NEXT: padd.b a4, a7, a4 -; RV32-NEXT: padd.b a5, t0, a5 -; RV32-NEXT: padd.b a1, a1, a2 -; RV32-NEXT: sw a3, 0(a0) +; RV32-NEXT: lw a6, 12(a1) +; RV32-NEXT: lw a7, 8(a1) +; RV32-NEXT: lw t0, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: padd.b a2, a6, a2 +; RV32-NEXT: padd.b a5, a7, a5 +; RV32-NEXT: padd.b a4, t0, a4 +; RV32-NEXT: padd.b a1, a1, a3 +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: sw a4, 4(a0) ; RV32-NEXT: sw a5, 
8(a0) -; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: sw a2, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: test_cc_v16i8: @@ -107,18 +104,18 @@ define <8 x i16> @test_cc_v8i16(<8 x i16> %a, <8 x i16> %b) { ; RV32-NEXT: lw a4, 4(a2) ; RV32-NEXT: lw a5, 8(a2) ; RV32-NEXT: lw a2, 12(a2) -; RV32-NEXT: lw a6, 0(a1) -; RV32-NEXT: lw a7, 4(a1) -; RV32-NEXT: lw t0, 8(a1) -; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: padd.h a3, a6, a3 -; RV32-NEXT: padd.h a4, a7, a4 -; RV32-NEXT: padd.h a5, t0, a5 -; RV32-NEXT: padd.h a1, a1, a2 -; RV32-NEXT: sw a3, 0(a0) +; RV32-NEXT: lw a6, 12(a1) +; RV32-NEXT: lw a7, 8(a1) +; RV32-NEXT: lw t0, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: padd.h a2, a6, a2 +; RV32-NEXT: padd.h a5, a7, a5 +; RV32-NEXT: padd.h a4, t0, a4 +; RV32-NEXT: padd.h a1, a1, a3 +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: sw a4, 4(a0) ; RV32-NEXT: sw a5, 8(a0) -; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: sw a2, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: test_cc_v8i16: @@ -133,22 +130,20 @@ define <8 x i16> @test_cc_v8i16(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @test_cc_v4i32(<4 x i32> %a, <4 x i32> %b) { ; RV32-LABEL: test_cc_v4i32: ; RV32: # %bb.0: -; RV32-NEXT: lw a3, 0(a2) -; RV32-NEXT: lw a4, 4(a2) -; RV32-NEXT: lw a5, 8(a2) -; RV32-NEXT: lw a2, 12(a2) -; RV32-NEXT: lw a6, 0(a1) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: lw a3, 12(a2) +; RV32-NEXT: lw a4, 0(a2) +; RV32-NEXT: lw a2, 8(a2) ; RV32-NEXT: lw a7, 4(a1) -; RV32-NEXT: lw t0, 8(a1) -; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: add a3, a6, a3 -; RV32-NEXT: add a4, a7, a4 -; RV32-NEXT: add a5, t0, a5 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a4, 4(a0) -; RV32-NEXT: sw a5, 8(a0) -; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: lw t2, 12(a1) +; RV32-NEXT: lw a6, 0(a1) +; RV32-NEXT: lw t1, 8(a1) +; RV32-NEXT: padd.dw a4, a6, a4 +; RV32-NEXT: padd.dw a2, t1, a2 +; RV32-NEXT: sw a4, 0(a0) +; RV32-NEXT: sw a5, 4(a0) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: sw a3, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: 
test_cc_v4i32: @@ -329,9 +324,9 @@ define <2 x i16> @test_exhaust(i64 %dummy, i64 %dummy2, i64 %dummy3, i64 %dummy4 define <4 x i16> @test_exhaust_2xlen_rv32(i64 %dummy, i64 %dummy2, i64 %dummy3, i32 %dummy4, <4 x i16> %b) { ; RV32-LABEL: test_exhaust_2xlen_rv32: ; RV32: # %bb.0: +; RV32-NEXT: mv a0, a7 ; RV32-NEXT: lw a1, 0(sp) -; RV32-NEXT: padd.h a0, a7, a7 -; RV32-NEXT: padd.h a1, a1, a1 +; RV32-NEXT: padd.dh a0, a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_exhaust_2xlen_rv32: @@ -345,10 +340,9 @@ define <4 x i16> @test_exhaust_2xlen_rv32(i64 %dummy, i64 %dummy2, i64 %dummy3, define <4 x i16> @test_exhaust_2xlen_rv32_2(i64 %dummy, i64 %dummy2, i64 %dummy3, i64 %dummy4, <4 x i16> %b) { ; RV32-LABEL: test_exhaust_2xlen_rv32_2: ; RV32: # %bb.0: -; RV32-NEXT: lw a0, 0(sp) ; RV32-NEXT: lw a1, 4(sp) -; RV32-NEXT: padd.h a0, a0, a0 -; RV32-NEXT: padd.h a1, a1, a1 +; RV32-NEXT: lw a0, 0(sp) +; RV32-NEXT: padd.dh a0, a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_exhaust_2xlen_rv32_2: diff --git a/llvm/test/CodeGen/RISCV/rvp-narrowing-shift-trunc.ll b/llvm/test/CodeGen/RISCV/rvp-narrowing-shift-trunc.ll index 4fe4ed66fa842..0f0e70324fb92 100644 --- a/llvm/test/CodeGen/RISCV/rvp-narrowing-shift-trunc.ll +++ b/llvm/test/CodeGen/RISCV/rvp-narrowing-shift-trunc.ll @@ -54,12 +54,26 @@ define i32 @trunc_ashr_v2i32_to_v2i16(i64 %a.coerce) { define i32 @trunc_lshr_v4i16_to_v4i8(i64 %a.coerce) { ; CHECK-RV32-LABEL: trunc_lshr_v4i16_to_v4i8: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: psrli.h a1, a1, 4 -; CHECK-RV32-NEXT: psrli.h a0, a0, 4 -; CHECK-RV32-NEXT: srli a3, a1, 16 -; CHECK-RV32-NEXT: srli a2, a0, 16 -; CHECK-RV32-NEXT: ppaire.db a0, a0, a2 +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: sw a0, 0(sp) +; CHECK-RV32-NEXT: sw a1, 4(sp) +; CHECK-RV32-NEXT: lw a0, 0(sp) +; CHECK-RV32-NEXT: lw a1, 4(sp) +; CHECK-RV32-NEXT: sw a0, 8(sp) +; CHECK-RV32-NEXT: sw a1, 12(sp) +; CHECK-RV32-NEXT: lhu a0, 12(sp) +; CHECK-RV32-NEXT: lhu 
a1, 8(sp) +; CHECK-RV32-NEXT: lhu a2, 14(sp) +; CHECK-RV32-NEXT: lhu a3, 10(sp) +; CHECK-RV32-NEXT: srli a5, a0, 4 +; CHECK-RV32-NEXT: srli a4, a1, 4 +; CHECK-RV32-NEXT: srli a1, a2, 4 +; CHECK-RV32-NEXT: srli a0, a3, 4 +; CHECK-RV32-NEXT: ppaire.db a0, a4, a0 ; CHECK-RV32-NEXT: pack a0, a0, a1 +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 0 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: trunc_lshr_v4i16_to_v4i8: diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll index 4d12032dd0c80..6c6ee5ebcf6a8 100644 --- a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll +++ b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll @@ -10,8 +10,7 @@ define <4 x i16> @test_padd_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_padd_h: ; RV32: # %bb.0: -; RV32-NEXT: padd.h a0, a0, a2 -; RV32-NEXT: padd.h a1, a1, a3 +; RV32-NEXT: padd.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_h: @@ -25,8 +24,7 @@ define <4 x i16> @test_padd_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_psub_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_psub_h: ; RV32: # %bb.0: -; RV32-NEXT: psub.h a0, a0, a2 -; RV32-NEXT: psub.h a1, a1, a3 +; RV32-NEXT: psub.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psub_h: @@ -41,8 +39,7 @@ define <4 x i16> @test_psub_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_padd_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_padd_b: ; RV32: # %bb.0: -; RV32-NEXT: padd.b a0, a0, a2 -; RV32-NEXT: padd.b a1, a1, a3 +; RV32-NEXT: padd.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_b: @@ -56,8 +53,7 @@ define <8 x i8> @test_padd_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_psub_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_psub_b: ; RV32: # %bb.0: -; RV32-NEXT: psub.b a0, a0, a2 -; RV32-NEXT: psub.b a1, a1, a3 +; RV32-NEXT: psub.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psub_b: @@ -72,8 +68,32 @@ define <8 x i8> @test_psub_b(<8 x i8> %a, <8 x i8> %b) { define <4 x i16> 
@test_and_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_and_h: ; RV32: # %bb.0: -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_and_h: @@ -87,8 +107,32 @@ define <4 x i16> @test_and_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_or_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_or_h: ; RV32: # %bb.0: -; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_or_h: @@ -102,8 +146,32 @@ define <4 x 
i16> @test_or_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_xor_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_xor_h: ; RV32: # %bb.0: -; RV32-NEXT: xor a0, a0, a2 -; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_xor_h: @@ -117,8 +185,32 @@ define <4 x i16> @test_xor_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_andn_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_andn_h: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: lh a2, 16(sp) +; RV32-NEXT: lh a3, 18(sp) +; RV32-NEXT: lh a4, 20(sp) +; RV32-NEXT: lh a5, 22(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: andn a0, a1, a5 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 12(sp) +; RV32-NEXT: andn a0, a0, a4 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: andn a0, a0, a3 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 8(sp) ; RV32-NEXT: andn a0, a0, a2 -; RV32-NEXT: andn a1, a1, a3 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; 
RV64-LABEL: test_andn_h: @@ -133,8 +225,32 @@ define <4 x i16> @test_andn_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_orn_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_orn_h: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: lh a2, 16(sp) +; RV32-NEXT: lh a3, 18(sp) +; RV32-NEXT: lh a4, 20(sp) +; RV32-NEXT: lh a5, 22(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: orn a0, a1, a5 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 12(sp) +; RV32-NEXT: orn a0, a0, a4 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: orn a0, a0, a3 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 8(sp) ; RV32-NEXT: orn a0, a0, a2 -; RV32-NEXT: orn a1, a1, a3 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_orn_h: @@ -149,8 +265,32 @@ define <4 x i16> @test_orn_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_xnor_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_xnor_h: ; RV32: # %bb.0: -; RV32-NEXT: xnor a0, a2, a0 -; RV32-NEXT: xnor a1, a3, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: lh a2, 12(sp) +; RV32-NEXT: lh a3, 14(sp) +; RV32-NEXT: lh a4, 22(sp) +; RV32-NEXT: lh a5, 20(sp) +; RV32-NEXT: lh a6, 18(sp) +; RV32-NEXT: lh a7, 16(sp) +; RV32-NEXT: xnor a3, a4, a3 +; RV32-NEXT: xnor a2, a5, a2 +; RV32-NEXT: xnor a1, a6, a1 +; RV32-NEXT: xnor a0, a7, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: sh a1, 26(sp) +; RV32-NEXT: sh a2, 28(sp) +; RV32-NEXT: sh a3, 30(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; 
RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_xnor_h: @@ -166,8 +306,48 @@ define <4 x i16> @test_xnor_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_and_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_and_b: ; RV32: # %bb.0: -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_and_b: @@ -181,8 +361,48 @@ define <8 x i8> @test_and_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_or_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_or_b: ; RV32: # %bb.0: -; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; 
RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_or_b: @@ -196,8 +416,48 @@ define <8 x i8> @test_or_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_xor_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_xor_b: ; RV32: # %bb.0: -; RV32-NEXT: xor a0, a0, a2 -; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 
11(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_xor_b: @@ -211,8 +471,48 @@ define <8 x i8> @test_xor_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_andn_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_andn_b: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: lbu a2, 16(sp) +; RV32-NEXT: lbu a3, 17(sp) +; RV32-NEXT: lbu a4, 18(sp) +; RV32-NEXT: lbu a5, 19(sp) +; RV32-NEXT: lbu a6, 20(sp) +; RV32-NEXT: lbu a7, 21(sp) +; RV32-NEXT: lbu t0, 22(sp) +; RV32-NEXT: lbu t1, 23(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: andn a0, a1, t1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 14(sp) +; RV32-NEXT: andn a0, a0, t0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 13(sp) +; RV32-NEXT: andn a0, a0, a7 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 12(sp) +; RV32-NEXT: andn a0, a0, a6 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 11(sp) +; RV32-NEXT: andn a0, a0, a5 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 10(sp) +; RV32-NEXT: andn a0, a0, a4 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 9(sp) +; RV32-NEXT: andn a0, a0, a3 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 8(sp) ; RV32-NEXT: andn a0, a0, a2 -; RV32-NEXT: andn a1, a1, a3 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: 
.cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_andn_b: @@ -227,8 +527,48 @@ define <8 x i8> @test_andn_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_orn_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_orn_b: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: lbu a2, 16(sp) +; RV32-NEXT: lbu a3, 17(sp) +; RV32-NEXT: lbu a4, 18(sp) +; RV32-NEXT: lbu a5, 19(sp) +; RV32-NEXT: lbu a6, 20(sp) +; RV32-NEXT: lbu a7, 21(sp) +; RV32-NEXT: lbu t0, 22(sp) +; RV32-NEXT: lbu t1, 23(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: orn a0, a1, t1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 14(sp) +; RV32-NEXT: orn a0, a0, t0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 13(sp) +; RV32-NEXT: orn a0, a0, a7 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 12(sp) +; RV32-NEXT: orn a0, a0, a6 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 11(sp) +; RV32-NEXT: orn a0, a0, a5 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 10(sp) +; RV32-NEXT: orn a0, a0, a4 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 9(sp) +; RV32-NEXT: orn a0, a0, a3 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 8(sp) ; RV32-NEXT: orn a0, a0, a2 -; RV32-NEXT: orn a1, a1, a3 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_orn_b: @@ -243,8 +583,52 @@ define <8 x i8> @test_orn_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_xnor_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_xnor_b: ; RV32: # %bb.0: -; RV32-NEXT: xnor a0, a2, a0 -; RV32-NEXT: xnor a1, a3, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw 
a1, 4(sp) +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: lbu a1, 1(sp) +; RV32-NEXT: lbu a2, 2(sp) +; RV32-NEXT: lbu a3, 3(sp) +; RV32-NEXT: lbu a4, 4(sp) +; RV32-NEXT: lbu a5, 5(sp) +; RV32-NEXT: lbu a6, 6(sp) +; RV32-NEXT: lbu a7, 7(sp) +; RV32-NEXT: lbu t0, 12(sp) +; RV32-NEXT: lbu t1, 13(sp) +; RV32-NEXT: lbu t2, 14(sp) +; RV32-NEXT: lbu t3, 15(sp) +; RV32-NEXT: lbu t4, 8(sp) +; RV32-NEXT: lbu t5, 9(sp) +; RV32-NEXT: lbu t6, 10(sp) +; RV32-NEXT: lbu s0, 11(sp) +; RV32-NEXT: xnor a7, t3, a7 +; RV32-NEXT: xnor a6, t2, a6 +; RV32-NEXT: xnor a5, t1, a5 +; RV32-NEXT: xnor a4, t0, a4 +; RV32-NEXT: xnor a3, s0, a3 +; RV32-NEXT: xnor a2, t6, a2 +; RV32-NEXT: xnor a1, t5, a1 +; RV32-NEXT: xnor a0, t4, a0 +; RV32-NEXT: sb a4, 20(sp) +; RV32-NEXT: sb a5, 21(sp) +; RV32-NEXT: sb a6, 22(sp) +; RV32-NEXT: sb a7, 23(sp) +; RV32-NEXT: sb a0, 16(sp) +; RV32-NEXT: sb a1, 17(sp) +; RV32-NEXT: sb a2, 18(sp) +; RV32-NEXT: sb a3, 19(sp) +; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_xnor_b: @@ -260,8 +644,8 @@ define <8 x i8> @test_xnor_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_and_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_and_w: ; RV32: # %bb.0: -; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_and_w: @@ -275,8 +659,8 @@ define <2 x i32> @test_and_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_or_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_or_w: ; RV32: # %bb.0: -; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_or_w: @@ -290,8 +674,8 @@ define <2 x i32> @test_or_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_xor_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_xor_w: ; RV32: # 
%bb.0: -; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_xor_w: @@ -305,8 +689,8 @@ define <2 x i32> @test_xor_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_andn_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_andn_w: ; RV32: # %bb.0: -; RV32-NEXT: andn a0, a0, a2 ; RV32-NEXT: andn a1, a1, a3 +; RV32-NEXT: andn a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_andn_w: @@ -321,8 +705,8 @@ define <2 x i32> @test_andn_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_orn_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_orn_w: ; RV32: # %bb.0: -; RV32-NEXT: orn a0, a0, a2 ; RV32-NEXT: orn a1, a1, a3 +; RV32-NEXT: orn a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_orn_w: @@ -337,8 +721,8 @@ define <2 x i32> @test_orn_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_xnor_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_xnor_w: ; RV32: # %bb.0: -; RV32-NEXT: xnor a0, a2, a0 ; RV32-NEXT: xnor a1, a3, a1 +; RV32-NEXT: xnor a0, a2, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_xnor_w: @@ -353,8 +737,26 @@ define <2 x i32> @test_xnor_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_not_h(<4 x i16> %a) { ; RV32-LABEL: test_not_h: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: not a0, a1 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) ; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_not_h: @@ -368,8 +770,38 @@ define <4 x i16> @test_not_h(<4 x i16> %a) { define <8 x i8> @test_not_b(<8 x i8> %a) { ; 
RV32-LABEL: test_not_b: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: not a0, a1 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) ; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: not a0, a0 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_not_b: @@ -383,8 +815,8 @@ define <8 x i8> @test_not_b(<8 x i8> %a) { define <2 x i32> @test_not_w(<2 x i32> %a) { ; RV32-LABEL: test_not_w: ; RV32: # %bb.0: -; RV32-NEXT: not a0, a0 ; RV32-NEXT: not a1, a1 +; RV32-NEXT: not a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_not_w: @@ -399,8 +831,7 @@ define <2 x i32> @test_not_w(<2 x i32> %a) { define <4 x i16> @test_psadd_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_psadd_h: ; RV32: # %bb.0: -; RV32-NEXT: psadd.h a0, a0, a2 -; RV32-NEXT: psadd.h a1, a1, a3 +; RV32-NEXT: psadd.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psadd_h: @@ -414,8 +845,7 @@ define <4 x i16> @test_psadd_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_psaddu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_psaddu_h: ; RV32: # %bb.0: -; RV32-NEXT: psaddu.h a0, a0, a2 -; RV32-NEXT: psaddu.h a1, a1, a3 +; RV32-NEXT: psaddu.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psaddu_h: @@ -430,8 +860,7 @@ define <4 x i16> @test_psaddu_h(<4 x i16> %a, <4 x 
i16> %b) { define <4 x i16> @test_pssub_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pssub_h: ; RV32: # %bb.0: -; RV32-NEXT: pssub.h a0, a0, a2 -; RV32-NEXT: pssub.h a1, a1, a3 +; RV32-NEXT: pssub.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssub_h: @@ -445,8 +874,7 @@ define <4 x i16> @test_pssub_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_pssubu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pssubu_h: ; RV32: # %bb.0: -; RV32-NEXT: pssubu.h a0, a0, a2 -; RV32-NEXT: pssubu.h a1, a1, a3 +; RV32-NEXT: pssubu.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssubu_h: @@ -461,8 +889,7 @@ define <4 x i16> @test_pssubu_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_psadd_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_psadd_b: ; RV32: # %bb.0: -; RV32-NEXT: psadd.b a0, a0, a2 -; RV32-NEXT: psadd.b a1, a1, a3 +; RV32-NEXT: psadd.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psadd_b: @@ -476,8 +903,7 @@ define <8 x i8> @test_psadd_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_psaddu_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_psaddu_b: ; RV32: # %bb.0: -; RV32-NEXT: psaddu.b a0, a0, a2 -; RV32-NEXT: psaddu.b a1, a1, a3 +; RV32-NEXT: psaddu.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psaddu_b: @@ -492,8 +918,7 @@ define <8 x i8> @test_psaddu_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_pssub_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pssub_b: ; RV32: # %bb.0: -; RV32-NEXT: pssub.b a0, a0, a2 -; RV32-NEXT: pssub.b a1, a1, a3 +; RV32-NEXT: pssub.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssub_b: @@ -507,8 +932,7 @@ define <8 x i8> @test_pssub_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_pssubu_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pssubu_b: ; RV32: # %bb.0: -; RV32-NEXT: pssubu.b a0, a0, a2 -; RV32-NEXT: pssubu.b a1, a1, a3 +; RV32-NEXT: pssubu.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssubu_b: @@ -524,8 +948,7 @@ define <8 x i8> @test_pssubu_b(<8 x i8> %a, <8 
x i8> %b) { define <4 x i16> @test_paadd_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_paadd_h: ; RV32: # %bb.0: -; RV32-NEXT: paadd.h a0, a0, a2 -; RV32-NEXT: paadd.h a1, a1, a3 +; RV32-NEXT: paadd.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_paadd_h: @@ -545,8 +968,7 @@ define <4 x i16> @test_paadd_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_paaddu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_paaddu_h: ; RV32: # %bb.0: -; RV32-NEXT: paaddu.h a0, a0, a2 -; RV32-NEXT: paaddu.h a1, a1, a3 +; RV32-NEXT: paaddu.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_paaddu_h: @@ -564,8 +986,7 @@ define <4 x i16> @test_paaddu_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_paadd_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_paadd_b: ; RV32: # %bb.0: -; RV32-NEXT: paadd.b a0, a0, a2 -; RV32-NEXT: paadd.b a1, a1, a3 +; RV32-NEXT: paadd.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_paadd_b: @@ -584,8 +1005,7 @@ define <8 x i8> @test_paadd_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_paaddu_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_paaddu_b: ; RV32: # %bb.0: -; RV32-NEXT: paaddu.b a0, a0, a2 -; RV32-NEXT: paaddu.b a1, a1, a3 +; RV32-NEXT: paaddu.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_paaddu_b: @@ -602,8 +1022,10 @@ define <8 x i8> @test_paaddu_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_pabs_w(<2 x i32> %a) { ; RV32-LABEL: test_pabs_w: ; RV32: # %bb.0: -; RV32-NEXT: abs a0, a0 -; RV32-NEXT: abs a1, a1 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: psub.dw a2, a2, a0 +; RV32-NEXT: pmax.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pabs_w: @@ -618,8 +1040,12 @@ define <2 x i32> @test_pabs_w(<2 x i32> %a) { define <4 x i16> @test_pabs_h(<4 x i16> %a) { ; RV32-LABEL: test_pabs_h: ; RV32: # %bb.0: -; RV32-NEXT: pabs.h a0, a0 -; RV32-NEXT: pabs.h a1, a1 +; RV32-NEXT: lui a2, %hi(.LCPI38_0) +; RV32-NEXT: lw a4, %lo(.LCPI38_0)(a2) +; RV32-NEXT: addi a2, a2, %lo(.LCPI38_0) +; RV32-NEXT: 
lw a5, 4(a2) +; RV32-NEXT: psub.dh a2, a4, a0 +; RV32-NEXT: pmax.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pabs_h: @@ -633,8 +1059,12 @@ define <4 x i16> @test_pabs_h(<4 x i16> %a) { define <8 x i8> @test_pabs_b(<8 x i8> %a) { ; RV32-LABEL: test_pabs_b: ; RV32: # %bb.0: -; RV32-NEXT: pabs.b a0, a0 -; RV32-NEXT: pabs.b a1, a1 +; RV32-NEXT: lui a2, %hi(.LCPI39_0) +; RV32-NEXT: lw a4, %lo(.LCPI39_0)(a2) +; RV32-NEXT: addi a2, a2, %lo(.LCPI39_0) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: psub.db a2, a4, a0 +; RV32-NEXT: pmax.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pabs_b: @@ -650,12 +1080,9 @@ define <8 x i8> @test_pabs_b(<8 x i8> %a) { define <2 x i32> @test_pdif_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pdif_w: ; RV32: # %bb.0: -; RV32-NEXT: min a4, a0, a2 -; RV32-NEXT: max a0, a0, a2 -; RV32-NEXT: min a2, a1, a3 -; RV32-NEXT: max a1, a1, a3 -; RV32-NEXT: sub a0, a0, a4 -; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: pmin.dw a4, a0, a2 +; RV32-NEXT: pmax.dw a0, a0, a2 +; RV32-NEXT: psub.dw a0, a0, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pdif_w: @@ -675,12 +1102,9 @@ define <2 x i32> @test_pdif_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_pdifu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pdifu_w: ; RV32: # %bb.0: -; RV32-NEXT: minu a4, a0, a2 -; RV32-NEXT: maxu a0, a0, a2 -; RV32-NEXT: minu a2, a1, a3 -; RV32-NEXT: maxu a1, a1, a3 -; RV32-NEXT: sub a0, a0, a4 -; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: pminu.dw a4, a0, a2 +; RV32-NEXT: pmaxu.dw a0, a0, a2 +; RV32-NEXT: psub.dw a0, a0, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pdifu_w: @@ -700,8 +1124,9 @@ define <2 x i32> @test_pdifu_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pdif_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pdif_h: ; RV32: # %bb.0: -; RV32-NEXT: pabd.h a0, a0, a2 -; RV32-NEXT: pabd.h a1, a1, a3 +; RV32-NEXT: pmin.dh a4, a0, a2 +; RV32-NEXT: pmax.dh a0, a0, a2 +; RV32-NEXT: psub.dh a0, a0, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pdif_h: @@ 
-719,8 +1144,9 @@ define <4 x i16> @test_pdif_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_pdifu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pdifu_h: ; RV32: # %bb.0: -; RV32-NEXT: pabdu.h a0, a0, a2 -; RV32-NEXT: pabdu.h a1, a1, a3 +; RV32-NEXT: pminu.dh a4, a0, a2 +; RV32-NEXT: pmaxu.dh a0, a0, a2 +; RV32-NEXT: psub.dh a0, a0, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pdifu_h: @@ -737,8 +1163,9 @@ define <4 x i16> @test_pdifu_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_pdif_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pdif_b: ; RV32: # %bb.0: -; RV32-NEXT: pabd.b a0, a0, a2 -; RV32-NEXT: pabd.b a1, a1, a3 +; RV32-NEXT: pmin.db a4, a0, a2 +; RV32-NEXT: pmax.db a0, a0, a2 +; RV32-NEXT: psub.db a0, a0, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pdif_b: @@ -755,8 +1182,9 @@ define <8 x i8> @test_pdif_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_pdifu_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pdifu_b: ; RV32: # %bb.0: -; RV32-NEXT: pabdu.b a0, a0, a2 -; RV32-NEXT: pabdu.b a1, a1, a3 +; RV32-NEXT: pminu.db a4, a0, a2 +; RV32-NEXT: pmaxu.db a0, a0, a2 +; RV32-NEXT: psub.db a0, a0, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pdifu_b: @@ -774,24 +1202,7 @@ define <8 x i8> @test_pdifu_b(<8 x i8> %a, <8 x i8> %b) { define <4 x i16> @test_pasub_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pasub_h: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a1, 16 -; RV32-NEXT: sext.h a1, a1 -; RV32-NEXT: srai a5, a0, 16 -; RV32-NEXT: sext.h a0, a0 -; RV32-NEXT: srai a6, a3, 16 -; RV32-NEXT: sext.h a3, a3 -; RV32-NEXT: srai a7, a2, 16 -; RV32-NEXT: sext.h a2, a2 -; RV32-NEXT: sub a0, a0, a2 -; RV32-NEXT: sub a2, a5, a7 -; RV32-NEXT: sub a1, a1, a3 -; RV32-NEXT: sub a3, a4, a6 -; RV32-NEXT: srli a3, a3, 1 -; RV32-NEXT: srli a1, a1, 1 -; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: srli a0, a0, 1 -; RV32-NEXT: pack a0, a0, a2 -; RV32-NEXT: pack a1, a1, a3 +; RV32-NEXT: pasub.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pasub_h: @@ -811,24 +1222,7 @@ 
define <4 x i16> @test_pasub_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_pasubu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pasubu_h: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a1, 16 -; RV32-NEXT: zext.h a1, a1 -; RV32-NEXT: srli a5, a0, 16 -; RV32-NEXT: zext.h a0, a0 -; RV32-NEXT: srli a6, a3, 16 -; RV32-NEXT: zext.h a3, a3 -; RV32-NEXT: srli a7, a2, 16 -; RV32-NEXT: zext.h a2, a2 -; RV32-NEXT: sub a0, a0, a2 -; RV32-NEXT: sub a2, a5, a7 -; RV32-NEXT: sub a1, a1, a3 -; RV32-NEXT: sub a3, a4, a6 -; RV32-NEXT: srli a3, a3, 1 -; RV32-NEXT: srli a1, a1, 1 -; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: srli a0, a0, 1 -; RV32-NEXT: pack a0, a0, a2 -; RV32-NEXT: pack a1, a1, a3 +; RV32-NEXT: pasubu.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pasubu_h: @@ -847,58 +1241,7 @@ define <4 x i16> @test_pasubu_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_pasub_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pasub_b: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a1, 24 -; RV32-NEXT: srli a5, a1, 16 -; RV32-NEXT: srli a6, a1, 8 -; RV32-NEXT: srli a7, a0, 24 -; RV32-NEXT: srli t0, a0, 16 -; RV32-NEXT: pack a4, a5, a4 -; RV32-NEXT: srli a5, a0, 8 -; RV32-NEXT: pack a1, a1, a6 -; RV32-NEXT: srli a6, a3, 24 -; RV32-NEXT: pack a7, t0, a7 -; RV32-NEXT: srli t0, a3, 16 -; RV32-NEXT: pack a0, a0, a5 -; RV32-NEXT: srli a5, a3, 8 -; RV32-NEXT: pack a6, t0, a6 -; RV32-NEXT: srli t0, a2, 24 -; RV32-NEXT: pack a3, a3, a5 -; RV32-NEXT: srli a5, a2, 16 -; RV32-NEXT: pack a5, a5, t0 -; RV32-NEXT: srli t0, a2, 8 -; RV32-NEXT: pack a2, a2, t0 -; RV32-NEXT: pslli.h a4, a4, 8 -; RV32-NEXT: pslli.h a1, a1, 8 -; RV32-NEXT: pslli.h a7, a7, 8 -; RV32-NEXT: pslli.h a0, a0, 8 -; RV32-NEXT: pslli.h a6, a6, 8 -; RV32-NEXT: pslli.h a3, a3, 8 -; RV32-NEXT: pslli.h a5, a5, 8 -; RV32-NEXT: pslli.h a2, a2, 8 -; RV32-NEXT: psrai.h a4, a4, 8 -; RV32-NEXT: psrai.h a1, a1, 8 -; RV32-NEXT: psrai.h a7, a7, 8 -; RV32-NEXT: psrai.h a0, a0, 8 -; RV32-NEXT: psrai.h a6, a6, 8 -; RV32-NEXT: psrai.h 
a3, a3, 8 -; RV32-NEXT: psrai.h a5, a5, 8 -; RV32-NEXT: psrai.h a2, a2, 8 -; RV32-NEXT: psub.h a0, a0, a2 -; RV32-NEXT: psub.h a2, a7, a5 -; RV32-NEXT: psub.h a1, a1, a3 -; RV32-NEXT: psub.h a3, a4, a6 -; RV32-NEXT: psrli.h a3, a3, 1 -; RV32-NEXT: psrli.h a5, a2, 1 -; RV32-NEXT: psrli.h a2, a1, 1 -; RV32-NEXT: psrli.h a4, a0, 1 -; RV32-NEXT: srli a1, a5, 16 -; RV32-NEXT: srli a7, a3, 16 -; RV32-NEXT: srli a0, a4, 16 -; RV32-NEXT: srli a6, a2, 16 -; RV32-NEXT: ppaire.db a0, a4, a0 -; RV32-NEXT: ppaire.db a2, a2, a6 -; RV32-NEXT: pack a0, a0, a1 -; RV32-NEXT: pack a1, a2, a3 +; RV32-NEXT: pasub.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pasub_b: @@ -917,51 +1260,7 @@ define <8 x i8> @test_pasub_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_pasubu_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pasubu_b: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a1, 24 -; RV32-NEXT: srli a5, a1, 16 -; RV32-NEXT: pli.h a6, 255 -; RV32-NEXT: srli a7, a1, 8 -; RV32-NEXT: srli t0, a0, 24 -; RV32-NEXT: srli t1, a0, 16 -; RV32-NEXT: pack a4, a5, a4 -; RV32-NEXT: srli a5, a0, 8 -; RV32-NEXT: pack a1, a1, a7 -; RV32-NEXT: srli a7, a3, 24 -; RV32-NEXT: pack t0, t1, t0 -; RV32-NEXT: srli t1, a3, 16 -; RV32-NEXT: pack a0, a0, a5 -; RV32-NEXT: srli a5, a3, 8 -; RV32-NEXT: pack a7, t1, a7 -; RV32-NEXT: srli t1, a2, 24 -; RV32-NEXT: pack a3, a3, a5 -; RV32-NEXT: srli a5, a2, 16 -; RV32-NEXT: pack a5, a5, t1 -; RV32-NEXT: srli t1, a2, 8 -; RV32-NEXT: pack a2, a2, t1 -; RV32-NEXT: and a4, a4, a6 -; RV32-NEXT: and a1, a1, a6 -; RV32-NEXT: and t0, t0, a6 -; RV32-NEXT: and a0, a0, a6 -; RV32-NEXT: and a7, a7, a6 -; RV32-NEXT: and a3, a3, a6 -; RV32-NEXT: and a5, a5, a6 -; RV32-NEXT: and a2, a2, a6 -; RV32-NEXT: psub.h a0, a0, a2 -; RV32-NEXT: psub.h a2, t0, a5 -; RV32-NEXT: psub.h a1, a1, a3 -; RV32-NEXT: psub.h a3, a4, a7 -; RV32-NEXT: psrli.h a3, a3, 1 -; RV32-NEXT: psrli.h a5, a2, 1 -; RV32-NEXT: psrli.h a2, a1, 1 -; RV32-NEXT: psrli.h a4, a0, 1 -; RV32-NEXT: srli a1, a5, 16 -; 
RV32-NEXT: srli a7, a3, 16 -; RV32-NEXT: srli a0, a4, 16 -; RV32-NEXT: srli a6, a2, 16 -; RV32-NEXT: ppaire.db a0, a4, a0 -; RV32-NEXT: ppaire.db a2, a2, a6 -; RV32-NEXT: pack a0, a0, a1 -; RV32-NEXT: pack a1, a2, a3 +; RV32-NEXT: pasubu.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pasubu_b: @@ -980,8 +1279,10 @@ define <8 x i8> @test_pasubu_b(<8 x i8> %a, <8 x i8> %b) { define <4 x i16> @test_pli_h() { ; RV32-LABEL: test_pli_h: ; RV32: # %bb.0: -; RV32-NEXT: pli.h a0, 100 -; RV32-NEXT: pli.h a1, 100 +; RV32-NEXT: lui a1, %hi(.LCPI50_0) +; RV32-NEXT: lw a0, %lo(.LCPI50_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI50_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_pli_h: @@ -1009,8 +1310,10 @@ define <2 x i32> @test_pli_h_v2i32() { define <8 x i8> @test_pli_b() { ; RV32-LABEL: test_pli_b: ; RV32: # %bb.0: -; RV32-NEXT: pli.b a0, 64 -; RV32-NEXT: pli.b a1, 64 +; RV32-NEXT: lui a1, %hi(.LCPI52_0) +; RV32-NEXT: lw a0, %lo(.LCPI52_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI52_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_pli_b: @@ -1023,8 +1326,10 @@ define <8 x i8> @test_pli_b() { define <4 x i16> @test_pli_b_v4i16() { ; RV32-LABEL: test_pli_b_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: pli.b a0, 64 -; RV32-NEXT: pli.b a1, 64 +; RV32-NEXT: lui a1, %hi(.LCPI53_0) +; RV32-NEXT: lw a0, %lo(.LCPI53_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI53_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_pli_b_v4i16: @@ -1066,8 +1371,10 @@ define <2 x i32> @test_pli_w() { define <4 x i16> @test_plui_h() { ; RV32-LABEL: test_plui_h: ; RV32: # %bb.0: -; RV32-NEXT: plui.h a0, 100 -; RV32-NEXT: plui.h a1, 100 +; RV32-NEXT: lui a1, %hi(.LCPI56_0) +; RV32-NEXT: lw a0, %lo(.LCPI56_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI56_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_plui_h: @@ -1080,8 +1387,10 @@ define <4 x i16> @test_plui_h() { define <4 x i16> @test_plui_h_negative() { ; RV32-LABEL: test_plui_h_negative: 
; RV32: # %bb.0: -; RV32-NEXT: plui.h a0, -412 -; RV32-NEXT: plui.h a1, -412 +; RV32-NEXT: lui a1, %hi(.LCPI57_0) +; RV32-NEXT: lw a0, %lo(.LCPI57_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI57_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_plui_h_negative: @@ -1150,8 +1459,10 @@ define <2 x i32> @test_plui_w_negative() { define <8 x i8> @test_allones_v8i8() { ; RV32-LABEL: test_allones_v8i8: ; RV32: # %bb.0: -; RV32-NEXT: li a0, -1 -; RV32-NEXT: li a1, -1 +; RV32-NEXT: lui a1, %hi(.LCPI62_0) +; RV32-NEXT: lw a0, %lo(.LCPI62_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI62_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_allones_v8i8: @@ -1164,8 +1475,10 @@ define <8 x i8> @test_allones_v8i8() { define <4 x i16> @test_allones_v4i16() { ; RV32-LABEL: test_allones_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: li a0, -1 -; RV32-NEXT: li a1, -1 +; RV32-NEXT: lui a1, %hi(.LCPI63_0) +; RV32-NEXT: lw a0, %lo(.LCPI63_0)(a1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI63_0) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: test_allones_v4i16: @@ -1190,17 +1503,37 @@ define <2 x i32> @test_allones_v2i32() { } define i16 @test_extract_vector_16(<4 x i16> %a) { -; CHECK-LABEL: test_extract_vector_16: -; CHECK: # %bb.0: -; CHECK-NEXT: ret +; RV32-LABEL: test_extract_vector_16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: test_extract_vector_16: +; RV64: # %bb.0: +; RV64-NEXT: ret %extracted = extractelement <4 x i16> %a, i32 0 ret i16 %extracted } define i8 @test_extract_vector_8(<8 x i8> %a) { -; CHECK-LABEL: test_extract_vector_8: -; CHECK: # %bb.0: -; CHECK-NEXT: ret +; RV32-LABEL: test_extract_vector_8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 8(sp) +; 
RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: test_extract_vector_8: +; RV64: # %bb.0: +; RV64-NEXT: ret %extracted = extractelement <8 x i8> %a, i32 0 ret i8 %extracted } @@ -1230,8 +1563,23 @@ define i32 @test_extract_vector_32_elem1(<2 x i32> %a) { define <4 x i16> @test_insert_vector_16(<4 x i16> %a, i16 %val) { ; RV32-LABEL: test_insert_vector_16: ; RV32: # %bb.0: -; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: pack a0, a2, a0 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sh a2, 24(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: lw a0, 12(sp) +; RV32-NEXT: sw a0, 20(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: sh a0, 18(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: sh a0, 16(sp) +; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_insert_vector_16: @@ -1247,8 +1595,25 @@ define <4 x i16> @test_insert_vector_16(<4 x i16> %a, i16 %val) { define <4 x i16> @test_insert_vector_16_elem2(<4 x i16> %a, i16 %val) { ; RV32-LABEL: test_insert_vector_16_elem2: ; RV32: # %bb.0: -; RV32-NEXT: srli a1, a1, 16 -; RV32-NEXT: pack a1, a2, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sh a2, 24(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lw a1, 24(sp) +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sh a1, 22(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: sh a0, 20(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: sh a0, 18(sp) +; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: sh a0, 16(sp) +; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_insert_vector_16_elem2: @@ -1265,8 +1630,25 @@ define <4 x i16> 
@test_insert_vector_16_elem2(<4 x i16> %a, i16 %val) { define <8 x i8> @test_insert_vector_8(<8 x i8> %a, i8 %val) { ; RV32-LABEL: test_insert_vector_8: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 255 -; RV32-NEXT: mvm a0, a2, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sb a2, 24(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: lw a0, 12(sp) +; RV32-NEXT: sw a0, 20(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: sh a0, 18(sp) +; RV32-NEXT: lbu a0, 9(sp) +; RV32-NEXT: sb a0, 17(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: sb a0, 16(sp) +; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_insert_vector_8: @@ -1281,9 +1663,27 @@ define <8 x i8> @test_insert_vector_8(<8 x i8> %a, i8 %val) { define <8 x i8> @test_insert_vector_8_elem3(<8 x i8> %a, i8 %val) { ; RV32-LABEL: test_insert_vector_8_elem3: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a2, 24 -; RV32-NEXT: lui a3, 1044480 -; RV32-NEXT: mvm a0, a2, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sb a2, 24(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lw a1, 24(sp) +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: sb a0, 19(sp) +; RV32-NEXT: lbu a0, 10(sp) +; RV32-NEXT: sb a0, 18(sp) +; RV32-NEXT: lbu a0, 9(sp) +; RV32-NEXT: sb a0, 17(sp) +; RV32-NEXT: lbu a0, 8(sp) +; RV32-NEXT: sb a0, 16(sp) +; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_insert_vector_8_elem3: @@ -1300,7 +1700,11 @@ define <8 x i8> @test_insert_vector_8_elem3(<8 x i8> %a, i8 %val) { define <2 x i32> @test_insert_vector_32(<2 x i32> %a, i32 %val) { ; RV32-LABEL: test_insert_vector_32: ; 
RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: mv a0, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_insert_vector_32: @@ -1315,7 +1719,11 @@ define <2 x i32> @test_insert_vector_32(<2 x i32> %a, i32 %val) { define <2 x i32> @test_insert_vector_32_elem1(<2 x i32> %a, i32 %val) { ; RV32-LABEL: test_insert_vector_32_elem1: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: mv a1, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_insert_vector_32_elem1: @@ -1330,8 +1738,7 @@ define <2 x i32> @test_insert_vector_32_elem1(<2 x i32> %a, i32 %val) { define <2 x i32> @test_padd_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_padd_w: ; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: padd.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_w: @@ -1345,8 +1752,7 @@ define <2 x i32> @test_padd_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_psub_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psub_w: ; RV32: # %bb.0: -; RV32-NEXT: sub a0, a0, a2 -; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: psub.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psub_w: @@ -1361,8 +1767,7 @@ define <2 x i32> @test_psub_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_psadd_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psadd_w: ; RV32: # %bb.0: -; RV32-NEXT: sadd a0, a0, a2 -; RV32-NEXT: sadd a1, a1, a3 +; RV32-NEXT: psadd.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psadd_w: @@ -1376,8 +1781,7 @@ define <2 x i32> @test_psadd_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_psaddu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psaddu_w: ; RV32: # %bb.0: -; RV32-NEXT: saddu a0, a0, a2 -; RV32-NEXT: saddu a1, a1, a3 +; RV32-NEXT: psaddu.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psaddu_w: @@ 
-1392,8 +1796,7 @@ define <2 x i32> @test_psaddu_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_pssub_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pssub_w: ; RV32: # %bb.0: -; RV32-NEXT: ssub a0, a0, a2 -; RV32-NEXT: ssub a1, a1, a3 +; RV32-NEXT: pssub.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssub_w: @@ -1407,8 +1810,7 @@ define <2 x i32> @test_pssub_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_pssubu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pssubu_w: ; RV32: # %bb.0: -; RV32-NEXT: ssubu a0, a0, a2 -; RV32-NEXT: ssubu a1, a1, a3 +; RV32-NEXT: pssubu.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssubu_w: @@ -1424,8 +1826,7 @@ define <2 x i32> @test_pssubu_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_paadd_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_paadd_w: ; RV32: # %bb.0: -; RV32-NEXT: aadd a0, a0, a2 -; RV32-NEXT: aadd a1, a1, a3 +; RV32-NEXT: paadd.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_paadd_w: @@ -1445,8 +1846,7 @@ define <2 x i32> @test_paadd_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_paaddu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_paaddu_w: ; RV32: # %bb.0: -; RV32-NEXT: aaddu a0, a0, a2 -; RV32-NEXT: aaddu a1, a1, a3 +; RV32-NEXT: paaddu.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_paaddu_w: @@ -1465,16 +1865,7 @@ define <2 x i32> @test_paaddu_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_pasub_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pasub_w: ; RV32: # %bb.0: -; RV32-NEXT: mv a4, a3 -; RV32-NEXT: mv a6, a1 -; RV32-NEXT: srai a1, a0, 31 -; RV32-NEXT: srai a7, a6, 31 -; RV32-NEXT: srai a3, a2, 31 -; RV32-NEXT: srai a5, a4, 31 -; RV32-NEXT: subd a4, a6, a4 -; RV32-NEXT: subd a0, a0, a2 -; RV32-NEXT: nsrli a0, a0, 1 -; RV32-NEXT: nsrli a1, a4, 1 +; RV32-NEXT: pasub.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pasub_w: @@ -1494,10 +1885,7 @@ define <2 x i32> @test_pasub_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> 
@test_pasubu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pasubu_w: ; RV32: # %bb.0: -; RV32-NEXT: wsubu a4, a1, a3 -; RV32-NEXT: wsubu a0, a0, a2 -; RV32-NEXT: nsrli a0, a0, 1 -; RV32-NEXT: nsrli a1, a4, 1 +; RV32-NEXT: pasubu.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pasubu_w: @@ -1532,8 +1920,21 @@ define <2 x i32> @test_non_const_splat_i32(i32 %elt) { define <8 x i8> @test_padd_bs_splat_lhs(<8 x i8> %a, i8 %b) { ; RV32-LABEL: test_padd_bs_splat_lhs: ; RV32: # %bb.0: -; RV32-NEXT: padd.bs a0, a0, a2 -; RV32-NEXT: padd.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sb a2, 12(sp) +; RV32-NEXT: sb a2, 13(sp) +; RV32-NEXT: sb a2, 14(sp) +; RV32-NEXT: sb a2, 15(sp) +; RV32-NEXT: sb a2, 8(sp) +; RV32-NEXT: sb a2, 9(sp) +; RV32-NEXT: sb a2, 10(sp) +; RV32-NEXT: sb a2, 11(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: padd.db a0, a2, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_bs_splat_lhs: @@ -1549,8 +1950,21 @@ define <8 x i8> @test_padd_bs_splat_lhs(<8 x i8> %a, i8 %b) { define <8 x i8> @test_padd_bs_splat_rhs(<8 x i8> %a, i8 %b) { ; RV32-LABEL: test_padd_bs_splat_rhs: ; RV32: # %bb.0: -; RV32-NEXT: padd.bs a0, a0, a2 -; RV32-NEXT: padd.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sb a2, 12(sp) +; RV32-NEXT: sb a2, 13(sp) +; RV32-NEXT: sb a2, 14(sp) +; RV32-NEXT: sb a2, 15(sp) +; RV32-NEXT: sb a2, 8(sp) +; RV32-NEXT: sb a2, 9(sp) +; RV32-NEXT: sb a2, 10(sp) +; RV32-NEXT: sb a2, 11(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: padd.db a0, a0, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_bs_splat_rhs: @@ -1566,8 +1980,17 @@ define <8 x i8> @test_padd_bs_splat_rhs(<8 x i8> %a, i8 %b) { define <4 x i16> @test_padd_hs_splat_lhs(<4 x i16> %a, i16 %b) { ; RV32-LABEL: 
test_padd_hs_splat_lhs: ; RV32: # %bb.0: -; RV32-NEXT: padd.hs a0, a0, a2 -; RV32-NEXT: padd.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sh a2, 8(sp) +; RV32-NEXT: sh a2, 10(sp) +; RV32-NEXT: sh a2, 12(sp) +; RV32-NEXT: sh a2, 14(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: padd.dh a0, a2, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_hs_splat_lhs: @@ -1583,8 +2006,17 @@ define <4 x i16> @test_padd_hs_splat_lhs(<4 x i16> %a, i16 %b) { define <4 x i16> @test_padd_hs_splat_rhs(<4 x i16> %a, i16 %b) { ; RV32-LABEL: test_padd_hs_splat_rhs: ; RV32: # %bb.0: -; RV32-NEXT: padd.hs a0, a0, a2 -; RV32-NEXT: padd.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sh a2, 8(sp) +; RV32-NEXT: sh a2, 10(sp) +; RV32-NEXT: sh a2, 12(sp) +; RV32-NEXT: sh a2, 14(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: padd.dh a0, a0, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_hs_splat_rhs: @@ -1600,8 +2032,8 @@ define <4 x i16> @test_padd_hs_splat_rhs(<4 x i16> %a, i16 %b) { define <2 x i32> @test_padd_ws_splat_lhs(<2 x i32> %a, i32 %b) { ; RV32-LABEL: test_padd_ws_splat_lhs: ; RV32: # %bb.0: -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: padd.dw a0, a2, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_ws_splat_lhs: @@ -1617,8 +2049,8 @@ define <2 x i32> @test_padd_ws_splat_lhs(<2 x i32> %a, i32 %b) { define <2 x i32> @test_padd_ws_splat_rhs(<2 x i32> %a, i32 %b) { ; RV32-LABEL: test_padd_ws_splat_rhs: ; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: padd.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_padd_ws_splat_rhs: @@ -1634,16 +2066,20 @@ define <2 x i32> @test_padd_ws_splat_rhs(<2 x i32> %a, 
i32 %b) { define <8 x i8> @test_build_vector_i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h) { ; RV32-LABEL: test_build_vector_i8: ; RV32: # %bb.0: -; RV32-NEXT: mv t2, a6 -; RV32-NEXT: mv t4, a2 -; RV32-NEXT: mv a6, a5 -; RV32-NEXT: mv t1, a4 -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: mv t3, a0 -; RV32-NEXT: ppaire.db a0, t3, a2 -; RV32-NEXT: ppaire.db a2, t1, a6 -; RV32-NEXT: pack a0, a0, a1 -; RV32-NEXT: pack a1, a2, a3 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sb a4, 12(sp) +; RV32-NEXT: sb a5, 13(sp) +; RV32-NEXT: sb a6, 14(sp) +; RV32-NEXT: sb a7, 15(sp) +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: sb a1, 9(sp) +; RV32-NEXT: sb a2, 10(sp) +; RV32-NEXT: sb a3, 11(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_build_vector_i8: @@ -1670,8 +2106,16 @@ define <8 x i8> @test_build_vector_i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, define <4 x i16> @test_build_vector_i16(i16 %a, i16 %b, i16 %c, i16 %d) { ; RV32-LABEL: test_build_vector_i16: ; RV32: # %bb.0: -; RV32-NEXT: pack a0, a0, a1 -; RV32-NEXT: pack a1, a2, a3 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: sh a1, 10(sp) +; RV32-NEXT: sh a2, 12(sp) +; RV32-NEXT: sh a3, 14(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_build_vector_i16: @@ -1705,8 +2149,26 @@ define <2 x i32> @test_build_vector_i32(i32 %a, i32 %b) { define <4 x i16> @test_pslli_h(<4 x i16> %a) { ; RV32-LABEL: test_pslli_h: ; RV32: # %bb.0: -; RV32-NEXT: pslli.h a0, a0, 2 -; RV32-NEXT: pslli.h a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: sh a1, 14(sp) +; 
RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pslli_h: @@ -1721,8 +2183,38 @@ define <4 x i16> @test_pslli_h(<4 x i16> %a) { define <8 x i8> @test_pslli_b(<8 x i8> %a) { ; RV32-LABEL: test_pslli_b: ; RV32: # %bb.0: -; RV32-NEXT: pslli.b a0, a0, 2 -; RV32-NEXT: pslli.b a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: sb a1, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pslli_b: @@ -1737,8 +2229,8 @@ define <8 x i8> @test_pslli_b(<8 x i8> %a) { define <2 x i32> @test_pslli_w(<2 x i32> %a) { ; RV32-LABEL: test_pslli_w: ; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 2 ; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a0, a0, 2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pslli_w: @@ -1753,8 +2245,8 @@ define <2 x i32> @test_pslli_w(<2 x i32> %a) { define <2 x i32> 
@test_psrli_w(<2 x i32> %a) { ; RV32-LABEL: test_psrli_w: ; RV32: # %bb.0: -; RV32-NEXT: srli a0, a0, 2 ; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: srli a0, a0, 2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrli_w: @@ -1768,8 +2260,26 @@ define <2 x i32> @test_psrli_w(<2 x i32> %a) { define <4 x i16> @test_psrli_h(<4 x i16> %a) { ; RV32-LABEL: test_psrli_h: ; RV32: # %bb.0: -; RV32-NEXT: psrli.h a0, a0, 2 -; RV32-NEXT: psrli.h a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lhu a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: sh a1, 14(sp) +; RV32-NEXT: lhu a0, 4(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lhu a0, 2(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lhu a0, 0(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrli_h: @@ -1783,8 +2293,46 @@ define <4 x i16> @test_psrli_h(<4 x i16> %a) { define <8 x i8> @test_psrli_b(<8 x i8> %a) { ; RV32-LABEL: test_psrli_b: ; RV32: # %bb.0: -; RV32-NEXT: psrli.b a0, a0, 2 -; RV32-NEXT: psrli.b a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: slli a1, a1, 24 +; RV32-NEXT: srli a1, a1, 26 +; RV32-NEXT: sb a1, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 11(sp) +; 
RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: srli a0, a0, 26 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrli_b: @@ -1799,8 +2347,8 @@ define <8 x i8> @test_psrli_b(<8 x i8> %a) { define <2 x i32> @test_psrai_w(<2 x i32> %a) { ; RV32-LABEL: test_psrai_w: ; RV32: # %bb.0: -; RV32-NEXT: srai a0, a0, 2 ; RV32-NEXT: srai a1, a1, 2 +; RV32-NEXT: srai a0, a0, 2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrai_w: @@ -1814,8 +2362,26 @@ define <2 x i32> @test_psrai_w(<2 x i32> %a) { define <4 x i16> @test_psrai_h(<4 x i16> %a) { ; RV32-LABEL: test_psrai_h: ; RV32: # %bb.0: -; RV32-NEXT: psrai.h a0, a0, 2 -; RV32-NEXT: psrai.h a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: sh a1, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrai_h: @@ -1829,8 +2395,38 @@ define <4 x i16> @test_psrai_h(<4 x i16> %a) { define <8 x i8> @test_psrai_b(<8 x i8> %a) { ; RV32-LABEL: test_psrai_b: ; RV32: # %bb.0: -; RV32-NEXT: psrai.b a0, a0, 2 -; RV32-NEXT: psrai.b a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lb a1, 7(sp) 
+; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: sb a1, 15(sp) +; RV32-NEXT: lb a0, 6(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lb a0, 5(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lb a0, 4(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lb a0, 3(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lb a0, 2(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lb a0, 1(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lb a0, 0(sp) +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrai_b: @@ -1845,8 +2441,8 @@ define <8 x i8> @test_psrai_b(<8 x i8> %a) { define <2 x i32> @test_psslai_w(<2 x i32> %a) { ; RV32-LABEL: test_psslai_w: ; RV32: # %bb.0: -; RV32-NEXT: sslai a0, a0, 2 ; RV32-NEXT: sslai a1, a1, 2 +; RV32-NEXT: sslai a0, a0, 2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psslai_w: @@ -1861,8 +2457,34 @@ define <2 x i32> @test_psslai_w(<2 x i32> %a) { define <4 x i16> @test_psslai_h(<4 x i16> %a) { ; RV32-LABEL: test_psslai_h: ; RV32: # %bb.0: -; RV32-NEXT: psslai.h a0, a0, 2 -; RV32-NEXT: psslai.h a1, a1, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: sslai a0, a1, 2 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: 
sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psslai_h: @@ -1877,20 +2499,54 @@ define <4 x i16> @test_psslai_h(<4 x i16> %a) { define <8 x i8> @test_psslai_b(<8 x i8> %a) { ; RV32-LABEL: test_psslai_b: ; RV32: # %bb.0: -; RV32-NEXT: pmsltz.b a2, a0 -; RV32-NEXT: pli.b a3, -128 -; RV32-NEXT: pli.b a4, 127 -; RV32-NEXT: pslli.b a5, a0, 2 -; RV32-NEXT: pmsltz.b a6, a1 -; RV32-NEXT: merge a2, a4, a3 -; RV32-NEXT: merge a6, a4, a3 -; RV32-NEXT: pslli.b a3, a1, 2 -; RV32-NEXT: psrai.b a4, a5, 2 -; RV32-NEXT: pmseq.b a0, a0, a4 -; RV32-NEXT: psrai.b a4, a3, 2 -; RV32-NEXT: pmseq.b a1, a1, a4 -; RV32-NEXT: merge a0, a2, a5 -; RV32-NEXT: merge a1, a6, a3 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: slli a1, a1, 24 +; RV32-NEXT: sslai a0, a1, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: 
lbu a0, 0(sp) +; RV32-NEXT: slli a0, a0, 24 +; RV32-NEXT: sslai a0, a0, 2 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psslai_b: @@ -1912,23 +2568,35 @@ define <8 x i8> @test_psslai_b(<8 x i8> %a) { define <4 x i16> @test_pssla_hs(<4 x i16> %a, i16 %shamt) { ; RV32-LABEL: test_pssla_hs: ; RV32: # %bb.0: -; RV32-NEXT: pmsltz.h a3, a0 -; RV32-NEXT: lui a4, 8 -; RV32-NEXT: plui.h a5, -512 -; RV32-NEXT: psll.hs a6, a0, a2 -; RV32-NEXT: psra.hs a7, a6, a2 -; RV32-NEXT: pmseq.h a0, a0, a7 -; RV32-NEXT: psll.hs a7, a1, a2 -; RV32-NEXT: psra.hs a2, a7, a2 -; RV32-NEXT: pmseq.h a2, a1, a2 -; RV32-NEXT: pmsltz.h a1, a1 -; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: pmv.hs a4, a4 -; RV32-NEXT: merge a3, a4, a5 -; RV32-NEXT: merge a1, a4, a5 -; RV32-NEXT: merge a0, a3, a6 -; RV32-NEXT: merge a2, a1, a7 -; RV32-NEXT: mv a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: zext.h a0, a2 +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: ssha a1, a1, a0 +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: sh a1, 14(sp) +; RV32-NEXT: lh a1, 4(sp) +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: ssha a1, a1, a0 +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: sh a1, 12(sp) +; RV32-NEXT: lh a1, 2(sp) +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: ssha a1, a1, a0 +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: sh a1, 10(sp) +; RV32-NEXT: lh a1, 0(sp) +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: ssha a0, a1, a0 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssla_hs: @@ -1954,8 +2622,8 @@ define <4 x i16> @test_pssla_hs(<4 x i16> %a, i16 %shamt) { define <2 x i32> 
@test_pssla_ws(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_pssla_ws: ; RV32: # %bb.0: -; RV32-NEXT: ssha a0, a0, a2 ; RV32-NEXT: ssha a1, a1, a2 +; RV32-NEXT: ssha a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssla_ws: @@ -1981,38 +2649,40 @@ define <2 x i32> @test_pssla_ws(<2 x i32> %a, i32 %shamt) { define <4 x i16> @test_pssla_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pssla_h: ; RV32: # %bb.0: -; RV32-NEXT: sll a4, a0, a2 -; RV32-NEXT: srli a5, a2, 16 -; RV32-NEXT: srli a6, a0, 16 -; RV32-NEXT: pmsltz.h a7, a0 -; RV32-NEXT: lui t0, 8 -; RV32-NEXT: plui.h t1, -512 -; RV32-NEXT: pmsltz.h t2, a1 -; RV32-NEXT: addi t0, t0, -1 -; RV32-NEXT: pmv.hs t0, t0 -; RV32-NEXT: merge a7, t0, t1 -; RV32-NEXT: merge t2, t0, t1 -; RV32-NEXT: sll t0, a1, a3 -; RV32-NEXT: sext.h t1, a4 -; RV32-NEXT: sra a2, t1, a2 -; RV32-NEXT: sext.h t1, t0 -; RV32-NEXT: sra t1, t1, a3 -; RV32-NEXT: srli a3, a3, 16 -; RV32-NEXT: sll a6, a6, a5 -; RV32-NEXT: pack a4, a4, a6 -; RV32-NEXT: sext.h a6, a6 -; RV32-NEXT: sra a5, a6, a5 -; RV32-NEXT: srli a6, a1, 16 -; RV32-NEXT: sll a6, a6, a3 -; RV32-NEXT: pack t0, t0, a6 -; RV32-NEXT: sext.h a6, a6 -; RV32-NEXT: sra a3, a6, a3 -; RV32-NEXT: pack a2, a2, a5 -; RV32-NEXT: pack a3, t1, a3 -; RV32-NEXT: pmseq.h a0, a0, a2 -; RV32-NEXT: pmseq.h a1, a1, a3 -; RV32-NEXT: merge a0, a7, a4 -; RV32-NEXT: merge a1, t2, t0 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lh a0, 14(sp) +; RV32-NEXT: lhu a1, 22(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: ssha a0, a0, a1 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 12(sp) +; RV32-NEXT: lhu a1, 20(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: ssha a0, a0, a1 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: lhu a1, 18(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: ssha 
a0, a0, a1 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: lhu a1, 16(sp) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: ssha a0, a0, a1 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssla_h: @@ -2058,8 +2728,8 @@ define <4 x i16> @test_pssla_h(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pssla_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pssla_w: ; RV32: # %bb.0: -; RV32-NEXT: ssha a0, a0, a2 ; RV32-NEXT: ssha a1, a1, a3 +; RV32-NEXT: ssha a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pssla_w: @@ -2089,8 +2759,26 @@ define <2 x i32> @test_pssla_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_psll_hs(<4 x i16> %a, i16 %shamt) { ; RV32-LABEL: test_psll_hs: ; RV32: # %bb.0: -; RV32-NEXT: psll.hs a0, a0, a2 -; RV32-NEXT: psll.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sll a0, a1, a2 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psll_hs: @@ -2106,8 +2794,27 @@ define <4 x i16> @test_psll_hs(<4 x i16> %a, i16 %shamt) { define <4 x i16> @test_psll_hs_mask(<4 x i16> %a, i16 %shamt) { ; RV32-LABEL: test_psll_hs_mask: ; RV32: # %bb.0: -; RV32-NEXT: psll.hs a0, a0, a2 -; RV32-NEXT: psll.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) 
+; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: andi a2, a2, 15 +; RV32-NEXT: sll a0, a1, a2 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psll_hs_mask: @@ -2124,8 +2831,38 @@ define <4 x i16> @test_psll_hs_mask(<4 x i16> %a, i16 %shamt) { define <8 x i8> @test_psll_bs(<8 x i8> %a, i8 %shamt) { ; RV32-LABEL: test_psll_bs: ; RV32: # %bb.0: -; RV32-NEXT: psll.bs a0, a0, a2 -; RV32-NEXT: psll.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sll a0, a1, a2 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psll_bs: @@ -2141,8 +2878,39 @@ define <8 x i8> @test_psll_bs(<8 x i8> %a, i8 %shamt) { define <8 x i8> @test_psll_bs_mask(<8 x i8> %a, i8 %shamt) { ; RV32-LABEL: test_psll_bs_mask: ; RV32: # %bb.0: -; RV32-NEXT: psll.bs a0, a0, a2 -; RV32-NEXT: 
psll.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: andi a2, a2, 7 +; RV32-NEXT: sll a0, a1, a2 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: sll a0, a0, a2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psll_bs_mask: @@ -2159,8 +2927,8 @@ define <8 x i8> @test_psll_bs_mask(<8 x i8> %a, i8 %shamt) { define <2 x i32> @test_psll_ws(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_psll_ws: ; RV32: # %bb.0: -; RV32-NEXT: sll a0, a0, a2 ; RV32-NEXT: sll a1, a1, a2 +; RV32-NEXT: sll a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psll_ws: @@ -2176,8 +2944,8 @@ define <2 x i32> @test_psll_ws(<2 x i32> %a, i32 %shamt) { define <2 x i32> @test_psll_ws_mask(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_psll_ws_mask: ; RV32: # %bb.0: -; RV32-NEXT: sll a0, a0, a2 ; RV32-NEXT: sll a1, a1, a2 +; RV32-NEXT: sll a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psll_ws_mask: @@ -2195,8 +2963,8 @@ define <2 x i32> @test_psll_ws_mask(<2 x i32> %a, i32 %shamt) { define <2 x i32> @test_psll_ws_vec_shamt(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psll_ws_vec_shamt: ; RV32: # %bb.0: -; RV32-NEXT: sll a0, a0, a2 ; RV32-NEXT: sll a1, a1, a3 +; RV32-NEXT: sll a0, a0, a2 ; RV32-NEXT: ret ; ; 
RV64-LABEL: test_psll_ws_vec_shamt: @@ -2215,8 +2983,26 @@ define <2 x i32> @test_psll_ws_vec_shamt(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_psrl_hs(<4 x i16> %a, i16 %shamt) { ; RV32-LABEL: test_psrl_hs: ; RV32: # %bb.0: -; RV32-NEXT: psrl.hs a0, a0, a2 -; RV32-NEXT: psrl.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lhu a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: srl a0, a1, a2 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lhu a0, 4(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lhu a0, 2(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lhu a0, 0(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_hs: @@ -2232,8 +3018,27 @@ define <4 x i16> @test_psrl_hs(<4 x i16> %a, i16 %shamt) { define <4 x i16> @test_psrl_hs_mask(<4 x i16> %a, i16 %shamt) { ; RV32-LABEL: test_psrl_hs_mask: ; RV32: # %bb.0: -; RV32-NEXT: psrl.hs a0, a0, a2 -; RV32-NEXT: psrl.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lhu a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: andi a2, a2, 15 +; RV32-NEXT: srl a0, a1, a2 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lhu a0, 4(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lhu a0, 2(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lhu a0, 0(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_hs_mask: @@ -2250,8 +3055,38 @@ define <4 x i16> @test_psrl_hs_mask(<4 x i16> %a, i16 %shamt) { define <8 x i8> @test_psrl_bs(<8 x i8> %a, i8 %shamt) { ; 
RV32-LABEL: test_psrl_bs: ; RV32: # %bb.0: -; RV32-NEXT: psrl.bs a0, a0, a2 -; RV32-NEXT: psrl.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: srl a0, a1, a2 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_bs: @@ -2267,8 +3102,39 @@ define <8 x i8> @test_psrl_bs(<8 x i8> %a, i8 %shamt) { define <8 x i8> @test_psrl_bs_mask(<8 x i8> %a, i8 %shamt) { ; RV32-LABEL: test_psrl_bs_mask: ; RV32: # %bb.0: -; RV32-NEXT: psrl.bs a0, a0, a2 -; RV32-NEXT: psrl.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: andi a2, a2, 7 +; RV32-NEXT: srl a0, a1, a2 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 10(sp) +; 
RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: srl a0, a0, a2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_bs_mask: @@ -2285,8 +3151,8 @@ define <8 x i8> @test_psrl_bs_mask(<8 x i8> %a, i8 %shamt) { define <2 x i32> @test_psrl_ws(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_psrl_ws: ; RV32: # %bb.0: -; RV32-NEXT: srl a0, a0, a2 ; RV32-NEXT: srl a1, a1, a2 +; RV32-NEXT: srl a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_ws: @@ -2302,8 +3168,8 @@ define <2 x i32> @test_psrl_ws(<2 x i32> %a, i32 %shamt) { define <2 x i32> @test_psrl_ws_mask(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_psrl_ws_mask: ; RV32: # %bb.0: -; RV32-NEXT: srl a0, a0, a2 ; RV32-NEXT: srl a1, a1, a2 +; RV32-NEXT: srl a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_ws_mask: @@ -2321,8 +3187,26 @@ define <2 x i32> @test_psrl_ws_mask(<2 x i32> %a, i32 %shamt) { define <4 x i16> @test_psra_hs(<4 x i16> %a, i16 %shamt) { ; RV32-LABEL: test_psra_hs: ; RV32: # %bb.0: -; RV32-NEXT: psra.hs a0, a0, a2 -; RV32-NEXT: psra.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sra a0, a1, a2 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psra_hs: @@ -2338,8 +3222,27 @@ define <4 x i16> @test_psra_hs(<4 x i16> %a, i16 %shamt) { define <4 x i16> @test_psra_hs_mask(<4 x 
i16> %a, i16 %shamt) { ; RV32-LABEL: test_psra_hs_mask: ; RV32: # %bb.0: -; RV32-NEXT: psra.hs a0, a0, a2 -; RV32-NEXT: psra.hs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: andi a2, a2, 15 +; RV32-NEXT: sra a0, a1, a2 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psra_hs_mask: @@ -2356,8 +3259,38 @@ define <4 x i16> @test_psra_hs_mask(<4 x i16> %a, i16 %shamt) { define <8 x i8> @test_psra_bs(<8 x i8> %a, i8 %shamt) { ; RV32-LABEL: test_psra_bs: ; RV32: # %bb.0: -; RV32-NEXT: psra.bs a0, a0, a2 -; RV32-NEXT: psra.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lb a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sra a0, a1, a2 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lb a0, 6(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lb a0, 5(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lb a0, 4(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lb a0, 3(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lb a0, 2(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lb a0, 1(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lb a0, 0(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: 
test_psra_bs: @@ -2373,8 +3306,39 @@ define <8 x i8> @test_psra_bs(<8 x i8> %a, i8 %shamt) { define <8 x i8> @test_psra_bs_mask(<8 x i8> %a, i8 %shamt) { ; RV32-LABEL: test_psra_bs_mask: ; RV32: # %bb.0: -; RV32-NEXT: psra.bs a0, a0, a2 -; RV32-NEXT: psra.bs a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lb a1, 7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: andi a2, a2, 7 +; RV32-NEXT: sra a0, a1, a2 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lb a0, 6(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lb a0, 5(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lb a0, 4(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lb a0, 3(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lb a0, 2(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lb a0, 1(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lb a0, 0(sp) +; RV32-NEXT: sra a0, a0, a2 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psra_bs_mask: @@ -2391,8 +3355,8 @@ define <8 x i8> @test_psra_bs_mask(<8 x i8> %a, i8 %shamt) { define <2 x i32> @test_psra_ws(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_psra_ws: ; RV32: # %bb.0: -; RV32-NEXT: sra a0, a0, a2 ; RV32-NEXT: sra a1, a1, a2 +; RV32-NEXT: sra a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psra_ws: @@ -2408,8 +3372,8 @@ define <2 x i32> @test_psra_ws(<2 x i32> %a, i32 %shamt) { define <2 x i32> @test_psra_ws_mask(<2 x i32> %a, i32 %shamt) { ; RV32-LABEL: test_psra_ws_mask: ; RV32: # %bb.0: -; RV32-NEXT: sra a0, a0, a2 ; RV32-NEXT: sra a1, a1, a2 +; RV32-NEXT: sra a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psra_ws_mask: @@ -2427,8 +3391,8 @@ define <2 x i32> @test_psra_ws_mask(<2 x i32> %a, i32 
%shamt) { define <2 x i32> @test_psrl_ws_vec_shamt(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psrl_ws_vec_shamt: ; RV32: # %bb.0: -; RV32-NEXT: srl a0, a0, a2 ; RV32-NEXT: srl a1, a1, a3 +; RV32-NEXT: srl a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrl_ws_vec_shamt: @@ -2447,8 +3411,8 @@ define <2 x i32> @test_psrl_ws_vec_shamt(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_psra_ws_vec_shamt(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psra_ws_vec_shamt: ; RV32: # %bb.0: -; RV32-NEXT: sra a0, a0, a2 ; RV32-NEXT: sra a1, a1, a3 +; RV32-NEXT: sra a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psra_ws_vec_shamt: @@ -2467,8 +3431,8 @@ define <2 x i32> @test_psra_ws_vec_shamt(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmulh_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulh_h: ; RV32: # %bb.0: -; RV32-NEXT: pmulh.h a0, a0, a2 ; RV32-NEXT: pmulh.h a1, a1, a3 +; RV32-NEXT: pmulh.h a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulh_h: @@ -2486,8 +3450,8 @@ define <4 x i16> @test_pmulh_h(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pmulh_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulh_w: ; RV32: # %bb.0: -; RV32-NEXT: mulh a0, a0, a2 ; RV32-NEXT: mulh a1, a1, a3 +; RV32-NEXT: mulh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulh_w: @@ -2506,8 +3470,8 @@ define <2 x i32> @test_pmulh_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmulhu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhu_h: ; RV32: # %bb.0: -; RV32-NEXT: pmulhu.h a0, a0, a2 ; RV32-NEXT: pmulhu.h a1, a1, a3 +; RV32-NEXT: pmulhu.h a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhu_h: @@ -2525,8 +3489,8 @@ define <4 x i16> @test_pmulhu_h(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pmulhu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulhu_w: ; RV32: # %bb.0: -; RV32-NEXT: mulhu a0, a0, a2 ; RV32-NEXT: mulhu a1, a1, a3 +; RV32-NEXT: mulhu a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhu_w: @@ -2545,20 
+3509,8 @@ define <2 x i32> @test_pmulhu_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmulhsu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhsu_h: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a1, 16 -; RV32-NEXT: srai a5, a0, 16 -; RV32-NEXT: srli a6, a3, 16 -; RV32-NEXT: srli a7, a2, 16 -; RV32-NEXT: mulsu.h00 a0, a0, a2 -; RV32-NEXT: mulsu.h00 a1, a1, a3 -; RV32-NEXT: mul a2, a5, a7 -; RV32-NEXT: mul a3, a4, a6 -; RV32-NEXT: srli a1, a1, 16 -; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: srli a3, a3, 16 -; RV32-NEXT: srli a2, a2, 16 -; RV32-NEXT: pack a0, a0, a2 -; RV32-NEXT: pack a1, a1, a3 +; RV32-NEXT: pmulhsu.h a1, a1, a3 +; RV32-NEXT: pmulhsu.h a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhsu_h: @@ -2576,20 +3528,8 @@ define <4 x i16> @test_pmulhsu_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_pmulhsu_h_commuted(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhsu_h_commuted: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a1, 16 -; RV32-NEXT: srli a5, a0, 16 -; RV32-NEXT: srai a6, a3, 16 -; RV32-NEXT: srai a7, a2, 16 -; RV32-NEXT: mulsu.h00 a0, a2, a0 -; RV32-NEXT: mulsu.h00 a1, a3, a1 -; RV32-NEXT: mul a2, a5, a7 -; RV32-NEXT: mul a3, a4, a6 -; RV32-NEXT: srli a1, a1, 16 -; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: srli a3, a3, 16 -; RV32-NEXT: srli a2, a2, 16 -; RV32-NEXT: pack a0, a0, a2 -; RV32-NEXT: pack a1, a1, a3 +; RV32-NEXT: pmulhsu.h a1, a3, a1 +; RV32-NEXT: pmulhsu.h a0, a2, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhsu_h_commuted: @@ -2607,8 +3547,8 @@ define <4 x i16> @test_pmulhsu_h_commuted(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pmulhsu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulhsu_w: ; RV32: # %bb.0: -; RV32-NEXT: mulhsu a0, a0, a2 ; RV32-NEXT: mulhsu a1, a1, a3 +; RV32-NEXT: mulhsu a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhsu_w: @@ -2626,8 +3566,8 @@ define <2 x i32> @test_pmulhsu_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_pmulhsu_w_commuted(<2 x i32> %a, <2 x i32> 
%b) { ; RV32-LABEL: test_pmulhsu_w_commuted: ; RV32: # %bb.0: -; RV32-NEXT: mulhsu a0, a2, a0 ; RV32-NEXT: mulhsu a1, a3, a1 +; RV32-NEXT: mulhsu a0, a2, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhsu_w_commuted: @@ -2646,24 +3586,8 @@ define <2 x i32> @test_pmulhsu_w_commuted(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmulhr_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhr_h: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a1, 16 -; RV32-NEXT: srai a5, a0, 16 -; RV32-NEXT: srai a6, a3, 16 -; RV32-NEXT: srai a7, a2, 16 -; RV32-NEXT: lui t0, 8 -; RV32-NEXT: lui t1, 8 -; RV32-NEXT: macc.h00 t1, a5, a7 -; RV32-NEXT: lui a5, 8 -; RV32-NEXT: lui a7, 8 -; RV32-NEXT: macc.h00 a5, a0, a2 -; RV32-NEXT: macc.h00 a7, a4, a6 -; RV32-NEXT: macc.h00 t0, a1, a3 -; RV32-NEXT: srli a1, t0, 16 -; RV32-NEXT: srli a2, a7, 16 -; RV32-NEXT: srli a5, a5, 16 -; RV32-NEXT: srli a0, t1, 16 -; RV32-NEXT: pack a0, a5, a0 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: pmulhr.h a1, a1, a3 +; RV32-NEXT: pmulhr.h a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhr_h: @@ -2682,13 +3606,12 @@ define <4 x i16> @test_pmulhr_h(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pmulhr_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulhr_w: ; RV32: # %bb.0: -; RV32-NEXT: mv a4, a0 +; RV32-NEXT: wmul a4, a0, a2 ; RV32-NEXT: wmul a0, a1, a3 -; RV32-NEXT: wmul a2, a4, a2 -; RV32-NEXT: lui a4, 524288 -; RV32-NEXT: waddau a2, a4, zero -; RV32-NEXT: waddau a0, a4, zero -; RV32-NEXT: mv a0, a3 +; RV32-NEXT: lui a2, 524288 +; RV32-NEXT: waddau a0, a2, zero +; RV32-NEXT: waddau a4, a2, zero +; RV32-NEXT: mv a0, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhr_w: @@ -2708,24 +3631,8 @@ define <2 x i32> @test_pmulhr_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmulhru_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhru_h: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a1, 16 -; RV32-NEXT: srli a5, a0, 16 -; RV32-NEXT: srli a6, a3, 16 -; RV32-NEXT: srli a7, a2, 16 -; RV32-NEXT: lui t0, 
8 -; RV32-NEXT: lui t1, 8 -; RV32-NEXT: maccu.h00 t1, a5, a7 -; RV32-NEXT: lui a5, 8 -; RV32-NEXT: lui a7, 8 -; RV32-NEXT: maccu.h00 a5, a0, a2 -; RV32-NEXT: maccu.h00 a7, a4, a6 -; RV32-NEXT: maccu.h00 t0, a1, a3 -; RV32-NEXT: srli a1, t0, 16 -; RV32-NEXT: srli a2, a7, 16 -; RV32-NEXT: srli a5, a5, 16 -; RV32-NEXT: srli a0, t1, 16 -; RV32-NEXT: pack a0, a5, a0 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: pmulhru.h a1, a1, a3 +; RV32-NEXT: pmulhru.h a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhru_h: @@ -2744,13 +3651,12 @@ define <4 x i16> @test_pmulhru_h(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pmulhru_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulhru_w: ; RV32: # %bb.0: -; RV32-NEXT: mv a4, a0 +; RV32-NEXT: wmulu a4, a0, a2 ; RV32-NEXT: wmulu a0, a1, a3 -; RV32-NEXT: wmulu a2, a4, a2 -; RV32-NEXT: lui a4, 524288 -; RV32-NEXT: waddau a2, a4, zero -; RV32-NEXT: waddau a0, a4, zero -; RV32-NEXT: mv a0, a3 +; RV32-NEXT: lui a2, 524288 +; RV32-NEXT: waddau a0, a2, zero +; RV32-NEXT: waddau a4, a2, zero +; RV32-NEXT: mv a0, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhru_w: @@ -2770,24 +3676,8 @@ define <2 x i32> @test_pmulhru_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmulhrsu_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhrsu_h: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a1, 16 -; RV32-NEXT: srai a5, a0, 16 -; RV32-NEXT: srli a6, a3, 16 -; RV32-NEXT: srli a7, a2, 16 -; RV32-NEXT: lui t0, 8 -; RV32-NEXT: lui t1, 8 -; RV32-NEXT: maccsu.h00 t1, a5, a7 -; RV32-NEXT: lui a5, 8 -; RV32-NEXT: lui a7, 8 -; RV32-NEXT: maccsu.h00 a5, a0, a2 -; RV32-NEXT: maccsu.h00 a7, a4, a6 -; RV32-NEXT: maccsu.h00 t0, a1, a3 -; RV32-NEXT: srli a1, t0, 16 -; RV32-NEXT: srli a2, a7, 16 -; RV32-NEXT: srli a5, a5, 16 -; RV32-NEXT: srli a0, t1, 16 -; RV32-NEXT: pack a0, a5, a0 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: pmulhrsu.h a1, a1, a3 +; RV32-NEXT: pmulhrsu.h a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhrsu_h: @@ -2806,24 +3696,8 @@ 
define <4 x i16> @test_pmulhrsu_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_pmulhrsu_h_commuted(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmulhrsu_h_commuted: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a1, 16 -; RV32-NEXT: srli a5, a0, 16 -; RV32-NEXT: srai a6, a3, 16 -; RV32-NEXT: srai a7, a2, 16 -; RV32-NEXT: lui t0, 8 -; RV32-NEXT: lui t1, 8 -; RV32-NEXT: maccsu.h00 t1, a7, a5 -; RV32-NEXT: lui a5, 8 -; RV32-NEXT: lui a7, 8 -; RV32-NEXT: maccsu.h00 a5, a2, a0 -; RV32-NEXT: maccsu.h00 a7, a6, a4 -; RV32-NEXT: maccsu.h00 t0, a3, a1 -; RV32-NEXT: srli a1, t0, 16 -; RV32-NEXT: srli a2, a7, 16 -; RV32-NEXT: srli a5, a5, 16 -; RV32-NEXT: srli a0, t1, 16 -; RV32-NEXT: pack a0, a5, a0 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: pmulhrsu.h a1, a3, a1 +; RV32-NEXT: pmulhrsu.h a0, a2, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhrsu_h_commuted: @@ -2842,13 +3716,12 @@ define <4 x i16> @test_pmulhrsu_h_commuted(<4 x i16> %a, <4 x i16> %b) { define <2 x i32> @test_pmulhrsu_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulhrsu_w: ; RV32: # %bb.0: -; RV32-NEXT: mv a4, a0 +; RV32-NEXT: wmulsu a4, a0, a2 ; RV32-NEXT: wmulsu a0, a1, a3 -; RV32-NEXT: wmulsu a2, a4, a2 -; RV32-NEXT: lui a4, 524288 -; RV32-NEXT: waddau a2, a4, zero -; RV32-NEXT: waddau a0, a4, zero -; RV32-NEXT: mv a0, a3 +; RV32-NEXT: lui a2, 524288 +; RV32-NEXT: waddau a0, a2, zero +; RV32-NEXT: waddau a4, a2, zero +; RV32-NEXT: mv a0, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhrsu_w: @@ -2867,13 +3740,12 @@ define <2 x i32> @test_pmulhrsu_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_pmulhrsu_w_commuted(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmulhrsu_w_commuted: ; RV32: # %bb.0: -; RV32-NEXT: mv a4, a0 +; RV32-NEXT: wmulsu a4, a2, a0 ; RV32-NEXT: wmulsu a0, a3, a1 -; RV32-NEXT: wmulsu a2, a2, a4 -; RV32-NEXT: lui a4, 524288 -; RV32-NEXT: waddau a2, a4, zero -; RV32-NEXT: waddau a0, a4, zero -; RV32-NEXT: mv a0, a3 +; RV32-NEXT: lui a2, 524288 +; RV32-NEXT: waddau a0, 
a2, zero +; RV32-NEXT: waddau a4, a2, zero +; RV32-NEXT: mv a0, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmulhrsu_w_commuted: @@ -2893,10 +3765,32 @@ define <2 x i32> @test_pmulhrsu_w_commuted(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pmul_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pmul_h: ; RV32: # %bb.0: -; RV32-NEXT: pwmul.h a4, a0, a2 -; RV32-NEXT: pwmul.h a2, a1, a3 -; RV32-NEXT: pncvt.h a0, a4 -; RV32-NEXT: pncvt.h a1, a2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmul_h: @@ -2913,10 +3807,48 @@ define <4 x i16> @test_pmul_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_pmul_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pmul_b: ; RV32: # %bb.0: -; RV32-NEXT: pwmul.b a4, a0, a2 -; RV32-NEXT: pwmul.b a2, a1, a3 -; RV32-NEXT: pncvt.b a0, a4 -; RV32-NEXT: pncvt.b a1, a2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; 
RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: mul a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmul_b: @@ -2933,8 +3865,8 @@ define <8 x i8> @test_pmul_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_pmul_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pmul_w: ; RV32: # %bb.0: -; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pmul_w: @@ -2951,20 +3883,32 @@ define <2 x i32> @test_pmul_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_psdiv_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_psdiv_h: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a2, 16 -; RV32-NEXT: srai a5, a0, 16 -; RV32-NEXT: sext.h a2, a2 -; RV32-NEXT: sext.h a0, a0 -; RV32-NEXT: div a4, a5, a4 -; RV32-NEXT: srai a5, a3, 16 -; RV32-NEXT: div a0, a0, a2 -; RV32-NEXT: srai a2, a1, 16 -; RV32-NEXT: sext.h a3, a3 -; RV32-NEXT: sext.h a1, a1 -; RV32-NEXT: div a2, a2, a5 -; RV32-NEXT: div a1, a1, a3 -; RV32-NEXT: pack a0, a0, a4 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: div a0, a1, 
a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psdiv_h: @@ -2996,42 +3940,48 @@ define <4 x i16> @test_psdiv_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_psdiv_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_psdiv_b: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a2, 24 -; RV32-NEXT: srai a5, a0, 24 -; RV32-NEXT: slli a6, a2, 16 -; RV32-NEXT: slli a7, a0, 16 -; RV32-NEXT: sext.b t0, a2 -; RV32-NEXT: sext.b t1, a0 -; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: slli a0, a0, 8 -; RV32-NEXT: div a5, a5, a4 -; RV32-NEXT: srai a4, a6, 24 -; RV32-NEXT: srai a6, a7, 24 -; RV32-NEXT: div a4, a6, a4 -; RV32-NEXT: srai a6, a3, 24 -; RV32-NEXT: srai a7, a1, 24 -; RV32-NEXT: div t1, t1, t0 -; RV32-NEXT: srai a2, a2, 24 -; RV32-NEXT: srai a0, a0, 24 -; RV32-NEXT: div t2, a0, a2 -; RV32-NEXT: slli a0, a3, 16 -; RV32-NEXT: slli a2, a1, 16 -; RV32-NEXT: div a7, a7, a6 -; RV32-NEXT: srai a0, a0, 24 -; RV32-NEXT: srai a2, a2, 24 -; RV32-NEXT: div a6, a2, a0 -; RV32-NEXT: sext.b a0, a3 -; RV32-NEXT: sext.b a2, a1 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: div a0, a2, a0 -; RV32-NEXT: srai a3, a3, 24 -; RV32-NEXT: srai a1, a1, 24 -; RV32-NEXT: div a1, a1, a3 -; RV32-NEXT: ppaire.db a2, t1, a4 -; RV32-NEXT: ppaire.db a4, a0, a6 -; RV32-NEXT: pack a0, a2, a3 -; RV32-NEXT: pack a1, a4, a5 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lb a0, 
23(sp) +; RV32-NEXT: lb a1, 15(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lb a0, 22(sp) +; RV32-NEXT: lb a1, 14(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lb a0, 21(sp) +; RV32-NEXT: lb a1, 13(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lb a0, 20(sp) +; RV32-NEXT: lb a1, 12(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lb a0, 19(sp) +; RV32-NEXT: lb a1, 11(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lb a0, 18(sp) +; RV32-NEXT: lb a1, 10(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lb a0, 17(sp) +; RV32-NEXT: lb a1, 9(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lb a0, 16(sp) +; RV32-NEXT: lb a1, 8(sp) +; RV32-NEXT: div a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psdiv_b: @@ -3087,8 +4037,8 @@ define <8 x i8> @test_psdiv_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_psdiv_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psdiv_w: ; RV32: # %bb.0: -; RV32-NEXT: div a0, a0, a2 ; RV32-NEXT: div a1, a1, a3 +; RV32-NEXT: div a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psdiv_w: @@ -3106,20 +4056,32 @@ define <2 x i32> @test_psdiv_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_pudiv_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_pudiv_h: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a2, 16 -; RV32-NEXT: srli a5, a0, 16 -; RV32-NEXT: zext.h a2, a2 -; RV32-NEXT: zext.h a0, a0 -; RV32-NEXT: divu a4, a5, a4 -; RV32-NEXT: srli a5, a3, 16 -; RV32-NEXT: divu a0, a0, a2 -; RV32-NEXT: srli a2, a1, 16 -; RV32-NEXT: zext.h a3, a3 -; RV32-NEXT: zext.h a1, a1 -; RV32-NEXT: divu a2, a2, a5 -; RV32-NEXT: divu a1, a1, a3 -; RV32-NEXT: pack a0, a0, a4 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: addi sp, sp, 
-32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lhu a0, 22(sp) +; RV32-NEXT: lhu a1, 14(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 20(sp) +; RV32-NEXT: lhu a1, 12(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 18(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 16(sp) +; RV32-NEXT: lhu a1, 8(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pudiv_h: @@ -3149,42 +4111,48 @@ define <4 x i16> @test_pudiv_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_pudiv_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_pudiv_b: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a2, 24 -; RV32-NEXT: srli a5, a0, 24 -; RV32-NEXT: slli a6, a2, 16 -; RV32-NEXT: slli a7, a0, 16 -; RV32-NEXT: zext.b t0, a2 -; RV32-NEXT: zext.b t1, a0 -; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: slli a0, a0, 8 -; RV32-NEXT: divu a5, a5, a4 -; RV32-NEXT: srli a4, a6, 24 -; RV32-NEXT: srli a6, a7, 24 -; RV32-NEXT: divu a4, a6, a4 -; RV32-NEXT: srli a6, a3, 24 -; RV32-NEXT: srli a7, a1, 24 -; RV32-NEXT: divu t1, t1, t0 -; RV32-NEXT: srli a2, a2, 24 -; RV32-NEXT: srli a0, a0, 24 -; RV32-NEXT: divu t2, a0, a2 -; RV32-NEXT: slli a0, a3, 16 -; RV32-NEXT: slli a2, a1, 16 -; RV32-NEXT: divu a7, a7, a6 -; RV32-NEXT: srli a0, a0, 24 -; RV32-NEXT: srli a2, a2, 24 -; RV32-NEXT: divu a6, a2, a0 -; RV32-NEXT: zext.b a0, a3 -; RV32-NEXT: zext.b a2, a1 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: divu a0, a2, a0 -; RV32-NEXT: srli a3, a3, 24 -; RV32-NEXT: srli a1, a1, 24 -; RV32-NEXT: divu a1, a1, a3 -; RV32-NEXT: ppaire.db a2, t1, a4 -; RV32-NEXT: ppaire.db a4, a0, a6 -; 
RV32-NEXT: pack a0, a2, a3 -; RV32-NEXT: pack a1, a4, a5 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: divu a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pudiv_b: @@ -3238,8 +4206,8 @@ define <8 x i8> @test_pudiv_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_pudiv_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_pudiv_w: ; RV32: # %bb.0: -; RV32-NEXT: divu a0, a0, a2 ; RV32-NEXT: divu a1, a1, a3 +; RV32-NEXT: divu a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_pudiv_w: @@ -3257,20 +4225,32 @@ define <2 x i32> @test_pudiv_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_psrem_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_psrem_h: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a2, 16 -; RV32-NEXT: srai a5, a0, 16 -; RV32-NEXT: sext.h a2, a2 -; RV32-NEXT: sext.h a0, a0 -; RV32-NEXT: rem a4, a5, a4 -; 
RV32-NEXT: srai a5, a3, 16 -; RV32-NEXT: rem a0, a0, a2 -; RV32-NEXT: srai a2, a1, 16 -; RV32-NEXT: sext.h a3, a3 -; RV32-NEXT: sext.h a1, a1 -; RV32-NEXT: rem a2, a2, a5 -; RV32-NEXT: rem a1, a1, a3 -; RV32-NEXT: pack a0, a0, a4 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrem_h: @@ -3302,42 +4282,48 @@ define <4 x i16> @test_psrem_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_psrem_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_psrem_b: ; RV32: # %bb.0: -; RV32-NEXT: srai a4, a2, 24 -; RV32-NEXT: srai a5, a0, 24 -; RV32-NEXT: slli a6, a2, 16 -; RV32-NEXT: slli a7, a0, 16 -; RV32-NEXT: sext.b t0, a2 -; RV32-NEXT: sext.b t1, a0 -; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: slli a0, a0, 8 -; RV32-NEXT: rem a5, a5, a4 -; RV32-NEXT: srai a4, a6, 24 -; RV32-NEXT: srai a6, a7, 24 -; RV32-NEXT: rem a4, a6, a4 -; RV32-NEXT: srai a6, a3, 24 -; RV32-NEXT: srai a7, a1, 24 -; RV32-NEXT: rem t1, t1, t0 -; RV32-NEXT: srai a2, a2, 24 -; RV32-NEXT: srai a0, a0, 24 -; RV32-NEXT: rem t2, a0, a2 -; RV32-NEXT: slli a0, a3, 16 -; RV32-NEXT: slli a2, a1, 16 -; RV32-NEXT: rem a7, a7, a6 -; RV32-NEXT: srai a0, a0, 24 -; RV32-NEXT: srai a2, a2, 24 -; RV32-NEXT: rem a6, a2, a0 -; RV32-NEXT: sext.b a0, a3 -; RV32-NEXT: 
sext.b a2, a1 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: rem a0, a2, a0 -; RV32-NEXT: srai a3, a3, 24 -; RV32-NEXT: srai a1, a1, 24 -; RV32-NEXT: rem a1, a1, a3 -; RV32-NEXT: ppaire.db a2, t1, a4 -; RV32-NEXT: ppaire.db a4, a0, a6 -; RV32-NEXT: pack a0, a2, a3 -; RV32-NEXT: pack a1, a4, a5 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lb a0, 23(sp) +; RV32-NEXT: lb a1, 15(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lb a0, 22(sp) +; RV32-NEXT: lb a1, 14(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lb a0, 21(sp) +; RV32-NEXT: lb a1, 13(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lb a0, 20(sp) +; RV32-NEXT: lb a1, 12(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lb a0, 19(sp) +; RV32-NEXT: lb a1, 11(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lb a0, 18(sp) +; RV32-NEXT: lb a1, 10(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lb a0, 17(sp) +; RV32-NEXT: lb a1, 9(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lb a0, 16(sp) +; RV32-NEXT: lb a1, 8(sp) +; RV32-NEXT: rem a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrem_b: @@ -3393,8 +4379,8 @@ define <8 x i8> @test_psrem_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_psrem_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_psrem_w: ; RV32: # %bb.0: -; RV32-NEXT: rem a0, a0, a2 ; RV32-NEXT: rem a1, a1, a3 +; RV32-NEXT: rem a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_psrem_w: @@ -3412,20 +4398,32 @@ define <2 x i32> @test_psrem_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> 
@test_purem_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_purem_h: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a2, 16 -; RV32-NEXT: srli a5, a0, 16 -; RV32-NEXT: zext.h a2, a2 -; RV32-NEXT: zext.h a0, a0 -; RV32-NEXT: remu a4, a5, a4 -; RV32-NEXT: srli a5, a3, 16 -; RV32-NEXT: remu a0, a0, a2 -; RV32-NEXT: srli a2, a1, 16 -; RV32-NEXT: zext.h a3, a3 -; RV32-NEXT: zext.h a1, a1 -; RV32-NEXT: remu a2, a2, a5 -; RV32-NEXT: remu a1, a1, a3 -; RV32-NEXT: pack a0, a0, a4 -; RV32-NEXT: pack a1, a1, a2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lhu a0, 22(sp) +; RV32-NEXT: lhu a1, 14(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 20(sp) +; RV32-NEXT: lhu a1, 12(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 18(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 16(sp) +; RV32-NEXT: lhu a1, 8(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_purem_h: @@ -3455,42 +4453,48 @@ define <4 x i16> @test_purem_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_purem_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_purem_b: ; RV32: # %bb.0: -; RV32-NEXT: srli a4, a2, 24 -; RV32-NEXT: srli a5, a0, 24 -; RV32-NEXT: slli a6, a2, 16 -; RV32-NEXT: slli a7, a0, 16 -; RV32-NEXT: zext.b t0, a2 -; RV32-NEXT: zext.b t1, a0 -; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: slli a0, a0, 8 -; RV32-NEXT: remu a5, a5, a4 -; RV32-NEXT: srli a4, a6, 24 -; RV32-NEXT: srli a6, a7, 24 -; RV32-NEXT: remu a4, a6, a4 -; RV32-NEXT: srli a6, a3, 24 -; RV32-NEXT: srli a7, a1, 24 -; RV32-NEXT: remu t1, t1, t0 -; RV32-NEXT: srli a2, a2, 24 -; RV32-NEXT: srli a0, 
a0, 24 -; RV32-NEXT: remu t2, a0, a2 -; RV32-NEXT: slli a0, a3, 16 -; RV32-NEXT: slli a2, a1, 16 -; RV32-NEXT: remu a7, a7, a6 -; RV32-NEXT: srli a0, a0, 24 -; RV32-NEXT: srli a2, a2, 24 -; RV32-NEXT: remu a6, a2, a0 -; RV32-NEXT: zext.b a0, a3 -; RV32-NEXT: zext.b a2, a1 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: remu a0, a2, a0 -; RV32-NEXT: srli a3, a3, 24 -; RV32-NEXT: srli a1, a1, 24 -; RV32-NEXT: remu a1, a1, a3 -; RV32-NEXT: ppaire.db a2, t1, a4 -; RV32-NEXT: ppaire.db a4, a0, a6 -; RV32-NEXT: pack a0, a2, a3 -; RV32-NEXT: pack a1, a4, a5 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: remu a0, a1, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_purem_b: @@ -3544,8 +4548,8 @@ define <8 x i8> @test_purem_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_purem_w(<2 x i32> %a, <2 x i32> %b) { ; 
RV32-LABEL: test_purem_w: ; RV32: # %bb.0: -; RV32-NEXT: remu a0, a0, a2 ; RV32-NEXT: remu a1, a1, a3 +; RV32-NEXT: remu a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_purem_w: @@ -3564,8 +4568,40 @@ define <2 x i32> @test_purem_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_eq_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_eq_h: ; RV32: # %bb.0: -; RV32-NEXT: pmseq.h a0, a0, a2 -; RV32-NEXT: pmseq.h a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lhu a0, 22(sp) +; RV32-NEXT: lhu a1, 14(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 20(sp) +; RV32-NEXT: lhu a1, 12(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 18(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 16(sp) +; RV32-NEXT: lhu a1, 8(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_eq_h: @@ -3580,10 +4616,40 @@ define <4 x i16> @test_eq_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_ne_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_ne_h: ; RV32: # %bb.0: -; RV32-NEXT: pmseq.h a0, a0, a2 -; RV32-NEXT: pmseq.h a1, a1, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lhu a0, 22(sp) +; RV32-NEXT: lhu a1, 14(sp) +; 
RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 20(sp) +; RV32-NEXT: lhu a1, 12(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 18(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 16(sp) +; RV32-NEXT: lhu a1, 8(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ne_h: @@ -3599,8 +4665,36 @@ define <4 x i16> @test_ne_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_slt_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_slt_h: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.h a0, a0, a2 -; RV32-NEXT: pmslt.h a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_slt_h: @@ -3615,10 +4709,36 @@ define <4 x i16> @test_slt_h(<4 x i16> 
%a, <4 x i16> %b) { define <4 x i16> @test_sle_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_sle_h: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.h a0, a2, a0 -; RV32-NEXT: pmslt.h a1, a3, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lh a0, 14(sp) +; RV32-NEXT: lh a1, 22(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 12(sp) +; RV32-NEXT: lh a1, 20(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: lh a1, 18(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: lh a1, 16(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sle_h: @@ -3634,8 +4754,36 @@ define <4 x i16> @test_sle_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_sgt_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_sgt_h: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.h a0, a2, a0 -; RV32-NEXT: pmslt.h a1, a3, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lh a0, 14(sp) +; RV32-NEXT: lh a1, 22(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 12(sp) +; RV32-NEXT: lh a1, 20(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 10(sp) +; RV32-NEXT: lh a1, 18(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 26(sp) 
+; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: lh a1, 16(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sgt_h: @@ -3650,10 +4798,36 @@ define <4 x i16> @test_sgt_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_sge_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_sge_h: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.h a0, a0, a2 -; RV32-NEXT: pmslt.h a1, a1, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lh a0, 22(sp) +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lh a0, 20(sp) +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lh a0, 18(sp) +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: lh a1, 8(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sge_h: @@ -3669,8 +4843,36 @@ define <4 x i16> @test_sge_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_ult_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_ult_h: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.h a0, a0, a2 -; RV32-NEXT: pmsltu.h a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lhu a0, 22(sp) +; RV32-NEXT: lhu 
a1, 14(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 20(sp) +; RV32-NEXT: lhu a1, 12(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 18(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 16(sp) +; RV32-NEXT: lhu a1, 8(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ult_h: @@ -3685,10 +4887,36 @@ define <4 x i16> @test_ult_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_ule_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_ule_h: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.h a0, a2, a0 -; RV32-NEXT: pmsltu.h a1, a3, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lhu a0, 14(sp) +; RV32-NEXT: lhu a1, 22(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 12(sp) +; RV32-NEXT: lhu a1, 20(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 10(sp) +; RV32-NEXT: lhu a1, 18(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 8(sp) +; RV32-NEXT: lhu a1, 16(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ule_h: @@ -3704,8 +4932,36 @@ define <4 x i16> @test_ule_h(<4 x i16> %a, <4 x i16> %b) { 
define <4 x i16> @test_ugt_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_ugt_h: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.h a0, a2, a0 -; RV32-NEXT: pmsltu.h a1, a3, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lhu a0, 14(sp) +; RV32-NEXT: lhu a1, 22(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 12(sp) +; RV32-NEXT: lhu a1, 20(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 10(sp) +; RV32-NEXT: lhu a1, 18(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 26(sp) +; RV32-NEXT: lhu a0, 8(sp) +; RV32-NEXT: lhu a1, 16(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ugt_h: @@ -3720,10 +4976,36 @@ define <4 x i16> @test_ugt_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_uge_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_uge_h: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.h a0, a0, a2 -; RV32-NEXT: pmsltu.h a1, a1, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lhu a0, 22(sp) +; RV32-NEXT: lhu a1, 14(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 30(sp) +; RV32-NEXT: lhu a0, 20(sp) +; RV32-NEXT: lhu a1, 12(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 28(sp) +; RV32-NEXT: lhu a0, 18(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 26(sp) 
+; RV32-NEXT: lhu a0, 16(sp) +; RV32-NEXT: lhu a1, 8(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_uge_h: @@ -3740,8 +5022,64 @@ define <4 x i16> @test_uge_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_eq_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_eq_b: ; RV32: # %bb.0: -; RV32-NEXT: pmseq.b a0, a0, a2 -; RV32-NEXT: pmseq.b a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; 
RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_eq_b: @@ -3756,10 +5094,64 @@ define <8 x i8> @test_eq_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_ne_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_ne_b: ; RV32: # %bb.0: -; RV32-NEXT: pmseq.b a0, a0, a2 -; RV32-NEXT: pmseq.b a1, a1, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) 
+; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ne_b: @@ -3775,8 +5167,56 @@ define <8 x i8> @test_ne_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_slt_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_slt_b: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.b a0, a0, a2 -; RV32-NEXT: pmslt.b a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lb a0, 23(sp) +; RV32-NEXT: lb a1, 15(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lb a0, 22(sp) +; RV32-NEXT: lb a1, 14(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lb a0, 21(sp) +; RV32-NEXT: lb a1, 13(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lb a0, 20(sp) +; RV32-NEXT: lb a1, 12(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lb a0, 19(sp) +; RV32-NEXT: lb a1, 11(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lb a0, 18(sp) +; RV32-NEXT: lb a1, 10(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lb a0, 17(sp) +; RV32-NEXT: lb a1, 9(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lb a0, 16(sp) +; RV32-NEXT: lb a1, 8(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_slt_b: @@ -3791,10 +5231,56 @@ 
define <8 x i8> @test_slt_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_sle_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_sle_b: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.b a0, a2, a0 -; RV32-NEXT: pmslt.b a1, a3, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lb a0, 15(sp) +; RV32-NEXT: lb a1, 23(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lb a0, 14(sp) +; RV32-NEXT: lb a1, 22(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lb a0, 13(sp) +; RV32-NEXT: lb a1, 21(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lb a0, 12(sp) +; RV32-NEXT: lb a1, 20(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lb a0, 11(sp) +; RV32-NEXT: lb a1, 19(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lb a0, 10(sp) +; RV32-NEXT: lb a1, 18(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lb a0, 9(sp) +; RV32-NEXT: lb a1, 17(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lb a0, 8(sp) +; RV32-NEXT: lb a1, 16(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sle_b: @@ -3810,8 +5296,56 @@ define <8 x i8> @test_sle_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_sgt_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_sgt_b: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.b a0, a2, a0 -; RV32-NEXT: pmslt.b a1, a3, 
a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lb a0, 15(sp) +; RV32-NEXT: lb a1, 23(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lb a0, 14(sp) +; RV32-NEXT: lb a1, 22(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lb a0, 13(sp) +; RV32-NEXT: lb a1, 21(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lb a0, 12(sp) +; RV32-NEXT: lb a1, 20(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lb a0, 11(sp) +; RV32-NEXT: lb a1, 19(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lb a0, 10(sp) +; RV32-NEXT: lb a1, 18(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lb a0, 9(sp) +; RV32-NEXT: lb a1, 17(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lb a0, 8(sp) +; RV32-NEXT: lb a1, 16(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sgt_b: @@ -3826,10 +5360,56 @@ define <8 x i8> @test_sgt_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_sge_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_sge_b: ; RV32: # %bb.0: -; RV32-NEXT: pmslt.b a0, a0, a2 -; RV32-NEXT: pmslt.b a1, a1, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lb a0, 23(sp) +; RV32-NEXT: lb a1, 15(sp) +; RV32-NEXT: slt 
a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lb a0, 22(sp) +; RV32-NEXT: lb a1, 14(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lb a0, 21(sp) +; RV32-NEXT: lb a1, 13(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lb a0, 20(sp) +; RV32-NEXT: lb a1, 12(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lb a0, 19(sp) +; RV32-NEXT: lb a1, 11(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lb a0, 18(sp) +; RV32-NEXT: lb a1, 10(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lb a0, 17(sp) +; RV32-NEXT: lb a1, 9(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lb a0, 16(sp) +; RV32-NEXT: lb a1, 8(sp) +; RV32-NEXT: slt a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sge_b: @@ -3845,8 +5425,56 @@ define <8 x i8> @test_sge_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_ult_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_ult_b: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.b a0, a0, a2 -; RV32-NEXT: pmsltu.b a1, a1, a3 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) 
+; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ult_b: @@ -3861,10 +5489,56 @@ define <8 x i8> @test_ult_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_ule_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_ule_b: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.b a0, a2, a0 -; RV32-NEXT: pmsltu.b a1, a3, a1 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lbu a0, 15(sp) +; RV32-NEXT: lbu a1, 23(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 14(sp) +; RV32-NEXT: lbu a1, 22(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 13(sp) +; RV32-NEXT: lbu a1, 21(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 12(sp) +; RV32-NEXT: lbu a1, 20(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: 
sb a0, 28(sp) +; RV32-NEXT: lbu a0, 11(sp) +; RV32-NEXT: lbu a1, 19(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 10(sp) +; RV32-NEXT: lbu a1, 18(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 9(sp) +; RV32-NEXT: lbu a1, 17(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 8(sp) +; RV32-NEXT: lbu a1, 16(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ule_b: @@ -3880,8 +5554,56 @@ define <8 x i8> @test_ule_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_ugt_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_ugt_b: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.b a0, a2, a0 -; RV32-NEXT: pmsltu.b a1, a3, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: lbu a0, 15(sp) +; RV32-NEXT: lbu a1, 23(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 14(sp) +; RV32-NEXT: lbu a1, 22(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 13(sp) +; RV32-NEXT: lbu a1, 21(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 12(sp) +; RV32-NEXT: lbu a1, 20(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 11(sp) +; RV32-NEXT: lbu a1, 19(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 10(sp) +; RV32-NEXT: lbu a1, 18(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg 
a0, a0 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 9(sp) +; RV32-NEXT: lbu a1, 17(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu a0, 8(sp) +; RV32-NEXT: lbu a1, 16(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ugt_b: @@ -3896,10 +5618,56 @@ define <8 x i8> @test_ugt_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_uge_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_uge_b: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.b a0, a0, a2 -; RV32-NEXT: pmsltu.b a1, a1, a3 -; RV32-NEXT: not a0, a0 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lbu a0, 23(sp) +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 31(sp) +; RV32-NEXT: lbu a0, 22(sp) +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 30(sp) +; RV32-NEXT: lbu a0, 21(sp) +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 29(sp) +; RV32-NEXT: lbu a0, 20(sp) +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 28(sp) +; RV32-NEXT: lbu a0, 19(sp) +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 27(sp) +; RV32-NEXT: lbu a0, 18(sp) +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 26(sp) +; RV32-NEXT: lbu a0, 17(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 25(sp) +; RV32-NEXT: lbu 
a0, 16(sp) +; RV32-NEXT: lbu a1, 8(sp) +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_uge_b: @@ -3916,12 +5684,12 @@ define <8 x i8> @test_uge_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_eq_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_eq_w: ; RV32: # %bb.0: -; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: snez a0, a0 +; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: snez a1, a1 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: snez a0, a0 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: ret ; ; RV64-LABEL: test_eq_w: @@ -3936,12 +5704,12 @@ define <2 x i32> @test_eq_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_ne_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_ne_w: ; RV32: # %bb.0: -; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ne_w: @@ -3957,10 +5725,10 @@ define <2 x i32> @test_ne_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_slt_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_slt_w: ; RV32: # %bb.0: -; RV32-NEXT: slt a0, a0, a2 ; RV32-NEXT: slt a1, a1, a3 -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: slt a0, a0, a2 ; RV32-NEXT: neg a1, a1 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_slt_w: @@ -3975,10 +5743,10 @@ define <2 x i32> @test_slt_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_sle_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_sle_w: ; RV32: # %bb.0: -; RV32-NEXT: slt a0, a2, a0 ; RV32-NEXT: slt a1, a3, a1 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: slt a0, a2, a0 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: addi a0, a0, -1 ; 
RV32-NEXT: ret ; ; RV64-LABEL: test_sle_w: @@ -3994,10 +5762,10 @@ define <2 x i32> @test_sle_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_sgt_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_sgt_w: ; RV32: # %bb.0: -; RV32-NEXT: slt a0, a2, a0 ; RV32-NEXT: slt a1, a3, a1 -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: slt a0, a2, a0 ; RV32-NEXT: neg a1, a1 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sgt_w: @@ -4012,10 +5780,10 @@ define <2 x i32> @test_sgt_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_sge_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_sge_w: ; RV32: # %bb.0: -; RV32-NEXT: slt a0, a0, a2 ; RV32-NEXT: slt a1, a1, a3 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: slt a0, a0, a2 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: ret ; ; RV64-LABEL: test_sge_w: @@ -4031,10 +5799,10 @@ define <2 x i32> @test_sge_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_ult_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_ult_w: ; RV32: # %bb.0: -; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sltu a1, a1, a3 -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: neg a1, a1 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ult_w: @@ -4049,10 +5817,10 @@ define <2 x i32> @test_ult_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_ule_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_ule_w: ; RV32: # %bb.0: -; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: sltu a1, a3, a1 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: ret ; ; RV64-LABEL: test_ule_w: @@ -4068,10 +5836,10 @@ define <2 x i32> @test_ule_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_ugt_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_ugt_w: ; RV32: # %bb.0: -; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: sltu a1, a3, a1 -; RV32-NEXT: neg a0, a0 +; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: neg a1, a1 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: 
ret ; ; RV64-LABEL: test_ugt_w: @@ -4086,10 +5854,10 @@ define <2 x i32> @test_ugt_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_uge_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_uge_w: ; RV32: # %bb.0: -; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sltu a1, a1, a3 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: ret ; ; RV64-LABEL: test_uge_w: @@ -4106,8 +5874,7 @@ define <2 x i32> @test_uge_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_smin_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_smin_h: ; RV32: # %bb.0: -; RV32-NEXT: pmin.h a0, a0, a2 -; RV32-NEXT: pmin.h a1, a1, a3 +; RV32-NEXT: pmin.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_smin_h: @@ -4121,8 +5888,7 @@ define <4 x i16> @test_smin_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_umin_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_umin_h: ; RV32: # %bb.0: -; RV32-NEXT: pminu.h a0, a0, a2 -; RV32-NEXT: pminu.h a1, a1, a3 +; RV32-NEXT: pminu.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_umin_h: @@ -4136,8 +5902,7 @@ define <4 x i16> @test_umin_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_smin_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_smin_b: ; RV32: # %bb.0: -; RV32-NEXT: pmin.b a0, a0, a2 -; RV32-NEXT: pmin.b a1, a1, a3 +; RV32-NEXT: pmin.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_smin_b: @@ -4151,8 +5916,7 @@ define <8 x i8> @test_smin_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_umin_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_umin_b: ; RV32: # %bb.0: -; RV32-NEXT: pminu.b a0, a0, a2 -; RV32-NEXT: pminu.b a1, a1, a3 +; RV32-NEXT: pminu.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_umin_b: @@ -4166,8 +5930,7 @@ define <8 x i8> @test_umin_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_smin_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_smin_w: ; RV32: # %bb.0: -; RV32-NEXT: min a0, a0, a2 -; RV32-NEXT: min a1, a1, a3 +; 
RV32-NEXT: pmin.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_smin_w: @@ -4181,8 +5944,7 @@ define <2 x i32> @test_smin_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_umin_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_umin_w: ; RV32: # %bb.0: -; RV32-NEXT: minu a0, a0, a2 -; RV32-NEXT: minu a1, a1, a3 +; RV32-NEXT: pminu.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_umin_w: @@ -4196,8 +5958,7 @@ define <2 x i32> @test_umin_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_smax_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_smax_h: ; RV32: # %bb.0: -; RV32-NEXT: pmax.h a0, a0, a2 -; RV32-NEXT: pmax.h a1, a1, a3 +; RV32-NEXT: pmax.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_smax_h: @@ -4211,8 +5972,7 @@ define <4 x i16> @test_smax_h(<4 x i16> %a, <4 x i16> %b) { define <4 x i16> @test_umax_h(<4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_umax_h: ; RV32: # %bb.0: -; RV32-NEXT: pmaxu.h a0, a0, a2 -; RV32-NEXT: pmaxu.h a1, a1, a3 +; RV32-NEXT: pmaxu.dh a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_umax_h: @@ -4226,8 +5986,7 @@ define <4 x i16> @test_umax_h(<4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_smax_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_smax_b: ; RV32: # %bb.0: -; RV32-NEXT: pmax.b a0, a0, a2 -; RV32-NEXT: pmax.b a1, a1, a3 +; RV32-NEXT: pmax.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_smax_b: @@ -4241,8 +6000,7 @@ define <8 x i8> @test_smax_b(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @test_umax_b(<8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_umax_b: ; RV32: # %bb.0: -; RV32-NEXT: pmaxu.b a0, a0, a2 -; RV32-NEXT: pmaxu.b a1, a1, a3 +; RV32-NEXT: pmaxu.db a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_umax_b: @@ -4256,8 +6014,7 @@ define <8 x i8> @test_umax_b(<8 x i8> %a, <8 x i8> %b) { define <2 x i32> @test_smax_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_smax_w: ; RV32: # %bb.0: -; RV32-NEXT: max a0, a0, a2 -; RV32-NEXT: max a1, a1, a3 +; RV32-NEXT: pmax.dw a0, a0, a2 ; 
RV32-NEXT: ret ; ; RV64-LABEL: test_smax_w: @@ -4271,8 +6028,7 @@ define <2 x i32> @test_smax_w(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @test_umax_w(<2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_umax_w: ; RV32: # %bb.0: -; RV32-NEXT: maxu a0, a0, a2 -; RV32-NEXT: maxu a1, a1, a3 +; RV32-NEXT: pmaxu.dw a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_umax_w: @@ -4287,14 +6043,49 @@ define <2 x i32> @test_umax_w(<2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_select_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { ; RV32-LABEL: test_select_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: andi a5, a0, 1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bnez a5, .LBB205_2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: sw a4, 20(sp) +; RV32-NEXT: sw a2, 12(sp) +; RV32-NEXT: sw a3, 16(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: bnez a0, .LBB205_5 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: mv a2, a4 +; RV32-NEXT: lh a1, 22(sp) +; RV32-NEXT: sh a1, 30(sp) +; RV32-NEXT: beqz a0, .LBB205_6 ; RV32-NEXT: .LBB205_2: -; RV32-NEXT: mv a1, a2 +; RV32-NEXT: lh a1, 12(sp) +; RV32-NEXT: sh a1, 28(sp) +; RV32-NEXT: beqz a0, .LBB205_7 +; RV32-NEXT: .LBB205_3: +; RV32-NEXT: lh a1, 10(sp) +; RV32-NEXT: sh a1, 26(sp) +; RV32-NEXT: beqz a0, .LBB205_8 +; RV32-NEXT: .LBB205_4: +; RV32-NEXT: lh a0, 8(sp) +; RV32-NEXT: j .LBB205_9 +; RV32-NEXT: .LBB205_5: +; RV32-NEXT: lh a1, 14(sp) +; RV32-NEXT: sh a1, 30(sp) +; RV32-NEXT: bnez a0, .LBB205_2 +; RV32-NEXT: .LBB205_6: +; RV32-NEXT: lh a1, 20(sp) +; RV32-NEXT: sh a1, 28(sp) +; RV32-NEXT: bnez a0, .LBB205_3 +; RV32-NEXT: .LBB205_7: +; RV32-NEXT: lh a1, 18(sp) +; RV32-NEXT: sh a1, 26(sp) +; RV32-NEXT: bnez a0, .LBB205_4 +; RV32-NEXT: .LBB205_8: +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: .LBB205_9: +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: 
test_select_v4i16: @@ -4313,14 +6104,81 @@ define <4 x i16> @test_select_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { define <8 x i8> @test_select_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { ; RV32-LABEL: test_select_v8i8: ; RV32: # %bb.0: -; RV32-NEXT: andi a5, a0, 1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bnez a5, .LBB206_2 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: sw a4, 20(sp) +; RV32-NEXT: sw a2, 12(sp) +; RV32-NEXT: sw a3, 16(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: bnez a0, .LBB206_9 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: mv a2, a4 +; RV32-NEXT: lbu a1, 23(sp) +; RV32-NEXT: sb a1, 31(sp) +; RV32-NEXT: beqz a0, .LBB206_10 ; RV32-NEXT: .LBB206_2: -; RV32-NEXT: mv a1, a2 +; RV32-NEXT: lbu a1, 14(sp) +; RV32-NEXT: sb a1, 30(sp) +; RV32-NEXT: beqz a0, .LBB206_11 +; RV32-NEXT: .LBB206_3: +; RV32-NEXT: lbu a1, 13(sp) +; RV32-NEXT: sb a1, 29(sp) +; RV32-NEXT: beqz a0, .LBB206_12 +; RV32-NEXT: .LBB206_4: +; RV32-NEXT: lbu a1, 12(sp) +; RV32-NEXT: sb a1, 28(sp) +; RV32-NEXT: beqz a0, .LBB206_13 +; RV32-NEXT: .LBB206_5: +; RV32-NEXT: lbu a1, 11(sp) +; RV32-NEXT: sb a1, 27(sp) +; RV32-NEXT: beqz a0, .LBB206_14 +; RV32-NEXT: .LBB206_6: +; RV32-NEXT: lbu a1, 10(sp) +; RV32-NEXT: sb a1, 26(sp) +; RV32-NEXT: beqz a0, .LBB206_15 +; RV32-NEXT: .LBB206_7: +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: sb a1, 25(sp) +; RV32-NEXT: beqz a0, .LBB206_16 +; RV32-NEXT: .LBB206_8: +; RV32-NEXT: lbu a0, 8(sp) +; RV32-NEXT: j .LBB206_17 +; RV32-NEXT: .LBB206_9: +; RV32-NEXT: lbu a1, 15(sp) +; RV32-NEXT: sb a1, 31(sp) +; RV32-NEXT: bnez a0, .LBB206_2 +; RV32-NEXT: .LBB206_10: +; RV32-NEXT: lbu a1, 22(sp) +; RV32-NEXT: sb a1, 30(sp) +; RV32-NEXT: bnez a0, .LBB206_3 +; RV32-NEXT: .LBB206_11: +; RV32-NEXT: lbu a1, 21(sp) +; RV32-NEXT: sb a1, 29(sp) +; RV32-NEXT: bnez a0, .LBB206_4 +; RV32-NEXT: .LBB206_12: +; RV32-NEXT: lbu a1, 20(sp) +; RV32-NEXT: sb a1, 28(sp) +; RV32-NEXT: bnez a0, .LBB206_5 +; 
RV32-NEXT: .LBB206_13: +; RV32-NEXT: lbu a1, 19(sp) +; RV32-NEXT: sb a1, 27(sp) +; RV32-NEXT: bnez a0, .LBB206_6 +; RV32-NEXT: .LBB206_14: +; RV32-NEXT: lbu a1, 18(sp) +; RV32-NEXT: sb a1, 26(sp) +; RV32-NEXT: bnez a0, .LBB206_7 +; RV32-NEXT: .LBB206_15: +; RV32-NEXT: lbu a1, 17(sp) +; RV32-NEXT: sb a1, 25(sp) +; RV32-NEXT: bnez a0, .LBB206_8 +; RV32-NEXT: .LBB206_16: +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: .LBB206_17: +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_select_v8i8: @@ -4340,12 +6198,13 @@ define <2 x i32> @test_select_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { ; RV32-LABEL: test_select_v2i32: ; RV32: # %bb.0: ; RV32-NEXT: andi a5, a0, 1 -; RV32-NEXT: mv a0, a1 ; RV32-NEXT: bnez a5, .LBB207_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a3 -; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a1, a4 +; RV32-NEXT: ret ; RV32-NEXT: .LBB207_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: ret ; @@ -4366,10 +6225,62 @@ define <2 x i32> @test_select_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { define <4 x i16> @test_vselect_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { ; RV32-LABEL: test_vselect_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: pmseq.h a1, a1, a3 -; RV32-NEXT: pmseq.h a0, a0, a2 -; RV32-NEXT: merge a0, a2, a4 -; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lhu a0, 8(sp) +; RV32-NEXT: lhu a1, 10(sp) +; RV32-NEXT: lhu a2, 12(sp) +; RV32-NEXT: lhu a3, 14(sp) +; RV32-NEXT: lhu a6, 0(sp) +; RV32-NEXT: lhu a7, 2(sp) +; RV32-NEXT: lhu t0, 4(sp) +; RV32-NEXT: lhu t1, 6(sp) +; RV32-NEXT: xor a6, a6, a0 +; RV32-NEXT: xor a7, a7, a1 +; RV32-NEXT: xor t0, t0, a2 +; RV32-NEXT: xor t1, t1, a3 +; RV32-NEXT: snez a6, a6 +; RV32-NEXT: snez a7, a7 +; 
RV32-NEXT: snez t0, t0 +; RV32-NEXT: snez t1, t1 +; RV32-NEXT: addi a7, a7, -1 +; RV32-NEXT: addi t0, t0, -1 +; RV32-NEXT: addi t1, t1, -1 +; RV32-NEXT: zext.h t0, t0 +; RV32-NEXT: zext.h t1, t1 +; RV32-NEXT: sw a4, 16(sp) +; RV32-NEXT: sw a5, 20(sp) +; RV32-NEXT: beqz t1, .LBB208_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lh a3, 22(sp) +; RV32-NEXT: .LBB208_2: +; RV32-NEXT: addi a6, a6, -1 +; RV32-NEXT: zext.h a4, a7 +; RV32-NEXT: sh a3, 30(sp) +; RV32-NEXT: beqz t0, .LBB208_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: lh a2, 20(sp) +; RV32-NEXT: .LBB208_4: +; RV32-NEXT: zext.h a3, a6 +; RV32-NEXT: sh a2, 28(sp) +; RV32-NEXT: beqz a4, .LBB208_6 +; RV32-NEXT: # %bb.5: +; RV32-NEXT: lh a1, 18(sp) +; RV32-NEXT: .LBB208_6: +; RV32-NEXT: sh a1, 26(sp) +; RV32-NEXT: beqz a3, .LBB208_8 +; RV32-NEXT: # %bb.7: +; RV32-NEXT: lh a0, 16(sp) +; RV32-NEXT: .LBB208_8: +; RV32-NEXT: sh a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_vselect_v4i16: @@ -4385,10 +6296,110 @@ define <4 x i16> @test_vselect_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { define <8 x i8> @test_vselect_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { ; RV32-LABEL: test_vselect_v8i8: ; RV32: # %bb.0: -; RV32-NEXT: pmsltu.b a1, a1, a3 -; RV32-NEXT: pmsltu.b a0, a0, a2 -; RV32-NEXT: merge a0, a2, a4 -; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a0, 8(sp) +; RV32-NEXT: lbu a1, 9(sp) +; RV32-NEXT: lbu a2, 10(sp) +; RV32-NEXT: lbu a3, 11(sp) +; RV32-NEXT: lbu a6, 0(sp) +; 
RV32-NEXT: lbu t0, 1(sp) +; RV32-NEXT: lbu t2, 2(sp) +; RV32-NEXT: lbu t4, 3(sp) +; RV32-NEXT: lbu a7, 12(sp) +; RV32-NEXT: lbu t1, 13(sp) +; RV32-NEXT: lbu t3, 14(sp) +; RV32-NEXT: lbu t5, 15(sp) +; RV32-NEXT: lbu t6, 4(sp) +; RV32-NEXT: lbu s0, 5(sp) +; RV32-NEXT: lbu s1, 6(sp) +; RV32-NEXT: lbu s2, 7(sp) +; RV32-NEXT: sltu t6, t6, a7 +; RV32-NEXT: sltu s0, s0, t1 +; RV32-NEXT: sltu s1, s1, t3 +; RV32-NEXT: sltu s2, s2, t5 +; RV32-NEXT: neg s0, s0 +; RV32-NEXT: neg s1, s1 +; RV32-NEXT: neg s2, s2 +; RV32-NEXT: zext.b s1, s1 +; RV32-NEXT: zext.b s2, s2 +; RV32-NEXT: sw a4, 16(sp) +; RV32-NEXT: sw a5, 20(sp) +; RV32-NEXT: beqz s2, .LBB209_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lbu t5, 23(sp) +; RV32-NEXT: .LBB209_2: +; RV32-NEXT: sltu a5, t4, a3 +; RV32-NEXT: neg t6, t6 +; RV32-NEXT: zext.b a4, s0 +; RV32-NEXT: sb t5, 31(sp) +; RV32-NEXT: beqz s1, .LBB209_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: lbu t3, 22(sp) +; RV32-NEXT: .LBB209_4: +; RV32-NEXT: sltu t2, t2, a2 +; RV32-NEXT: neg t4, a5 +; RV32-NEXT: zext.b a5, t6 +; RV32-NEXT: sb t3, 30(sp) +; RV32-NEXT: beqz a4, .LBB209_6 +; RV32-NEXT: # %bb.5: +; RV32-NEXT: lbu t1, 21(sp) +; RV32-NEXT: .LBB209_6: +; RV32-NEXT: sltu t0, t0, a1 +; RV32-NEXT: neg t2, t2 +; RV32-NEXT: zext.b a4, t4 +; RV32-NEXT: sb t1, 29(sp) +; RV32-NEXT: beqz a5, .LBB209_8 +; RV32-NEXT: # %bb.7: +; RV32-NEXT: lbu a7, 20(sp) +; RV32-NEXT: .LBB209_8: +; RV32-NEXT: sltu a6, a6, a0 +; RV32-NEXT: neg t0, t0 +; RV32-NEXT: zext.b a5, t2 +; RV32-NEXT: sb a7, 28(sp) +; RV32-NEXT: beqz a4, .LBB209_10 +; RV32-NEXT: # %bb.9: +; RV32-NEXT: lbu a3, 19(sp) +; RV32-NEXT: .LBB209_10: +; RV32-NEXT: neg a6, a6 +; RV32-NEXT: zext.b a4, t0 +; RV32-NEXT: sb a3, 27(sp) +; RV32-NEXT: beqz a5, .LBB209_12 +; RV32-NEXT: # %bb.11: +; RV32-NEXT: lbu a2, 18(sp) +; RV32-NEXT: .LBB209_12: +; RV32-NEXT: zext.b a3, a6 +; RV32-NEXT: sb a2, 26(sp) +; RV32-NEXT: beqz a4, .LBB209_14 +; RV32-NEXT: # %bb.13: +; RV32-NEXT: lbu a1, 17(sp) +; RV32-NEXT: .LBB209_14: +; RV32-NEXT: sb a1, 
25(sp) +; RV32-NEXT: beqz a3, .LBB209_16 +; RV32-NEXT: # %bb.15: +; RV32-NEXT: lbu a0, 16(sp) +; RV32-NEXT: .LBB209_16: +; RV32-NEXT: sb a0, 24(sp) +; RV32-NEXT: lw a0, 24(sp) +; RV32-NEXT: lw a1, 28(sp) +; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_vselect_v8i8: @@ -4404,13 +6415,17 @@ define <8 x i8> @test_vselect_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { define <2 x i32> @test_vselect_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { ; RV32-LABEL: test_vselect_v2i32: ; RV32: # %bb.0: -; RV32-NEXT: blt a2, a0, .LBB210_2 +; RV32-NEXT: slt a0, a2, a0 +; RV32-NEXT: slt a1, a3, a1 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: bnez a1, .LBB210_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a4, a2 +; RV32-NEXT: mv a5, a3 ; RV32-NEXT: .LBB210_2: -; RV32-NEXT: blt a3, a1, .LBB210_4 +; RV32-NEXT: bnez a0, .LBB210_4 ; RV32-NEXT: # %bb.3: -; RV32-NEXT: mv a5, a3 +; RV32-NEXT: mv a4, a2 ; RV32-NEXT: .LBB210_4: ; RV32-NEXT: padd.dw a0, a4, zero ; RV32-NEXT: ret @@ -4428,12 +6443,30 @@ define <2 x i32> @test_vselect_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { define <4 x i16> @test_bswap_v4i16(<4 x i16> %a) { ; RV32-LABEL: test_bswap_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: psrli.h a2, a0, 8 -; RV32-NEXT: pslli.h a0, a0, 8 -; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: psrli.h a2, a1, 8 -; RV32-NEXT: pslli.h a1, a1, 8 -; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: rev8 a0, a1 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: rev8 a0, a0 +; RV32-NEXT: srli a0, a0, 16 +; 
RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: rev8 a0, a0 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: rev8 a0, a0 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_bswap_v4i16: @@ -4449,8 +6482,8 @@ define <4 x i16> @test_bswap_v4i16(<4 x i16> %a) { define <2 x i32> @test_bswap_v2i32(<2 x i32> %a) { ; RV32-LABEL: test_bswap_v2i32: ; RV32: # %bb.0: -; RV32-NEXT: rev8 a0, a0 ; RV32-NEXT: rev8 a1, a1 +; RV32-NEXT: rev8 a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_bswap_v2i32: @@ -4475,39 +6508,46 @@ define <2 x i32> @test_bswap_v2i32(<2 x i32> %a) { define <8 x i8> @test_bitreverse_v8i8(<8 x i8> %a) { ; RV32-LABEL: test_bitreverse_v8i8: ; RV32: # %bb.0: -; RV32-NEXT: psrli.b a2, a0, 4 -; RV32-NEXT: pli.b a3, 15 -; RV32-NEXT: pli.b a4, 51 -; RV32-NEXT: psrli.b a5, a1, 4 -; RV32-NEXT: and a2, a2, a3 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: and a5, a5, a3 -; RV32-NEXT: and a1, a1, a3 -; RV32-NEXT: pli.b a3, 85 -; RV32-NEXT: pslli.b a0, a0, 4 -; RV32-NEXT: pslli.b a1, a1, 4 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: or a1, a5, a1 -; RV32-NEXT: psrli.b a2, a0, 2 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: psrli.b a5, a1, 2 -; RV32-NEXT: and a1, a1, a4 -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: pslli.b a0, a0, 2 -; RV32-NEXT: and a4, a5, a4 -; RV32-NEXT: pslli.b a1, a1, 2 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: or a1, a4, a1 -; RV32-NEXT: psrli.b a2, a0, 1 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: psrli.b a4, a1, 1 -; RV32-NEXT: and a1, a1, a3 -; RV32-NEXT: and a2, a2, a3 -; RV32-NEXT: pslli.b a0, a0, 1 -; RV32-NEXT: and a3, a4, a3 -; RV32-NEXT: pslli.b a1, a1, 1 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: or a1, a3, a1 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lbu a1, 
7(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: rev a0, a1 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 15(sp) +; RV32-NEXT: lbu a0, 6(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 14(sp) +; RV32-NEXT: lbu a0, 5(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 13(sp) +; RV32-NEXT: lbu a0, 4(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 12(sp) +; RV32-NEXT: lbu a0, 3(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 11(sp) +; RV32-NEXT: lbu a0, 2(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 10(sp) +; RV32-NEXT: lbu a0, 1(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 9(sp) +; RV32-NEXT: lbu a0, 0(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 24 +; RV32-NEXT: sb a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_bitreverse_v8i8: @@ -4538,45 +6578,30 @@ define <8 x i8> @test_bitreverse_v8i8(<8 x i8> %a) { define <4 x i16> @test_bitreverse_v4i16(<4 x i16> %a) { ; RV32-LABEL: test_bitreverse_v4i16: ; RV32: # %bb.0: -; RV32-NEXT: psrli.h a2, a0, 8 -; RV32-NEXT: pslli.h a0, a0, 8 -; RV32-NEXT: pli.b a3, 15 -; RV32-NEXT: pli.b a4, 51 -; RV32-NEXT: psrli.h a5, a1, 8 -; RV32-NEXT: pslli.h a1, a1, 8 -; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: or a1, a1, a5 -; RV32-NEXT: psrli.h a2, a0, 4 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: psrli.h a5, a1, 4 -; RV32-NEXT: and a1, a1, a3 -; RV32-NEXT: and a2, a2, a3 -; RV32-NEXT: and a3, a5, a3 -; RV32-NEXT: pli.b a5, 85 -; RV32-NEXT: pslli.h a0, a0, 4 -; RV32-NEXT: pslli.h a1, a1, 4 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: or a1, a3, a1 -; RV32-NEXT: psrli.h a2, a0, 2 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: psrli.h a3, a1, 2 -; RV32-NEXT: and a1, a1, a4 -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: pslli.h a0, a0, 
2 -; RV32-NEXT: and a3, a3, a4 -; RV32-NEXT: pslli.h a1, a1, 2 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: or a1, a3, a1 -; RV32-NEXT: psrli.h a2, a0, 1 -; RV32-NEXT: and a0, a0, a5 -; RV32-NEXT: psrli.h a3, a1, 1 -; RV32-NEXT: and a1, a1, a5 -; RV32-NEXT: and a2, a2, a5 -; RV32-NEXT: pslli.h a0, a0, 1 -; RV32-NEXT: and a3, a3, a5 -; RV32-NEXT: pslli.h a1, a1, 1 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: or a1, a3, a1 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: lh a1, 6(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: rev a0, a1 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 14(sp) +; RV32-NEXT: lh a0, 4(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 12(sp) +; RV32-NEXT: lh a0, 2(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 10(sp) +; RV32-NEXT: lh a0, 0(sp) +; RV32-NEXT: rev a0, a0 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: sh a0, 8(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_bitreverse_v4i16: @@ -4610,8 +6635,8 @@ define <4 x i16> @test_bitreverse_v4i16(<4 x i16> %a) { define <2 x i32> @test_bitreverse_v2i32(<2 x i32> %a) { ; RV32-LABEL: test_bitreverse_v2i32: ; RV32: # %bb.0: -; RV32-NEXT: rev a0, a0 ; RV32-NEXT: rev a1, a1 +; RV32-NEXT: rev a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_bitreverse_v2i32: diff --git a/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll index 4a43dad8519e5..566d8a0c7c288 100644 --- a/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvp-unaligned-load-store.ll @@ -11,22 +11,22 @@ define void @test_load_v4i8_align1(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v4i8_align1: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lbu a2, 4(a1) -; CHECK-RV32-NEXT: lbu a3, 5(a1) -; CHECK-RV32-NEXT: lbu a4, 
6(a1) -; CHECK-RV32-NEXT: lbu a5, 7(a1) -; CHECK-RV32-NEXT: lbu a6, 1(a1) -; CHECK-RV32-NEXT: lbu a7, 2(a1) -; CHECK-RV32-NEXT: lbu t0, 3(a1) -; CHECK-RV32-NEXT: lbu a1, 0(a1) +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 1(a1) +; CHECK-RV32-NEXT: lbu a4, 2(a1) +; CHECK-RV32-NEXT: lbu a5, 3(a1) +; CHECK-RV32-NEXT: lbu a6, 5(a1) +; CHECK-RV32-NEXT: lbu a7, 6(a1) +; CHECK-RV32-NEXT: lbu t0, 7(a1) +; CHECK-RV32-NEXT: lbu a1, 4(a1) ; CHECK-RV32-NEXT: ppaire.b a4, a4, a5 ; CHECK-RV32-NEXT: ppaire.b a2, a2, a3 ; CHECK-RV32-NEXT: ppaire.b a3, a7, t0 ; CHECK-RV32-NEXT: ppaire.b a1, a1, a6 ; CHECK-RV32-NEXT: pack a2, a2, a4 ; CHECK-RV32-NEXT: pack a1, a1, a3 -; CHECK-RV32-NEXT: sw a1, 0(a0) -; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: sw a2, 0(a0) +; CHECK-RV32-NEXT: sw a1, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v4i8_align1: @@ -129,14 +129,14 @@ define void @test_store_v4i8_align1(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v4i8_align2(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v4i8_align2: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lhu a2, 2(a1) -; CHECK-RV32-NEXT: lhu a3, 4(a1) -; CHECK-RV32-NEXT: lhu a4, 6(a1) -; CHECK-RV32-NEXT: lhu a1, 0(a1) -; CHECK-RV32-NEXT: pack a3, a3, a4 -; CHECK-RV32-NEXT: pack a1, a1, a2 -; CHECK-RV32-NEXT: sw a1, 0(a0) -; CHECK-RV32-NEXT: sw a3, 4(a0) +; CHECK-RV32-NEXT: lhu a2, 0(a1) +; CHECK-RV32-NEXT: lhu a3, 2(a1) +; CHECK-RV32-NEXT: lhu a4, 4(a1) +; CHECK-RV32-NEXT: lhu a1, 6(a1) +; CHECK-RV32-NEXT: pack a2, a2, a3 +; CHECK-RV32-NEXT: pack a1, a4, a1 +; CHECK-RV32-NEXT: sw a2, 0(a0) +; CHECK-RV32-NEXT: sw a1, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v4i8_align2: @@ -393,22 +393,22 @@ define void @test_store_v2i16_align2(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v8i8_align1(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v8i8_align1: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lbu a2, 4(a1) -; CHECK-RV32-NEXT: lbu a3, 5(a1) -; 
CHECK-RV32-NEXT: lbu a4, 6(a1) -; CHECK-RV32-NEXT: lbu a5, 7(a1) -; CHECK-RV32-NEXT: lbu a6, 1(a1) -; CHECK-RV32-NEXT: lbu a7, 2(a1) -; CHECK-RV32-NEXT: lbu t0, 3(a1) -; CHECK-RV32-NEXT: lbu a1, 0(a1) +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 1(a1) +; CHECK-RV32-NEXT: lbu a4, 2(a1) +; CHECK-RV32-NEXT: lbu a5, 3(a1) +; CHECK-RV32-NEXT: lbu a6, 5(a1) +; CHECK-RV32-NEXT: lbu a7, 6(a1) +; CHECK-RV32-NEXT: lbu t0, 7(a1) +; CHECK-RV32-NEXT: lbu a1, 4(a1) ; CHECK-RV32-NEXT: ppaire.b a4, a4, a5 ; CHECK-RV32-NEXT: ppaire.b a2, a2, a3 ; CHECK-RV32-NEXT: ppaire.b a3, a7, t0 ; CHECK-RV32-NEXT: ppaire.b a1, a1, a6 ; CHECK-RV32-NEXT: pack a2, a2, a4 ; CHECK-RV32-NEXT: pack a1, a1, a3 -; CHECK-RV32-NEXT: sw a1, 0(a0) -; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: sw a2, 0(a0) +; CHECK-RV32-NEXT: sw a1, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v8i8_align1: @@ -511,14 +511,14 @@ define void @test_store_v8i8_align1(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v8i8_align2(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v8i8_align2: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lhu a2, 2(a1) -; CHECK-RV32-NEXT: lhu a3, 4(a1) -; CHECK-RV32-NEXT: lhu a4, 6(a1) -; CHECK-RV32-NEXT: lhu a1, 0(a1) -; CHECK-RV32-NEXT: pack a3, a3, a4 -; CHECK-RV32-NEXT: pack a1, a1, a2 -; CHECK-RV32-NEXT: sw a1, 0(a0) -; CHECK-RV32-NEXT: sw a3, 4(a0) +; CHECK-RV32-NEXT: lhu a2, 0(a1) +; CHECK-RV32-NEXT: lhu a3, 2(a1) +; CHECK-RV32-NEXT: lhu a4, 4(a1) +; CHECK-RV32-NEXT: lhu a1, 6(a1) +; CHECK-RV32-NEXT: pack a2, a2, a3 +; CHECK-RV32-NEXT: pack a1, a4, a1 +; CHECK-RV32-NEXT: sw a2, 0(a0) +; CHECK-RV32-NEXT: sw a1, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v8i8_align2: @@ -600,22 +600,22 @@ define void @test_store_v8i8_align2(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v4i16_align1(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v4i16_align1: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lbu a2, 4(a1) -; CHECK-RV32-NEXT: 
lbu a3, 5(a1) -; CHECK-RV32-NEXT: lbu a4, 6(a1) -; CHECK-RV32-NEXT: lbu a5, 7(a1) -; CHECK-RV32-NEXT: lbu a6, 1(a1) -; CHECK-RV32-NEXT: lbu a7, 2(a1) -; CHECK-RV32-NEXT: lbu t0, 3(a1) -; CHECK-RV32-NEXT: lbu a1, 0(a1) +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 1(a1) +; CHECK-RV32-NEXT: lbu a4, 2(a1) +; CHECK-RV32-NEXT: lbu a5, 3(a1) +; CHECK-RV32-NEXT: lbu a6, 5(a1) +; CHECK-RV32-NEXT: lbu a7, 6(a1) +; CHECK-RV32-NEXT: lbu t0, 7(a1) +; CHECK-RV32-NEXT: lbu a1, 4(a1) ; CHECK-RV32-NEXT: ppaire.b a4, a4, a5 ; CHECK-RV32-NEXT: ppaire.b a2, a2, a3 ; CHECK-RV32-NEXT: ppaire.b a3, a7, t0 ; CHECK-RV32-NEXT: ppaire.b a1, a1, a6 ; CHECK-RV32-NEXT: pack a2, a2, a4 ; CHECK-RV32-NEXT: pack a1, a1, a3 -; CHECK-RV32-NEXT: sw a1, 0(a0) -; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: sw a2, 0(a0) +; CHECK-RV32-NEXT: sw a1, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v4i16_align1: @@ -718,14 +718,14 @@ define void @test_store_v4i16_align1(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v4i16_align2(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v4i16_align2: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lhu a2, 2(a1) -; CHECK-RV32-NEXT: lhu a3, 4(a1) -; CHECK-RV32-NEXT: lhu a4, 6(a1) -; CHECK-RV32-NEXT: lhu a1, 0(a1) -; CHECK-RV32-NEXT: pack a3, a3, a4 -; CHECK-RV32-NEXT: pack a1, a1, a2 -; CHECK-RV32-NEXT: sw a1, 0(a0) -; CHECK-RV32-NEXT: sw a3, 4(a0) +; CHECK-RV32-NEXT: lhu a2, 0(a1) +; CHECK-RV32-NEXT: lhu a3, 2(a1) +; CHECK-RV32-NEXT: lhu a4, 4(a1) +; CHECK-RV32-NEXT: lhu a1, 6(a1) +; CHECK-RV32-NEXT: pack a2, a2, a3 +; CHECK-RV32-NEXT: pack a1, a4, a1 +; CHECK-RV32-NEXT: sw a2, 0(a0) +; CHECK-RV32-NEXT: sw a1, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v4i16_align2: @@ -877,22 +877,22 @@ define void @test_store_v4i16_align4(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v2i32_align1(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v2i32_align1: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lbu a2, 0(a1) 
-; CHECK-RV32-NEXT: lbu a3, 1(a1) -; CHECK-RV32-NEXT: lbu a4, 2(a1) -; CHECK-RV32-NEXT: lbu a5, 3(a1) -; CHECK-RV32-NEXT: lbu a6, 5(a1) -; CHECK-RV32-NEXT: lbu a7, 6(a1) -; CHECK-RV32-NEXT: lbu t0, 7(a1) -; CHECK-RV32-NEXT: lbu a1, 4(a1) +; CHECK-RV32-NEXT: lbu a2, 4(a1) +; CHECK-RV32-NEXT: lbu a3, 5(a1) +; CHECK-RV32-NEXT: lbu a4, 6(a1) +; CHECK-RV32-NEXT: lbu a5, 7(a1) +; CHECK-RV32-NEXT: lbu a6, 1(a1) +; CHECK-RV32-NEXT: lbu a7, 2(a1) +; CHECK-RV32-NEXT: lbu t0, 3(a1) +; CHECK-RV32-NEXT: lbu a1, 0(a1) ; CHECK-RV32-NEXT: ppaire.b a4, a4, a5 ; CHECK-RV32-NEXT: ppaire.b a2, a2, a3 ; CHECK-RV32-NEXT: ppaire.b a3, a7, t0 ; CHECK-RV32-NEXT: ppaire.b a1, a1, a6 ; CHECK-RV32-NEXT: pack a2, a2, a4 ; CHECK-RV32-NEXT: pack a1, a1, a3 -; CHECK-RV32-NEXT: sw a2, 0(a0) -; CHECK-RV32-NEXT: sw a1, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: sw a2, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v2i32_align1: @@ -995,14 +995,14 @@ define void @test_store_v2i32_align1(ptr %a_ptr, ptr %b_ptr) { define void @test_load_v2i32_align2(ptr %ret_ptr, ptr %a_ptr) { ; CHECK-RV32-LABEL: test_load_v2i32_align2: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lhu a2, 0(a1) -; CHECK-RV32-NEXT: lhu a3, 2(a1) -; CHECK-RV32-NEXT: lhu a4, 4(a1) -; CHECK-RV32-NEXT: lhu a1, 6(a1) -; CHECK-RV32-NEXT: pack a2, a2, a3 -; CHECK-RV32-NEXT: pack a1, a4, a1 -; CHECK-RV32-NEXT: sw a2, 0(a0) -; CHECK-RV32-NEXT: sw a1, 4(a0) +; CHECK-RV32-NEXT: lhu a2, 2(a1) +; CHECK-RV32-NEXT: lhu a3, 4(a1) +; CHECK-RV32-NEXT: lhu a4, 6(a1) +; CHECK-RV32-NEXT: lhu a1, 0(a1) +; CHECK-RV32-NEXT: pack a3, a3, a4 +; CHECK-RV32-NEXT: pack a1, a1, a2 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: sw a3, 4(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: test_load_v2i32_align2: From 171e2bd47c6d94421adb908bad80b40a5907e6cb Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 12 May 2026 17:11:44 -0400 Subject: [PATCH 516/538] Revert "[CodeGen] Use byte offsets and ptradd in ShadowStackGCLowering" 
(#197297) Reverts llvm/llvm-project#178436. I need to update the tests that I added for that PR. --- llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 187 ++++++++++----------- 1 file changed, 86 insertions(+), 101 deletions(-) diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index deb5d0e9c1555..000d6d842c6be 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -24,7 +24,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -39,9 +38,7 @@ #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" #include #include @@ -59,16 +56,14 @@ class ShadowStackGCLoweringImpl { /// roots. GlobalVariable *Head = nullptr; + /// StackEntryTy - Abstract type of a link in the shadow stack. + StructType *StackEntryTy = nullptr; StructType *FrameMapTy = nullptr; /// Roots - GC roots in the current function. Each is a pair of the /// intrinsic call and its corresponding alloca. std::vector> Roots; - /// RootOffsets - Byte offsets and sizes of each root within the frame. - /// Each element is a pair of (offset, size). 
- std::vector> RootOffsets; - public: ShadowStackGCLoweringImpl() = default; @@ -77,9 +72,16 @@ class ShadowStackGCLoweringImpl { private: bool IsNullValue(Value *V); - Constant *GetFrameMap(Function &F, uint64_t FrameSizeInPtrs); - std::pair ComputeFrameLayout(Function &F); + Constant *GetFrameMap(Function &F); + Type *GetConcreteStackEntryType(Function &F); void CollectRoots(Function &F); + + static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B, + Type *Ty, Value *BasePtr, int Idx1, + const char *Name); + static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B, + Type *Ty, Value *BasePtr, int Idx1, int Idx2, + const char *Name); }; class ShadowStackGCLowering : public FunctionPass { @@ -141,8 +143,7 @@ FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGC ShadowStackGCLowering::ShadowStackGCLowering() : FunctionPass(ID) {} -Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F, - uint64_t FrameSizeInPtrs) { +Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F) { // doInitialization creates the abstract type of this value. Type *VoidPtr = PointerType::getUnqual(F.getContext()); @@ -160,7 +161,7 @@ Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F, Type *Int32Ty = Type::getInt32Ty(F.getContext()); Constant *BaseElts[] = { - ConstantInt::get(Int32Ty, FrameSizeInPtrs, false), + ConstantInt::get(Int32Ty, Roots.size(), false), ConstantInt::get(Int32Ty, NumMeta, false), }; @@ -191,44 +192,14 @@ Constant *ShadowStackGCLoweringImpl::GetFrameMap(Function &F, "__gc_" + F.getName()); } -std::pair -ShadowStackGCLoweringImpl::ComputeFrameLayout(Function &F) { - // Compute the layout of the shadow stack frame using byte offsets. - // Layout: [Next ptr | Map ptr | Root 0 | Root 1 | ... 
| Root N] - - const DataLayout &DL = F.getParent()->getDataLayout(); - uint64_t PtrSize = DL.getPointerSize(0); - Align PtrAlign = DL.getPointerABIAlignment(0); - - RootOffsets.clear(); - Align MaxAlign = PtrAlign; - - // Offset 0: Next pointer - // Offset PtrSize: Map pointer - uint64_t Offset = 2 * PtrSize; - - // Compute offsets and sizes for each root - for (const std::pair &Root : Roots) { - AllocaInst *AI = Root.second; - std::optional RootSize = AI->getAllocationSize(DL); - if (!RootSize || !RootSize->isFixed()) - reportFatalUsageError( - "Intrinsic::gcroot requires a fixed size stack object"); - uint64_t Size = RootSize->getFixedValue(); - Align RootAlign = AI->getAlign(); - MaxAlign = std::max(MaxAlign, RootAlign); - - // Align the offset for this root - uint64_t AlignedOffset = alignTo(Offset, RootAlign); - - // Store both offset and size as a pair - RootOffsets.push_back({AlignedOffset, Size}); - Offset = AlignedOffset + Size; - } +Type *ShadowStackGCLoweringImpl::GetConcreteStackEntryType(Function &F) { + // doInitialization creates the generic version of this type. + std::vector EltTys; + EltTys.push_back(StackEntryTy); + for (const std::pair &Root : Roots) + EltTys.push_back(Root.second->getAllocatedType()); - // Final frame size, aligned to maximum alignment - uint64_t FrameSize = alignTo(Offset, MaxAlign); - return {FrameSize, MaxAlign}; + return StructType::create(EltTys, ("gc_stackentry." + F.getName()).str()); } /// doInitialization - If this module uses the GC intrinsics, find them now. If @@ -255,11 +226,21 @@ bool ShadowStackGCLoweringImpl::doInitialization(Module &M) { // Specifies length of variable length array. EltTys.push_back(Type::getInt32Ty(M.getContext())); FrameMapTy = StructType::create(EltTys, "gc_map"); + PointerType *FrameMapPtrTy = PointerType::getUnqual(M.getContext()); + + // struct StackEntry { + // ShadowStackEntry *Next; // Caller's stack entry. + // FrameMap *Map; // Pointer to constant FrameMap. 
+ // void *Roots[]; // Stack roots (in-place array, so we pretend). + // }; - // The shadow stack linked list uses opaque pointers. - // Each frame is a byte array with: [Next ptr | Map ptr | Roots...] PointerType *StackEntryPtrTy = PointerType::getUnqual(M.getContext()); + EltTys.clear(); + EltTys.push_back(StackEntryPtrTy); + EltTys.push_back(FrameMapPtrTy); + StackEntryTy = StructType::create(EltTys, "gc_stackentry"); + // Get the root chain if it already exists. Head = M.getGlobalVariable("llvm_gc_root_chain"); if (!Head) { @@ -283,6 +264,10 @@ bool ShadowStackGCLoweringImpl::IsNullValue(Value *V) { } void ShadowStackGCLoweringImpl::CollectRoots(Function &F) { + // FIXME: Account for original alignment. Could fragment the root array. + // Approach 1: Null initialize empty slots at runtime. Yuck. + // Approach 2: Emit a map of the array instead of just a count. + assert(Roots.empty() && "Not cleaned up?"); SmallVector, 16> MetaRoots; @@ -306,6 +291,34 @@ void ShadowStackGCLoweringImpl::CollectRoots(Function &F) { Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end()); } +GetElementPtrInst * +ShadowStackGCLoweringImpl::CreateGEP(LLVMContext &Context, IRBuilder<> &B, + Type *Ty, Value *BasePtr, int Idx, + int Idx2, const char *Name) { + Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0), + ConstantInt::get(Type::getInt32Ty(Context), Idx), + ConstantInt::get(Type::getInt32Ty(Context), Idx2)}; + Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name); + + assert(isa(Val) && "Unexpected folded constant"); + + return dyn_cast(Val); +} + +GetElementPtrInst *ShadowStackGCLoweringImpl::CreateGEP(LLVMContext &Context, + IRBuilder<> &B, + Type *Ty, + Value *BasePtr, int Idx, + const char *Name) { + Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0), + ConstantInt::get(Type::getInt32Ty(Context), Idx)}; + Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name); + + assert(isa(Val) && "Unexpected folded constant"); + + return dyn_cast(Val); 
+} + /// runOnFunction - Insert code to maintain the shadow stack. bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, DomTreeUpdater *DTU) { @@ -314,7 +327,6 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, return false; LLVMContext &Context = F.getContext(); - const DataLayout &DL = F.getParent()->getDataLayout(); // Find calls to llvm.gcroot. CollectRoots(F); @@ -324,20 +336,16 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, if (Roots.empty()) return false; - // Compute frame layout using byte offsets first. - auto [FrameSize, FrameAlign] = ComputeFrameLayout(F); - - // Build the constant map with frame size in pointer-sized units. - uint64_t PtrSize = DL.getPointerSize(); - Value *FrameMap = GetFrameMap(F, FrameSize / PtrSize); + // Build the constant map and figure the type of the shadow stack entry. + Value *FrameMap = GetFrameMap(F); + Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F); // Build the shadow stack entry at the very start of the function. BasicBlock::iterator IP = F.getEntryBlock().begin(); IRBuilder<> AtEntry(IP->getParent(), IP); - Type *Int8Ty = Type::getInt8Ty(Context); - AllocaInst *StackEntry = AtEntry.CreateAlloca( - ArrayType::get(Int8Ty, FrameSize), nullptr, "gc_frame"); - StackEntry->setAlignment(FrameAlign); + + Instruction *StackEntry = + AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, "gc_frame"); AtEntry.SetInsertPointPastAllocas(&F); IP = AtEntry.GetInsertPoint(); @@ -345,45 +353,20 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, // Initialize the map pointer and load the current head of the shadow stack. 
Instruction *CurrentHead = AtEntry.CreateLoad(AtEntry.getPtrTy(), Head, "gc_currhead"); - - // Map pointer is at offset PtrSize (after the Next pointer) - Value *EntryMapPtr = AtEntry.CreatePtrAdd( - StackEntry, AtEntry.getInt64(PtrSize), "gc_frame.map"); + Instruction *EntryMapPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, + StackEntry, 0, 1, "gc_frame.map"); AtEntry.CreateStore(FrameMap, EntryMapPtr); - // Zero out any padding between roots to ensure deterministic frame contents. - // This includes the region after the map pointer up to the first root. - uint64_t LastEnd = 2 * PtrSize; // End of Map pointer field - assert(RootOffsets.size() == Roots.size()); + // After all the allocas... for (unsigned I = 0, E = Roots.size(); I != E; ++I) { - auto [RootOffset, RootSize] = RootOffsets[I]; - - // Zero any padding before this root - if (RootOffset > LastEnd) { - Value *PaddingPtr = - AtEntry.CreatePtrAdd(StackEntry, AtEntry.getInt64(LastEnd)); - AtEntry.CreateMemSet(PaddingPtr, AtEntry.getInt8(0), RootOffset - LastEnd, - Align(1)); - } - - // For each root, compute pointer using precomputed offset - Value *SlotPtr = AtEntry.CreatePtrAdd( - StackEntry, AtEntry.getInt64(RootOffset), "gc_root"); + // For each root, find the corresponding slot in the aggregate... + Value *SlotPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, + StackEntry, 1 + I, "gc_root"); // And use it in lieu of the alloca. AllocaInst *OriginalAlloca = Roots[I].second; SlotPtr->takeName(OriginalAlloca); OriginalAlloca->replaceAllUsesWith(SlotPtr); - - LastEnd = RootOffset + RootSize; - } - - // Zero any padding at the end of the frame - if (FrameSize > LastEnd) { - Value *PaddingPtr = - AtEntry.CreatePtrAdd(StackEntry, AtEntry.getInt64(LastEnd)); - AtEntry.CreateMemSet(PaddingPtr, AtEntry.getInt8(0), FrameSize - LastEnd, - Align(1)); } // Move past the original stores inserted by GCStrategy::InitRoots. 
This isn't @@ -395,20 +378,23 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, AtEntry.SetInsertPoint(IP->getParent(), IP); // Push the entry onto the shadow stack. - // Next pointer is at offset 0, so it's just the frame pointer - AtEntry.CreateStore(CurrentHead, StackEntry); - // The new head value is also the frame pointer (the linked list links to - // frame base) - AtEntry.CreateStore(StackEntry, Head); + Instruction *EntryNextPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, + StackEntry, 0, 0, "gc_frame.next"); + Instruction *NewHeadVal = CreateGEP(Context, AtEntry, ConcreteStackEntryTy, + StackEntry, 0, "gc_newhead"); + AtEntry.CreateStore(CurrentHead, EntryNextPtr); + AtEntry.CreateStore(NewHeadVal, Head); // For each instruction that escapes... EscapeEnumerator EE(F, "gc_cleanup", /*HandleExceptions=*/true, DTU); while (IRBuilder<> *AtExit = EE.Next()) { // Pop the entry from the shadow stack. Don't reuse CurrentHead from // AtEntry, since that would make the value live for the entire function. 
- // Next pointer is at offset 0, so load from the frame base + Instruction *EntryNextPtr2 = + CreateGEP(Context, *AtExit, ConcreteStackEntryTy, StackEntry, 0, 0, + "gc_frame.next"); Value *SavedHead = - AtExit->CreateLoad(AtExit->getPtrTy(), StackEntry, "gc_savedhead"); + AtExit->CreateLoad(AtExit->getPtrTy(), EntryNextPtr2, "gc_savedhead"); AtExit->CreateStore(SavedHead, Head); } @@ -421,6 +407,5 @@ bool ShadowStackGCLoweringImpl::runOnFunction(Function &F, } Roots.clear(); - RootOffsets.clear(); return true; } From 5c80731fbfe7ab5d7ea64c4dfa157329e9d87632 Mon Sep 17 00:00:00 2001 From: khaki3 <47756807+khaki3@users.noreply.github.com> Date: Tue, 12 May 2026 14:22:22 -0700 Subject: [PATCH 517/538] [flang][cuda] Place box value kernel args in managed memory (#197116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Example: ```fortran type deviceArray integer, allocatable, dimension(:,:), device :: Arr end type deviceArray type(deviceArray), allocatable, dimension(:) :: DA allocate(DA(2)) allocate(DA(1)%Arr(32,32)) call mykernel<<<1,32>>>(DA(1)%Arr, 32) ! cudaErrorIllegalAddress ``` In this code, `DA(1)%Arr` is a device allocatable component inside a managed derived type. The compiler loads the descriptor, reboxes it on the host stack, and passes it to the kernel. Since `!fir.box` is lowered to a pointer in LLVM IR, the kernel receives a host-stack pointer it cannot dereference — causing `cudaErrorIllegalAddress`. The existing `isDeviceAllocation` check is def-based: it traces the value's origin to decide whether to use managed memory. It works for global device descriptors and `cuf.data_attr`-annotated arguments, but not for this case — the rebox input is a plain host alloca, and only the *use* as a kernel argument requires managed memory. **Fix:** Add a use-based `isUsedByGPULaunchFunc` check alongside `isDeviceAllocation` in the embox, rebox, and load conversions in CodeGen. 
When a box descriptor is passed directly to `gpu.launch_func`, its storage is allocated in managed memory via `_FortranACUFAllocDescriptor` instead of a stack alloca. --- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 31 ++++++++++++++++++++----- flang/test/Fir/CUDA/cuda-code-gen.mlir | 28 ++++++++++++++++++++++ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 7d1068c25e7ca..4fc33d2676cec 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -2099,6 +2099,15 @@ struct EmboxOpConversion : public EmboxCommonConversion { } }; +/// Return true if any user of \p val is a gpu.launch_func operation, +/// indicating the descriptor must be in device-accessible memory. +static bool isUsedByGPULaunchFunc(mlir::Value val) { + for (auto *user : val.getUsers()) + if (mlir::isa(user)) + return true; + return false; +} + static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) { if (val.getDefiningOp() && val.getDefiningOp()->getParentOfType()) @@ -2340,9 +2349,11 @@ struct XEmboxOpConversion : public EmboxCommonConversion { dest = insertBaseAddress(rewriter, loc, dest, base); if (fir::isDerivedTypeWithLenParams(boxTy)) TODO(loc, "fir.embox codegen of derived with length parameters"); - mlir::Value result = placeInMemoryIfNotGlobalInit( - rewriter, loc, boxTy, dest, - isDeviceAllocation(xbox.getMemref(), adaptor.getMemref())); + bool needsDeviceAlloc = + isDeviceAllocation(xbox.getMemref(), adaptor.getMemref()) || + isUsedByGPULaunchFunc(xbox); + mlir::Value result = placeInMemoryIfNotGlobalInit(rewriter, loc, boxTy, + dest, needsDeviceAlloc); rewriter.replaceOp(xbox, result); return mlir::success(); } @@ -2458,9 +2469,11 @@ struct XReboxOpConversion : public EmboxCommonConversion { dest = insertStride(rewriter, loc, dest, dim, std::get<1>(iter.value())); } dest = insertBaseAddress(rewriter, loc, dest, base); + bool 
needsDeviceAlloc = + isDeviceAllocation(rebox.getBox(), adaptor.getBox()) || + isUsedByGPULaunchFunc(rebox); mlir::Value result = placeInMemoryIfNotGlobalInit( - rewriter, rebox.getLoc(), destBoxTy, dest, - isDeviceAllocation(rebox.getBox(), adaptor.getBox())); + rewriter, rebox.getLoc(), destBoxTy, dest, needsDeviceAlloc); rewriter.replaceOp(rebox, result); return mlir::success(); } @@ -2830,7 +2843,8 @@ struct XArrayCoorOpConversion // operations. mlir::LLVM::IntegerOverflowFlags subFlags = isShifted ? nsw : (nsw | nuw); mlir::LLVM::IntegerOverflowFlags addMulFlags = nsw | nuw; - mlir::LLVM::GEPNoWrapFlags gepFlags = mlir::LLVM::GEPNoWrapFlags::nusw | mlir::LLVM::GEPNoWrapFlags::nuw; + mlir::LLVM::GEPNoWrapFlags gepFlags = + mlir::LLVM::GEPNoWrapFlags::nusw | mlir::LLVM::GEPNoWrapFlags::nuw; // For each dimension of the array, generate the offset calculation. for (unsigned i = 0; i < rank; ++i, ++indexOffset, ++shapeOffset, @@ -3626,6 +3640,11 @@ struct LoadOpConversion : public fir::FIROpConversion { genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy()); } } + if (!newBoxStorage && isUsedByGPULaunchFunc(load)) { + auto mod = load->getParentOfType(); + newBoxStorage = + genCUFAllocDescriptor(loc, rewriter, mod, boxTy, lowerTy()); + } if (!newBoxStorage) newBoxStorage = genAllocaAndAddrCastWithType(loc, llvmLoadTy, defaultAlign, rewriter); diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir index ec066fad724c6..cdaa775b5b49c 100644 --- a/flang/test/Fir/CUDA/cuda-code-gen.mlir +++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir @@ -399,3 +399,31 @@ module { // CHECK-LABEL: llvm.func @_QMkernelsPuse_managed() // CHECK: %{{.*}} = llvm.mlir.addressof @_QMmodEmval : !llvm.ptr // CHECK: llvm.mlir.global external @_QMmodEmval() {addr_space = 0 : i32} : i32 + +// ----- + +// Test that a rebox whose result is passed to gpu.launch_func gets a managed +// descriptor so the GPU kernel can access it. 
+ +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QMtestmePmykernel(%arg0: !fir.box>, %arg1: i32) kernel { + gpu.return + } + } + func.func @_QQmain() { + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c0_i32 = arith.constant 0 : i32 + %c32_i32 = arith.constant 32 : i32 + %0 = fir.alloca !fir.box>> + %1 = fir.load %0 : !fir.ref>>> + %2 = fircg.ext_rebox %1 : (!fir.box>>) -> !fir.box> + gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%2 : !fir.box>, %c32_i32 : i32) + return + } +} + +// CHECK-LABEL: llvm.func @_QQmain() +// CHECK: llvm.call @_FortranACUFAllocDescriptor( +// CHECK: gpu.launch_func @cuda_device_mod::@_QMtestmePmykernel From bdb5d95e20a7492d5e3524fee560be4f61c02cd0 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 12 May 2026 17:22:29 -0400 Subject: [PATCH 518/538] [SLP] Do not account scalable vectorized users when estimating geps cost We should not try to widen the scalable users of geps, they are not vectorized and scalable vector type cannot be widened. 
Fixes #197132 Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/197301 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../AArch64/gep-user-scalable.ll | 39 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/gep-user-scalable.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cb2901418cef2..d5f284bebd940 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19214,7 +19214,7 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, if (User && User->hasOneUse() && isa(User->user_back())) { Type *LocalTy = getValueType(User->user_back()); - if (!UserScalarTy) { + if (!UserScalarTy && !isa(LocalTy)) { UserScalarTy = LocalTy; } else if (UserScalarTy != LocalTy) { AllUsersGEPSWithStoresLoads = false; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gep-user-scalable.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gep-user-scalable.ll new file mode 100644 index 0000000000000..59a0e4ae28469 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gep-user-scalable.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt --passes=slp-vectorizer -S -mtriple=aarch64-unknown-none-elf < %s | FileCheck %s + +define @test(i64 %0, ptr %1) { +; CHECK-LABEL: define @test( +; CHECK-SAME: i64 [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 52 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[INVARIANT_GEP265:%.*]] = getelementptr i8, ptr null, i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; 
CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[GEP227:%.*]] = getelementptr i8, ptr [[INVARIANT_GEP265]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = load , ptr [[GEP227]], align 1 +; CHECK-NEXT: [[GEP229:%.*]] = getelementptr i8, ptr [[INVARIANT_GEP265]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = load , ptr [[GEP229]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.interleave4.nxv32i8( [[TMP8]], [[TMP9]], zeroinitializer, zeroinitializer) +; CHECK-NEXT: ret [[TMP10]] +; +entry: + %2 = getelementptr i8, ptr %1, i64 48 + %3 = getelementptr i8, ptr %1, i64 52 + %4 = load i32, ptr %3, align 4 + %5 = sext i32 %4 to i64 + %invariant.gep265 = getelementptr i8, ptr null, i64 %0 + %6 = load i32, ptr %2, align 4 + %7 = sext i32 %6 to i64 + %gep227 = getelementptr i8, ptr %invariant.gep265, i64 %7 + %8 = load , ptr %gep227, align 1 + %gep229 = getelementptr i8, ptr %invariant.gep265, i64 %5 + %9 = load , ptr %gep229, align 1 + %10 = tail call @llvm.vector.interleave4.nxv32i8( %8, %9, zeroinitializer, zeroinitializer) + ret %10 +} + +declare @llvm.vector.interleave4.nxv32i8(, , , ) + From f595c61161910f36a93c80a5e951ead82d0e7f71 Mon Sep 17 00:00:00 2001 From: Dominik Steenken Date: Tue, 12 May 2026 23:33:41 +0200 Subject: [PATCH 519/538] [RegisterScavenging] Respect early-clobber defs when scavenging registers (#197120) When scavenging registers backwards for virtual registers introduced during frame index elimination, the register scavenger was ignoring early-clobber constraints on the instruction using the scavenged register. This could lead to assigning a virtual register to a physical register marked as early-clobber output, violating the constraint that early-clobber outputs cannot overlap with inputs. This change inspects `RestoreAfter` to determine if the scavenged register will be used by the instruction pointed at by MBBI, and if so, remove any such registers from the scavengeable set. 
This also adds a test to check if such EC defs are indeed respected whne they otherwise wouldn't be. co-authored-by: @uweigand --------- Co-authored-by: Matt Arsenault --- llvm/lib/CodeGen/RegisterScavenging.cpp | 10 +++++ .../SystemZ/scavenge-clobbered-reg.mir | 42 +++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 llvm/test/CodeGen/SystemZ/scavenge-clobbered-reg.mir diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index bcac08ba322a8..ff279bebd906b 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -141,6 +141,16 @@ findSurvivorBackwards(const MachineRegisterInfo &MRI, "Target instruction is in other than current basic block, use " "enterBasicBlockEnd first"); + // If RestoreAfter is set, the scavenged register is needed at + // std::next(From), so we need to take into account any possible early-clobber + // def regs defined there. + if (RestoreAfter) { + for (const MachineOperand &MOP : std::next(From)->all_defs()) { + if (MOP.getReg().isPhysical() && MOP.isEarlyClobber()) + Used.addReg(MOP.getReg()); + } + } + for (MachineBasicBlock::iterator I = From;; --I) { const MachineInstr &MI = *I; diff --git a/llvm/test/CodeGen/SystemZ/scavenge-clobbered-reg.mir b/llvm/test/CodeGen/SystemZ/scavenge-clobbered-reg.mir new file mode 100644 index 0000000000000..ecd7dd371cb04 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/scavenge-clobbered-reg.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc %s -mtriple=s390x-ibm-linux -mcpu=z15 -run-pass=prologepilog -o - | FileCheck %s +--- +name: repro +tracksRegLiveness: true +stack: + - { id: 0, offset: 0, size: 16384, callee-saved-restored: true } +body: | + bb.0 : + liveins: $r2d, $r3d + + ; CHECK-LABEL: name: repro + ; CHECK: liveins: $r2d, $r3d, $r6d, $r15d, $r7d, $r8d, $r9d, $r10d, $r11d, $r12d, $r13d, $r14d + ; CHECK-NEXT: {{ $}} 
+ ; CHECK-NEXT: STMG killed $r6d, killed $r15d, $r15d, 48, implicit killed $r7d, implicit killed $r8d, implicit killed $r9d, implicit killed $r10d, implicit killed $r11d, implicit killed $r12d, implicit killed $r13d, implicit killed $r14d + ; CHECK-NEXT: CFI_INSTRUCTION offset $r6d, -112 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r7d, -104 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r8d, -96 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r9d, -88 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r10d, -80 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r11d, -72 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r12d, -64 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r13d, -56 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r14d, -48 + ; CHECK-NEXT: CFI_INSTRUCTION offset $r15d, -40 + ; CHECK-NEXT: $r15d = AGHI $r15d, -16560, implicit-def dead $cc + ; CHECK-NEXT: CFI_INSTRUCTION def_cfa_offset 16720 + ; CHECK-NEXT: renamable $r0d = COPY $r3d + ; CHECK-NEXT: STG killed $r2d, $r15d, 168, $noreg :: (store (s64) into %stack.1) + ; CHECK-NEXT: $r2d = LAY $r15d, 4096, $noreg + ; CHECK-NEXT: INLINEASM &" lg $0, $1\0A\09", sideeffect mayload maystore attdialect, regdef-ec:GR64Bit, def early-clobber renamable $r1d, mem:m, killed $r2d, 1080, $noreg, clobber, implicit-def dead early-clobber $r3d, clobber, implicit-def dead early-clobber $r4d, clobber, implicit-def dead early-clobber $r5d, clobber, implicit-def dead early-clobber $r6d, clobber, implicit-def dead early-clobber $r7d, clobber, implicit-def dead early-clobber $r8d, clobber, implicit-def dead early-clobber $r9d, clobber, implicit-def dead early-clobber $r10d, clobber, implicit-def dead early-clobber $r11d, clobber, implicit-def dead early-clobber $r12d, clobber, implicit-def dead early-clobber $r13d, clobber, implicit-def dead early-clobber $r14d, clobber, implicit-def dead early-clobber $cc + ; CHECK-NEXT: $r2d = LG $r15d, 168, $noreg :: (load (s64) from %stack.1) + ; CHECK-NEXT: renamable $r2d = AGR killed renamable $r2d, killed renamable $r0d, implicit-def dead $cc + 
; CHECK-NEXT: renamable $r2d = AGR killed renamable $r2d, killed renamable $r1d, implicit-def dead $cc + ; CHECK-NEXT: $r6d, $r15d = LMG $r15d, 16608, implicit-def $r7d, implicit-def $r8d, implicit-def $r9d, implicit-def $r10d, implicit-def $r11d, implicit-def $r12d, implicit-def $r13d, implicit-def $r14d + ; CHECK-NEXT: Return implicit $r2d + renamable $r0d = COPY $r3d + INLINEASM &" lg $0, $1\0A\09", 25 /* sideeffect mayload maystore attdialect */, 1179659 /* regdef-ec:GR64Bit */, def early-clobber renamable $r1d, 262174 /* mem:m */, %stack.0, 5000, $noreg, 12 /* clobber */, implicit-def dead early-clobber $r3d, 12 /* clobber */, implicit-def dead early-clobber $r4d, 12 /* clobber */, implicit-def dead early-clobber $r5d, 12 /* clobber */, implicit-def dead early-clobber $r6d, 12 /* clobber */, implicit-def dead early-clobber $r7d, 12 /* clobber */, implicit-def dead early-clobber $r8d, 12 /* clobber */, implicit-def dead early-clobber $r9d, 12 /* clobber */, implicit-def dead early-clobber $r10d, 12 /* clobber */, implicit-def dead early-clobber $r11d, 12 /* clobber */, implicit-def dead early-clobber $r12d, 12 /* clobber */, implicit-def dead early-clobber $r13d, 12 /* clobber */, implicit-def dead early-clobber $r14d, 12 /* clobber */, implicit-def dead early-clobber $cc + renamable $r2d = AGR killed renamable $r2d, killed renamable $r0d, implicit-def dead $cc + renamable $r2d = AGR killed renamable $r2d, killed renamable $r1d, implicit-def dead $cc + Return implicit $r2d +... 
From d6b1219119b02119b7fb75495bddc1a764cf8500 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 12 May 2026 14:46:49 -0700 Subject: [PATCH 520/538] [Bazel] Port 8c187665e883e7c37ddff733ea50304d093dc9f4 (#197307) --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 1143fb6f681ad..5ca3d23ad40fe 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -9050,6 +9050,7 @@ libc_support_library( name = "__support_math_tanhf", hdrs = ["src/__support/math/tanhf.h"], deps = [ + ":__support_fputil_except_value_utils", ":__support_fputil_fp_bits", ":__support_fputil_multiply_add", ":__support_fputil_nearest_integer", From 2cb2dd4352aae7bd9c85bef007b45aed2fae76d2 Mon Sep 17 00:00:00 2001 From: Uyiosa Iyekekpolor <96444432+uyoyo0@users.noreply.github.com> Date: Tue, 12 May 2026 17:49:53 -0400 Subject: [PATCH 521/538] Fix z/OS archive test failure on macOS (#197290) Fixes failures introduced by #187110. - https://lab.llvm.org/buildbot/#/builders/190/builds/42544 - https://lab.llvm.org/buildbot/#/builders/23/builds/19989 - https://logs.chromium.org/logs/fuchsia/buildbucket/cr-buildbucket/8682002400978909393/+/u/clang/test/stdout The original test hit a "file to small" error on macOS before hitting the expected "truncated or malformed archive" error. This patch updates the test to generate a valid archive then truncates it to 28 bytes so that its large enough to pass the initial size check but too small for the 60-byte member header, so it correctly hits the expected failure on all platforms. 
--- llvm/test/Object/zos-archive-read.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/test/Object/zos-archive-read.test b/llvm/test/Object/zos-archive-read.test index 8f01d13a09f36..2dc2d9e311903 100644 --- a/llvm/test/Object/zos-archive-read.test +++ b/llvm/test/Object/zos-archive-read.test @@ -63,7 +63,9 @@ ## Test that a truncated archive with incomplete member header is rejected. ## The z/OS magic is 8 bytes and a member header requires 60 bytes. ## Writing only 3 bytes after the magic is not enough for a valid header. -# RUN: printf '\x5A\x4C\x81\x99\x83\x88\x6E\x15' > %t.badhdr.a -# RUN: printf '\x00\x00\x00' >> %t.badhdr.a +## Test that a truncated archive with incomplete member header is rejected. +# RUN: %python %p/Inputs/generate_zos_archive.py --output %t.full.a \ +# RUN: --member foo.txt +# RUN: head -c 28 %t.full.a > %t.badhdr.a # RUN: not llvm-ar t %t.badhdr.a 2>&1 | FileCheck %s --check-prefix=ERR-CHILD # ERR-CHILD: truncated or malformed archive From c0792f37bed87e210468eb123c46db5965e4f981 Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Tue, 12 May 2026 15:07:54 -0700 Subject: [PATCH 522/538] [clang][NFC] Mark CWG941 as implemented and add a test (#197202) [CWG941](https://wg21.link/cwg941) allowed specializing deleted function templates. Clang accepted this between 2.7 and 2.9, regressed and started emitting redefinition errors between 3.0 and 3.8, then went back to accepting in 3.9: https://godbolt.org/z/GKnf9je7j. I've marked it as implemented since 3.9. 
--- clang/test/CXX/drs/cwg9xx.cpp | 28 ++++++++++++++++++++++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/cwg9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp index b5bcffc840725..f218ecddb9686 100644 --- a/clang/test/CXX/drs/cwg9xx.cpp +++ b/clang/test/CXX/drs/cwg9xx.cpp @@ -22,6 +22,34 @@ static_assert(alignof(int[][2]) == alignof(int[2]), ""); #endif } // namespace cwg930 +namespace cwg941 { // cwg941: 3.9 +#if __cplusplus >= 201103L +template +void f() = delete; + +template<> void f() {} + +template +struct A { + void f() = delete; + + template + void g() = delete; +}; + +template<> void A::f() {} + +template<> template<> void A::g() {} + +struct B { + template + void f() = delete; +}; + +template<> void B::f() {} +#endif +} // namespace cwg941 + namespace cwg948 { // cwg948: 3.7 #if __cplusplus >= 201103L class A { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 8663e224ee6ce..7215d276432b8 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -6364,7 +6364,7 @@

C++ defect report implementation status

[temp.expl.spec] C++11 Explicit specialization of deleted function template - Unknown + Clang 3.9 942 From a9c655df69284995967fb61553bef589a706598e Mon Sep 17 00:00:00 2001 From: Zachary Yedidia Date: Tue, 12 May 2026 15:35:47 -0700 Subject: [PATCH 523/538] [LFI] Report reserved register modification in error message (#195160) Reports the name of the modified reserved register in the error message. Updates the MCLFIRewriter error infrastructure to take a Twine for this. Also adds a warning function, which will be useful in future cases where the rewriter sees an unknown instruction/addressing mode, but will pass it through anyway. Fixes #192027. --- llvm/include/llvm/MC/MCLFIRewriter.h | 4 ++- llvm/lib/MC/MCLFIRewriter.cpp | 7 ++++- .../MCTargetDesc/AArch64MCLFIRewriter.cpp | 16 +++++++----- .../MCTargetDesc/AArch64MCLFIRewriter.h | 5 ++-- llvm/test/MC/AArch64/LFI/reserved.s | 26 +++++++++++-------- 5 files changed, 37 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/MC/MCLFIRewriter.h b/llvm/include/llvm/MC/MCLFIRewriter.h index 835df9a8f9208..33d51f29ce99e 100644 --- a/llvm/include/llvm/MC/MCLFIRewriter.h +++ b/llvm/include/llvm/MC/MCLFIRewriter.h @@ -25,6 +25,7 @@ class MCInst; class MCSubtargetInfo; class MCStreamer; class MCSymbol; +class Twine; class MCLFIRewriter { private: @@ -40,7 +41,8 @@ class MCLFIRewriter { std::unique_ptr &&II) : Ctx(Ctx), InstInfo(std::move(II)), RegInfo(std::move(RI)) {} - LLVM_ABI void error(const MCInst &Inst, const char Msg[]); + LLVM_ABI void error(const MCInst &Inst, const Twine &Msg); + LLVM_ABI void warning(const MCInst &Inst, const Twine &Msg); void disable() { Enabled = false; } void enable() { Enabled = true; } diff --git a/llvm/lib/MC/MCLFIRewriter.cpp b/llvm/lib/MC/MCLFIRewriter.cpp index 473b99bf8635b..7eeebbb900e90 100644 --- a/llvm/lib/MC/MCLFIRewriter.cpp +++ b/llvm/lib/MC/MCLFIRewriter.cpp @@ -13,16 +13,21 @@ //===----------------------------------------------------------------------===// #include 
"llvm/MC/MCLFIRewriter.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" using namespace llvm; -void MCLFIRewriter::error(const MCInst &Inst, const char Msg[]) { +void MCLFIRewriter::error(const MCInst &Inst, const Twine &Msg) { Ctx.reportError(Inst.getLoc(), Msg); } +void MCLFIRewriter::warning(const MCInst &Inst, const Twine &Msg) { + Ctx.reportWarning(Inst.getLoc(), Msg); +} + bool MCLFIRewriter::isCall(const MCInst &Inst) const { return InstInfo->get(Inst.getOpcode()).isCall(); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.cpp index 6066ddb7e59a9..a3d9c86629158 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -64,10 +65,12 @@ static bool isPrivilegedTPAccess(const MCInst &Inst) { return false; } -bool AArch64MCLFIRewriter::mayModifyReserved(const MCInst &Inst) const { - return mayModifyRegister(Inst, LFIAddrReg) || - mayModifyRegister(Inst, LFIBaseReg) || - mayModifyRegister(Inst, LFICtxReg); +MCRegister AArch64MCLFIRewriter::mayModifyReserved(const MCInst &Inst) const { + for (MCRegister Reg : {LFIAddrReg, LFIBaseReg, LFICtxReg}) { + if (mayModifyRegister(Inst, Reg)) + return Reg; + } + return {}; } void AArch64MCLFIRewriter::emitInst(const MCInst &Inst, MCStreamer &Out, @@ -216,8 +219,9 @@ void AArch64MCLFIRewriter::rewriteTPWrite(const MCInst &Inst, MCStreamer &Out, void AArch64MCLFIRewriter::doRewriteInst(const MCInst &Inst, MCStreamer &Out, const MCSubtargetInfo &STI) { // Reserved register modification is an error. 
- if (mayModifyReserved(Inst)) { - error(Inst, "illegal modification of reserved LFI register"); + if (MCRegister Reg = mayModifyReserved(Inst)) { + error(Inst, Twine("illegal modification of reserved LFI register ") + + RegInfo->getName(Reg)); return; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.h index 439e2fe4b7ef4..8af6e1b00a791 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCLFIRewriter.h @@ -50,8 +50,9 @@ class AArch64MCLFIRewriter : public MCLFIRewriter { /// Recursion guard to prevent infinite loops when emitting instructions. bool Guard = false; - // Instruction classification. - bool mayModifyReserved(const MCInst &Inst) const; + // Instruction classification. Returns the reserved register that may be + // modified, or an invalid register if no reserved register is touched. + MCRegister mayModifyReserved(const MCInst &Inst) const; // Instruction emission. 
void emitInst(const MCInst &Inst, MCStreamer &Out, diff --git a/llvm/test/MC/AArch64/LFI/reserved.s b/llvm/test/MC/AArch64/LFI/reserved.s index 8ad5e7c56bb9e..3e92195ef54f7 100644 --- a/llvm/test/MC/AArch64/LFI/reserved.s +++ b/llvm/test/MC/AArch64/LFI/reserved.s @@ -1,45 +1,49 @@ // RUN: not llvm-mc -triple aarch64_lfi %s 2>&1 | FileCheck %s mov x27, x0 -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X27 // CHECK: mov x27, x0 ldr x27, [x0] -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X27 // CHECK: ldr x27, [x0] add x27, x0, x1 -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X27 // CHECK: add x27, x0, x1 mov x28, x0 -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X28 // CHECK: mov x28, x0 ldr x28, [x0] -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X28 // CHECK: ldr x28, [x0] add x28, x0, x1 -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X28 // CHECK: add x28, x0, x1 ldp x27, x28, [x0] -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X28 // CHECK: ldp x27, x28, [x0] ldp x0, x27, [x1] -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X27 // CHECK: ldp x0, x27, [x1] ldp x28, x0, [x1] -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X28 // CHECK: ldp x28, x0, [x1] ldr x0, [x27], #8 -// CHECK: error: illegal modification of reserved LFI register +// CHECK: 
error: illegal modification of reserved LFI register X27 // CHECK: ldr x0, [x27], #8 ldr x0, [x28, #8]! -// CHECK: error: illegal modification of reserved LFI register +// CHECK: error: illegal modification of reserved LFI register X28 // CHECK: ldr x0, [x28, #8]! + +mov x25, x0 +// CHECK: error: illegal modification of reserved LFI register X25 +// CHECK: mov x25, x0 From 3471515cc73424ea44c478dc7e80e17e30e11b1a Mon Sep 17 00:00:00 2001 From: Hemant Kulkarni Date: Tue, 12 May 2026 17:44:08 -0500 Subject: [PATCH 524/538] [BOLT] Account for stubs with symbols in plt (#192716) LLD and bfd do not generate functions symbols for stubs in PLT. However, mold does and trips the object discovery to create two functions (BF then PLTFunc). This can cause symbol to be resoved with BF with incorrect ADRP immediate field in AArch64. The issue is described in more detail here: https://github.com/llvm/llvm-project/issues/192552 --- bolt/lib/Rewrite/RewriteInstance.cpp | 7 +++-- bolt/test/AArch64/plt-mold-func-symbols.s | 36 +++++++++++++++++++++++ bolt/test/X86/plt-mold.test | 2 +- 3 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 bolt/test/AArch64/plt-mold-func-symbols.s diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index b1fa65390c5e7..3d933cf24b458 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1041,8 +1041,11 @@ void RewriteInstance::discoverFileObjects() { FileSymRefs.emplace(SymbolAddress, Symbol); - // Skip section symbols that will be registered by disassemblePLT(). - if (SymbolType == SymbolRef::ST_Debug) { + // Skip symbols in PLT sections that will be registered by disassemblePLT(). + // ST_Debug covers section markers (lld/GNU ld), ST_Function covers + // explicit stub symbols emitted by mold (e.g., malloc$plt). 
+ if (SymbolType == SymbolRef::ST_Debug || + SymbolType == SymbolRef::ST_Function) { ErrorOr BSection = BC->getSectionForAddress(SymbolAddress); if (BSection && getPLTSectionInfo(BSection->getName())) diff --git a/bolt/test/AArch64/plt-mold-func-symbols.s b/bolt/test/AArch64/plt-mold-func-symbols.s new file mode 100644 index 0000000000000..3c710ae23d48b --- /dev/null +++ b/bolt/test/AArch64/plt-mold-func-symbols.s @@ -0,0 +1,36 @@ +## Test that BOLT correctly handles mold-style STT_FUNC symbols in PLT +## sections. + +# REQUIRES: system-linux + +## Build a shared library from the common stubs. +# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t.so + +## Build and link the main binary. The linker creates a real PLT entry. +# RUN: llvm-mc -filetype=obj -triple aarch64-linux %s -o %t.o +# RUN: ld.lld -pie %t.o %t.so -o %t.exe --emit-relocs + +## Inject a mold-style STT_FUNC symbol at the PLT entry for printf. +## Mold places "printf$plt" directly on the PLT stub; we simulate this +## with llvm-objcopy. The PLT header is 32 bytes on AArch64, and each +## entry is 16 bytes. printf is the only entry after the header. +# RUN: llvm-objcopy --add-symbol "printf\$plt=.plt:32,function,local" \ +# RUN: %t.exe %t + +## Verify BOLT resolves the call as printf@PLT, not printf$plt. +# RUN: llvm-bolt %t -o %t.bolt --print-cfg --print-only=_start 2>&1 \ +# RUN: | FileCheck %s +# RUN: llvm-readobj --syms %t.bolt | grep -A7 printf$plt | FileCheck \ +# RUN: %s --check-prefix=CHECK-SYM + +# CHECK: bl printf@PLT +# CHECK-NOT: printf$plt +# CHECK-SYM: Section: .plt + + .text + .globl _start + .type _start, %function +_start: + bl printf + ret + .size _start, .-_start diff --git a/bolt/test/X86/plt-mold.test b/bolt/test/X86/plt-mold.test index 75c8c023cf3c2..8ef21e87b3453 100644 --- a/bolt/test/X86/plt-mold.test +++ b/bolt/test/X86/plt-mold.test @@ -3,4 +3,4 @@ ## Check that llvm-bolt correctly parses PLT created by mold linker. 
## The only call instruction in main() should be a call to printf() in PLT. -CHECK: callq "printf$plt +CHECK: callq printf@PLT From 2b2a63819f9f26d661bad5c269a03077d22ff6b4 Mon Sep 17 00:00:00 2001 From: lntue Date: Tue, 12 May 2026 18:47:10 -0400 Subject: [PATCH 525/538] [APFloat] Add exp functions for single and double using exp/expf implementations from LLVM libc. (#190667) This reapplies #143959 with some changes: - Only support default rounding modes for now. Other rounding modes will wait for proper static rounding implementations in LLVM libc. - Add both single and double precision exp. --- llvm/CMakeLists.txt | 4 ++ llvm/include/llvm/ADT/APFloat.h | 4 ++ llvm/lib/Support/APFloat.cpp | 31 ++++++++++++ llvm/lib/Support/CMakeLists.txt | 5 ++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 2 +- llvm/unittests/ADT/APFloatTest.cpp | 58 +++++++++++++++++++++++ 6 files changed, 103 insertions(+), 1 deletion(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 6001928f92e37..4509fbaba1d25 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -700,6 +700,10 @@ endif() set(LLVM_ENABLE_Z3_SOLVER_DEFAULT "${Z3_FOUND}") +include(FindLibcCommonUtils) +if(NOT TARGET llvm-libc-common-utilities) + message(FATAL_ERROR "LLVM libc is not found at ${libc_path}.") +endif() if( LLVM_TARGETS_TO_BUILD STREQUAL "all" ) set( LLVM_TARGETS_TO_BUILD ${LLVM_ALL_TARGETS} ) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 4e5a40d241d36..07e509baa5f1c 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -1760,6 +1760,10 @@ inline APFloat maximumnum(const APFloat &A, const APFloat &B) { return A < B ? 
B : A; } +/// Implement IEEE 754-2019 exp functions +LLVM_READONLY +APFloat exp(const APFloat &X, RoundingMode RM = APFloat::rmNearestTiesToEven); + inline raw_ostream &operator<<(raw_ostream &OS, const APFloat &V) { V.print(OS); return OS; diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 013a7cc2d9941..ee3e0fee5b4bf 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -29,6 +29,19 @@ #include #include +/// Shared headers from LLVM libc +/// Make sure to add ${LLVM_SOURCE_DIR}/../libc to include directories. +/// +/// Notes: So far it looks like APFloat does not check errnos or floating-point +/// exceptions after calling the math functions, so we will configure LLVM libc +/// math functions to skip setting errnos and floating-point exceptions +/// explicitly. We also put them in a separate namespace so that the symbols +/// do not clash with other libc math builds just in case. +#define LIBC_NAMESPACE __llvm_libc_apfloat +#define LIBC_MATH (LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT) + +#include "shared/math.h" + #define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL) \ do { \ if (usesLayout(getSemantics())) \ @@ -6080,6 +6093,24 @@ APFloat::Storage &APFloat::Storage::operator=(APFloat::Storage &&RHS) { return *this; } +// TODO: Support other rounding modes when LLVM libc math implement static +// roundings. 
+APFloat exp(const APFloat &X, RoundingMode rounding_mode) { + if (rounding_mode == APFloatBase::rmNearestTiesToEven) { + if (APFloat::SemanticsToEnum(X.getSemantics()) == + APFloatBase::S_IEEEsingle) { + float result = LIBC_NAMESPACE::shared::expf(X.convertToFloat()); + return APFloat(result); + } + if (APFloat::SemanticsToEnum(X.getSemantics()) == + APFloatBase::S_IEEEdouble) { + double result = LIBC_NAMESPACE::shared::exp(X.convertToDouble()); + return APFloat(result); + } + } + llvm_unreachable("Unexpected semantics"); +} + } // namespace llvm #undef APFLOAT_DISPATCH_ON_SEMANTICS diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 100cfb567c348..c3f5961b6584d 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -399,7 +399,12 @@ endif() target_include_directories(LLVMSupport PRIVATE ${LLVM_THIRD_PARTY_DIR}/siphash/include + ${LLVM_SOURCE_DIR}/../libc ) +if(NOT MSVC) + target_compile_options(LLVMSupport PRIVATE "-Wno-c99-extensions") # _Complex warnings. +endif() + # SupportLSP depends on Support and therefore must be included afterwards. 
add_subdirectory(LSP) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index dd4650245ca77..f8a632566bf29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1868,7 +1868,7 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, return true; case AMDGPULibFunc::EI_EXP: - Res0 = APFloat{exp(opr0)}; + Res0 = APFloat{std::exp(opr0)}; return true; case AMDGPULibFunc::EI_EXP2: diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index b14afa5895ea3..b25932cd1d94b 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -10228,4 +10228,62 @@ TEST(APFloatTest, DecimalStringPreservesInexactStatus) { EXPECT_EQ(F.bitcastToAPInt(), Expected.bitcastToAPInt()); } +TEST(APFloatTest, expf) { + // exp(+-0) = 1. + EXPECT_EQ(1.0f, llvm::exp(APFloat(0.0f)).convertToFloat()); + EXPECT_EQ(1.0f, llvm::exp(APFloat(-0.0f)).convertToFloat()); + // exp(+Inf) = +Inf. + EXPECT_EQ(std::numeric_limits::infinity(), + llvm::exp(APFloat::getInf(APFloat::IEEEsingle(), false)) + .convertToFloat()); + // exp(-Inf) = 0. + EXPECT_EQ( + 0.0f, + llvm::exp(APFloat::getInf(APFloat::IEEEsingle(), true)).convertToFloat()); + // exp(NaN) = NaN. + EXPECT_TRUE(llvm::exp(APFloat::getNaN(APFloat::IEEEsingle())).isNaN()); + // exp(1) + EXPECT_EQ(0x1.5bf0a8p1f, llvm::exp(APFloat(1.0f)).convertToFloat()); + // exp(float max) + EXPECT_EQ(std::numeric_limits::infinity(), + llvm::exp(APFloat::getLargest(APFloat::IEEEsingle(), false)) + .convertToFloat()); + // exp(min_denormal) + EXPECT_EQ(1.0f, llvm::exp(APFloat::getSmallest(APFloat::IEEEsingle(), false)) + .convertToFloat()); + // exp(-1) + EXPECT_EQ(0x1.78b564p-2f, llvm::exp(APFloat(-1.0f)).convertToFloat()); + // exp(-90) + EXPECT_EQ(0x1.1d85p-130f, llvm::exp(APFloat(-90.0f)).convertToFloat()); +} + +TEST(APFloatTest, exp) { + // exp(+-0) = 1. 
+ EXPECT_EQ(1.0, llvm::exp(APFloat(0.0)).convertToDouble()); + EXPECT_EQ(1.0, llvm::exp(APFloat(-0.0)).convertToDouble()); + // exp(+Inf) = +Inf. + EXPECT_EQ(std::numeric_limits::infinity(), + llvm::exp(APFloat::getInf(APFloat::IEEEdouble(), false)) + .convertToDouble()); + // exp(-Inf) = 0. + EXPECT_EQ(0.0, llvm::exp(APFloat::getInf(APFloat::IEEEdouble(), true)) + .convertToDouble()); + // exp(NaN) = NaN. + EXPECT_TRUE(llvm::exp(APFloat::getNaN(APFloat::IEEEdouble())).isNaN()); + // exp(1) + EXPECT_EQ(0x1.5bf0a8b145769p1, llvm::exp(APFloat(1.0)).convertToDouble()); + // exp(float max) + EXPECT_EQ(std::numeric_limits::infinity(), + llvm::exp(APFloat::getLargest(APFloat::IEEEdouble(), false)) + .convertToDouble()); + // exp(min_denormal) + EXPECT_EQ(1.0, llvm::exp(APFloat::getSmallest(APFloat::IEEEdouble(), false)) + .convertToDouble()); + // exp(-1) + EXPECT_EQ(0x1.78b56362cef38p-2, llvm::exp(APFloat(-1.0)).convertToDouble()); + // exp(-710) + EXPECT_EQ(0x1.9c017e9459e18p-1025, + llvm::exp(APFloat(-710.0)).convertToDouble()); +} + } // namespace From 7c48e49ce039aeea4ff1a42638f6b6a186aa72ff Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Wed, 13 May 2026 00:04:43 +0100 Subject: [PATCH 526/538] [lldb][windows] mark test_overrides_resolver_resolver_cmd as XFAIL (#197285) Follow up to https://github.com/llvm/llvm-project/pull/195392 to mark `test_overrides_resolver_resolver_cmd` as XFAIL on Windows, like `test_overrides_resolver_resolver_python`. 
--- .../scripted_bkpt/overrides_resolver/TestOverridesResolver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/TestOverridesResolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/TestOverridesResolver.py index 24ff019831992..dda9822f550d1 100644 --- a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/TestOverridesResolver.py +++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/overrides_resolver/TestOverridesResolver.py @@ -18,6 +18,7 @@ def test_overrides_resolver_resolver_python(self): self.build() self.do_test(True) + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528") def test_overrides_resolver_resolver_cmd(self): """Use facade breakpoints to emulate hitting some locations""" self.build() From 84edb835557aef48bf7c1b2116a055938598d01d Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Wed, 13 May 2026 00:12:48 +0100 Subject: [PATCH 527/538] [DirectX][NFC] Move DICompileUnit conversion into DXILDebugInfo (#196451) In #192574, I added logic for changing versioned language names to unversioned language names, but did so directly in DXILBitcodeWriter. This is better done in DXILDebugInfo instead so that, in a future change, we can check the result of DXILDebugInfo in tests. 
--- .../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 8 +------- .../DirectX/DirectXIRPasses/DXILDebugInfo.cpp | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 15eb886574e6e..984ddf8f334b7 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1528,13 +1528,7 @@ void DXILBitcodeWriter::writeDICompileUnit(const DICompileUnit *N, SmallVectorImpl &Record, unsigned Abbrev) { Record.push_back(N->isDistinct()); - DISourceLanguageName Lang = N->getSourceLanguage(); - if (Lang.hasVersionedName()) { - auto LangName = static_cast(Lang.getName()); - Lang = dwarf::toDW_LANG(LangName, Lang.getVersion()) - .value_or(dwarf::SourceLanguage{}); - } - Record.push_back(Lang.getUnversionedName()); + Record.push_back(N->getSourceLanguage().getUnversionedName()); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(VE.getMetadataOrNullID(N->getRawProducer())); Record.push_back(N->isOptimized()); diff --git a/llvm/lib/Target/DirectX/DirectXIRPasses/DXILDebugInfo.cpp b/llvm/lib/Target/DirectX/DirectXIRPasses/DXILDebugInfo.cpp index 650338949d19a..1e638d0195327 100644 --- a/llvm/lib/Target/DirectX/DirectXIRPasses/DXILDebugInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXIRPasses/DXILDebugInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "DXILDebugInfo.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Module.h" @@ -20,5 +21,24 @@ DXILDebugInfoMap DXILDebugInfoPass::run(Module &M) { DebugInfoFinder DIF; DIF.processModule(M); + for (DICompileUnit *CU : DIF.compile_units()) { + DISourceLanguageName Lang = CU->getSourceLanguage(); + if (Lang.hasVersionedName()) { + auto LangName = static_cast(Lang.getName()); + Lang = 
dwarf::toDW_LANG(LangName, Lang.getVersion()) + .value_or(dwarf::SourceLanguage{}); + auto *NewCU = DICompileUnit::getDistinct( + M.getContext(), Lang, CU->getFile(), CU->getProducer(), + CU->isOptimized(), CU->getFlags(), CU->getRuntimeVersion(), + CU->getSplitDebugFilename(), CU->getEmissionKind(), + CU->getEnumTypes(), CU->getRetainedTypes(), CU->getGlobalVariables(), + CU->getImportedEntities(), CU->getMacros(), CU->getDWOId(), + CU->getSplitDebugInlining(), CU->getDebugInfoForProfiling(), + CU->getNameTableKind(), CU->getRangesBaseAddress(), CU->getSysRoot(), + CU->getSDK()); + Res.MDReplace.insert({CU, NewCU}); + } + } + return Res; } From d3831ef1eb178caea5298ad945369ee9eb7670f0 Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Tue, 12 May 2026 23:40:59 +0000 Subject: [PATCH 528/538] [scudo] Add a generic allocation fuzzer. (#197317) The fuzzer disabled memory tagging if supported since there is an unknown problem with trying to create an allocator instance and doing raw allocate/deallocate calls. --- .../lib/scudo/standalone/fuzz/CMakeLists.txt | 11 +++ .../standalone/fuzz/allocator_fuzzer.cpp | 95 +++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 compiler-rt/lib/scudo/standalone/fuzz/allocator_fuzzer.cpp diff --git a/compiler-rt/lib/scudo/standalone/fuzz/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/fuzz/CMakeLists.txt index d29c2f2fe7493..293aed2f8e8e7 100644 --- a/compiler-rt/lib/scudo/standalone/fuzz/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/fuzz/CMakeLists.txt @@ -9,4 +9,15 @@ if (LLVM_USE_SANITIZE_COVERAGE) get_error_info_fuzzer PROPERTIES LINK_FLAGS -fsanitize=fuzzer) target_include_directories( get_error_info_fuzzer PRIVATE .. 
../include) + + add_executable(get_error_info_fuzzer + allocator_fuzzer.cpp) + set_target_properties( + allocator_fuzzer PROPERTIES FOLDER "Fuzzers") + target_compile_options( + allocator_fuzzer PRIVATE -fsanitize=fuzzer) + set_target_properties( + allocator_fuzzer PROPERTIES LINK_FLAGS -fsanitize=fuzzer) + target_include_directories( + allocator_fuzzer PRIVATE .. ../include) endif() diff --git a/compiler-rt/lib/scudo/standalone/fuzz/allocator_fuzzer.cpp b/compiler-rt/lib/scudo/standalone/fuzz/allocator_fuzzer.cpp new file mode 100644 index 0000000000000..c5a339c15e634 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/fuzz/allocator_fuzzer.cpp @@ -0,0 +1,95 @@ +//===-- allocator_fuzzer.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SCUDO_FUZZ +#include "allocator_config.h" +#include "combined.h" +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + using AllocatorT = scudo::Allocator; + static AllocatorT *Instance = []() { + auto *A = new AllocatorT(); + A->init(); + // The way we are using the allocator doesn't work properly with MTE + // enabled. 
+ if (scudo::systemSupportsMemoryTagging()) + A->disableMemoryTagging(); + return A; + }(); + + FuzzedDataProvider FDP(Data, Size); + std::vector Allocations; + + Instance->setOption(scudo::Option::ReleaseInterval, 1000); + constexpr size_t kMaxAllocatedBytes = 50 * 1024 * 1024; + size_t TotalAllocatedBytes = 0; + while (FDP.remaining_bytes() > 0) { + uint8_t Op = FDP.ConsumeIntegralInRange(0, 4); + if ((Op == 0 || Op == 1) && TotalAllocatedBytes < kMaxAllocatedBytes) { + size_t ReqSize = + FDP.ConsumeIntegralInRange(1, 1 << 20); // Up to 1MB + void *Ptr; + if (Op == 0) { + // Allocate no alignment + Ptr = Instance->allocate(ReqSize, scudo::Chunk::Origin::Malloc); + } else { + // Allocate with alignment + size_t Alignment = + 1 << FDP.ConsumeIntegralInRange(4, 12); // 16 to 4096 + Ptr = Instance->allocate(ReqSize, scudo::Chunk::Origin::Memalign, + Alignment); + CHECK_EQ(0, reinterpret_cast(Ptr) & (Alignment - 1)); + } + CHECK(Ptr != nullptr); + size_t Size = Instance->getUsableSize(Ptr); + TotalAllocatedBytes += Size; + Allocations.push_back(Ptr); + memset(Ptr, 0xff, Size); + } else if (Op == 2 && !Allocations.empty()) { + // Deallocate + size_t Index = + FDP.ConsumeIntegralInRange(0, Allocations.size() - 1); + TotalAllocatedBytes -= Instance->getUsableSize(Allocations[Index]); + Instance->deallocate(Allocations[Index], scudo::Chunk::Origin::Malloc); + Allocations.erase(Allocations.begin() + Index); + } else if (Op == 3 && !Allocations.empty()) { + // Reallocate (Assumes reallocate of a memalign does not crash). 
+ size_t Index = + FDP.ConsumeIntegralInRange(0, Allocations.size() - 1); + size_t OldSize = Instance->getUsableSize(Allocations[Index]); + TotalAllocatedBytes -= OldSize; + size_t NewSize = FDP.ConsumeIntegralInRange(1, 1 << 20); + void *NewPtr = Instance->reallocate(Allocations[Index], NewSize); + if (NewSize == 0) { + CHECK(NewPtr == nullptr); + Allocations.erase(Allocations.begin() + Index); + } else { + CHECK(NewPtr != nullptr); + size_t Size = Instance->getUsableSize(NewPtr); + memset(NewPtr, 0xff, Size); + Allocations[Index] = NewPtr; + TotalAllocatedBytes -= Size; + } + } else if (Op == 4) { + // ReleaseToOS + scudo::ReleaseToOS ReleaseType = + static_cast(FDP.ConsumeIntegralInRange( + 0, static_cast(scudo::ReleaseToOS::Last))); + Instance->releaseToOS(ReleaseType); + } + } + + // Cleanup remaining + for (void *Ptr : Allocations) { + Instance->deallocate(Ptr, scudo::Chunk::Origin::Malloc); + } + + return 0; +} From 0edfd11b360de8d5e579febbcb7012ec6fa22eaf Mon Sep 17 00:00:00 2001 From: lntue Date: Tue, 12 May 2026 19:52:24 -0400 Subject: [PATCH 529/538] [libc] Temporarily disable iscanonical, isnan, issignaling in shared/math.h (#197328) These clashes with same-name macros defined in . 
--- libc/shared/math.h | 9 ++++++--- libc/test/shared/shared_math_constexpr_test.cpp | 9 ++++++--- libc/test/shared/shared_math_test.cpp | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/libc/shared/math.h b/libc/shared/math.h index 73005f99a2db4..2baeb07294a3b 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -281,16 +281,19 @@ #include "math/ilogbf128.h" #include "math/ilogbf16.h" #include "math/ilogbl.h" -#include "math/iscanonical.h" +// TODO: iscanonical is a macro in +// #include "math/iscanonical.h" #include "math/iscanonicalbf16.h" #include "math/iscanonicalf.h" #include "math/iscanonicalf128.h" #include "math/iscanonicalf16.h" #include "math/iscanonicall.h" -#include "math/isnan.h" +// TODO: isnan is a macro in +// #include "math/isnan.h" #include "math/isnanf.h" #include "math/isnanl.h" -#include "math/issignaling.h" +// TODO: issignaling is a macro in +// #include "math/issignaling.h" #include "math/issignalingbf16.h" #include "math/issignalingf.h" #include "math/issignalingf128.h" diff --git a/libc/test/shared/shared_math_constexpr_test.cpp b/libc/test/shared/shared_math_constexpr_test.cpp index 087a0cf9ba530..eecf304f053f1 100644 --- a/libc/test/shared/shared_math_constexpr_test.cpp +++ b/libc/test/shared/shared_math_constexpr_test.cpp @@ -82,8 +82,10 @@ static_assert(0L == LIBC_NAMESPACE::shared::lround(0.0)); static_assert(0.0 == LIBC_NAMESPACE::shared::nearbyint(0.0)); static_assert(0.0 == LIBC_NAMESPACE::shared::nextafter(0.0, 0.0)); static_assert(0.0 == LIBC_NAMESPACE::shared::rint(0.0)); -static_assert(1 == LIBC_NAMESPACE::shared::iscanonical(0.0)); -static_assert(0.0 == LIBC_NAMESPACE::shared::issignaling(0.0)); +// TODO: iscanonical clashes with a macro defined in +// static_assert(1 == LIBC_NAMESPACE::shared::iscanonical(0.0)); +// TODO: issignaling clashes with a macro defined in +// static_assert(0.0 == LIBC_NAMESPACE::shared::issignaling(0.0)); static_assert(1 == [] { const char arg{}; return 
LIBC_NAMESPACE::fputil::FPBits( @@ -93,7 +95,8 @@ static_assert(1 == [] { static_assert(0.0 == LIBC_NAMESPACE::shared::round(0.0)); static_assert(0.0 == LIBC_NAMESPACE::shared::roundeven(0.0)); static_assert(0.0 == LIBC_NAMESPACE::shared::trunc(0.0)); -static_assert(0 == LIBC_NAMESPACE::shared::isnan(0.0)); +// TODO: isnan clashes with a macro defined in +// static_assert(0 == LIBC_NAMESPACE::shared::isnan(0.0)); //===----------------------------------------------------------------------===// // Float Tests diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index 6257ab64da666..634778380dc8e 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -442,13 +442,16 @@ TEST(LlvmLibcSharedMathTest, AllDouble) { EXPECT_EQ(0LL, LIBC_NAMESPACE::shared::llround(0.0)); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::nearbyint(0.0)); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::rint(0.0)); - EXPECT_EQ(1, LIBC_NAMESPACE::shared::iscanonical(0.0L)); - EXPECT_EQ(0, LIBC_NAMESPACE::shared::issignaling(0.0)); + // TODO: iscanonical clashes with a macro defined in + // EXPECT_EQ(1, LIBC_NAMESPACE::shared::iscanonical(0.0)); + // TODO: issignaling clashes with a macro defined in + // EXPECT_EQ(0, LIBC_NAMESPACE::shared::issignaling(0.0)); EXPECT_TRUE(FPBits(LIBC_NAMESPACE::shared::nan("")).is_nan()); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::round(0.0)); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::roundeven(0.0)); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::trunc(0.0)); - EXPECT_EQ(0, LIBC_NAMESPACE::shared::isnan(0.0)); + // TODO: isnan clashes with a macro defined in + // EXPECT_EQ(0, LIBC_NAMESPACE::shared::isnan(0.0)); } // TODO: Enable the tests when double-double type is supported. 
From acb7c448a31789e55bd134e1abfa106b427c2a9c Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Wed, 13 May 2026 02:55:29 +0300 Subject: [PATCH 530/538] [libc][NFC] Add `is_constant_evaluated` support for GCC9 (#197327) --- libc/src/__support/macros/attributes.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libc/src/__support/macros/attributes.h b/libc/src/__support/macros/attributes.h index cc28e7e7cbcf1..96b96dca96c82 100644 --- a/libc/src/__support/macros/attributes.h +++ b/libc/src/__support/macros/attributes.h @@ -30,8 +30,14 @@ #define LIBC_UNUSED __attribute__((unused)) #ifndef LIBC_HAS_BUILTIN_IS_CONSTANT_EVALUATED +#if (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 900)) || \ + (defined(LIBC_COMPILER_IS_CLANG) && LIBC_COMPILER_CLANG_VER >= 900) +#define LIBC_HAS_BUILTIN_IS_CONSTANT_EVALUATED 1 +#else #define LIBC_HAS_BUILTIN_IS_CONSTANT_EVALUATED \ (__has_builtin(__builtin_is_constant_evaluated)) +#endif // (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 900)) || + // (defined(LIBC_COMPILER_IS_CLANG) && LIBC_COMPILER_CLANG #endif // LIBC_HAS_BUILTIN_IS_CONSTANT_EVALUATED // TODO: Remove the macro once Clang/LLVM bump their minimum compilers' version. 
@@ -42,7 +48,7 @@ #if LIBC_ENABLE_CONSTEXPR && \ (LIBC_HAS_BUILTIN_IS_CONSTANT_EVALUATED || \ (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 900)) || \ - (defined(LIBC_COMPILER_IS_CLANG) && LIBC_COMPILER_CLANG_VER >= 1100)) + (defined(LIBC_COMPILER_IS_CLANG) && LIBC_COMPILER_CLANG_VER >= 900)) #define LIBC_HAS_CONSTANT_EVALUATION #define LIBC_CONSTEXPR constexpr #else From b239b5c07145df1fc3503bcde58481e21ba265a1 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Wed, 13 May 2026 02:13:23 +0200 Subject: [PATCH 531/538] [DirectX][ObjectYAML] Add ILDN part support (#194508) Add support for DXContainer ILDN part in the ObjectYAML pipeline so it can be represented in structured YAML and round-tripped through yaml2obj/obj2yaml. ILDN part is meant to store the name of PDB file that contains shader debug info. --- llvm/include/llvm/BinaryFormat/DXContainer.h | 13 ++++ .../BinaryFormat/DXContainerConstants.def | 1 + llvm/include/llvm/MC/DXContainerInfo.h | 32 ++++++++++ llvm/include/llvm/Object/DXContainer.h | 5 ++ .../include/llvm/ObjectYAML/DXContainerYAML.h | 11 ++++ llvm/lib/MC/CMakeLists.txt | 1 + llvm/lib/MC/DXContainerInfo.cpp | 41 ++++++++++++ llvm/lib/Object/DXContainer.cpp | 48 +++++++++++++- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 16 +++++ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 8 +++ .../DXContainer/ILDNPart-compute-flags.yaml | 26 ++++++++ .../DXContainer/ILDNPart-compute-length.yaml | 26 ++++++++ .../DXContainer/ILDNPart-compute.yaml | 25 ++++++++ .../tools/obj2yaml/DXContainer/ILDNPart.yaml | 64 +++++++++++++++++++ llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 9 +++ llvm/unittests/Object/DXContainerTest.cpp | 36 +++++++++++ .../ObjectYAML/DXContainerYAMLTest.cpp | 33 ++++++++++ 17 files changed, 393 insertions(+), 2 deletions(-) create mode 100644 llvm/include/llvm/MC/DXContainerInfo.h create mode 100644 llvm/lib/MC/DXContainerInfo.cpp create mode 100644 llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-flags.yaml create mode 
100644 llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-length.yaml create mode 100644 llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute.yaml create mode 100644 llvm/test/tools/obj2yaml/DXContainer/ILDNPart.yaml diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 99bd32a2ff87a..ae572ae7d965f 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -805,6 +805,19 @@ enum class RootSignatureVersion { V1_2 = 0x3, }; +struct DebugNameHeader { + uint16_t Flags; + /// Debug file name length, without null terminator. + uint16_t NameLength; + + void swapBytes() { + sys::swapByteOrder(Flags); + sys::swapByteOrder(NameLength); + } +}; + +static_assert(sizeof(DebugNameHeader) == 4, "DebugNameHeader size incorrect."); + } // namespace dxbc } // namespace llvm diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index f576d958037cd..4c5070d18578c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -1,6 +1,7 @@ #ifdef CONTAINER_PART CONTAINER_PART(DXIL) +CONTAINER_PART(ILDN) CONTAINER_PART(SFI0) CONTAINER_PART(HASH) CONTAINER_PART(PSV0) diff --git a/llvm/include/llvm/MC/DXContainerInfo.h b/llvm/include/llvm/MC/DXContainerInfo.h new file mode 100644 index 0000000000000..78d4b4da45558 --- /dev/null +++ b/llvm/include/llvm/MC/DXContainerInfo.h @@ -0,0 +1,32 @@ +//===----- llvm/MC/DXContainerInfo.h - DXContainer Info ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_DXCONTAINERINFO_H +#define LLVM_MC_DXCONTAINERINFO_H + +#include "llvm/Object/DXContainer.h" + +namespace llvm { + +class raw_ostream; + +namespace mcdxbc { + +struct DebugName { + object::DXContainer::ILDNData BaseData; + + DebugName() { BaseData.first.Flags = 0; } + + void setFileName(StringRef FileName); + void write(raw_ostream &OS) const; +}; + +} // namespace mcdxbc +} // namespace llvm + +#endif // LLVM_MC_DXCONTAINERINFO_H diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index c5888b87d6ad7..f2ce39770b097 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -460,6 +460,7 @@ class Signature { class DXContainer { public: using DXILData = std::pair; + using ILDNData = std::pair; private: DXContainer(MemoryBufferRef O); @@ -475,10 +476,12 @@ class DXContainer { DirectX::Signature InputSignature; DirectX::Signature OutputSignature; DirectX::Signature PatchConstantSignature; + std::optional DebugName; Error parseHeader(); Error parsePartOffsets(); Error parseDXILHeader(StringRef Part); + Error parseDebugName(StringRef Part); Error parseShaderFeatureFlags(StringRef Part); Error parseHash(StringRef Part); Error parseRootSignature(StringRef Part); @@ -563,6 +566,8 @@ class DXContainer { const std::optional &getDXIL() const { return DXIL; } + const std::optional getDebugName() const { return DebugName; } + std::optional getShaderFeatureFlags() const { return ShaderFeatureFlags; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index e95e047f546d7..6f62df54cd2bb 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -296,6 +296,12 @@ struct Signature { llvm::SmallVector Parameters; }; +struct 
DebugName { + std::optional Flags; + std::optional NameLength; + std::string DebugName; +}; + struct Part { Part() = default; Part(std::string N, uint32_t S) : Name(N), Size(S) {} @@ -307,6 +313,7 @@ struct Part { std::optional Info; std::optional Signature; std::optional RootSignature; + std::optional DebugName; }; struct Object { @@ -373,6 +380,10 @@ template <> struct MappingTraits { LLVM_ABI static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV); }; +template <> struct MappingTraits { + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::DebugName &DebugName); +}; + template <> struct MappingTraits { LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Part &Version); }; diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index 7a9e26af415c6..b85cbaa08a653 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMMC ConstantPools.cpp + DXContainerInfo.cpp DXContainerPSVInfo.cpp DXContainerRootSignature.cpp ELFObjectWriter.cpp diff --git a/llvm/lib/MC/DXContainerInfo.cpp b/llvm/lib/MC/DXContainerInfo.cpp new file mode 100644 index 0000000000000..f32a0fe3a7cdf --- /dev/null +++ b/llvm/lib/MC/DXContainerInfo.cpp @@ -0,0 +1,41 @@ +//===- llvm/MC/DXContainerInfo.cpp - DXContainer Info -----*- C++ -------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/DXContainerInfo.h" +#include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/DXContainer.h" +#include "llvm/Support/SwapByteOrder.h" +#include + +using namespace llvm; +using namespace llvm::mcdxbc; + +template +static void writeStruct(raw_ostream &OS, StructT S) { + static_assert(std::is_class() && + "This method must be used for writing structure types"); + if (sys::IsBigEndianHost) + S.swapBytes(); + OS.write(reinterpret_cast(&S), sizeof(StructT)); +} + +static void writeString(raw_ostream &OS, StringRef S) { + OS.write(S.data(), S.size()); + // Write null terminator. + OS.write_zeros(1); +} + +void DebugName::setFileName(StringRef DebugFileName) { + BaseData.first.NameLength = DebugFileName.size(); + BaseData.second = DebugFileName; +} + +void DebugName::write(raw_ostream &OS) const { + writeStruct(OS, BaseData.first); + writeString(OS, BaseData.second.substr(0, BaseData.first.NameLength)); +} diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 7b7b8d88c63fc..713958b9252ad 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -20,10 +20,14 @@ static Error parseFailed(const Twine &Msg) { return make_error(Msg.str(), object_error::parse_failed); } +static bool readIsOutOfBounds(StringRef Buffer, const char *Src, size_t Size) { + return Src < Buffer.begin() || Src + Size > Buffer.end(); +} + template static Error readStruct(StringRef Buffer, const char *Src, T &Struct) { // Don't read before the beginning or past the end of the file - if (Src < Buffer.begin() || Src + sizeof(T) > Buffer.end()) + if (readIsOutOfBounds(Buffer, Src, sizeof(T))) return parseFailed("Reading structure out of file bounds"); memcpy(&Struct, Src, sizeof(T)); @@ -39,7 +43,7 @@ static Error readInteger(StringRef Buffer, const char *Src, T &Val, 
static_assert(std::is_integral_v, "Cannot call readInteger on non-integral type."); // Don't read before the beginning or past the end of the file - if (Src < Buffer.begin() || Src + sizeof(T) > Buffer.end()) + if (readIsOutOfBounds(Buffer, Src, sizeof(T))) return parseFailed(Twine("Reading ") + Str + " out of file bounds"); // The DXContainer offset table is comprised of uint32_t values but not padded @@ -55,6 +59,22 @@ static Error readInteger(StringRef Buffer, const char *Src, T &Val, return Error::success(); } +static Error readString(StringRef Buffer, const char *&Src, size_t MaxSize, + StringRef &Val, Twine Desc) { + if (readIsOutOfBounds(Buffer, Src, MaxSize)) + return parseFailed(Desc + " is out of file bounds"); + + // Ensure that the null-terminator is somewhere within MaxSize bytes. + Buffer = Buffer.substr(Src - Buffer.data(), MaxSize); + size_t Length = Buffer.find('\0'); + if (Length == Buffer.npos) + return parseFailed(Desc + " does not end with null-terminator"); + + Val = StringRef(Buffer.data(), Length); + Src += Length + 1; + return Error::success(); +} + DXContainer::DXContainer(MemoryBufferRef O) : Data(O) {} Error DXContainer::parseHeader() { @@ -73,6 +93,26 @@ Error DXContainer::parseDXILHeader(StringRef Part) { return Error::success(); } +Error DXContainer::parseDebugName(StringRef Part) { + if (DebugName) + return parseFailed("more than one ILDN part is present in the file"); + const char *Current = Part.begin(); + dxbc::DebugNameHeader Header; + if (Error Err = readStruct(Part, Current, Header)) + return Err; + Current += sizeof(Header); + + StringRef Name; + if (Error Err = readString(Part, Current, Header.NameLength + 1, Name, + "debug file name")) + return Err; + if (Name.size() != Header.NameLength) + return parseFailed("debug file name length mismatch"); + DebugName.emplace(Header, Name.data()); + + return Error::success(); +} + Error DXContainer::parseShaderFeatureFlags(StringRef Part) { if (ShaderFeatureFlags) return 
parseFailed("More than one SFI0 part is present in the file"); @@ -177,6 +217,10 @@ Error DXContainer::parsePartOffsets() { if (Error Err = parseDXILHeader(PartData)) return Err; break; + case dxbc::PartType::ILDN: + if (Error Err = parseDebugName(PartData)) + return Err; + break; case dxbc::PartType::SFI0: if (Error Err = parseShaderFeatureFlags(PartData)) return Err; diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 312c74befd752..05937999991b6 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/MC/DXContainerInfo.h" #include "llvm/MC/DXContainerPSVInfo.h" #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/ObjectYAML/ObjectYAML.h" @@ -172,6 +173,21 @@ Error DXContainerWriter::writeParts(raw_ostream &OS) { } break; } + case dxbc::PartType::ILDN: { + if (!P.DebugName) + continue; + + mcdxbc::DebugName DebugName; + DebugName.setFileName(P.DebugName->DebugName); + // Override default flags with value from YAML. + if (P.DebugName->Flags) + DebugName.BaseData.first.Flags = *P.DebugName->Flags; + // Override computed filename length with value from YAML. + if (P.DebugName->NameLength) + DebugName.BaseData.first.NameLength = *P.DebugName->NameLength; + DebugName.write(OS); + break; + } case dxbc::PartType::SFI0: { // If we don't have any flags we can continue here and the data will be // zeroed out. 
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index f81bf5f55cddd..9e7d9be552e62 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -535,6 +535,13 @@ void MappingTraits::mapping( #include "llvm/BinaryFormat/DXContainerConstants.def" } +void MappingTraits::mapping( + IO &IO, DXContainerYAML::DebugName &DebugName) { + IO.mapOptional("Flags", DebugName.Flags); + IO.mapOptional("NameLength", DebugName.NameLength); + IO.mapRequired("DebugName", DebugName.DebugName); +} + void MappingTraits::mapping(IO &IO, DXContainerYAML::Part &P) { IO.mapRequired("Name", P.Name); @@ -545,6 +552,7 @@ void MappingTraits::mapping(IO &IO, IO.mapOptional("PSVInfo", P.Info); IO.mapOptional("Signature", P.Signature); IO.mapOptional("RootSignature", P.RootSignature); + IO.mapOptional("DebugName", P.DebugName); } void MappingTraits::mapping( diff --git a/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-flags.yaml b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-flags.yaml new file mode 100644 index 0000000000000..ea92645b28436 --- /dev/null +++ b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-flags.yaml @@ -0,0 +1,26 @@ +# RUN: yaml2obj %s 2>&1 | obj2yaml 2>&1 | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + FileSize: 88 + PartCount: 1 + PartOffsets: [36] +Parts: + - Name: ILDN + Size: 44 + DebugName: + NameLength: 36 + DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb +... 
+ +#CHECK: - Name: ILDN +#CHECK-NEXT: Size: 44 +#CHECK-NEXT: DebugName: +#CHECK-NEXT: Flags: 0 +#CHECK-NEXT: NameLength: 36 +#CHECK-NEXT: DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb diff --git a/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-length.yaml b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-length.yaml new file mode 100644 index 0000000000000..128135ed91188 --- /dev/null +++ b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute-length.yaml @@ -0,0 +1,26 @@ +# RUN: yaml2obj %s 2>&1 | obj2yaml 2>&1 | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + FileSize: 88 + PartCount: 1 + PartOffsets: [36] +Parts: + - Name: ILDN + Size: 44 + DebugName: + Flags: 0 + DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb +... + +#CHECK: - Name: ILDN +#CHECK-NEXT: Size: 44 +#CHECK-NEXT: DebugName: +#CHECK-NEXT: Flags: 0 +#CHECK-NEXT: NameLength: 36 +#CHECK-NEXT: DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb diff --git a/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute.yaml b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute.yaml new file mode 100644 index 0000000000000..f6f47f1809c97 --- /dev/null +++ b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart-compute.yaml @@ -0,0 +1,25 @@ +# RUN: yaml2obj %s 2>&1 | obj2yaml 2>&1 | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + FileSize: 88 + PartCount: 1 + PartOffsets: [36] +Parts: + - Name: ILDN + Size: 44 + DebugName: + DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb +... 
+ +#CHECK: - Name: ILDN +#CHECK-NEXT: Size: 44 +#CHECK-NEXT: DebugName: +#CHECK-NEXT: Flags: 0 +#CHECK-NEXT: NameLength: 36 +#CHECK-NEXT: DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb diff --git a/llvm/test/tools/obj2yaml/DXContainer/ILDNPart.yaml b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart.yaml new file mode 100644 index 0000000000000..02e42417f39d8 --- /dev/null +++ b/llvm/test/tools/obj2yaml/DXContainer/ILDNPart.yaml @@ -0,0 +1,64 @@ +# RUN: yaml2obj %s 2>&1 | obj2yaml 2>&1 | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + FileSize: 3600 + PartCount: 8 + PartOffsets: [64, 80, 96, 112, 240, 1936, 1964, 2016] +Parts: + - Name: FKE0 + Size: 8 + - Name: FKE1 + Size: 8 + - Name: FKE2 + Size: 8 + - Name: FKE3 + Size: 120 + - Name: FKE4 + Size: 1688 + - Name: FKE5 + Size: 20 + - Name: ILDN + Size: 44 + DebugName: + Flags: 0 + NameLength: 36 + DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb + - Name: DXIL + Size: 28 + Program: + MajorVersion: 6 + MinorVersion: 5 + ShaderKind: 5 + Size: 7 + DXILMajorVersion: 1 + DXILMinorVersion: 5 + DXILSize: 4 + DXIL: [ 0x42, 0x43, 0xC0, 0xDE, ] +... 
+ + + + +#CHECK: - Name: ILDN +#CHECK-NEXT: Size: 44 +#CHECK-NEXT: DebugName: +#CHECK-NEXT: Flags: 0 +#CHECK-NEXT: NameLength: 36 +#CHECK-NEXT: DebugName: 0b40fc8650d90fa2e9fd5cadc8eaaace.pdb +#CHECK-NEXT: - Name: DXIL +#CHECK-NEXT: Size: 28 +#CHECK-NEXT: Program: +#CHECK-NEXT: MajorVersion: 6 +#CHECK-NEXT: MinorVersion: 5 +#CHECK-NEXT: ShaderKind: 5 +#CHECK-NEXT: Size: 7 +#CHECK-NEXT: DXILMajorVersion: 1 +#CHECK-NEXT: DXILMinorVersion: 5 +#CHECK-NEXT: DXILSize: 4 +#CHECK-NEXT: DXIL: [ 0x42, 0x43, 0xC0, 0xDE diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index 09a6422d0d7b9..8095bf8b24abe 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -70,6 +70,15 @@ dumpDXContainer(MemoryBufferRef Source) { DXIL->second, DXIL->second + DXIL->first.Bitcode.Size)}; break; } + case dxbc::PartType::ILDN: { + std::optional DebugName = Container.getDebugName(); + assert(DebugName && "Since we are iterating and found a ILDN part, this " + "should never not have a value"); + NewPart.DebugName = DXContainerYAML::DebugName{ + DebugName->first.Flags, DebugName->first.NameLength, + DebugName->second.str()}; + break; + } case dxbc::PartType::SFI0: { std::optional Flags = Container.getShaderFeatureFlags(); // Omit the flags in the YAML if they are missing or zero. diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index ac8b45825c04d..3769b527c65ac 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -221,6 +221,42 @@ TEST(DXCFile, ParseDXILPart) { EXPECT_EQ(Header.Bitcode.MinorVersion, 5u); } +// This test verifies that ILDN part is correctly parsed. +// This test is based on the binary output constructed from this yaml. 
+// --- !dxcontainer +// Header: +// Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, +// 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] +// Version: +// Major: 1 +// Minor: 0 +// PartCount: 1 +// Parts: +// - Name: ILDN +// Size: 12 +// DebugName: +// Flags: 0 +// NameLength: 7 +// DebugName: abc.pdb +// ... +TEST(DXCFile, ParseILDNPart) { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x49, 0x4C, 0x44, 0x4E, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x61, 0x62, 0x63, 0x2E, 0x70, 0x64, 0x62, 0x00}; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<116>(Buffer))); + EXPECT_EQ(C.getHeader().PartCount, 1u); + const std::optional &ILDN = C.getDebugName(); + EXPECT_TRUE(ILDN.has_value()); + dxbc::DebugNameHeader Header = ILDN->first; + EXPECT_EQ(Header.Flags, 0u); + EXPECT_EQ(Header.NameLength, 7u); + EXPECT_EQ(ILDN->second, "abc.pdb"); +} + static Expected generateDXContainer(StringRef Yaml, SmallVectorImpl &BinaryData) { DXContainerYAML::Object Obj; diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index 1b21fe01dfca9..d6226b368e5df 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -577,3 +577,36 @@ TEST(RootSignature, ParseStaticSamplersV13) { EXPECT_EQ(Storage.size(), 148U); EXPECT_TRUE(memcmp(Buffer, Storage.data(), 148U) == 0); } + +TEST(DXCFile, ParseILDNPart) { + SmallString<128> Storage; + + // First read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 36 ] +Parts: + - Name: ILDN + 
Size: 12 + DebugName: + Flags: 0 + NameLength: 7 + DebugName: abc.pdb + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x49, 0x4C, 0x44, 0x4E, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, + 0x61, 0x62, 0x63, 0x2E, 0x70, 0x64, 0x62, 0x00}; + + EXPECT_EQ(Storage.size(), 56u); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 56u) == 0); +} From 7565c837c5556b5731e2bde555a1941e48083255 Mon Sep 17 00:00:00 2001 From: Yeoul Na Date: Tue, 12 May 2026 17:16:33 -0700 Subject: [PATCH 532/538] [BoundsSafety][NFC] Introduce LateParsedTypeAttribute for late-parsed type attributes (#192799) Preparatory refactoring for llvm/llvm-project#179612. The new late parsing approach needs a distinct data structure to carry type-attribute-specific information through the late parsing pipeline, separate from declaration-level late-parsed attributes. - Add LateParsedTypeAttribute subtyping LateParsedAttribute - Add ParseLexedTypeAttribute and LateTypeAttrParserCallback to Parser - Extract the shared token setup, parsing, and cleanup logic from ParseLexedCAttribute and ParseLexedTypeAttribute into a common ParseLexedCAttributeTokens helper. --- clang/include/clang/Parse/Parser.h | 63 +++++++++++++++++++++-- clang/include/clang/Sema/DeclSpec.h | 14 +++-- clang/lib/Parse/ParseCXXInlineMethods.cpp | 2 + clang/lib/Parse/ParseDecl.cpp | 52 ++++++++++++++----- 4 files changed, 111 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index cec1dc99e90d8..dc3dc8a4ae0e9 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -190,6 +190,12 @@ class LateParsedDeclaration { /// FIXME: Perhaps we should change the name of LateParsedDeclaration to /// LateParsedTokens. 
struct LateParsedAttribute : public LateParsedDeclaration { + + enum class Kind { + Declaration, + Type, + }; + Parser *Self; CachedTokens Toks; IdentifierInfo &AttrName; @@ -197,13 +203,49 @@ struct LateParsedAttribute : public LateParsedDeclaration { SourceLocation AttrNameLoc; SmallVector Decls; +private: + Kind K; + +protected: + explicit LateParsedAttribute(Parser *P, IdentifierInfo &Name, + SourceLocation Loc, Kind K) + : Self(P), AttrName(Name), AttrNameLoc(Loc), K(K) {} + +public: explicit LateParsedAttribute(Parser *P, IdentifierInfo &Name, SourceLocation Loc) - : Self(P), AttrName(Name), AttrNameLoc(Loc) {} + : LateParsedAttribute(P, Name, Loc, Kind::Declaration) {} void ParseLexedAttributes() override; void addDecl(Decl *D) { Decls.push_back(D); } + + Kind getKind() const { return K; } + + static bool classof(const LateParsedAttribute *LA) { return true; } +}; + +/// A late-parsed attribute that will be applied as a type attribute. +/// Unlike LateParsedAttribute (which applies to declarations via +/// ActOnFinishDelayedAttribute), this stores cached tokens that are +/// parsed during type construction when the placeholder LateParsedAttrType +/// is replaced with a concrete type (e.g., CountAttributedType). +struct LateParsedTypeAttribute : public LateParsedAttribute { + + explicit LateParsedTypeAttribute(Parser *P, IdentifierInfo &Name, + SourceLocation Loc) + : LateParsedAttribute(P, Name, Loc, Kind::Type) {} + + void ParseLexedAttributes() override; + + /// Parse this late-parsed type attribute and store results in OutAttrs. + /// This method can be called from Sema during type transformation to + /// parse the cached tokens and produce the final attribute. + void ParseInto(ParsedAttributes &OutAttrs); + + static bool classof(const LateParsedAttribute *LA) { + return LA->getKind() == Kind::Type; + } }; /// Parser - This implements a parser for the C family of languages. 
After @@ -1165,6 +1207,7 @@ class Parser : public CodeCompletionHandler { private: friend struct LateParsedAttribute; + friend struct LateParsedTypeAttribute; struct ParsingClass; @@ -1475,7 +1518,7 @@ class Parser : public CodeCompletionHandler { const char *&PrevSpec, unsigned &DiagID, bool &isInvalid); - void ParseLexedCAttributeList(LateParsedAttrList &LA, bool EnterScope, + void ParseLexedCAttributeList(LateParsedAttrList &LA, ParsedAttributes *OutAttrs = nullptr); /// Finish parsing an attribute for which parsing was delayed. @@ -1483,9 +1526,23 @@ class Parser : public CodeCompletionHandler { /// for each LateParsedAttribute. We consume the saved tokens and /// create an attribute with the arguments filled in. We add this /// to the Attribute list for the decl. - void ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, + void ParseLexedCAttribute(LateParsedAttribute &LA, ParsedAttributes *OutAttrs = nullptr); + void ParseLexedTypeAttribute(LateParsedTypeAttribute &LA, + ParsedAttributes &OutAttrs); + + /// Parse cached tokens for a late-parsed attribute and return the parsed + /// attributes. Shared implementation used by both ParseLexedCAttribute and + /// ParseLexedTypeAttribute. + ParsedAttributes ParseLexedCAttributeTokens(LateParsedAttribute &LA); + + /// Helper function to move LateParsedTypeAttribute pointers from one list + /// to another. Filters type attributes from \p From and appends them to \p + /// To. 
+ static void TakeTypeAttrsAppendingFrom(LateParsedAttrList &To, + LateParsedAttrList &From); + void ParseLexedPragmas(ParsingClass &Class); void ParseLexedPragma(LateParsedPragma &LP); diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 61706bc8f4229..b3c459821c79c 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -51,6 +51,7 @@ namespace clang { class OverflowBehaviorType; struct TemplateIdAnnotation; struct LateParsedAttribute; + struct LateParsedTypeAttribute; /// Represents a C++ nested-name-specifier or a global scope specifier. /// @@ -1254,20 +1255,25 @@ typedef SmallVector CachedTokens; class LateParsedAttrList : public SmallVector { public: LateParsedAttrList(bool PSoon = false, - bool LateAttrParseExperimentalExtOnly = false) + bool LateAttrParseExperimentalExtOnly = false, + bool LateAttrParseTypeAttrOnly = false) : ParseSoon(PSoon), - LateAttrParseExperimentalExtOnly(LateAttrParseExperimentalExtOnly) {} + LateAttrParseExperimentalExtOnly(LateAttrParseExperimentalExtOnly), + LateAttrParseTypeAttrOnly(LateAttrParseTypeAttrOnly) {} - bool parseSoon() { return ParseSoon; } + bool parseSoon() const { return ParseSoon; } /// returns true iff the attribute to be parsed should only be late parsed /// if it is annotated with `LateAttrParseExperimentalExt` - bool lateAttrParseExperimentalExtOnly() { + bool lateAttrParseExperimentalExtOnly() const { return LateAttrParseExperimentalExtOnly; } + bool lateAttrParseTypeAttrOnly() const { return LateAttrParseTypeAttrOnly; } + private: bool ParseSoon; // Are we planning to parse these shortly after creation? 
bool LateAttrParseExperimentalExtOnly; + bool LateAttrParseTypeAttrOnly; }; /// One instance of this struct is used for each type in a diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp index c1c567f7828cf..6189c854e5fbf 100644 --- a/clang/lib/Parse/ParseCXXInlineMethods.cpp +++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp @@ -319,6 +319,8 @@ void LateParsedAttribute::ParseLexedAttributes() { Self->ParseLexedAttribute(*this, true, false); } +void LateParsedTypeAttribute::ParseLexedAttributes() {} + void Parser::LateParsedPragma::ParseLexedPragmas() { Self->ParseLexedPragma(*this); } diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 1a04ca7f43647..75ad821c245a5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -4849,19 +4849,18 @@ void Parser::ParseStructDeclaration( // TODO: All callers of this function should be moved to // `Parser::ParseLexedAttributeList`. -void Parser::ParseLexedCAttributeList(LateParsedAttrList &LAs, bool EnterScope, +void Parser::ParseLexedCAttributeList(LateParsedAttrList &LAs, ParsedAttributes *OutAttrs) { assert(LAs.parseSoon() && "Attribute list should be marked for immediate parsing."); for (auto *LA : LAs) { - ParseLexedCAttribute(*LA, EnterScope, OutAttrs); + ParseLexedCAttribute(*LA, OutAttrs); delete LA; } LAs.clear(); } -void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, - ParsedAttributes *OutAttrs) { +ParsedAttributes Parser::ParseLexedCAttributeTokens(LateParsedAttribute &LA) { // Create a fake EOF so that attribute parsing won't go off the end of the // attribute. Token AttrEnd; @@ -4880,9 +4879,6 @@ void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, // as when we entered this function. 
ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true); - // TODO: Use `EnterScope` - (void)EnterScope; - ParsedAttributes Attrs(AttrFactory); assert(LA.Decls.size() <= 1 && @@ -4892,9 +4888,6 @@ void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, ParseGNUAttributeArgs(&LA.AttrName, LA.AttrNameLoc, Attrs, nullptr, nullptr, SourceLocation(), ParsedAttr::Form::GNU(), nullptr); - for (auto *D : LA.Decls) - Actions.ActOnFinishDelayedAttribute(getCurScope(), D, Attrs); - // Due to a parsing error, we either went over the cached tokens or // there are still cached tokens left, so we skip the leftover tokens. while (Tok.isNot(tok::eof)) @@ -4904,9 +4897,42 @@ void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, if (Tok.is(tok::eof) && Tok.getEofData() == AttrEnd.getEofData()) ConsumeAnyToken(); - if (OutAttrs) { + return Attrs; +} + +void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, + ParsedAttributes *OutAttrs) { + ParsedAttributes Attrs = ParseLexedCAttributeTokens(LA); + + for (Decl *D : LA.Decls) + Actions.ActOnFinishDelayedAttribute(getCurScope(), D, Attrs); + + if (OutAttrs) OutAttrs->takeAllAppendingFrom(Attrs); - } +} + +void Parser::ParseLexedTypeAttribute(LateParsedTypeAttribute &LA, + ParsedAttributes &OutAttrs) { + ParsedAttributes Attrs = ParseLexedCAttributeTokens(LA); + OutAttrs.takeAllAppendingFrom(Attrs); +} + +void LateParsedTypeAttribute::ParseInto(ParsedAttributes &OutAttrs) { + // Delegate to the Parser that created this attribute + Self->ParseLexedTypeAttribute(*this, OutAttrs); +} + +void Parser::TakeTypeAttrsAppendingFrom(LateParsedAttrList &To, + LateParsedAttrList &From) { + LateParsedAttrList::iterator It = + llvm::remove_if(From, [&](LateParsedAttribute *LA) { + if (isa(LA)) { + To.push_back(LA); + return true; + } + return false; + }); + From.erase(It, From.end()); } void Parser::ParseStructUnionBody(SourceLocation RecordLoc, @@ -5035,7 +5061,7 @@ void 
Parser::ParseStructUnionBody(SourceLocation RecordLoc, MaybeParseGNUAttributes(attrs, &LateFieldAttrs); // Late parse field attributes if necessary. - ParseLexedCAttributeList(LateFieldAttrs, /*EnterScope=*/false); + ParseLexedCAttributeList(LateFieldAttrs); SmallVector FieldDecls(TagDecl->fields()); From 05afc9a322bf31283c2c43bb07ac09cef032f604 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 12 May 2026 17:27:47 -0700 Subject: [PATCH 533/538] [CIR] Fix problems with stale insert locations (#197296) This change fixes three places where we were copying the builder rather than getting a reference to it, leading to potential problems with incorrect state, including insert locations. Two of these cases were causing observable problems, and I'm adding regression tests for those. The third doesn't seem to have caused any actual problems, but I'm changing it to avoid potential problems in the future. Assisted-by: Cursor / claude-opus-4.7-thinking-xhigh --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 2 +- .../CodeGen/initializer-list-size-cleanup.cpp | 32 +++++++++++++ .../CIR/CodeGen/three-way-compare-cleanup.cpp | 47 +++++++++++++++++++ 4 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 clang/test/CIR/CodeGen/initializer-list-size-cleanup.cpp create mode 100644 clang/test/CIR/CodeGen/three-way-compare-cleanup.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index b561d4abeceda..a6722fe1eee88 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -365,7 +365,7 @@ class AggExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getBeginLoc(), "aggregate three-way comparison"); mlir::Location loc = cgf.getLoc(e->getSourceRange()); - CIRGenBuilderTy builder = cgf.getBuilder(); + CIRGenBuilderTy &builder = cgf.getBuilder(); if (e->getType()->isAnyComplexType()) 
cgf.cgm.errorNYI(e->getBeginLoc(), "VisitBinCmp: complex type"); @@ -540,7 +540,7 @@ class AggExprEmitter : public StmtVisitor { /// real initializer list. void VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *e) { ASTContext &ctx = cgf.getContext(); - CIRGenBuilderTy builder = cgf.getBuilder(); + CIRGenBuilderTy &builder = cgf.getBuilder(); mlir::Location loc = cgf.getLoc(e->getExprLoc()); LValue array = cgf.emitLValue(e->getSubExpr()); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index de866f286a0e1..752a1d3e6e220 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -366,7 +366,7 @@ static void emitNullBaseClassInitialization(CIRGenFunction &cgf, assert(stores.size() == 1 && "Expected only one store"); assert(stores[0].first == CharUnits::Zero() && "Expected store to begin at offset zero"); - CIRGenBuilderTy builder = cgf.getBuilder(); + CIRGenBuilderTy &builder = cgf.getBuilder(); mlir::Location loc = cgf.getLoc(base->getBeginLoc()); builder.createStore(loc, builder.getConstant(loc, nullConstantForBase), destPtr); diff --git a/clang/test/CIR/CodeGen/initializer-list-size-cleanup.cpp b/clang/test/CIR/CodeGen/initializer-list-size-cleanup.cpp new file mode 100644 index 0000000000000..a7f4dc4933714 --- /dev/null +++ b/clang/test/CIR/CodeGen/initializer-list-size-cleanup.cpp @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR + +namespace std { +template class initializer_list { + const T *data; + __SIZE_TYPE__ len; + +public: + initializer_list(); +}; +} // namespace std + +struct Elem { + Elem(int); + ~Elem(); +}; + +struct Container { + Container(std::initializer_list); + ~Container(); +}; + +void build_container() { + Container c = {1, 2, 3}; +} + +// CIR-LABEL: cir.func {{.*}}@_Z15build_containerv +// CIR: 
cir.cleanup.scope { +// CIR: %[[LEN_CONST:.*]] = cir.const #cir.int<3> : !u64i +// CIR: %[[LEN_PTR:.*]] = cir.get_member {{.*}}[1] {name = "len"} +// CIR: cir.store {{.*}} %[[LEN_CONST]], %[[LEN_PTR]] diff --git a/clang/test/CIR/CodeGen/three-way-compare-cleanup.cpp b/clang/test/CIR/CodeGen/three-way-compare-cleanup.cpp new file mode 100644 index 0000000000000..7e51bf382d0fe --- /dev/null +++ b/clang/test/CIR/CodeGen/three-way-compare-cleanup.cpp @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR + +namespace std { +struct strong_ordering { + int v; + constexpr strong_ordering(int v) : v(v) {} + static const strong_ordering equal; + static const strong_ordering less; + static const strong_ordering greater; +}; +inline constexpr strong_ordering strong_ordering::equal{0}; +inline constexpr strong_ordering strong_ordering::less{-1}; +inline constexpr strong_ordering strong_ordering::greater{1}; +} // namespace std + +struct Holder { + int v; + Holder(int x); + ~Holder(); + operator int() const; +}; + +auto three_way_cmp_with_temp(int a) { + return Holder(a).operator int() <=> 0; +} + +// CIR-LABEL: cir.func {{.*}}three_way_cmp_with_temp +// CIR: cir.call @_ZN6HolderC1Ei +// CIR: cir.cleanup.scope { +// CIR: %[[CONV:.*]] = cir.call @_ZNK6HoldercviEv +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[LT:.*]] = cir.const #cir.int<-1> : !s32i +// CIR: %[[EQ:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[GT:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[CMP_LT:.*]] = cir.cmp lt %[[CONV]], %[[ZERO]] +// CIR: %[[SEL1:.*]] = cir.select if %[[CMP_LT]] then %[[LT]] else %[[GT]] +// CIR: %[[CMP_EQ:.*]] = cir.cmp eq %[[CONV]], %[[ZERO]] +// CIR: %[[RESULT:.*]] = cir.select if %[[CMP_EQ]] then %[[EQ]] else %[[SEL1]] +// CIR: %[[FIELD:.*]] = cir.get_member {{.*}}[0] {name = "v"} +// CIR: cir.store {{.*}} %[[RESULT]], 
%[[FIELD]] +// CIR: cir.yield +// CIR: } cleanup normal { +// CIR: cir.call @_ZN6HolderD1Ev +// CIR: cir.yield +// CIR: } +// CIR: cir.return From 3938a371ef46982d0910c96a998ae9cda82ca1f2 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Tue, 12 May 2026 17:34:27 -0700 Subject: [PATCH 534/538] [clang][Modules] Use strict comparison at build-session validation boundary (#197203) The relocation check used `IFVT >= session` while input validation used `IFVT > session`, silently skipping the relocation check when both landed in the same truncated second. Share a single helper with the strict comparison. This was noticed when the accompanying test was flaky on green dragon macOS bot. Also: * Drop the test's `sleep 1` timing dependency by future-dating the session timestamp directly. * Remove the dead `touch %t/session.timestamp` lines in the ClangScanDeps test (the scanner ignores the file's mtime). hopefully resolves: rdar://173816745 --- clang/lib/Serialization/ASTReader.cpp | 30 +++++++++++++------ ...ild-session-validation-relocated-modules.c | 4 +-- ...ild-session-validation-relocated-modules.c | 8 ++--- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index dfd714dd53814..0f834859e982a 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -3179,6 +3179,18 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( } } +/// Returns {build-session validation applies, MF was validated this session}. 
+static std::pair +wasValidatedInBuildSession(const ModuleFile &MF, + const HeaderSearchOptions &HSOpts) { + const bool EnablesBSValidation = + HSOpts.ModulesValidateOncePerBuildSession && MF.Kind == MK_ImplicitModule; + const bool WasValidated = + EnablesBSValidation && + MF.InputFilesValidationTimestamp > HSOpts.BuildSessionTimestamp; + return {EnablesBSValidation, WasValidated}; +} + ASTReader::RelocationResult ASTReader::getModuleForRelocationChecks(ModuleFile &F, bool DirectoryCheck) { // Don't emit module relocation errors if we have -fno-validate-pch. @@ -3201,12 +3213,13 @@ ASTReader::getModuleForRelocationChecks(ModuleFile &F, bool DirectoryCheck) { // When only validating modules once per build session, // Skip check if the timestamp is up to date or module was built in same build // session. - if (HSOpts.ModulesValidateOncePerBuildSession && IsImplicitModule) { - if (F.InputFilesValidationTimestamp >= HSOpts.BuildSessionTimestamp) - return {std::nullopt, IgnoreError}; - if (static_cast(F.ModTime) >= HSOpts.BuildSessionTimestamp) - return {std::nullopt, IgnoreError}; - } + auto [EnablesBSValidation, WasValidated] = + wasValidatedInBuildSession(F, HSOpts); + if (WasValidated) + return {std::nullopt, IgnoreError}; + if (EnablesBSValidation && + static_cast(F.ModTime) >= HSOpts.BuildSessionTimestamp) + return {std::nullopt, IgnoreError}; Diag(diag::remark_module_check_relocation) << F.ModuleName << F.FileName; @@ -3294,9 +3307,8 @@ ASTReader::ReadControlBlock(ModuleFile &F, F.InputFilesValidationStatus = ValidateSystemInputs ? InputFilesValidation::AllFiles : InputFilesValidation::UserFiles; - if (HSOpts.ModulesValidateOncePerBuildSession && - F.InputFilesValidationTimestamp > HSOpts.BuildSessionTimestamp && - F.Kind == MK_ImplicitModule) { + auto [_, WasValidated] = wasValidatedInBuildSession(F, HSOpts); + if (WasValidated) { N = ForceValidateUserInputs ? 
NumUserInputs : 0; F.InputFilesValidationStatus = ForceValidateUserInputs diff --git a/clang/test/ClangScanDeps/build-session-validation-relocated-modules.c b/clang/test/ClangScanDeps/build-session-validation-relocated-modules.c index 47919229512d0..5a0744718685b 100644 --- a/clang/test/ClangScanDeps/build-session-validation-relocated-modules.c +++ b/clang/test/ClangScanDeps/build-session-validation-relocated-modules.c @@ -12,14 +12,12 @@ // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/compile-commands.json.in > %t/compile-commands.json -// RUN: touch %t/session.timestamp // RUN: clang-scan-deps -format experimental-full -j 1 \ // RUN: -compilation-database %t/compile-commands.json -o %t/deps1.json // RUN: cat %t/deps1.json | FileCheck %s --check-prefix=DEPS1 // Model update where same framework appears in earlier search path. // This can occur on an incremental build where dependency relationships are updated. -// RUN: touch %t/session.timestamp // RUN: sleep 1 // RUN: mkdir %t/preferred_frameworks/ // RUN: cp -r %t/fallback_frameworks/MovedDep.framework %t/preferred_frameworks/ @@ -46,6 +44,8 @@ } ] +//--- session.timestamp + //--- fallback_frameworks/MovedDep.framework/Modules/module.modulemap framework module MovedDep { header "MovedDep.h" } //--- fallback_frameworks/MovedDep.framework/Headers/MovedDep.h diff --git a/clang/test/Modules/build-session-validation-relocated-modules.c b/clang/test/Modules/build-session-validation-relocated-modules.c index 173d35528fb4f..1cf1bb20902c4 100644 --- a/clang/test/Modules/build-session-validation-relocated-modules.c +++ b/clang/test/Modules/build-session-validation-relocated-modules.c @@ -26,11 +26,9 @@ // RUN: -fbuild-session-file=%t/session.timestamp -fmodules-validate-once-per-build-session \ // RUN: -Xclang -fno-modules-check-relocated -Rmodule-validation 2>&1 | FileCheck %s --check-prefix=NO_RELOC -// Ensure future new timestamp doesn't have same time as older one. 
-// RUN: sleep 1 - -// Now remove the disablement and check. -// RUN: touch %t/session.timestamp +// Force session.timestamp into the future so it is strictly greater than any +// cached pcm or timestamp mtime, regardless of wall-clock resolution. +// RUN: %python -c "import os, time; t = time.time() + 3600; os.utime('%/t/session.timestamp', (t, t))" // RUN: %clang -fmodules -fimplicit-module-maps -fsyntax-only %t/tu1.c \ // RUN: -fmodules-cache-path=%t/cache -F%t/preferred_frameworks -F%t/fallback_frameworks \ // RUN: -fbuild-session-file=%t/session.timestamp -fmodules-validate-once-per-build-session \ From add7a0e372013e648d5fc506867620d413abb31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 12 May 2026 17:39:28 -0700 Subject: [PATCH 535/538] [flang][cuda] Add support for -gpu=pinned (#197304) Add a new language feature to implicitly treat dynamic allocation as pinned allocation. 
--- .../include/flang/Support/Fortran-features.h | 2 +- flang/lib/Semantics/resolve-names.cpp | 31 ++++++++++++------- flang/lib/Support/Fortran-features.cpp | 1 + flang/test/Lower/CUDA/cuda-gpu-pinned.f90 | 16 ++++++++++ flang/tools/bbc/bbc.cpp | 5 ++- 5 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-gpu-pinned.f90 diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h index af72b71d9d1e6..899819a4b096c 100644 --- a/flang/include/flang/Support/Fortran-features.h +++ b/flang/include/flang/Support/Fortran-features.h @@ -59,7 +59,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, PointerPassObject, MultipleIdenticalDATA, DefaultStructConstructorNullPointer, AssumedRankIoItem, MultipleProgramUnitsOnSameLine, AllocatedForAssociated, - OpenMPThreadprivateEquivalence, RelaxedCLoc) + OpenMPThreadprivateEquivalence, RelaxedCLoc, CudaPinned) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b6c2e32b16a0e..54ff4e6570eb3 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -10183,18 +10183,27 @@ void ResolveNamesVisitor::FinishSpecificationPart( SetBindNameOn(symbol); } } - // Implicitly treat allocatable arrays as managed when feature is enabled. - // This is done after all explicit CUDA attributes have been processed. - // Only applies when CUDA Fortran is enabled; otherwise -gpu=mem:managed - // on a non-CUDA-Fortran translation unit (e.g. pure OpenACC) would - // incorrectly route every allocatable through the CUDA Fortran managed - // descriptor pipeline. 
- if (context().languageFeatures().IsEnabled( - common::LanguageFeature::CudaManaged) && - context().languageFeatures().IsEnabled(common::LanguageFeature::CUDA)) - if (auto *object{symbol.detailsIf()}) - if (IsAllocatable(symbol) && !object->cudaDataAttr()) + + if (auto *object{symbol.detailsIf()}) { + if (IsAllocatable(symbol) && !object->cudaDataAttr()) { + // Implicitly treat allocatable arrays as managed when feature is + // enabled. This is done after all explicit CUDA attributes have been + // processed. Only applies when CUDA Fortran is enabled; otherwise + // -gpu=mem:managed on a non-CUDA-Fortran translation unit (e.g. pure + // OpenACC) would incorrectly route every allocatable through the CUDA + // Fortran managed descriptor pipeline. + if (context().languageFeatures().IsEnabled( + common::LanguageFeature::CudaManaged) && + context().languageFeatures().IsEnabled( + common::LanguageFeature::CUDA)) object->set_cudaDataAttr(common::CUDADataAttr::Managed); + // Implicitly treat allocatable arrays as pinned when feature is + // enabled. 
+ else if (context().languageFeatures().IsEnabled( + common::LanguageFeature::CudaPinned)) + object->set_cudaDataAttr(common::CUDADataAttr::Pinned); + } + } } currScope().InstantiateDerivedTypes(); for (const auto &decl : decls) { diff --git a/flang/lib/Support/Fortran-features.cpp b/flang/lib/Support/Fortran-features.cpp index 54c8931da17d3..4e16d6d28b7c5 100644 --- a/flang/lib/Support/Fortran-features.cpp +++ b/flang/lib/Support/Fortran-features.cpp @@ -134,6 +134,7 @@ LanguageFeatureControl::LanguageFeatureControl() { disable_.set(LanguageFeature::CUDA); // !@cuf disable_.set(LanguageFeature::CudaManaged); disable_.set(LanguageFeature::CudaUnified); + disable_.set(LanguageFeature::CudaPinned); disable_.set(LanguageFeature::ImplicitNoneTypeNever); disable_.set(LanguageFeature::ImplicitNoneTypeAlways); disable_.set(LanguageFeature::ImplicitNoneExternal); diff --git a/flang/test/Lower/CUDA/cuda-gpu-pinned.f90 b/flang/test/Lower/CUDA/cuda-gpu-pinned.f90 new file mode 100644 index 0000000000000..35c14aef00cf1 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-gpu-pinned.f90 @@ -0,0 +1,16 @@ +! RUN: bbc -emit-hlfir -gpu=pinned -fcuda %s -o - | FileCheck %s + +integer, allocatable :: a(:) +integer, allocatable, device :: b(:) +allocate(a(10)) +allocate(b(10)) +deallocate(a) +deallocate(b) +end + +! CHECK-LABEL: func.func @_QQmain() +! CHECK: cuf.allocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 +! CHECK: cuf.allocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 +! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 +! 
CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index bc6e9eb67e132..5c2837a8a36b4 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -232,7 +232,8 @@ static llvm::cl::opt llvm::cl::init(false)); static llvm::cl::opt - enableGPUMode("gpu", llvm::cl::desc("Enable GPU Mode managed|unified"), + enableGPUMode("gpu", + llvm::cl::desc("Enable GPU Mode managed|unified|pinned"), llvm::cl::init("")); static llvm::cl::opt @@ -667,6 +668,8 @@ int main(int argc, char **argv) { options.features.Enable(Fortran::common::LanguageFeature::CudaManaged); else if (enableGPUMode == "unified") options.features.Enable(Fortran::common::LanguageFeature::CudaUnified); + else if (enableGPUMode == "pinned") + options.features.Enable(Fortran::common::LanguageFeature::CudaPinned); if (fixedForm) { options.isFixedForm = fixedForm; From 86f7e93921fdd33e0eeb3cf138be7f9bc6feb07d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 12 May 2026 20:48:30 -0400 Subject: [PATCH 536/538] [SLP] Check candidates instead of InstructionsState for reduction type in revec mode Better to check the first candidate instead of InstructionsState, since it might be non-valid for constants.
Fixes https://github.com/llvm/llvm-project/pull/197291#issuecomment-4435105325 Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/197335 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../X86/revec-constant-reduction.ll | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/revec-constant-reduction.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d5f284bebd940..2825a4c7acd73 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -28662,7 +28662,7 @@ class HorizontalReduction { IsSupportedHorRdxIdentityOp = RK == ReductionOrdering::Unordered && RdxKind != RecurKind::Mul && RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd && - (!SLPReVec || !S.getMainOp()->getType()->isVectorTy()); + (!SLPReVec || !Candidates.front()->getType()->isVectorTy()); // Gather same values. 
SmallMapVector SameValuesCounter; if (IsSupportedHorRdxIdentityOp) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-constant-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-constant-reduction.ll new file mode 100644 index 0000000000000..85ffc81564bb0 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-constant-reduction.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-revec < %s | FileCheck %s + +define i1 @test(ptr %0, i64 %1, i64 %2) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: ptr [[TMP0:%.*]], i64 [[TMP1:%.*]], i64 [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne ptr [[TMP0]], null +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = and i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = and i1 [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp samesign ult i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP8]], i1 [[TMP9]], i1 false +; CHECK-NEXT: ret i1 [[TMP10]] +; + %4 = icmp ne i64 %1, 0 + %5 = icmp ne ptr %0, null + %6 = icmp eq i64 %2, 0 + %7 = and i1 %5, %6 + %8 = and i1 %4, %7 + %9 = icmp samesign ult i64 %1, 2 + %10 = select i1 %8, i1 %9, i1 false + ret i1 %10 +} From e8c6318c2a07f8eb4643db610854b0e993a4ed6e Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 12 May 2026 17:56:30 -0700 Subject: [PATCH 537/538] [Bazel] Drop :errno use from some libc math support libraries (#197336) We only actually need the errno header. Using errno as a dependency pulls in a c++ source file which requires compilation which precludes use in a header library. 
--- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 4 ---- 1 file changed, 4 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 5ca3d23ad40fe..499c3f6e4f30d 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -6180,7 +6180,6 @@ libc_support_library( ":__support_macros_optimization", ":__support_macros_properties_cpu_features", ":__support_sincosf_utils", - ":errno", ], ) @@ -6195,7 +6194,6 @@ libc_support_library( ":__support_fputil_multiply_add", ":__support_macros_optimization", ":__support_math_sincosf16_utils", - ":errno", ":llvm_libc_macros_float16_macros", ], ) @@ -6362,7 +6360,6 @@ libc_support_library( ":__support_macros_config", ":__support_macros_optimization", ":__support_math_exp10f_utils", - ":errno", ], ) @@ -6382,7 +6379,6 @@ libc_support_library( ":__support_macros_optimization", ":__support_macros_properties_cpu_features", ":__support_math_exp10f16_utils", - ":errno", ":llvm_libc_macros_float16_macros", ], ) From bd3ad408d5cc7a7462ad69cd27c6b12f29a1db11 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Tue, 12 May 2026 18:03:08 -0700 Subject: [PATCH 538/538] [Instrumentor] Add Alloca and Function support; stack usage example (#195378) This adds support for alloca instrumentation and function pre/post instrumentation. Alloca support follows load/store support directly. Functions require special care to determine the insertion points. Together, we can showcase how the stack high watermark can be profiled, see InstrumentorStackUsage.cpp. 
--- .../Instrumentor/InstrumentorStackUsage.cpp | 37 +++ clang/test/Instrumentor/StackUsageRT.cpp | 60 ++++ clang/test/Instrumentor/StackUsageRT.json | 54 +++ clang/test/Instrumentor/lit.local.cfg | 2 + .../llvm/Transforms/IPO/Instrumentor.h | 138 +++++++- .../Transforms/IPO/InstrumentorConfigFile.h | 2 +- llvm/lib/Passes/PassBuilderPipelines.cpp | 4 +- llvm/lib/Transforms/IPO/Instrumentor.cpp | 314 +++++++++++++++++- .../Transforms/IPO/InstrumentorConfigFile.cpp | 21 +- .../Instrumentor/alloca_and_function.ll | 57 ++++ .../Instrumentor/default_config.json | 59 ++++ 11 files changed, 721 insertions(+), 27 deletions(-) create mode 100644 clang/test/Instrumentor/InstrumentorStackUsage.cpp create mode 100644 clang/test/Instrumentor/StackUsageRT.cpp create mode 100644 clang/test/Instrumentor/StackUsageRT.json create mode 100644 clang/test/Instrumentor/lit.local.cfg create mode 100644 llvm/test/Instrumentation/Instrumentor/alloca_and_function.ll diff --git a/clang/test/Instrumentor/InstrumentorStackUsage.cpp b/clang/test/Instrumentor/InstrumentorStackUsage.cpp new file mode 100644 index 0000000000000..ade099764143c --- /dev/null +++ b/clang/test/Instrumentor/InstrumentorStackUsage.cpp @@ -0,0 +1,37 @@ +// NOTE: Assertions have been autogenerated by utils/update_test_checks.py +// RUN: %clangxx -O0 %S/StackUsageRT.cpp -o %t.StackUsageRT.o -c +// RUN: %clangxx -O0 -mllvm -enable-instrumentor -mllvm -instrumentor-read-config-file=%S/StackUsageRT.json %t.StackUsageRT.o -o %t %s +// RUN: %t | FileCheck %s + +static void foobar(int *A, int N) { + int B[100]; + for (int i = 0; i < 100; ++i) { + B[i] = i + N; + } + if (N-- > 0) + foobar(B, N); + for (int i = 0; i < 100; ++i) { + A[i] += B[i]; + } +} + +static void bar(int *A, int N) { + foobar(A, N); +} + +int main(void) { + int A[100] = {0}; + foobar(A, 4); + bar(A, 3); + foobar(A, 5); + foobar(A, 2); +} + +// CHECK: Stack usage peaked at 2512 in +// CHECK: foobar(int +// CHECK: foobar(int +// CHECK: foobar(int +// CHECK: 
foobar(int +// CHECK: foobar(int +// CHECK: foobar(int +// CHECK: main diff --git a/clang/test/Instrumentor/StackUsageRT.cpp b/clang/test/Instrumentor/StackUsageRT.cpp new file mode 100644 index 0000000000000..a3bfe33d82fce --- /dev/null +++ b/clang/test/Instrumentor/StackUsageRT.cpp @@ -0,0 +1,60 @@ +//===-- examples/Instrumentor/stack_usage.c - An example Instrumentor use -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +struct StackTracker { + std::list CallStack; + int64_t FunctionStackUsage = 0; + int64_t TotalStackUsage = 0; + + std::list HighWaterMarkCallStack; + int64_t HighWaterMark = 0; + + ~StackTracker() { + printf("Stack usage peaked at %" PRId64 " in\n", HighWaterMark); + HighWaterMarkCallStack.reverse(); + for (char *Name : HighWaterMarkCallStack) + printf("- %s\n", Name); + } + + void enter(char *Name) { + FunctionStackUsage = 0; + CallStack.push_back(Name); + } + void exit(char *Name) { + CallStack.pop_back(); + TotalStackUsage -= FunctionStackUsage; + } + + void allocate(int64_t size) { + TotalStackUsage += size; + FunctionStackUsage += size; + if (TotalStackUsage <= HighWaterMark) + return; + HighWaterMark = TotalStackUsage; + HighWaterMarkCallStack = CallStack; + } +}; + +static thread_local StackTracker ST; + +extern "C" { + +void __stack_usage_pre_function(char *Name) { ST.enter(Name); } + +void __stack_usage_post_function(char *Name) { ST.exit(Name); } + +void __stack_usage_pre_alloca(int64_t size) { ST.allocate(size); } +} diff --git a/clang/test/Instrumentor/StackUsageRT.json b/clang/test/Instrumentor/StackUsageRT.json new file mode 100644 index 
0000000000000..491ab9cf5ea05 --- /dev/null +++ b/clang/test/Instrumentor/StackUsageRT.json @@ -0,0 +1,54 @@ +{ + "configuration": { + "runtime_prefix": "__stack_usage_", + "runtime_prefix.description": "The runtime API prefix.", + "demangle_function_names": true, + "demangle_function_names.description": "Demangle functions names passed to the runtime." + }, + "function_pre": { + "function": { + "enabled": true, + "address": false, + "address.description": "The function address.", + "name": true, + "name.description": "The function name.", + "num_arguments": false, + "num_arguments.description": "Number of function arguments (without varargs).", + "arguments": false, + "arguments.description": "Description of the arguments.", + "is_main": false, + "is_main.description": "Flag to indicate it is the main function.", + "id": false, + "id.description": "A unique ID associated with the given instrumentor call" + } + }, + "function_post": { + "function": { + "enabled": true, + "address": false, + "address.description": "The function address.", + "name": true, + "name.description": "The function name.", + "num_arguments": false, + "num_arguments.description": "Number of function arguments (without varargs).", + "arguments": false, + "arguments.description": "Description of the arguments.", + "is_main": false, + "is_main.description": "Flag to indicate it is the main function.", + "id": false, + "id.description": "A unique ID associated with the given instrumentor call" + } + }, + "instruction_pre": { + "alloca": { + "enabled": true, + "size": true, + "size.replace": false, + "size.description": "The allocation size.", + "alignment": false, + "alignment.description": "The allocation alignment.", + "id": false, + "id.description": "A unique ID associated with the given instrumentor call" + } + } +} diff --git a/clang/test/Instrumentor/lit.local.cfg b/clang/test/Instrumentor/lit.local.cfg new file mode 100644 index 0000000000000..afb6cf1a99e25 --- /dev/null +++ 
b/clang/test/Instrumentor/lit.local.cfg @@ -0,0 +1,2 @@ +config.suffixes.add(".cpp") +config.excludes = ["StackUsageRT.cpp"] diff --git a/llvm/include/llvm/Transforms/IPO/Instrumentor.h b/llvm/include/llvm/Transforms/IPO/Instrumentor.h index 7dfc342031579..9f41190ece0dd 100644 --- a/llvm/include/llvm/Transforms/IPO/Instrumentor.h +++ b/llvm/include/llvm/Transforms/IPO/Instrumentor.h @@ -15,6 +15,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EnumeratedArray.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -358,6 +359,9 @@ struct InstrumentationConfig { "Regular expression to be matched against the module target. " "Only targets that match this regex will be instrumented", ""); + DemangleFunctionNames = BaseConfigurationOption::createBoolOption( + *this, "demangle_function_names", + "Demangle functions names passed to the runtime.", true); HostEnabled = BaseConfigurationOption::createBoolOption( *this, "host_enabled", "Instrument non-GPU targets", true); GPUEnabled = BaseConfigurationOption::createBoolOption( @@ -394,12 +398,31 @@ struct InstrumentationConfig { return Obj; } + /// Mapping to remember global strings passed to the runtime. + DenseMap GlobalStringsMap; + + /// Mapping from constants to globals with the constant as initializer. + DenseMap ConstantGlobalsCache; + + Constant *getGlobalString(StringRef S, InstrumentorIRBuilderTy &IIRB) { + Constant *&V = GlobalStringsMap[SS.save(S)]; + if (!V) { + auto &M = *IIRB.IRB.GetInsertBlock()->getModule(); + V = IIRB.IRB.CreateGlobalString( + S, getRTName() + ".str", + M.getDataLayout().getDefaultGlobalsAddressSpace(), &M); + if (V->getType() != IIRB.IRB.getPtrTy()) + V = ConstantExpr::getAddrSpaceCast(V, IIRB.IRB.getPtrTy()); + } + return V; + } /// The list of enabled base configuration options. SmallVector BaseConfigurationOptions; /// The base configuration options. 
std::unique_ptr RuntimePrefix; std::unique_ptr RuntimeStubsFile; + std::unique_ptr DemangleFunctionNames; std::unique_ptr TargetRegex; std::unique_ptr HostEnabled; std::unique_ptr GPUEnabled; @@ -539,6 +562,96 @@ struct InstructionIO : public InstrumentationOpportunity { } }; +/// The instrumentation opportunity for functions. +struct FunctionIO final : public InstrumentationOpportunity { + FunctionIO(bool IsPRE) + : InstrumentationOpportunity( + InstrumentationLocation(InstrumentationLocation( + IsPRE ? InstrumentationLocation::FUNCTION_PRE + : InstrumentationLocation::FUNCTION_POST))) {} + + enum ConfigKind { + PassAddress = 0, + PassName, + PassNumArguments, + PassArguments, + ReplaceArguments, + PassIsMain, + PassId, + NumConfig, + }; + + struct ConfigTy final : public BaseConfigTy { + std::function ArgFilter; + + ConfigTy(bool Enable = true) : BaseConfigTy(Enable) {} + } Config; + + StringRef getName() const override { return "function"; } + + void init(InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB, + ConfigTy *UserConfig = nullptr); + + static Value *getFunctionAddress(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + static Value *getFunctionName(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + Value *getNumArguments(Value &V, Type &Ty, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + Value *getArguments(Value &V, Type &Ty, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + Value *setArguments(Value &V, Value &NewV, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + static Value *isMainFunction(Value &V, Type &Ty, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + + static void populate(InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto *PreIO = IConf.allocate(true); + PreIO->init(IConf, IIRB); + auto *PostIO = IConf.allocate(false); + PostIO->init(IConf, IIRB); + } +}; + +/// The 
instrumentation opportunity for alloca instructions. +struct AllocaIO final : public InstructionIO { + AllocaIO(bool IsPRE) : InstructionIO(IsPRE) {} + + enum ConfigKind { + PassAddress = 0, + ReplaceAddress, + PassSize, + ReplaceSize, + PassAlignment, + PassId, + NumConfig, + }; + + using ConfigTy = BaseConfigTy; + ConfigTy Config; + + void init(InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB, + ConfigTy *UserConfig = nullptr); + + static Value *getSize(Value &V, Type &Ty, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + static Value *setSize(Value &V, Value &NewV, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + static Value *getAlignment(Value &V, Type &Ty, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB); + + static void populate(InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto *PreIO = IConf.allocate(true); + PreIO->init(IConf, IIRB); + auto *PostIO = IConf.allocate(false); + PostIO->init(IConf, IIRB); + } +}; + /// The instrumentation opportunity for store instructions. struct StoreIO : public InstructionIO { virtual ~StoreIO() {}; @@ -608,10 +721,10 @@ struct StoreIO : public InstructionIO { /// instrumentation calls. static void populate(InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB) { - for (auto IsPRE : {true, false}) { - auto *AIC = IConf.allocate(IsPRE); - AIC->init(IConf, IIRB); - } + auto *PreIO = IConf.allocate(true); + PreIO->init(IConf, IIRB); + auto *PostIO = IConf.allocate(false); + PostIO->init(IConf, IIRB); } }; @@ -683,10 +796,10 @@ struct LoadIO : public InstructionIO { /// Create the store opportunities for PRE and POST positions. 
static void populate(InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB) { - for (auto IsPRE : {true, false}) { - auto *AIC = IConf.allocate(IsPRE); - AIC->init(IConf, IIRB); - } + auto *PreIO = IConf.allocate(true); + PreIO->init(IConf, IIRB); + auto *PostIO = IConf.allocate(false); + PostIO->init(IConf, IIRB); } }; @@ -697,6 +810,9 @@ class InstrumentorPass : public RequiredPassInfoMixin { using InstrumentationConfig = instrumentor::InstrumentationConfig; using InstrumentorIRBuilderTy = instrumentor::InstrumentorIRBuilderTy; + /// File system to be used for read operations. + IntrusiveRefCntPtr FS; + /// The configuration and IR builder provided by the user. InstrumentationConfig *UserIConf; InstrumentorIRBuilderTy *UserIIRB; @@ -710,9 +826,9 @@ class InstrumentorPass : public RequiredPassInfoMixin { /// provided, a default builder is used. When the configuration is not /// provided, it is read from the config file if available and otherwise a /// default configuration is used. - InstrumentorPass(InstrumentationConfig *IC = nullptr, - InstrumentorIRBuilderTy *IIRB = nullptr) - : UserIConf(IC), UserIIRB(IIRB) {} + InstrumentorPass(IntrusiveRefCntPtr FS = nullptr, + InstrumentationConfig *IC = nullptr, + InstrumentorIRBuilderTy *IIRB = nullptr); PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); }; diff --git a/llvm/include/llvm/Transforms/IPO/InstrumentorConfigFile.h b/llvm/include/llvm/Transforms/IPO/InstrumentorConfigFile.h index cae68f4b34f08..28efa77d772c4 100644 --- a/llvm/include/llvm/Transforms/IPO/InstrumentorConfigFile.h +++ b/llvm/include/llvm/Transforms/IPO/InstrumentorConfigFile.h @@ -25,7 +25,7 @@ void writeConfigToJSON(InstrumentationConfig &IConf, StringRef OutputFile, /// Read the configuration from the file with path \p InputFile into /p IConf. 
bool readConfigFromJSON(InstrumentationConfig &IConf, StringRef InputFile, - LLVMContext &Ctx); + LLVMContext &Ctx, vfs::FileSystem &FS); } // end namespace instrumentor } // end namespace llvm diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 2d2867b9b84d1..b96f5626734d3 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1648,7 +1648,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // Run the Instrumentor pass late. if (EnableInstrumentor) - MPM.addPass(InstrumentorPass()); + MPM.addPass(InstrumentorPass(FS)); // Split out cold code. Splitting is done late to avoid hiding context from // other optimizations and inadvertently regressing performance. The tradeoff @@ -2429,7 +2429,7 @@ PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, invokeOptimizerLastEPCallbacks(MPM, Level, Phase); if (EnableInstrumentor) - MPM.addPass(InstrumentorPass()); + MPM.addPass(InstrumentorPass(FS)); if (isLTOPreLink(Phase)) addRequiredLTOPreLinkPasses(MPM); diff --git a/llvm/lib/Transforms/IPO/Instrumentor.cpp b/llvm/lib/Transforms/IPO/Instrumentor.cpp index 33f00be11084a..63ee807887463 100644 --- a/llvm/lib/Transforms/IPO/Instrumentor.cpp +++ b/llvm/lib/Transforms/IPO/Instrumentor.cpp @@ -16,11 +16,13 @@ #include "llvm/Transforms/IPO/InstrumentorStubPrinter.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/iterator.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -42,6 +44,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" 
#include #include @@ -231,11 +235,46 @@ bool InstrumentorImpl::instrumentFunction(Function &Fn) { return Changed; InstrumentationCaches ICaches; + SmallVector FinalTIs; ReversePostOrderTraversal RPOT(&Fn); - for (auto &It : RPOT) + for (auto &It : RPOT) { for (auto &I : *It) Changed |= instrumentInstruction(I, ICaches); + auto *TI = It->getTerminator(); + if (!TI->getNumSuccessors()) + FinalTIs.push_back(TI); + } + + Value *FPtr = &Fn; + for (auto &[Name, IO] : + IConf.IChoices[InstrumentationLocation::FUNCTION_PRE]) { + if (!IO->Enabled) + continue; + // Count epochs eagerly. + ++IIRB.Epoch; + + IIRB.IRB.SetInsertPoint( + cast(FPtr)->getEntryBlock().getFirstInsertionPt()); + ensureDbgLoc(IIRB.IRB); + Changed |= bool(IO->instrument(FPtr, IConf, IIRB, ICaches)); + IIRB.returnAllocas(); + } + + for (auto &[Name, IO] : + IConf.IChoices[InstrumentationLocation::FUNCTION_POST]) { + if (!IO->Enabled) + continue; + // Count epochs eagerly. + ++IIRB.Epoch; + + for (Instruction *FinalTI : FinalTIs) { + IIRB.IRB.SetInsertPoint(FinalTI); + ensureDbgLoc(IIRB.IRB); + Changed |= bool(IO->instrument(FPtr, IConf, IIRB, ICaches)); + IIRB.returnAllocas(); + } + } return Changed; } @@ -244,12 +283,14 @@ bool InstrumentorImpl::instrument() { if (!shouldInstrumentTarget()) return Changed; - for (auto &It : IConf.IChoices[InstrumentationLocation::INSTRUCTION_PRE]) - if (It.second->Enabled) - InstChoicesPRE[It.second->getOpcode()] = It.second; - for (auto &It : IConf.IChoices[InstrumentationLocation::INSTRUCTION_POST]) - if (It.second->Enabled) - InstChoicesPOST[It.second->getOpcode()] = It.second; + for (auto &[Name, IO] : + IConf.IChoices[InstrumentationLocation::INSTRUCTION_PRE]) + if (IO->Enabled) + InstChoicesPRE[IO->getOpcode()] = IO; + for (auto &[Name, IO] : + IConf.IChoices[InstrumentationLocation::INSTRUCTION_POST]) + if (IO->Enabled) + InstChoicesPOST[IO->getOpcode()] = IO; for (Function &Fn : M) Changed |= instrumentFunction(Fn); @@ -257,11 +298,19 @@ bool 
InstrumentorImpl::instrument() { return Changed; } +InstrumentorPass::InstrumentorPass(IntrusiveRefCntPtr FS, + InstrumentationConfig *IC, + InstrumentorIRBuilderTy *IIRB) + : FS(FS), UserIConf(IC), UserIIRB(IIRB) { + if (!FS) + this->FS = vfs::getRealFileSystem(); +} + PreservedAnalyses InstrumentorPass::run(Module &M, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB, bool ReadConfig) { InstrumentorImpl Impl(IConf, IIRB, M); - if (ReadConfig && !readConfigFromJSON(IConf, ReadConfigFile, IIRB.Ctx)) + if (ReadConfig && !readConfigFromJSON(IConf, ReadConfigFile, IIRB.Ctx, *FS)) return PreservedAnalyses::all(); writeConfigToJSON(IConf, WriteConfigFile, IIRB.Ctx); @@ -315,6 +364,8 @@ BaseConfigurationOption::createStringOption(InstrumentationConfig &IConf, void InstrumentationConfig::populate(InstrumentorIRBuilderTy &IIRB) { /// List of all instrumentation opportunities. + FunctionIO::populate(*this, IIRB); + AllocaIO::populate(*this, IIRB); LoadIO::populate(*this, IIRB); StoreIO::populate(*this, IIRB); } @@ -525,6 +576,253 @@ CallInst *IRTCallDescription::createLLVMCall(Value *&V, return CI; } +template constexpr static Value *getValue(Ty &ValueOrUse) { + if constexpr (std::is_same::value) + return ValueOrUse.get(); + else + return static_cast(&ValueOrUse); +} + +template +static Value *createValuePack(const Range &R, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto *Fn = IIRB.IRB.GetInsertBlock()->getParent(); + auto *I32Ty = IIRB.IRB.getInt32Ty(); + SmallVector ConstantValues; + SmallVector> Values; + SmallVector Types; + for (auto &RE : R) { + Value *V = getValue(RE); + if (!V->getType()->isSized()) + continue; + auto VSize = IIRB.DL.getTypeAllocSize(V->getType()); + ConstantValues.push_back(getCI(I32Ty, VSize)); + Types.push_back(I32Ty); + ConstantValues.push_back(getCI(I32Ty, V->getType()->getTypeID())); + Types.push_back(I32Ty); + if (uint32_t MisAlign = VSize % 8) { + Types.push_back(ArrayType::get(IIRB.Int8Ty, 8 - MisAlign)); 
+ ConstantValues.push_back(ConstantArray::getNullValue(Types.back())); + } + Types.push_back(V->getType()); + if (auto *C = dyn_cast(V)) { + ConstantValues.push_back(C); + continue; + } + Values.push_back({V, ConstantValues.size()}); + ConstantValues.push_back(Constant::getNullValue(V->getType())); + } + if (Types.empty()) + return ConstantPointerNull::get(IIRB.PtrTy); + + StructType *STy = StructType::get(Fn->getContext(), Types, /*isPacked=*/true); + Constant *Initializer = ConstantStruct::get(STy, ConstantValues); + + GlobalVariable *&GV = IConf.ConstantGlobalsCache[Initializer]; + if (!GV) + GV = new GlobalVariable(*Fn->getParent(), STy, false, + GlobalValue::InternalLinkage, Initializer, + IConf.getRTName("", "value_pack")); + + auto *AI = IIRB.getAlloca(Fn, STy); + IIRB.IRB.CreateMemCpy(AI, AI->getAlign(), GV, MaybeAlign(GV->getAlignment()), + IIRB.DL.getTypeAllocSize(STy)); + for (auto [Param, Idx] : Values) { + auto *Ptr = IIRB.IRB.CreateStructGEP(STy, AI, Idx); + IIRB.IRB.CreateStore(Param, Ptr); + } + return AI; +} + +template +static void readValuePack(const Range &R, Value &Pack, + InstrumentorIRBuilderTy &IIRB, + function_ref SetterCB) { + auto *Fn = IIRB.IRB.GetInsertBlock()->getParent(); + auto &DL = Fn->getDataLayout(); + SmallVector ParameterValues; + unsigned Offset = 0; + for (const auto &[Idx, RE] : enumerate(R)) { + Value *V = getValue(RE); + if (!V->getType()->isSized()) + continue; + Offset += 8; + auto VSize = DL.getTypeAllocSize(V->getType()); + auto Padding = alignTo(VSize, 8) - VSize; + Offset += Padding; + auto *Ptr = IIRB.IRB.CreateConstInBoundsGEP1_32(IIRB.Int8Ty, &Pack, Offset); + auto *NewV = IIRB.IRB.CreateLoad(V->getType(), Ptr); + SetterCB(Idx, NewV); + Offset += VSize; + } +} + +/// FunctionIO +/// { +void FunctionIO::init(InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB, ConfigTy *UserConfig) { + using namespace std::placeholders; + if (UserConfig) + Config = *UserConfig; + + bool IsPRE = getLocationKind() == 
InstrumentationLocation::FUNCTION_PRE; + if (Config.has(PassAddress)) + IRTArgs.push_back(IRTArg(IIRB.PtrTy, "address", "The function address.", + IRTArg::NONE, getFunctionAddress)); + if (Config.has(PassName)) + IRTArgs.push_back(IRTArg(IIRB.PtrTy, "name", "The function name.", + IRTArg::STRING, getFunctionName)); + if (Config.has(PassNumArguments)) + IRTArgs.push_back( + IRTArg(IIRB.Int32Ty, "num_arguments", + "Number of function arguments (without varargs).", IRTArg::NONE, + std::bind(&FunctionIO::getNumArguments, this, _1, _2, _3, _4))); + if (Config.has(PassArguments)) + IRTArgs.push_back( + IRTArg(IIRB.PtrTy, "arguments", "Description of the arguments.", + IsPRE && Config.has(ReplaceArguments) ? IRTArg::REPLACABLE_CUSTOM + : IRTArg::NONE, + std::bind(&FunctionIO::getArguments, this, _1, _2, _3, _4), + std::bind(&FunctionIO::setArguments, this, _1, _2, _3, _4))); + if (Config.has(PassIsMain)) + IRTArgs.push_back(IRTArg(IIRB.Int8Ty, "is_main", + "Flag to indicate it is the main function.", + IRTArg::NONE, isMainFunction)); + addCommonArgs(IConf, IIRB.Ctx, Config.has(PassId)); + IConf.addChoice(*this, IIRB.Ctx); +} + +Value *FunctionIO::getFunctionAddress(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto &Fn = cast(V); + if (Fn.isIntrinsic()) + return Constant::getNullValue(&Ty); + return &V; +} +Value *FunctionIO::getFunctionName(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto &Fn = cast(V); + return IConf.getGlobalString(IConf.DemangleFunctionNames->getBool() + ? 
demangle(Fn.getName()) + : Fn.getName(), + IIRB); +} +Value *FunctionIO::getNumArguments(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto &Fn = cast(V); + if (!Config.ArgFilter) + return getCI(&Ty, Fn.arg_size()); + auto FRange = make_filter_range(Fn.args(), Config.ArgFilter); + return getCI(&Ty, std::distance(FRange.begin(), FRange.end())); +} +Value *FunctionIO::getArguments(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto &Fn = cast(V); + if (!Config.ArgFilter) + return createValuePack(Fn.args(), IConf, IIRB); + return createValuePack(make_filter_range(Fn.args(), Config.ArgFilter), IConf, + IIRB); +} +Value *FunctionIO::setArguments(Value &V, Value &NewV, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto &Fn = cast(V); + auto *AIt = Fn.arg_begin(); + auto CB = [&](int Idx, Value *ReplV) { + while (Config.ArgFilter && !Config.ArgFilter(*AIt)) + ++AIt; + Fn.getArg(Idx)->replaceUsesWithIf(ReplV, [&](Use &U) { + return IIRB.NewInsts.lookup(cast(U.getUser())) != IIRB.Epoch; + }); + ++AIt; + }; + if (!Config.ArgFilter) + readValuePack(Fn.args(), NewV, IIRB, CB); + else + readValuePack(make_filter_range(Fn.args(), Config.ArgFilter), NewV, IIRB, + CB); + return &Fn; +} +Value *FunctionIO::isMainFunction(Value &V, Type &Ty, + InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + auto &Fn = cast(V); + return getCI(&Ty, Fn.getName() == "main"); +} + +///} + +/// AllocaIO +///{ +void AllocaIO::init(InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB, + ConfigTy *UserConfig) { + if (UserConfig) + Config = *UserConfig; + + bool IsPRE = getLocationKind() == InstrumentationLocation::INSTRUCTION_PRE; + if (!IsPRE && Config.has(PassAddress)) + IRTArgs.push_back( + IRTArg(IIRB.PtrTy, "address", "The allocated memory address.", + Config.has(ReplaceAddress) ? 
IRTArg::REPLACABLE : IRTArg::NONE, + InstrumentationOpportunity::getValue, + InstrumentationOpportunity::replaceValue)); + if (Config.has(PassSize)) + IRTArgs.push_back(IRTArg( + IIRB.Int64Ty, "size", "The allocation size.", + (IsPRE && Config.has(ReplaceSize)) ? IRTArg::REPLACABLE : IRTArg::NONE, + getSize, setSize)); + if (Config.has(PassAlignment)) + IRTArgs.push_back(IRTArg(IIRB.Int64Ty, "alignment", + "The allocation alignment.", IRTArg::NONE, + getAlignment)); + + addCommonArgs(IConf, IIRB.Ctx, Config.has(PassId)); + IConf.addChoice(*this, IIRB.Ctx); +} + +Value *AllocaIO::getSize(Value &V, Type &Ty, InstrumentationConfig &IO, + InstrumentorIRBuilderTy &IIRB) { + auto &AI = cast(V); + const DataLayout &DL = AI.getDataLayout(); + Value *SizeValue = nullptr; + TypeSize TypeSize = DL.getTypeAllocSize(AI.getAllocatedType()); + if (TypeSize.isFixed()) { + SizeValue = getCI(&Ty, TypeSize.getFixedValue()); + } else { + auto *NullPtr = ConstantPointerNull::get(AI.getType()); + SizeValue = IIRB.IRB.CreatePtrToInt( + IIRB.IRB.CreateGEP(AI.getAllocatedType(), NullPtr, + {IIRB.IRB.getInt32(1)}), + &Ty); + } + if (AI.isArrayAllocation()) + SizeValue = IIRB.IRB.CreateMul( + SizeValue, IIRB.IRB.CreateZExtOrBitCast(AI.getArraySize(), &Ty)); + return SizeValue; +} + +Value *AllocaIO::setSize(Value &V, Value &NewV, InstrumentationConfig &IO, + InstrumentorIRBuilderTy &IIRB) { + auto &AI = cast(V); + const DataLayout &DL = AI.getDataLayout(); + auto *NewAI = IIRB.IRB.CreateAlloca(IIRB.IRB.getInt8Ty(), + DL.getAllocaAddrSpace(), &NewV); + NewAI->setAlignment(AI.getAlign()); + AI.replaceAllUsesWith(NewAI); + IIRB.eraseLater(&AI); + return NewAI; +} + +Value *AllocaIO::getAlignment(Value &V, Type &Ty, InstrumentationConfig &IConf, + InstrumentorIRBuilderTy &IIRB) { + return getCI(&Ty, cast(V).getAlign().value()); +} +///} + void StoreIO::init(InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB, ConfigTy *UserConfig) { if (UserConfig) diff --git 
a/llvm/lib/Transforms/IPO/InstrumentorConfigFile.cpp b/llvm/lib/Transforms/IPO/InstrumentorConfigFile.cpp index a2707193da1a7..7ee800a5c96a0 100644 --- a/llvm/lib/Transforms/IPO/InstrumentorConfigFile.cpp +++ b/llvm/lib/Transforms/IPO/InstrumentorConfigFile.cpp @@ -20,9 +20,21 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/StringSaver.h" +#include "llvm/Support/VirtualFileSystem.h" #include +using namespace llvm; + +static Expected> +setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { + auto BufferOrErr = Filename.str() == "-" ? MemoryBuffer::getSTDIN() + : FS.getBufferForFile(Filename); + if (std::error_code EC = BufferOrErr.getError()) + return errorCodeToError(EC); + return std::move(BufferOrErr.get()); +} + namespace llvm { namespace instrumentor { @@ -96,16 +108,15 @@ void writeConfigToJSON(InstrumentationConfig &IConf, StringRef OutputFile, } bool readConfigFromJSON(InstrumentationConfig &IConf, StringRef InputFile, - LLVMContext &Ctx) { + LLVMContext &Ctx, vfs::FileSystem &FS) { if (InputFile.empty()) return true; - std::error_code EC; - auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(InputFile); - if (std::error_code EC = BufferOrErr.getError()) { + auto BufferOrErr = setupMemoryBuffer(InputFile, FS); + if (Error E = BufferOrErr.takeError()) { Ctx.diagnose(DiagnosticInfoInstrumentation( Twine("failed to open instrumentor configuration file for reading: ") + - EC.message(), + toString(std::move(E)), DS_Warning)); return false; } diff --git a/llvm/test/Instrumentation/Instrumentor/alloca_and_function.ll b/llvm/test/Instrumentation/Instrumentor/alloca_and_function.ll new file mode 100644 index 0000000000000..a20c26daad558 --- /dev/null +++ b/llvm/test/Instrumentation/Instrumentor/alloca_and_function.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=instrumentor -S | FileCheck %s + +; Check 
that we pack the arguments into a value_pack and unpack them again after the pre_function call. +; Check that we replace the argument uses witht he unpacked values. +; Check that we replace the alloca with the post_alloca returned value. + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +declare void @use(ptr) + +;. +; CHECK: @__instrumentor_.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1 +; CHECK: @__instrumentor_value_pack = internal global <{ i32, i32, [6 x i8], i16, i32, i32, [4 x i8], float }> <{ i32 2, i32 12, [6 x i8] zeroinitializer, i16 0, i32 4, i32 2, [4 x i8] zeroinitializer, float 0.000000e+00 }> +;. +define float @foo(i16 %a, float %b) { +; CHECK-LABEL: define float @foo( +; CHECK-SAME: i16 [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP7:%.*]] = alloca <{ i32, i32, [6 x i8], i16, i32, i32, [4 x i8], float }>, align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP7]], ptr @__instrumentor_value_pack, i64 32, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw <{ i32, i32, [6 x i8], i16, i32, i32, [4 x i8], float }>, ptr [[TMP7]], i32 0, i32 3 +; CHECK-NEXT: store i16 [[A]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw <{ i32, i32, [6 x i8], i16, i32, i32, [4 x i8], float }>, ptr [[TMP7]], i32 0, i32 7 +; CHECK-NEXT: store float [[B]], ptr [[TMP9]], align 4 +; CHECK-NEXT: call void @__instrumentor_pre_function(ptr @foo, ptr @__instrumentor_.str, i32 2, ptr [[TMP7]], i8 0, i32 3) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 14 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 28 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @__instrumentor_pre_alloca(i64 2, i64 16, i32 1) #[[ATTR1]] +; CHECK-NEXT: 
[[TMP1:%.*]] = alloca i8, i64 [[TMP0]], align 16 +; CHECK-NEXT: [[TMP13:%.*]] = call ptr @__instrumentor_post_alloca(ptr [[TMP1]], i64 2, i64 16, i32 -1) #[[ATTR1]] +; CHECK-NEXT: [[TMP10:%.*]] = zext i16 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = call ptr @__instrumentor_pre_store(ptr [[TMP13]], i32 0, i64 [[TMP10]], i64 2, i64 2, i32 12, i32 0, i8 1, i8 0, i32 2) #[[ATTR1]] +; CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP14]], align 2 +; CHECK-NEXT: call void @__instrumentor_post_store(ptr [[TMP13]], i32 0, i64 [[TMP10]], i64 2, i64 2, i32 12, i32 0, i8 1, i8 0, i32 -2) #[[ATTR1]] +; CHECK-NEXT: call void @use(ptr [[TMP13]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP7]], ptr @__instrumentor_value_pack, i64 32, i1 false) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw <{ i32, i32, [6 x i8], i16, i32, i32, [4 x i8], float }>, ptr [[TMP7]], i32 0, i32 3 +; CHECK-NEXT: store i16 [[A]], ptr [[TMP12]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw <{ i32, i32, [6 x i8], i16, i32, i32, [4 x i8], float }>, ptr [[TMP7]], i32 0, i32 7 +; CHECK-NEXT: store float [[B]], ptr [[TMP11]], align 4 +; CHECK-NEXT: call void @__instrumentor_post_function(ptr @foo, ptr @__instrumentor_.str, i32 2, ptr [[TMP7]], i8 0, i32 -4) #[[ATTR1]] +; CHECK-NEXT: ret float [[TMP6]] +; +entry: + %0 = alloca i16, align 16 + store i16 %a, ptr %0 + call void @use(ptr %0) + ret float %b +} +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR1]] = { willreturn } +;. 
diff --git a/llvm/test/Instrumentation/Instrumentor/default_config.json b/llvm/test/Instrumentation/Instrumentor/default_config.json index 263ab58e2566d..336dc20cfd5e0 100644 --- a/llvm/test/Instrumentation/Instrumentor/default_config.json +++ b/llvm/test/Instrumentation/Instrumentor/default_config.json @@ -6,11 +6,48 @@ "runtime_stubs_file.description": "The file into which runtime stubs should be written.", "target_regex": "", "target_regex.description": "Regular expression to be matched against the module target. Only targets that match this regex will be instrumented", + "demangle_function_names": true, + "demangle_function_names.description": "Demangle functions names passed to the runtime.", "host_enabled": true, "host_enabled.description": "Instrument non-GPU targets", "gpu_enabled": true, "gpu_enabled.description": "Instrument GPU targets" }, + "function_pre": { + "function": { + "enabled": true, + "address": true, + "address.description": "The function address.", + "name": true, + "name.description": "The function name.", + "num_arguments": true, + "num_arguments.description": "Number of function arguments (without varargs).", + "arguments": true, + "arguments.replace": true, + "arguments.description": "Description of the arguments.", + "is_main": true, + "is_main.description": "Flag to indicate it is the main function.", + "id": true, + "id.description": "A unique ID associated with the given instrumentor call" + } + }, + "function_post": { + "function": { + "enabled": true, + "address": true, + "address.description": "The function address.", + "name": true, + "name.description": "The function name.", + "num_arguments": true, + "num_arguments.description": "Number of function arguments (without varargs).", + "arguments": true, + "arguments.description": "Description of the arguments.", + "is_main": true, + "is_main.description": "Flag to indicate it is the main function.", + "id": true, + "id.description": "A unique ID associated with the given 
instrumentor call" + } + }, "instruction_pre": { "load": { "enabled": true, @@ -34,6 +71,16 @@ "id": true, "id.description": "A unique ID associated with the given instrumentor call" }, + "alloca": { + "enabled": true, + "size": true, + "size.replace": true, + "size.description": "The allocation size.", + "alignment": true, + "alignment.description": "The allocation alignment.", + "id": true, + "id.description": "A unique ID associated with the given instrumentor call" + }, "store": { "enabled": true, "pointer": true, @@ -84,6 +131,18 @@ "id": true, "id.description": "A unique ID associated with the given instrumentor call" }, + "alloca": { + "enabled": true, + "address": true, + "address.replace": true, + "address.description": "The allocated memory address.", + "size": true, + "size.description": "The allocation size.", + "alignment": true, + "alignment.description": "The allocation alignment.", + "id": true, + "id.description": "A unique ID associated with the given instrumentor call" + }, "store": { "enabled": true, "pointer": true,